#include <linux/memcontrol.h>
#include <linux/delayacct.h>
#include <linux/sysctl.h>
+#include <linux/oom.h>
+#include <linux/prefetch.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
}
EXPORT_SYMBOL(unregister_shrinker);
+static inline int do_shrinker_shrink(struct shrinker *shrinker,
+ struct shrink_control *sc,
+ unsigned long nr_to_scan)
+{
+ sc->nr_to_scan = nr_to_scan;
+ return (*shrinker->shrink)(shrinker, sc);
+}
+
#define SHRINK_BATCH 128
/*
* Call the shrink functions to age shrinkable caches
*
* Returns the number of slab objects which we shrunk.
*/
-unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
- unsigned long lru_pages)
+unsigned long shrink_slab(struct shrink_control *shrink,
+ unsigned long nr_pages_scanned,
+ unsigned long lru_pages)
{
struct shrinker *shrinker;
unsigned long ret = 0;
- if (scanned == 0)
- scanned = SWAP_CLUSTER_MAX;
+ if (nr_pages_scanned == 0)
+ nr_pages_scanned = SWAP_CLUSTER_MAX;
- if (!down_read_trylock(&shrinker_rwsem))
- return 1; /* Assume we'll be able to shrink next time */
+ if (!down_read_trylock(&shrinker_rwsem)) {
+ /* Assume we'll be able to shrink next time */
+ ret = 1;
+ goto out;
+ }
list_for_each_entry(shrinker, &shrinker_list, list) {
unsigned long long delta;
unsigned long total_scan;
unsigned long max_pass;
- max_pass = (*shrinker->shrink)(shrinker, 0, gfp_mask);
- delta = (4 * scanned) / shrinker->seeks;
+ max_pass = do_shrinker_shrink(shrinker, shrink, 0);
+ delta = (4 * nr_pages_scanned) / shrinker->seeks;
delta *= max_pass;
do_div(delta, lru_pages + 1);
shrinker->nr += delta;
int shrink_ret;
int nr_before;
- nr_before = (*shrinker->shrink)(shrinker, 0, gfp_mask);
- shrink_ret = (*shrinker->shrink)(shrinker, this_scan,
- gfp_mask);
+ nr_before = do_shrinker_shrink(shrinker, shrink, 0);
+ shrink_ret = do_shrinker_shrink(shrinker, shrink,
+ this_scan);
if (shrink_ret == -1)
break;
if (shrink_ret < nr_before)
shrinker->nr += total_scan;
}
up_read(&shrinker_rwsem);
+out:
+ cond_resched();
return ret;
}
static void handle_write_error(struct address_space *mapping,
struct page *page, int error)
{
- lock_page_nosync(page);
+ lock_page(page);
if (page_mapping(page) == mapping)
mapping_set_error(mapping, error);
unlock_page(page);
freepage = mapping->a_ops->freepage;
- __remove_from_page_cache(page);
+ __delete_from_page_cache(page);
spin_unlock_irq(&mapping->tree_lock);
mem_cgroup_uncharge_cache_page(page);
* back off and wait for congestion to clear because further reclaim
* will encounter the same problem
*/
- if (nr_dirty == nr_congested && nr_dirty != 0)
+ if (nr_dirty && nr_dirty == nr_congested && scanning_global_lru(sc))
zone_set_flag(zone, ZONE_CONGESTED);
free_page_list(&free_pages);
case 0:
list_move(&page->lru, dst);
mem_cgroup_del_lru(page);
- nr_taken++;
+ nr_taken += hpage_nr_pages(page);
break;
case -EBUSY:
* surrounding the tag page. Only take those pages of
* the same active state as that tag page. We may safely
* round the target page pfn down to the requested order
- * as the mem_map is guarenteed valid out to MAX_ORDER,
+ * as the mem_map is guaranteed valid out to MAX_ORDER,
* where that page is in a different zone we will detect
* it from its zone id and abort this block scan.
*/
if (__isolate_lru_page(cursor_page, mode, file) == 0) {
list_move(&cursor_page->lru, dst);
mem_cgroup_del_lru(cursor_page);
- nr_taken++;
+ nr_taken += hpage_nr_pages(page);
nr_lumpy_taken++;
if (PageDirty(cursor_page))
nr_lumpy_dirty++;
struct page *page;
list_for_each_entry(page, page_list, lru) {
+ int numpages = hpage_nr_pages(page);
lru = page_lru_base_type(page);
if (PageActive(page)) {
lru += LRU_ACTIVE;
ClearPageActive(page);
- nr_active++;
+ nr_active += numpages;
}
if (count)
- count[lru]++;
+ count[lru] += numpages;
}
return nr_active;
{
int ret = -EBUSY;
+ VM_BUG_ON(!page_count(page));
+
if (PageLRU(page)) {
struct zone *zone = page_zone(page);
spin_lock_irq(&zone->lru_lock);
- if (PageLRU(page) && get_page_unless_zero(page)) {
+ if (PageLRU(page)) {
int lru = page_lru(page);
ret = 0;
+ get_page(page);
ClearPageLRU(page);
del_page_from_lru_list(zone, page, lru);
add_page_to_lru_list(zone, page, lru);
if (is_active_lru(lru)) {
int file = is_file_lru(lru);
- reclaim_stat->recent_rotated[file]++;
+ int numpages = hpage_nr_pages(page);
+ reclaim_stat->recent_rotated[file] += numpages;
}
if (!pagevec_add(&pvec, page)) {
spin_unlock_irq(&zone->lru_lock);
list_move(&page->lru, &zone->lru[lru].list);
mem_cgroup_add_lru_list(page, lru);
- pgmoved++;
+ pgmoved += hpage_nr_pages(page);
if (!pagevec_add(&pvec, page) || list_empty(list)) {
spin_unlock_irq(&zone->lru_lock);
}
if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) {
- nr_rotated++;
+ nr_rotated += hpage_nr_pages(page);
/*
* Identify referenced, file-backed active pages and
* give them one more trip around the active list. So
if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION))
return false;
- /*
- * If we failed to reclaim and have scanned the full list, stop.
- * NOTE: Checking just nr_reclaimed would exit reclaim/compaction far
- * faster but obviously would be less likely to succeed
- * allocation. If this is desirable, use GFP_REPEAT to decide
- * if both reclaimed and scanned should be checked or just
- * reclaimed
- */
- if (!nr_reclaimed && !nr_scanned)
- return false;
+ /* Consider stopping depending on scan and reclaim activity */
+ if (sc->gfp_mask & __GFP_REPEAT) {
+ /*
+ * For __GFP_REPEAT allocations, stop reclaiming if the
+ * full LRU list has been scanned and we are still failing
+ * to reclaim pages. This full LRU scan is potentially
+ * expensive but a __GFP_REPEAT caller really wants to succeed
+ */
+ if (!nr_reclaimed && !nr_scanned)
+ return false;
+ } else {
+ /*
+ * For non-__GFP_REPEAT allocations which can presumably
+ * fail without consequence, stop if we failed to reclaim
+ * any pages from the last SWAP_CLUSTER_MAX number of
+ * pages that were scanned. This will return to the
+ * caller faster at the risk reclaim/compaction and
+ * the resulting allocation attempt fails
+ */
+ if (!nr_reclaimed)
+ return false;
+ }
/*
* If we have not reclaimed enough pages for compaction and the
unsigned long nr[NR_LRU_LISTS];
unsigned long nr_to_scan;
enum lru_list l;
- unsigned long nr_reclaimed;
+ unsigned long nr_reclaimed, nr_scanned;
unsigned long nr_to_reclaim = sc->nr_to_reclaim;
- unsigned long nr_scanned = sc->nr_scanned;
restart:
nr_reclaimed = 0;
+ nr_scanned = sc->nr_scanned;
get_scan_count(zone, sc, nr, priority);
while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
}
-/*
- * As hibernation is going on, kswapd is freezed so that it can't mark
- * the zone into all_unreclaimable. It can't handle OOM during hibernation.
- * So let's check zone's unreclaimable in direct reclaim as well as kswapd.
- */
+/* All zones in zonelist are unreclaimable? */
static bool all_unreclaimable(struct zonelist *zonelist,
struct scan_control *sc)
{
struct zoneref *z;
struct zone *zone;
- bool all_unreclaimable = true;
for_each_zone_zonelist_nodemask(zone, z, zonelist,
gfp_zone(sc->gfp_mask), sc->nodemask) {
continue;
if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
continue;
- if (zone_reclaimable(zone)) {
- all_unreclaimable = false;
- break;
- }
+ if (!zone->all_unreclaimable)
+ return false;
}
- return all_unreclaimable;
+ return true;
}
/*
* else, the number of pages reclaimed
*/
static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
- struct scan_control *sc)
+ struct scan_control *sc,
+ struct shrink_control *shrink)
{
int priority;
unsigned long total_scanned = 0;
lru_pages += zone_reclaimable_pages(zone);
}
- shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages);
+ shrink_slab(shrink, sc->nr_scanned, lru_pages);
if (reclaim_state) {
sc->nr_reclaimed += reclaim_state->reclaimed_slab;
reclaim_state->reclaimed_slab = 0;
struct zone *preferred_zone;
first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
- NULL, &preferred_zone);
+ &cpuset_current_mems_allowed,
+ &preferred_zone);
wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);
}
}
if (sc->nr_reclaimed)
return sc->nr_reclaimed;
+ /*
+ * As hibernation is going on, kswapd is freezed so that it can't mark
+ * the zone into all_unreclaimable. Thus bypassing all_unreclaimable
+ * check.
+ */
+ if (oom_killer_disabled)
+ return 0;
+
/* top priority shrink_zones still had more to do? don't OOM, then */
if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc))
return 1;
.mem_cgroup = NULL,
.nodemask = nodemask,
};
+ struct shrink_control shrink = {
+ .gfp_mask = sc.gfp_mask,
+ };
trace_mm_vmscan_direct_reclaim_begin(order,
sc.may_writepage,
gfp_mask);
- nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
+ nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
.order = 0,
.mem_cgroup = mem_cont,
.nodemask = NULL, /* we don't care the placement */
+ .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
+ (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
+ };
+ struct shrink_control shrink = {
+ .gfp_mask = sc.gfp_mask,
};
- sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
- (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
zonelist = NODE_DATA(numa_node_id())->node_zonelists;
trace_mm_vmscan_memcg_reclaim_begin(0,
sc.may_writepage,
sc.gfp_mask);
- nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
+ nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
}
#endif
+/*
+ * pgdat_balanced is used when checking if a node is balanced for high-order
+ * allocations. Only zones that meet watermarks and are in a zone allowed
+ * by the callers classzone_idx are added to balanced_pages. The total of
+ * balanced pages must be at least 25% of the zones allowed by classzone_idx
+ * for the node to be considered balanced. Forcing all zones to be balanced
+ * for high orders can cause excessive reclaim when there are imbalanced zones.
+ * The choice of 25% is due to
+ * o a 16M DMA zone that is balanced will not balance a zone on any
+ * reasonable sized machine
+ * o On all other machines, the top zone must be at least a reasonable
+ * percentage of the middle zones. For example, on 32-bit x86, highmem
+ * would need to be at least 256M for it to be balance a whole node.
+ * Similarly, on x86-64 the Normal zone would need to be at least 1G
+ * to balance a node on its own. These seemed like reasonable ratios.
+ */
+static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
+ int classzone_idx)
+{
+ unsigned long present_pages = 0;
+ int i;
+
+ for (i = 0; i <= classzone_idx; i++)
+ present_pages += pgdat->node_zones[i].present_pages;
+
+ return balanced_pages > (present_pages >> 2);
+}
+
/* is kswapd sleeping prematurely? */
-static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
+static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
+ int classzone_idx)
{
int i;
+ unsigned long balanced = 0;
+ bool all_zones_ok = true;
/* If a direct reclaimer woke kswapd within HZ/10, it's premature */
if (remaining)
- return 1;
+ return true;
- /* If after HZ/10, a zone is below the high mark, it's premature */
+ /* Check the watermark levels */
for (i = 0; i < pgdat->nr_zones; i++) {
struct zone *zone = pgdat->node_zones + i;
if (!populated_zone(zone))
continue;
- if (zone->all_unreclaimable)
+ /*
+ * balance_pgdat() skips over all_unreclaimable after
+ * DEF_PRIORITY. Effectively, it considers them balanced so
+ * they must be considered balanced here as well if kswapd
+ * is to sleep
+ */
+ if (zone->all_unreclaimable) {
+ balanced += zone->present_pages;
continue;
+ }
if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
- 0, 0))
- return 1;
+ classzone_idx, 0))
+ all_zones_ok = false;
+ else
+ balanced += zone->present_pages;
}
- return 0;
+ /*
+ * For high-order requests, the balanced zones must contain at least
+ * 25% of the nodes pages for kswapd to sleep. For order-0, all zones
+ * must be balanced
+ */
+ if (order)
+ return !pgdat_balanced(pgdat, balanced, classzone_idx);
+ else
+ return !all_zones_ok;
}
/*
* For kswapd, balance_pgdat() will work across all this node's zones until
* they are all at high_wmark_pages(zone).
*
- * Returns the number of pages which were actually freed.
+ * Returns the final order kswapd was reclaiming at
*
* There is special handling here for zones which are full of pinned pages.
* This can happen if the pages are all mlocked, or if they are all used by
* of pages is balanced across the zones.
*/
static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
- int classzone_idx)
+ int *classzone_idx)
{
int all_zones_ok;
- int any_zone_ok;
+ unsigned long balanced;
int priority;
int i;
int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
.order = order,
.mem_cgroup = NULL,
};
+ struct shrink_control shrink = {
+ .gfp_mask = sc.gfp_mask,
+ };
loop_again:
total_scanned = 0;
sc.nr_reclaimed = 0;
disable_swap_token();
all_zones_ok = 1;
- any_zone_ok = 0;
+ balanced = 0;
/*
* Scan in the highmem->dma direction for the highest
if (!zone_watermark_ok_safe(zone, order,
high_wmark_pages(zone), 0, 0)) {
end_zone = i;
+ *classzone_idx = i;
break;
}
}
for (i = 0; i <= end_zone; i++) {
struct zone *zone = pgdat->node_zones + i;
int nr_slab;
+ unsigned long balance_gap;
if (!populated_zone(zone))
continue;
mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask);
/*
- * We put equal pressure on every zone, unless one
- * zone has way too many pages free already.
+ * We put equal pressure on every zone, unless
+ * one zone has way too many pages free
+ * already. The "too many pages" is defined
+ * as the high wmark plus a "gap" where the
+ * gap is either the low watermark or 1%
+ * of the zone, whichever is smaller.
*/
+ balance_gap = min(low_wmark_pages(zone),
+ (zone->present_pages +
+ KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
+ KSWAPD_ZONE_BALANCE_GAP_RATIO);
if (!zone_watermark_ok_safe(zone, order,
- 8*high_wmark_pages(zone), end_zone, 0))
+ high_wmark_pages(zone) + balance_gap,
+ end_zone, 0))
shrink_zone(priority, zone, &sc);
reclaim_state->reclaimed_slab = 0;
- nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
- lru_pages);
+ nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
sc.nr_reclaimed += reclaim_state->reclaimed_slab;
total_scanned += sc.nr_scanned;
+
if (zone->all_unreclaimable)
continue;
- if (nr_slab == 0 && !zone_reclaimable(zone))
+ if (nr_slab == 0 &&
+ !zone_reclaimable(zone))
zone->all_unreclaimable = 1;
/*
* If we've done a decent amount of scanning and
total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
sc.may_writepage = 1;
- /*
- * Compact the zone for higher orders to reduce
- * latencies for higher-order allocations that
- * would ordinarily call try_to_compact_pages()
- */
- if (sc.order > PAGE_ALLOC_COSTLY_ORDER)
- compact_zone_order(zone, sc.order, sc.gfp_mask,
- false);
-
if (!zone_watermark_ok_safe(zone, order,
high_wmark_pages(zone), end_zone, 0)) {
all_zones_ok = 0;
* spectulatively avoid congestion waits
*/
zone_clear_flag(zone, ZONE_CONGESTED);
- if (i <= classzone_idx)
- any_zone_ok = 1;
+ if (i <= *classzone_idx)
+ balanced += zone->present_pages;
}
}
- if (all_zones_ok || (order && any_zone_ok))
+ if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
break; /* kswapd: all done */
/*
* OK, kswapd is getting into trouble. Take a nap, then take
/*
* order-0: All zones must meet high watermark for a balanced node
- * high-order: Any zone below pgdats classzone_idx must meet the high
- * watermark for a balanced node
+ * high-order: Balanced zones must make up at least 25% of the node
+ * for the node to be balanced
*/
- if (!(all_zones_ok || (order && any_zone_ok))) {
+ if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))) {
cond_resched();
try_to_freeze();
}
}
- return sc.nr_reclaimed;
+ /*
+ * Return the order we were reclaiming at so sleeping_prematurely()
+ * makes a decision on the order we were last reclaiming at. However,
+ * if another caller entered the allocator slow path while kswapd
+ * was awake, order will remain at the higher level
+ */
+ *classzone_idx = end_zone;
+ return order;
}
-static void kswapd_try_to_sleep(pg_data_t *pgdat, int order)
+static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
{
long remaining = 0;
DEFINE_WAIT(wait);
prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
/* Try to sleep for a short interval */
- if (!sleeping_prematurely(pgdat, order, remaining)) {
+ if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
remaining = schedule_timeout(HZ/10);
finish_wait(&pgdat->kswapd_wait, &wait);
prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
* After a short sleep, check if it was a premature sleep. If not, then
* go fully to sleep until explicitly woken up.
*/
- if (!sleeping_prematurely(pgdat, order, remaining)) {
+ if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
/*
order = new_order;
classzone_idx = new_classzone_idx;
} else {
- kswapd_try_to_sleep(pgdat, order);
+ kswapd_try_to_sleep(pgdat, order, classzone_idx);
order = pgdat->kswapd_max_order;
classzone_idx = pgdat->classzone_idx;
+ pgdat->kswapd_max_order = 0;
+ pgdat->classzone_idx = MAX_NR_ZONES - 1;
}
ret = try_to_freeze();
*/
if (!ret) {
trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
- balance_pgdat(pgdat, order, classzone_idx);
+ order = balance_pgdat(pgdat, order, &classzone_idx);
}
}
return 0;
.swappiness = vm_swappiness,
.order = 0,
};
- struct zonelist * zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
+ struct shrink_control shrink = {
+ .gfp_mask = sc.gfp_mask,
+ };
+ struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
struct task_struct *p = current;
unsigned long nr_reclaimed;
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;
- nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
+ nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
p->reclaim_state = NULL;
lockdep_clear_current_reclaim_state();
.swappiness = vm_swappiness,
.order = order,
};
+ struct shrink_control shrink = {
+ .gfp_mask = sc.gfp_mask,
+ };
unsigned long nr_slab_pages0, nr_slab_pages1;
cond_resched();
unsigned long lru_pages = zone_reclaimable_pages(zone);
/* No reclaimable slab or very low memory pressure */
- if (!shrink_slab(sc.nr_scanned, gfp_mask, lru_pages))
+ if (!shrink_slab(&shrink, sc.nr_scanned, lru_pages))
break;
/* Freed enough memory */