Commit d867ca5c authored by Andrew Morton, committed by Linus Torvalds

[PATCH] vmscan: zone pressure simplification and fix

The zone->pressure field is supposed to record the amount of reclaim pressure
this zone is under.  We need this information so we know whether to unmap
pages from pagetables right from the outset of a balance_pgdat() or
try_to_free_pages() invocation.

The problem with the current code is that the exponential average gets tugged
around too much: as we perform the increasing-priority scan, the pressure
metric is made artificially low by the early part of the scan.
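
For illustration, here is a minimal userspace sketch of that arithmetic (this
is not kernel code; the starting value and the priorities sampled are invented
for the example).  A zone whose steady-state pressure corresponds to
priority-4 scans has its metric dragged down as soon as one invocation meets
its modest reclaim target early, at priorities 12..10:

#include <stdio.h>

#define DEF_PRIORITY 12

/* The old helper: fold `val' into a running exponential average */
static int expavg(int avg, int val)
{
	return ((val - avg) >> 1) + avg;
}

int main(void)
{
	/* Steady-state pressure of a zone that needs priority-4 scans */
	int pressure = (DEF_PRIORITY - 4) << 10;
	int priority;

	/*
	 * One invocation whose modest reclaim target is met by the time
	 * priority reaches 10: each early, low-pressure sample gets
	 * folded into the average.
	 */
	for (priority = DEF_PRIORITY; priority >= 10; priority--)
		pressure = expavg(pressure, (DEF_PRIORITY - priority) << 10);

	/* Prints 2304 - the metric collapsed from its true value of 8192 */
	printf("pressure after scan: %d\n", pressure);
	return 0;
}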

So instead what we do here is to record within the zone the scanning priority
from the zone's previous scan.  It is defined as the priority at which the
zone achieved the "enough pages free" state.  This prev_priority is used on
the next scan for the do-we-need-to-be-unmapping-pages decision.
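
A minimal userspace sketch of the new handshake (again not kernel code: the
zone fields and the toy reclaim model below are invented; the real logic is in
the diff).  temp_priority follows the scan downwards, and only its final value
- the priority at which the zone reached pages_high - is published to
prev_priority once the scan completes:

#include <stdio.h>

#define DEF_PRIORITY 12

struct zone_sketch {
	long free_pages;
	long pages_high;
	int temp_priority;
	int prev_priority;
};

/* Stand-in for one shrink pass; stronger scans free more pages */
static void shrink(struct zone_sketch *z, int priority)
{
	z->free_pages += 64 >> priority;	/* made-up reclaim model */
}

static void scan(struct zone_sketch *z)
{
	int priority;

	z->temp_priority = DEF_PRIORITY;
	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
		if (z->free_pages < z->pages_high) {
			z->temp_priority = priority;
			shrink(z, priority);
		}
		if (z->free_pages >= z->pages_high)
			break;
	}
	/* Scan is over: publish the priority the zone actually needed */
	z->prev_priority = z->temp_priority;
}

int main(void)
{
	struct zone_sketch z = { .free_pages = 10, .pages_high = 16 };

	scan(&z);
	printf("prev_priority after scan: %d\n", z.prev_priority);	/* 4 */
	return 0;
}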
parent b25bb608
include/linux/mmzone.h
@@ -89,17 +89,24 @@ struct zone {
 	ZONE_PADDING(_pad2_)
 	/*
-	 * measure of scanning intensity for this zone. It is calculated
-	 * as exponentially decaying average of the scanning priority
-	 * required to free enough pages in this zone
-	 * (zone_adj_pressure()).
+	 * prev_priority holds the scanning priority for this zone.  It is
+	 * defined as the scanning priority at which we achieved our reclaim
+	 * target at the previous try_to_free_pages() or balance_pgdat()
+	 * invocation.
 	 *
-	 * 0 --- low pressure
+	 * We use prev_priority as a measure of how much stress page reclaim is
+	 * under - it drives the swappiness decision: whether to unmap mapped
+	 * pages.
 	 *
-	 * (DEF_PRIORITY << 10) --- high pressure
+	 * temp_priority is used to remember the scanning priority at which
+	 * this zone was successfully refilled to free_pages == pages_high.
+	 *
+	 * Access to both these fields is quite racy even on uniprocessor.  But
+	 * it is expected to average out OK.
 	 */
-	int pressure;
+	int temp_priority;
+	int prev_priority;

 	/*
 	 * free areas of different sizes
mm/vmscan.c
@@ -79,25 +79,6 @@ static long total_memory;
 #define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
 #endif

-/*
- * exponentially decaying average
- */
-static inline int expavg(int avg, int val)
-{
-	return ((val - avg) >> 1) + avg;
-}
-
-static void zone_adj_pressure(struct zone *zone, int priority)
-{
-	zone->pressure = expavg(zone->pressure,
-				(DEF_PRIORITY - priority) << 10);
-}
-
-static int pressure_to_priority(int pressure)
-{
-	return DEF_PRIORITY - (pressure >> 10);
-}
-
 /*
  * The list of shrinker callbacks used by to apply pressure to
  * ageable caches.
@@ -646,7 +627,7 @@ refill_inactive_zone(struct zone *zone, const int nr_pages_in,
 	 * `distress' is a measure of how much trouble we're having reclaiming
 	 * pages.  0 -> no problems.  100 -> great trouble.
 	 */
-	distress = 100 >> pressure_to_priority(zone->pressure);
+	distress = 100 >> zone->prev_priority;

 	/*
 	 * The point of this algorithm is to decide when to start reclaiming
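
For reference, the new right-hand side maps prev_priority onto distress
directly, and the mapping is easy to check with a throwaway program:
priorities 7..12 all give distress 0, and the value only climbs once the
previous scan had to work hard (100 >> 3 = 12, 100 >> 1 = 50, 100 >> 0 = 100):

#include <stdio.h>

int main(void)
{
	int p;

	/* distress as computed from each possible prev_priority */
	for (p = 12; p >= 0; p--)
		printf("prev_priority %2d -> distress %3d\n", p, 100 >> p);
	return 0;
}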
@@ -830,6 +811,9 @@ shrink_caches(struct zone *classzone, int priority, int *total_scanned,
 		int nr_mapped = 0;
 		int max_scan;

+		if (zone->free_pages < zone->pages_high)
+			zone->temp_priority = priority;
+
 		if (zone->all_unreclaimable && priority != DEF_PRIORITY)
 			continue;	/* Let kswapd poll it */
@@ -843,10 +827,8 @@ shrink_caches(struct zone *classzone, int priority, int *total_scanned,
 		ret += shrink_zone(zone, max_scan, gfp_mask,
 				to_reclaim, &nr_mapped, ps, priority);
 		*total_scanned += max_scan + nr_mapped;
-		if (ret >= nr_pages) {
-			zone_adj_pressure(zone, priority);
+		if (ret >= nr_pages)
 			break;
-		}
 	}
 	return ret;
 }
@@ -880,6 +862,9 @@ int try_to_free_pages(struct zone *cz,
 	inc_page_state(allocstall);

+	for (zone = cz; zone >= cz->zone_pgdat->node_zones; --zone)
+		zone->temp_priority = DEF_PRIORITY;
+
 	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
 		int total_scanned = 0;
 		struct page_state ps;
@@ -912,9 +897,9 @@ int try_to_free_pages(struct zone *cz,
 	}
 	if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY))
 		out_of_memory();
-	for (zone = cz; zone >= cz->zone_pgdat->node_zones; -- zone)
-		zone_adj_pressure(zone, -1);
 out:
+	for (zone = cz; zone >= cz->zone_pgdat->node_zones; --zone)
+		zone->prev_priority = zone->temp_priority;
 	return ret;
 }
@@ -945,6 +930,12 @@ static int balance_pgdat(pg_data_t *pgdat, int nr_pages, struct page_state *ps)
 	inc_page_state(pageoutrun);

+	for (i = 0; i < pgdat->nr_zones; i++) {
+		struct zone *zone = pgdat->node_zones + i;
+
+		zone->temp_priority = DEF_PRIORITY;
+	}
+
 	for (priority = DEF_PRIORITY; priority; priority--) {
 		int all_zones_ok = 1;
@@ -961,11 +952,10 @@ static int balance_pgdat(pg_data_t *pgdat, int nr_pages, struct page_state *ps)
 				to_reclaim = min(to_free, SWAP_CLUSTER_MAX*8);
 			} else {	/* Zone balancing */
 				to_reclaim = zone->pages_high-zone->free_pages;
-				if (to_reclaim <= 0) {
-					zone_adj_pressure(zone, priority);
+				if (to_reclaim <= 0)
 					continue;
-				}
 			}
+			zone->temp_priority = priority;
 			all_zones_ok = 0;
 			max_scan = zone->nr_inactive >> priority;
 			if (max_scan < to_reclaim * 2)
@@ -989,13 +979,11 @@ static int balance_pgdat(pg_data_t *pgdat, int nr_pages, struct page_state *ps)
 		if (to_free > 0)
 			blk_congestion_wait(WRITE, HZ/10);
 	}
-	if (priority <= 0) {
-		for (i = 0; i < pgdat->nr_zones; i++) {
-			struct zone *zone = pgdat->node_zones + i;
-
-			if (zone->free_pages < zone->pages_high)
-				zone_adj_pressure(zone, -1);
-		}
+
+	for (i = 0; i < pgdat->nr_zones; i++) {
+		struct zone *zone = pgdat->node_zones + i;
+
+		zone->prev_priority = zone->temp_priority;
 	}
 	return nr_pages - to_free;
 }