Commit 3bb1a852ab6c9cdf211a2f4a2f502340c8c38eca

Authored by Martin Bligh
Committed by Linus Torvalds
1 parent 2ae88149a2

[PATCH] vmscan: Fix temp_priority race

The temp_priority field in struct zone is racy: we can walk through a reclaim
path and, just before we copy it into prev_priority, it can be overwritten
(say, with DEF_PRIORITY) by another reclaimer.

The same bug is present in both try_to_free_pages() and balance_pgdat(), but
it is fixed slightly differently in each.  In balance_pgdat(), we keep a
separate priority record per zone in a local array.  In try_to_free_pages()
there is no need to do this, as the priority level is the same for all the
zones we reclaim from.
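
A minimal sketch of the interleaving (illustrative only; the actors and
the priority values are assumed for the example):

	reclaimer A (deep in reclaim)      reclaimer B (starting fresh)
	-----------------------------      ----------------------------
	zone->temp_priority = 3;
	                                   zone->temp_priority = DEF_PRIORITY;
	/* A finishes its scan */
	zone->prev_priority =
	        zone->temp_priority;       /* records DEF_PRIORITY, not 3 */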

The impact of this bug is that an artificially high temp_priority value is
copied into prev_priority, which causes reclaimers to compute an artificially
low distress value.  They then fail to reclaim mapped pages when they are, in
fact, under severe memory pressure (their priority may be as low as 0).  This
causes the OOM killer to fire incorrectly.
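
For reference, the prev_priority consumer is the reclaim-mapped heuristic
in shrink_active_list(), which looks roughly like this (paraphrased
sketch, not part of this patch):

	/* prev_priority stuck at DEF_PRIORITY (12) gives distress == 0 */
	distress = 100 >> min(zone->prev_priority, priority);
	swap_tendency = mapped_ratio / 2 + distress + sc->swappiness;
	if (swap_tendency >= 100)
		reclaim_mapped = 1;

With distress pinned at 100 >> 12 == 0, swap_tendency rarely reaches 100
and mapped pages stay on the active list.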

From: Andrew Morton <akpm@osdl.org>

__zone_reclaim() isn't modifying zone->prev_priority.  But zone->prev_priority
is used in the decision whether or not to bring mapped pages onto the inactive
list.  Hence there's a risk here that __zone_reclaim() will fail because
zone->prev_priority is large (ie: low urgency) and lots of mapped pages end up
stuck on the active list.

Fix that up by decreasing (ie making more urgent) zone->prev_priority as
__zone_reclaim() scans the zone's pages.

This bug perhaps explains why ZONE_RECLAIM_PRIORITY was created.  It should be
possible to remove that now and just start out at DEF_PRIORITY?
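
As a sketch of that suggestion (hypothetical, not part of this patch),
the loop in __zone_reclaim() would become:

	/* hypothetical: start at the lowest urgency and work down */
	priority = DEF_PRIORITY;
	do {
		note_zone_scanning_priority(zone, priority);
		nr_reclaimed += shrink_zone(priority, zone, &sc);
		priority--;
	} while (priority >= 0 && nr_reclaimed < nr_pages);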

Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Christoph Lameter <clameter@engr.sgi.com>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

Showing 4 changed files with 43 additions and 22 deletions

include/linux/mmzone.h
... ... @@ -218,13 +218,9 @@
218 218 * under - it drives the swappiness decision: whether to unmap mapped
219 219 * pages.
220 220 *
221   - * temp_priority is used to remember the scanning priority at which
222   - * this zone was successfully refilled to free_pages == pages_high.
223   - *
224   - * Access to both these fields is quite racy even on uniprocessor. But
  221 + * Access to this field is quite racy even on uniprocessor. But
225 222 * it is expected to average out OK.
226 223 */
227   - int temp_priority;
228 224 int prev_priority;
229 225  
230 226  
mm/page_alloc.c
... ... @@ -2407,7 +2407,7 @@
2407 2407 zone->zone_pgdat = pgdat;
2408 2408 zone->free_pages = 0;
2409 2409  
2410   - zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
  2410 + zone->prev_priority = DEF_PRIORITY;
2411 2411  
2412 2412 zone_pcp_init(zone);
2413 2413 INIT_LIST_HEAD(&zone->active_list);
mm/vmscan.c
... ... @@ -723,6 +723,20 @@
723 723 return nr_reclaimed;
724 724 }
725 725  
  726 +/*
  727 + * We are about to scan this zone at a certain priority level. If that priority
  728 + * level is smaller (ie: more urgent) than the previous priority, then note
  729 + * that priority level within the zone. This is done so that when the next
  730 + * process comes in to scan this zone, it will immediately start out at this
  731 + * priority level rather than having to build up its own scanning priority.
  732 + * Here, this priority affects only the reclaim-mapped threshold.
  733 + */
  734 +static inline void note_zone_scanning_priority(struct zone *zone, int priority)
  735 +{
  736 + if (priority < zone->prev_priority)
  737 + zone->prev_priority = priority;
  738 +}
  739 +
726 740 static inline int zone_is_near_oom(struct zone *zone)
727 741 {
728 742 return zone->pages_scanned >= (zone->nr_active + zone->nr_inactive)*3;
... ... @@ -972,9 +986,7 @@
972 986 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
973 987 continue;
974 988  
975   - zone->temp_priority = priority;
976   - if (zone->prev_priority > priority)
977   - zone->prev_priority = priority;
  989 + note_zone_scanning_priority(zone, priority);
978 990  
979 991 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
980 992 continue; /* Let kswapd poll it */
... ... @@ -1024,7 +1036,6 @@
1024 1036 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
1025 1037 continue;
1026 1038  
1027   - zone->temp_priority = DEF_PRIORITY;
1028 1039 lru_pages += zone->nr_active + zone->nr_inactive;
1029 1040 }
1030 1041  
1031 1042  
... ... @@ -1065,13 +1076,22 @@
1065 1076 if (!sc.all_unreclaimable)
1066 1077 ret = 1;
1067 1078 out:
  1079 + /*
  1080 + * Now that we've scanned all the zones at this priority level, note
  1081 + * that level within the zone so that the next thread which performs
  1082 + * scanning of this zone will immediately start out at this priority
  1083 + * level. This affects only the decision whether or not to bring
  1084 + * mapped pages onto the inactive list.
  1085 + */
  1086 + if (priority < 0)
  1087 + priority = 0;
1068 1088 for (i = 0; zones[i] != 0; i++) {
1069 1089 struct zone *zone = zones[i];
1070 1090  
1071 1091 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
1072 1092 continue;
1073 1093  
1074   - zone->prev_priority = zone->temp_priority;
  1094 + zone->prev_priority = priority;
1075 1095 }
1076 1096 return ret;
1077 1097 }
... ... @@ -1111,6 +1131,11 @@
1111 1131 .swap_cluster_max = SWAP_CLUSTER_MAX,
1112 1132 .swappiness = vm_swappiness,
1113 1133 };
  1134 + /*
  1135 + * temp_priority is used to remember the scanning priority at which
  1136 + * this zone was successfully refilled to free_pages == pages_high.
  1137 + */
  1138 + int temp_priority[MAX_NR_ZONES];
1114 1139  
1115 1140 loop_again:
1116 1141 total_scanned = 0;
1117 1142  
... ... @@ -1118,12 +1143,9 @@
1118 1143 sc.may_writepage = !laptop_mode;
1119 1144 count_vm_event(PAGEOUTRUN);
1120 1145  
1121   - for (i = 0; i < pgdat->nr_zones; i++) {
1122   - struct zone *zone = pgdat->node_zones + i;
  1146 + for (i = 0; i < pgdat->nr_zones; i++)
  1147 + temp_priority[i] = DEF_PRIORITY;
1123 1148  
1124   - zone->temp_priority = DEF_PRIORITY;
1125   - }
1126   -
1127 1149 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
1128 1150 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
1129 1151 unsigned long lru_pages = 0;
1130 1152  
... ... @@ -1183,10 +1205,9 @@
1183 1205 if (!zone_watermark_ok(zone, order, zone->pages_high,
1184 1206 end_zone, 0))
1185 1207 all_zones_ok = 0;
1186   - zone->temp_priority = priority;
1187   - if (zone->prev_priority > priority)
1188   - zone->prev_priority = priority;
  1208 + temp_priority[i] = priority;
1189 1209 sc.nr_scanned = 0;
  1210 + note_zone_scanning_priority(zone, priority);
1190 1211 nr_reclaimed += shrink_zone(priority, zone, &sc);
1191 1212 reclaim_state->reclaimed_slab = 0;
1192 1213 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
1193 1214  
... ... @@ -1226,10 +1247,15 @@
1226 1247 break;
1227 1248 }
1228 1249 out:
  1250 + /*
  1251 + * Note within each zone the priority level at which this zone was
  1252 + * brought into a happy state, so that the next thread which scans this
  1253 + * zone will start out at that priority level.
  1254 + */
1229 1255 for (i = 0; i < pgdat->nr_zones; i++) {
1230 1256 struct zone *zone = pgdat->node_zones + i;
1231 1257  
1232   - zone->prev_priority = zone->temp_priority;
  1258 + zone->prev_priority = temp_priority[i];
1233 1259 }
1234 1260 if (!all_zones_ok) {
1235 1261 cond_resched();
... ... @@ -1614,6 +1640,7 @@
1614 1640 */
1615 1641 priority = ZONE_RECLAIM_PRIORITY;
1616 1642 do {
  1643 + note_zone_scanning_priority(zone, priority);
1617 1644 nr_reclaimed += shrink_zone(priority, zone, &sc);
1618 1645 priority--;
1619 1646 } while (priority >= 0 && nr_reclaimed < nr_pages);
mm/vmstat.c
... ... @@ -587,11 +587,9 @@
587 587 seq_printf(m,
588 588 "\n all_unreclaimable: %u"
589 589 "\n prev_priority: %i"
590   - "\n temp_priority: %i"
591 590 "\n start_pfn: %lu",
592 591 zone->all_unreclaimable,
593 592 zone->prev_priority,
594   - zone->temp_priority,
595 593 zone->zone_start_pfn);
596 594 spin_unlock_irqrestore(&zone->lock, flags);
597 595 seq_putc(m, '\n');