Commit b4fc580f75325271de2841891bb5816cea5ca101

Authored by Mel Gorman
Committed by Jiri Slaby
1 parent 59d4b11371

mm: rearrange zone fields into read-only, page alloc, statistics and page reclaim lines

commit 3484b2de9499df23c4604a513b36f96326ae81ad upstream.

The arrangement of struct zone has changed over time and now it has
reached the point where there is some inappropriate sharing going on.
On x86-64, for example:

o The zone->node field shares a cache line with the zone lock, and
  zone->node is accessed frequently from the page allocator due to the
  fair zone allocation policy.

o span_seqlock is almost never used but shares a cache line with free_area

o Some zone statistics share a cache line with the LRU lock so
  reclaim-intensive and allocator-intensive workloads can bounce the cache
  line on a stat update
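
A quick way to confirm this kind of sharing on a given configuration is
to look at field offsets.  The snippet below is a minimal userspace
sketch using a cut-down, hypothetical stand-in for struct zone (the
field names mirror the kernel ones, but the sizes and ordering are only
illustrative); on a real kernel image a tool such as pahole, e.g.
pahole -C zone vmlinux, reports the actual layout.

    /* fake_zone is a made-up miniature of struct zone, not kernel code */
    #include <stdio.h>
    #include <stddef.h>

    struct fake_zone {
            unsigned long watermark[3];     /* read-mostly */
            long lowmem_reserve[4];         /* read-mostly */
            int node;       /* read on every allocation (fair zone policy) */
            int lock;       /* stands in for the zone spinlock, written constantly */
            unsigned long free_area_stub[11];
    };

    /* which 64-byte cache line a field starts on */
    #define CACHE_LINE(field)  (offsetof(struct fake_zone, field) / 64)

    int main(void)
    {
            printf("node -> cache line %zu\n", CACHE_LINE(node));
            printf("lock -> cache line %zu\n", CACHE_LINE(lock));
            /*
             * When both land on the same line, CPUs hammering the lock
             * keep invalidating the line the allocator reads ->node from.
             */
            return 0;
    }

With this toy layout both fields report cache line 0, which is exactly
the kind of sharing the first bullet above describes.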

This patch rearranges struct zone to put read-only and read-mostly
fields together and then splits the page allocator intensive fields,
the zone statistics and the page reclaim intensive fields into their
own cache lines.  Note that the type of lowmem_reserve changes to long
because the watermark calculations are signed, which avoids a
signed/unsigned conversion there.
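
The signed/unsigned issue is easy to trip over: free_pages may
legitimately go negative in the watermark check, and if lowmem_reserve
stays unsigned long the comparison is carried out in unsigned
arithmetic and the wrap defeats the check.  A small standalone sketch
with made-up values (not kernel code) shows the difference:

    #include <stdio.h>

    int main(void)
    {
            long free_pages = 10;   /* may effectively go negative ...      */
            long free_cma   = 50;   /* ... once CMA pages are subtracted    */
            long min        = 128;

            unsigned long reserve_unsigned = 256;  /* old lowmem_reserve type */
            long          reserve_signed   = 256;  /* new lowmem_reserve type */

            /*
             * With an unsigned reserve the expression is evaluated as
             * unsigned long: the negative left-hand side wraps to a huge
             * value and the "below the mark" condition is missed (prints 0).
             */
            printf("unsigned reserve: below mark = %d\n",
                   (free_pages - free_cma) <= (min + reserve_unsigned));

            /* With a signed reserve the comparison stays signed (prints 1). */
            printf("signed reserve:   below mark = %d\n",
                   (free_pages - free_cma) <= (min + reserve_signed));
            return 0;
    }

This is why the watermark check used to copy the value into a local
long first; with the array itself declared long, that copy can go away,
as the mm/page_alloc.c hunk below shows.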

On the test configuration I used, the overall size of struct zone
shrank by one cache line.  On smaller machines, this is not likely to
be noticeable.  However, on a 4-node NUMA machine running tiobench the
system CPU overhead is reduced by this patch:

              3.16.0-rc3      3.16.0-rc3
                 vanilla  rearrange-v5r9
User              746.94          759.78
System          65336.22        58350.98
Elapsed         27553.52        27282.02

Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Jiri Slaby <jslaby@suse.cz>

Showing 3 changed files with 110 additions and 106 deletions

include/linux/mmzone.h
@@ -321,19 +321,12 @@
321 321 #ifndef __GENERATING_BOUNDS_H
322 322  
323 323 struct zone {
324   - /* Fields commonly accessed by the page allocator */
  324 + /* Read-mostly fields */
325 325  
326 326 /* zone watermarks, access with *_wmark_pages(zone) macros */
327 327 unsigned long watermark[NR_WMARK];
328 328  
329 329 /*
330   - * When free pages are below this point, additional steps are taken
331   - * when reading the number of free pages to avoid per-cpu counter
332   - * drift allowing watermarks to be breached
333   - */
334   - unsigned long percpu_drift_mark;
335   -
336   - /*
337 330 * We don't know if the memory that we're going to allocate will be freeable
338 331 * or/and it will be released eventually, so to avoid totally wasting several
339 332 * GB of ram we must reserve some of the lower zone memory (otherwise we risk
@@ -341,42 +334,27 @@
341 334 * on the higher zones). This array is recalculated at runtime if the
342 335 * sysctl_lowmem_reserve_ratio sysctl changes.
343 336 */
344   - unsigned long lowmem_reserve[MAX_NR_ZONES];
  337 + long lowmem_reserve[MAX_NR_ZONES];
345 338  
346   - /*
347   - * This is a per-zone reserve of pages that should not be
348   - * considered dirtyable memory.
349   - */
350   - unsigned long dirty_balance_reserve;
351   -
352 339 #ifdef CONFIG_NUMA
353 340 int node;
  341 +#endif
  342 +
354 343 /*
355   - * zone reclaim becomes active if more unmapped pages exist.
  344 + * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
  345 + * this zone's LRU. Maintained by the pageout code.
356 346 */
357   - unsigned long min_unmapped_pages;
358   - unsigned long min_slab_pages;
359   -#endif
  347 + unsigned int inactive_ratio;
  348 +
  349 + struct pglist_data *zone_pgdat;
360 350 struct per_cpu_pageset __percpu *pageset;
  351 +
361 352 /*
362   - * free areas of different sizes
  353 + * This is a per-zone reserve of pages that should not be
  354 + * considered dirtyable memory.
363 355 */
364   - spinlock_t lock;
365   -#if defined CONFIG_COMPACTION || defined CONFIG_CMA
366   - /* Set to true when the PG_migrate_skip bits should be cleared */
367   - bool compact_blockskip_flush;
  356 + unsigned long dirty_balance_reserve;
368 357  
369   - /* pfn where compaction free scanner should start */
370   - unsigned long compact_cached_free_pfn;
371   - /* pfn where async and sync compaction migration scanner should start */
372   - unsigned long compact_cached_migrate_pfn[2];
373   -#endif
374   -#ifdef CONFIG_MEMORY_HOTPLUG
375   - /* see spanned/present_pages for more description */
376   - seqlock_t span_seqlock;
377   -#endif
378   - struct free_area free_area[MAX_ORDER];
379   -
380 358 #ifndef CONFIG_SPARSEMEM
381 359 /*
382 360 * Flags for a pageblock_nr_pages block. See pageblock-flags.h.
@@ -385,71 +363,14 @@
385 363 unsigned long *pageblock_flags;
386 364 #endif /* CONFIG_SPARSEMEM */
387 365  
388   -#ifdef CONFIG_COMPACTION
  366 +#ifdef CONFIG_NUMA
389 367 /*
390   - * On compaction failure, 1<<compact_defer_shift compactions
391   - * are skipped before trying again. The number attempted since
392   - * last failure is tracked with compact_considered.
  368 + * zone reclaim becomes active if more unmapped pages exist.
393 369 */
394   - unsigned int compact_considered;
395   - unsigned int compact_defer_shift;
396   - int compact_order_failed;
397   -#endif
  370 + unsigned long min_unmapped_pages;
  371 + unsigned long min_slab_pages;
  372 +#endif /* CONFIG_NUMA */
398 373  
399   - ZONE_PADDING(_pad1_)
400   -
401   - /* Fields commonly accessed by the page reclaim scanner */
402   - spinlock_t lru_lock;
403   - struct lruvec lruvec;
404   -
405   - unsigned long pages_scanned; /* since last reclaim */
406   - unsigned long flags; /* zone flags, see below */
407   -
408   - /* Zone statistics */
409   - atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
410   -
411   - /*
412   - * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
413   - * this zone's LRU. Maintained by the pageout code.
414   - */
415   - unsigned int inactive_ratio;
416   -
417   -
418   - ZONE_PADDING(_pad2_)
419   - /* Rarely used or read-mostly fields */
420   -
421   - /*
422   - * wait_table -- the array holding the hash table
423   - * wait_table_hash_nr_entries -- the size of the hash table array
424   - * wait_table_bits -- wait_table_size == (1 << wait_table_bits)
425   - *
426   - * The purpose of all these is to keep track of the people
427   - * waiting for a page to become available and make them
428   - * runnable again when possible. The trouble is that this
429   - * consumes a lot of space, especially when so few things
430   - * wait on pages at a given time. So instead of using
431   - * per-page waitqueues, we use a waitqueue hash table.
432   - *
433   - * The bucket discipline is to sleep on the same queue when
434   - * colliding and wake all in that wait queue when removing.
435   - * When something wakes, it must check to be sure its page is
436   - * truly available, a la thundering herd. The cost of a
437   - * collision is great, but given the expected load of the
438   - * table, they should be so rare as to be outweighed by the
439   - * benefits from the saved space.
440   - *
441   - * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the
442   - * primary users of these fields, and in mm/page_alloc.c
443   - * free_area_init_core() performs the initialization of them.
444   - */
445   - wait_queue_head_t * wait_table;
446   - unsigned long wait_table_hash_nr_entries;
447   - unsigned long wait_table_bits;
448   -
449   - /*
450   - * Discontig memory support fields.
451   - */
452   - struct pglist_data *zone_pgdat;
453 374 /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
454 375 unsigned long zone_start_pfn;
455 376  
@@ -495,20 +416,104 @@
495 416 * adjust_managed_page_count() should be used instead of directly
496 417 * touching zone->managed_pages and totalram_pages.
497 418 */
  419 + unsigned long managed_pages;
498 420 unsigned long spanned_pages;
499 421 unsigned long present_pages;
500   - unsigned long managed_pages;
501 422  
  423 + const char *name;
  424 +
502 425 /*
503 426 * Number of MIGRATE_RESEVE page block. To maintain for just
504 427 * optimization. Protected by zone->lock.
505 428 */
506 429 int nr_migrate_reserve_block;
507 430  
  431 +#ifdef CONFIG_MEMORY_HOTPLUG
  432 + /* see spanned/present_pages for more description */
  433 + seqlock_t span_seqlock;
  434 +#endif
  435 +
508 436 /*
509   - * rarely used fields:
  437 + * wait_table -- the array holding the hash table
  438 + * wait_table_hash_nr_entries -- the size of the hash table array
  439 + * wait_table_bits -- wait_table_size == (1 << wait_table_bits)
  440 + *
  441 + * The purpose of all these is to keep track of the people
  442 + * waiting for a page to become available and make them
  443 + * runnable again when possible. The trouble is that this
  444 + * consumes a lot of space, especially when so few things
  445 + * wait on pages at a given time. So instead of using
  446 + * per-page waitqueues, we use a waitqueue hash table.
  447 + *
  448 + * The bucket discipline is to sleep on the same queue when
  449 + * colliding and wake all in that wait queue when removing.
  450 + * When something wakes, it must check to be sure its page is
  451 + * truly available, a la thundering herd. The cost of a
  452 + * collision is great, but given the expected load of the
  453 + * table, they should be so rare as to be outweighed by the
  454 + * benefits from the saved space.
  455 + *
  456 + * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the
  457 + * primary users of these fields, and in mm/page_alloc.c
  458 + * free_area_init_core() performs the initialization of them.
510 459 */
511   - const char *name;
  460 + wait_queue_head_t *wait_table;
  461 + unsigned long wait_table_hash_nr_entries;
  462 + unsigned long wait_table_bits;
  463 +
  464 + ZONE_PADDING(_pad1_)
  465 +
  466 + /* Write-intensive fields used from the page allocator */
  467 + spinlock_t lock;
  468 +
  469 + /* free areas of different sizes */
  470 + struct free_area free_area[MAX_ORDER];
  471 +
  472 + /* zone flags, see below */
  473 + unsigned long flags;
  474 +
  475 + ZONE_PADDING(_pad2_)
  476 +
  477 + /* Write-intensive fields used by page reclaim */
  478 +
  479 + /* Fields commonly accessed by the page reclaim scanner */
  480 + spinlock_t lru_lock;
  481 + unsigned long pages_scanned; /* since last reclaim */
  482 + struct lruvec lruvec;
  483 +
  484 + /*
  485 + * When free pages are below this point, additional steps are taken
  486 + * when reading the number of free pages to avoid per-cpu counter
  487 + * drift allowing watermarks to be breached
  488 + */
  489 + unsigned long percpu_drift_mark;
  490 +
  491 +#if defined CONFIG_COMPACTION || defined CONFIG_CMA
  492 + /* pfn where compaction free scanner should start */
  493 + unsigned long compact_cached_free_pfn;
  494 + /* pfn where async and sync compaction migration scanner should start */
  495 + unsigned long compact_cached_migrate_pfn[2];
  496 +#endif
  497 +
  498 +#ifdef CONFIG_COMPACTION
  499 + /*
  500 + * On compaction failure, 1<<compact_defer_shift compactions
  501 + * are skipped before trying again. The number attempted since
  502 + * last failure is tracked with compact_considered.
  503 + */
  504 + unsigned int compact_considered;
  505 + unsigned int compact_defer_shift;
  506 + int compact_order_failed;
  507 +#endif
  508 +
  509 +#if defined CONFIG_COMPACTION || defined CONFIG_CMA
  510 + /* Set to true when the PG_migrate_skip bits should be cleared */
  511 + bool compact_blockskip_flush;
  512 +#endif
  513 +
  514 + ZONE_PADDING(_pad3_)
  515 + /* Zone statistics */
  516 + atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
512 517 } ____cacheline_internodealigned_in_smp;
513 518  
514 519 typedef enum {
mm/page_alloc.c
@@ -1685,7 +1685,6 @@
1685 1685 {
1686 1686 /* free_pages my go negative - that's OK */
1687 1687 long min = mark;
1688   - long lowmem_reserve = z->lowmem_reserve[classzone_idx];
1689 1688 int o;
1690 1689 long free_cma = 0;
1691 1690  
@@ -1700,7 +1699,7 @@
1700 1699 free_cma = zone_page_state(z, NR_FREE_CMA_PAGES);
1701 1700 #endif
1702 1701  
1703   - if (free_pages - free_cma <= min + lowmem_reserve)
  1702 + if (free_pages - free_cma <= min + z->lowmem_reserve[classzone_idx])
1704 1703 return false;
1705 1704 for (o = 0; o < order; o++) {
1706 1705 /* At the next order, this order's pages become unavailable */
@@ -3224,7 +3223,7 @@
3224 3223 );
3225 3224 printk("lowmem_reserve[]:");
3226 3225 for (i = 0; i < MAX_NR_ZONES; i++)
3227   - printk(" %lu", zone->lowmem_reserve[i]);
  3226 + printk(" %ld", zone->lowmem_reserve[i]);
3228 3227 printk("\n");
3229 3228 }
3230 3229  
@@ -5527,7 +5526,7 @@
5527 5526 for_each_online_pgdat(pgdat) {
5528 5527 for (i = 0; i < MAX_NR_ZONES; i++) {
5529 5528 struct zone *zone = pgdat->node_zones + i;
5530   - unsigned long max = 0;
  5529 + long max = 0;
5531 5530  
5532 5531 /* Find valid and maximum lowmem_reserve in the zone */
5533 5532 for (j = i; j < MAX_NR_ZONES; j++) {
mm/vmstat.c
@@ -1065,10 +1065,10 @@
1065 1065 zone_page_state(zone, i));
1066 1066  
1067 1067 seq_printf(m,
1068   - "\n protection: (%lu",
  1068 + "\n protection: (%ld",
1069 1069 zone->lowmem_reserve[0]);
1070 1070 for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
1071   - seq_printf(m, ", %lu", zone->lowmem_reserve[i]);
  1071 + seq_printf(m, ", %ld", zone->lowmem_reserve[i]);
1072 1072 seq_printf(m,
1073 1073 ")"
1074 1074 "\n pagesets");