Commit 52bf84aa206cd2c2516dfa3e03b578edf8a3242f

Authored by Rik van Riel
Committed by Ingo Molnar
1 parent a57beec5d4

sched/numa, mm: Remove p->numa_migrate_deferred

Excessive migration of pages can hurt the performance of workloads
that span multiple NUMA nodes.  However, it turns out that the
p->numa_migrate_deferred knob is a really big hammer, which does
reduce migration rates, but does not actually help performance.

Now that the second stage of the automatic numa balancing code
has stabilized, it is time to replace the simplistic migration
deferral code with something smarter.

Signed-off-by: Rik van Riel <riel@redhat.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: Chegu Vinod <chegu_vinod@hp.com>
Link: http://lkml.kernel.org/r/1390860228-21539-2-git-send-email-riel@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
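
For context, the mechanism being removed boils down to a per-task skip counter. The sketch below is a simplified, standalone condensation of the code deleted from kernel/sched/fair.c and mm/mempolicy.c in the diff that follows; the struct and the fault_is_private flag are stand-ins for the real task_struct field and the cpupid_match_pid() check, and this is not the literal kernel code.

#include <stdbool.h>

/*
 * Simplified, standalone condensation of the deferral logic this commit
 * removes (see the kernel/sched/fair.c and mm/mempolicy.c hunks below).
 */
struct task_numa_state {
        int numa_migrate_deferred;              /* stands in for p->numa_migrate_deferred */
};

static unsigned int sysctl_numa_balancing_migrate_deferred = 16;        /* the removed default */

/* Arm the counter after the migration of a shared page has been skipped. */
static void defer_numa_migrate(struct task_numa_state *p)
{
        p->numa_migrate_deferred = sysctl_numa_balancing_migrate_deferred;
}

/*
 * While the counter is non-zero, every further shared-page NUMA hinting
 * fault skips migration unconditionally; private faults are never deferred.
 */
static bool numa_migrate_deferred(struct task_numa_state *p, bool fault_is_private)
{
        if (fault_is_private)
                return false;

        if (p->numa_migrate_deferred) {
                p->numa_migrate_deferred--;
                return true;            /* skip this migration outright */
        }
        return false;
}

With the default of 16, a single rejected shared-page fault therefore silenced the next 16 shared-page migration attempts outright, including ones the normal two-stage cpupid filter would have allowed.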

Showing 5 changed files with 1 addition and 70 deletions

Documentation/sysctl/kernel.txt
... ... @@ -386,8 +386,7 @@
386 386 feature is too high then the rate the kernel samples for NUMA hinting
387 387 faults may be controlled by the numa_balancing_scan_period_min_ms,
388 388 numa_balancing_scan_delay_ms, numa_balancing_scan_period_max_ms,
389   -numa_balancing_scan_size_mb, numa_balancing_settle_count sysctls and
390   -numa_balancing_migrate_deferred.
  389 +numa_balancing_scan_size_mb, and numa_balancing_settle_count sysctls.
391 390  
392 391 ==============================================================
393 392  
... ... @@ -427,13 +426,6 @@
427 426  
428 427 numa_balancing_scan_size_mb is how many megabytes worth of pages are
429 428 scanned for a given scan.
430   -
431   -numa_balancing_migrate_deferred is how many page migrations get skipped
432   -unconditionally, after a page migration is skipped because a page is shared
433   -with other tasks. This reduces page migration overhead, and determines
434   -how much stronger the "move task near its memory" policy scheduler becomes,
435   -versus the "move memory near its task" memory management policy, for workloads
436   -with shared memory.
437 429  
438 430 ==============================================================
439 431  
include/linux/sched.h
... ... @@ -1457,7 +1457,6 @@
1457 1457 unsigned int numa_scan_period;
1458 1458 unsigned int numa_scan_period_max;
1459 1459 int numa_preferred_nid;
1460   - int numa_migrate_deferred;
1461 1460 unsigned long numa_migrate_retry;
1462 1461 u64 node_stamp; /* migration stamp */
1463 1462 struct callback_head numa_work;
kernel/sched/fair.c
... ... @@ -819,14 +819,6 @@
819 819 /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
820 820 unsigned int sysctl_numa_balancing_scan_delay = 1000;
821 821  
822   -/*
823   - * After skipping a page migration on a shared page, skip N more numa page
824   - * migrations unconditionally. This reduces the number of NUMA migrations
825   - * in shared memory workloads, and has the effect of pulling tasks towards
826   - * where their memory lives, over pulling the memory towards the task.
827   - */
828   -unsigned int sysctl_numa_balancing_migrate_deferred = 16;
829   -
830 822 static unsigned int task_nr_scan_windows(struct task_struct *p)
831 823 {
832 824 unsigned long rss = 0;
kernel/sysctl.c
... ... @@ -384,13 +384,6 @@
384 384 .mode = 0644,
385 385 .proc_handler = proc_dointvec,
386 386 },
387   - {
388   - .procname = "numa_balancing_migrate_deferred",
389   - .data = &sysctl_numa_balancing_migrate_deferred,
390   - .maxlen = sizeof(unsigned int),
391   - .mode = 0644,
392   - .proc_handler = proc_dointvec,
393   - },
394 387 #endif /* CONFIG_NUMA_BALANCING */
395 388 #endif /* CONFIG_SCHED_DEBUG */
396 389 {
mm/mempolicy.c
... ... @@ -2304,35 +2304,6 @@
2304 2304 kmem_cache_free(sn_cache, n);
2305 2305 }
2306 2306  
2307   -#ifdef CONFIG_NUMA_BALANCING
2308   -static bool numa_migrate_deferred(struct task_struct *p, int last_cpupid)
2309   -{
2310   - /* Never defer a private fault */
2311   - if (cpupid_match_pid(p, last_cpupid))
2312   - return false;
2313   -
2314   - if (p->numa_migrate_deferred) {
2315   - p->numa_migrate_deferred--;
2316   - return true;
2317   - }
2318   - return false;
2319   -}
2320   -
2321   -static inline void defer_numa_migrate(struct task_struct *p)
2322   -{
2323   - p->numa_migrate_deferred = sysctl_numa_balancing_migrate_deferred;
2324   -}
2325   -#else
2326   -static inline bool numa_migrate_deferred(struct task_struct *p, int last_cpupid)
2327   -{
2328   - return false;
2329   -}
2330   -
2331   -static inline void defer_numa_migrate(struct task_struct *p)
2332   -{
2333   -}
2334   -#endif /* CONFIG_NUMA_BALANCING */
2335   -
2336 2307 /**
2337 2308 * mpol_misplaced - check whether current page node is valid in policy
2338 2309 *
2339 2310  
... ... @@ -2435,24 +2406,8 @@
2435 2406 */
2436 2407 last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
2437 2408 if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != thisnid) {
2438   -
2439   - /* See sysctl_numa_balancing_migrate_deferred comment */
2440   - if (!cpupid_match_pid(current, last_cpupid))
2441   - defer_numa_migrate(current);
2442   -
2443 2409 goto out;
2444 2410 }
2445   -
2446   - /*
2447   - * The quadratic filter above reduces extraneous migration
2448   - * of shared pages somewhat. This code reduces it even more,
2449   - * reducing the overhead of page migrations of shared pages.
2450   - * This makes workloads with shared pages rely more on
2451   - * "move task near its memory", and less on "move memory
2452   - * towards its task", which is exactly what we want.
2453   - */
2454   - if (numa_migrate_deferred(current, last_cpupid))
2455   - goto out;
2456 2411 }
2457 2412  
2458 2413 if (curnid != polnid)
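
As a purely illustrative toy model (not kernel code and not a measurement), the effect of the removed counter on migration rates can be seen in a few lines of userspace C. The fault pattern below is invented for illustration: every other shared-page fault is assumed to be a "second touch" that the two-stage cpupid filter alone would allow to migrate.

#include <stdbool.h>
#include <stdio.h>

#define DEFER_DEFAULT 16        /* the removed sysctl_numa_balancing_migrate_deferred default */

int main(void)
{
        int deferred = 0;
        int migrated_old = 0, migrated_new = 0;

        /*
         * Toy fault stream: odd-numbered shared-page faults pass the
         * two-stage cpupid filter; even-numbered ones are rejected by it
         * (and, before this commit, also armed the per-task counter).
         */
        for (int fault = 0; fault < 1000; fault++) {
                bool filter_allows = fault & 1;

                /* Behaviour after this commit: the filter alone decides. */
                if (filter_allows)
                        migrated_new++;

                /* Behaviour before: an armed counter vetoes allowed faults too. */
                if (deferred) {
                        deferred--;
                } else if (filter_allows) {
                        migrated_old++;
                } else {
                        deferred = DEFER_DEFAULT;       /* defer_numa_migrate() */
                }
        }

        printf("migrations with deferral: %d, without: %d\n",
               migrated_old, migrated_new);
        return 0;
}

The changelog's point is that this kind of blanket cut in migrations did not translate into better performance, which is why the counter is removed rather than retuned.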