Commit 52bf84aa206cd2c2516dfa3e03b578edf8a3242f

Authored by Rik van Riel
Committed by Ingo Molnar
1 parent a57beec5d4

sched/numa, mm: Remove p->numa_migrate_deferred

Excessive migration of pages can hurt the performance of workloads
that span multiple NUMA nodes.  However, it turns out that the
p->numa_migrate_deferred knob is a really big hammer, which does
reduce migration rates, but does not actually help performance.

Now that the second stage of the automatic numa balancing code
has stabilized, it is time to replace the simplistic migration
deferral code with something smarter.

Signed-off-by: Rik van Riel <riel@redhat.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: Chegu Vinod <chegu_vinod@hp.com>
Link: http://lkml.kernel.org/r/1390860228-21539-2-git-send-email-riel@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
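
For context, the mechanism being removed boils down to a per-task skip counter. The sketch below is a simplified, standalone condensation of the code deleted from kernel/sched/fair.c and mm/mempolicy.c in the diff that follows; the struct and the fault_is_private flag are stand-ins for the real task_struct field and the cpupid_match_pid() check, and this is not the literal kernel code.

#include <stdbool.h>

/*
 * Simplified, standalone condensation of the deferral logic this commit
 * removes (see the kernel/sched/fair.c and mm/mempolicy.c hunks below).
 */
struct task_numa_state {
        int numa_migrate_deferred;              /* stands in for p->numa_migrate_deferred */
};

static unsigned int sysctl_numa_balancing_migrate_deferred = 16;        /* the removed default */

/* Arm the counter after the migration of a shared page has been skipped. */
static void defer_numa_migrate(struct task_numa_state *p)
{
        p->numa_migrate_deferred = sysctl_numa_balancing_migrate_deferred;
}

/*
 * While the counter is non-zero, every further shared-page NUMA hinting
 * fault skips migration unconditionally; private faults are never deferred.
 */
static bool numa_migrate_deferred(struct task_numa_state *p, bool fault_is_private)
{
        if (fault_is_private)
                return false;

        if (p->numa_migrate_deferred) {
                p->numa_migrate_deferred--;
                return true;            /* skip this migration outright */
        }
        return false;
}

With the default of 16, a single rejected shared-page fault therefore silenced the next 16 shared-page migration attempts outright, including ones the normal two-stage cpupid filter would have allowed.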

Showing 5 changed files with 1 addition and 70 deletions

Documentation/sysctl/kernel.txt
... ... @@ -386,8 +386,7 @@
386 386 feature is too high then the rate the kernel samples for NUMA hinting
387 387 faults may be controlled by the numa_balancing_scan_period_min_ms,
388 388 numa_balancing_scan_delay_ms, numa_balancing_scan_period_max_ms,
389   -numa_balancing_scan_size_mb, numa_balancing_settle_count sysctls and
390   -numa_balancing_migrate_deferred.
  389 +numa_balancing_scan_size_mb, and numa_balancing_settle_count sysctls.
391 390  
392 391 ==============================================================
393 392  
... ... @@ -427,13 +426,6 @@
427 426  
428 427 numa_balancing_scan_size_mb is how many megabytes worth of pages are
429 428 scanned for a given scan.
430   -
431   -numa_balancing_migrate_deferred is how many page migrations get skipped
432   -unconditionally, after a page migration is skipped because a page is shared
433   -with other tasks. This reduces page migration overhead, and determines
434   -how much stronger the "move task near its memory" policy scheduler becomes,
435   -versus the "move memory near its task" memory management policy, for workloads
436   -with shared memory.
437 429  
438 430 ==============================================================
439 431  
include/linux/sched.h
... ... @@ -1457,7 +1457,6 @@
1457 1457 unsigned int numa_scan_period;
1458 1458 unsigned int numa_scan_period_max;
1459 1459 int numa_preferred_nid;
1460   - int numa_migrate_deferred;
1461 1460 unsigned long numa_migrate_retry;
1462 1461 u64 node_stamp; /* migration stamp */
1463 1462 struct callback_head numa_work;
kernel/sched/fair.c
... ... @@ -819,14 +819,6 @@
819 819 /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
820 820 unsigned int sysctl_numa_balancing_scan_delay = 1000;
821 821  
822   -/*
823   - * After skipping a page migration on a shared page, skip N more numa page
824   - * migrations unconditionally. This reduces the number of NUMA migrations
825   - * in shared memory workloads, and has the effect of pulling tasks towards
826   - * where their memory lives, over pulling the memory towards the task.
827   - */
828   -unsigned int sysctl_numa_balancing_migrate_deferred = 16;
829   -
830 822 static unsigned int task_nr_scan_windows(struct task_struct *p)
831 823 {
832 824 unsigned long rss = 0;
kernel/sysctl.c
... ... @@ -384,13 +384,6 @@
384 384 .mode = 0644,
385 385 .proc_handler = proc_dointvec,
386 386 },
387   - {
388   - .procname = "numa_balancing_migrate_deferred",
389   - .data = &sysctl_numa_balancing_migrate_deferred,
390   - .maxlen = sizeof(unsigned int),
391   - .mode = 0644,
392   - .proc_handler = proc_dointvec,
393   - },
394 387 #endif /* CONFIG_NUMA_BALANCING */
395 388 #endif /* CONFIG_SCHED_DEBUG */
396 389 {
mm/mempolicy.c
... ... @@ -2304,35 +2304,6 @@
2304 2304 kmem_cache_free(sn_cache, n);
2305 2305 }
2306 2306  
2307   -#ifdef CONFIG_NUMA_BALANCING
2308   -static bool numa_migrate_deferred(struct task_struct *p, int last_cpupid)
2309   -{
2310   - /* Never defer a private fault */
2311   - if (cpupid_match_pid(p, last_cpupid))
2312   - return false;
2313   -
2314   - if (p->numa_migrate_deferred) {
2315   - p->numa_migrate_deferred--;
2316   - return true;
2317   - }
2318   - return false;
2319   -}
2320   -
2321   -static inline void defer_numa_migrate(struct task_struct *p)
2322   -{
2323   - p->numa_migrate_deferred = sysctl_numa_balancing_migrate_deferred;
2324   -}
2325   -#else
2326   -static inline bool numa_migrate_deferred(struct task_struct *p, int last_cpupid)
2327   -{
2328   - return false;
2329   -}
2330   -
2331   -static inline void defer_numa_migrate(struct task_struct *p)
2332   -{
2333   -}
2334   -#endif /* CONFIG_NUMA_BALANCING */
2335   -
2336 2307 /**
2337 2308 * mpol_misplaced - check whether current page node is valid in policy
2338 2309 *
2339 2310  
... ... @@ -2435,24 +2406,8 @@
2435 2406 */
2436 2407 last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
2437 2408 if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != thisnid) {
2438   -
2439   - /* See sysctl_numa_balancing_migrate_deferred comment */
2440   - if (!cpupid_match_pid(current, last_cpupid))
2441   - defer_numa_migrate(current);
2442   -
2443 2409 goto out;
2444 2410 }
2445   -
2446   - /*
2447   - * The quadratic filter above reduces extraneous migration
2448   - * of shared pages somewhat. This code reduces it even more,
2449   - * reducing the overhead of page migrations of shared pages.
2450   - * This makes workloads with shared pages rely more on
2451   - * "move task near its memory", and less on "move memory
2452   - * towards its task", which is exactly what we want.
2453   - */
2454   - if (numa_migrate_deferred(current, last_cpupid))
2455   - goto out;
2456 2411 }
2457 2412  
2458 2413 if (curnid != polnid)
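
As a purely illustrative toy model (not kernel code and not a measurement), the effect of the removed counter on migration rates can be seen in a few lines of userspace C. The fault pattern below is invented for illustration: every other shared-page fault is assumed to be a "second touch" that the two-stage cpupid filter alone would allow to migrate.

#include <stdbool.h>
#include <stdio.h>

#define DEFER_DEFAULT 16        /* the removed sysctl_numa_balancing_migrate_deferred default */

int main(void)
{
        int deferred = 0;
        int migrated_old = 0, migrated_new = 0;

        /*
         * Toy fault stream: odd-numbered shared-page faults pass the
         * two-stage cpupid filter; even-numbered ones are rejected by it
         * (and, before this commit, also armed the per-task counter).
         */
        for (int fault = 0; fault < 1000; fault++) {
                bool filter_allows = fault & 1;

                /* Behaviour after this commit: the filter alone decides. */
                if (filter_allows)
                        migrated_new++;

                /* Behaviour before: an armed counter vetoes allowed faults too. */
                if (deferred) {
                        deferred--;
                } else if (filter_allows) {
                        migrated_old++;
                } else {
                        deferred = DEFER_DEFAULT;       /* defer_numa_migrate() */
                }
        }

        printf("migrations with deferral: %d, without: %d\n",
               migrated_old, migrated_new);
        return 0;
}

The changelog's point is that this kind of blanket cut in migrations did not translate into better performance, which is why the counter is removed rather than retuned.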