Commit e303297e6c3a7b847c4731eb14006ca6b435ecca

Authored by Peter Zijlstra
Committed by Linus Torvalds
1 parent 2672391169

mm: extended batches for generic mmu_gather

Instead of using a single batch (either the small on-stack one or an
allocated page), try to extend the batch every time it runs out, and only
flush once either the extension fails or we're done.
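
The shape of this is easiest to see outside the kernel. Below is a minimal
user-space sketch of the same extend-or-flush pattern: a chain of page-sized
batches that grows on demand and is only drained when growing it fails or the
caller is finished. All names in the sketch (struct batch, struct gather,
gather_add(), ...) are invented for illustration; malloc() stands in for
__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0), the release callback stands
in for free_pages_and_swap_cache(), and the kernel's extra on-stack bundle
(tlb->local / __pages[]) is left out to keep the sketch short.

#include <stdlib.h>

struct batch {
	struct batch	*next;
	unsigned int	nr;
	unsigned int	max;
	void		*items[];	/* rest of the one-page allocation */
};

struct gather {
	struct batch	*first;		/* head of the batch chain */
	struct batch	*active;	/* batch currently being filled */
};

#define BATCH_BYTES	4096		/* one page per batch, as in the patch */
#define BATCH_MAX	((BATCH_BYTES - sizeof(struct batch)) / sizeof(void *))

static struct batch *alloc_batch(void)
{
	struct batch *b = malloc(BATCH_BYTES);

	if (b) {
		b->next = NULL;
		b->nr = 0;
		b->max = BATCH_MAX;
	}
	return b;
}

static int gather_init(struct gather *g)
{
	g->first = g->active = alloc_batch();
	return g->first != NULL;
}

/* Step to an already-chained batch if one exists, else try to grow the chain. */
static int next_batch(struct gather *g)
{
	if (!g->active->next) {
		g->active->next = alloc_batch();
		if (!g->active->next)
			return 0;
	}
	g->active = g->active->next;
	return 1;
}

/*
 * Queue one item.  Returns 0 when the active batch is full and the chain
 * could not be extended, which is the caller's cue to flush -- the same
 * contract as __tlb_remove_page() below.
 */
static int gather_add(struct gather *g, void *item)
{
	struct batch *b = g->active;

	b->items[b->nr++] = item;
	if (b->nr == b->max)
		return next_batch(g);
	return 1;
}

/* Release everything queued so far and rewind, keeping the batches around. */
static void gather_flush(struct gather *g, void (*release)(void *))
{
	struct batch *b;
	unsigned int i;

	for (b = g->first; b; b = b->next) {
		for (i = 0; i < b->nr; i++)
			release(b->items[i]);
		b->nr = 0;
	}
	g->active = g->first;
}

Because extra batches are allocated with a no-sleep allocation, a failure is
harmless: it simply forces an earlier flush, which is exactly the old
single-batch behaviour.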

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Requested-by: Nick Piggin <npiggin@kernel.dk>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Hugh Dickins <hughd@google.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Miller <davem@davemloft.net>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Richard Weinberger <richard@nod.at>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: Namhyung Kim <namhyung@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 2 changed files with 84 additions and 47 deletions

include/asm-generic/tlb.h
... ... @@ -19,16 +19,6 @@
19 19 #include <asm/pgalloc.h>
20 20 #include <asm/tlbflush.h>
21 21  
22   -/*
23   - * For UP we don't need to worry about TLB flush
24   - * and page free order so much..
25   - */
26   -#ifdef CONFIG_SMP
27   - #define tlb_fast_mode(tlb) ((tlb)->nr == ~0U)
28   -#else
29   - #define tlb_fast_mode(tlb) 1
30   -#endif
31   -
32 22 #ifdef CONFIG_HAVE_RCU_TABLE_FREE
33 23 /*
34 24 * Semi RCU freeing of the page directories.
... ... @@ -78,6 +68,16 @@
78 68 */
79 69 #define MMU_GATHER_BUNDLE 8
80 70  
  71 +struct mmu_gather_batch {
  72 + struct mmu_gather_batch *next;
  73 + unsigned int nr;
  74 + unsigned int max;
  75 + struct page *pages[0];
  76 +};
  77 +
  78 +#define MAX_GATHER_BATCH \
  79 + ((PAGE_SIZE - sizeof(struct mmu_gather_batch)) / sizeof(void *))
  80 +
81 81 /* struct mmu_gather is an opaque type used by the mm code for passing around
82 82 * any data needed by arch specific code for tlb_remove_page.
83 83 */
... ... @@ -86,22 +86,48 @@
86 86 #ifdef CONFIG_HAVE_RCU_TABLE_FREE
87 87 struct mmu_table_batch *batch;
88 88 #endif
89   - unsigned int nr; /* set to ~0U means fast mode */
90   - unsigned int max; /* nr < max */
91   - unsigned int need_flush;/* Really unmapped some ptes? */
92   - unsigned int fullmm; /* non-zero means full mm flush */
93   - struct page **pages;
94   - struct page *local[MMU_GATHER_BUNDLE];
  89 + unsigned int need_flush : 1, /* Did free PTEs */
  90 + fast_mode : 1; /* No batching */
  91 +
  92 + unsigned int fullmm;
  93 +
  94 + struct mmu_gather_batch *active;
  95 + struct mmu_gather_batch local;
  96 + struct page *__pages[MMU_GATHER_BUNDLE];
95 97 };
96 98  
97   -static inline void __tlb_alloc_page(struct mmu_gather *tlb)
  99 +/*
  100 + * For UP we don't need to worry about TLB flush
  101 + * and page free order so much..
  102 + */
  103 +#ifdef CONFIG_SMP
  104 + #define tlb_fast_mode(tlb) (tlb->fast_mode)
  105 +#else
  106 + #define tlb_fast_mode(tlb) 1
  107 +#endif
  108 +
  109 +static inline int tlb_next_batch(struct mmu_gather *tlb)
98 110 {
99   - unsigned long addr = __get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
  111 + struct mmu_gather_batch *batch;
100 112  
101   - if (addr) {
102   - tlb->pages = (void *)addr;
103   - tlb->max = PAGE_SIZE / sizeof(struct page *);
  113 + batch = tlb->active;
  114 + if (batch->next) {
  115 + tlb->active = batch->next;
  116 + return 1;
104 117 }
  118 +
  119 + batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
  120 + if (!batch)
  121 + return 0;
  122 +
  123 + batch->next = NULL;
  124 + batch->nr = 0;
  125 + batch->max = MAX_GATHER_BATCH;
  126 +
  127 + tlb->active->next = batch;
  128 + tlb->active = batch;
  129 +
  130 + return 1;
105 131 }
106 132  
107 133 /* tlb_gather_mmu
108 134  
... ... @@ -114,17 +140,14 @@
114 140 {
115 141 tlb->mm = mm;
116 142  
117   - tlb->max = ARRAY_SIZE(tlb->local);
118   - tlb->pages = tlb->local;
  143 + tlb->fullmm = fullmm;
  144 + tlb->need_flush = 0;
  145 + tlb->fast_mode = (num_possible_cpus() == 1);
  146 + tlb->local.next = NULL;
  147 + tlb->local.nr = 0;
  148 + tlb->local.max = ARRAY_SIZE(tlb->__pages);
  149 + tlb->active = &tlb->local;
119 150  
120   - if (num_online_cpus() > 1) {
121   - tlb->nr = 0;
122   - __tlb_alloc_page(tlb);
123   - } else /* Use fast mode if only one CPU is online */
124   - tlb->nr = ~0U;
125   -
126   - tlb->fullmm = fullmm;
127   -
128 151 #ifdef CONFIG_HAVE_RCU_TABLE_FREE
129 152 tlb->batch = NULL;
130 153 #endif
... ... @@ -133,6 +156,8 @@
133 156 static inline void
134 157 tlb_flush_mmu(struct mmu_gather *tlb)
135 158 {
  159 + struct mmu_gather_batch *batch;
  160 +
136 161 if (!tlb->need_flush)
137 162 return;
138 163 tlb->need_flush = 0;
139 164  
... ... @@ -140,17 +165,15 @@
140 165 #ifdef CONFIG_HAVE_RCU_TABLE_FREE
141 166 tlb_table_flush(tlb);
142 167 #endif
143   - if (!tlb_fast_mode(tlb)) {
144   - free_pages_and_swap_cache(tlb->pages, tlb->nr);
145   - tlb->nr = 0;
146   - /*
147   - * If we are using the local on-stack array of pages for MMU
148   - * gather, try allocating an off-stack array again as we have
149   - * recently freed pages.
150   - */
151   - if (tlb->pages == tlb->local)
152   - __tlb_alloc_page(tlb);
  168 +
  169 + if (tlb_fast_mode(tlb))
  170 + return;
  171 +
  172 + for (batch = &tlb->local; batch; batch = batch->next) {
  173 + free_pages_and_swap_cache(batch->pages, batch->nr);
  174 + batch->nr = 0;
153 175 }
  176 + tlb->active = &tlb->local;
154 177 }
155 178  
156 179 /* tlb_finish_mmu
157 180  
... ... @@ -160,13 +183,18 @@
160 183 static inline void
161 184 tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
162 185 {
  186 + struct mmu_gather_batch *batch, *next;
  187 +
163 188 tlb_flush_mmu(tlb);
164 189  
165 190 /* keep the page table cache within bounds */
166 191 check_pgt_cache();
167 192  
168   - if (tlb->pages != tlb->local)
169   - free_pages((unsigned long)tlb->pages, 0);
  193 + for (batch = tlb->local.next; batch; batch = next) {
  194 + next = batch->next;
  195 + free_pages((unsigned long)batch, 0);
  196 + }
  197 + tlb->local.next = NULL;
170 198 }
171 199  
172 200 /* __tlb_remove_page
173 201  
174 202  
175 203  
... ... @@ -177,15 +205,24 @@
177 205 */
178 206 static inline int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
179 207 {
  208 + struct mmu_gather_batch *batch;
  209 +
180 210 tlb->need_flush = 1;
  211 +
181 212 if (tlb_fast_mode(tlb)) {
182 213 free_page_and_swap_cache(page);
183 214 return 1; /* avoid calling tlb_flush_mmu() */
184 215 }
185   - tlb->pages[tlb->nr++] = page;
186   - VM_BUG_ON(tlb->nr > tlb->max);
187 216  
188   - return tlb->max - tlb->nr;
  217 + batch = tlb->active;
  218 + batch->pages[batch->nr++] = page;
  219 + VM_BUG_ON(batch->nr > batch->max);
  220 + if (batch->nr == batch->max) {
  221 + if (!tlb_next_batch(tlb))
  222 + return 0;
  223 + }
  224 +
  225 + return batch->max - batch->nr;
189 226 }
190 227  
191 228 /* tlb_remove_page
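
For reference, the caller side of __tlb_remove_page() sits just below this
hunk: tlb_remove_page() (unchanged context, so not shown in the diff) flushes
whenever __tlb_remove_page() returns 0, i.e. whenever the active batch filled
up and the chain could not be grown. In terms of the hypothetical sketch in
the commit message above, the driving code would look roughly like this:

/*
 * Rough analogue of tlb_remove_page() for the sketch above; the names and
 * the release callback are illustrative, not kernel API.
 */
static void gather_remove(struct gather *g, void *item, void (*release)(void *))
{
	if (!gather_add(g, item))	/* batch full and chain could not grow */
		gather_flush(g, release);
}

Flushing only when the no-sleep allocation fails (or at the final
tlb_finish_mmu()) lets the unmap path free far more pages per TLB flush than
the old single page-sized array, without ever sleeping for memory while page
tables are being torn down.
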
mm/memory.c
... ... @@ -994,8 +994,8 @@
994 994 spinlock_t *ptl;
995 995 int rss[NR_MM_COUNTERS];
996 996  
997   - init_rss_vec(rss);
998 997 again:
  998 + init_rss_vec(rss);
999 999 pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
1000 1000 arch_enter_lazy_mmu_mode();
1001 1001 do {