mm/hugetlb.c

  /*
   * Generic hugetlb support.
   * (C) William Irwin, April 2004
   */
  #include <linux/list.h>
  #include <linux/init.h>
  #include <linux/module.h>
  #include <linux/mm.h>
  #include <linux/seq_file.h>
  #include <linux/sysctl.h>
  #include <linux/highmem.h>
  #include <linux/mmu_notifier.h>
  #include <linux/nodemask.h>
  #include <linux/pagemap.h>
  #include <linux/mempolicy.h>
  #include <linux/cpuset.h>
  #include <linux/mutex.h>
  #include <linux/bootmem.h>
  #include <linux/sysfs.h>
  #include <linux/slab.h>
  #include <linux/rmap.h>
  #include <linux/swap.h>
  #include <linux/swapops.h>

  #include <asm/page.h>
  #include <asm/pgtable.h>
  #include <asm/io.h>
  
  #include <linux/hugetlb.h>
  #include <linux/node.h>
  #include "internal.h"
  
  const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
  static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
  unsigned long hugepages_treat_as_movable;

  static int max_hstate;
  unsigned int default_hstate_idx;
  struct hstate hstates[HUGE_MAX_HSTATE];
  __initdata LIST_HEAD(huge_boot_pages);
  /* for command line parsing */
  static struct hstate * __initdata parsed_hstate;
  static unsigned long __initdata default_hstate_max_huge_pages;
  static unsigned long __initdata default_hstate_size;
  
  #define for_each_hstate(h) \
  	for ((h) = hstates; (h) < &hstates[max_hstate]; (h)++)

  /*
   * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
   */
  static DEFINE_SPINLOCK(hugetlb_lock);

  /*
   * Region tracking -- allows tracking of reservations and instantiated pages
   *                    across the pages in a mapping.
   *
   * The region data structures are protected by a combination of the mmap_sem
   * and the hugetlb_instantiation_mutex.  To access or modify a region the caller
   * must either hold the mmap_sem for write, or the mmap_sem for read and
   * the hugetlb_instantiation mutex:
   *
   * 	down_write(&mm->mmap_sem);
   * or
   * 	down_read(&mm->mmap_sem);
   * 	mutex_lock(&hugetlb_instantiation_mutex);
   */
  struct file_region {
  	struct list_head link;
  	long from;
  	long to;
  };
  
  static long region_add(struct list_head *head, long f, long t)
  {
  	struct file_region *rg, *nrg, *trg;
  
  	/* Locate the region we are either in or before. */
  	list_for_each_entry(rg, head, link)
  		if (f <= rg->to)
  			break;
  
  	/* Round our left edge to the current segment if it encloses us. */
  	if (f > rg->from)
  		f = rg->from;
  
  	/* Check for and consume any regions we now overlap with. */
  	nrg = rg;
  	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
  		if (&rg->link == head)
  			break;
  		if (rg->from > t)
  			break;
  
  		/* If this area reaches higher then extend our area to
  		 * include it completely.  If this is not the first area
  		 * which we intend to reuse, free it. */
  		if (rg->to > t)
  			t = rg->to;
  		if (rg != nrg) {
  			list_del(&rg->link);
  			kfree(rg);
  		}
  	}
  	nrg->from = f;
  	nrg->to = t;
  	return 0;
  }
  
  static long region_chg(struct list_head *head, long f, long t)
  {
  	struct file_region *rg, *nrg;
  	long chg = 0;
  
  	/* Locate the region we are before or in. */
  	list_for_each_entry(rg, head, link)
  		if (f <= rg->to)
  			break;
  
  	/* If we are below the current region then a new region is required.
  	 * Subtle, allocate a new region at the position but make it zero
  	 * size such that we can guarantee to record the reservation. */
  	if (&rg->link == head || t < rg->from) {
  		nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
  		if (!nrg)
  			return -ENOMEM;
  		nrg->from = f;
  		nrg->to   = f;
  		INIT_LIST_HEAD(&nrg->link);
  		list_add(&nrg->link, rg->link.prev);
  
  		return t - f;
  	}
  
  	/* Round our left edge to the current segment if it encloses us. */
  	if (f > rg->from)
  		f = rg->from;
  	chg = t - f;
  
  	/* Check for and consume any regions we now overlap with. */
  	list_for_each_entry(rg, rg->link.prev, link) {
  		if (&rg->link == head)
  			break;
  		if (rg->from > t)
  			return chg;
  
  		/* We overlap with this area, if it extends further than
  		 * us then we must extend ourselves.  Account for its
  		 * existing reservation. */
  		if (rg->to > t) {
  			chg += rg->to - t;
  			t = rg->to;
  		}
  		chg -= rg->to - rg->from;
  	}
  	return chg;
  }
  
  static long region_truncate(struct list_head *head, long end)
  {
  	struct file_region *rg, *trg;
  	long chg = 0;
  
  	/* Locate the region we are either in or before. */
  	list_for_each_entry(rg, head, link)
  		if (end <= rg->to)
  			break;
  	if (&rg->link == head)
  		return 0;
  
  	/* If we are in the middle of a region then adjust it. */
  	if (end > rg->from) {
  		chg = rg->to - end;
  		rg->to = end;
  		rg = list_entry(rg->link.next, typeof(*rg), link);
  	}
  
  	/* Drop any remaining regions. */
  	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
  		if (&rg->link == head)
  			break;
  		chg += rg->to - rg->from;
  		list_del(&rg->link);
  		kfree(rg);
  	}
  	return chg;
  }
  static long region_count(struct list_head *head, long f, long t)
  {
  	struct file_region *rg;
  	long chg = 0;
  
  	/* Locate each segment we overlap with, and count that overlap. */
  	list_for_each_entry(rg, head, link) {
  		int seg_from;
  		int seg_to;
  
  		if (rg->to <= f)
  			continue;
  		if (rg->from >= t)
  			break;
  
  		seg_from = max(rg->from, f);
  		seg_to = min(rg->to, t);
  
  		chg += seg_to - seg_from;
  	}
  
  	return chg;
  }
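  
  /*
   * Illustrative sketch (editor's addition, not part of the original file):
   * how the region helpers above are typically used.  A caller first asks
   * region_chg() how many pages in [from, to) are not yet covered by the
   * map, charges quota / allocates for that count, and only then commits
   * the range with region_add().  The hypothetical helper below is example
   * code only and is compiled out.
   */
  #if 0	/* example only */
  static long example_reserve_range(struct list_head *regions, long from, long to)
  {
  	long chg;
  
  	chg = region_chg(regions, from, to);	/* pages not yet reserved */
  	if (chg < 0)
  		return chg;			/* -ENOMEM */
  
  	/* ... charge quota / allocate huge pages here ... */
  
  	region_add(regions, from, to);		/* commit the reservation */
  	return chg;
  }
  #endif
  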
  /*
   * Convert the address within this vma to the page offset within
   * the mapping, in pagecache page units; huge pages here.
   */
  static pgoff_t vma_hugecache_offset(struct hstate *h,
  			struct vm_area_struct *vma, unsigned long address)
  {
  	return ((address - vma->vm_start) >> huge_page_shift(h)) +
  			(vma->vm_pgoff >> huge_page_order(h));
  }
  pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
  				     unsigned long address)
  {
  	return vma_hugecache_offset(hstate_vma(vma), vma, address);
  }
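  
  /*
   * Worked example (editor's addition, not part of the original file): for a
   * 2MB hstate (huge_page_shift(h) == 21) and a vma with vm_pgoff == 0, a
   * fault at vm_start + 4MB gives
   *
   *	vma_hugecache_offset() = (4MB >> 21) + 0 = 2
   *
   * i.e. the third huge-page-sized index in the backing file's page cache.
   */
  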
  /*
   * Return the size of the pages allocated when backing a VMA. In the majority
   * of cases this will be the same size as that used by the page table entries.
   */
  unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
  {
  	struct hstate *hstate;
  
  	if (!is_vm_hugetlb_page(vma))
  		return PAGE_SIZE;
  
  	hstate = hstate_vma(vma);
  
  	return 1UL << (hstate->order + PAGE_SHIFT);
  }
  EXPORT_SYMBOL_GPL(vma_kernel_pagesize);
  
  /*
   * Return the page size being used by the MMU to back a VMA. In the majority
   * of cases, the page size used by the kernel matches the MMU size. On
   * architectures where it differs, an architecture-specific version of this
   * function is required.
   */
  #ifndef vma_mmu_pagesize
  unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
  {
  	return vma_kernel_pagesize(vma);
  }
  #endif
  
  /*
   * Flags for MAP_PRIVATE reservations.  These are stored in the bottom
   * bits of the reservation map pointer, which are always clear due to
   * alignment.
   */
  #define HPAGE_RESV_OWNER    (1UL << 0)
  #define HPAGE_RESV_UNMAPPED (1UL << 1)
  #define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)

  /*
   * These helpers are used to track how many pages are reserved for
   * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
   * is guaranteed to have its future faults succeed.
   *
   * With the exception of reset_vma_resv_huge_pages() which is called at fork(),
   * the reserve counters are updated with the hugetlb_lock held. It is safe
   * to reset the VMA at fork() time as it is not in use yet and there is no
   * chance of the global counters getting corrupted as a result of the values.
   *
   * The private mapping reservation is represented in a subtly different
   * manner to a shared mapping.  A shared mapping has a region map associated
   * with the underlying file; this region map represents the backing file
   * pages which have ever had a reservation assigned, and this persists even
   * after the page is instantiated.  A private mapping has a region map
   * associated with the original mmap which is attached to all VMAs which
   * reference it; this region map represents those offsets which have consumed
   * a reservation, i.e. where pages have been instantiated.
   */
  static unsigned long get_vma_private_data(struct vm_area_struct *vma)
  {
  	return (unsigned long)vma->vm_private_data;
  }
  
  static void set_vma_private_data(struct vm_area_struct *vma,
  							unsigned long value)
  {
  	vma->vm_private_data = (void *)value;
  }
  struct resv_map {
  	struct kref refs;
  	struct list_head regions;
  };
  static struct resv_map *resv_map_alloc(void)
  {
  	struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
  	if (!resv_map)
  		return NULL;
  
  	kref_init(&resv_map->refs);
  	INIT_LIST_HEAD(&resv_map->regions);
  
  	return resv_map;
  }
  static void resv_map_release(struct kref *ref)
  {
  	struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
  
  	/* Clear out any active regions before we release the map. */
  	region_truncate(&resv_map->regions, 0);
  	kfree(resv_map);
  }
  
  static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
  {
  	VM_BUG_ON(!is_vm_hugetlb_page(vma));
  	if (!(vma->vm_flags & VM_MAYSHARE))
  		return (struct resv_map *)(get_vma_private_data(vma) &
  							~HPAGE_RESV_MASK);
  	return NULL;
  }
  static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
  {
  	VM_BUG_ON(!is_vm_hugetlb_page(vma));
  	VM_BUG_ON(vma->vm_flags & VM_MAYSHARE);

  	set_vma_private_data(vma, (get_vma_private_data(vma) &
  				HPAGE_RESV_MASK) | (unsigned long)map);
  }
  
  static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
  {
  	VM_BUG_ON(!is_vm_hugetlb_page(vma));
  	VM_BUG_ON(vma->vm_flags & VM_MAYSHARE);
  
  	set_vma_private_data(vma, get_vma_private_data(vma) | flags);
  }
  
  static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
  {
  	VM_BUG_ON(!is_vm_hugetlb_page(vma));
  
  	return (get_vma_private_data(vma) & flag) != 0;
  }
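  
  /*
   * Illustrative example (editor's addition, not part of the original file):
   * because a struct resv_map is kmalloc()ed and therefore at least
   * pointer-aligned, the two low bits of vm_private_data are free to carry
   * HPAGE_RESV_OWNER and HPAGE_RESV_UNMAPPED alongside the resv_map pointer:
   *
   *	struct resv_map *map = resv_map_alloc();
   *
   *	set_vma_resv_map(vma, map);
   *	set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
   *
   *	vma_resv_map(vma) == map;			(pointer bits)
   *	is_vma_resv_set(vma, HPAGE_RESV_OWNER) == 1;	(flag bit)
   */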
  
  /* Decrement the reserved pages in the hugepage pool by one */
  static void decrement_hugepage_resv_vma(struct hstate *h,
  			struct vm_area_struct *vma)
  {
  	if (vma->vm_flags & VM_NORESERVE)
  		return;
  	if (vma->vm_flags & VM_MAYSHARE) {
  		/* Shared mappings always use reserves */
  		h->resv_huge_pages--;
  	} else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
  		/*
  		 * Only the process that called mmap() has reserves for
  		 * private mappings.
  		 */
  		h->resv_huge_pages--;
  	}
  }
  /* Reset counters to 0 and clear all HPAGE_RESV_* flags */
  void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
  {
  	VM_BUG_ON(!is_vm_hugetlb_page(vma));
  	if (!(vma->vm_flags & VM_MAYSHARE))
  		vma->vm_private_data = (void *)0;
  }
  
  /* Returns true if the VMA has associated reserve pages */
  static int vma_has_reserves(struct vm_area_struct *vma)
  {
  	if (vma->vm_flags & VM_MAYSHARE)
  		return 1;
  	if (is_vma_resv_set(vma, HPAGE_RESV_OWNER))
  		return 1;
  	return 0;
  }
  static void clear_gigantic_page(struct page *page,
  			unsigned long addr, unsigned long sz)
  {
  	int i;
  	struct page *p = page;
  
  	might_sleep();
  	for (i = 0; i < sz/PAGE_SIZE; i++, p = mem_map_next(p, page, i)) {
  		cond_resched();
  		clear_user_highpage(p, addr + i * PAGE_SIZE);
  	}
  }
  static void clear_huge_page(struct page *page,
  			unsigned long addr, unsigned long sz)
  {
  	int i;
  	if (unlikely(sz/PAGE_SIZE > MAX_ORDER_NR_PAGES)) {
  		clear_gigantic_page(page, addr, sz);
  		return;
  	}

  	might_sleep();
  	for (i = 0; i < sz/PAGE_SIZE; i++) {
  		cond_resched();
  		clear_user_highpage(page + i, addr + i * PAGE_SIZE);
  	}
  }
  static void copy_user_gigantic_page(struct page *dst, struct page *src,
  			   unsigned long addr, struct vm_area_struct *vma)
  {
  	int i;
  	struct hstate *h = hstate_vma(vma);
  	struct page *dst_base = dst;
  	struct page *src_base = src;

  	for (i = 0; i < pages_per_huge_page(h); ) {
  		cond_resched();
  		copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
  
  		i++;
  		dst = mem_map_next(dst, dst_base, i);
  		src = mem_map_next(src, src_base, i);
  	}
  }
  
  static void copy_user_huge_page(struct page *dst, struct page *src,
  			   unsigned long addr, struct vm_area_struct *vma)
  {
  	int i;
  	struct hstate *h = hstate_vma(vma);

  	if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
  		copy_user_gigantic_page(dst, src, addr, vma);
  		return;
  	}

  	might_sleep();
  	for (i = 0; i < pages_per_huge_page(h); i++) {
  		cond_resched();
  		copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
  	}
  }
  static void copy_gigantic_page(struct page *dst, struct page *src)
  {
  	int i;
  	struct hstate *h = page_hstate(src);
  	struct page *dst_base = dst;
  	struct page *src_base = src;
  
  	for (i = 0; i < pages_per_huge_page(h); ) {
  		cond_resched();
  		copy_highpage(dst, src);
  
  		i++;
  		dst = mem_map_next(dst, dst_base, i);
  		src = mem_map_next(src, src_base, i);
  	}
  }
  
  void copy_huge_page(struct page *dst, struct page *src)
  {
  	int i;
  	struct hstate *h = page_hstate(src);
  
  	if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
  		copy_gigantic_page(dst, src);
  		return;
  	}
  
  	might_sleep();
  	for (i = 0; i < pages_per_huge_page(h); i++) {
  		cond_resched();
  		copy_highpage(dst + i, src + i);
  	}
  }
  static void enqueue_huge_page(struct hstate *h, struct page *page)
  {
  	int nid = page_to_nid(page);
  	list_add(&page->lru, &h->hugepage_freelists[nid]);
  	h->free_huge_pages++;
  	h->free_huge_pages_node[nid]++;
  }
  static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
  {
  	struct page *page;
  
  	if (list_empty(&h->hugepage_freelists[nid]))
  		return NULL;
  	page = list_entry(h->hugepage_freelists[nid].next, struct page, lru);
  	list_del(&page->lru);
  	set_page_refcounted(page);
  	h->free_huge_pages--;
  	h->free_huge_pages_node[nid]--;
  	return page;
  }
  static struct page *dequeue_huge_page_vma(struct hstate *h,
  				struct vm_area_struct *vma,
  				unsigned long address, int avoid_reserve)
  {
  	struct page *page = NULL;
  	struct mempolicy *mpol;
  	nodemask_t *nodemask;
  	struct zonelist *zonelist;
  	struct zone *zone;
  	struct zoneref *z;

  	get_mems_allowed();
  	zonelist = huge_zonelist(vma, address,
  					htlb_alloc_mask, &mpol, &nodemask);
  	/*
  	 * A child process with MAP_PRIVATE mappings created by its parent
  	 * has no page reserves. This check ensures that reservations are
  	 * not "stolen". The child may still get SIGKILLed
  	 */
  	if (!vma_has_reserves(vma) &&
  			h->free_huge_pages - h->resv_huge_pages == 0)
  		goto err;

  	/* If reserves cannot be used, ensure enough pages are in the pool */
  	if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
  		goto err;

  	for_each_zone_zonelist_nodemask(zone, z, zonelist,
  						MAX_NR_ZONES - 1, nodemask) {
  		if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) {
  			page = dequeue_huge_page_node(h, zone_to_nid(zone));
  			if (page) {
  				if (!avoid_reserve)
  					decrement_hugepage_resv_vma(h, vma);
  				break;
  			}
  		}
  	}
  err:
  	mpol_cond_put(mpol);
  	put_mems_allowed();
  	return page;
  }
  static void update_and_free_page(struct hstate *h, struct page *page)
  {
  	int i;

  	VM_BUG_ON(h->order >= MAX_ORDER);
  	h->nr_huge_pages--;
  	h->nr_huge_pages_node[page_to_nid(page)]--;
  	for (i = 0; i < pages_per_huge_page(h); i++) {
  		page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
  				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
  				1 << PG_private | 1<< PG_writeback);
  	}
  	set_compound_page_dtor(page, NULL);
  	set_page_refcounted(page);
  	arch_release_hugepage(page);
  	__free_pages(page, huge_page_order(h));
  }
  struct hstate *size_to_hstate(unsigned long size)
  {
  	struct hstate *h;
  
  	for_each_hstate(h) {
  		if (huge_page_size(h) == size)
  			return h;
  	}
  	return NULL;
  }
  static void free_huge_page(struct page *page)
  {
  	/*
  	 * Can't pass hstate in here because it is called from the
  	 * compound page destructor.
  	 */
  	struct hstate *h = page_hstate(page);
  	int nid = page_to_nid(page);
  	struct address_space *mapping;

  	mapping = (struct address_space *) page_private(page);
  	set_page_private(page, 0);
  	page->mapping = NULL;
  	BUG_ON(page_count(page));
  	BUG_ON(page_mapcount(page));
  	INIT_LIST_HEAD(&page->lru);
  
  	spin_lock(&hugetlb_lock);
  	if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) {
  		update_and_free_page(h, page);
  		h->surplus_huge_pages--;
  		h->surplus_huge_pages_node[nid]--;
  	} else {
  		enqueue_huge_page(h, page);
  	}
  	spin_unlock(&hugetlb_lock);
  	if (mapping)
  		hugetlb_put_quota(mapping, 1);
  }
  static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
  {
  	set_compound_page_dtor(page, free_huge_page);
  	spin_lock(&hugetlb_lock);
  	h->nr_huge_pages++;
  	h->nr_huge_pages_node[nid]++;
  	spin_unlock(&hugetlb_lock);
  	put_page(page); /* free it into the hugepage allocator */
  }
  static void prep_compound_gigantic_page(struct page *page, unsigned long order)
  {
  	int i;
  	int nr_pages = 1 << order;
  	struct page *p = page + 1;
  
  	/* we rely on prep_new_huge_page to set the destructor */
  	set_compound_order(page, order);
  	__SetPageHead(page);
  	for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
  		__SetPageTail(p);
  		p->first_page = page;
  	}
  }
  
  int PageHuge(struct page *page)
  {
  	compound_page_dtor *dtor;
  
  	if (!PageCompound(page))
  		return 0;
  
  	page = compound_head(page);
  	dtor = get_compound_page_dtor(page);
  
  	return dtor == free_huge_page;
  }
  EXPORT_SYMBOL_GPL(PageHuge);
  static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
  {
  	struct page *page;

  	if (h->order >= MAX_ORDER)
  		return NULL;
  	page = alloc_pages_exact_node(nid,
  		htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
  						__GFP_REPEAT|__GFP_NOWARN,
  		huge_page_order(h));
  	if (page) {
  		if (arch_prepare_hugepage(page)) {
  			__free_pages(page, huge_page_order(h));
  			return NULL;
  		}
  		prep_new_huge_page(h, page, nid);
  	}
  
  	return page;
  }
  /*
   * common helper functions for hstate_next_node_to_{alloc|free}.
   * We may have allocated or freed a huge page based on a different
   * nodes_allowed previously, so h->next_node_to_{alloc|free} might
   * be outside of *nodes_allowed.  Ensure that we use an allowed
   * node for alloc or free.
   */
  static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
  {
  	nid = next_node(nid, *nodes_allowed);
  	if (nid == MAX_NUMNODES)
  		nid = first_node(*nodes_allowed);
  	VM_BUG_ON(nid >= MAX_NUMNODES);
  
  	return nid;
  }
  static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
  {
  	if (!node_isset(nid, *nodes_allowed))
  		nid = next_node_allowed(nid, nodes_allowed);
  	return nid;
  }
  /*
   * returns the previously saved node ["this node"] from which to
   * allocate a persistent huge page for the pool and advance the
   * next node from which to allocate, handling wrap at end of node
   * mask.
   */
  static int hstate_next_node_to_alloc(struct hstate *h,
  					nodemask_t *nodes_allowed)
  {
  	int nid;
  
  	VM_BUG_ON(!nodes_allowed);
  
  	nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
  	h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);

  	return nid;
  }
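  
  /*
   * Worked example (editor's addition, not part of the original file): with
   * nodes_allowed = {0,2} and h->next_nid_to_alloc == 1, the helpers above
   * return node 2 for the current allocation and advance next_nid_to_alloc
   * to 0, interleaving persistent huge pages across the allowed nodes.
   */
  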
  static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
  {
  	struct page *page;
  	int start_nid;
  	int next_nid;
  	int ret = 0;
  	start_nid = hstate_next_node_to_alloc(h, nodes_allowed);
  	next_nid = start_nid;
  
  	do {
  		page = alloc_fresh_huge_page_node(h, next_nid);
  		if (page) {
  			ret = 1;
  			break;
  		}
  		next_nid = hstate_next_node_to_alloc(h, nodes_allowed);
  	} while (next_nid != start_nid);

  	if (ret)
  		count_vm_event(HTLB_BUDDY_PGALLOC);
  	else
  		count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
  	return ret;
  }
  /*
   * helper for free_pool_huge_page() - return the previously saved
   * node ["this node"] from which to free a huge page.  Advance the
   * next node id whether or not we find a free huge page to free so
   * that the next attempt to free addresses the next node.
   */
  static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
  {
  	int nid;
  
  	VM_BUG_ON(!nodes_allowed);
  
  	nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
  	h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);

  	return nid;
  }
  
  /*
   * Free huge page from pool from next node to free.
   * Attempt to keep persistent huge pages more or less
   * balanced over allowed nodes.
   * Called with hugetlb_lock locked.
   */
  static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
  							 bool acct_surplus)
  {
  	int start_nid;
  	int next_nid;
  	int ret = 0;
  	start_nid = hstate_next_node_to_free(h, nodes_allowed);
  	next_nid = start_nid;
  
  	do {
  		/*
  		 * If we're returning unused surplus pages, only examine
  		 * nodes with surplus pages.
  		 */
  		if ((!acct_surplus || h->surplus_huge_pages_node[next_nid]) &&
  		    !list_empty(&h->hugepage_freelists[next_nid])) {
  			struct page *page =
  				list_entry(h->hugepage_freelists[next_nid].next,
  					  struct page, lru);
  			list_del(&page->lru);
  			h->free_huge_pages--;
  			h->free_huge_pages_node[next_nid]--;
  			if (acct_surplus) {
  				h->surplus_huge_pages--;
  				h->surplus_huge_pages_node[next_nid]--;
  			}
  			update_and_free_page(h, page);
  			ret = 1;
  			break;
  		}
  		next_nid = hstate_next_node_to_free(h, nodes_allowed);
  	} while (next_nid != start_nid);
  
  	return ret;
  }
  static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
  {
  	struct page *page;
  	unsigned int r_nid;

  	if (h->order >= MAX_ORDER)
  		return NULL;
  	/*
  	 * Assume we will successfully allocate the surplus page to
  	 * prevent racing processes from causing the surplus to exceed
  	 * overcommit
  	 *
  	 * This however introduces a different race, where a process B
  	 * tries to grow the static hugepage pool while alloc_pages() is
  	 * called by process A. B will only examine the per-node
  	 * counters in determining if surplus huge pages can be
  	 * converted to normal huge pages in adjust_pool_surplus(). A
  	 * won't be able to increment the per-node counter, until the
  	 * lock is dropped by B, but B doesn't drop hugetlb_lock until
  	 * no more huge pages can be converted from surplus to normal
  	 * state (and doesn't try to convert again). Thus, we have a
  	 * case where a surplus huge page exists, the pool is grown, and
  	 * the surplus huge page still exists after, even though it
  	 * should just have been converted to a normal huge page. This
  	 * does not leak memory, though, as the hugepage will be freed
  	 * once it is out of use. It also does not allow the counters to
  	 * go out of whack in adjust_pool_surplus() as we don't modify
  	 * the node values until we've gotten the hugepage and only the
  	 * per-node value is checked there.
  	 */
  	spin_lock(&hugetlb_lock);
  	if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
  		spin_unlock(&hugetlb_lock);
  		return NULL;
  	} else {
  		h->nr_huge_pages++;
  		h->surplus_huge_pages++;
  	}
  	spin_unlock(&hugetlb_lock);
  	if (nid == NUMA_NO_NODE)
  		page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
  				   __GFP_REPEAT|__GFP_NOWARN,
  				   huge_page_order(h));
  	else
  		page = alloc_pages_exact_node(nid,
  			htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
  			__GFP_REPEAT|__GFP_NOWARN, huge_page_order(h));

  	if (page && arch_prepare_hugepage(page)) {
  		__free_pages(page, huge_page_order(h));
  		return NULL;
  	}
  	spin_lock(&hugetlb_lock);
  	if (page) {
  		r_nid = page_to_nid(page);
  		set_compound_page_dtor(page, free_huge_page);
  		/*
  		 * We incremented the global counters already
  		 */
  		h->nr_huge_pages_node[r_nid]++;
  		h->surplus_huge_pages_node[r_nid]++;
  		__count_vm_event(HTLB_BUDDY_PGALLOC);
  	} else {
  		h->nr_huge_pages--;
  		h->surplus_huge_pages--;
  		__count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
  	}
  	spin_unlock(&hugetlb_lock);
  
  	return page;
  }
  /*
   * This allocation function is useful in the context where vma is irrelevant.
   * E.g. soft-offlining uses this function because it only cares about the
   * physical address of the error page.
   */
  struct page *alloc_huge_page_node(struct hstate *h, int nid)
  {
  	struct page *page;
  
  	spin_lock(&hugetlb_lock);
  	page = dequeue_huge_page_node(h, nid);
  	spin_unlock(&hugetlb_lock);
  
  	if (!page)
  		page = alloc_buddy_huge_page(h, nid);
  
  	return page;
  }
  
  /*
   * Increase the hugetlb pool such that it can accommodate a reservation
   * of size 'delta'.
   */
  static int gather_surplus_pages(struct hstate *h, int delta)
  {
  	struct list_head surplus_list;
  	struct page *page, *tmp;
  	int ret, i;
  	int needed, allocated;
  	needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
  	if (needed <= 0) {
  		h->resv_huge_pages += delta;
  		return 0;
  	}
  
  	allocated = 0;
  	INIT_LIST_HEAD(&surplus_list);
  
  	ret = -ENOMEM;
  retry:
  	spin_unlock(&hugetlb_lock);
  	for (i = 0; i < needed; i++) {
  		page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
  		if (!page)
  			/*
  			 * We were not able to allocate enough pages to
  			 * satisfy the entire reservation so we free what
  			 * we've allocated so far.
  			 */
  			goto free;
  
  		list_add(&page->lru, &surplus_list);
  	}
  	allocated += needed;
  
  	/*
  	 * After retaking hugetlb_lock, we need to recalculate 'needed'
  	 * because either resv_huge_pages or free_huge_pages may have changed.
  	 */
  	spin_lock(&hugetlb_lock);
  	needed = (h->resv_huge_pages + delta) -
  			(h->free_huge_pages + allocated);
  	if (needed > 0)
  		goto retry;
  
  	/*
  	 * The surplus_list now contains _at_least_ the number of extra pages
  	 * needed to accommodate the reservation.  Add the appropriate number
  	 * of pages to the hugetlb pool and free the extras back to the buddy
  	 * allocator.  Commit the entire reservation here to prevent another
  	 * process from stealing the pages as they are added to the pool but
  	 * before they are reserved.
  	 */
  	needed += allocated;
  	h->resv_huge_pages += delta;
  	ret = 0;
  
  	spin_unlock(&hugetlb_lock);
  	/* Free the needed pages to the hugetlb pool */
  	list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
  		if ((--needed) < 0)
  			break;
  		list_del(&page->lru);
  		/*
  		 * This page is now managed by the hugetlb allocator and has
  		 * no users -- drop the buddy allocator's reference.
  		 */
  		put_page_testzero(page);
  		VM_BUG_ON(page_count(page));
  		enqueue_huge_page(h, page);
  	}
  
  	/* Free unnecessary surplus pages to the buddy allocator */
  free:
  	if (!list_empty(&surplus_list)) {
  		list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
  			list_del(&page->lru);
  			put_page(page);
  		}
  	}
  	spin_lock(&hugetlb_lock);
  
  	return ret;
  }
  
  /*
   * When releasing a hugetlb pool reservation, any surplus pages that were
   * allocated to satisfy the reservation must be explicitly freed if they were
   * never used.
   * Called with hugetlb_lock held.
   */
  static void return_unused_surplus_pages(struct hstate *h,
  					unsigned long unused_resv_pages)
  {
  	unsigned long nr_pages;
  	/* Uncommit the reservation */
  	h->resv_huge_pages -= unused_resv_pages;

  	/* Cannot return gigantic pages currently */
  	if (h->order >= MAX_ORDER)
  		return;
  	nr_pages = min(unused_resv_pages, h->surplus_huge_pages);

  	/*
  	 * We want to release as many surplus pages as possible, spread
  	 * evenly across all nodes with memory. Iterate across these nodes
  	 * until we can no longer free unreserved surplus pages. This occurs
  	 * when the nodes with surplus pages have no free pages.
  	 * free_pool_huge_page() will balance the freed pages across the
  	 * on-line nodes with memory and will handle the hstate accounting.
  	 */
  	while (nr_pages--) {
  		if (!free_pool_huge_page(h, &node_states[N_HIGH_MEMORY], 1))
  			break;
  	}
  }
  /*
   * Determine if the huge page at addr within the vma has an associated
   * reservation.  Where it does not we will need to logically increase
   * reservation and actually increase quota before an allocation can occur.
   * Where any new reservation would be required the reservation change is
   * prepared, but not committed.  Once the page has been quota'd, allocated
   * and instantiated, the change should be committed via vma_commit_reservation.
   * No action is required on failure.
   */
  static long vma_needs_reservation(struct hstate *h,
  			struct vm_area_struct *vma, unsigned long addr)
  {
  	struct address_space *mapping = vma->vm_file->f_mapping;
  	struct inode *inode = mapping->host;
  	if (vma->vm_flags & VM_MAYSHARE) {
  		pgoff_t idx = vma_hugecache_offset(h, vma, addr);
  		return region_chg(&inode->i_mapping->private_list,
  							idx, idx + 1);
  	} else if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
  		return 1;

  	} else  {
  		long err;
  		pgoff_t idx = vma_hugecache_offset(h, vma, addr);
  		struct resv_map *reservations = vma_resv_map(vma);
  
  		err = region_chg(&reservations->regions, idx, idx + 1);
  		if (err < 0)
  			return err;
  		return 0;
  	}
  }
  static void vma_commit_reservation(struct hstate *h,
  			struct vm_area_struct *vma, unsigned long addr)
  {
  	struct address_space *mapping = vma->vm_file->f_mapping;
  	struct inode *inode = mapping->host;
  	if (vma->vm_flags & VM_MAYSHARE) {
  		pgoff_t idx = vma_hugecache_offset(h, vma, addr);
  		region_add(&inode->i_mapping->private_list, idx, idx + 1);
  
  	} else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
  		pgoff_t idx = vma_hugecache_offset(h, vma, addr);
  		struct resv_map *reservations = vma_resv_map(vma);
  
  		/* Mark this page used in the map. */
  		region_add(&reservations->regions, idx, idx + 1);
  	}
  }
  static struct page *alloc_huge_page(struct vm_area_struct *vma,
  				    unsigned long addr, int avoid_reserve)
  {
  	struct hstate *h = hstate_vma(vma);
  	struct page *page;
  	struct address_space *mapping = vma->vm_file->f_mapping;
  	struct inode *inode = mapping->host;
  	long chg;
  
  	/*
  	 * Processes that did not create the mapping will have no reserves and
  	 * will not have accounted against quota. Check that the quota can be
  	 * made before satisfying the allocation.
  	 * MAP_NORESERVE mappings may also need pages and quota allocated
  	 * if no reserve mapping overlaps.
  	 */
  	chg = vma_needs_reservation(h, vma, addr);
  	if (chg < 0)
  		return ERR_PTR(chg);
  	if (chg)
  		if (hugetlb_get_quota(inode->i_mapping, chg))
  			return ERR_PTR(-ENOSPC);
  
  	spin_lock(&hugetlb_lock);
  	page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve);
  	spin_unlock(&hugetlb_lock);

  	if (!page) {
  		page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
  		if (!page) {
  			hugetlb_put_quota(inode->i_mapping, chg);
  			return ERR_PTR(-VM_FAULT_SIGBUS);
  		}
  	}

  	set_page_private(page, (unsigned long) mapping);

  	vma_commit_reservation(h, vma, addr);

  	return page;
  }
  int __weak alloc_bootmem_huge_page(struct hstate *h)
  {
  	struct huge_bootmem_page *m;
  	int nr_nodes = nodes_weight(node_states[N_HIGH_MEMORY]);
  
  	while (nr_nodes) {
  		void *addr;
  
  		addr = __alloc_bootmem_node_nopanic(
  				NODE_DATA(hstate_next_node_to_alloc(h,
  						&node_states[N_HIGH_MEMORY])),
  				huge_page_size(h), huge_page_size(h), 0);
  
  		if (addr) {
  			/*
  			 * Use the beginning of the huge page to store the
  			 * huge_bootmem_page struct (until gather_bootmem
  			 * puts them into the mem_map).
  			 */
  			m = addr;
  			goto found;
  		}
  		nr_nodes--;
  	}
  	return 0;
  
  found:
  	BUG_ON((unsigned long)virt_to_phys(m) & (huge_page_size(h) - 1));
  	/* Put them into a private list first because mem_map is not up yet */
  	list_add(&m->list, &huge_boot_pages);
  	m->hstate = h;
  	return 1;
  }
18229df5b   Andy Whitcroft   hugetlb: pull gig...
1108
1109
1110
1111
1112
1113
1114
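  /*
   * Initialize the compound page metadata for a new huge page.  Pages of
   * order >= MAX_ORDER need the gigantic variant, which does not assume
   * that the tail struct pages are contiguous in the mem_map.
   */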
  static void prep_compound_huge_page(struct page *page, int order)
  {
  	if (unlikely(order > (MAX_ORDER - 1)))
  		prep_compound_gigantic_page(page, order);
  	else
  		prep_compound_page(page, order);
  }
aa888a749   Andi Kleen   hugetlb: support ...
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
  /* Put bootmem huge pages into the standard lists after mem_map is up */
  static void __init gather_bootmem_prealloc(void)
  {
  	struct huge_bootmem_page *m;
  
  	list_for_each_entry(m, &huge_boot_pages, list) {
  		struct page *page = virt_to_page(m);
  		struct hstate *h = m->hstate;
  		__ClearPageReserved(page);
  		WARN_ON(page_count(page) != 1);
18229df5b   Andy Whitcroft   hugetlb: pull gig...
1125
  		prep_compound_huge_page(page, h->order);
aa888a749   Andi Kleen   hugetlb: support ...
1126
1127
1128
  		prep_new_huge_page(h, page, page_to_nid(page));
  	}
  }
8faa8b077   Andi Kleen   hugetlb: support ...
1129
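  /*
   * Pre-allocate h->max_huge_pages huge pages at boot.  Gigantic pages
   * must come from bootmem; everything else comes from the buddy
   * allocator.  If allocation stops early, max_huge_pages is trimmed to
   * the number of pages actually obtained.
   */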
  static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1130
1131
  {
  	unsigned long i;
a55164389   Andi Kleen   hugetlb: modular ...
1132

e5ff21594   Andi Kleen   hugetlb: multiple...
1133
  	for (i = 0; i < h->max_huge_pages; ++i) {
aa888a749   Andi Kleen   hugetlb: support ...
1134
1135
1136
  		if (h->order >= MAX_ORDER) {
  			if (!alloc_bootmem_huge_page(h))
  				break;
9b5e5d0fd   Lee Schermerhorn   hugetlb: use only...
1137
1138
  		} else if (!alloc_fresh_huge_page(h,
  					 &node_states[N_HIGH_MEMORY]))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1139
  			break;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1140
  	}
8faa8b077   Andi Kleen   hugetlb: support ...
1141
  	h->max_huge_pages = i;
e5ff21594   Andi Kleen   hugetlb: multiple...
1142
1143
1144
1145
1146
1147
1148
  }
  
  static void __init hugetlb_init_hstates(void)
  {
  	struct hstate *h;
  
  	for_each_hstate(h) {
8faa8b077   Andi Kleen   hugetlb: support ...
1149
1150
1151
  		/* oversize hugepages were init'ed in early boot */
  		if (h->order < MAX_ORDER)
  			hugetlb_hstate_alloc_pages(h);
e5ff21594   Andi Kleen   hugetlb: multiple...
1152
1153
  	}
  }
4abd32dba   Andi Kleen   hugetlb: printk c...
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
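  /* Format a size in bytes as a human-readable GB/MB/KB string. */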
  static char * __init memfmt(char *buf, unsigned long n)
  {
  	if (n >= (1UL << 30))
  		sprintf(buf, "%lu GB", n >> 30);
  	else if (n >= (1UL << 20))
  		sprintf(buf, "%lu MB", n >> 20);
  	else
  		sprintf(buf, "%lu KB", n >> 10);
  	return buf;
  }
e5ff21594   Andi Kleen   hugetlb: multiple...
1164
1165
1166
1167
1168
  static void __init report_hugepages(void)
  {
  	struct hstate *h;
  
  	for_each_hstate(h) {
4abd32dba   Andi Kleen   hugetlb: printk c...
1169
1170
1171
1172
1173
1174
  		char buf[32];
  		printk(KERN_INFO "HugeTLB registered %s page size, "
  				 "pre-allocated %ld pages
  ",
  			memfmt(buf, huge_page_size(h)),
  			h->free_huge_pages);
e5ff21594   Andi Kleen   hugetlb: multiple...
1175
1176
  	}
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1177
  #ifdef CONFIG_HIGHMEM
6ae11b278   Lee Schermerhorn   hugetlb: add node...
1178
1179
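  /*
   * When shrinking the pool with CONFIG_HIGHMEM, prefer to release huge
   * pages that live in lowmem, since lowmem is the scarcer resource.
   * Stops once the pool is down to 'count' pages; gigantic pages are
   * never freed this way.
   */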
  static void try_to_free_low(struct hstate *h, unsigned long count,
  						nodemask_t *nodes_allowed)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1180
  {
4415cc8df   Christoph Lameter   [PATCH] Hugepages...
1181
  	int i;
aa888a749   Andi Kleen   hugetlb: support ...
1182
1183
  	if (h->order >= MAX_ORDER)
  		return;
6ae11b278   Lee Schermerhorn   hugetlb: add node...
1184
  	for_each_node_mask(i, *nodes_allowed) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1185
  		struct page *page, *next;
a55164389   Andi Kleen   hugetlb: modular ...
1186
1187
1188
  		struct list_head *freel = &h->hugepage_freelists[i];
  		list_for_each_entry_safe(page, next, freel, lru) {
  			if (count >= h->nr_huge_pages)
6b0c880df   Adam Litke   hugetlb: fix pool...
1189
  				return;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1190
1191
1192
  			if (PageHighMem(page))
  				continue;
  			list_del(&page->lru);
e5ff21594   Andi Kleen   hugetlb: multiple...
1193
  			update_and_free_page(h, page);
a55164389   Andi Kleen   hugetlb: modular ...
1194
1195
  			h->free_huge_pages--;
  			h->free_huge_pages_node[page_to_nid(page)]--;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1196
1197
1198
1199
  		}
  	}
  }
  #else
6ae11b278   Lee Schermerhorn   hugetlb: add node...
1200
1201
  static inline void try_to_free_low(struct hstate *h, unsigned long count,
  						nodemask_t *nodes_allowed)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1202
1203
1204
  {
  }
  #endif
20a0307c0   Wu Fengguang   mm: introduce Pag...
1205
1206
1207
1208
1209
  /*
   * Increment or decrement surplus_huge_pages.  Keep node-specific counters
   * balanced by operating on them in a round-robin fashion.
   * Returns 1 if an adjustment was made.
   */
6ae11b278   Lee Schermerhorn   hugetlb: add node...
1210
1211
  static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
  				int delta)
20a0307c0   Wu Fengguang   mm: introduce Pag...
1212
  {
e8c5c8249   Lee Schermerhorn   hugetlb: balance ...
1213
  	int start_nid, next_nid;
20a0307c0   Wu Fengguang   mm: introduce Pag...
1214
1215
1216
  	int ret = 0;
  
  	VM_BUG_ON(delta != -1 && delta != 1);
20a0307c0   Wu Fengguang   mm: introduce Pag...
1217

e8c5c8249   Lee Schermerhorn   hugetlb: balance ...
1218
  	if (delta < 0)
6ae11b278   Lee Schermerhorn   hugetlb: add node...
1219
  		start_nid = hstate_next_node_to_alloc(h, nodes_allowed);
e8c5c8249   Lee Schermerhorn   hugetlb: balance ...
1220
  	else
6ae11b278   Lee Schermerhorn   hugetlb: add node...
1221
  		start_nid = hstate_next_node_to_free(h, nodes_allowed);
e8c5c8249   Lee Schermerhorn   hugetlb: balance ...
1222
1223
1224
1225
1226
  	next_nid = start_nid;
  
  	do {
  		int nid = next_nid;
  		if (delta < 0)  {
e8c5c8249   Lee Schermerhorn   hugetlb: balance ...
1227
1228
1229
  			/*
  			 * To shrink on this node, there must be a surplus page
  			 */
9a76db099   Lee Schermerhorn   hugetlb: rework h...
1230
  			if (!h->surplus_huge_pages_node[nid]) {
6ae11b278   Lee Schermerhorn   hugetlb: add node...
1231
1232
  				next_nid = hstate_next_node_to_alloc(h,
  								nodes_allowed);
e8c5c8249   Lee Schermerhorn   hugetlb: balance ...
1233
  				continue;
9a76db099   Lee Schermerhorn   hugetlb: rework h...
1234
  			}
e8c5c8249   Lee Schermerhorn   hugetlb: balance ...
1235
1236
  		}
  		if (delta > 0) {
e8c5c8249   Lee Schermerhorn   hugetlb: balance ...
1237
1238
1239
1240
  			/*
  			 * Surplus cannot exceed the total number of pages
  			 */
  			if (h->surplus_huge_pages_node[nid] >=
9a76db099   Lee Schermerhorn   hugetlb: rework h...
1241
  						h->nr_huge_pages_node[nid]) {
6ae11b278   Lee Schermerhorn   hugetlb: add node...
1242
1243
  				next_nid = hstate_next_node_to_free(h,
  								nodes_allowed);
e8c5c8249   Lee Schermerhorn   hugetlb: balance ...
1244
  				continue;
9a76db099   Lee Schermerhorn   hugetlb: rework h...
1245
  			}
e8c5c8249   Lee Schermerhorn   hugetlb: balance ...
1246
  		}
20a0307c0   Wu Fengguang   mm: introduce Pag...
1247
1248
1249
1250
1251
  
  		h->surplus_huge_pages += delta;
  		h->surplus_huge_pages_node[nid] += delta;
  		ret = 1;
  		break;
e8c5c8249   Lee Schermerhorn   hugetlb: balance ...
1252
  	} while (next_nid != start_nid);
20a0307c0   Wu Fengguang   mm: introduce Pag...
1253

20a0307c0   Wu Fengguang   mm: introduce Pag...
1254
1255
  	return ret;
  }
a55164389   Andi Kleen   hugetlb: modular ...
1256
  #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
6ae11b278   Lee Schermerhorn   hugetlb: add node...
1257
1258
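  /*
   * Resize the persistent huge page pool to 'count' pages, allocating
   * from and freeing to the nodes in 'nodes_allowed'.  Surplus pages are
   * converted to or from persistent pages first, and reserved pages are
   * never released.  Returns the resulting number of persistent pages.
   */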
  static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
  						nodemask_t *nodes_allowed)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1259
  {
7893d1d50   Adam Litke   hugetlb: Try to g...
1260
  	unsigned long min_count, ret;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1261

aa888a749   Andi Kleen   hugetlb: support ...
1262
1263
  	if (h->order >= MAX_ORDER)
  		return h->max_huge_pages;
7893d1d50   Adam Litke   hugetlb: Try to g...
1264
1265
1266
1267
  	/*
  	 * Increase the pool size
  	 * First take pages out of surplus state.  Then make up the
  	 * remaining difference by allocating fresh huge pages.
d1c3fb1f8   Nishanth Aravamudan   hugetlb: introduc...
1268
1269
1270
1271
1272
1273
  	 *
  	 * We might race with alloc_buddy_huge_page() here and be unable
  	 * to convert a surplus huge page to a normal huge page. That is
  	 * not critical, though; it just means the overall size of the
  	 * pool might be one hugepage larger than it needs to be, but
  	 * within all the constraints specified by the sysctls.
7893d1d50   Adam Litke   hugetlb: Try to g...
1274
  	 */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1275
  	spin_lock(&hugetlb_lock);
a55164389   Andi Kleen   hugetlb: modular ...
1276
  	while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
6ae11b278   Lee Schermerhorn   hugetlb: add node...
1277
  		if (!adjust_pool_surplus(h, nodes_allowed, -1))
7893d1d50   Adam Litke   hugetlb: Try to g...
1278
1279
  			break;
  	}
a55164389   Andi Kleen   hugetlb: modular ...
1280
  	while (count > persistent_huge_pages(h)) {
7893d1d50   Adam Litke   hugetlb: Try to g...
1281
1282
1283
1284
1285
1286
  		/*
  		 * If this allocation races such that we no longer need the
  		 * page, free_huge_page will handle it by freeing the page
  		 * and reducing the surplus.
  		 */
  		spin_unlock(&hugetlb_lock);
6ae11b278   Lee Schermerhorn   hugetlb: add node...
1287
  		ret = alloc_fresh_huge_page(h, nodes_allowed);
7893d1d50   Adam Litke   hugetlb: Try to g...
1288
1289
1290
  		spin_lock(&hugetlb_lock);
  		if (!ret)
  			goto out;
536240f2b   Mel Gorman   hugetlb: abort a ...
1291
1292
1293
  		/* Bail for signals. Probably ctrl-c from user */
  		if (signal_pending(current))
  			goto out;
7893d1d50   Adam Litke   hugetlb: Try to g...
1294
  	}
7893d1d50   Adam Litke   hugetlb: Try to g...
1295
1296
1297
1298
1299
1300
1301
  
  	/*
  	 * Decrease the pool size
  	 * First return free pages to the buddy allocator (being careful
  	 * to keep enough around to satisfy reservations).  Then place
  	 * pages into surplus state as needed so the pool will shrink
  	 * to the desired size as pages become free.
d1c3fb1f8   Nishanth Aravamudan   hugetlb: introduc...
1302
1303
1304
1305
1306
1307
1308
1309
  	 *
  	 * By placing pages into the surplus state independent of the
  	 * overcommit value, we are allowing the surplus pool size to
  	 * exceed overcommit. There are few sane options here. Since
  	 * alloc_buddy_huge_page() is checking the global counter,
  	 * though, we'll note that we're not allowed to exceed surplus
  	 * and won't grow the pool anywhere else. Not until one of the
  	 * sysctls is changed, or the surplus pages go out of use.
7893d1d50   Adam Litke   hugetlb: Try to g...
1310
  	 */
a55164389   Andi Kleen   hugetlb: modular ...
1311
  	min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
6b0c880df   Adam Litke   hugetlb: fix pool...
1312
  	min_count = max(count, min_count);
6ae11b278   Lee Schermerhorn   hugetlb: add node...
1313
  	try_to_free_low(h, min_count, nodes_allowed);
a55164389   Andi Kleen   hugetlb: modular ...
1314
  	while (min_count < persistent_huge_pages(h)) {
6ae11b278   Lee Schermerhorn   hugetlb: add node...
1315
  		if (!free_pool_huge_page(h, nodes_allowed, 0))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1316
  			break;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1317
  	}
a55164389   Andi Kleen   hugetlb: modular ...
1318
  	while (count < persistent_huge_pages(h)) {
6ae11b278   Lee Schermerhorn   hugetlb: add node...
1319
  		if (!adjust_pool_surplus(h, nodes_allowed, 1))
7893d1d50   Adam Litke   hugetlb: Try to g...
1320
1321
1322
  			break;
  	}
  out:
a55164389   Andi Kleen   hugetlb: modular ...
1323
  	ret = persistent_huge_pages(h);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1324
  	spin_unlock(&hugetlb_lock);
7893d1d50   Adam Litke   hugetlb: Try to g...
1325
  	return ret;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1326
  }
a34378701   Nishanth Aravamudan   hugetlb: new sysf...
1327
1328
1329
1330
1331
1332
1333
1334
1335
  #define HSTATE_ATTR_RO(_name) \
  	static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
  
  #define HSTATE_ATTR(_name) \
  	static struct kobj_attribute _name##_attr = \
  		__ATTR(_name, 0644, _name##_show, _name##_store)
  
  static struct kobject *hugepages_kobj;
  static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
9a3052306   Lee Schermerhorn   hugetlb: add per ...
1336
1337
1338
  static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp);
  
  static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp)
a34378701   Nishanth Aravamudan   hugetlb: new sysf...
1339
1340
  {
  	int i;
9a3052306   Lee Schermerhorn   hugetlb: add per ...
1341

a34378701   Nishanth Aravamudan   hugetlb: new sysf...
1342
  	for (i = 0; i < HUGE_MAX_HSTATE; i++)
9a3052306   Lee Schermerhorn   hugetlb: add per ...
1343
1344
1345
  		if (hstate_kobjs[i] == kobj) {
  			if (nidp)
  				*nidp = NUMA_NO_NODE;
a34378701   Nishanth Aravamudan   hugetlb: new sysf...
1346
  			return &hstates[i];
9a3052306   Lee Schermerhorn   hugetlb: add per ...
1347
1348
1349
  		}
  
  	return kobj_to_node_hstate(kobj, nidp);
a34378701   Nishanth Aravamudan   hugetlb: new sysf...
1350
  }
06808b082   Lee Schermerhorn   hugetlb: derive h...
1351
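  /*
   * Show the huge page count behind a nr_hugepages attribute: the global
   * count for the /sys/kernel/mm/hugepages files, the per-node count for
   * the copies that hang off a node's sysdev.
   */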
  static ssize_t nr_hugepages_show_common(struct kobject *kobj,
a34378701   Nishanth Aravamudan   hugetlb: new sysf...
1352
1353
  					struct kobj_attribute *attr, char *buf)
  {
9a3052306   Lee Schermerhorn   hugetlb: add per ...
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
  	struct hstate *h;
  	unsigned long nr_huge_pages;
  	int nid;
  
  	h = kobj_to_hstate(kobj, &nid);
  	if (nid == NUMA_NO_NODE)
  		nr_huge_pages = h->nr_huge_pages;
  	else
  		nr_huge_pages = h->nr_huge_pages_node[nid];
  
  	return sprintf(buf, "%lu
  ", nr_huge_pages);
a34378701   Nishanth Aravamudan   hugetlb: new sysf...
1366
  }
06808b082   Lee Schermerhorn   hugetlb: derive h...
1367
1368
1369
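  /*
   * Common write handler for nr_hugepages and nr_hugepages_mempolicy,
   * shared by the global and per-node sysfs attributes.  Parses the new
   * count, works out which nodes may be touched (the whole system, the
   * caller's mempolicy nodes, or a single node) and resizes the pool.
   */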
  static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
  			struct kobject *kobj, struct kobj_attribute *attr,
  			const char *buf, size_t len)
a34378701   Nishanth Aravamudan   hugetlb: new sysf...
1370
1371
  {
  	int err;
9a3052306   Lee Schermerhorn   hugetlb: add per ...
1372
  	int nid;
06808b082   Lee Schermerhorn   hugetlb: derive h...
1373
  	unsigned long count;
9a3052306   Lee Schermerhorn   hugetlb: add per ...
1374
  	struct hstate *h;
bad44b5be   David Rientjes   mm: add gfp flags...
1375
  	NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY);
a34378701   Nishanth Aravamudan   hugetlb: new sysf...
1376

06808b082   Lee Schermerhorn   hugetlb: derive h...
1377
  	err = strict_strtoul(buf, 10, &count);
a34378701   Nishanth Aravamudan   hugetlb: new sysf...
1378
1379
  	if (err)
  		return 0;
9a3052306   Lee Schermerhorn   hugetlb: add per ...
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
  	h = kobj_to_hstate(kobj, &nid);
  	if (nid == NUMA_NO_NODE) {
  		/*
  		 * global hstate attribute
  		 */
  		if (!(obey_mempolicy &&
  				init_nodemask_of_mempolicy(nodes_allowed))) {
  			NODEMASK_FREE(nodes_allowed);
  			nodes_allowed = &node_states[N_HIGH_MEMORY];
  		}
  	} else if (nodes_allowed) {
  		/*
  		 * per node hstate attribute: adjust count to global,
  		 * but restrict alloc/free to the specified node.
  		 */
  		count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
  		init_nodemask_of_node(nodes_allowed, nid);
  	} else
  		nodes_allowed = &node_states[N_HIGH_MEMORY];
06808b082   Lee Schermerhorn   hugetlb: derive h...
1399
  	h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed);
a34378701   Nishanth Aravamudan   hugetlb: new sysf...
1400

9b5e5d0fd   Lee Schermerhorn   hugetlb: use only...
1401
  	if (nodes_allowed != &node_states[N_HIGH_MEMORY])
06808b082   Lee Schermerhorn   hugetlb: derive h...
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
  		NODEMASK_FREE(nodes_allowed);
  
  	return len;
  }
  
  static ssize_t nr_hugepages_show(struct kobject *kobj,
  				       struct kobj_attribute *attr, char *buf)
  {
  	return nr_hugepages_show_common(kobj, attr, buf);
  }
  
  static ssize_t nr_hugepages_store(struct kobject *kobj,
  	       struct kobj_attribute *attr, const char *buf, size_t len)
  {
  	return nr_hugepages_store_common(false, kobj, attr, buf, len);
a34378701   Nishanth Aravamudan   hugetlb: new sysf...
1417
1418
  }
  HSTATE_ATTR(nr_hugepages);
06808b082   Lee Schermerhorn   hugetlb: derive h...
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
  #ifdef CONFIG_NUMA
  
  /*
   * hstate attribute for optionally mempolicy-based constraint on persistent
   * huge page alloc/free.
   */
  static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj,
  				       struct kobj_attribute *attr, char *buf)
  {
  	return nr_hugepages_show_common(kobj, attr, buf);
  }
  
  static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj,
  	       struct kobj_attribute *attr, const char *buf, size_t len)
  {
  	return nr_hugepages_store_common(true, kobj, attr, buf, len);
  }
  HSTATE_ATTR(nr_hugepages_mempolicy);
  #endif
a34378701   Nishanth Aravamudan   hugetlb: new sysf...
1438
1439
1440
  static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
  					struct kobj_attribute *attr, char *buf)
  {
9a3052306   Lee Schermerhorn   hugetlb: add per ...
1441
  	struct hstate *h = kobj_to_hstate(kobj, NULL);
a34378701   Nishanth Aravamudan   hugetlb: new sysf...
1442
1443
1444
1445
1446
1447
1448
1449
  	return sprintf(buf, "%lu
  ", h->nr_overcommit_huge_pages);
  }
  static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
  		struct kobj_attribute *attr, const char *buf, size_t count)
  {
  	int err;
  	unsigned long input;
9a3052306   Lee Schermerhorn   hugetlb: add per ...
1450
  	struct hstate *h = kobj_to_hstate(kobj, NULL);
a34378701   Nishanth Aravamudan   hugetlb: new sysf...
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
  
  	err = strict_strtoul(buf, 10, &input);
  	if (err)
  		return 0;
  
  	spin_lock(&hugetlb_lock);
  	h->nr_overcommit_huge_pages = input;
  	spin_unlock(&hugetlb_lock);
  
  	return count;
  }
  HSTATE_ATTR(nr_overcommit_hugepages);
  
  static ssize_t free_hugepages_show(struct kobject *kobj,
  					struct kobj_attribute *attr, char *buf)
  {
9a3052306   Lee Schermerhorn   hugetlb: add per ...
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
  	struct hstate *h;
  	unsigned long free_huge_pages;
  	int nid;
  
  	h = kobj_to_hstate(kobj, &nid);
  	if (nid == NUMA_NO_NODE)
  		free_huge_pages = h->free_huge_pages;
  	else
  		free_huge_pages = h->free_huge_pages_node[nid];
  
  	return sprintf(buf, "%lu
  ", free_huge_pages);
a34378701   Nishanth Aravamudan   hugetlb: new sysf...
1479
1480
1481
1482
1483
1484
  }
  HSTATE_ATTR_RO(free_hugepages);
  
  static ssize_t resv_hugepages_show(struct kobject *kobj,
  					struct kobj_attribute *attr, char *buf)
  {
9a3052306   Lee Schermerhorn   hugetlb: add per ...
1485
  	struct hstate *h = kobj_to_hstate(kobj, NULL);
a34378701   Nishanth Aravamudan   hugetlb: new sysf...
1486
1487
1488
1489
1490
1491
1492
1493
  	return sprintf(buf, "%lu
  ", h->resv_huge_pages);
  }
  HSTATE_ATTR_RO(resv_hugepages);
  
  static ssize_t surplus_hugepages_show(struct kobject *kobj,
  					struct kobj_attribute *attr, char *buf)
  {
9a3052306   Lee Schermerhorn   hugetlb: add per ...
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
  	struct hstate *h;
  	unsigned long surplus_huge_pages;
  	int nid;
  
  	h = kobj_to_hstate(kobj, &nid);
  	if (nid == NUMA_NO_NODE)
  		surplus_huge_pages = h->surplus_huge_pages;
  	else
  		surplus_huge_pages = h->surplus_huge_pages_node[nid];
  
  	return sprintf(buf, "%lu
  ", surplus_huge_pages);
a34378701   Nishanth Aravamudan   hugetlb: new sysf...
1506
1507
1508
1509
1510
1511
1512
1513
1514
  }
  HSTATE_ATTR_RO(surplus_hugepages);
  
  static struct attribute *hstate_attrs[] = {
  	&nr_hugepages_attr.attr,
  	&nr_overcommit_hugepages_attr.attr,
  	&free_hugepages_attr.attr,
  	&resv_hugepages_attr.attr,
  	&surplus_hugepages_attr.attr,
06808b082   Lee Schermerhorn   hugetlb: derive h...
1515
1516
1517
  #ifdef CONFIG_NUMA
  	&nr_hugepages_mempolicy_attr.attr,
  #endif
a34378701   Nishanth Aravamudan   hugetlb: new sysf...
1518
1519
1520
1521
1522
1523
  	NULL,
  };
  
  static struct attribute_group hstate_attr_group = {
  	.attrs = hstate_attrs,
  };
094e9539b   Jeff Mahoney   hugetlb: fix sect...
1524
1525
1526
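  /*
   * Create a "hugepages-<size>kB" kobject under 'parent' and populate it
   * with the given attribute group.  The kobject is remembered in the
   * supplied hstate_kobjs[] array so attributes can later be mapped back
   * to their hstate.
   */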
  static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
  				    struct kobject **hstate_kobjs,
  				    struct attribute_group *hstate_attr_group)
a34378701   Nishanth Aravamudan   hugetlb: new sysf...
1527
1528
  {
  	int retval;
9a3052306   Lee Schermerhorn   hugetlb: add per ...
1529
  	int hi = h - hstates;
a34378701   Nishanth Aravamudan   hugetlb: new sysf...
1530

9a3052306   Lee Schermerhorn   hugetlb: add per ...
1531
1532
  	hstate_kobjs[hi] = kobject_create_and_add(h->name, parent);
  	if (!hstate_kobjs[hi])
a34378701   Nishanth Aravamudan   hugetlb: new sysf...
1533
  		return -ENOMEM;
9a3052306   Lee Schermerhorn   hugetlb: add per ...
1534
  	retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group);
a34378701   Nishanth Aravamudan   hugetlb: new sysf...
1535
  	if (retval)
9a3052306   Lee Schermerhorn   hugetlb: add per ...
1536
  		kobject_put(hstate_kobjs[hi]);
a34378701   Nishanth Aravamudan   hugetlb: new sysf...
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
  
  	return retval;
  }
  
  static void __init hugetlb_sysfs_init(void)
  {
  	struct hstate *h;
  	int err;
  
  	hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj);
  	if (!hugepages_kobj)
  		return;
  
  	for_each_hstate(h) {
9a3052306   Lee Schermerhorn   hugetlb: add per ...
1551
1552
  		err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
  					 hstate_kobjs, &hstate_attr_group);
a34378701   Nishanth Aravamudan   hugetlb: new sysf...
1553
1554
1555
1556
1557
  		if (err)
  			printk(KERN_ERR "Hugetlb: Unable to add hstate %s",
  								h->name);
  	}
  }
9a3052306   Lee Schermerhorn   hugetlb: add per ...
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
  #ifdef CONFIG_NUMA
  
  /*
   * node_hstate/s - associate per node hstate attributes, via their kobjects,
   * with node sysdevs in node_devices[] using a parallel array.  The array
   * index of a node sysdev or _hstate == node id.
   * This is here to avoid any static dependency of the node sysdev driver, in
   * the base kernel, on the hugetlb module.
   */
  struct node_hstate {
  	struct kobject		*hugepages_kobj;
  	struct kobject		*hstate_kobjs[HUGE_MAX_HSTATE];
  };
  struct node_hstate node_hstates[MAX_NUMNODES];
  
  /*
   * A subset of global hstate attributes for node sysdevs
   */
  static struct attribute *per_node_hstate_attrs[] = {
  	&nr_hugepages_attr.attr,
  	&free_hugepages_attr.attr,
  	&surplus_hugepages_attr.attr,
  	NULL,
  };
  
  static struct attribute_group per_node_hstate_attr_group = {
  	.attrs = per_node_hstate_attrs,
  };
  
  /*
   * kobj_to_node_hstate - lookup global hstate for node sysdev hstate attr kobj.
   * Returns node id via non-NULL nidp.
   */
  static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
  {
  	int nid;
  
  	for (nid = 0; nid < nr_node_ids; nid++) {
  		struct node_hstate *nhs = &node_hstates[nid];
  		int i;
  		for (i = 0; i < HUGE_MAX_HSTATE; i++)
  			if (nhs->hstate_kobjs[i] == kobj) {
  				if (nidp)
  					*nidp = nid;
  				return &hstates[i];
  			}
  	}
  
  	BUG();
  	return NULL;
  }
  
  /*
   * Unregister hstate attributes from a single node sysdev.
   * No-op if no hstate attributes attached.
   */
  void hugetlb_unregister_node(struct node *node)
  {
  	struct hstate *h;
  	struct node_hstate *nhs = &node_hstates[node->sysdev.id];
  
  	if (!nhs->hugepages_kobj)
9b5e5d0fd   Lee Schermerhorn   hugetlb: use only...
1620
  		return;		/* no hstate attributes */
9a3052306   Lee Schermerhorn   hugetlb: add per ...
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
  
  	for_each_hstate(h)
  		if (nhs->hstate_kobjs[h - hstates]) {
  			kobject_put(nhs->hstate_kobjs[h - hstates]);
  			nhs->hstate_kobjs[h - hstates] = NULL;
  		}
  
  	kobject_put(nhs->hugepages_kobj);
  	nhs->hugepages_kobj = NULL;
  }
  
  /*
   * hugetlb module exit:  unregister hstate attributes from node sysdevs
   * that have them.
   */
  static void hugetlb_unregister_all_nodes(void)
  {
  	int nid;
  
  	/*
  	 * disable node sysdev registrations.
  	 */
  	register_hugetlbfs_with_node(NULL, NULL);
  
  	/*
  	 * remove hstate attributes from any nodes that have them.
  	 */
  	for (nid = 0; nid < nr_node_ids; nid++)
  		hugetlb_unregister_node(&node_devices[nid]);
  }
  
  /*
   * Register hstate attributes for a single node sysdev.
   * No-op if attributes already registered.
   */
  void hugetlb_register_node(struct node *node)
  {
  	struct hstate *h;
  	struct node_hstate *nhs = &node_hstates[node->sysdev.id];
  	int err;
  
  	if (nhs->hugepages_kobj)
  		return;		/* already allocated */
  
  	nhs->hugepages_kobj = kobject_create_and_add("hugepages",
  							&node->sysdev.kobj);
  	if (!nhs->hugepages_kobj)
  		return;
  
  	for_each_hstate(h) {
  		err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj,
  						nhs->hstate_kobjs,
  						&per_node_hstate_attr_group);
  		if (err) {
  			printk(KERN_ERR "Hugetlb: Unable to add hstate %s"
  					" for node %d
  ",
  						h->name, node->sysdev.id);
  			hugetlb_unregister_node(node);
  			break;
  		}
  	}
  }
  
  /*
9b5e5d0fd   Lee Schermerhorn   hugetlb: use only...
1686
1687
1688
   * hugetlb init time:  register hstate attributes for all registered node
   * sysdevs of nodes that have memory.  All on-line nodes should have
   * registered their associated sysdev by this time.
9a3052306   Lee Schermerhorn   hugetlb: add per ...
1689
1690
1691
1692
   */
  static void hugetlb_register_all_nodes(void)
  {
  	int nid;
9b5e5d0fd   Lee Schermerhorn   hugetlb: use only...
1693
  	for_each_node_state(nid, N_HIGH_MEMORY) {
9a3052306   Lee Schermerhorn   hugetlb: add per ...
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
  		struct node *node = &node_devices[nid];
  		if (node->sysdev.id == nid)
  			hugetlb_register_node(node);
  	}
  
  	/*
  	 * Let the node sysdev driver know we're here so it can
  	 * [un]register hstate attributes on node hotplug.
  	 */
  	register_hugetlbfs_with_node(hugetlb_register_node,
  				     hugetlb_unregister_node);
  }
  #else	/* !CONFIG_NUMA */
  
  static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
  {
  	BUG();
  	if (nidp)
  		*nidp = -1;
  	return NULL;
  }
  
  static void hugetlb_unregister_all_nodes(void) { }
  
  static void hugetlb_register_all_nodes(void) { }
  
  #endif
a34378701   Nishanth Aravamudan   hugetlb: new sysf...
1721
1722
1723
  static void __exit hugetlb_exit(void)
  {
  	struct hstate *h;
9a3052306   Lee Schermerhorn   hugetlb: add per ...
1724
  	hugetlb_unregister_all_nodes();
a34378701   Nishanth Aravamudan   hugetlb: new sysf...
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
  	for_each_hstate(h) {
  		kobject_put(hstate_kobjs[h - hstates]);
  	}
  
  	kobject_put(hugepages_kobj);
  }
  module_exit(hugetlb_exit);
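
  /*
   * Module init: pick the default hstate, pre-allocate the boot-time
   * pools, fold any bootmem-allocated gigantic pages into the free
   * lists, print the per-hstate summary and register the sysfs
   * interface (including the per-node attributes).
   */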
  
  static int __init hugetlb_init(void)
  {
0ef89d25d   Benjamin Herrenschmidt   mm/hugetlb: don't...
1735
1736
1737
1738
1739
1740
  	/* Some platforms decide whether they support huge pages at boot
  	 * time. On these, such as powerpc, HPAGE_SHIFT is set to 0 when
  	 * there is no such support.
  	 */
  	if (HPAGE_SHIFT == 0)
  		return 0;
a34378701   Nishanth Aravamudan   hugetlb: new sysf...
1741

e11bfbfcb   Nick Piggin   hugetlb: override...
1742
1743
1744
1745
  	if (!size_to_hstate(default_hstate_size)) {
  		default_hstate_size = HPAGE_SIZE;
  		if (!size_to_hstate(default_hstate_size))
  			hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
a34378701   Nishanth Aravamudan   hugetlb: new sysf...
1746
  	}
e11bfbfcb   Nick Piggin   hugetlb: override...
1747
1748
1749
  	default_hstate_idx = size_to_hstate(default_hstate_size) - hstates;
  	if (default_hstate_max_huge_pages)
  		default_hstate.max_huge_pages = default_hstate_max_huge_pages;
a34378701   Nishanth Aravamudan   hugetlb: new sysf...
1750
1751
  
  	hugetlb_init_hstates();
aa888a749   Andi Kleen   hugetlb: support ...
1752
  	gather_bootmem_prealloc();
a34378701   Nishanth Aravamudan   hugetlb: new sysf...
1753
1754
1755
  	report_hugepages();
  
  	hugetlb_sysfs_init();
9a3052306   Lee Schermerhorn   hugetlb: add per ...
1756
  	hugetlb_register_all_nodes();
a34378701   Nishanth Aravamudan   hugetlb: new sysf...
1757
1758
1759
1760
1761
1762
1763
1764
  	return 0;
  }
  module_init(hugetlb_init);
  
  /* Should be called on processing a hugepagesz=... option */
  void __init hugetlb_add_hstate(unsigned order)
  {
  	struct hstate *h;
8faa8b077   Andi Kleen   hugetlb: support ...
1765
  	unsigned long i;
a34378701   Nishanth Aravamudan   hugetlb: new sysf...
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
  	if (size_to_hstate(PAGE_SIZE << order)) {
  		printk(KERN_WARNING "hugepagesz= specified twice, ignoring
  ");
  		return;
  	}
  	BUG_ON(max_hstate >= HUGE_MAX_HSTATE);
  	BUG_ON(order == 0);
  	h = &hstates[max_hstate++];
  	h->order = order;
  	h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
8faa8b077   Andi Kleen   hugetlb: support ...
1776
1777
1778
1779
  	h->nr_huge_pages = 0;
  	h->free_huge_pages = 0;
  	for (i = 0; i < MAX_NUMNODES; ++i)
  		INIT_LIST_HEAD(&h->hugepage_freelists[i]);
9b5e5d0fd   Lee Schermerhorn   hugetlb: use only...
1780
1781
  	h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]);
  	h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]);
a34378701   Nishanth Aravamudan   hugetlb: new sysf...
1782
1783
  	snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
  					huge_page_size(h)/1024);
8faa8b077   Andi Kleen   hugetlb: support ...
1784

a34378701   Nishanth Aravamudan   hugetlb: new sysf...
1785
1786
  	parsed_hstate = h;
  }
e11bfbfcb   Nick Piggin   hugetlb: override...
1787
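  /*
   * Parse a "hugepages=N" boot parameter.  The count applies to the
   * default hstate unless a hugepagesz= option was seen first, in which
   * case it applies to that hstate.  Gigantic hstates are allocated
   * right here, while the bootmem allocator is still usable.
   */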
  static int __init hugetlb_nrpages_setup(char *s)
a34378701   Nishanth Aravamudan   hugetlb: new sysf...
1788
1789
  {
  	unsigned long *mhp;
8faa8b077   Andi Kleen   hugetlb: support ...
1790
  	static unsigned long *last_mhp;
a34378701   Nishanth Aravamudan   hugetlb: new sysf...
1791
1792
1793
1794
1795
1796
1797
1798
1799
  
  	/*
  	 * !max_hstate means we haven't parsed a hugepagesz= parameter yet,
  	 * so this hugepages= parameter goes to the "default hstate".
  	 */
  	if (!max_hstate)
  		mhp = &default_hstate_max_huge_pages;
  	else
  		mhp = &parsed_hstate->max_huge_pages;
8faa8b077   Andi Kleen   hugetlb: support ...
1800
1801
1802
1803
1804
1805
  	if (mhp == last_mhp) {
  		printk(KERN_WARNING "hugepages= specified twice without "
  			"interleaving hugepagesz=, ignoring
  ");
  		return 1;
  	}
a34378701   Nishanth Aravamudan   hugetlb: new sysf...
1806
1807
  	if (sscanf(s, "%lu", mhp) <= 0)
  		*mhp = 0;
8faa8b077   Andi Kleen   hugetlb: support ...
1808
1809
1810
1811
1812
1813
1814
1815
1816
  	/*
  	 * Global state is always initialized later in hugetlb_init.
  	 * But we need to allocate >= MAX_ORDER hstates here early to still
  	 * use the bootmem allocator.
  	 */
  	if (max_hstate && parsed_hstate->order >= MAX_ORDER)
  		hugetlb_hstate_alloc_pages(parsed_hstate);
  
  	last_mhp = mhp;
a34378701   Nishanth Aravamudan   hugetlb: new sysf...
1817
1818
  	return 1;
  }
e11bfbfcb   Nick Piggin   hugetlb: override...
1819
1820
1821
1822
1823
1824
1825
1826
  __setup("hugepages=", hugetlb_nrpages_setup);
  
  static int __init hugetlb_default_setup(char *s)
  {
  	default_hstate_size = memparse(s, &s);
  	return 1;
  }
  __setup("default_hugepagesz=", hugetlb_default_setup);
a34378701   Nishanth Aravamudan   hugetlb: new sysf...
1827

8a2134605   Nishanth Aravamudan   hugetlb: fix CONF...
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
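  /* Sum a per-node counter array over the nodes in the current cpuset. */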
  static unsigned int cpuset_mems_nr(unsigned int *array)
  {
  	int node;
  	unsigned int nr = 0;
  
  	for_each_node_mask(node, cpuset_current_mems_allowed)
  		nr += array[node];
  
  	return nr;
  }
  
  #ifdef CONFIG_SYSCTL
06808b082   Lee Schermerhorn   hugetlb: derive h...
1840
1841
1842
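  /*
   * Back the vm.nr_hugepages and vm.nr_hugepages_mempolicy sysctls.
   * Reads report the default hstate's pool size; writes resize it,
   * optionally restricted to the nodes of the caller's mempolicy.
   */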
  static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
  			 struct ctl_table *table, int write,
  			 void __user *buffer, size_t *length, loff_t *ppos)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1843
  {
e5ff21594   Andi Kleen   hugetlb: multiple...
1844
1845
1846
1847
1848
1849
1850
1851
  	struct hstate *h = &default_hstate;
  	unsigned long tmp;
  
  	if (!write)
  		tmp = h->max_huge_pages;
  
  	table->data = &tmp;
  	table->maxlen = sizeof(unsigned long);
8d65af789   Alexey Dobriyan   sysctl: remove "s...
1852
  	proc_doulongvec_minmax(table, write, buffer, length, ppos);
e5ff21594   Andi Kleen   hugetlb: multiple...
1853

06808b082   Lee Schermerhorn   hugetlb: derive h...
1854
  	if (write) {
bad44b5be   David Rientjes   mm: add gfp flags...
1855
1856
  		NODEMASK_ALLOC(nodemask_t, nodes_allowed,
  						GFP_KERNEL | __GFP_NORETRY);
06808b082   Lee Schermerhorn   hugetlb: derive h...
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
  		if (!(obey_mempolicy &&
  			       init_nodemask_of_mempolicy(nodes_allowed))) {
  			NODEMASK_FREE(nodes_allowed);
  			nodes_allowed = &node_states[N_HIGH_MEMORY];
  		}
  		h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed);
  
  		if (nodes_allowed != &node_states[N_HIGH_MEMORY])
  			NODEMASK_FREE(nodes_allowed);
  	}
e5ff21594   Andi Kleen   hugetlb: multiple...
1867

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1868
1869
  	return 0;
  }
396faf030   Mel Gorman   Allow huge page a...
1870

06808b082   Lee Schermerhorn   hugetlb: derive h...
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
  int hugetlb_sysctl_handler(struct ctl_table *table, int write,
  			  void __user *buffer, size_t *length, loff_t *ppos)
  {
  
  	return hugetlb_sysctl_handler_common(false, table, write,
  							buffer, length, ppos);
  }
  
  #ifdef CONFIG_NUMA
  int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write,
  			  void __user *buffer, size_t *length, loff_t *ppos)
  {
  	return hugetlb_sysctl_handler_common(true, table, write,
  							buffer, length, ppos);
  }
  #endif /* CONFIG_NUMA */
396faf030   Mel Gorman   Allow huge page a...
1887
  int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
8d65af789   Alexey Dobriyan   sysctl: remove "s...
1888
  			void __user *buffer,
396faf030   Mel Gorman   Allow huge page a...
1889
1890
  			size_t *length, loff_t *ppos)
  {
8d65af789   Alexey Dobriyan   sysctl: remove "s...
1891
  	proc_dointvec(table, write, buffer, length, ppos);
396faf030   Mel Gorman   Allow huge page a...
1892
1893
1894
1895
1896
1897
  	if (hugepages_treat_as_movable)
  		htlb_alloc_mask = GFP_HIGHUSER_MOVABLE;
  	else
  		htlb_alloc_mask = GFP_HIGHUSER;
  	return 0;
  }
a3d0c6aa1   Nishanth Aravamudan   hugetlb: add lock...
1898
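  /*
   * Back the vm.nr_overcommit_hugepages sysctl: the number of surplus
   * huge pages the default hstate may allocate on demand.
   */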
  int hugetlb_overcommit_handler(struct ctl_table *table, int write,
8d65af789   Alexey Dobriyan   sysctl: remove "s...
1899
  			void __user *buffer,
a3d0c6aa1   Nishanth Aravamudan   hugetlb: add lock...
1900
1901
  			size_t *length, loff_t *ppos)
  {
a55164389   Andi Kleen   hugetlb: modular ...
1902
  	struct hstate *h = &default_hstate;
e5ff21594   Andi Kleen   hugetlb: multiple...
1903
1904
1905
1906
1907
1908
1909
  	unsigned long tmp;
  
  	if (!write)
  		tmp = h->nr_overcommit_huge_pages;
  
  	table->data = &tmp;
  	table->maxlen = sizeof(unsigned long);
8d65af789   Alexey Dobriyan   sysctl: remove "s...
1910
  	proc_doulongvec_minmax(table, write, buffer, length, ppos);
e5ff21594   Andi Kleen   hugetlb: multiple...
1911
1912
1913
1914
1915
1916
  
  	if (write) {
  		spin_lock(&hugetlb_lock);
  		h->nr_overcommit_huge_pages = tmp;
  		spin_unlock(&hugetlb_lock);
  	}
a3d0c6aa1   Nishanth Aravamudan   hugetlb: add lock...
1917
1918
  	return 0;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1919
  #endif /* CONFIG_SYSCTL */
e1759c215   Alexey Dobriyan   proc: switch /pro...
1920
  void hugetlb_report_meminfo(struct seq_file *m)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1921
  {
a55164389   Andi Kleen   hugetlb: modular ...
1922
  	struct hstate *h = &default_hstate;
e1759c215   Alexey Dobriyan   proc: switch /pro...
1923
  	seq_printf(m,
4f98a2fee   Rik van Riel   vmscan: split LRU...
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
  			"HugePages_Total:   %5lu
  "
  			"HugePages_Free:    %5lu
  "
  			"HugePages_Rsvd:    %5lu
  "
  			"HugePages_Surp:    %5lu
  "
  			"Hugepagesize:   %8lu kB
  ",
a55164389   Andi Kleen   hugetlb: modular ...
1934
1935
1936
1937
1938
  			h->nr_huge_pages,
  			h->free_huge_pages,
  			h->resv_huge_pages,
  			h->surplus_huge_pages,
  			1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1939
1940
1941
1942
  }
  
  int hugetlb_report_node_meminfo(int nid, char *buf)
  {
a55164389   Andi Kleen   hugetlb: modular ...
1943
  	struct hstate *h = &default_hstate;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1944
1945
1946
  	return sprintf(buf,
  		"Node %d HugePages_Total: %5u
  "
a1de09195   Nishanth Aravamudan   hugetlb: indicate...
1947
1948
1949
1950
  		"Node %d HugePages_Free:  %5u
  "
  		"Node %d HugePages_Surp:  %5u
  ",
a55164389   Andi Kleen   hugetlb: modular ...
1951
1952
1953
  		nid, h->nr_huge_pages_node[nid],
  		nid, h->free_huge_pages_node[nid],
  		nid, h->surplus_huge_pages_node[nid]);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1954
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1955
1956
1957
  /* Return the number of pages of memory we physically have, in PAGE_SIZE units. */
  unsigned long hugetlb_total_pages(void)
  {
a55164389   Andi Kleen   hugetlb: modular ...
1958
1959
  	struct hstate *h = &default_hstate;
  	return h->nr_huge_pages * pages_per_huge_page(h);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1960
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1961

a55164389   Andi Kleen   hugetlb: modular ...
1962
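  /*
   * Charge (delta > 0) or uncharge (delta < 0) huge page reservations
   * against the pool, growing the surplus pool when the pages on hand
   * are not enough.  Returns 0 on success or -ENOMEM.
   */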
  static int hugetlb_acct_memory(struct hstate *h, long delta)
fc1b8a73d   Mel Gorman   hugetlb: move hug...
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
  {
  	int ret = -ENOMEM;
  
  	spin_lock(&hugetlb_lock);
  	/*
  	 * When cpuset is configured, it breaks the strict hugetlb page
  	 * reservation as the accounting is done on a global variable. Such
  	 * reservation is completely rubbish in the presence of cpuset because
  	 * the reservation is not checked against page availability for the
  	 * current cpuset. The application can still potentially be OOM'ed by
  	 * the kernel for lack of a free hugetlb page in the cpuset the task
  	 * is in. Attempting to enforce strict accounting with cpuset is
  	 * almost impossible (or too ugly) because cpusets are so fluid that
  	 * tasks and memory nodes can be dynamically moved between them.
  	 *
  	 * The change of semantics for shared hugetlb mapping with cpuset is
  	 * undesirable. However, in order to preserve some of the semantics,
  	 * we fall back to check against current free page availability as
  	 * a best attempt and hopefully to minimize the impact of changing
  	 * semantics that cpuset has.
  	 */
  	if (delta > 0) {
a55164389   Andi Kleen   hugetlb: modular ...
1985
  		if (gather_surplus_pages(h, delta) < 0)
fc1b8a73d   Mel Gorman   hugetlb: move hug...
1986
  			goto out;
a55164389   Andi Kleen   hugetlb: modular ...
1987
1988
  		if (delta > cpuset_mems_nr(h->free_huge_pages_node)) {
  			return_unused_surplus_pages(h, delta);
fc1b8a73d   Mel Gorman   hugetlb: move hug...
1989
1990
1991
1992
1993
1994
  			goto out;
  		}
  	}
  
  	ret = 0;
  	if (delta < 0)
a55164389   Andi Kleen   hugetlb: modular ...
1995
  		return_unused_surplus_pages(h, (unsigned long) -delta);
fc1b8a73d   Mel Gorman   hugetlb: move hug...
1996
1997
1998
1999
2000
  
  out:
  	spin_unlock(&hugetlb_lock);
  	return ret;
  }
84afd99b8   Andy Whitcroft   hugetlb reservati...
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
  static void hugetlb_vm_op_open(struct vm_area_struct *vma)
  {
  	struct resv_map *reservations = vma_resv_map(vma);
  
  	/*
  	 * This new VMA should share its sibling's reservation map if present.
  	 * The VMA will only ever have a valid reservation map pointer where
  	 * it is being copied for another still existing VMA.  As that VMA
  	 * has a reference to the reservation map it cannot disappear until
  	 * after this open call completes.  It is therefore safe to take a
  	 * new reference here without additional locking.
  	 */
  	if (reservations)
  		kref_get(&reservations->refs);
  }
a1e78772d   Mel Gorman   hugetlb: reserve ...
2016
2017
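  /*
   * VMA close: drop this mapping's reference on the private reserve map
   * and give any still-unconsumed reservation back to the pool and the
   * hugetlbfs quota.
   */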
  static void hugetlb_vm_op_close(struct vm_area_struct *vma)
  {
a55164389   Andi Kleen   hugetlb: modular ...
2018
  	struct hstate *h = hstate_vma(vma);
84afd99b8   Andy Whitcroft   hugetlb reservati...
2019
2020
2021
2022
2023
2024
  	struct resv_map *reservations = vma_resv_map(vma);
  	unsigned long reserve;
  	unsigned long start;
  	unsigned long end;
  
  	if (reservations) {
a55164389   Andi Kleen   hugetlb: modular ...
2025
2026
  		start = vma_hugecache_offset(h, vma, vma->vm_start);
  		end = vma_hugecache_offset(h, vma, vma->vm_end);
84afd99b8   Andy Whitcroft   hugetlb reservati...
2027
2028
2029
2030
2031
  
  		reserve = (end - start) -
  			region_count(&reservations->regions, start, end);
  
  		kref_put(&reservations->refs, resv_map_release);
7251ff78b   Adam Litke   hugetlb: quota is...
2032
  		if (reserve) {
a55164389   Andi Kleen   hugetlb: modular ...
2033
  			hugetlb_acct_memory(h, -reserve);
7251ff78b   Adam Litke   hugetlb: quota is...
2034
2035
  			hugetlb_put_quota(vma->vm_file->f_mapping, reserve);
  		}
84afd99b8   Andy Whitcroft   hugetlb reservati...
2036
  	}
a1e78772d   Mel Gorman   hugetlb: reserve ...
2037
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2038
2039
2040
2041
2042
2043
  /*
   * We cannot handle pagefaults against hugetlb pages at all.  They cause
   * handle_mm_fault() to try to instantiate regular-sized pages in the
   * hugepage VMA.  do_page_fault() is supposed to trap this, so BUG if we get
   * this far.
   */
d0217ac04   Nick Piggin   mm: fault feedbac...
2044
  static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2045
2046
  {
  	BUG();
d0217ac04   Nick Piggin   mm: fault feedbac...
2047
  	return 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2048
  }
f0f37e2f7   Alexey Dobriyan   const: mark struc...
2049
  const struct vm_operations_struct hugetlb_vm_ops = {
d0217ac04   Nick Piggin   mm: fault feedbac...
2050
  	.fault = hugetlb_vm_op_fault,
84afd99b8   Andy Whitcroft   hugetlb reservati...
2051
  	.open = hugetlb_vm_op_open,
a1e78772d   Mel Gorman   hugetlb: reserve ...
2052
  	.close = hugetlb_vm_op_close,
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2053
  };
1e8f889b1   David Gibson   [PATCH] Hugetlb: ...
2054
2055
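  /*
   * Build the huge PTE for a newly mapped page: writable mappings get a
   * dirty, writable entry; read-only mappings get a write-protected one.
   * The entry is always marked young and huge.
   */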
  static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
  				int writable)
63551ae0f   David Gibson   [PATCH] Hugepage ...
2056
2057
  {
  	pte_t entry;
1e8f889b1   David Gibson   [PATCH] Hugetlb: ...
2058
  	if (writable) {
63551ae0f   David Gibson   [PATCH] Hugepage ...
2059
2060
2061
  		entry =
  		    pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
  	} else {
7f2e9525b   Gerald Schaefer   hugetlbfs: common...
2062
  		entry = huge_pte_wrprotect(mk_pte(page, vma->vm_page_prot));
63551ae0f   David Gibson   [PATCH] Hugepage ...
2063
2064
2065
2066
2067
2068
  	}
  	entry = pte_mkyoung(entry);
  	entry = pte_mkhuge(entry);
  
  	return entry;
  }
1e8f889b1   David Gibson   [PATCH] Hugetlb: ...
2069
2070
2071
2072
  static void set_huge_ptep_writable(struct vm_area_struct *vma,
  				   unsigned long address, pte_t *ptep)
  {
  	pte_t entry;
7f2e9525b   Gerald Schaefer   hugetlbfs: common...
2073
2074
  	entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep)));
  	if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) {
4b3073e1c   Russell King   MM: Pass a PTE po...
2075
  		update_mmu_cache(vma, address, ptep);
8dab5241d   Benjamin Herrenschmidt   Rework ptep_set_a...
2076
  	}
1e8f889b1   David Gibson   [PATCH] Hugetlb: ...
2077
  }
63551ae0f   David Gibson   [PATCH] Hugepage ...
2078
2079
2080
2081
2082
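  /*
   * Duplicate the huge PTEs of a VMA at fork time.  For private,
   * writable mappings the parent's entries are write-protected first so
   * that a later write in either process goes through hugetlb_cow().
   */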
  int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
  			    struct vm_area_struct *vma)
  {
  	pte_t *src_pte, *dst_pte, entry;
  	struct page *ptepage;
1c59827d1   Hugh Dickins   [PATCH] mm: huget...
2083
  	unsigned long addr;
1e8f889b1   David Gibson   [PATCH] Hugetlb: ...
2084
  	int cow;
a55164389   Andi Kleen   hugetlb: modular ...
2085
2086
  	struct hstate *h = hstate_vma(vma);
  	unsigned long sz = huge_page_size(h);
1e8f889b1   David Gibson   [PATCH] Hugetlb: ...
2087
2088
  
  	cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
63551ae0f   David Gibson   [PATCH] Hugepage ...
2089

a55164389   Andi Kleen   hugetlb: modular ...
2090
  	for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
c74df32c7   Hugh Dickins   [PATCH] mm: ptd_a...
2091
2092
2093
  		src_pte = huge_pte_offset(src, addr);
  		if (!src_pte)
  			continue;
a55164389   Andi Kleen   hugetlb: modular ...
2094
  		dst_pte = huge_pte_alloc(dst, addr, sz);
63551ae0f   David Gibson   [PATCH] Hugepage ...
2095
2096
  		if (!dst_pte)
  			goto nomem;
c5c99429f   Larry Woodman   fix hugepages lea...
2097
2098
2099
2100
  
  		/* If the pagetables are shared don't copy or take references */
  		if (dst_pte == src_pte)
  			continue;
c74df32c7   Hugh Dickins   [PATCH] mm: ptd_a...
2101
  		spin_lock(&dst->page_table_lock);
464787581   Nick Piggin   hugetlb: fix lock...
2102
  		spin_lock_nested(&src->page_table_lock, SINGLE_DEPTH_NESTING);
7f2e9525b   Gerald Schaefer   hugetlbfs: common...
2103
  		if (!huge_pte_none(huge_ptep_get(src_pte))) {
1e8f889b1   David Gibson   [PATCH] Hugetlb: ...
2104
  			if (cow)
7f2e9525b   Gerald Schaefer   hugetlbfs: common...
2105
2106
  				huge_ptep_set_wrprotect(src, addr, src_pte);
  			entry = huge_ptep_get(src_pte);
1c59827d1   Hugh Dickins   [PATCH] mm: huget...
2107
2108
  			ptepage = pte_page(entry);
  			get_page(ptepage);
0fe6e20b9   Naoya Horiguchi   hugetlb, rmap: ad...
2109
  			page_dup_rmap(ptepage);
1c59827d1   Hugh Dickins   [PATCH] mm: huget...
2110
2111
2112
  			set_huge_pte_at(dst, addr, dst_pte, entry);
  		}
  		spin_unlock(&src->page_table_lock);
c74df32c7   Hugh Dickins   [PATCH] mm: ptd_a...
2113
  		spin_unlock(&dst->page_table_lock);
63551ae0f   David Gibson   [PATCH] Hugepage ...
2114
2115
2116
2117
2118
2119
  	}
  	return 0;
  
  nomem:
  	return -ENOMEM;
  }
290408d4a   Naoya Horiguchi   hugetlb: hugepage...
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
  static int is_hugetlb_entry_migration(pte_t pte)
  {
  	swp_entry_t swp;
  
  	if (huge_pte_none(pte) || pte_present(pte))
  		return 0;
  	swp = pte_to_swp_entry(pte);
  	if (non_swap_entry(swp) && is_migration_entry(swp)) {
  		return 1;
  	} else
  		return 0;
  }
fd6a03edd   Naoya Horiguchi   HWPOISON, hugetlb...
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
  static int is_hugetlb_entry_hwpoisoned(pte_t pte)
  {
  	swp_entry_t swp;
  
  	if (huge_pte_none(pte) || pte_present(pte))
  		return 0;
  	swp = pte_to_swp_entry(pte);
  	if (non_swap_entry(swp) && is_hwpoison_entry(swp)) {
  		return 1;
  	} else
  		return 0;
  }
502717f4e   Kenneth W Chen   [PATCH] hugetlb: ...
2144
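  /*
   * Tear down the huge PTEs in [start, end); the caller must hold the
   * mapping's i_mmap_lock.  Pages are collected on a local list and only
   * have their rmap and reference dropped after the TLB flush.  If
   * ref_page is supplied, only that particular page is unmapped.
   */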
  void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
04f2cbe35   Mel Gorman   hugetlb: guarante...
2145
  			    unsigned long end, struct page *ref_page)
63551ae0f   David Gibson   [PATCH] Hugepage ...
2146
2147
2148
  {
  	struct mm_struct *mm = vma->vm_mm;
  	unsigned long address;
c7546f8f0   David Gibson   [PATCH] Fix hugep...
2149
  	pte_t *ptep;
63551ae0f   David Gibson   [PATCH] Hugepage ...
2150
2151
  	pte_t pte;
  	struct page *page;
fe1668ae5   Kenneth W Chen   [PATCH] enforce p...
2152
  	struct page *tmp;
a55164389   Andi Kleen   hugetlb: modular ...
2153
2154
  	struct hstate *h = hstate_vma(vma);
  	unsigned long sz = huge_page_size(h);
c0a499c2c   Kenneth W Chen   [PATCH] __unmap_h...
2155
2156
2157
2158
2159
  	/*
  	 * A page gathering list, protected by per file i_mmap_lock. The
  	 * lock is used to avoid list corruption from multiple unmapping
  	 * of the same page since we are using page->lru.
  	 */
fe1668ae5   Kenneth W Chen   [PATCH] enforce p...
2160
  	LIST_HEAD(page_list);
63551ae0f   David Gibson   [PATCH] Hugepage ...
2161
2162
  
  	WARN_ON(!is_vm_hugetlb_page(vma));
a55164389   Andi Kleen   hugetlb: modular ...
2163
2164
  	BUG_ON(start & ~huge_page_mask(h));
  	BUG_ON(end & ~huge_page_mask(h));
63551ae0f   David Gibson   [PATCH] Hugepage ...
2165

cddb8a5c1   Andrea Arcangeli   mmu-notifiers: core
2166
  	mmu_notifier_invalidate_range_start(mm, start, end);
508034a32   Hugh Dickins   [PATCH] mm: unmap...
2167
  	spin_lock(&mm->page_table_lock);
a55164389   Andi Kleen   hugetlb: modular ...
2168
  	for (address = start; address < end; address += sz) {
c7546f8f0   David Gibson   [PATCH] Fix hugep...
2169
  		ptep = huge_pte_offset(mm, address);
4c8872659   Adam Litke   [PATCH] hugetlb: ...
2170
  		if (!ptep)
c7546f8f0   David Gibson   [PATCH] Fix hugep...
2171
  			continue;
39dde65c9   Kenneth W Chen   [PATCH] shared pa...
2172
2173
  		if (huge_pmd_unshare(mm, &address, ptep))
  			continue;
04f2cbe35   Mel Gorman   hugetlb: guarante...
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
  		/*
  		 * If a reference page is supplied, it is because a specific
  		 * page is being unmapped, not a range. Ensure the page we
  		 * are about to unmap is the actual page of interest.
  		 */
  		if (ref_page) {
  			pte = huge_ptep_get(ptep);
  			if (huge_pte_none(pte))
  				continue;
  			page = pte_page(pte);
  			if (page != ref_page)
  				continue;
  
  			/*
  			 * Mark the VMA as having unmapped its page so that
  			 * future faults in this VMA will fail rather than
  			 * looking like data was lost
  			 */
  			set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
  		}
c7546f8f0   David Gibson   [PATCH] Fix hugep...
2194
  		pte = huge_ptep_get_and_clear(mm, address, ptep);
7f2e9525b   Gerald Schaefer   hugetlbfs: common...
2195
  		if (huge_pte_none(pte))
63551ae0f   David Gibson   [PATCH] Hugepage ...
2196
  			continue;
c7546f8f0   David Gibson   [PATCH] Fix hugep...
2197

fd6a03edd   Naoya Horiguchi   HWPOISON, hugetlb...
2198
2199
2200
2201
2202
  		/*
  		 * A HWPoisoned hugepage has already been unmapped and its reference dropped
  		 */
  		if (unlikely(is_hugetlb_entry_hwpoisoned(pte)))
  			continue;
63551ae0f   David Gibson   [PATCH] Hugepage ...
2203
  		page = pte_page(pte);
6649a3863   Ken Chen   [PATCH] hugetlb: ...
2204
2205
  		if (pte_dirty(pte))
  			set_page_dirty(page);
fe1668ae5   Kenneth W Chen   [PATCH] enforce p...
2206
  		list_add(&page->lru, &page_list);
63551ae0f   David Gibson   [PATCH] Hugepage ...
2207
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2208
  	spin_unlock(&mm->page_table_lock);
508034a32   Hugh Dickins   [PATCH] mm: unmap...
2209
  	flush_tlb_range(vma, start, end);
cddb8a5c1   Andrea Arcangeli   mmu-notifiers: core
2210
  	mmu_notifier_invalidate_range_end(mm, start, end);
fe1668ae5   Kenneth W Chen   [PATCH] enforce p...
2211
  	list_for_each_entry_safe(page, tmp, &page_list, lru) {
0fe6e20b9   Naoya Horiguchi   hugetlb, rmap: ad...
2212
  		page_remove_rmap(page);
fe1668ae5   Kenneth W Chen   [PATCH] enforce p...
2213
2214
2215
  		list_del(&page->lru);
  		put_page(page);
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2216
  }
63551ae0f   David Gibson   [PATCH] Hugepage ...
2217

502717f4e   Kenneth W Chen   [PATCH] hugetlb: ...
2218
  void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
04f2cbe35   Mel Gorman   hugetlb: guarante...
2219
  			  unsigned long end, struct page *ref_page)
502717f4e   Kenneth W Chen   [PATCH] hugetlb: ...
2220
  {
a137e1cc6   Andi Kleen   hugetlbfs: per mo...
2221
2222
2223
  	spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
  	__unmap_hugepage_range(vma, start, end, ref_page);
  	spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
502717f4e   Kenneth W Chen   [PATCH] hugetlb: ...
2224
  }
04f2cbe35   Mel Gorman   hugetlb: guarante...
2225
2226
2227
2228
2229
2230
  /*
   * This is called when the original mapper is failing to COW a MAP_PRIVATE
   * mapping it owns the reserve page for. The intention is to unmap the page
   * from other VMAs and let the children be SIGKILLed if they are faulting the
   * same region.
   */
2a4b3ded5   Harvey Harrison   mm: hugetlb.c mak...
2231
2232
  static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
  				struct page *page, unsigned long address)
04f2cbe35   Mel Gorman   hugetlb: guarante...
2233
  {
7526674de   Adam Litke   hugetlb: make unm...
2234
  	struct hstate *h = hstate_vma(vma);
04f2cbe35   Mel Gorman   hugetlb: guarante...
2235
2236
2237
2238
2239
2240
2241
2242
2243
  	struct vm_area_struct *iter_vma;
  	struct address_space *mapping;
  	struct prio_tree_iter iter;
  	pgoff_t pgoff;
  
  	/*
  	 * vm_pgoff is in PAGE_SIZE units, hence the different calculation
  	 * from page cache lookup which is in HPAGE_SIZE units.
  	 */
7526674de   Adam Litke   hugetlb: make unm...
2244
  	address = address & huge_page_mask(h);
04f2cbe35   Mel Gorman   hugetlb: guarante...
2245
2246
2247
  	pgoff = ((address - vma->vm_start) >> PAGE_SHIFT)
  		+ (vma->vm_pgoff >> PAGE_SHIFT);
  	mapping = (struct address_space *)page_private(page);
4eb2b1dcd   Mel Gorman   hugetlb: acquire ...
2248
2249
2250
2251
2252
2253
  	/*
  	 * Take the mapping lock for the duration of the table walk. As
  	 * this mapping should be shared between all the VMAs,
  	 * __unmap_hugepage_range() is called as the lock is already held
  	 */
  	spin_lock(&mapping->i_mmap_lock);
04f2cbe35   Mel Gorman   hugetlb: guarante...
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
  	vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
  		/* Do not unmap the current VMA */
  		if (iter_vma == vma)
  			continue;
  
  		/*
  		 * Unmap the page from other VMAs without their own reserves.
  		 * They get marked to be SIGKILLed if they fault in these
  		 * areas. This is because a future no-page fault on this VMA
  		 * could insert a zeroed page instead of the data existing
  		 * from the time of fork. This would look like data corruption
  		 */
  		if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
4eb2b1dcd   Mel Gorman   hugetlb: acquire ...
2267
  			__unmap_hugepage_range(iter_vma,
7526674de   Adam Litke   hugetlb: make unm...
2268
  				address, address + huge_page_size(h),
04f2cbe35   Mel Gorman   hugetlb: guarante...
2269
2270
  				page);
  	}
4eb2b1dcd   Mel Gorman   hugetlb: acquire ...
2271
  	spin_unlock(&mapping->i_mmap_lock);
04f2cbe35   Mel Gorman   hugetlb: guarante...
2272
2273
2274
  
  	return 1;
  }
0fe6e20b9   Naoya Horiguchi   hugetlb, rmap: ad...
2275
2276
2277
  /*
   * Hugetlb_cow() should be called with page lock of the original hugepage held.
   */
1e8f889b1   David Gibson   [PATCH] Hugetlb: ...
2278
  static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
04f2cbe35   Mel Gorman   hugetlb: guarante...
2279
2280
  			unsigned long address, pte_t *ptep, pte_t pte,
  			struct page *pagecache_page)
1e8f889b1   David Gibson   [PATCH] Hugetlb: ...
2281
  {
a55164389   Andi Kleen   hugetlb: modular ...
2282
  	struct hstate *h = hstate_vma(vma);
1e8f889b1   David Gibson   [PATCH] Hugetlb: ...
2283
  	struct page *old_page, *new_page;
79ac6ba40   David Gibson   [PATCH] hugepage:...
2284
  	int avoidcopy;
04f2cbe35   Mel Gorman   hugetlb: guarante...
2285
  	int outside_reserve = 0;
1e8f889b1   David Gibson   [PATCH] Hugetlb: ...
2286
2287
  
  	old_page = pte_page(pte);
04f2cbe35   Mel Gorman   hugetlb: guarante...
2288
  retry_avoidcopy:
1e8f889b1   David Gibson   [PATCH] Hugetlb: ...
2289
2290
  	/* If no-one else is actually using this page, avoid the copy
  	 * and just make the page writable */
0fe6e20b9   Naoya Horiguchi   hugetlb, rmap: ad...
2291
  	avoidcopy = (page_mapcount(old_page) == 1);
1e8f889b1   David Gibson   [PATCH] Hugetlb: ...
2292
  	if (avoidcopy) {
56c9cfb13   Naoya Horiguchi   hugetlb, rmap: fi...
2293
2294
  		if (PageAnon(old_page))
  			page_move_anon_rmap(old_page, vma, address);
1e8f889b1   David Gibson   [PATCH] Hugetlb: ...
2295
  		set_huge_ptep_writable(vma, address, ptep);
83c54070e   Nick Piggin   mm: fault feedbac...
2296
  		return 0;
1e8f889b1   David Gibson   [PATCH] Hugetlb: ...
2297
  	}
04f2cbe35   Mel Gorman   hugetlb: guarante...
2298
2299
2300
2301
2302
2303
2304
2305
2306
  	/*
  	 * If the process that created a MAP_PRIVATE mapping is about to
  	 * perform a COW due to a shared page count, attempt to satisfy
  	 * the allocation without using the existing reserves. The pagecache
  	 * page is used to determine if the reserve at this address was
  	 * consumed or not. If reserves were used, a partial faulted mapping
  	 * at the time of fork() could consume its reserves on COW instead
  	 * of the full address range.
  	 */
f83a275db   Mel Gorman   mm: account for M...
2307
  	if (!(vma->vm_flags & VM_MAYSHARE) &&
04f2cbe35   Mel Gorman   hugetlb: guarante...
2308
2309
2310
  			is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
  			old_page != pagecache_page)
  		outside_reserve = 1;
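  	/*
  	 * i.e. outside_reserve is set only when the reserve owner of a
  	 * private mapping is COWing a page other than the pagecache page,
  	 * the case described above where the pagecache check indicates the
  	 * reserve for this address was already consumed.
  	 */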
1e8f889b1   David Gibson   [PATCH] Hugetlb: ...
2311
  	page_cache_get(old_page);
b76c8cfbf   Larry Woodman   hugetlb: prevent ...
2312
2313
2314
  
  	/* Drop page_table_lock as buddy allocator may be called */
  	spin_unlock(&mm->page_table_lock);
04f2cbe35   Mel Gorman   hugetlb: guarante...
2315
  	new_page = alloc_huge_page(vma, address, outside_reserve);
1e8f889b1   David Gibson   [PATCH] Hugetlb: ...
2316

2fc39cec6   Adam Litke   hugetlb: debit qu...
2317
  	if (IS_ERR(new_page)) {
1e8f889b1   David Gibson   [PATCH] Hugetlb: ...
2318
  		page_cache_release(old_page);
04f2cbe35   Mel Gorman   hugetlb: guarante...
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
  
  		/*
  		 * If a process owning a MAP_PRIVATE mapping fails to COW,
  		 * it is due to references held by a child and an insufficient
  		 * huge page pool. To guarantee the original mapper's
  		 * reliability, unmap the page from child processes. The child
  		 * may get SIGKILLed if it later faults.
  		 */
  		if (outside_reserve) {
  			BUG_ON(huge_pte_none(pte));
  			if (unmap_ref_private(mm, vma, old_page, address)) {
  				BUG_ON(page_count(old_page) != 1);
  				BUG_ON(huge_pte_none(pte));
b76c8cfbf   Larry Woodman   hugetlb: prevent ...
2332
  				spin_lock(&mm->page_table_lock);
04f2cbe35   Mel Gorman   hugetlb: guarante...
2333
2334
2335
2336
  				goto retry_avoidcopy;
  			}
  			WARN_ON_ONCE(1);
  		}
b76c8cfbf   Larry Woodman   hugetlb: prevent ...
2337
2338
  		/* Caller expects lock to be held */
  		spin_lock(&mm->page_table_lock);
2fc39cec6   Adam Litke   hugetlb: debit qu...
2339
  		return -PTR_ERR(new_page);
1e8f889b1   David Gibson   [PATCH] Hugetlb: ...
2340
  	}
0fe6e20b9   Naoya Horiguchi   hugetlb, rmap: ad...
2341
2342
2343
2344
  	/*
  	 * When the original hugepage is a shared one, it does not have
  	 * an anon_vma prepared.
  	 */
44e2aa937   Dean Nelson   mm/hugetlb.c: add...
2345
2346
2347
  	if (unlikely(anon_vma_prepare(vma))) {
  		/* Caller expects lock to be held */
  		spin_lock(&mm->page_table_lock);
0fe6e20b9   Naoya Horiguchi   hugetlb, rmap: ad...
2348
  		return VM_FAULT_OOM;
44e2aa937   Dean Nelson   mm/hugetlb.c: add...
2349
  	}
0fe6e20b9   Naoya Horiguchi   hugetlb, rmap: ad...
2350

0ebabb416   Naoya Horiguchi   hugetlb: redefine...
2351
  	copy_user_huge_page(new_page, old_page, address, vma);
0ed361dec   Nick Piggin   mm: fix PageUptod...
2352
  	__SetPageUptodate(new_page);
1e8f889b1   David Gibson   [PATCH] Hugetlb: ...
2353

b76c8cfbf   Larry Woodman   hugetlb: prevent ...
2354
2355
2356
2357
2358
  	/*
  	 * Retake the page_table_lock to check for racing updates
  	 * before the page tables are altered
  	 */
  	spin_lock(&mm->page_table_lock);
a55164389   Andi Kleen   hugetlb: modular ...
2359
  	ptep = huge_pte_offset(mm, address & huge_page_mask(h));
7f2e9525b   Gerald Schaefer   hugetlbfs: common...
2360
  	if (likely(pte_same(huge_ptep_get(ptep), pte))) {
1e8f889b1   David Gibson   [PATCH] Hugetlb: ...
2361
  		/* Break COW */
3edd4fc95   Doug Doan   hugetlb: call mmu...
2362
2363
2364
  		mmu_notifier_invalidate_range_start(mm,
  			address & huge_page_mask(h),
  			(address & huge_page_mask(h)) + huge_page_size(h));
8fe627ec5   Gerald Schaefer   hugetlbfs: add mi...
2365
  		huge_ptep_clear_flush(vma, address, ptep);
1e8f889b1   David Gibson   [PATCH] Hugetlb: ...
2366
2367
  		set_huge_pte_at(mm, address, ptep,
  				make_huge_pte(vma, new_page, 1));
0fe6e20b9   Naoya Horiguchi   hugetlb, rmap: ad...
2368
  		page_remove_rmap(old_page);
cd67f0d2a   Naoya Horiguchi   hugetlb, rmap: us...
2369
  		hugepage_add_new_anon_rmap(new_page, vma, address);
1e8f889b1   David Gibson   [PATCH] Hugetlb: ...
2370
2371
  		/* Make the old page be freed below */
  		new_page = old_page;
3edd4fc95   Doug Doan   hugetlb: call mmu...
2372
2373
2374
  		mmu_notifier_invalidate_range_end(mm,
  			address & huge_page_mask(h),
  			(address & huge_page_mask(h)) + huge_page_size(h));
1e8f889b1   David Gibson   [PATCH] Hugetlb: ...
2375
2376
2377
  	}
  	page_cache_release(new_page);
  	page_cache_release(old_page);
83c54070e   Nick Piggin   mm: fault feedbac...
2378
  	return 0;
1e8f889b1   David Gibson   [PATCH] Hugetlb: ...
2379
  }
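  /*
   * Locking note: hugetlb_cow() drops mm->page_table_lock around
   * alloc_huge_page() and the copy, and retakes it on every path before
   * returning, so callers always see the lock held on exit.
   */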
04f2cbe35   Mel Gorman   hugetlb: guarante...
2380
  /* Return the pagecache page at a given address within a VMA */
a55164389   Andi Kleen   hugetlb: modular ...
2381
2382
  static struct page *hugetlbfs_pagecache_page(struct hstate *h,
  			struct vm_area_struct *vma, unsigned long address)
04f2cbe35   Mel Gorman   hugetlb: guarante...
2383
2384
  {
  	struct address_space *mapping;
e7c4b0bfd   Andy Whitcroft   huge page private...
2385
  	pgoff_t idx;
04f2cbe35   Mel Gorman   hugetlb: guarante...
2386
2387
  
  	mapping = vma->vm_file->f_mapping;
a55164389   Andi Kleen   hugetlb: modular ...
2388
  	idx = vma_hugecache_offset(h, vma, address);
04f2cbe35   Mel Gorman   hugetlb: guarante...
2389
2390
2391
  
  	return find_lock_page(mapping, idx);
  }
3ae77f43b   Hugh Dickins   mm: hugetlbfs_pag...
2392
2393
2394
2395
2396
  /*
   * Return whether there is a pagecache page to back the given address within
   * the VMA. The caller, follow_hugetlb_page(), holds page_table_lock so we
   * cannot lock_page().
   */
  static bool hugetlbfs_pagecache_present(struct hstate *h,
2a15efc95   Hugh Dickins   mm: follow_hugetl...
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
  			struct vm_area_struct *vma, unsigned long address)
  {
  	struct address_space *mapping;
  	pgoff_t idx;
  	struct page *page;
  
  	mapping = vma->vm_file->f_mapping;
  	idx = vma_hugecache_offset(h, vma, address);
  
  	page = find_get_page(mapping, idx);
  	if (page)
  		put_page(page);
  	return page != NULL;
  }
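  /*
   * Unlike hugetlbfs_pagecache_page(), which returns the page locked via
   * find_lock_page(), this helper only reports whether a pagecache page
   * exists: find_get_page() plus put_page() avoids sleeping on the page
   * lock, which is not allowed under the caller's page_table_lock.
   */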
a1ed3dda0   Robert P. J. Day   MM: Make needless...
2411
  static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
788c7df45   Hugh Dickins   hugetlb: fault fl...
2412
  			unsigned long address, pte_t *ptep, unsigned int flags)
ac9b9c667   Hugh Dickins   [PATCH] Fix handl...
2413
  {
a55164389   Andi Kleen   hugetlb: modular ...
2414
  	struct hstate *h = hstate_vma(vma);
ac9b9c667   Hugh Dickins   [PATCH] Fix handl...
2415
  	int ret = VM_FAULT_SIGBUS;
e7c4b0bfd   Andy Whitcroft   huge page private...
2416
  	pgoff_t idx;
4c8872659   Adam Litke   [PATCH] hugetlb: ...
2417
  	unsigned long size;
4c8872659   Adam Litke   [PATCH] hugetlb: ...
2418
2419
  	struct page *page;
  	struct address_space *mapping;
1e8f889b1   David Gibson   [PATCH] Hugetlb: ...
2420
  	pte_t new_pte;
4c8872659   Adam Litke   [PATCH] hugetlb: ...
2421

04f2cbe35   Mel Gorman   hugetlb: guarante...
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
  	/*
  	 * Currently, we are forced to kill the process in the event the
  	 * original mapper has unmapped pages from the child due to a failed
  	 * COW. Warn that such a situation has occurred as it may not be obvious.
  	 */
  	if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
  		printk(KERN_WARNING
  			"PID %d killed due to inadequate hugepage pool
  ",
  			current->pid);
  		return ret;
  	}
4c8872659   Adam Litke   [PATCH] hugetlb: ...
2434
  	mapping = vma->vm_file->f_mapping;
a55164389   Andi Kleen   hugetlb: modular ...
2435
  	idx = vma_hugecache_offset(h, vma, address);
4c8872659   Adam Litke   [PATCH] hugetlb: ...
2436
2437
2438
2439
2440
  
  	/*
  	 * Use page lock to guard against racing truncation
  	 * before we get page_table_lock.
  	 */
6bda666a0   Christoph Lameter   [PATCH] hugepages...
2441
2442
2443
  retry:
  	page = find_lock_page(mapping, idx);
  	if (!page) {
a55164389   Andi Kleen   hugetlb: modular ...
2444
  		size = i_size_read(mapping->host) >> huge_page_shift(h);
ebed4bfc8   Hugh Dickins   [PATCH] hugetlb: ...
2445
2446
  		if (idx >= size)
  			goto out;
04f2cbe35   Mel Gorman   hugetlb: guarante...
2447
  		page = alloc_huge_page(vma, address, 0);
2fc39cec6   Adam Litke   hugetlb: debit qu...
2448
2449
  		if (IS_ERR(page)) {
  			ret = -PTR_ERR(page);
6bda666a0   Christoph Lameter   [PATCH] hugepages...
2450
2451
  			goto out;
  		}
a55164389   Andi Kleen   hugetlb: modular ...
2452
  		clear_huge_page(page, address, huge_page_size(h));
0ed361dec   Nick Piggin   mm: fix PageUptod...
2453
  		__SetPageUptodate(page);
ac9b9c667   Hugh Dickins   [PATCH] Fix handl...
2454

f83a275db   Mel Gorman   mm: account for M...
2455
  		if (vma->vm_flags & VM_MAYSHARE) {
6bda666a0   Christoph Lameter   [PATCH] hugepages...
2456
  			int err;
45c682a68   Ken Chen   hugetlb: fix i_bl...
2457
  			struct inode *inode = mapping->host;
6bda666a0   Christoph Lameter   [PATCH] hugepages...
2458
2459
2460
2461
  
  			err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
  			if (err) {
  				put_page(page);
6bda666a0   Christoph Lameter   [PATCH] hugepages...
2462
2463
2464
2465
  				if (err == -EEXIST)
  					goto retry;
  				goto out;
  			}
45c682a68   Ken Chen   hugetlb: fix i_bl...
2466
2467
  
  			spin_lock(&inode->i_lock);
a55164389   Andi Kleen   hugetlb: modular ...
2468
  			inode->i_blocks += blocks_per_huge_page(h);
45c682a68   Ken Chen   hugetlb: fix i_bl...
2469
  			spin_unlock(&inode->i_lock);
0fe6e20b9   Naoya Horiguchi   hugetlb, rmap: ad...
2470
  			page_dup_rmap(page);
23be7468e   Mel Gorman   hugetlb: fix infi...
2471
  		} else {
6bda666a0   Christoph Lameter   [PATCH] hugepages...
2472
  			lock_page(page);
0fe6e20b9   Naoya Horiguchi   hugetlb, rmap: ad...
2473
2474
2475
2476
2477
  			if (unlikely(anon_vma_prepare(vma))) {
  				ret = VM_FAULT_OOM;
  				goto backout_unlocked;
  			}
  			hugepage_add_new_anon_rmap(page, vma, address);
23be7468e   Mel Gorman   hugetlb: fix infi...
2478
  		}
0fe6e20b9   Naoya Horiguchi   hugetlb, rmap: ad...
2479
  	} else {
998b4382c   Naoya Horiguchi   hugetlb: fix meta...
2480
2481
2482
2483
2484
2485
  		/*
  		 * If a memory error occurs between mmap() and the fault, some
  		 * processes won't have a hwpoisoned swap entry for the errored
  		 * virtual address. So we need to block the hugepage fault with
  		 * a PG_hwpoison bit check.
  		 */
  		if (unlikely(PageHWPoison(page))) {
aa50d3a7a   Andi Kleen   Encode huge page ...
2486
2487
  			ret = VM_FAULT_HWPOISON | 
  			      VM_FAULT_SET_HINDEX(h - hstates);
998b4382c   Naoya Horiguchi   hugetlb: fix meta...
2488
2489
  			goto backout_unlocked;
  		}
0fe6e20b9   Naoya Horiguchi   hugetlb, rmap: ad...
2490
  		page_dup_rmap(page);
6bda666a0   Christoph Lameter   [PATCH] hugepages...
2491
  	}
1e8f889b1   David Gibson   [PATCH] Hugetlb: ...
2492

57303d801   Andy Whitcroft   hugetlbfs: alloca...
2493
2494
2495
2496
2497
2498
  	/*
  	 * If we are going to COW a private mapping later, we examine the
  	 * pending reservations for this page now. This will ensure that
  	 * any allocations necessary to record that reservation occur outside
  	 * the spinlock.
  	 */
788c7df45   Hugh Dickins   hugetlb: fault fl...
2499
  	if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED))
2b26736c8   Andy Whitcroft   allocate structur...
2500
2501
2502
2503
  		if (vma_needs_reservation(h, vma, address) < 0) {
  			ret = VM_FAULT_OOM;
  			goto backout_unlocked;
  		}
57303d801   Andy Whitcroft   hugetlbfs: alloca...
2504

ac9b9c667   Hugh Dickins   [PATCH] Fix handl...
2505
  	spin_lock(&mm->page_table_lock);
a55164389   Andi Kleen   hugetlb: modular ...
2506
  	size = i_size_read(mapping->host) >> huge_page_shift(h);
4c8872659   Adam Litke   [PATCH] hugetlb: ...
2507
2508
  	if (idx >= size)
  		goto backout;
83c54070e   Nick Piggin   mm: fault feedbac...
2509
  	ret = 0;
7f2e9525b   Gerald Schaefer   hugetlbfs: common...
2510
  	if (!huge_pte_none(huge_ptep_get(ptep)))
4c8872659   Adam Litke   [PATCH] hugetlb: ...
2511
  		goto backout;
1e8f889b1   David Gibson   [PATCH] Hugetlb: ...
2512
2513
2514
  	new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
  				&& (vma->vm_flags & VM_SHARED)));
  	set_huge_pte_at(mm, address, ptep, new_pte);
788c7df45   Hugh Dickins   hugetlb: fault fl...
2515
  	if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
1e8f889b1   David Gibson   [PATCH] Hugetlb: ...
2516
  		/* Optimization, do the COW without a second fault */
04f2cbe35   Mel Gorman   hugetlb: guarante...
2517
  		ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page);
1e8f889b1   David Gibson   [PATCH] Hugetlb: ...
2518
  	}
ac9b9c667   Hugh Dickins   [PATCH] Fix handl...
2519
  	spin_unlock(&mm->page_table_lock);
4c8872659   Adam Litke   [PATCH] hugetlb: ...
2520
2521
  	unlock_page(page);
  out:
ac9b9c667   Hugh Dickins   [PATCH] Fix handl...
2522
  	return ret;
4c8872659   Adam Litke   [PATCH] hugetlb: ...
2523
2524
2525
  
  backout:
  	spin_unlock(&mm->page_table_lock);
2b26736c8   Andy Whitcroft   allocate structur...
2526
  backout_unlocked:
4c8872659   Adam Litke   [PATCH] hugetlb: ...
2527
2528
2529
  	unlock_page(page);
  	put_page(page);
  	goto out;
ac9b9c667   Hugh Dickins   [PATCH] Fix handl...
2530
  }
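  /*
   * Note on the retry above: if two threads fault the same index at once,
   * both may allocate a huge page but only one add_to_page_cache() succeeds;
   * the loser sees -EEXIST, drops its page and retries find_lock_page(), so
   * a single page ends up backing the file offset.
   */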
86e5216f8   Adam Litke   [PATCH] Hugetlb: ...
2531
  int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
788c7df45   Hugh Dickins   hugetlb: fault fl...
2532
  			unsigned long address, unsigned int flags)
86e5216f8   Adam Litke   [PATCH] Hugetlb: ...
2533
2534
2535
  {
  	pte_t *ptep;
  	pte_t entry;
1e8f889b1   David Gibson   [PATCH] Hugetlb: ...
2536
  	int ret;
0fe6e20b9   Naoya Horiguchi   hugetlb, rmap: ad...
2537
  	struct page *page = NULL;
57303d801   Andy Whitcroft   hugetlbfs: alloca...
2538
  	struct page *pagecache_page = NULL;
3935baa9b   David Gibson   [PATCH] hugepage:...
2539
  	static DEFINE_MUTEX(hugetlb_instantiation_mutex);
a55164389   Andi Kleen   hugetlb: modular ...
2540
  	struct hstate *h = hstate_vma(vma);
86e5216f8   Adam Litke   [PATCH] Hugetlb: ...
2541

fd6a03edd   Naoya Horiguchi   HWPOISON, hugetlb...
2542
2543
2544
  	ptep = huge_pte_offset(mm, address);
  	if (ptep) {
  		entry = huge_ptep_get(ptep);
290408d4a   Naoya Horiguchi   hugetlb: hugepage...
2545
2546
2547
2548
  		if (unlikely(is_hugetlb_entry_migration(entry))) {
  			migration_entry_wait(mm, (pmd_t *)ptep, address);
  			return 0;
  		} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
aa50d3a7a   Andi Kleen   Encode huge page ...
2549
2550
  			return VM_FAULT_HWPOISON_LARGE | 
  			       VM_FAULT_SET_HINDEX(h - hstates);
fd6a03edd   Naoya Horiguchi   HWPOISON, hugetlb...
2551
  	}
a55164389   Andi Kleen   hugetlb: modular ...
2552
  	ptep = huge_pte_alloc(mm, address, huge_page_size(h));
86e5216f8   Adam Litke   [PATCH] Hugetlb: ...
2553
2554
  	if (!ptep)
  		return VM_FAULT_OOM;
3935baa9b   David Gibson   [PATCH] hugepage:...
2555
2556
2557
2558
2559
2560
  	/*
  	 * Serialize hugepage allocation and instantiation, so that we don't
  	 * get spurious allocation failures if two CPUs race to instantiate
  	 * the same page in the page cache.
  	 */
  	mutex_lock(&hugetlb_instantiation_mutex);
7f2e9525b   Gerald Schaefer   hugetlbfs: common...
2561
2562
  	entry = huge_ptep_get(ptep);
  	if (huge_pte_none(entry)) {
788c7df45   Hugh Dickins   hugetlb: fault fl...
2563
  		ret = hugetlb_no_page(mm, vma, address, ptep, flags);
b4d1d99fd   David Gibson   hugetlb: handle u...
2564
  		goto out_mutex;
3935baa9b   David Gibson   [PATCH] hugepage:...
2565
  	}
86e5216f8   Adam Litke   [PATCH] Hugetlb: ...
2566

83c54070e   Nick Piggin   mm: fault feedbac...
2567
  	ret = 0;
1e8f889b1   David Gibson   [PATCH] Hugetlb: ...
2568

57303d801   Andy Whitcroft   hugetlbfs: alloca...
2569
2570
2571
2572
2573
2574
2575
2576
  	/*
  	 * If we are going to COW the mapping later, we examine the pending
  	 * reservations for this page now. This will ensure that any
  	 * allocations necessary to record that reservation occur outside the
  	 * spinlock. For private mappings, we also look up the pagecache
  	 * page now as it is used to determine if a reservation has been
  	 * consumed.
  	 */
788c7df45   Hugh Dickins   hugetlb: fault fl...
2577
  	if ((flags & FAULT_FLAG_WRITE) && !pte_write(entry)) {
2b26736c8   Andy Whitcroft   allocate structur...
2578
2579
  		if (vma_needs_reservation(h, vma, address) < 0) {
  			ret = VM_FAULT_OOM;
b4d1d99fd   David Gibson   hugetlb: handle u...
2580
  			goto out_mutex;
2b26736c8   Andy Whitcroft   allocate structur...
2581
  		}
57303d801   Andy Whitcroft   hugetlbfs: alloca...
2582

f83a275db   Mel Gorman   mm: account for M...
2583
  		if (!(vma->vm_flags & VM_MAYSHARE))
57303d801   Andy Whitcroft   hugetlbfs: alloca...
2584
2585
2586
  			pagecache_page = hugetlbfs_pagecache_page(h,
  								vma, address);
  	}
56c9cfb13   Naoya Horiguchi   hugetlb, rmap: fi...
2587
2588
2589
2590
2591
2592
2593
2594
2595
  	/*
  	 * hugetlb_cow() requires page locks of pte_page(entry) and
  	 * pagecache_page, so here we need to take the former one
  	 * when page != pagecache_page or !pagecache_page.
  	 * Note that locking order is always pagecache_page -> page,
  	 * so no worry about deadlock.
  	 */
  	page = pte_page(entry);
  	if (page != pagecache_page)
0fe6e20b9   Naoya Horiguchi   hugetlb, rmap: ad...
2596
  		lock_page(page);
0fe6e20b9   Naoya Horiguchi   hugetlb, rmap: ad...
2597

1e8f889b1   David Gibson   [PATCH] Hugetlb: ...
2598
2599
  	spin_lock(&mm->page_table_lock);
  	/* Check for a racing update before calling hugetlb_cow */
b4d1d99fd   David Gibson   hugetlb: handle u...
2600
2601
  	if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
  		goto out_page_table_lock;
788c7df45   Hugh Dickins   hugetlb: fault fl...
2602
  	if (flags & FAULT_FLAG_WRITE) {
b4d1d99fd   David Gibson   hugetlb: handle u...
2603
  		if (!pte_write(entry)) {
57303d801   Andy Whitcroft   hugetlbfs: alloca...
2604
2605
  			ret = hugetlb_cow(mm, vma, address, ptep, entry,
  							pagecache_page);
b4d1d99fd   David Gibson   hugetlb: handle u...
2606
2607
2608
2609
2610
  			goto out_page_table_lock;
  		}
  		entry = pte_mkdirty(entry);
  	}
  	entry = pte_mkyoung(entry);
788c7df45   Hugh Dickins   hugetlb: fault fl...
2611
2612
  	if (huge_ptep_set_access_flags(vma, address, ptep, entry,
  						flags & FAULT_FLAG_WRITE))
4b3073e1c   Russell King   MM: Pass a PTE po...
2613
  		update_mmu_cache(vma, address, ptep);
b4d1d99fd   David Gibson   hugetlb: handle u...
2614
2615
  
  out_page_table_lock:
1e8f889b1   David Gibson   [PATCH] Hugetlb: ...
2616
  	spin_unlock(&mm->page_table_lock);
57303d801   Andy Whitcroft   hugetlbfs: alloca...
2617
2618
2619
2620
2621
  
  	if (pagecache_page) {
  		unlock_page(pagecache_page);
  		put_page(pagecache_page);
  	}
1f64d69c7   Dean Nelson   mm/hugetlb.c: avo...
2622
2623
  	if (page != pagecache_page)
  		unlock_page(page);
57303d801   Andy Whitcroft   hugetlbfs: alloca...
2624

b4d1d99fd   David Gibson   hugetlb: handle u...
2625
  out_mutex:
3935baa9b   David Gibson   [PATCH] hugepage:...
2626
  	mutex_unlock(&hugetlb_instantiation_mutex);
1e8f889b1   David Gibson   [PATCH] Hugetlb: ...
2627
2628
  
  	return ret;
86e5216f8   Adam Litke   [PATCH] Hugetlb: ...
2629
  }
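  /*
   * Lock ordering in hugetlb_fault():
   *   hugetlb_instantiation_mutex -> lock_page(pagecache_page) ->
   *   lock_page(page) -> mm->page_table_lock
   * which matches the "pagecache_page -> page" ordering described above.
   */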
ceb868796   Andi Kleen   hugetlb: introduc...
2630
2631
2632
2633
2634
2635
2636
2637
  /* Can be overridden by architectures */
  __attribute__((weak)) struct page *
  follow_huge_pud(struct mm_struct *mm, unsigned long address,
  	       pud_t *pud, int write)
  {
  	BUG();
  	return NULL;
  }
63551ae0f   David Gibson   [PATCH] Hugepage ...
2638
2639
  int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
  			struct page **pages, struct vm_area_struct **vmas,
5b23dbe81   Adam Litke   hugetlb: follow_h...
2640
  			unsigned long *position, int *length, int i,
2a15efc95   Hugh Dickins   mm: follow_hugetl...
2641
  			unsigned int flags)
63551ae0f   David Gibson   [PATCH] Hugepage ...
2642
  {
d5d4b0aa4   Kenneth W Chen   [PATCH] optimize ...
2643
2644
  	unsigned long pfn_offset;
  	unsigned long vaddr = *position;
63551ae0f   David Gibson   [PATCH] Hugepage ...
2645
  	int remainder = *length;
a55164389   Andi Kleen   hugetlb: modular ...
2646
  	struct hstate *h = hstate_vma(vma);
63551ae0f   David Gibson   [PATCH] Hugepage ...
2647

1c59827d1   Hugh Dickins   [PATCH] mm: huget...
2648
  	spin_lock(&mm->page_table_lock);
63551ae0f   David Gibson   [PATCH] Hugepage ...
2649
  	while (vaddr < vma->vm_end && remainder) {
4c8872659   Adam Litke   [PATCH] hugetlb: ...
2650
  		pte_t *pte;
2a15efc95   Hugh Dickins   mm: follow_hugetl...
2651
  		int absent;
4c8872659   Adam Litke   [PATCH] hugetlb: ...
2652
  		struct page *page;
63551ae0f   David Gibson   [PATCH] Hugepage ...
2653

4c8872659   Adam Litke   [PATCH] hugetlb: ...
2654
2655
  		/*
  		 * Some archs (sparc64, sh*) have multiple pte_ts for
2a15efc95   Hugh Dickins   mm: follow_hugetl...
2656
  		 * each hugepage.  We have to make sure we get the
4c8872659   Adam Litke   [PATCH] hugetlb: ...
2657
2658
  		 * first, for the page indexing below to work.
  		 */
a55164389   Andi Kleen   hugetlb: modular ...
2659
  		pte = huge_pte_offset(mm, vaddr & huge_page_mask(h));
2a15efc95   Hugh Dickins   mm: follow_hugetl...
2660
2661
2662
2663
  		absent = !pte || huge_pte_none(huge_ptep_get(pte));
  
  		/*
  		 * When coredumping, it suits get_dump_page if we just return
3ae77f43b   Hugh Dickins   mm: hugetlbfs_pag...
2664
2665
2666
2667
  		 * an error where there's an empty slot with no huge pagecache
  		 * to back it.  This way, we avoid allocating a hugepage, and
  		 * the sparse dumpfile avoids allocating disk blocks, but its
  		 * huge holes still show up with zeroes where they need to be.
2a15efc95   Hugh Dickins   mm: follow_hugetl...
2668
  		 */
3ae77f43b   Hugh Dickins   mm: hugetlbfs_pag...
2669
2670
  		if (absent && (flags & FOLL_DUMP) &&
  		    !hugetlbfs_pagecache_present(h, vma, vaddr)) {
2a15efc95   Hugh Dickins   mm: follow_hugetl...
2671
2672
2673
  			remainder = 0;
  			break;
  		}
63551ae0f   David Gibson   [PATCH] Hugepage ...
2674

2a15efc95   Hugh Dickins   mm: follow_hugetl...
2675
2676
  		if (absent ||
  		    ((flags & FOLL_WRITE) && !pte_write(huge_ptep_get(pte)))) {
4c8872659   Adam Litke   [PATCH] hugetlb: ...
2677
  			int ret;
63551ae0f   David Gibson   [PATCH] Hugepage ...
2678

4c8872659   Adam Litke   [PATCH] hugetlb: ...
2679
  			spin_unlock(&mm->page_table_lock);
2a15efc95   Hugh Dickins   mm: follow_hugetl...
2680
2681
  			ret = hugetlb_fault(mm, vma, vaddr,
  				(flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0);
4c8872659   Adam Litke   [PATCH] hugetlb: ...
2682
  			spin_lock(&mm->page_table_lock);
a89182c76   Adam Litke   Fix VM_FAULT flag...
2683
  			if (!(ret & VM_FAULT_ERROR))
4c8872659   Adam Litke   [PATCH] hugetlb: ...
2684
  				continue;
63551ae0f   David Gibson   [PATCH] Hugepage ...
2685

4c8872659   Adam Litke   [PATCH] hugetlb: ...
2686
  			remainder = 0;
4c8872659   Adam Litke   [PATCH] hugetlb: ...
2687
2688
  			break;
  		}
a55164389   Andi Kleen   hugetlb: modular ...
2689
  		pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
7f2e9525b   Gerald Schaefer   hugetlbfs: common...
2690
  		page = pte_page(huge_ptep_get(pte));
d5d4b0aa4   Kenneth W Chen   [PATCH] optimize ...
2691
  same_page:
d6692183a   Kenneth W Chen   [PATCH] fix extra...
2692
  		if (pages) {
2a15efc95   Hugh Dickins   mm: follow_hugetl...
2693
  			pages[i] = mem_map_offset(page, pfn_offset);
4b2e38ad7   KOSAKI Motohiro   hugepage: support...
2694
  			get_page(pages[i]);
d6692183a   Kenneth W Chen   [PATCH] fix extra...
2695
  		}
63551ae0f   David Gibson   [PATCH] Hugepage ...
2696
2697
2698
2699
2700
  
  		if (vmas)
  			vmas[i] = vma;
  
  		vaddr += PAGE_SIZE;
d5d4b0aa4   Kenneth W Chen   [PATCH] optimize ...
2701
  		++pfn_offset;
63551ae0f   David Gibson   [PATCH] Hugepage ...
2702
2703
  		--remainder;
  		++i;
d5d4b0aa4   Kenneth W Chen   [PATCH] optimize ...
2704
  		if (vaddr < vma->vm_end && remainder &&
a55164389   Andi Kleen   hugetlb: modular ...
2705
  				pfn_offset < pages_per_huge_page(h)) {
d5d4b0aa4   Kenneth W Chen   [PATCH] optimize ...
2706
2707
2708
2709
2710
2711
  			/*
  			 * We use pfn_offset to avoid touching the pageframes
  			 * of this compound page.
  			 */
  			goto same_page;
  		}
63551ae0f   David Gibson   [PATCH] Hugepage ...
2712
  	}
1c59827d1   Hugh Dickins   [PATCH] mm: huget...
2713
  	spin_unlock(&mm->page_table_lock);
63551ae0f   David Gibson   [PATCH] Hugepage ...
2714
2715
  	*length = remainder;
  	*position = vaddr;
2a15efc95   Hugh Dickins   mm: follow_hugetl...
2716
  	return i ? i : -EFAULT;
63551ae0f   David Gibson   [PATCH] Hugepage ...
2717
  }
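  /*
   * Example: with 4KB base pages and 2MB huge pages (512 base pages each),
   * a single huge PTE can satisfy up to 512 consecutive entries of *pages
   * via the same_page loop, advancing pfn_offset instead of re-walking the
   * page tables for every base page.
   */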
8f860591f   Zhang, Yanmin   [PATCH] Enable mp...
2718
2719
2720
2721
2722
2723
2724
2725
  
  void hugetlb_change_protection(struct vm_area_struct *vma,
  		unsigned long address, unsigned long end, pgprot_t newprot)
  {
  	struct mm_struct *mm = vma->vm_mm;
  	unsigned long start = address;
  	pte_t *ptep;
  	pte_t pte;
a55164389   Andi Kleen   hugetlb: modular ...
2726
  	struct hstate *h = hstate_vma(vma);
8f860591f   Zhang, Yanmin   [PATCH] Enable mp...
2727
2728
2729
  
  	BUG_ON(address >= end);
  	flush_cache_range(vma, address, end);
39dde65c9   Kenneth W Chen   [PATCH] shared pa...
2730
  	spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
8f860591f   Zhang, Yanmin   [PATCH] Enable mp...
2731
  	spin_lock(&mm->page_table_lock);
a55164389   Andi Kleen   hugetlb: modular ...
2732
  	for (; address < end; address += huge_page_size(h)) {
8f860591f   Zhang, Yanmin   [PATCH] Enable mp...
2733
2734
2735
  		ptep = huge_pte_offset(mm, address);
  		if (!ptep)
  			continue;
39dde65c9   Kenneth W Chen   [PATCH] shared pa...
2736
2737
  		if (huge_pmd_unshare(mm, &address, ptep))
  			continue;
7f2e9525b   Gerald Schaefer   hugetlbfs: common...
2738
  		if (!huge_pte_none(huge_ptep_get(ptep))) {
8f860591f   Zhang, Yanmin   [PATCH] Enable mp...
2739
2740
2741
  			pte = huge_ptep_get_and_clear(mm, address, ptep);
  			pte = pte_mkhuge(pte_modify(pte, newprot));
  			set_huge_pte_at(mm, address, ptep, pte);
8f860591f   Zhang, Yanmin   [PATCH] Enable mp...
2742
2743
2744
  		}
  	}
  	spin_unlock(&mm->page_table_lock);
39dde65c9   Kenneth W Chen   [PATCH] shared pa...
2745
  	spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
8f860591f   Zhang, Yanmin   [PATCH] Enable mp...
2746
2747
2748
  
  	flush_tlb_range(vma, start, end);
  }
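  /*
   * Note: i_mmap_lock is held across the walk because huge_pmd_unshare()
   * may unshare page tables that other mappings of the same file reference;
   * the TLB flush is deferred until after both locks have been dropped.
   */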
a1e78772d   Mel Gorman   hugetlb: reserve ...
2749
2750
  int hugetlb_reserve_pages(struct inode *inode,
  					long from, long to,
5a6fe1259   Mel Gorman   Do not account fo...
2751
2752
  					struct vm_area_struct *vma,
  					int acctflag)
e4e574b76   Adam Litke   hugetlb: Try to g...
2753
  {
17c9d12e1   Mel Gorman   Do not account fo...
2754
  	long ret, chg;
a55164389   Andi Kleen   hugetlb: modular ...
2755
  	struct hstate *h = hstate_inode(inode);
e4e574b76   Adam Litke   hugetlb: Try to g...
2756

a1e78772d   Mel Gorman   hugetlb: reserve ...
2757
  	/*
17c9d12e1   Mel Gorman   Do not account fo...
2758
2759
2760
2761
2762
2763
2764
2765
  	 * Only apply hugepage reservation if asked. At fault time, an
  	 * attempt will be made for VM_NORESERVE mappings to allocate a page
  	 * and charge filesystem quota without using reserves.
  	 */
  	if (acctflag & VM_NORESERVE)
  		return 0;
  
  	/*
a1e78772d   Mel Gorman   hugetlb: reserve ...
2766
2767
2768
2769
2770
  	 * Shared mappings base their reservation on the number of pages that
  	 * are already allocated on behalf of the file. Private mappings need
  	 * to reserve the full area even if read-only as mprotect() may be
  	 * called to make the mapping read-write. Assume !vma is a shm mapping
  	 */
f83a275db   Mel Gorman   mm: account for M...
2771
  	if (!vma || vma->vm_flags & VM_MAYSHARE)
a1e78772d   Mel Gorman   hugetlb: reserve ...
2772
  		chg = region_chg(&inode->i_mapping->private_list, from, to);
17c9d12e1   Mel Gorman   Do not account fo...
2773
2774
2775
2776
  	else {
  		struct resv_map *resv_map = resv_map_alloc();
  		if (!resv_map)
  			return -ENOMEM;
a1e78772d   Mel Gorman   hugetlb: reserve ...
2777
  		chg = to - from;
84afd99b8   Andy Whitcroft   hugetlb reservati...
2778

17c9d12e1   Mel Gorman   Do not account fo...
2779
2780
2781
  		set_vma_resv_map(vma, resv_map);
  		set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
  	}
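  	/*
  	 * Example: reserving huge-page offsets [0, 100) of a hugetlbfs file.
  	 * For a shared mapping, region_chg() counts only the offsets not
  	 * already covered by an earlier reservation in private_list; for a
  	 * private mapping, chg is always 100 and the new resv_map tracks
  	 * consumption for this VMA alone.
  	 */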
e4e574b76   Adam Litke   hugetlb: Try to g...
2782
2783
  	if (chg < 0)
  		return chg;
8a6301127   Ken Chen   pretend cpuset ha...
2784

17c9d12e1   Mel Gorman   Do not account fo...
2785
  	/* There must be enough filesystem quota for the mapping */
90d8b7e61   Adam Litke   hugetlb: enforce ...
2786
2787
  	if (hugetlb_get_quota(inode->i_mapping, chg))
  		return -ENOSPC;
5a6fe1259   Mel Gorman   Do not account fo...
2788
2789
  
  	/*
17c9d12e1   Mel Gorman   Do not account fo...
2790
2791
  	 * Check that enough hugepages are available for the reservation.
  	 * Hand back the quota if there are not.
5a6fe1259   Mel Gorman   Do not account fo...
2792
  	 */
a55164389   Andi Kleen   hugetlb: modular ...
2793
  	ret = hugetlb_acct_memory(h, chg);
68842c9b9   Ken Chen   hugetlbfs: fix qu...
2794
2795
  	if (ret < 0) {
  		hugetlb_put_quota(inode->i_mapping, chg);
a43a8c39b   Kenneth W Chen   [PATCH] tightenin...
2796
  		return ret;
68842c9b9   Ken Chen   hugetlbfs: fix qu...
2797
  	}
17c9d12e1   Mel Gorman   Do not account fo...
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
  
  	/*
  	 * Account for the reservations made. Shared mappings record regions
  	 * that have reservations as they are shared by multiple VMAs.
  	 * When the last VMA disappears, the region map says how much
  	 * the reservation was and the page cache tells how much of
  	 * the reservation was consumed. Private mappings are per-VMA and
  	 * only the consumed reservations are tracked. When the VMA
  	 * disappears, the original reservation is the VMA size and the
  	 * consumed reservations are stored in the map. Hence, nothing
  	 * else has to be done for private mappings here
  	 */
f83a275db   Mel Gorman   mm: account for M...
2810
  	if (!vma || vma->vm_flags & VM_MAYSHARE)
a1e78772d   Mel Gorman   hugetlb: reserve ...
2811
  		region_add(&inode->i_mapping->private_list, from, to);
a43a8c39b   Kenneth W Chen   [PATCH] tightenin...
2812
2813
2814
2815
2816
  	return 0;
  }
  
  void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
  {
a55164389   Andi Kleen   hugetlb: modular ...
2817
  	struct hstate *h = hstate_inode(inode);
a43a8c39b   Kenneth W Chen   [PATCH] tightenin...
2818
  	long chg = region_truncate(&inode->i_mapping->private_list, offset);
45c682a68   Ken Chen   hugetlb: fix i_bl...
2819
2820
  
  	spin_lock(&inode->i_lock);
e4c6f8bed   Eric Sandeen   hugetlbfs: fix i_...
2821
  	inode->i_blocks -= (blocks_per_huge_page(h) * freed);
45c682a68   Ken Chen   hugetlb: fix i_bl...
2822
  	spin_unlock(&inode->i_lock);
90d8b7e61   Adam Litke   hugetlb: enforce ...
2823
  	hugetlb_put_quota(inode->i_mapping, (chg - freed));
a55164389   Andi Kleen   hugetlb: modular ...
2824
  	hugetlb_acct_memory(h, -(chg - freed));
a43a8c39b   Kenneth W Chen   [PATCH] tightenin...
2825
  }
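  /*
   * Accounting sketch: region_truncate() returns how many huge pages were
   * still reserved at or beyond 'offset'; 'freed' of them were actually
   * instantiated (and reflected in i_blocks), so the remaining (chg - freed)
   * unused reservations are handed back to the filesystem quota and the
   * huge page pool.
   */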
93f70f900   Naoya Horiguchi   HWPOISON, hugetlb...
2826

d5bd91069   Andi Kleen   hugepage: move is...
2827
  #ifdef CONFIG_MEMORY_FAILURE
6de2b1aab   Naoya Horiguchi   HWPOISON, hugetlb...
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
  /* Should be called with hugetlb_lock held */
  static int is_hugepage_on_freelist(struct page *hpage)
  {
  	struct page *page;
  	struct page *tmp;
  	struct hstate *h = page_hstate(hpage);
  	int nid = page_to_nid(hpage);
  
  	list_for_each_entry_safe(page, tmp, &h->hugepage_freelists[nid], lru)
  		if (page == hpage)
  			return 1;
  	return 0;
  }
93f70f900   Naoya Horiguchi   HWPOISON, hugetlb...
2841
2842
2843
2844
  /*
   * This function is called from memory failure code.
   * Assume the caller holds the page lock of the head page.
   */
6de2b1aab   Naoya Horiguchi   HWPOISON, hugetlb...
2845
  int dequeue_hwpoisoned_huge_page(struct page *hpage)
93f70f900   Naoya Horiguchi   HWPOISON, hugetlb...
2846
2847
2848
  {
  	struct hstate *h = page_hstate(hpage);
  	int nid = page_to_nid(hpage);
6de2b1aab   Naoya Horiguchi   HWPOISON, hugetlb...
2849
  	int ret = -EBUSY;
93f70f900   Naoya Horiguchi   HWPOISON, hugetlb...
2850
2851
  
  	spin_lock(&hugetlb_lock);
6de2b1aab   Naoya Horiguchi   HWPOISON, hugetlb...
2852
2853
  	if (is_hugepage_on_freelist(hpage)) {
  		list_del(&hpage->lru);
8c6c2ecb4   Naoya Horiguchi   HWPOSION, hugetlb...
2854
  		set_page_refcounted(hpage);
6de2b1aab   Naoya Horiguchi   HWPOISON, hugetlb...
2855
2856
2857
2858
  		h->free_huge_pages--;
  		h->free_huge_pages_node[nid]--;
  		ret = 0;
  	}
93f70f900   Naoya Horiguchi   HWPOISON, hugetlb...
2859
  	spin_unlock(&hugetlb_lock);
6de2b1aab   Naoya Horiguchi   HWPOISON, hugetlb...
2860
  	return ret;
93f70f900   Naoya Horiguchi   HWPOISON, hugetlb...
2861
  }
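  /*
   * Only a hugepage still sitting on a free list can be dequeued here; for
   * an in-use page the function returns -EBUSY and leaves it to the caller
   * to deal with.
   */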
6de2b1aab   Naoya Horiguchi   HWPOISON, hugetlb...
2862
  #endif