Commit 4c887265977213985091476be40ab11dfdcb4caf

Authored by Adam Litke
Committed by Linus Torvalds
1 parent 551110a94a

[PATCH] hugetlb: demand fault handler

Below is a patch to implement demand faulting for huge pages.  The main
motivation for changing from prefaulting to demand faulting is so that huge
page memory areas can be allocated according to NUMA policy.
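
As a point of reference (not part of the patch), below is a minimal userspace sketch of the usage this enables: with demand faulting, a NUMA policy applied with mbind() after mmap() but before first touch can govern where each huge page is placed, whereas prefaulting allocated every page at mmap() time.  The hugetlbfs path, node mask, and huge page size here are assumptions, and policy-aware placement also depends on the huge page allocator honouring the VMA policy.

/*
 * Illustrative sketch only: map a hugetlbfs file, set an interleave
 * policy, then touch the memory so each huge page is faulted in (and,
 * with policy-aware allocation, placed) on first touch.
 */
#include <fcntl.h>
#include <numaif.h>        /* mbind(), MPOL_INTERLEAVE; link with -lnuma */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#define HPAGE_SZ (4UL << 20)           /* assumed 4MB huge pages */
#define LENGTH   (16 * HPAGE_SZ)

int main(void)
{
	unsigned long nodemask = 0x3;  /* assumes nodes 0 and 1 exist */
	char *addr;
	int fd;

	fd = open("/mnt/huge/example", O_CREAT | O_RDWR, 0600);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	addr = mmap(NULL, LENGTH, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (addr == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* Set the policy before touching the memory ... */
	if (mbind(addr, LENGTH, MPOL_INTERLEAVE, &nodemask,
		  sizeof(nodemask) * 8, 0))
		perror("mbind");

	/* ... so each huge page is allocated at first touch, not at mmap(). */
	memset(addr, 0, LENGTH);

	munmap(addr, LENGTH);
	close(fd);
	return 0;
}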

Thanks to consolidated hugetlb code, switching the behavior requires changing
only one fault handler.  The bulk of the patch just moves the logic from
hugetlb_prefault() to hugetlb_fault() and find_lock_huge_page().

Signed-off-by: Adam Litke <agl@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

Showing 2 changed files with 97 additions and 90 deletions

fs/hugetlbfs/inode.c
... ... @@ -48,7 +48,6 @@
48 48 static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
49 49 {
50 50 struct inode *inode = file->f_dentry->d_inode;
51   - struct address_space *mapping = inode->i_mapping;
52 51 loff_t len, vma_len;
53 52 int ret;
54 53  
... ... @@ -79,10 +78,8 @@
79 78 if (!(vma->vm_flags & VM_WRITE) && len > inode->i_size)
80 79 goto out;
81 80  
82   - ret = hugetlb_prefault(mapping, vma);
83   - if (ret)
84   - goto out;
85   -
  81 + ret = 0;
  82 + hugetlb_prefault_arch_hook(vma->vm_mm);
86 83 if (inode->i_size < len)
87 84 inode->i_size = len;
88 85 out:
... ... @@ -321,10 +321,7 @@
321 321  
322 322 for (address = start; address < end; address += HPAGE_SIZE) {
323 323 ptep = huge_pte_offset(mm, address);
324   - if (! ptep)
325   - /* This can happen on truncate, or if an
326   - * mmap() is aborted due to an error before
327   - * the prefault */
  324 + if (!ptep)
328 325 continue;
329 326  
330 327 pte = huge_ptep_get_and_clear(mm, address, ptep);
... ... @@ -340,81 +337,92 @@
340 337 flush_tlb_range(vma, start, end);
341 338 }
342 339  
343   -int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
  340 +static struct page *find_lock_huge_page(struct address_space *mapping,
  341 + unsigned long idx)
344 342 {
345   - struct mm_struct *mm = current->mm;
346   - unsigned long addr;
347   - int ret = 0;
  343 + struct page *page;
  344 + int err;
  345 + struct inode *inode = mapping->host;
  346 + unsigned long size;
348 347  
349   - WARN_ON(!is_vm_hugetlb_page(vma));
350   - BUG_ON(vma->vm_start & ~HPAGE_MASK);
351   - BUG_ON(vma->vm_end & ~HPAGE_MASK);
  348 +retry:
  349 + page = find_lock_page(mapping, idx);
  350 + if (page)
  351 + goto out;
352 352  
353   - hugetlb_prefault_arch_hook(mm);
  353 + /* Check to make sure the mapping hasn't been truncated */
  354 + size = i_size_read(inode) >> HPAGE_SHIFT;
  355 + if (idx >= size)
  356 + goto out;
354 357  
355   - for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
356   - unsigned long idx;
357   - pte_t *pte = huge_pte_alloc(mm, addr);
358   - struct page *page;
  358 + if (hugetlb_get_quota(mapping))
  359 + goto out;
  360 + page = alloc_huge_page();
  361 + if (!page) {
  362 + hugetlb_put_quota(mapping);
  363 + goto out;
  364 + }
359 365  
360   - if (!pte) {
361   - ret = -ENOMEM;
362   - goto out;
363   - }
364   -
365   - idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
366   - + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
367   - page = find_get_page(mapping, idx);
368   - if (!page) {
369   - /* charge the fs quota first */
370   - if (hugetlb_get_quota(mapping)) {
371   - ret = -ENOMEM;
372   - goto out;
373   - }
374   - page = alloc_huge_page();
375   - if (!page) {
376   - hugetlb_put_quota(mapping);
377   - ret = -ENOMEM;
378   - goto out;
379   - }
380   - ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
381   - if (! ret) {
382   - unlock_page(page);
383   - } else {
384   - hugetlb_put_quota(mapping);
385   - free_huge_page(page);
386   - goto out;
387   - }
388   - }
389   - spin_lock(&mm->page_table_lock);
390   - add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
391   - set_huge_pte_at(mm, addr, pte, make_huge_pte(vma, page));
392   - spin_unlock(&mm->page_table_lock);
  366 + err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
  367 + if (err) {
  368 + put_page(page);
  369 + hugetlb_put_quota(mapping);
  370 + if (err == -EEXIST)
  371 + goto retry;
  372 + page = NULL;
393 373 }
394 374 out:
395   - return ret;
  375 + return page;
396 376 }
397 377  
398   -/*
399   - * On ia64 at least, it is possible to receive a hugetlb fault from a
400   - * stale zero entry left in the TLB from earlier hardware prefetching.
401   - * Low-level arch code should already have flushed the stale entry as
402   - * part of its fault handling, but we do need to accept this minor fault
403   - * and return successfully. Whereas the "normal" case is that this is
404   - * an access to a hugetlb page which has been truncated off since mmap.
405   - */
406 378 int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
407 379 unsigned long address, int write_access)
408 380 {
409 381 int ret = VM_FAULT_SIGBUS;
  382 + unsigned long idx;
  383 + unsigned long size;
410 384 pte_t *pte;
  385 + struct page *page;
  386 + struct address_space *mapping;
411 387  
  388 + pte = huge_pte_alloc(mm, address);
  389 + if (!pte)
  390 + goto out;
  391 +
  392 + mapping = vma->vm_file->f_mapping;
  393 + idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
  394 + + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
  395 +
  396 + /*
  397 + * Use page lock to guard against racing truncation
  398 + * before we get page_table_lock.
  399 + */
  400 + page = find_lock_huge_page(mapping, idx);
  401 + if (!page)
  402 + goto out;
  403 +
412 404 spin_lock(&mm->page_table_lock);
413   - pte = huge_pte_offset(mm, address);
414   - if (pte && !pte_none(*pte))
415   - ret = VM_FAULT_MINOR;
  405 + size = i_size_read(mapping->host) >> HPAGE_SHIFT;
  406 + if (idx >= size)
  407 + goto backout;
  408 +
  409 + ret = VM_FAULT_MINOR;
  410 + if (!pte_none(*pte))
  411 + goto backout;
  412 +
  413 + add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
  414 + set_huge_pte_at(mm, address, pte, make_huge_pte(vma, page));
416 415 spin_unlock(&mm->page_table_lock);
  416 + unlock_page(page);
  417 +out:
417 418 return ret;
  419 +
  420 +backout:
  421 + spin_unlock(&mm->page_table_lock);
  422 + hugetlb_put_quota(mapping);
  423 + unlock_page(page);
  424 + put_page(page);
  425 + goto out;
418 426 }
419 427  
420 428 int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
... ... @@ -424,34 +432,36 @@
424 432 unsigned long vpfn, vaddr = *position;
425 433 int remainder = *length;
426 434  
427   - BUG_ON(!is_vm_hugetlb_page(vma));
428   -
429 435 vpfn = vaddr/PAGE_SIZE;
430 436 spin_lock(&mm->page_table_lock);
431 437 while (vaddr < vma->vm_end && remainder) {
  438 + pte_t *pte;
  439 + struct page *page;
432 440  
433   - if (pages) {
434   - pte_t *pte;
435   - struct page *page;
  441 + /*
  442 + * Some archs (sparc64, sh*) have multiple pte_ts to
  443 + * each hugepage. We have to make sure we get the
  444 + * first, for the page indexing below to work.
  445 + */
  446 + pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
436 447  
437   - /* Some archs (sparc64, sh*) have multiple
438   - * pte_ts to each hugepage. We have to make
439   - * sure we get the first, for the page
440   - * indexing below to work. */
441   - pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
  448 + if (!pte || pte_none(*pte)) {
  449 + int ret;
442 450  
443   - /* the hugetlb file might have been truncated */
444   - if (!pte || pte_none(*pte)) {
445   - remainder = 0;
446   - if (!i)
447   - i = -EFAULT;
448   - break;
449   - }
  451 + spin_unlock(&mm->page_table_lock);
  452 + ret = hugetlb_fault(mm, vma, vaddr, 0);
  453 + spin_lock(&mm->page_table_lock);
  454 + if (ret == VM_FAULT_MINOR)
  455 + continue;
450 456  
451   - page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
  457 + remainder = 0;
  458 + if (!i)
  459 + i = -EFAULT;
  460 + break;
  461 + }
452 462  
453   - WARN_ON(!PageCompound(page));
454   -
  463 + if (pages) {
  464 + page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
455 465 get_page(page);
456 466 pages[i] = page;
457 467 }