Commit 1e8f889b10d8d2223105719e36ce45688fedbd59

Authored by David Gibson
Committed by Linus Torvalds
1 parent 86e5216f8d

[PATCH] Hugetlb: Copy on Write support

Implement copy-on-write support for hugetlb mappings so MAP_PRIVATE can be
supported.  This helps us to safely use hugetlb pages in many more
applications.  The patch makes the following changes.  If needed, I also have
it broken out according to the following paragraphs.

1. Add a pair of functions to set/clear write access on huge ptes.  The
   writable check in make_huge_pte is moved out to the caller for use by COW
   later.

2. Hugetlb copy-on-write requires special case handling in the following
   situations:

   - copy_hugetlb_page_range() - Copied pages must be write protected so
     a COW fault will be triggered (if necessary) if those pages are written
     to.

   - find_or_alloc_huge_page() - Only MAP_SHARED pages are added to the
     page cache.  MAP_PRIVATE pages still need to be locked however.

3. Provide hugetlb_cow(), called from hugetlb_fault() and
   hugetlb_no_page(), which handles the COW fault by making the actual copy.

4. Remove the check in hugetlbfs_file_mmap() so that MAP_PRIVATE mmaps
   will be allowed.  Make MAP_HUGETLB exempt from the deprecated VM_RESERVED
   mapping check.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Adam Litke <agl@us.ibm.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Cc: "Seth, Rohit" <rohit.seth@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

Showing 2 changed files with 108 additions and 22 deletions Side-by-side Diff

fs/hugetlbfs/inode.c
... ... @@ -100,9 +100,6 @@
100 100 loff_t len, vma_len;
101 101 int ret;
102 102  
103   - if ((vma->vm_flags & (VM_MAYSHARE | VM_WRITE)) == VM_WRITE)
104   - return -EINVAL;
105   -
106 103 if (vma->vm_pgoff & (HPAGE_SIZE / PAGE_SIZE - 1))
107 104 return -EINVAL;
108 105  
... ... @@ -261,11 +261,12 @@
261 261 .nopage = hugetlb_nopage,
262 262 };
263 263  
264   -static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page)
  264 +static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
  265 + int writable)
265 266 {
266 267 pte_t entry;
267 268  
268   - if (vma->vm_flags & VM_WRITE) {
  269 + if (writable) {
269 270 entry =
270 271 pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
271 272 } else {
272 273  
273 274  
... ... @@ -277,13 +278,28 @@
277 278 return entry;
278 279 }
279 280  
  281 +static void set_huge_ptep_writable(struct vm_area_struct *vma,
  282 + unsigned long address, pte_t *ptep)
  283 +{
  284 + pte_t entry;
  285 +
  286 + entry = pte_mkwrite(pte_mkdirty(*ptep));
  287 + ptep_set_access_flags(vma, address, ptep, entry, 1);
  288 + update_mmu_cache(vma, address, entry);
  289 + lazy_mmu_prot_update(entry);
  290 +}
  291 +
  292 +
280 293 int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
281 294 struct vm_area_struct *vma)
282 295 {
283 296 pte_t *src_pte, *dst_pte, entry;
284 297 struct page *ptepage;
285 298 unsigned long addr;
  299 + int cow;
286 300  
  301 + cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
  302 +
287 303 for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
288 304 src_pte = huge_pte_offset(src, addr);
289 305 if (!src_pte)
... ... @@ -294,6 +310,8 @@
294 310 spin_lock(&dst->page_table_lock);
295 311 spin_lock(&src->page_table_lock);
296 312 if (!pte_none(*src_pte)) {
  313 + if (cow)
  314 + ptep_set_wrprotect(src, addr, src_pte);
297 315 entry = *src_pte;
298 316 ptepage = pte_page(entry);
299 317 get_page(ptepage);
... ... @@ -346,7 +364,7 @@
346 364 }
347 365  
348 366 static struct page *find_or_alloc_huge_page(struct address_space *mapping,
349   - unsigned long idx)
  367 + unsigned long idx, int shared)
350 368 {
351 369 struct page *page;
352 370 int err;
353 371  
354 372  
355 373  
... ... @@ -364,26 +382,80 @@
364 382 goto out;
365 383 }
366 384  
367   - err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
368   - if (err) {
369   - put_page(page);
370   - hugetlb_put_quota(mapping);
371   - if (err == -EEXIST)
372   - goto retry;
373   - page = NULL;
  385 + if (shared) {
  386 + err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
  387 + if (err) {
  388 + put_page(page);
  389 + hugetlb_put_quota(mapping);
  390 + if (err == -EEXIST)
  391 + goto retry;
  392 + page = NULL;
  393 + }
  394 + } else {
  395 + /* Caller expects a locked page */
  396 + lock_page(page);
374 397 }
375 398 out:
376 399 return page;
377 400 }
378 401  
  402 +static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
  403 + unsigned long address, pte_t *ptep, pte_t pte)
  404 +{
  405 + struct page *old_page, *new_page;
  406 + int i, avoidcopy;
  407 +
  408 + old_page = pte_page(pte);
  409 +
  410 + /* If no-one else is actually using this page, avoid the copy
  411 + * and just make the page writable */
  412 + avoidcopy = (page_count(old_page) == 1);
  413 + if (avoidcopy) {
  414 + set_huge_ptep_writable(vma, address, ptep);
  415 + return VM_FAULT_MINOR;
  416 + }
  417 +
  418 + page_cache_get(old_page);
  419 + new_page = alloc_huge_page();
  420 +
  421 + if (!new_page) {
  422 + page_cache_release(old_page);
  423 +
  424 + /* Logically this is OOM, not a SIGBUS, but an OOM
  425 + * could cause the kernel to go killing other
  426 + * processes which won't help the hugepage situation
  427 + * at all (?) */
  428 + return VM_FAULT_SIGBUS;
  429 + }
  430 +
  431 + spin_unlock(&mm->page_table_lock);
  432 + for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++)
  433 + copy_user_highpage(new_page + i, old_page + i,
  434 + address + i*PAGE_SIZE);
  435 + spin_lock(&mm->page_table_lock);
  436 +
  437 + ptep = huge_pte_offset(mm, address & HPAGE_MASK);
  438 + if (likely(pte_same(*ptep, pte))) {
  439 + /* Break COW */
  440 + set_huge_pte_at(mm, address, ptep,
  441 + make_huge_pte(vma, new_page, 1));
  442 + /* Make the old page be freed below */
  443 + new_page = old_page;
  444 + }
  445 + page_cache_release(new_page);
  446 + page_cache_release(old_page);
  447 + return VM_FAULT_MINOR;
  448 +}
  449 +
379 450 int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
380   - unsigned long address, pte_t *ptep)
  451 + unsigned long address, pte_t *ptep, int write_access)
381 452 {
382 453 int ret = VM_FAULT_SIGBUS;
383 454 unsigned long idx;
384 455 unsigned long size;
385 456 struct page *page;
386 457 struct address_space *mapping;
  458 + pte_t new_pte;
387 459  
388 460 mapping = vma->vm_file->f_mapping;
389 461 idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
390 462  
... ... @@ -393,10 +465,13 @@
393 465 * Use page lock to guard against racing truncation
394 466 * before we get page_table_lock.
395 467 */
396   - page = find_or_alloc_huge_page(mapping, idx);
  468 + page = find_or_alloc_huge_page(mapping, idx,
  469 + vma->vm_flags & VM_SHARED);
397 470 if (!page)
398 471 goto out;
399 472  
  473 + BUG_ON(!PageLocked(page));
  474 +
400 475 spin_lock(&mm->page_table_lock);
401 476 size = i_size_read(mapping->host) >> HPAGE_SHIFT;
402 477 if (idx >= size)
... ... @@ -407,7 +482,15 @@
407 482 goto backout;
408 483  
409 484 add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
410   - set_huge_pte_at(mm, address, ptep, make_huge_pte(vma, page));
  485 + new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
  486 + && (vma->vm_flags & VM_SHARED)));
  487 + set_huge_pte_at(mm, address, ptep, new_pte);
  488 +
  489 + if (write_access && !(vma->vm_flags & VM_SHARED)) {
  490 + /* Optimization, do the COW without a second fault */
  491 + ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
  492 + }
  493 +
411 494 spin_unlock(&mm->page_table_lock);
412 495 unlock_page(page);
413 496 out:
... ... @@ -426,6 +509,7 @@
426 509 {
427 510 pte_t *ptep;
428 511 pte_t entry;
  512 + int ret;
429 513  
430 514 ptep = huge_pte_alloc(mm, address);
431 515 if (!ptep)
432 516  
... ... @@ -433,13 +517,18 @@
433 517  
434 518 entry = *ptep;
435 519 if (pte_none(entry))
436   - return hugetlb_no_page(mm, vma, address, ptep);
  520 + return hugetlb_no_page(mm, vma, address, ptep, write_access);
437 521  
438   - /*
439   - * We could get here if another thread instantiated the pte
440   - * before the test above.
441   - */
442   - return VM_FAULT_MINOR;
  522 + ret = VM_FAULT_MINOR;
  523 +
  524 + spin_lock(&mm->page_table_lock);
  525 + /* Check for a racing update before calling hugetlb_cow */
  526 + if (likely(pte_same(entry, *ptep)))
  527 + if (write_access && !pte_write(entry))
  528 + ret = hugetlb_cow(mm, vma, address, ptep, entry);
  529 + spin_unlock(&mm->page_table_lock);
  530 +
  531 + return ret;
443 532 }
444 533  
445 534 int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,