Commit 1e8f889b10d8d2223105719e36ce45688fedbd59
Committed by Linus Torvalds
1 parent 86e5216f8d
Exists in master and in 4 other branches
[PATCH] Hugetlb: Copy on Write support
Implement copy-on-write support for hugetlb mappings so MAP_PRIVATE can be supported. This helps us to safely use hugetlb pages in many more applications. The patch makes the following changes. If needed, I also have it broken out according to the following paragraphs. 1. Add a pair of functions to set/clear write access on huge ptes. The writable check in make_huge_pte is moved out to the caller for use by COW later. 2. Hugetlb copy-on-write requires special case handling in the following situations: - copy_hugetlb_page_range() - Copied pages must be write protected so a COW fault will be triggered (if necessary) if those pages are written to. - find_or_alloc_huge_page() - Only MAP_SHARED pages are added to the page cache. MAP_PRIVATE pages still need to be locked however. 3. Provide hugetlb_cow() and calls from hugetlb_fault() and hugetlb_no_page() which handles the COW fault by making the actual copy. 4. Remove the check in hugetlbfs_file_map() so that MAP_PRIVATE mmaps will be allowed. Make MAP_HUGETLB exempt from the depricated VM_RESERVED mapping check. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Adam Litke <agl@us.ibm.com> Cc: William Lee Irwin III <wli@holomorphy.com> Cc: "Seth, Rohit" <rohit.seth@intel.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Showing 2 changed files with 108 additions and 22 deletions
fs/hugetlbfs/inode.c
mm/hugetlb.c
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -261,11 +261,12 @@
 	.nopage = hugetlb_nopage,
 };
 
-static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page)
+static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
+				int writable)
 {
 	pte_t entry;
 
-	if (vma->vm_flags & VM_WRITE) {
+	if (writable) {
 		entry =
 		    pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
 	} else {
@@ -277,13 +278,28 @@
 	return entry;
 }
 
+static void set_huge_ptep_writable(struct vm_area_struct *vma,
+				   unsigned long address, pte_t *ptep)
+{
+	pte_t entry;
+
+	entry = pte_mkwrite(pte_mkdirty(*ptep));
+	ptep_set_access_flags(vma, address, ptep, entry, 1);
+	update_mmu_cache(vma, address, entry);
+	lazy_mmu_prot_update(entry);
+}
+
+
 int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 			    struct vm_area_struct *vma)
 {
 	pte_t *src_pte, *dst_pte, entry;
 	struct page *ptepage;
 	unsigned long addr;
+	int cow;
 
+	cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
+
 	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
 		src_pte = huge_pte_offset(src, addr);
 		if (!src_pte)
@@ -294,6 +310,8 @@
 		spin_lock(&dst->page_table_lock);
 		spin_lock(&src->page_table_lock);
 		if (!pte_none(*src_pte)) {
+			if (cow)
+				ptep_set_wrprotect(src, addr, src_pte);
 			entry = *src_pte;
 			ptepage = pte_page(entry);
 			get_page(ptepage);
@@ -346,7 +364,7 @@
 }
 
 static struct page *find_or_alloc_huge_page(struct address_space *mapping,
-				unsigned long idx)
+				unsigned long idx, int shared)
 {
 	struct page *page;
 	int err;
@@ -364,26 +382,80 @@
 		goto out;
 	}
 
-	err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
-	if (err) {
-		put_page(page);
-		hugetlb_put_quota(mapping);
-		if (err == -EEXIST)
-			goto retry;
-		page = NULL;
+	if (shared) {
+		err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
+		if (err) {
+			put_page(page);
+			hugetlb_put_quota(mapping);
+			if (err == -EEXIST)
+				goto retry;
+			page = NULL;
+		}
+	} else {
+		/* Caller expects a locked page */
+		lock_page(page);
 	}
 out:
 	return page;
 }
 
+static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
+			unsigned long address, pte_t *ptep, pte_t pte)
+{
+	struct page *old_page, *new_page;
+	int i, avoidcopy;
+
+	old_page = pte_page(pte);
+
+	/* If no-one else is actually using this page, avoid the copy
+	 * and just make the page writable */
+	avoidcopy = (page_count(old_page) == 1);
+	if (avoidcopy) {
+		set_huge_ptep_writable(vma, address, ptep);
+		return VM_FAULT_MINOR;
+	}
+
+	page_cache_get(old_page);
+	new_page = alloc_huge_page();
+
+	if (!new_page) {
+		page_cache_release(old_page);
+
+		/* Logically this is OOM, not a SIGBUS, but an OOM
+		 * could cause the kernel to go killing other
+		 * processes which won't help the hugepage situation
+		 * at all (?) */
+		return VM_FAULT_SIGBUS;
+	}
+
+	spin_unlock(&mm->page_table_lock);
+	for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++)
+		copy_user_highpage(new_page + i, old_page + i,
+				   address + i*PAGE_SIZE);
+	spin_lock(&mm->page_table_lock);
+
+	ptep = huge_pte_offset(mm, address & HPAGE_MASK);
+	if (likely(pte_same(*ptep, pte))) {
+		/* Break COW */
+		set_huge_pte_at(mm, address, ptep,
+				make_huge_pte(vma, new_page, 1));
+		/* Make the old page be freed below */
+		new_page = old_page;
+	}
+	page_cache_release(new_page);
+	page_cache_release(old_page);
+	return VM_FAULT_MINOR;
+}
+
 int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
-			unsigned long address, pte_t *ptep)
+			unsigned long address, pte_t *ptep, int write_access)
 {
 	int ret = VM_FAULT_SIGBUS;
 	unsigned long idx;
 	unsigned long size;
 	struct page *page;
 	struct address_space *mapping;
+	pte_t new_pte;
 
 	mapping = vma->vm_file->f_mapping;
 	idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
@@ -393,10 +465,13 @@
 	 * Use page lock to guard against racing truncation
 	 * before we get page_table_lock.
 	 */
-	page = find_or_alloc_huge_page(mapping, idx);
+	page = find_or_alloc_huge_page(mapping, idx,
+			vma->vm_flags & VM_SHARED);
 	if (!page)
 		goto out;
 
+	BUG_ON(!PageLocked(page));
+
 	spin_lock(&mm->page_table_lock);
 	size = i_size_read(mapping->host) >> HPAGE_SHIFT;
 	if (idx >= size)
@@ -407,7 +482,15 @@
 		goto backout;
 
 	add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
-	set_huge_pte_at(mm, address, ptep, make_huge_pte(vma, page));
+	new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
+				&& (vma->vm_flags & VM_SHARED)));
+	set_huge_pte_at(mm, address, ptep, new_pte);
+
+	if (write_access && !(vma->vm_flags & VM_SHARED)) {
+		/* Optimization, do the COW without a second fault */
+		ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
+	}
+
 	spin_unlock(&mm->page_table_lock);
 	unlock_page(page);
 out:
@@ -426,6 +509,7 @@
 {
 	pte_t *ptep;
 	pte_t entry;
+	int ret;
 
 	ptep = huge_pte_alloc(mm, address);
 	if (!ptep)
@@ -433,13 +517,18 @@
 
 	entry = *ptep;
 	if (pte_none(entry))
-		return hugetlb_no_page(mm, vma, address, ptep);
+		return hugetlb_no_page(mm, vma, address, ptep, write_access);
 
-	/*
-	 * We could get here if another thread instantiated the pte
-	 * before the test above.
-	 */
-	return VM_FAULT_MINOR;
+	ret = VM_FAULT_MINOR;
+
+	spin_lock(&mm->page_table_lock);
+	/* Check for a racing update before calling hugetlb_cow */
+	if (likely(pte_same(entry, *ptep)))
+		if (write_access && !pte_write(entry))
+			ret = hugetlb_cow(mm, vma, address, ptep, entry);
+	spin_unlock(&mm->page_table_lock);
+
+	return ret;
 }
 
 int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,