Commit 9d8cebd4bcd7c3878462fdfda34bbcdeb4df7ef4

Authored by KOSAKI Motohiro
Committed by Linus Torvalds
1 parent 93e4a89a8c

mm: fix mbind vma merge problem

Strangely, the current mbind() does not merge a vma with its neighboring vmas
even when a merge is possible. Unfortunately, having many vmas can reduce performance.

This patch fixes it.

    reproducer program
    ----------------------------------------------------------------
     #include <numaif.h>
     #include <numa.h>
     #include <sys/mman.h>
     #include <stdio.h>
     #include <unistd.h>
     #include <stdlib.h>
     #include <string.h>

    static unsigned long pagesize;

    /*
     * Reproducer for the missing vma merge in mbind().
     *
     * Maps three anonymous pages, binds only the middle page to the nodes
     * selected with "-n <node>" (node 0 when no -n option is given), then
     * resets the whole range to MPOL_DEFAULT and dumps /proc/<pid>/maps so
     * the number of vmas covering the range can be inspected.
     *
     * Returns 0 on success; exits with status 1 when NUMA is unavailable or
     * when mmap()/mbind() fails.
     */
    int main(int argc, char** argv)
    {
    	void* addr;
    	int ch;
    	int node;
    	struct bitmask *nmask;
    	int err;
    	int node_set = 0;
    	char buf[128];

    	/* libnuma must be probed before any other libnuma call is made */
    	if (numa_available() < 0) {
    		fprintf(stderr, "NUMA is not available\n");
    		exit(1);
    	}
    	nmask = numa_allocate_nodemask();

    	while ((ch = getopt(argc, argv, "n:")) != -1){
    		switch (ch){
    		case 'n':
    			node = strtol(optarg, NULL, 0);
    			numa_bitmask_setbit(nmask, node);
    			node_set = 1;
    			break;
    		default:
    			;
    		}
    	}

    	/* fall back to node 0 when the user selected no node */
    	if (!node_set)
    		numa_bitmask_setbit(nmask, 0);

    	pagesize = getpagesize();

    	/* anonymous mappings should pass fd = -1 for portability */
    	addr = mmap(NULL, pagesize*3, PROT_READ|PROT_WRITE,
    		    MAP_ANON|MAP_PRIVATE, -1, 0);
    	if (addr == MAP_FAILED) {
    		perror("mmap ");
    		exit(1);
    	}

    	fprintf(stderr, "pid = %d \n" "addr = %p\n", getpid(), addr);

    	/* touch every page so that all three pages are populated */
    	memset(addr, 0, pagesize*3);

    	/* first mbind: bind only the middle page, splitting the vma in three */
    	err = mbind((char *)addr + pagesize, pagesize, MPOL_BIND, nmask->maskp,
    		    nmask->size, MPOL_MF_MOVE_ALL);
    	if (err) {
    		perror("mbind1 ");
    		exit(1);
    	}

    	/* second mbind: reset the whole range; the vmas should merge again */
    	err = mbind(addr, pagesize*3, MPOL_DEFAULT, NULL, 0, 0);
    	if (err) {
    		perror("mbind2 ");
    		exit(1);
    	}

    	/* show the resulting vma layout of this process */
    	snprintf(buf, sizeof(buf), "cat /proc/%d/maps", getpid());
    	system(buf);

    	numa_bitmask_free(nmask);

    	return 0;
    }
    ----------------------------------------------------------------

result without this patch

	addr = 0x7fe26ef09000
	[snip]
	7fe26ef09000-7fe26ef0a000 rw-p 00000000 00:00 0
	7fe26ef0a000-7fe26ef0b000 rw-p 00000000 00:00 0
	7fe26ef0b000-7fe26ef0c000 rw-p 00000000 00:00 0
	7fe26ef0c000-7fe26ef0d000 rw-p 00000000 00:00 0

	=> The range 0x7fe26ef09000-0x7fe26ef0c000 is split across three vmas.

result with this patch

	addr = 0x7fc9ebc76000
	[snip]
	7fc9ebc76000-7fc9ebc7a000 rw-p 00000000 00:00 0
	7fffbe690000-7fffbe6a5000 rw-p 00000000	00:00 0	[stack]

	=> The range 0x7fc9ebc76000-0x7fc9ebc7a000 is covered by only one vma.

[minchan.kim@gmail.com: fix file offset passed to vma_merge()]
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Reviewed-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Minchan Kim <minchan.kim@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 1 changed file with 39 additions and 13 deletions Side-by-side Diff

... ... @@ -563,24 +563,50 @@
563 563 }
564 564  
565 565 /* Step 2: apply policy to a range and do splits. */
566   -static int mbind_range(struct vm_area_struct *vma, unsigned long start,
567   - unsigned long end, struct mempolicy *new)
  566 +static int mbind_range(struct mm_struct *mm, unsigned long start,
  567 + unsigned long end, struct mempolicy *new_pol)
568 568 {
569 569 struct vm_area_struct *next;
570   - int err;
  570 + struct vm_area_struct *prev;
  571 + struct vm_area_struct *vma;
  572 + int err = 0;
  573 + pgoff_t pgoff;
  574 + unsigned long vmstart;
  575 + unsigned long vmend;
571 576  
572   - err = 0;
573   - for (; vma && vma->vm_start < end; vma = next) {
  577 + vma = find_vma_prev(mm, start, &prev);
  578 + if (!vma || vma->vm_start > start)
  579 + return -EFAULT;
  580 +
  581 + for (; vma && vma->vm_start < end; prev = vma, vma = next) {
574 582 next = vma->vm_next;
575   - if (vma->vm_start < start)
576   - err = split_vma(vma->vm_mm, vma, start, 1);
577   - if (!err && vma->vm_end > end)
578   - err = split_vma(vma->vm_mm, vma, end, 0);
579   - if (!err)
580   - err = policy_vma(vma, new);
  583 + vmstart = max(start, vma->vm_start);
  584 + vmend = min(end, vma->vm_end);
  585 +
  586 + pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
  587 + prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
  588 + vma->anon_vma, vma->vm_file, pgoff, new_pol);
  589 + if (prev) {
  590 + vma = prev;
  591 + next = vma->vm_next;
  592 + continue;
  593 + }
  594 + if (vma->vm_start != vmstart) {
  595 + err = split_vma(vma->vm_mm, vma, vmstart, 1);
  596 + if (err)
  597 + goto out;
  598 + }
  599 + if (vma->vm_end != vmend) {
  600 + err = split_vma(vma->vm_mm, vma, vmend, 0);
  601 + if (err)
  602 + goto out;
  603 + }
  604 + err = policy_vma(vma, new_pol);
581 605 if (err)
582   - break;
  606 + goto out;
583 607 }
  608 +
  609 + out:
584 610 return err;
585 611 }
586 612  
... ... @@ -1047,7 +1073,7 @@
1047 1073 if (!IS_ERR(vma)) {
1048 1074 int nr_failed = 0;
1049 1075  
1050   - err = mbind_range(vma, start, end, new);
  1076 + err = mbind_range(mm, start, end, new);
1051 1077  
1052 1078 if (!list_empty(&pagelist))
1053 1079 nr_failed = migrate_pages(&pagelist, new_vma_page,