13 Jan, 2012

1 commit

  • This patch adds a lightweight sync migrate mode, MIGRATE_SYNC_LIGHT, that
    avoids writing back pages to backing storage. Async compaction maps to
    MIGRATE_ASYNC while sync compaction maps to MIGRATE_SYNC_LIGHT.
    For other migrate_pages users such as memory hotplug, MIGRATE_SYNC is
    used.

    This avoids sync compaction stalling for an excessive length of time,
    particularly when copying files to a USB stick where there might be a
    large number of dirty pages backed by a filesystem that does not support
    ->writepages.
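
    For reference, the three modes introduced by this work form a small enum
    (a sketch of the idea; the comments paraphrase the intent rather than
    quote the patch):

    enum migrate_mode {
            MIGRATE_ASYNC,          /* never block */
            MIGRATE_SYNC_LIGHT,     /* may block, but avoid ->writepage */
            MIGRATE_SYNC,           /* may block and write back pages */
    };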

    [aarcange@redhat.com: This patch is heavily based on Andrea's work]
    [akpm@linux-foundation.org: fix fs/nfs/write.c build]
    [akpm@linux-foundation.org: fix fs/btrfs/disk-io.c build]
    Signed-off-by: Mel Gorman
    Reviewed-by: Rik van Riel
    Cc: Andrea Arcangeli
    Cc: Minchan Kim
    Cc: Dave Jones
    Cc: Jan Kara
    Cc: Andy Isaacson
    Cc: Nai Xia
    Cc: Johannes Weiner
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Mel Gorman
     

30 Dec, 2011

1 commit

  • commit 8aacc9f550 ("mm/mempolicy.c: fix pgoff in mbind vma merge") was a
    slightly incorrect fix.

    Why? Consider the following case.

    1. map 4 pages of a file at offset 0

    [0123]

    2. map 2 pages just after the first mapping of the same file but with
    page offset 2

    [0123][23]

    3. mbind() 2 pages from the first mapping at offset 2.
    mbind_range() should treat the new vma as

    [0123][23]
      |23|
      mbind vma

    but it does

    [0123][23]
      |01|
      mbind vma

    Oops. It then performs a wrong vma merge and split ([01][0123] or similar).

    This patch fixes it.
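
    A minimal sketch of the shape of the fix (not the verbatim diff): the
    pgoff handed to vma_merge() must be computed relative to the start of the
    sub-range being rebound, not taken from the vma unadjusted:

    /* sketch: inside mbind_range(), before calling vma_merge() */
    pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
    prev = vma_merge(mm, prev, start, end, vma->vm_flags,
                     vma->anon_vma, vma->vm_file, pgoff, new_pol);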

    [testcase]
    test result - before the patch

    case4: 126: test failed. expect '2,4', actual '2,2,2'
    case5: passed
    case6: passed
    case7: passed
    case8: passed
    case_n: 246: test failed. expect '4,2', actual '1,4'

    ------------[ cut here ]------------
    kernel BUG at mm/filemap.c:135!
    invalid opcode: 0000 [#4] SMP DEBUG_PAGEALLOC

    (snip long BUG_ON messages)

    test result - after the patch

    case4: passed
    case5: passed
    case6: passed
    case7: passed
    case8: passed
    case_n: passed

    source: mbind_vma_test.c
    ============================================================
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/mman.h>
    #include <numa.h>
    #include <numaif.h>

    static unsigned long pagesize;
    void* mmap_addr;
    struct bitmask *nmask;
    char buf[1024];
    FILE *file;
    char retbuf[10240] = "";
    int mapped_fd;

    char *rubysrc = "ruby -e '\
      pid = %d; \
      vstart = 0x%llx; \
      vend = 0x%llx; \
      s = `pmap -q #{pid}`; \
      rary = []; \
      s.each_line {|line|; \
        ary = line.split(\" \"); \
        addr = ary[0].to_i(16); \
        if ((vstart <= addr) && (addr < vend)) then \
          rary.push(ary[1].to_i()/4); \
        end; \
      }; \
      print rary.join(\",\"); \
    '";

    void init(void)
    {
            void* addr;
            char buf[128];

            nmask = numa_allocate_nodemask();
            numa_bitmask_setbit(nmask, 0);

            pagesize = getpagesize();

            sprintf(buf, "%s", "mbind_vma_XXXXXX");
            mapped_fd = mkstemp(buf);
            if (mapped_fd == -1)
                    perror("mkstemp "), exit(1);
            unlink(buf);

            if (lseek(mapped_fd, pagesize*8, SEEK_SET) < 0)
                    perror("lseek "), exit(1);
            if (write(mapped_fd, "\0", 1) < 0)
                    perror("write "), exit(1);

            addr = mmap(NULL, pagesize*8, PROT_NONE,
                        MAP_SHARED, mapped_fd, 0);
            if (addr == MAP_FAILED)
                    perror("mmap "), exit(1);

            if (mprotect(addr+pagesize, pagesize*6, PROT_READ|PROT_WRITE) < 0)
                    perror("mprotect "), exit(1);

            mmap_addr = addr + pagesize;

            /* make page populate */
            memset(mmap_addr, 0, pagesize*6);
    }

    void fin(void)
    {
            void* addr = mmap_addr - pagesize;
            munmap(addr, pagesize*8);

            memset(buf, 0, sizeof(buf));
            memset(retbuf, 0, sizeof(retbuf));
    }

    void mem_bind(int index, int len)
    {
            int err;

            err = mbind(mmap_addr+pagesize*index, pagesize*len,
                        MPOL_BIND, nmask->maskp, nmask->size, 0);
            if (err)
                    perror("mbind "), exit(err);
    }

    void mem_interleave(int index, int len)
    {
            int err;

            err = mbind(mmap_addr+pagesize*index, pagesize*len,
                        MPOL_INTERLEAVE, nmask->maskp, nmask->size, 0);
            if (err)
                    perror("mbind "), exit(err);
    }

    void mem_unbind(int index, int len)
    {
            int err;

            err = mbind(mmap_addr+pagesize*index, pagesize*len,
                        MPOL_DEFAULT, NULL, 0, 0);
            if (err)
                    perror("mbind "), exit(err);
    }

    void Assert(char *expected, char *value, char *name, int line)
    {
            if (strcmp(expected, value) == 0) {
                    fprintf(stderr, "%s: passed\n", name);
                    return;
            }
            else {
                    fprintf(stderr, "%s: %d: test failed. expect '%s', actual '%s'\n",
                            name, line,
                            expected, value);
                    // exit(1);
            }
    }

    /*
       AAAA
     PPPPPPNNNNNN
     might become
     PPNNNNNNNNNN
     case 4 below
    */
    void case4(void)
    {
            init();
            sprintf(buf, rubysrc, getpid(), mmap_addr, mmap_addr+pagesize*6);

            mem_bind(0, 4);
            mem_unbind(2, 2);

            file = popen(buf, "r");
            fread(retbuf, sizeof(retbuf), 1, file);
            Assert("2,4", retbuf, "case4", __LINE__);

            fin();
    }

    /*
       AAAA
     PPPPPPNNNNNN
     might become
     PPPPPPPPPPNN
     case 5 below
    */
    void case5(void)
    {
            init();
            sprintf(buf, rubysrc, getpid(), mmap_addr, mmap_addr+pagesize*6);

            mem_bind(0, 2);
            mem_bind(2, 2);

            file = popen(buf, "r");
            fread(retbuf, sizeof(retbuf), 1, file);
            Assert("4,2", retbuf, "case5", __LINE__);

            fin();
    }

    /*
       AAAA
     PPPPNNNNXXXX
     might become
     PPPPPPPPPPPP 6
    */
    void case6(void)
    {
            init();
            sprintf(buf, rubysrc, getpid(), mmap_addr, mmap_addr+pagesize*6);

            mem_bind(0, 2);
            mem_bind(4, 2);
            mem_bind(2, 2);

            file = popen(buf, "r");
            fread(retbuf, sizeof(retbuf), 1, file);
            Assert("6", retbuf, "case6", __LINE__);

            fin();
    }

    /*
       AAAA
     PPPPNNNNXXXX
     might become
     PPPPPPPPXXXX 7
    */
    void case7(void)
    {
            init();
            sprintf(buf, rubysrc, getpid(), mmap_addr, mmap_addr+pagesize*6);

            mem_bind(0, 2);
            mem_interleave(4, 2);
            mem_bind(2, 2);

            file = popen(buf, "r");
            fread(retbuf, sizeof(retbuf), 1, file);
            Assert("4,2", retbuf, "case7", __LINE__);

            fin();
    }

    /*
       AAAA
     PPPPNNNNXXXX
     might become
     PPPPNNNNNNNN 8
    */
    void case8(void)
    {
            init();
            sprintf(buf, rubysrc, getpid(), mmap_addr, mmap_addr+pagesize*6);

            mem_bind(0, 2);
            mem_interleave(4, 2);
            mem_interleave(2, 2);

            file = popen(buf, "r");
            fread(retbuf, sizeof(retbuf), 1, file);
            Assert("2,4", retbuf, "case8", __LINE__);

            fin();
    }

    void case_n(void)
    {
            init();
            sprintf(buf, rubysrc, getpid(), mmap_addr, mmap_addr+pagesize*6);

            /* make redundant mappings [0][1234][34][7] */
            mmap(mmap_addr + pagesize*4, pagesize*2, PROT_READ|PROT_WRITE,
                 MAP_FIXED|MAP_SHARED, mapped_fd, pagesize*3);

            /* Expect to do nothing. */
            mem_unbind(2, 2);

            file = popen(buf, "r");
            fread(retbuf, sizeof(retbuf), 1, file);
            Assert("4,2", retbuf, "case_n", __LINE__);

            fin();
    }

    int main(int argc, char** argv)
    {
            case4();
            case5();
            case6();
            case7();
            case8();
            case_n();

            return 0;
    }
    =============================================================

    Signed-off-by: KOSAKI Motohiro
    Acked-by: Johannes Weiner
    Cc: Minchan Kim
    Cc: Caspar Zhang
    Cc: KOSAKI Motohiro
    Cc: Christoph Lameter
    Cc: Hugh Dickins
    Cc: Mel Gorman
    Cc: Lee Schermerhorn
    Cc: [3.1.x]
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    KOSAKI Motohiro
     

07 Nov, 2011

1 commit

  • * 'modsplit-Oct31_2011' of git://git.kernel.org/pub/scm/linux/kernel/git/paulg/linux: (230 commits)
    Revert "tracing: Include module.h in define_trace.h"
    irq: don't put module.h into irq.h for tracking irqgen modules.
    bluetooth: macroize two small inlines to avoid module.h
    ip_vs.h: fix implicit use of module_get/module_put from module.h
    nf_conntrack.h: fix up fallout from implicit moduleparam.h presence
    include: replace linux/module.h with "struct module" wherever possible
    include: convert various register fcns to macros to avoid include chaining
    crypto.h: remove unused crypto_tfm_alg_modname() inline
    uwb.h: fix implicit use of asm/page.h for PAGE_SIZE
    pm_runtime.h: explicitly requires notifier.h
    linux/dmaengine.h: fix implicit use of bitmap.h and asm/page.h
    miscdevice.h: fix up implicit use of lists and types
    stop_machine.h: fix implicit use of smp.h for smp_processor_id
    of: fix implicit use of errno.h in include/linux/of.h
    of_platform.h: delete needless include
    acpi: remove module.h include from platform/aclinux.h
    miscdevice.h: delete unnecessary inclusion of module.h
    device_cgroup.h: delete needless include
    net: sch_generic remove redundant use of
    net: inet_timewait_sock doesnt need
    ...

    Fix up trivial conflicts (other header files, and removal of the ab3550 mfd driver) in
    - drivers/media/dvb/frontends/dibx000_common.c
    - drivers/media/video/{mt9m111.c,ov6650.c}
    - drivers/mfd/ab3550-core.c
    - include/linux/dmaengine.h

    Linus Torvalds
     

01 Nov, 2011

1 commit

  • Quiet the sparse noise:

    warning: symbol 'default_policy' was not declared. Should it be static?

    Signed-off-by: H Hartley Sweeten
    Cc: KOSAKI Motohiro
    Cc: Stephen Wilson
    Cc: Andrea Arcangeli
    Cc: Mel Gorman
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    H Hartley Sweeten
     

15 Sep, 2011

2 commits

  • When compiling mm/mempolicy.c with struct user copy checks, the following
    warning is shown:

    In file included from arch/x86/include/asm/uaccess.h:572,
                     from include/linux/uaccess.h:5,
                     from include/linux/highmem.h:7,
                     from include/linux/pagemap.h:10,
                     from include/linux/mempolicy.h:70,
                     from mm/mempolicy.c:68:
    In function `copy_from_user',
        inlined from `compat_sys_get_mempolicy' at mm/mempolicy.c:1415:
    arch/x86/include/asm/uaccess_64.h:64: warning: call to `copy_from_user_overflow' declared with attribute warning: copy_from_user() buffer size is not provably correct
      LD      mm/built-in.o

    Fix this by passing the correct buffer size value.
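
    A sketch of what "correct buffer size" means here, assuming the fix lands
    in compat_sys_get_mempolicy(): the copy is capped at the size of the
    on-stack bitmap, which the checker can prove, instead of the runtime
    alloc_size, which it cannot:

    /* sketch: bm is DECLARE_BITMAP(bm, MAX_NUMNODES) on the stack */
    unsigned long copy_size;

    copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
    err = copy_from_user(bm, nm, copy_size);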

    Signed-off-by: KAMEZAWA Hiroyuki
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    KAMEZAWA Hiroyuki
     
  • commit 9d8cebd4bcd7 ("mm: fix mbind vma merge problem") didn't really fix
    the mbind vma merge problem, due to a wrong pgoff value being passed to
    vma_merge(), which made vma_merge() always return NULL.

    Before the patch was applied, we got a result like:

    addr = 0x7fa58f00c000
    [snip]
    7fa58f00c000-7fa58f00d000 rw-p 00000000 00:00 0
    7fa58f00d000-7fa58f00e000 rw-p 00000000 00:00 0
    7fa58f00e000-7fa58f00f000 rw-p 00000000 00:00 0

    here, for 7fa58f00c000->7fa58f00f000, we get 3 VMAs which were expected to
    be merged, as described in commit 9d8cebd.

    Re-testing the patched kernel with the reproducer provided in commit
    9d8cebd, we get the correct result:

    addr = 0x7ffa5aaa2000
    [snip]
    7ffa5aaa2000-7ffa5aaa6000 rw-p 00000000 00:00 0
    7fffd556f000-7fffd5584000 rw-p 00000000 00:00 0 [stack]

    Signed-off-by: Caspar Zhang
    Cc: KOSAKI Motohiro
    Cc: Christoph Lameter
    Cc: Hugh Dickins
    Cc: Mel Gorman
    Cc: Lee Schermerhorn
    Cc: Minchan Kim
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Caspar Zhang
     

27 Jul, 2011

1 commit

  • [ This patch was already accepted as commit 0ac0c0d0f837 but later
    reverted (commit 35926ff5fba8) because it introduced an arch-specific
    __node_random which was defined only for x86, so it broke other
    archs. This is a followup without any arch-specific code. Other than
    that there are no functional changes.]

    Some workloads that create a large number of small files tend to assign
    too many pages to node 0 (multi-node systems). Part of the reason is
    that the rotor (in cpuset_mem_spread_node()) used to assign nodes starts
    at node 0 for newly created tasks.

    This patch changes the rotor to be initialized to a random node number
    of the cpuset.
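
    One plausible arch-independent shape for such a helper (a sketch, assuming
    a node_random() that picks a random set bit of the allowed nodemask; names
    and details may differ from the actual patch):

    int node_random(const nodemask_t *maskp)
    {
            int w, bit = -1;

            w = nodes_weight(*maskp);
            if (w)
                    bit = bitmap_ord_to_pos(maskp->bits,
                                            get_random_int() % w,
                                            MAX_NUMNODES);
            return bit;
    }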

    [akpm@linux-foundation.org: fix layout]
    [Lee.Schermerhorn@hp.com: Define stub numa_random() for !NUMA configuration]
    [mhocko@suse.cz: Make it arch independent]
    [akpm@linux-foundation.org: fix CONFIG_NUMA=y, MAX_NUMNODES>1 build]
    Signed-off-by: Jack Steiner
    Signed-off-by: Lee Schermerhorn
    Signed-off-by: Michal Hocko
    Reviewed-by: KOSAKI Motohiro
    Cc: Christoph Lameter
    Cc: Pekka Enberg
    Cc: Paul Menage
    Cc: Jack Steiner
    Cc: Robin Holt
    Cc: David Rientjes
    Cc: Christoph Lameter
    Cc: David Rientjes
    Cc: Jack Steiner
    Cc: KOSAKI Motohiro
    Cc: Lee Schermerhorn
    Cc: Michal Hocko
    Cc: Paul Menage
    Cc: Pekka Enberg
    Cc: Robin Holt
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Michal Hocko
     

25 May, 2011

6 commits

  • Moving show_numa_map() from mempolicy.c to task_mmu.c solves several
    issues.

    - Having the show() operation "miles away" from the corresponding
    seq_file iteration operations is a maintenance burden.

    - The need to export ad hoc info like struct proc_maps_private is
    eliminated.

    - The implementation of show_numa_map() can be improved in a simple
    manner by cooperating with the other seq_file operations (start,
    stop, etc) -- something that would be messy to do without this
    change.

    Signed-off-by: Stephen Wilson
    Reviewed-by: KOSAKI Motohiro
    Cc: Hugh Dickins
    Cc: David Rientjes
    Cc: Lee Schermerhorn
    Cc: Alexey Dobriyan
    Cc: Christoph Lameter
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Stephen Wilson
     
  • This function has been superseded by gather_hugetbl_stats() and is no
    longer needed.

    Signed-off-by: Stephen Wilson
    Reviewed-by: KOSAKI Motohiro
    Cc: Hugh Dickins
    Cc: David Rientjes
    Cc: Lee Schermerhorn
    Cc: Alexey Dobriyan
    Cc: Christoph Lameter
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Stephen Wilson
     
  • Improve the prototype of gather_stats() to take a struct numa_maps as
    argument instead of a generic void *. Update all callers to make the
    required type explicit.

    Since gather_stats() is not needed before its definition, and is scheduled
    to be moved out of mempolicy.c, the forward declaration is removed as well.
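
    The gist as a before/after sketch of the prototype (not the verbatim
    diff):

    /* before: the accumulator is an opaque pointer */
    static void gather_stats(struct page *page, void *private, int pte_dirty);

    /* after (sketch): the type is explicit for gather_stats() and callers */
    static void gather_stats(struct page *page, struct numa_maps *md,
                             int pte_dirty);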

    Signed-off-by: Stephen Wilson
    Reviewed-by: KOSAKI Motohiro
    Cc: Hugh Dickins
    Cc: David Rientjes
    Cc: Lee Schermerhorn
    Cc: Alexey Dobriyan
    Cc: Christoph Lameter
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Stephen Wilson
     
  • Mapping statistics in a NUMA environment are now computed using the
    generic walk_page_range() logic. Remove the old, equivalent functionality.

    Signed-off-by: Stephen Wilson
    Reviewed-by: KOSAKI Motohiro
    Cc: Hugh Dickins
    Cc: David Rientjes
    Cc: Lee Schermerhorn
    Cc: Alexey Dobriyan
    Cc: Christoph Lameter
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Stephen Wilson
     
  • Converting show_numa_map() to use the generic routine decouples the
    function from mempolicy.c, allowing it to be moved out of the mm subsystem
    and into fs/proc.

    Also, include KSM pages in /proc/pid/numa_maps statistics. The pagewalk
    logic implemented by check_pte_range() failed to account for such pages as
    they were not applicable to the page migration case.

    Signed-off-by: Stephen Wilson
    Reviewed-by: KOSAKI Motohiro
    Cc: Hugh Dickins
    Cc: David Rientjes
    Cc: Lee Schermerhorn
    Cc: Alexey Dobriyan
    Cc: Christoph Lameter
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Stephen Wilson
     
  • In commit 48fce3429d ("mempolicies: unexport get_vma_policy()")
    get_vma_policy() was marked static as all clients were local to
    mempolicy.c.

    However, the decision to generate /proc/pid/numa_maps in the numa memory
    policy code and outside the procfs subsystem introduces an artificial
    interdependency between the two systems. Exporting get_vma_policy() once
    again is the first step to clean up this interdependency.

    Signed-off-by: Stephen Wilson
    Reviewed-by: KOSAKI Motohiro
    Cc: Hugh Dickins
    Cc: David Rientjes
    Cc: Lee Schermerhorn
    Cc: Alexey Dobriyan
    Cc: Christoph Lameter
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Stephen Wilson
     

19 Mar, 2011

1 commit

  • * 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/jikos/trivial: (47 commits)
    doc: CONFIG_UNEVICTABLE_LRU doesn't exist anymore
    Update cpuset info & webiste for cgroups
    dcdbas: force SMI to happen when expected
    arch/arm/Kconfig: remove one to many l's in the word.
    asm-generic/user.h: Fix spelling in comment
    drm: fix printk typo 'sracth'
    Remove one to many n's in a word
    Documentation/filesystems/romfs.txt: fixing link to genromfs
    drivers:scsi Change printk typo initate -> initiate
    serial, pch uart: Remove duplicate inclusion of linux/pci.h header
    fs/eventpoll.c: fix spelling
    mm: Fix out-of-date comments which refers non-existent functions
    drm: Fix printk typo 'failled'
    coh901318.c: Change initate to initiate.
    mbox-db5500.c Change initate to initiate.
    edac: correct i82975x error-info reported
    edac: correct i82975x mci initialisation
    edac: correct commented info
    fs: update comments to point correct document
    target: remove duplicate include of target/target_core_device.h from drivers/target/target_core_hba.c
    ...

    Trivial conflict in fs/eventpoll.c (spelling vs addition)

    Linus Torvalds
     

05 Mar, 2011

2 commits

  • Pass down the correct node for a transparent hugepage allocation. Most
    callers continue to use the current node; however, khugepaged now uses the
    node of the first page to be collapsed instead.
    This ensures that khugepaged does not mess up local memory for an
    existing process which uses local policy.

    The choice of node is somewhat primitive currently: it just uses the
    node of the first page in the pmd range. An alternative would be to
    look at multiple pages and use the most popular node. I used the
    simplest variant for now which should work well enough for the case of
    all pages being on the same node.

    [akpm@linux-foundation.org: coding-style fixes]
    Acked-by: Andrea Arcangeli
    Signed-off-by: Andi Kleen
    Reviewed-by: KAMEZAWA Hiroyuki
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Andi Kleen
     
  • Currently alloc_pages_vma() always uses the local node as policy node for
    the LOCAL policy. Pass this node down as an argument instead.

    No behaviour change from this patch, but it will be needed for follow-ons.
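
    A before/after sketch of a call site (illustrative, not the verbatim
    diff):

    /* before: the policy node for local policy is resolved in the callee */
    page = alloc_pages_vma(gfp, order, vma, addr);

    /* after (sketch): the caller passes the node down explicitly */
    page = alloc_pages_vma(gfp, order, vma, addr, numa_node_id());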

    Acked-by: Andrea Arcangeli
    Signed-off-by: Andi Kleen
    Reviewed-by: KAMEZAWA Hiroyuki
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Andi Kleen
     

26 Feb, 2011

1 commit

  • The THP code didn't pass the correct interleaving shift to the memory
    policy code. Fix this here by adjusting for the order.
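
    The essence of the adjustment, sketched (not the verbatim diff): the
    interleave node must be chosen on boundaries of the allocation order, so
    the shift passed to interleave_nid() grows with the order:

    if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
            unsigned nid;

            /* was effectively PAGE_SHIFT, ignoring the THP order */
            nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
    }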

    Signed-off-by: Andi Kleen
    Reviewed-by: Christoph Lameter
    Acked-by: Andrea Arcangeli
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Andi Kleen
     

14 Jan, 2011

5 commits

  • It's mostly a matter of replacing alloc_pages with alloc_pages_vma after
    introducing alloc_pages_vma. khugepaged needs special handling as the
    allocation has to happen inside collapse_huge_page where the vma is known
    and an error has to be returned to the outer loop to sleep
    alloc_sleep_millisecs in case of failure. But it retains the more
    efficient logic of handling allocation failures in khugepaged in case of
    CONFIG_NUMA=n.

    Signed-off-by: Andrea Arcangeli
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Andrea Arcangeli
     
  • split_huge_page_pmd compat code. Each one of those would need to be
    expanded to hundreds of lines of complex code without a fully reliable
    split_huge_page_pmd design.

    Signed-off-by: Andrea Arcangeli
    Acked-by: Rik van Riel
    Acked-by: Mel Gorman
    Signed-off-by: Johannes Weiner
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Andrea Arcangeli
     
  • Today, tasklist_lock in migrate_pages doesn't protect anything.
    rcu_read_lock() provides enough protection for the pid hash walk.
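
    A sketch of the resulting pattern in sys_migrate_pages() (illustrative;
    error handling abridged):

    /* find_task_by_vpid() is RCU-safe; tasklist_lock is not needed */
    rcu_read_lock();
    task = pid ? find_task_by_vpid(pid) : current;
    if (!task) {
            rcu_read_unlock();
            return -ESRCH;
    }
    mm = get_task_mm(task);
    rcu_read_unlock();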

    Signed-off-by: KOSAKI Motohiro
    Reported-by: Peter Zijlstra
    Cc: Christoph Lameter
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    KOSAKI Motohiro
     
  • With the introduction of the boolean sync parameter, the API looks a
    little inconsistent as offlining is still an int. Convert offlining to a
    bool for the sake of being tidy.

    Signed-off-by: Mel Gorman
    Cc: Andrea Arcangeli
    Cc: KOSAKI Motohiro
    Cc: Rik van Riel
    Acked-by: Johannes Weiner
    Cc: Andy Whitcroft
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Mel Gorman
     
  • …ompaction in the faster path

    Migration synchronously waits for writeback if the initial pass fails.
    Callers of memory compaction do not necessarily want this behaviour if the
    caller is latency sensitive or expects that synchronous migration is not
    going to have a significantly better success rate.

    This patch adds a sync parameter to migrate_pages() allowing the caller to
    indicate if wait_on_page_writeback() is allowed within migration or not.
    For reclaim/compaction, try_to_compact_pages() is first called
    asynchronously, direct reclaim runs and then try_to_compact_pages() is
    called synchronously as there is a greater expectation that it'll succeed.
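
    The resulting signature, sketched (offlining is still an int at this
    point; the neighbouring commit above then converts it to bool):

    int migrate_pages(struct list_head *from, new_page_t get_new_page,
                      unsigned long private, int offlining, bool sync);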

    [akpm@linux-foundation.org: build/merge fix]
    Signed-off-by: Mel Gorman <mel@csn.ul.ie>
    Cc: Andrea Arcangeli <aarcange@redhat.com>
    Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
    Cc: Rik van Riel <riel@redhat.com>
    Acked-by: Johannes Weiner <hannes@cmpxchg.org>
    Cc: Andy Whitcroft <apw@shadowen.org>
    Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
    Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

    Mel Gorman
     

29 Oct, 2010

1 commit

  • When a node contains only HighMem memory, slab_node(MPOL_BIND)
    dereferences a NULL pointer.

    [ This code seems to go back all the way to commit 19770b32609b: "mm:
    filter based on a nodemask as well as a gfp_mask". Which was back in
    April 2008, and it got merged into 2.6.26. - Linus ]
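
    The shape of the fix, sketched (assuming it sits in the MPOL_BIND case of
    slab_node(); not the verbatim diff): first_zones_zonelist() can come back
    empty-handed when the node's only memory is HighMem, so the zone must be
    checked before use:

    (void)first_zones_zonelist(zonelist, highest_zoneidx,
                               &policy->v.nodes, &zone);
    return zone ? zone->node : numa_node_id();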

    Signed-off-by: Eric Dumazet
    Cc: Mel Gorman
    Cc: Christoph Lameter
    Cc: Lee Schermerhorn
    Cc: Andrew Morton
    Cc: stable@kernel.org
    Signed-off-by: Linus Torvalds

    Eric Dumazet
     

27 Oct, 2010

2 commits

  • Function check_range may return ERR_PTR(...). Check for it.
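
    A sketch of the hardened call site (assuming do_mbind(); label and
    details illustrative):

    vma = check_range(mm, start, end, nmask,
                      flags | MPOL_MF_INVERT, &pagelist);
    if (IS_ERR(vma)) {
            err = PTR_ERR(vma);
            goto mpol_out;          /* unwind instead of dereferencing */
    }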

    Signed-off-by: Vasiliy Kulikov
    Acked-by: David Rientjes
    Reviewed-by: Christoph Lameter
    Reviewed-by: KOSAKI Motohiro
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Vasiliy Kulikov
     
  • Presently update_nr_listpages() doesn't have a role. That's because the
    list passed in is always empty just after calling migrate_pages():
    migrate_pages() cleans up the pages which failed to migrate before
    returning, as of commit aaa994b3:

    [PATCH] page migration: handle freeing of pages in migrate_pages()

    Do not leave pages on the lists passed to migrate_pages(). Seems that we will
    not need any postprocessing of pages. This will simplify the handling of
    pages by the callers of migrate_pages().

    At that time, we thought we didn't need any postprocessing of pages. But
    the situation has changed: compaction needs to know the number of pages
    that failed to migrate for the COMPACTPAGEFAILED stat.

    This patch makes a new rule for callers of migrate_pages(): call
    putback_lru_pages() themselves. The caller now cleans up the list, so it
    has a chance to postprocess the pages. [suggested by Christoph Lameter]
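
    The new caller-side contract, sketched with the migrate_pages() signature
    of this era ('alloc_dest_page' is a hypothetical new_page_t callback):

    nr_failed = migrate_pages(&pagelist, alloc_dest_page, 0, 0);
    if (nr_failed)
            putback_lru_pages(&pagelist);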

    Signed-off-by: Minchan Kim
    Cc: Hugh Dickins
    Cc: Andi Kleen
    Reviewed-by: Mel Gorman
    Reviewed-by: Wu Fengguang
    Acked-by: Christoph Lameter
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Minchan Kim
     

10 Aug, 2010

2 commits

  • migrate_pages() is using >500 bytes stack. Reduce it.

    mm/mempolicy.c: In function 'sys_migrate_pages':
    mm/mempolicy.c:1344: warning: the frame size of 528 bytes is larger than 512 bytes
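
    The usual way to get nodemask_t copies off the stack in this era is the
    NODEMASK_SCRATCH helpers; a sketch of the pattern (not the verbatim
    patch):

    NODEMASK_SCRATCH(scratch);
    nodemask_t *old;
    nodemask_t *new;

    if (!scratch)
            return -ENOMEM;

    old = &scratch->mask1;
    new = &scratch->mask2;
    /* ... existing task lookup and do_migrate_pages() logic ... */
    NODEMASK_SCRATCH_FREE(scratch);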

    [akpm@linux-foundation.org: don't play with a might-be-NULL pointer]
    Signed-off-by: KOSAKI Motohiro
    Reviewed-by: KAMEZAWA Hiroyuki
    Reviewed-by: Christoph Lameter
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    KOSAKI Motohiro
     
  • The oom killer presently kills current whenever there is no more memory
    free or reclaimable on its mempolicy's nodes. There is no guarantee that
    current is a memory-hogging task or that killing it will free any
    substantial amount of memory, however.

    In such situations, it is better to scan the tasklist for tasks that are
    allowed to allocate on current's set of nodes and kill the one with the
    highest badness() score. This ensures that the most memory-hogging task,
    or the one configured by the user with /proc/pid/oom_adj, is always
    selected in such scenarios.

    Signed-off-by: David Rientjes
    Reviewed-by: KOSAKI Motohiro
    Cc: KAMEZAWA Hiroyuki
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    David Rientjes
     

30 Jun, 2010

1 commit

  • My patch "Factor out duplicate put/frees in mpol_shared_policy_init() to
    a common return path" and Dan Carpenter's fix thereto both left a dangling
    reference to the incoming tmpfs superblock mempolicy structure. A similar
    leak was introduced earlier when the nodemask was moved off-stack to the
    scratch area despite the note in the comment block regarding the incoming
    ref.

    Move the remaining put of the incoming "mpol" to the common exit path to
    drop the reference.
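
    A minimal sketch of the idea (labels illustrative, not the verbatim
    diff): every path, success or failure, funnels through an exit that drops
    the incoming reference:

    put_free:
            mpol_put(new);          /* drop initial ref on new mpol */
            NODEMASK_SCRATCH_FREE(scratch);
    put_mpol:
            mpol_put(mpol);         /* drop our incoming ref on sb mpol */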

    Signed-off-by: Lee Schermerhorn
    Acked-by: Dan Carpenter
    Cc: KOSAKI Motohiro
    Cc: David Rientjes
    Cc: Christoph Lameter
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Lee Schermerhorn
     

25 May, 2010

6 commits

  • Use mm->task_size instead of TASK_SIZE to ensure that the entire user
    address space is migrated. mm->task_size is independent of the calling
    task context, whereas TASK_SIZE may depend on the address space size of
    the calling process. Usage of TASK_SIZE can lead to partial address space
    migration if the calling process was 32 bit and the migrating process was
    64 bit.
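
    The change itself is tiny; a sketch, assuming the migrate_to_node() path
    (not the verbatim diff):

    /* was: check_range(mm, mm->mmap->vm_start, TASK_SIZE, ...) */
    check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
                flags | MPOL_MF_DISCONTIG_OK, &pagelist);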

    Here is the test script used on a 64-bit system with a 32-bit echo process:

    mount -t cgroup none /cgroup -o cpuset
    cd /cgroup

    mkdir 0
    echo 1 > 0/cpuset.cpus
    echo 0 > 0/cpuset.mems
    echo 1 > 0/cpuset.memory_migrate

    mkdir 1
    echo 1 > 1/cpuset.cpus
    echo 1 > 1/cpuset.mems
    echo 1 > 1/cpuset.memory_migrate

    echo $$ > 0/tasks
    64_bit_process &
    pid=$!

    echo $pid > 1/tasks # This does not migrate all process pages without
                        # this patch. If 64 bit echo is used or this patch
                        # is applied, then the full address space of $pid
                        # is migrated.

    To check memory migration, I watched:
    grep MemUsed /sys/devices/system/node/node*/meminfo

    Signed-off-by: Greg Thelen
    Acked-by: Christoph Lameter
    Reviewed-by: KOSAKI Motohiro
    Cc: Mel Gorman
    Cc: KAMEZAWA Hiroyuki
    Cc: Daisuke Nishimura
    Cc: Balbir Singh
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Greg Thelen
     
  • Before applying this patch, cpuset updates task->mems_allowed and
    mempolicy by setting all new bits in the nodemask first, and clearing all
    old unallowed bits later. But along the way, the allocator may find that
    there is no node to allocate memory from.

    The reason is that, while cpuset rebinds the task's mempolicy, it clears
    the nodes which the allocator can allocate pages from, for example:

    (mpol: mempolicy)
    task1                    task1's mpol   task2
    alloc page               1
      alloc on node0? NO     1
                             1              change mems from 1 to 0
                             1              rebind task1's mpol
                             0-1            set new bits
                             0              clear disallowed bits
      alloc on node1? NO     0
      ...
    can't alloc page
      goto oom

    This patch fixes this problem by expanding the nodes range first (set
    newly allowed bits) and shrinking it lazily (clear newly disallowed bits).
    We use a variable to tell the write-side task that a read-side task is
    reading the nodemask, and the write-side task clears newly disallowed
    nodes only after the read-side task ends its current memory allocation.
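
    The read side of that handshake, sketched (assuming get_mems_allowed() /
    put_mems_allowed() helpers bracketing the allocation):

    get_mems_allowed();     /* writers defer clearing disallowed nodes */
    page = __alloc_pages_nodemask(gfp, order,
                                  policy_zonelist(gfp, pol),
                                  policy_nodemask(gfp, pol));
    put_mems_allowed();     /* writers may now finish the rebind */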

    [akpm@linux-foundation.org: fix spello]
    Signed-off-by: Miao Xie
    Cc: David Rientjes
    Cc: Nick Piggin
    Cc: Paul Menage
    Cc: Lee Schermerhorn
    Cc: Hugh Dickins
    Cc: Ravikiran Thirumalai
    Cc: KOSAKI Motohiro
    Cc: Christoph Lameter
    Cc: Andi Kleen
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Miao Xie
     
  • Nick Piggin reported that the allocator may see an empty nodemask when
    changing cpuset's mems[1]. It happens only on kernels that do not do
    atomic nodemask_t stores (MAX_NUMNODES > BITS_PER_LONG).

    But I found that there is also a problem on kernels that can do atomic
    nodemask_t stores. The problem is that the allocator can't find a node to
    allocate a page from when changing cpuset's mems, though there is a lot
    of free memory. The reason is like this:

    (mpol: mempolicy)
    task1                    task1's mpol   task2
    alloc page               1
      alloc on node0? NO     1
                             1              change mems from 1 to 0
                             1              rebind task1's mpol
                             0-1            set new bits
                             0              clear disallowed bits
      alloc on node1? NO     0
      ...
    can't alloc page
      goto oom

    I can reproduce it with the attached program by the following steps:

    # mkdir /dev/cpuset
    # mount -t cpuset cpuset /dev/cpuset
    # mkdir /dev/cpuset/1
    # echo `cat /dev/cpuset/cpus` > /dev/cpuset/1/cpus
    # echo `cat /dev/cpuset/mems` > /dev/cpuset/1/mems
    # echo $$ > /dev/cpuset/1/tasks
    # numactl --membind=`cat /dev/cpuset/mems` ./cpuset_mem_hog <nr_tasks> &
      <nr_tasks> = max(nr_cpus - 1, 1)
    # killall -s SIGUSR1 cpuset_mem_hog
    # ./change_mems.sh

    several hours later, oom will happen though there is a lot of free memory.

    This patchset fixes this problem by expanding the nodes range first (set
    newly allowed bits) and shrinking it lazily (clear newly disallowed bits).
    We use a variable to tell the write-side task that a read-side task is
    reading the nodemask, and the write-side task clears newly disallowed
    nodes only after the read-side task ends its current memory allocation.

    This patch:

    In order to avoid the allocator finding no node to allocate memory from,
    when we want to update mempolicy and mems_allowed we expand the set of
    nodes first (set all the newly allowed bits) and shrink the set of nodes
    lazily (clear the newly disallowed bits). But the mempolicy's rebind
    functions may break the expanding.

    So we restructure the mempolicy's rebind functions and split the rebind
    work into two steps, just like the update of cpuset's mems: the 1st step
    expands the set of the mempolicy's nodes, and the 2nd step shrinks it.
    The two-step form is used when there is no real lock to protect the
    mempolicy on the read side; otherwise we can do the rebind work at once.

    In order to implement it, we define

    enum mpol_rebind_step {
            MPOL_REBIND_ONCE,
            MPOL_REBIND_STEP1,
            MPOL_REBIND_STEP2,
            MPOL_REBIND_NSTEP,
    };

    If the mempolicy needn't be updated in two steps, we can pass
    MPOL_REBIND_ONCE to the rebind functions. Otherwise we pass
    MPOL_REBIND_STEP1 to do the first step of the rebind work and
    MPOL_REBIND_STEP2 to do the second.

    Besides that, a long time may pass between these two steps, and we have to
    release the lock that protects mempolicy and mems_allowed. When we take
    the lock once again, we must check whether the current mempolicy is in the
    middle of a rebind (the first step has been done) or not, because the task
    may allocate a new mempolicy while we don't hold the lock. So we define
    the following flag to identify it:

    #define MPOL_F_REBINDING (1 << 2)

    The new functions will be used in the next patch.

    Signed-off-by: Miao Xie
    Cc: David Rientjes
    Cc: Nick Piggin
    Cc: Paul Menage
    Cc: Lee Schermerhorn
    Cc: Hugh Dickins
    Cc: Ravikiran Thirumalai
    Cc: KOSAKI Motohiro
    Cc: Christoph Lameter
    Cc: Andi Kleen
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Miao Xie
     
  • Factor out duplicate put/frees in mpol_shared_policy_init() to a common
    return path.

    Signed-off-by: Lee Schermerhorn
    Cc: Hugh Dickins
    Cc: Ravikiran Thirumalai
    Cc: KOSAKI Motohiro
    Cc: Christoph Lameter
    Cc: David Rientjes
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Lee Schermerhorn
     
  • Rename 'policy_types[]' to 'policy_modes[]' to better match the array
    contents.

    Use designated initializer syntax for policy_modes[].
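
    The array after the rename, sketched (entries as they stood around this
    time):

    static const char * const policy_modes[] =
    {
            [MPOL_DEFAULT]    = "default",
            [MPOL_PREFERRED]  = "prefer",
            [MPOL_BIND]       = "bind",
            [MPOL_INTERLEAVE] = "interleave",
            [MPOL_LOCAL]      = "local"
    };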

    Signed-off-by: Lee Schermerhorn
    Cc: Hugh Dickins
    Cc: Ravikiran Thirumalai
    Cc: KOSAKI Motohiro
    Cc: Christoph Lameter
    Cc: David Rientjes
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Lee Schermerhorn
     
  • We don't really need the extra variable 'i' in mpol_parse_str(). Its only
    use is as the loop variable, after which it is assigned to 'mode'. Just
    use 'mode' directly and lose the 'uninitialized_var()' macro.

    Signed-off-by: Lee Schermerhorn
    Cc: Hugh Dickins
    Cc: Ravikiran Thirumalai
    Cc: KOSAKI Motohiro
    Cc: Christoph Lameter
    Cc: David Rientjes
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Lee Schermerhorn