10 Oct, 2014

40 commits

  • Signed-off-by: Ionut Alexa
    Cc: Tejun Heo
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Ionut Alexa
     
  • Quite useless but it shuts up some warnings.

    Signed-off-by: Michele Curti
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Michele Curti
     
  • Instead of open-coding clamp_t, use min_t and max_t the way the clamp
    macro uses min and max; and instead of open-coding clamp_val, simply use
    clamp_t. Furthermore, normalise argument naming in the macros to lo and
    hi.
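
    A minimal sketch of the resulting definitions, assuming the existing
    min_t/max_t helpers (exact formatting may differ from the patch):

    #define clamp_t(type, val, lo, hi) min_t(type, max_t(type, val, lo), hi)
    #define clamp_val(val, lo, hi) clamp_t(typeof(val), val, lo, hi)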

    Signed-off-by: Michal Nazarewicz
    Cc: Mark Rustad
    Cc: "Kirsher, Jeffrey T"
    Cc: Hagen Paul Pfeifer
    Cc: Steven Rostedt
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Michal Nazarewicz
     
  • It appears that gcc is better at optimising a double call to min and max
    than at optimising open-coded min3 and max3. This can be observed here:

    $ cat min-max.c
    #define min(x, y) ({ \
    typeof(x) _min1 = (x); \
    typeof(y) _min2 = (y); \
    (void) (&_min1 == &_min2); \
    _min1 < _min2 ? _min1 : _min2; })
    #define min3(x, y, z) ({ \
    typeof(x) _min1 = (x); \
    typeof(y) _min2 = (y); \
    typeof(z) _min3 = (z); \
    (void) (&_min1 == &_min2); \
    (void) (&_min1 == &_min3); \
    _min1 < _min2 ? (_min1 < _min3 ? _min1 : _min3) : \
    (_min2 < _min3 ? _min2 : _min3); })

    int fmin3(int x, int y, int z) { return min3(x, y, z); }
    int fmin2(int x, int y, int z) { return min(min(x, y), z); }

    $ gcc -O2 -o min-max.s -S min-max.c; cat min-max.s
    .file "min-max.c"
    .text
    .p2align 4,,15
    .globl fmin3
    .type fmin3, @function
    fmin3:
    .LFB0:
    .cfi_startproc
    cmpl %esi, %edi
    jl .L5
    cmpl %esi, %edx
    movl %esi, %eax
    cmovle %edx, %eax
    ret
    .p2align 4,,10
    .p2align 3
    .L5:
    cmpl %edi, %edx
    movl %edi, %eax
    cmovle %edx, %eax
    ret
    .cfi_endproc
    .LFE0:
    .size fmin3, .-fmin3
    .p2align 4,,15
    .globl fmin2
    .type fmin2, @function
    fmin2:
    .LFB1:
    .cfi_startproc
    cmpl %edi, %esi
    movl %edx, %eax
    cmovle %esi, %edi
    cmpl %edx, %edi
    cmovle %edi, %eax
    ret
    .cfi_endproc
    .LFE1:
    .size fmin2, .-fmin2
    .ident "GCC: (Ubuntu/Linaro 4.6.3-1ubuntu5) 4.6.3"
    .section .note.GNU-stack,"",@progbits

    The fmin3 function, which uses the open-coded min3 macro, is compiled
    into a total of ten instructions including a conditional branch, whereas
    the fmin2 function, which uses two calls to the min macro, is compiled
    into six instructions with no branches.

    Similarly, an open-coded clamp produces the same code as a clamp built
    from the min and max macros, but the latter definition is much shorter:

    $ cat clamp.c
    #define clamp(val, min, max) ({ \
    typeof(val) __val = (val); \
    typeof(min) __min = (min); \
    typeof(max) __max = (max); \
    (void) (&__val == &__min); \
    (void) (&__val == &__max); \
    __val = __val < __min ? __min: __val; \
    __val > __max ? __max: __val; })
    #define min(x, y) ({ \
    typeof(x) _min1 = (x); \
    typeof(y) _min2 = (y); \
    (void) (&_min1 == &_min2); \
    _min1 < _min2 ? _min1 : _min2; })
    #define max(x, y) ({ \
    typeof(x) _max1 = (x); \
    typeof(y) _max2 = (y); \
    (void) (&_max1 == &_max2); \
    _max1 > _max2 ? _max1 : _max2; })

    int fclamp(int v, int min, int max) { return clamp(v, min, max); }
    int fclampmm(int v, int min, int max) { return min(max(v, min), max); }

    $ gcc -O2 -o clamp.s -S clamp.c; cat clamp.s
    .file "clamp.c"
    .text
    .p2align 4,,15
    .globl fclamp
    .type fclamp, @function
    fclamp:
    .LFB0:
    .cfi_startproc
    cmpl %edi, %esi
    movl %edx, %eax
    cmovge %esi, %edi
    cmpl %edx, %edi
    cmovle %edi, %eax
    ret
    .cfi_endproc
    .LFE0:
    .size fclamp, .-fclamp
    .p2align 4,,15
    .globl fclampmm
    .type fclampmm, @function
    fclampmm:
    .LFB1:
    .cfi_startproc
    cmpl %edi, %esi
    cmovge %esi, %edi
    cmpl %edi, %edx
    movl %edi, %eax
    cmovle %edx, %eax
    ret
    .cfi_endproc
    .LFE1:
    .size fclampmm, .-fclampmm
    .ident "GCC: (Ubuntu/Linaro 4.6.3-1ubuntu5) 4.6.3"
    .section .note.GNU-stack,"",@progbits

    Linux mpn-glaptop 3.13.0-29-generic #53~precise1-Ubuntu SMP Wed Jun 4 22:06:25 UTC 2014 x86_64 x86_64 x86_64 GNU/Linux
    gcc (Ubuntu/Linaro 4.6.3-1ubuntu5) 4.6.3
    Copyright (C) 2011 Free Software Foundation, Inc.
    This is free software; see the source for copying conditions. There is NO
    warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

    -rwx------ 1 mpn eng 51224656 Jun 17 14:15 vmlinux.before
    -rwx------ 1 mpn eng 51224608 Jun 17 13:57 vmlinux.after

    48 bytes reduction. The do_fault_around function was a few instructions
    shorter and, as far as I can tell, saved 12 bytes on the stack, i.e.:

    $ grep -e rsp -e pop -e push do_fault_around.*
    do_fault_around.before.s:push %rbp
    do_fault_around.before.s:mov %rsp,%rbp
    do_fault_around.before.s:push %r13
    do_fault_around.before.s:push %r12
    do_fault_around.before.s:push %rbx
    do_fault_around.before.s:sub $0x38,%rsp
    do_fault_around.before.s:add $0x38,%rsp
    do_fault_around.before.s:pop %rbx
    do_fault_around.before.s:pop %r12
    do_fault_around.before.s:pop %r13
    do_fault_around.before.s:pop %rbp

    do_fault_around.after.s:push %rbp
    do_fault_around.after.s:mov %rsp,%rbp
    do_fault_around.after.s:push %r12
    do_fault_around.after.s:push %rbx
    do_fault_around.after.s:sub $0x30,%rsp
    do_fault_around.after.s:add $0x30,%rsp
    do_fault_around.after.s:pop %rbx
    do_fault_around.after.s:pop %r12
    do_fault_around.after.s:pop %rbp

    or here side-by-side:

    Before               After
    push %rbp            push %rbp
    mov %rsp,%rbp        mov %rsp,%rbp
    push %r13
    push %r12            push %r12
    push %rbx            push %rbx
    sub $0x38,%rsp       sub $0x30,%rsp
    add $0x38,%rsp       add $0x30,%rsp
    pop %rbx             pop %rbx
    pop %r12             pop %r12
    pop %r13
    pop %rbp             pop %rbp

    There are also fewer branches:

    $ grep ^j do_fault_around.*
    do_fault_around.before.s:jae ffffffff812079b7
    do_fault_around.before.s:jmp ffffffff812079c5
    do_fault_around.before.s:jmp ffffffff81207a14
    do_fault_around.before.s:ja ffffffff812079f9
    do_fault_around.before.s:jb ffffffff81207a10
    do_fault_around.before.s:jmp ffffffff81207a63
    do_fault_around.before.s:jne ffffffff812079df

    do_fault_around.after.s:jmp ffffffff812079fd
    do_fault_around.after.s:ja ffffffff812079e2
    do_fault_around.after.s:jb ffffffff812079f9
    do_fault_around.after.s:jmp ffffffff81207a4c
    do_fault_around.after.s:jne ffffffff812079c8

    And here's with allyesconfig on a different machine:

    $ uname -a; gcc --version; ls -l vmlinux.*
    Linux erwin 3.14.7-mn #54 SMP Sun Jun 15 11:25:08 CEST 2014 x86_64 AMD Phenom(tm) II X3 710 Processor AuthenticAMD GNU/Linux
    gcc (GCC) 4.8.3
    Copyright (C) 2013 Free Software Foundation, Inc.
    This is free software; see the source for copying conditions. There is NO
    warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

    -rwx------ 1 mpn eng 437027411 Jun 20 16:04 vmlinux.before
    -rwx------ 1 mpn eng 437026881 Jun 20 15:30 vmlinux.after

    530 bytes reduction.
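
    The rewritten macros simply reuse min and max; a minimal sketch of the
    resulting shape (details may differ slightly from the patch):

    #define min3(x, y, z) min((typeof(x))min(x, y), z)
    #define max3(x, y, z) max((typeof(x))max(x, y), z)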

    Signed-off-by: Michal Nazarewicz
    Signed-off-by: Hagen Paul Pfeifer
    Acked-by: Steven Rostedt
    Cc: Hagen Paul Pfeifer
    Cc: David Rientjes
    Cc: "Rustad, Mark D"
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Michal Nazarewicz
     
  • Signed-off-by: Geert Uytterhoeven
    Acked-by: Richard Henderson
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Geert Uytterhoeven
     
  • Remove the IRQF_DISABLED flag from FRV architecture code. It's a NOOP
    since 2.6.35 and it will be removed one day.

    Signed-off-by: Michael Opdenacker
    Cc: David Howells
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Michael Opdenacker
     
  • Frv has a macro named cpu_data, interfering with variables and struct
    members with the same name:

    include/linux/pm_domain.h:75:24: error: expected identifier or '('
    before '&' token
    struct gpd_cpu_data *cpu_data;

    As struct cpuinfo_frv, boot_cpu_data, cpu_data, and current_cpu_data are
    not used, remove them to fix this.

    Signed-off-by: Geert Uytterhoeven
    Reported-by: kbuild test robot
    Cc: David Howells
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Geert Uytterhoeven
     
  • Currently there are NCHUNKS (64) freelists in zbud_pool; the last one,
    unbuddied[63], would link all zbud pages that have 63 free chunks.
    However, going by the logic of num_free_chunks(), the maximum free chunk
    count of an unbuddied zbud page is 62, so no zbud page is ever added to
    or removed from the last freelist, yet we still search that unused
    freelist when looking for an unbuddied zbud page, which is unneeded.

    This patch redefines NCHUNKS to 63, the number of free chunks in one zbud
    page, hence we can decrease the size of the zbud_pool structure and avoid
    accessing the last unused freelist whenever zbud_alloc fails to allocate
    from a freelist.
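
    A rough sketch of the constant change (macro names follow zbud.c; the
    exact expression in the patch may differ):

    /* one chunk of every zbud page is taken by the zbud header,
     * so at most 63 chunks are ever free */
    #define NCHUNKS ((PAGE_SIZE >> CHUNK_SHIFT) - 1)    /* was PAGE_SIZE >> CHUNK_SHIFT */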

    Signed-off-by: Chao Yu
    Cc: Seth Jennings
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Chao Yu
     
  • Change zsmalloc init_zspage() logic to iterate through each object on each
    of its pages, checking the offset to verify the object is on the current
    page before linking it into the zspage.

    The current zsmalloc init_zspage free object linking code has logic that
    relies on there only being one page per zspage when PAGE_SIZE is a
    multiple of class->size. It calculates the number of objects for the
    current page, and iterates through all of them plus one, to account for
    the assumed partial object at the end of the page. While this currently
    works, the logic can be simplified to just link the object at each
    successive offset until the offset is larger than PAGE_SIZE, which does
    not rely on PAGE_SIZE being a multiple of class->size.
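
    A minimal sketch of the simplified per-page loop described above
    (first_obj_offset and link_free_object() are hypothetical stand-ins for
    the actual zsmalloc internals):

    unsigned long off = first_obj_offset;   /* offset of the first object on this page */
    while (off < PAGE_SIZE) {
        link_free_object(page, off);        /* hypothetical: put object at 'off' on the free list */
        off += class->size;                 /* stop once the offset runs past the page */
    }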

    Signed-off-by: Dan Streetman
    Acked-by: Minchan Kim
    Cc: Sergey Senozhatsky
    Cc: Nitin Gupta
    Cc: Seth Jennings
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Dan Streetman
     
  • The letter 'f' in "n
    Acked-by: Minchan Kim
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Wang Sheng-Hui
     
  • The `notify_free' device attribute accounts the number of slot free
    notifications and internally represents the number of zram_free_page()
    calls. Slot free notifications are sent only when the device is used as
    a swap device, hence `notify_free' is used only for swap devices. Since
    f4659d8e620d08 (zram: support REQ_DISCARD) ZRAM handles yet another kind
    of free notification (also via a zram_free_page() call) -- REQ_DISCARD
    requests, which are sent by a filesystem whenever some data blocks are
    discarded. However, there is no way to know the number of notifications
    in the latter case.

    Use `notify_free' to account the number of pages freed by
    zram_bio_discard() and zram_slot_free_notify(). Depending on usage
    scenario `notify_free' represents:

    a) the number of pages freed because of slot free notifications, which is
    equal to the number of swap_slot_free_notify() calls, so there is no
    behaviour change

    b) the number of pages freed because of REQ_DISCARD notifications
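
    A minimal sketch of the change, assuming zram's atomic64_t stats (field
    names illustrative): both zram_slot_free_notify() and zram_bio_discard()
    now end up doing

    zram_free_page(zram, index);
    atomic64_inc(&zram->stats.notify_free);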

    Signed-off-by: Sergey Senozhatsky
    Acked-by: Minchan Kim
    Acked-by: Jerome Marchand
    Cc: Nitin Gupta
    Cc: Chao Yu
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Sergey Senozhatsky
     
  • Normally, a zram user can get the maximum memory usage zram consumed by
    polling mem_used_total via sysfs from userspace.

    But this has a critical problem: the user can miss the peak memory usage
    during the polling interval. To avoid that, the user would have to poll
    at a very short interval (ie, 0.0000000001s), with mlocking to avoid
    page-fault delay when memory pressure is heavy. It would be troublesome.

    This patch adds new knob "mem_used_max" so user could see the maximum
    memory usage easily via reading the knob and reset it via "echo 0 >
    /sys/block/zram0/mem_used_max".
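
    Internally this amounts to tracking a running maximum; a minimal sketch
    of that pattern (field and function names are illustrative):

    static void zram_update_max_used(struct zram *zram, unsigned long cur_pages)
    {
        unsigned long old, seen;

        old = atomic_long_read(&zram->stats.max_used_pages);
        while (cur_pages > old) {
            seen = atomic_long_cmpxchg(&zram->stats.max_used_pages,
                                       old, cur_pages);
            if (seen == old)    /* our value is the new maximum */
                break;
            old = seen;         /* someone raced us; re-check */
        }
    }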

    Signed-off-by: Minchan Kim
    Reviewed-by: Dan Streetman
    Cc: Sergey Senozhatsky
    Cc: Jerome Marchand
    Cc: Luigi Semenzato
    Cc: Nitin Gupta
    Cc: Seth Jennings
    Reviewed-by: David Horner
    Cc: Joonsoo Kim
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Minchan Kim
     
  • Since zram has no feature to limit its memory usage, it is hard to
    manage system memory.

    This patch adds a new knob, "mem_limit", via sysfs to set up a limit so
    that zram will fail allocations once it reaches the limit.

    In addition, the user can change the limit at runtime, so memory can be
    managed more dynamically.

    Initial state is no limit so it doesn't break old behavior.
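
    A minimal sketch of the allocation-path check this implies (field names
    are illustrative; the real patch may structure it differently):

    /* fail the write if the compressed pool would exceed the configured limit */
    if (zram->limit_pages &&
        zs_get_total_pages(meta->mem_pool) > zram->limit_pages) {
        zs_free(meta->mem_pool, handle);
        return -ENOMEM;
    }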

    [akpm@linux-foundation.org: fix typo, per Sergey]
    Signed-off-by: Minchan Kim
    Cc: Dan Streetman
    Cc: Sergey Senozhatsky
    Cc: Jerome Marchand
    Cc: Luigi Semenzato
    Cc: Nitin Gupta
    Cc: Seth Jennings
    Cc: David Horner
    Cc: Joonsoo Kim
    Cc: Minchan Kim
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Minchan Kim
     
  • zs_get_total_size_bytes returns the amount of memory zsmalloc has
    consumed in *bytes*, but zsmalloc operates in *pages* rather than bytes,
    so change the API; the benefit is that we drop unnecessary overhead (ie,
    converting the page count into bytes) inside zsmalloc.
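
    A sketch of the API change (callers that really want bytes shift by
    PAGE_SHIFT themselves):

    /* old: u64 zs_get_total_size_bytes(struct zs_pool *pool); */
    unsigned long zs_get_total_pages(struct zs_pool *pool);

    /* a caller that still wants bytes */
    u64 bytes = (u64)zs_get_total_pages(pool) << PAGE_SHIFT;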

    Since the return type is pages, "zs_get_total_pages" is a better name
    than "zs_get_total_size_bytes".

    Signed-off-by: Minchan Kim
    Reviewed-by: Dan Streetman
    Cc: Sergey Senozhatsky
    Cc: Jerome Marchand
    Cc: Luigi Semenzato
    Cc: Nitin Gupta
    Cc: Seth Jennings
    Cc: David Horner
    Cc: Joonsoo Kim
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Minchan Kim
     
  • Currently, zram has no feature to limit memory, so theoretically zram can
    deplete system memory. Users have asked for a limit several times, as
    even without exhaustion zram makes it hard to control memory usage of the
    platform. This patchset adds the feature.

    Patch 1 makes zs_get_total_size_bytes faster because it would be used
    frequently in later patches for the new feature.

    Patch 2 changes zs_get_total_size_bytes's return unit from bytes to pages
    so that zsmalloc doesn't need an unnecessary operation (ie, << PAGE_SHIFT).

    Patch 3 adds the new feature. I added the feature into the zram layer,
    not zsmalloc, because the limitation is zram's requirement, not
    zsmalloc's, so any other user of zsmalloc (ie, zpool) shouldn't be
    affected by an unnecessary branch in zsmalloc. In future, if every user
    of zsmalloc wants the feature, we could move it from the client side into
    zsmalloc easily, but vice versa would be painful.

    Patch 4 adds a new facility to report the maximum memory usage of zram,
    so that users avoid polling /sys/block/zram0/mem_used_total frequently
    and transient maxima are not missed.

    This patch (of 4):

    pages_allocated is currently counted per size_class, so when a zsmalloc
    user wants to see total_size_bytes, the counts from every size_class
    have to be gathered and summed.

    That's not bad if the value is read rarely, but if it starts being read
    frequently it is not a good deal from a performance point of view.

    This patch moves the count from size_class to zs_pool, which also
    reduces the memory footprint (from [255 * 8byte] to
    [sizeof(atomic_long_t)]).
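
    A minimal sketch of the idea, a single pool-wide counter in place of the
    per-class ones (names follow the description above; other fields
    omitted):

    struct zs_pool {
        /* ... */
        atomic_long_t pages_allocated;      /* single pool-wide counter */
    };

    unsigned long zs_get_total_pages(struct zs_pool *pool)
    {
        return atomic_long_read(&pool->pages_allocated);
    }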

    Signed-off-by: Minchan Kim
    Reviewed-by: Dan Streetman
    Cc: Sergey Senozhatsky
    Cc: Jerome Marchand
    Cc: Luigi Semenzato
    Cc: Nitin Gupta
    Cc: Seth Jennings
    Reviewed-by: David Horner
    Cc: Joonsoo Kim
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Minchan Kim
     
  • Performing vma lookups without taking the mm->mmap_sem is asking for
    trouble. While doing the search, the vma in question can be modified or
    even removed before returning to the caller. Take the lock (shared) in
    order to avoid races while iterating through the vmacache and/or rbtree.
    In addition, this guarantees that the address space will remain intact
    during the CPU flushing.
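
    The fix follows the usual pattern for VMA lookups; a minimal sketch
    (cache-flush helper shown generically):

    struct vm_area_struct *vma;

    down_read(&current->mm->mmap_sem);      /* shared: lookups may run concurrently */
    vma = find_vma(current->mm, addr);
    if (vma)
        flush_cache_range(vma, start, end); /* the address space can't change under us */
    up_read(&current->mm->mmap_sem);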

    Signed-off-by: Davidlohr Bueso
    Cc: Geert Uytterhoeven
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Davidlohr Bueso
     
  • vmstat workers are used for folding counter differentials into the zone,
    per node and global counters at certain time intervals. They currently
    run at defined intervals on all processors which will cause some holdoff
    for processors that need minimal intrusion by the OS.

    The current vmstat_update mechanism depends on a deferrable timer firing
    every other second by default which registers a work queue item that runs
    on the local CPU, with the result that we have one interrupt and one
    additional schedulable task on each CPU every 2 seconds. If a workload
    indeed causes VM activity or multiple tasks are running on a CPU, then
    there are probably bigger issues to deal with.

    However, some workloads dedicate a CPU for a single CPU bound task. This
    is done in high performance computing, in high frequency financial
    applications, in networking (Intel DPDK, EZchip NPS) and with the advent
    of systems with more and more CPUs over time, this may become more and
    more common to do since when one has enough CPUs one cares less about
    efficiently sharing a CPU with other tasks and more about efficiently
    monopolizing a CPU per task.

    The difference made by having this timer fire and a workqueue kernel
    thread scheduled every second can be enormous. An artificial test
    measuring the worst-case time to do a simple "i++" in an endless loop,
    on a bare metal system and under Linux on an isolated CPU with dynticks,
    has Linux match the bare-metal performance (~700 cycles) with this patch
    and lose by a couple of orders of magnitude (~200k cycles) without
    it[*]. The loss occurs for something that just calculates statistics.
    For networking applications, for example, this could be the difference
    between dropping packets and sustaining line rate.

    Statistics are important and useful, but it would be great if gathering
    them did not produce such a huge performance difference. This patch does
    just that.

    This patch creates a vmstat shepherd worker that monitors the per-cpu
    differentials on all processors. If there are differentials on a
    processor, then a vmstat worker local to that processor is created.
    That worker will then start folding the diffs at regular intervals.
    Should the worker find that there is no work to be done, it will make
    the shepherd worker monitor the differentials again.
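
    A rough sketch of the shepherd flow, based on the description above
    (cpu_stat_off comes from the changelog; the other helper names are
    illustrative):

    static void vmstat_shepherd(struct work_struct *w)
    {
        int cpu;

        /* kick a local vmstat worker on any quiet cpu that now has diffs */
        for_each_cpu(cpu, cpu_stat_off)
            if (need_update(cpu))
                schedule_delayed_work_on(cpu, &per_cpu(vmstat_work, cpu), 0);

        /* re-arm the shepherd itself */
        schedule_delayed_work(&shepherd,
                round_jiffies_relative(sysctl_stat_interval));
    }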

    With this patch it is possible then to have periods longer than
    2 seconds without any OS event on a "cpu" (hardware thread).

    The patch shows a very minor increase in system performance.

    hackbench -s 512 -l 2000 -g 15 -f 25 -P

    Results before the patch:

    Running in process mode with 15 groups using 50 file descriptors each (== 750 tasks)
    Each sender will pass 2000 messages of 512 bytes
    Time: 4.992
    Running in process mode with 15 groups using 50 file descriptors each (== 750 tasks)
    Each sender will pass 2000 messages of 512 bytes
    Time: 4.971
    Running in process mode with 15 groups using 50 file descriptors each (== 750 tasks)
    Each sender will pass 2000 messages of 512 bytes
    Time: 5.063

    Hackbench after the patch:

    Running in process mode with 15 groups using 50 file descriptors each (== 750 tasks)
    Each sender will pass 2000 messages of 512 bytes
    Time: 4.973
    Running in process mode with 15 groups using 50 file descriptors each (== 750 tasks)
    Each sender will pass 2000 messages of 512 bytes
    Time: 4.990
    Running in process mode with 15 groups using 50 file descriptors each (== 750 tasks)
    Each sender will pass 2000 messages of 512 bytes
    Time: 4.993

    [fengguang.wu@intel.com: cpu_stat_off can be static]
    Signed-off-by: Christoph Lameter
    Reviewed-by: Gilad Ben-Yossef
    Cc: Frederic Weisbecker
    Cc: Thomas Gleixner
    Cc: Tejun Heo
    Cc: John Stultz
    Cc: Mike Frysinger
    Cc: Minchan Kim
    Cc: Hakan Akkan
    Cc: Max Krasnyansky
    Cc: "Paul E. McKenney"
    Cc: Hugh Dickins
    Cc: Viresh Kumar
    Cc: H. Peter Anvin
    Cc: Ingo Molnar
    Cc: Peter Zijlstra
    Signed-off-by: Fengguang Wu
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Christoph Lameter
     
  • It isn't obvious that CMA can be disabled on the kernel's command line, so
    document it.

    Signed-off-by: Jean Delvare
    Cc: Joonsoo Kim
    Cc: Greg Kroah-Hartman
    Cc: Akinobu Mita
    Cc: Chuck Ebbert
    Cc: Marek Szyprowski
    Cc: Konrad Rzeszutek Wilk
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Jean Delvare
     
  • Increase the buffer-head per-CPU LRU size to allow efficient filesystem
    operations that access many blocks for each transaction. For example,
    creating a file in a large ext4 directory with quota enabled will access
    multiple buffer heads and will overflow the LRU at the default 8-block LRU
    size:

    * parent directory inode table block (ctime, nlinks for subdirs)
    * new inode bitmap
    * inode table block
    * 2 quota blocks
    * directory leaf block (not reused, but pollutes one cache entry)
    * 2 levels htree blocks (only one is reused, other pollutes cache)
    * 2 levels indirect/index blocks (only one is reused)

    The buffer-head per-CPU LRU size is raised to 16, as metadata performance
    benchmarks show gains of up to 10% for create, 4% for lookup and 7% for
    destroy.
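
    For reference, a sketch of the constant being changed in fs/buffer.c (the
    per-CPU LRU is a small array of recently used buffer heads):

    #define BH_LRU_SIZE 16      /* raised from 8 */

    struct bh_lru {
        struct buffer_head *bhs[BH_LRU_SIZE];
    };
    static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};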

    Signed-off-by: Liang Zhen
    Signed-off-by: Andreas Dilger
    Signed-off-by: Sebastien Buisson
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Sebastien Buisson
     
  • PROT_NUMA VMAs are skipped to avoid problems distinguishing between
    present, prot_none and special entries. MPOL_MF_LAZY is not visible from
    userspace since commit a720094ded8c ("mm: mempolicy: Hide MPOL_NOOP and
    MPOL_MF_LAZY from userspace for now") but it should still skip VMAs the
    same way task_numa_work does.

    Signed-off-by: Mel Gorman
    Acked-by: Rik van Riel
    Acked-by: Hugh Dickins
    Acked-by: Peter Zijlstra
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Mel Gorman
     
  • This tool induces memory fragmentation via sequential allocation of
    transparent huge pages, splitting off everything except their last
    sub-pages. It easily generates pressure on the memory compaction code.
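
    The core of such a tool is roughly the following (a hypothetical sketch,
    not the actual selftest source; 2 MiB huge pages assumed, error handling
    and huge-page alignment of the mapping omitted):

    #include <sys/mman.h>
    #include <string.h>

    #define HPAGE_SIZE (2UL << 20)
    #define PAGE_SIZE  4096UL

    static void alloc_and_split_one(void)
    {
        void *p = mmap(NULL, HPAGE_SIZE, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        madvise(p, HPAGE_SIZE, MADV_HUGEPAGE);  /* ask for a THP */
        memset(p, 0, HPAGE_SIZE);               /* fault it in */
        /* drop everything but the last sub-page, splitting the huge page */
        madvise(p, HPAGE_SIZE - PAGE_SIZE, MADV_DONTNEED);
    }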

    $ perf stat -e 'compaction:*' -e 'migrate:*' ./transhuge-stress
    transhuge-stress: allocate 7858 transhuge pages, using 15716 MiB virtual memory and 61 MiB of ram
    transhuge-stress: 1.653 s/loop, 0.210 ms/page, 9504.828 MiB/s 7858 succeed, 0 failed, 2439 different pages
    transhuge-stress: 1.537 s/loop, 0.196 ms/page, 10226.227 MiB/s 7858 succeed, 0 failed, 2364 different pages
    transhuge-stress: 1.658 s/loop, 0.211 ms/page, 9479.215 MiB/s 7858 succeed, 0 failed, 2179 different pages
    transhuge-stress: 1.617 s/loop, 0.206 ms/page, 9716.992 MiB/s 7858 succeed, 0 failed, 2421 different pages
    ^C./transhuge-stress: Interrupt

    Performance counter stats for './transhuge-stress':

    1.744.051 compaction:mm_compaction_isolate_migratepages
    1.014 compaction:mm_compaction_isolate_freepages
    1.744.051 compaction:mm_compaction_migratepages
    1.647 compaction:mm_compaction_begin
    1.647 compaction:mm_compaction_end
    1.744.051 migrate:mm_migrate_pages
    0 migrate:mm_numa_migrate_ratelimit

    7,964696835 seconds time elapsed

    Signed-off-by: Konstantin Khlebnikov
    Cc: Rafael Aquini
    Cc: Andrey Ryabinin
    Cc: Shuah Khan
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Konstantin Khlebnikov
     
  • Always mark pages with PageBalloon even if balloon compaction is disabled
    and expose this mark in /proc/kpageflags as KPF_BALLOON.

    Also this patch adds three counters into /proc/vmstat: "balloon_inflate",
    "balloon_deflate" and "balloon_migrate". They accumulate balloon
    activity. Current size of balloon is (balloon_inflate - balloon_deflate)
    pages.

    All generic balloon code is now gathered under the option
    CONFIG_MEMORY_BALLOON. It should be selected by any ballooning driver
    that wants to use this feature. Currently virtio-balloon is the only
    user.

    Signed-off-by: Konstantin Khlebnikov
    Cc: Rafael Aquini
    Cc: Andrey Ryabinin
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Konstantin Khlebnikov
     
  • Now ballooned pages are detected using PageBalloon(). Fake mapping is no
    longer required. This patch links ballooned pages to balloon device using
    field page->private instead of page->mapping. Also this patch embeds
    balloon_dev_info directly into struct virtio_balloon.

    Signed-off-by: Konstantin Khlebnikov
    Cc: Rafael Aquini
    Cc: Andrey Ryabinin
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Konstantin Khlebnikov
     
  • Sasha Levin reported a KASAN splash inside isolate_migratepages_range().
    The problem is in the function __is_movable_balloon_page(), which tests
    AS_BALLOON_MAP in page->mapping->flags. This function has no protection
    against anonymous pages. As a result it tried to check address space
    flags inside struct anon_vma.

    Further investigation shows more problems in current implementation:

    * The special branch in __unmap_and_move() never works:
    balloon_page_movable() checks page flags and page_count. In
    __unmap_and_move() the page is locked and the reference counter is
    elevated, thus balloon_page_movable() always fails. As a result
    execution goes to the normal migration path. virtballoon_migratepage()
    returns MIGRATEPAGE_BALLOON_SUCCESS instead of MIGRATEPAGE_SUCCESS,
    move_to_new_page() thinks this is an error code and assigns
    newpage->mapping to NULL. The newly migrated page loses its connection
    to the balloon and all ability for further migration.

    * lru_lock is erroneously required in isolate_migratepages_range() for
    isolating ballooned pages. This function releases lru_lock periodically,
    which makes migration mostly impossible for some pages.

    * balloon_page_dequeue has a tight race with balloon_page_isolate:
    balloon_page_isolate can be executed in parallel with dequeue, between
    picking the page from the list and locking the page. The race is rare
    because they use trylock_page() for locking.

    This patch fixes all of them.

    Instead of fake mapping with special flag this patch uses special state of
    page->_mapcount: PAGE_BALLOON_MAPCOUNT_VALUE = -256. Buddy allocator uses
    PAGE_BUDDY_MAPCOUNT_VALUE = -128 for similar purpose. Storing mark
    directly in struct page makes everything safer and easier.

    PagePrivate is used to mark pages present in page list (i.e. not
    isolated, like PageLRU for normal pages). It replaces special rules for
    reference counter and makes balloon migration similar to migration of
    normal pages. This flag is protected by page_lock together with link to
    the balloon device.
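
    A sketch of the new page mark, along the lines of the existing
    PAGE_BUDDY_MAPCOUNT_VALUE trick (details may differ from the final
    include/linux/mm.h helpers):

    #define PAGE_BALLOON_MAPCOUNT_VALUE (-256)

    static inline int PageBalloon(struct page *page)
    {
        return atomic_read(&page->_mapcount) == PAGE_BALLOON_MAPCOUNT_VALUE;
    }

    static inline void __SetPageBalloon(struct page *page)
    {
        atomic_set(&page->_mapcount, PAGE_BALLOON_MAPCOUNT_VALUE);
    }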

    Signed-off-by: Konstantin Khlebnikov
    Reported-by: Sasha Levin
    Link: http://lkml.kernel.org/p/53E6CEAA.9020105@oracle.com
    Cc: Rafael Aquini
    Cc: Andrey Ryabinin
    Cc: [3.8+]
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Konstantin Khlebnikov
     
  • Activate the RCU fast_gup for ARM64. We also need to force THP splits to
    broadcast an IPI s.t. we block in the fast_gup page walker. As THP
    splits are comparatively rare, this should not lead to a noticeable
    performance degradation.

    Some pre-requisite functions pud_write and pud_page are also added.

    [akpm@linux-foundation.org: coding-style fixes]
    Signed-off-by: Steve Capper
    Tested-by: Dann Frazier
    Acked-by: Catalin Marinas
    Cc: Hugh Dickins
    Cc: Russell King
    Cc: Mark Rutland
    Cc: Mel Gorman
    Cc: Will Deacon
    Cc: Christoffer Dall
    Cc: Andrea Arcangeli
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Steve Capper
     
  • In order to implement fast_get_user_pages we need to ensure that the page
    table walker is protected from page table pages being freed from under it.

    This patch enables HAVE_RCU_TABLE_FREE; any page table pages belonging to
    address spaces with multiple users will be freed via call_rcu_sched,
    meaning that disabling interrupts will block the free and protect the
    fast gup page walker.

    Signed-off-by: Steve Capper
    Tested-by: Dann Frazier
    Acked-by: Catalin Marinas
    Cc: Hugh Dickins
    Cc: Russell King
    Cc: Mark Rutland
    Cc: Mel Gorman
    Cc: Will Deacon
    Cc: Christoffer Dall
    Cc: Andrea Arcangeli
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Steve Capper
     
  • Activate the RCU fast_gup for ARM. We also need to force THP splits to
    broadcast an IPI s.t. we block in the fast_gup page walker. As THP
    splits are comparatively rare, this should not lead to a noticeable
    performance degradation.

    Some pre-requisite functions pud_write and pud_page are also added.

    Signed-off-by: Steve Capper
    Reviewed-by: Catalin Marinas
    Cc: Dann Frazier
    Cc: Hugh Dickins
    Cc: Russell King
    Cc: Mark Rutland
    Cc: Mel Gorman
    Cc: Will Deacon
    Cc: Christoffer Dall
    Cc: Andrea Arcangeli
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Steve Capper
     
  • In order to implement fast_get_user_pages we need to ensure that the page
    table walker is protected from page table pages being freed from under it.

    This patch enables HAVE_RCU_TABLE_FREE; any page table pages belonging to
    address spaces with multiple users will be freed via call_rcu_sched,
    meaning that disabling interrupts will block the free and protect the
    fast gup page walker.

    Signed-off-by: Steve Capper
    Reviewed-by: Catalin Marinas
    Cc: Dann Frazier
    Cc: Hugh Dickins
    Cc: Russell King
    Cc: Mark Rutland
    Cc: Mel Gorman
    Cc: Will Deacon
    Cc: Christoffer Dall
    Cc: Andrea Arcangeli
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Steve Capper
     
  • We need a mechanism to tag ptes as being special, this indicates that no
    attempt should be made to access the underlying struct page * associated
    with the pte. This is used by the fast_gup when operating on ptes as it
    has no means to access VMAs (that also contain this information)
    locklessly.

    The L_PTE_SPECIAL bit is already allocated for LPAE, this patch modifies
    pte_special and pte_mkspecial to make use of it, and defines
    __HAVE_ARCH_PTE_SPECIAL.

    This patch also excludes special ptes from the icache/dcache sync logic.
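
    A rough sketch of what this looks like in the LPAE pgtable headers (bit
    manipulation shown generically; the actual helpers may reuse existing ARM
    macros):

    #define __HAVE_ARCH_PTE_SPECIAL

    #define pte_special(pte)    (pte_val(pte) & L_PTE_SPECIAL)

    static inline pte_t pte_mkspecial(pte_t pte)
    {
        pte_val(pte) |= L_PTE_SPECIAL;
        return pte;
    }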

    Signed-off-by: Steve Capper
    Reviewed-by: Catalin Marinas
    Cc: Dann Frazier
    Cc: Hugh Dickins
    Cc: Russell King
    Cc: Mark Rutland
    Cc: Mel Gorman
    Cc: Will Deacon
    Cc: Christoffer Dall
    Cc: Andrea Arcangeli
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Steve Capper
     
  • This series implements general forms of get_user_pages_fast and
    __get_user_pages_fast in core code and activates them for arm and arm64.

    These are required for Transparent HugePages to function correctly, as a
    futex on a THP tail will otherwise result in an infinite loop (due to the
    core implementation of __get_user_pages_fast always returning 0).

    Unfortunately, a futex on THP tail can be quite common for certain
    workloads; thus THP is unreliable without a __get_user_pages_fast
    implementation.

    This series may also be beneficial for direct-IO heavy workloads and
    certain KVM workloads.

    This patch (of 6):

    get_user_pages_fast() attempts to pin user pages by walking the page
    tables directly and avoids taking locks. Thus the walker needs to be
    protected from page table pages being freed from under it, and needs to
    block any THP splits.

    One way to achieve this is to have the walker disable interrupts, and rely
    on IPIs from the TLB flushing code blocking before the page table pages
    are freed.

    On some platforms we have hardware broadcast of TLB invalidations, thus
    the TLB flushing code doesn't necessarily need to broadcast IPIs; and
    spuriously broadcasting IPIs can hurt system performance if done too
    often.

    This problem has been solved on PowerPC and Sparc by batching up page
    table pages belonging to more than one mm_user, then scheduling an
    rcu_sched callback to free the pages. This RCU page table free logic has
    been promoted to core code and is activated when one enables
    HAVE_RCU_TABLE_FREE. Unfortunately, these architectures implement their
    own get_user_pages_fast routines.

    The RCU page table free logic coupled with an IPI broadcast on THP split
    (which is a rare event), allows one to protect a page table walker by
    merely disabling the interrupts during the walk.

    This patch provides a general RCU implementation of get_user_pages_fast
    that can be used by architectures that perform hardware broadcast of TLB
    invalidations.

    It is based heavily on the PowerPC implementation by Nick Piggin.
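
    The protection boils down to a disabled-interrupts window around the
    lockless walk; a condensed sketch of the generic entry point (the
    pgd/pud/pmd descent and error handling are elided):

    int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
                              struct page **pages)
    {
        unsigned long flags;
        int nr = 0;

        local_irq_save(flags);
        /* walk the page tables locklessly; page-table frees and THP splits
         * cannot complete while our IRQs are off, because both wait for an
         * IPI (or an RCU sched grace period) */
        /* ... descend pgd/pud/pmd/pte, grabbing references into pages[] ... */
        local_irq_restore(flags);

        return nr;
    }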

    [akpm@linux-foundation.org: various comment fixes]
    Signed-off-by: Steve Capper
    Tested-by: Dann Frazier
    Reviewed-by: Catalin Marinas
    Acked-by: Hugh Dickins
    Cc: Russell King
    Cc: Mark Rutland
    Cc: Mel Gorman
    Cc: Will Deacon
    Cc: Christoffer Dall
    Cc: Andrea Arcangeli
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Steve Capper
     
  • Remove braces that are not necessary for any arm of single-statement
    conditionals ("3 brace coding style" cleanup).

    Signed-off-by: Paul McQuade
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Paul McQuade
     
  • WARNING: Prefer: pr_err(... to printk(KERN_ERR ...

    [akpm@linux-foundation.org: remove KERN_ERR]
    Signed-off-by: Paul McQuade
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Paul McQuade
     
  • The following commits prevented allocating a firmware_map_entry for a
    memory range that already has one:
    f0093ede: drivers/firmware/memmap.c: don't allocate firmware_map_entry
    of same memory range
    49c8b24d: drivers/firmware/memmap.c: pass the correct argument to
    firmware_map_find_entry_bootmem()

    But it's not enough. When a PNP0C80 device is added by acpi_scan_init(),
    memmap sysfs entries for the same firmware_map_entry are created twice,
    as follows:

    # cat /sys/firmware/memmap/*/start
    0x40000000000
    0x60000000000
    0x4a837000
    0x4a83a000
    0x4a8b5000
    ...
    0x40000000000
    0x60000000000
    ...

    The flows of the issues are as follows:

    1. e820_reserve_resources() allocates firmware_map_entrys of all
    memory ranges defined in e820. And, these firmware_map_entrys
    are linked with map_entries list.

    map_entries -> entry 1 -> ... -> entry N

    2. When the PNP0C80 device is limited by the mem= boot option,
    acpi_scan_init() adds the memory device. In this case,
    firmware_map_add_hotplug() allocates a firmware_map_entry and creates its
    memmap sysfs.

    map_entries -> entry 1 -> ... -> entry N -> entry N+1
                                                    |
                                                 memmap 1

    3. firmware_memmap_init() creates memmap sysfs entries for the
    firmware_map_entrys linked with map_entries.

    map_entries -> entry 1 -> ... -> entry N -> entry N+1
                      |                 |           |
                   memmap 2         memmap N+1   memmap 1
                                                 memmap N+2

    So while hot removing the PNP0C80 device, kernel panic occurs as follows:

    BUG: unable to handle kernel paging request at 00000001003e000b
    IP: sysfs_open_file+0x46/0x2b0
    PGD 203a89fe067 PUD 0
    Oops: 0000 [#1] SMP
    ...
    Call Trace:
    do_dentry_open+0x1ef/0x2a0
    finish_open+0x31/0x40
    do_last+0x57c/0x1220
    path_openat+0xc2/0x4c0
    do_filp_open+0x4b/0xb0
    do_sys_open+0xf3/0x1f0
    SyS_open+0x1e/0x20
    system_call_fastpath+0x16/0x1b

    The patch adds a check confirming whether the memmap sysfs of a
    firmware_map_entry has already been created, and does not create a
    second memmap sysfs for the same firmware_map_entry.
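
    Conceptually the guard looks like the following (purely illustrative; the
    flag name and its placement are hypothetical):

    static int add_sysfs_fw_map_entry(struct firmware_map_entry *entry)
    {
        if (entry->sysfs_added)         /* hypothetical "already created" flag */
            return 0;
        entry->sysfs_added = true;
        /* ... kobject_init_and_add() etc. as before ... */
        return 0;
    }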

    Signed-off-by: Yasuaki Ishimatsu
    Cc: Santosh Shilimkar
    Cc: Toshi Kani
    Cc: Greg Kroah-Hartman
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Yasuaki Ishimatsu
     
  • Replace asm/ headers with linux/ headers.

    Signed-off-by: Paul McQuade
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Paul McQuade
     
  • Signed-off-by: Paul McQuade
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Paul McQuade
     
  • "WARNING: Use #include instead of "

    Signed-off-by: Paul McQuade
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Paul McQuade
     
  • memcg_can_account_kmem() returns true iff

    !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) &&
    memcg_kmem_is_active(memcg);

    To begin with, the !mem_cgroup_is_root(memcg) check is useless, because
    one can't enable kmem accounting for the root cgroup (mem_cgroup_write()
    returns EINVAL on an attempt to set the limit on the root cgroup).

    Furthermore, the !mem_cgroup_disabled() check also seems to be redundant.
    The point is that memcg_can_account_kmem() is called from three places:
    mem_cgroup_slabinfo_read(), __memcg_kmem_get_cache(), and
    __memcg_kmem_newpage_charge(). The latter two functions are only invoked
    if memcg_kmem_enabled() returns true, which implies that the memory
    cgroup subsystem is enabled. And mem_cgroup_slabinfo_read() shows the
    output of memory.kmem.slabinfo, which won't exist if the memory cgroup is
    completely disabled.

    So let's substitute all the calls to memcg_can_account_kmem() with plain
    memcg_kmem_is_active(), and kill the former.
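
    For reference, a sketch of the helper being removed (its body is quoted
    above) and what remains at the call sites:

    /* before */
    static bool memcg_can_account_kmem(struct mem_cgroup *memcg)
    {
        return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) &&
               memcg_kmem_is_active(memcg);
    }

    /* after: callers simply test memcg_kmem_is_active(memcg) */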

    Signed-off-by: Vladimir Davydov
    Acked-by: Johannes Weiner
    Cc: Michal Hocko
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Vladimir Davydov
     
  • In a memcg with even just moderate cache pressure, success rates for
    transparent huge page allocations drop to zero, wasting a lot of effort
    that the allocator puts into assembling these pages.

    The reason for this is that the memcg reclaim code was never designed for
    higher-order charges. It reclaims in small batches until there is room
    for at least one page. Huge page charges only succeed when these batches
    add up over a series of huge faults, which is unlikely under any
    significant load involving order-0 allocations in the group.

    Remove that loop on the memcg side in favor of passing the actual reclaim
    goal to direct reclaim, which is already set up and optimized to meet
    higher-order goals efficiently.
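
    The gist of the change is to hand the actual charge size to direct
    reclaim instead of looping; roughly (call site simplified, signature as
    introduced by this series):

    /* in try_charge(), on hitting the limit: reclaim nr_pages at once */
    nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
                                                gfp_mask, may_swap);
    if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
        goto retry;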

    This brings memcg's THP policy in line with the system policy: if the
    allocator painstakingly assembles a hugepage, memcg will at least make an
    honest effort to charge it. As a result, transparent hugepage allocation
    rates amid cache activity are drastically improved:

                            vanilla               patched
    pgalloc              4717530.80 ( +0.00%)  4451376.40 (  -5.64%)
    pgfault               491370.60 ( +0.00%)   225477.40 ( -54.11%)
    pgmajfault                 2.00 ( +0.00%)        1.80 (  -6.67%)
    thp_fault_alloc            0.00 ( +0.00%)      531.60 (+100.00%)
    thp_fault_fallback       749.00 ( +0.00%)      217.40 ( -70.88%)

    [ Note: this may in turn increase memory consumption from internal
    fragmentation, which is an inherent risk of transparent hugepages.
    Some setups may have to adjust the memcg limits accordingly to
    accommodate this - or, if the machine is already packed to capacity,
    disable the transparent huge page feature. ]

    Signed-off-by: Johannes Weiner
    Reviewed-by: Vladimir Davydov
    Cc: Michal Hocko
    Cc: Dave Hansen
    Cc: Greg Thelen
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Johannes Weiner
     
  • When attempting to charge pages, we first charge the memory counter and
    then the memory+swap counter. If one of the counters is at its limit, we
    enter reclaim, but if it's the memory+swap counter, reclaim shouldn't swap
    because that wouldn't change the situation. However, if the counters have
    the same limits, we never get to the memory+swap limit. To know whether
    reclaim should swap or not, there is a state flag that indicates whether
    the limits are equal and whether hitting the memory limit implies hitting
    the memory+swap limit.

    Just try the memory+swap counter first.

    Signed-off-by: Johannes Weiner
    Reviewed-by: Vladimir Davydov
    Acked-by: Michal Hocko
    Cc: Dave Hansen
    Cc: Greg Thelen
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Johannes Weiner
     
  • free_pages_and_swap_cache limits release_pages to PAGEVEC_SIZE chunks.
    This is not a big deal for the normal release path but it completely kills
    memcg uncharge batching which reduces res_counter spin_lock contention.
    Dave has noticed this with his page fault scalability test case on a large
    machine when the lock was basically dominating on all CPUs:

    80.18% 80.18% [kernel] [k] _raw_spin_lock
    |
    --- _raw_spin_lock
    |
    |--66.59%-- res_counter_uncharge_until
    | res_counter_uncharge
    | uncharge_batch
    | uncharge_list
    | mem_cgroup_uncharge_list
    | release_pages
    | free_pages_and_swap_cache
    | tlb_flush_mmu_free
    | |
    | |--90.12%-- unmap_single_vma
    | | unmap_vmas
    | | unmap_region
    | | do_munmap
    | | vm_munmap
    | | sys_munmap
    | | system_call_fastpath
    | | __GI___munmap
    | |
    | --9.88%-- tlb_flush_mmu
    | tlb_finish_mmu
    | unmap_region
    | do_munmap
    | vm_munmap
    | sys_munmap
    | system_call_fastpath
    | __GI___munmap

    In his case the load was running in the root memcg and that part has been
    handled by reverting 05b843012335 ("mm: memcontrol: use root_mem_cgroup
    res_counter") because this is a clear regression, but the problem remains
    inside dedicated memcgs.

    There is no reason to limit release_pages to PAGEVEC_SIZE batches other
    than lru_lock held times. This logic, however, can be moved inside the
    function. mem_cgroup_uncharge_list and free_hot_cold_page_list do not
    hold any lock for the whole pages_to_free list so it is safe to call them
    in a single run.

    The release_pages() code was previously breaking the lru_lock each
    PAGEVEC_SIZE pages (ie, 14 pages). However this code has no usage of
    pagevecs so switch to breaking the lock at least every SWAP_CLUSTER_MAX
    (32) pages. This means that the lock acquisition frequency is
    approximately halved and the max hold times are approximately doubled.

    The now unneeded batching is removed from free_pages_and_swap_cache().

    Also update the grossly out-of-date release_pages documentation.
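
    A sketch of the lock-breaking now done inside release_pages() itself
    (variable names illustrative):

    /* break the lru_lock at least every SWAP_CLUSTER_MAX (32) pages */
    if (zone && ++lock_batch == SWAP_CLUSTER_MAX) {
        spin_unlock_irqrestore(&zone->lru_lock, flags);
        zone = NULL;        /* the lock (and counter) is re-taken lazily */
    }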

    Signed-off-by: Michal Hocko
    Signed-off-by: Johannes Weiner
    Reported-by: Dave Hansen
    Cc: Vladimir Davydov
    Cc: Greg Thelen
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Michal Hocko