19 Jan, 2016

1 commit

  • Conflicts:
    arch/arm/boot/dts/Makefile
    arch/arm/boot/dts/imx6qdl-sabreauto.dtsi
    arch/arm/boot/dts/imx6qdl-sabresd.dtsi
    arch/arm/boot/dts/imx6qp-sabresd.dts
    arch/arm/boot/dts/imx6sl-evk.dts
    arch/arm/boot/dts/imx6sl.dtsi
    arch/arm/boot/dts/imx6sx-14x14-arm2.dts
    arch/arm/boot/dts/imx6sx-19x19-arm2.dts
    arch/arm/boot/dts/imx6sx-sabreauto.dts
    arch/arm/boot/dts/imx6sx-sdb-btwifi.dts
    arch/arm/boot/dts/imx6sx-sdb.dtsi
    arch/arm/boot/dts/imx6sx.dtsi
    arch/arm/boot/dts/imx6ul-14x14-evk.dts
    arch/arm/boot/dts/imx6ul-9x9-evk.dts
    arch/arm/boot/dts/imx6ul-evk-btwifi.dtsi
    arch/arm/boot/dts/imx6ul-pinfunc.h
    arch/arm/boot/dts/imx6ul.dtsi
    arch/arm/boot/dts/imx7d-12x12-lpddr3-arm2.dts
    arch/arm/boot/dts/imx7d-pinfunc.h
    arch/arm/boot/dts/imx7d-sdb-epdc.dtsi
    arch/arm/boot/dts/imx7d-sdb-m4.dtsi
    arch/arm/boot/dts/imx7d-sdb-reva-touch.dts
    arch/arm/boot/dts/imx7d-sdb-reva.dts
    arch/arm/boot/dts/imx7d-sdb.dts
    arch/arm/boot/dts/imx7d.dtsi
    arch/arm/configs/imx_v7_defconfig
    arch/arm/configs/imx_v7_mfg_defconfig
    arch/arm/mach-imx/clk-imx6q.c
    arch/arm/mach-imx/clk.h
    arch/arm/mach-imx/cpuidle-imx7d.c
    arch/arm/mach-imx/ddr3_freq_imx7d.S
    arch/arm/mach-imx/gpcv2.c
    arch/arm/mach-imx/imx7d_low_power_idle.S
    arch/arm/mach-imx/lpddr3_freq_imx.S
    arch/arm/mach-imx/mach-imx7d.c
    arch/arm/mach-imx/pm-imx7.c
    arch/arm/mach-imx/suspend-imx7.S
    drivers/ata/ahci_imx.c
    drivers/cpufreq/imx6q-cpufreq.c
    drivers/dma/imx-sdma.c
    drivers/dma/pxp/pxp_dma_v2.c
    drivers/input/touchscreen/ads7846.c
    drivers/media/platform/mxc/capture/ov5640_mipi.c
    drivers/media/platform/mxc/output/mxc_pxp_v4l2.c
    drivers/mmc/core/core.c
    drivers/mmc/core/sd.c
    drivers/mtd/spi-nor/fsl-quadspi.c
    drivers/mxc/gpu-viv/Kbuild
    drivers/mxc/gpu-viv/config
    drivers/mxc/gpu-viv/hal/kernel/arch/gc_hal_kernel_context.c
    drivers/mxc/gpu-viv/hal/kernel/arch/gc_hal_kernel_context.h
    drivers/mxc/gpu-viv/hal/kernel/arch/gc_hal_kernel_hardware.c
    drivers/mxc/gpu-viv/hal/kernel/arch/gc_hal_kernel_hardware.h
    drivers/mxc/gpu-viv/hal/kernel/arch/gc_hal_kernel_recorder.c
    drivers/mxc/gpu-viv/hal/kernel/archvg/gc_hal_kernel_hardware_command_vg.c
    drivers/mxc/gpu-viv/hal/kernel/archvg/gc_hal_kernel_hardware_command_vg.h
    drivers/mxc/gpu-viv/hal/kernel/archvg/gc_hal_kernel_hardware_vg.c
    drivers/mxc/gpu-viv/hal/kernel/archvg/gc_hal_kernel_hardware_vg.h
    drivers/mxc/gpu-viv/hal/kernel/gc_hal_kernel.c
    drivers/mxc/gpu-viv/hal/kernel/gc_hal_kernel.h
    drivers/mxc/gpu-viv/hal/kernel/gc_hal_kernel_command.c
    drivers/mxc/gpu-viv/hal/kernel/gc_hal_kernel_command_vg.c
    drivers/mxc/gpu-viv/hal/kernel/gc_hal_kernel_db.c
    drivers/mxc/gpu-viv/hal/kernel/gc_hal_kernel_debug.c
    drivers/mxc/gpu-viv/hal/kernel/gc_hal_kernel_event.c
    drivers/mxc/gpu-viv/hal/kernel/gc_hal_kernel_heap.c
    drivers/mxc/gpu-viv/hal/kernel/gc_hal_kernel_interrupt_vg.c
    drivers/mxc/gpu-viv/hal/kernel/gc_hal_kernel_mmu.c
    drivers/mxc/gpu-viv/hal/kernel/gc_hal_kernel_mmu_vg.c
    drivers/mxc/gpu-viv/hal/kernel/gc_hal_kernel_power.c
    drivers/mxc/gpu-viv/hal/kernel/gc_hal_kernel_precomp.h
    drivers/mxc/gpu-viv/hal/kernel/gc_hal_kernel_security.c
    drivers/mxc/gpu-viv/hal/kernel/gc_hal_kernel_vg.c
    drivers/mxc/gpu-viv/hal/kernel/gc_hal_kernel_vg.h
    drivers/mxc/gpu-viv/hal/kernel/gc_hal_kernel_video_memory.c
    drivers/mxc/gpu-viv/hal/kernel/inc/gc_hal.h
    drivers/mxc/gpu-viv/hal/kernel/inc/gc_hal_base.h
    drivers/mxc/gpu-viv/hal/kernel/inc/gc_hal_driver.h
    drivers/mxc/gpu-viv/hal/kernel/inc/gc_hal_driver_vg.h
    drivers/mxc/gpu-viv/hal/kernel/inc/gc_hal_dump.h
    drivers/mxc/gpu-viv/hal/kernel/inc/gc_hal_eglplatform.h
    drivers/mxc/gpu-viv/hal/kernel/inc/gc_hal_eglplatform_type.h
    drivers/mxc/gpu-viv/hal/kernel/inc/gc_hal_engine.h
    drivers/mxc/gpu-viv/hal/kernel/inc/gc_hal_engine_vg.h
    drivers/mxc/gpu-viv/hal/kernel/inc/gc_hal_enum.h
    drivers/mxc/gpu-viv/hal/kernel/inc/gc_hal_kernel_buffer.h
    drivers/mxc/gpu-viv/hal/kernel/inc/gc_hal_mem.h
    drivers/mxc/gpu-viv/hal/kernel/inc/gc_hal_options.h
    drivers/mxc/gpu-viv/hal/kernel/inc/gc_hal_profiler.h
    drivers/mxc/gpu-viv/hal/kernel/inc/gc_hal_raster.h
    drivers/mxc/gpu-viv/hal/kernel/inc/gc_hal_rename.h
    drivers/mxc/gpu-viv/hal/kernel/inc/gc_hal_security_interface.h
    drivers/mxc/gpu-viv/hal/kernel/inc/gc_hal_statistics.h
    drivers/mxc/gpu-viv/hal/kernel/inc/gc_hal_types.h
    drivers/mxc/gpu-viv/hal/kernel/inc/gc_hal_version.h
    drivers/mxc/gpu-viv/hal/kernel/inc/gc_hal_vg.h
    drivers/mxc/gpu-viv/hal/os/linux/kernel/allocator/default/gc_hal_kernel_allocator_array.h
    drivers/mxc/gpu-viv/hal/os/linux/kernel/allocator/default/gc_hal_kernel_allocator_dmabuf.c
    drivers/mxc/gpu-viv/hal/os/linux/kernel/allocator/freescale/gc_hal_kernel_allocator_array.h
    drivers/mxc/gpu-viv/hal/os/linux/kernel/allocator/freescale/gc_hal_kernel_allocator_cma.c
    drivers/mxc/gpu-viv/hal/os/linux/kernel/gc_hal_kernel_allocator.c
    drivers/mxc/gpu-viv/hal/os/linux/kernel/gc_hal_kernel_allocator.h
    drivers/mxc/gpu-viv/hal/os/linux/kernel/gc_hal_kernel_debug.h
    drivers/mxc/gpu-viv/hal/os/linux/kernel/gc_hal_kernel_debugfs.c
    drivers/mxc/gpu-viv/hal/os/linux/kernel/gc_hal_kernel_debugfs.h
    drivers/mxc/gpu-viv/hal/os/linux/kernel/gc_hal_kernel_device.c
    drivers/mxc/gpu-viv/hal/os/linux/kernel/gc_hal_kernel_device.h
    drivers/mxc/gpu-viv/hal/os/linux/kernel/gc_hal_kernel_iommu.c
    drivers/mxc/gpu-viv/hal/os/linux/kernel/gc_hal_kernel_linux.c
    drivers/mxc/gpu-viv/hal/os/linux/kernel/gc_hal_kernel_linux.h
    drivers/mxc/gpu-viv/hal/os/linux/kernel/gc_hal_kernel_math.c
    drivers/mxc/gpu-viv/hal/os/linux/kernel/gc_hal_kernel_mutex.h
    drivers/mxc/gpu-viv/hal/os/linux/kernel/gc_hal_kernel_os.c
    drivers/mxc/gpu-viv/hal/os/linux/kernel/gc_hal_kernel_os.h
    drivers/mxc/gpu-viv/hal/os/linux/kernel/gc_hal_kernel_platform.h
    drivers/mxc/gpu-viv/hal/os/linux/kernel/gc_hal_kernel_probe.c
    drivers/mxc/gpu-viv/hal/os/linux/kernel/gc_hal_kernel_security_channel.c
    drivers/mxc/gpu-viv/hal/os/linux/kernel/gc_hal_kernel_sync.c
    drivers/mxc/gpu-viv/hal/os/linux/kernel/gc_hal_kernel_sync.h
    drivers/mxc/gpu-viv/hal/os/linux/kernel/platform/freescale/gc_hal_kernel_platform_imx6q14.c
    drivers/mxc/gpu-viv/hal/os/linux/kernel/platform/freescale/gc_hal_kernel_platform_imx6q14.config
    drivers/mxc/hdmi-cec/mxc_hdmi-cec.c
    drivers/mxc/ipu3/ipu_common.c
    drivers/mxc/mlb/mxc_mlb.c
    drivers/net/ethernet/freescale/fec_main.c
    drivers/net/wireless/bcmdhd/dhd_linux.c
    drivers/net/wireless/bcmdhd/dhd_sdio.c
    drivers/scsi/scsi_error.c
    drivers/spi/spi-imx.c
    drivers/thermal/imx_thermal.c
    drivers/tty/serial/imx.c
    drivers/usb/chipidea/udc.c
    drivers/usb/gadget/configfs.c
    drivers/video/fbdev/mxc/mipi_dsi.c
    drivers/video/fbdev/mxc/mipi_dsi.h
    drivers/video/fbdev/mxc/mipi_dsi_samsung.c
    drivers/video/fbdev/mxc/mxc_edid.c
    drivers/video/fbdev/mxc/mxc_epdc_fb.c
    drivers/video/fbdev/mxc/mxc_epdc_v2_fb.c
    drivers/video/fbdev/mxc/mxc_ipuv3_fb.c
    drivers/video/fbdev/mxc/mxcfb_hx8369_wvga.c
    drivers/video/fbdev/mxsfb.c
    firmware/imx/sdma/sdma-imx6q.bin.ihex
    include/trace/events/cpufreq_interactive.h

    guoyin.chen
     

20 Nov, 2015

2 commits

  • Add a userspace visible knob to tell the VM to keep an extra amount
    of memory free, by increasing the gap between each zone's min and
    low watermarks.

    This is useful for realtime applications that make system calls and
    have a bound on the number of allocations that happen in any short
    time period. In this application, extra_free_kbytes would be left at
    an amount equal to or larger than the maximum number of allocations
    that happen in any burst.

    It may also be useful to reduce the memory use of virtual
    machines (temporarily?), in a way that does not cause memory
    fragmentation like ballooning does.

    [ccross]
    Revived for use on old kernels where no other solution exists.
    The tunable will be removed on kernels that do better at avoiding
    direct reclaim.

    Change-Id: I765a42be8e964bfd3e2886d1ca85a29d60c3bb3e
    Signed-off-by: Rik van Riel
    Signed-off-by: Colin Cross

    Rik van Riel
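
    A minimal user-space sketch of how such a knob could widen the min/low
    gap: the extra_free_kbytes value is simply added on top of the usual
    quarter-of-min spacing between the min and low watermarks. This is an
    assumption about the placement; the real change lives in the kernel's
    per-zone watermark setup and may differ in detail.

        /* Illustrative model only; not the kernel implementation. */
        #include <stdio.h>

        /* Hypothetical per-zone watermark calculation with an extra gap. */
        static void setup_wmarks(unsigned long min_kb, unsigned long extra_free_kb,
                                 unsigned long zone_pages, unsigned long total_pages,
                                 unsigned long page_kb)
        {
            /* This zone's share of the global minimum, in pages. */
            unsigned long min = (min_kb / page_kb) * zone_pages / total_pages;
            /* This zone's share of the extra gap requested by userspace. */
            unsigned long extra = (extra_free_kb / page_kb) * zone_pages / total_pages;

            unsigned long wmark_min  = min;
            unsigned long wmark_low  = min + (min >> 2) + extra;   /* widened gap */
            unsigned long wmark_high = min + (min >> 1) + extra;

            printf("min=%lu low=%lu high=%lu (pages)\n",
                   wmark_min, wmark_low, wmark_high);
        }

        int main(void)
        {
            /* One zone holding all memory, 4 KiB pages, min_free_kbytes=65536,
             * extra_free_kbytes=131072 -- all numbers are made up. */
            setup_wmarks(65536, 131072, 262144, 262144, 4);
            return 0;
        }

    Raising the knob moves the low (and high) watermarks up, so kswapd
    starts reclaiming earlier and keeps a larger cushion of free pages for
    bursty allocators.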
     
  • By default the kernel tries to keep half as much memory free at each
    order as it does for one order below. This can be too aggressive when
    running without swap.

    Change-Id: I5efc1a0b50f41ff3ac71e92d2efd175dedd54ead
    Signed-off-by: Arve Hjønnevåg

    Arve Hjønnevåg
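
    The behaviour being tuned can be pictured as the classic per-order
    watermark check, where the required amount of free memory is halved for
    each higher order. The sketch below models that loop with a hypothetical
    order_shift parameter standing in for a tunable that relaxes the
    halving; names and numbers are invented for illustration.

        /* Illustrative model of a per-order watermark check; not kernel code. */
        #include <stdbool.h>
        #include <stdio.h>

        #define MAX_ORDER 11

        /* nr_free[o] = number of free blocks of order o in the zone. */
        static bool watermark_ok(unsigned long free_pages, unsigned int order,
                                 unsigned long min, unsigned int order_shift,
                                 const unsigned long nr_free[MAX_ORDER])
        {
            if (free_pages <= min)
                return false;

            for (unsigned int o = 0; o < order; o++) {
                /* Blocks of this order are too small for the request. */
                free_pages -= nr_free[o] << o;
                /* A shift of 1 demands half as much free memory at each
                 * higher order; a larger shift relaxes that requirement. */
                min >>= order_shift;
                if (free_pages <= min)
                    return false;
            }
            return true;
        }

        int main(void)
        {
            unsigned long nr_free[MAX_ORDER] = { 4096, 512, 64, 8 };
            unsigned long total = 4096*1 + 512*2 + 64*4 + 8*8;

            printf("order-3 ok, shift 1: %d\n", watermark_ok(total, 3, 1024, 1, nr_free));
            printf("order-3 ok, shift 4: %d\n", watermark_ok(total, 3, 1024, 4, nr_free));
            return 0;
        }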
     

30 Sep, 2015

1 commit

  • commit 2f064f3485cd29633ad1b3cfb00cc519509a3d72 upstream.

    Commit c48a11c7ad26 ("netvm: propagate page->pfmemalloc to skb") added
    checks for page->pfmemalloc to __skb_fill_page_desc():

    if (page->pfmemalloc && !page->mapping)
        skb->pfmemalloc = true;

    It assumes that page->mapping == NULL implies page->pfmemalloc can be
    trusted. However, __delete_from_page_cache() can set page->mapping to
    NULL while leaving page->index alone. Because the two fields share a
    union, a non-zero page->index is then interpreted as a true
    page->pfmemalloc.

    So the assumption is invalid if the networking code can see such a
    page, and it seems it can. We have encountered this with an NFS over
    loopback setup when such a page is attached to a new skbuff. There is
    no copying going on in this case, so the page confuses
    __skb_fill_page_desc, which interprets the index as the pfmemalloc
    flag. The network stack then drops packets that were allocated from
    the reserves unless they are queued on sockets handling the swapping,
    which is the case here. That leads to hangs when the NFS client waits
    for a response from the server which has been dropped and thus never
    arrives.

    The struct page is already heavily packed, so rather than finding
    another hole to put the flag in, let's use a trick instead: reuse
    page->index but define an impossible value (-1UL) as the marker. This
    is a page index, so it should never legitimately hold a value that
    large. Replace all direct users of page->pfmemalloc with
    page_is_pfmemalloc(), which hides this nastiness from unspoiled eyes.

    The information will get lost if somebody wants to use page->index
    obviously but that was the case before and the original code expected
    that the information should be persisted somewhere else if that is
    really needed (e.g. what SLAB and SLUB do).

    [akpm@linux-foundation.org: fix blooper in slub]
    Fixes: c48a11c7ad26 ("netvm: propagate page->pfmemalloc to skb")
    Signed-off-by: Michal Hocko
    Debugged-by: Vlastimil Babka
    Debugged-by: Jiri Bohac
    Cc: Eric Dumazet
    Cc: David Miller
    Acked-by: Mel Gorman
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds
    Signed-off-by: Greg Kroah-Hartman

    Michal Hocko
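
    A self-contained sketch of the trick described above. The struct layout
    is heavily simplified and the setter is added only for the demo; the
    point is that the old flag aliased page->index, while the new
    page_is_pfmemalloc() helper only trusts the impossible -1UL value.

        /* Simplified model of the page->pfmemalloc problem and fix; not kernel code. */
        #include <stdbool.h>
        #include <stdio.h>

        struct fake_page {
            void *mapping;                 /* NULL after __delete_from_page_cache() */
            union {
                unsigned long index;       /* page offset while in the page cache */
                unsigned long pfmemalloc;  /* old scheme: a flag aliasing index
                                            * (widened here so the aliasing is
                                            * well-defined in plain C) */
            };
        };

        /* New scheme from the commit: reuse index with an impossible value. */
        static void set_page_pfmemalloc(struct fake_page *p)
        {
            p->index = (unsigned long)-1;
        }

        static bool page_is_pfmemalloc(const struct fake_page *p)
        {
            return p->index == (unsigned long)-1;
        }

        int main(void)
        {
            /* Page removed from the cache: mapping cleared, index left alone. */
            struct fake_page p = { .mapping = NULL, .index = 1234 };

            /* Old check, as quoted above: misreads the stale index as the flag. */
            printf("old check says pfmemalloc: %d\n", p.pfmemalloc && !p.mapping);

            /* New helper: only the -1UL sentinel counts. */
            printf("page_is_pfmemalloc():      %d\n", page_is_pfmemalloc(&p));

            set_page_pfmemalloc(&p);
            printf("after set:                 %d\n", page_is_pfmemalloc(&p));
            return 0;
        }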
     

16 Apr, 2015

1 commit

  • We converted some of the usages of ACCESS_ONCE to READ_ONCE in the mm/
    tree, since ACCESS_ONCE doesn't work reliably on non-scalar types.

    This patch removes the rest of the ACCESS_ONCE usages and uses the new
    READ_ONCE API for those read accesses. This keeps things cleaner by
    using a single API instead of separate/multiple sets of APIs.

    Signed-off-by: Jason Low
    Acked-by: Michal Hocko
    Acked-by: Davidlohr Bueso
    Acked-by: Rik van Riel
    Reviewed-by: Christian Borntraeger
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Jason Low
     

15 Apr, 2015

7 commits

  • Signed-off-by: Yaowei Bai
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Yaowei Bai
     
  • NOTE: this is not about __GFP_THISNODE, this is only about GFP_THISNODE.

    GFP_THISNODE is a secret combination of gfp bits that have different
    behavior than expected. It is a combination of __GFP_THISNODE,
    __GFP_NORETRY, and __GFP_NOWARN and is special-cased in the page
    allocator slowpath to fail without trying reclaim even though it may be
    used in combination with __GFP_WAIT.

    An example of the problem this creates: commit e97ca8e5b864 ("mm: fix
    GFP_THISNODE callers and clarify") fixed up many users of GFP_THISNODE
    that really just wanted __GFP_THISNODE. The problem doesn't end there,
    however, because that fix was still a no-op for
    alloc_misplaced_dst_page(), which also sets __GFP_NORETRY and
    __GFP_NOWARN, and for migrate_misplaced_transhuge_page(), where
    __GFP_NORETRY and __GFP_NOWAIT are set in GFP_TRANSHUGE. Converting
    GFP_THISNODE to __GFP_THISNODE is a no-op in these cases since the page
    allocator special-cases __GFP_THISNODE && __GFP_NORETRY &&
    __GFP_NOWARN.

    It's time to just remove GFP_THISNODE entirely. We leave __GFP_THISNODE
    to restrict an allocation to a local node, but remove GFP_THISNODE and
    its obscurity. Instead, we require that a caller clear __GFP_WAIT if it
    wants to avoid reclaim.

    This allows the aforementioned functions to actually reclaim as they
    should. It also enables any future callers that want to do
    __GFP_THISNODE but also __GFP_NORETRY && __GFP_NOWARN to reclaim. The
    rule is simple: if you don't want to reclaim, then don't set __GFP_WAIT.

    Aside: ovs_flow_stats_update() really wants to avoid reclaim as well, so
    it is unchanged.

    Signed-off-by: David Rientjes
    Acked-by: Vlastimil Babka
    Cc: Christoph Lameter
    Acked-by: Pekka Enberg
    Cc: Joonsoo Kim
    Acked-by: Johannes Weiner
    Cc: Mel Gorman
    Cc: Pravin Shelar
    Cc: Jarno Rajahalme
    Cc: Li Zefan
    Cc: Greg Thelen
    Cc: Tejun Heo
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    David Rientjes
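
    For reference, the GFP_THISNODE being removed was (in the NUMA case) a
    shorthand for the three bits quoted above, and the replacement rule is
    to use plain __GFP_THISNODE and clear __GFP_WAIT when reclaim must be
    avoided. The flag values in this sketch are invented; only the
    combinations mirror what the commit describes.

        #include <stdio.h>

        /* Made-up bit values; the real ones live in include/linux/gfp.h. */
        #define __GFP_WAIT      0x01u
        #define __GFP_IO        0x02u
        #define __GFP_FS        0x04u
        #define __GFP_NOWARN    0x08u
        #define __GFP_NORETRY   0x10u
        #define __GFP_THISNODE  0x20u

        #define GFP_KERNEL      (__GFP_WAIT | __GFP_IO | __GFP_FS)
        /* The "secret combination" the page allocator special-cased: */
        #define GFP_THISNODE    (__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY)

        int main(void)
        {
            /* Old style: opts out of reclaim implicitly via the special case. */
            unsigned int old_mask = GFP_KERNEL | GFP_THISNODE;

            /* New style: the node restriction is explicit, and reclaim is
             * avoided only by clearing __GFP_WAIT. */
            unsigned int node_with_reclaim = GFP_KERNEL | __GFP_THISNODE;
            unsigned int node_no_reclaim   = (GFP_KERNEL | __GFP_THISNODE) & ~__GFP_WAIT;

            printf("%#x %#x %#x\n", old_mask, node_with_reclaim, node_no_reclaim);
            return 0;
        }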
     
  • It seems nobody needs this.

    Signed-off-by: Konstantin Khlebnikov
    Cc: Michal Hocko
    Cc: David Rientjes
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Konstantin Khlebnikov
     
  • This makes show_mem() much less verbose on huge machines. Instead of a
    huge and almost useless dump of counters for each zone's per-cpu lists,
    this patch prints the sum of those counters for each zone (free_pcp)
    and the size of the current CPU's per-cpu list (local_pcp).

    The filter flag SHOW_MEM_PERCPU_LISTS reverts to the old verbose mode.

    [akpm@linux-foundation.org: update show_free_areas comment]
    Signed-off-by: Konstantin Khlebnikov
    Acked-by: Michal Hocko
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Konstantin Khlebnikov
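
    A minimal model of the summarised output: sum the per-cpu pagelist
    counts for each zone instead of dumping every CPU's list. The
    free_pcp/local_pcp names follow the commit text; the per-cpu data and
    zone layout here are fabricated.

        /* Illustrative only: summarising per-cpu pagelist counts per zone. */
        #include <stdio.h>

        #define NR_CPUS  4
        #define NR_ZONES 2

        /* pcp_count[zone][cpu] = pages sitting on that CPU's per-cpu list. */
        static const int pcp_count[NR_ZONES][NR_CPUS] = {
            { 12,  7,  0, 31 },   /* DMA    */
            { 96, 80, 55, 64 },   /* Normal */
        };
        static const char *zone_name[NR_ZONES] = { "DMA", "Normal" };

        int main(void)
        {
            int this_cpu = 2;     /* pretend we are running on CPU 2 */

            for (int z = 0; z < NR_ZONES; z++) {
                int free_pcp = 0;
                for (int cpu = 0; cpu < NR_CPUS; cpu++)
                    free_pcp += pcp_count[z][cpu];
                /* One compact line per zone instead of one line per CPU. */
                printf("%s free_pcp:%d local_pcp:%d\n",
                       zone_name[z], free_pcp, pcp_count[z][this_cpu]);
            }
            return 0;
        }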
     
  • Compaction has its own anti-fragmentation rule: if no free page of the
    requested migratetype is found in the buddy lists, compaction only
    finishes once a free page larger than pageblock order is available.
    This mitigates fragmentation, but it ignores the migratetype entirely
    and is far stricter than the page allocator's own anti-fragmentation
    logic.

    Ignoring the migratetype can cause compaction to finish prematurely.
    For example, if the allocation request is for the unmovable
    migratetype, a free page of CMA migratetype doesn't help that
    allocation, so compaction should not be stopped. The current logic,
    however, regards this situation as "compaction is no longer needed"
    and finishes.

    Secondly, the condition is too strict compared to the page allocator's
    logic. The page allocator can steal free pages from other migratetypes
    and change a pageblock's migratetype under much more relaxed
    conditions. That mechanism is designed to prevent fragmentation and we
    can use it here as well. Imposing a hard constraint only on compaction
    doesn't help much, since the page allocator would cause fragmentation
    again anyway.

    To solve these problems, this patch borrows the anti-fragmentation
    logic from the page allocator. It reduces premature compaction
    finishes in some cases and reduces excessive compaction work.

    The stress-highalloc test in mmtests with non-movable order-7
    allocations shows a considerable increase in compaction success rate.

    Compaction success rate (Compaction success * 100 / Compaction stalls, %)
    31.82 : 42.20

    I tested it over 5 non-reboot runs of the stress-highalloc benchmark
    and found no additional degradation of the allocation success rate
    compared to before, which roughly means that this patch does not
    result in more fragmentation.

    Vlastimil suggested an additional idea: only test for fallbacks once
    the migration scanner has scanned a whole pageblock. That looked good
    for fragmentation, because the chance of stealing increases as more
    free pages accumulate in a given pageblock. I tested it, but it
    resulted in a lower compaction success rate, roughly 38.00. I guess
    the reason is that under low-memory conditions the watermark check can
    fail due to a lack of order-0 free pages, so sometimes we never reach
    the fallback check even though migrate_pfn is aligned to
    pageblock_nr_pages. I could add code to cope with this situation, but
    it would complicate the code, so I have not included his idea in this
    patch.

    [akpm@linux-foundation.org: fix CONFIG_CMA=n build]
    Signed-off-by: Joonsoo Kim
    Acked-by: Vlastimil Babka
    Cc: Mel Gorman
    Cc: David Rientjes
    Cc: Rik van Riel
    Cc: Zhang Yanfei
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Joonsoo Kim
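
    A rough model of the finish condition this patch moves towards:
    compaction is done once the requested migratetype has a suitably sized
    free block, a MOVABLE request can use CMA, or the page allocator's
    fallback/stealing rules say the allocation could be satisfied anyway.
    The helper and data structures below are invented stand-ins, not the
    kernel implementation.

        /* Sketch of a migratetype-aware "is compaction finished?" check. */
        #include <stdbool.h>
        #include <stdio.h>

        enum mt { UNMOVABLE, RECLAIMABLE, MOVABLE, CMA, NR_MT };
        #define MAX_ORDER 11

        /* nr_free[order][mt]: free blocks of that order and migratetype. */
        static int nr_free[MAX_ORDER][NR_MT];

        /* Stand-in for the page allocator's fallback rules: would stealing
         * a block of this order for 'mt' be acceptable? (grossly simplified) */
        static bool suitable_fallback_exists(int order, enum mt mt)
        {
            for (int f = 0; f < NR_MT; f++)
                if (f != (int)mt && f != CMA && nr_free[order][f] > 0)
                    return true;
            return false;
        }

        static bool compaction_finished(int order, enum mt mt)
        {
            for (int o = order; o < MAX_ORDER; o++) {
                if (nr_free[o][mt] > 0)
                    return true;                  /* exact match available  */
                if (mt == MOVABLE && nr_free[o][CMA] > 0)
                    return true;                  /* movable may use CMA    */
                if (suitable_fallback_exists(o, mt))
                    return true;                  /* allocator could steal  */
            }
            return false;                         /* keep compacting        */
        }

        int main(void)
        {
            nr_free[9][CMA] = 4;   /* only high-order CMA blocks are free */

            /* CMA doesn't help an unmovable request, so keep compacting;
             * a movable request is already satisfiable. */
            printf("unmovable order-5 done? %d\n", compaction_finished(5, UNMOVABLE));
            printf("movable   order-5 done? %d\n", compaction_finished(5, MOVABLE));
            return 0;
        }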
     
  • This is a preparation step for using the page allocator's
    anti-fragmentation logic in compaction. This patch just separates the
    fallback freepage checking part from the fallback freepage management
    part, so there is no functional change.

    Signed-off-by: Joonsoo Kim
    Acked-by: Vlastimil Babka
    Cc: Mel Gorman
    Cc: David Rientjes
    Cc: Rik van Riel
    Cc: Zhang Yanfei
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Joonsoo Kim
     
  • Freepages with MIGRATE_CMA can be used only for MIGRATE_MOVABLE
    allocations, and they should not be moved to another migratetype's
    buddy list, in order to protect them from unmovable/reclaimable
    allocations. Implementing these requirements in __rmqueue_fallback(),
    i.e. in the code that looks for the largest possible block of
    freepages, has the bad effect that high-order MIGRATE_CMA freepages
    are broken up continually even when CMA freepages of a suitable order
    exist. The reason is that they are never moved to another
    migratetype's buddy list, so the next __rmqueue_fallback() invocation
    finds another largest block of freepages and breaks it again. So the
    MIGRATE_CMA fallback should be handled separately. This patch
    introduces __rmqueue_cma_fallback(), which is just a wrapper around
    __rmqueue_smallest(), and calls it before __rmqueue_fallback() if
    migratetype == MIGRATE_MOVABLE.

    This results in an unintended behaviour change: MIGRATE_CMA freepages
    are always used first, rather than other migratetypes, as the movable
    allocation's fallback. But, as mentioned above, MIGRATE_CMA can be
    used only for MIGRATE_MOVABLE, so it is better to use MIGRATE_CMA
    freepages first as much as possible. Otherwise we needlessly take up
    precious freepages of other migratetypes and increase the chance of
    fragmentation.

    Signed-off-by: Joonsoo Kim
    Acked-by: Vlastimil Babka
    Cc: Mel Gorman
    Cc: David Rientjes
    Cc: Rik van Riel
    Cc: Zhang Yanfei
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Joonsoo Kim
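
    A sketch of the allocation order described above, with stubbed-out list
    handling: a movable request first tries a plain removal, then the CMA
    wrapper, and only then the generic fallback. The surrounding
    __rmqueue() shape is paraphrased and the helper bodies are
    placeholders, not the real implementations.

        /* Sketch of the allocation ordering described above; stubs only. */
        #include <stdio.h>

        enum mt { UNMOVABLE, RECLAIMABLE, MOVABLE, CMA };
        struct page;                        /* opaque in this sketch */

        static struct page *__rmqueue_smallest(int order, enum mt mt)
        {
            (void)order; (void)mt;
            return NULL;                    /* pretend the freelist is empty */
        }

        static struct page *__rmqueue_fallback(int order, enum mt mt)
        {
            (void)order; (void)mt;
            printf("falling back to another migratetype\n");
            return NULL;
        }

        /* Just a wrapper around __rmqueue_smallest() for MIGRATE_CMA. */
        static struct page *__rmqueue_cma_fallback(int order)
        {
            printf("trying CMA freepages first\n");
            return __rmqueue_smallest(order, CMA);
        }

        static struct page *rmqueue_model(int order, enum mt migratetype)
        {
            struct page *page = __rmqueue_smallest(order, migratetype);

            if (!page && migratetype == MOVABLE)
                page = __rmqueue_cma_fallback(order);
            if (!page)
                page = __rmqueue_fallback(order, migratetype);
            return page;
        }

        int main(void)
        {
            rmqueue_model(0, MOVABLE);      /* CMA is consulted before fallback */
            rmqueue_model(0, UNMOVABLE);    /* CMA is skipped entirely          */
            return 0;
        }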
     

13 Mar, 2015

1 commit

  • Tetsuo Handa has pointed out that __GFP_NOFAIL allocations might fail
    after OOM killer is disabled if the allocation is performed by a kernel
    thread. This behavior was introduced from the very beginning by
    7f33d49a2ed5 ("mm, PM/Freezer: Disable OOM killer when tasks are frozen").
    This means that the basic contract for the allocation request is broken
    and the context requesting such an allocation might blow up unexpectedly.

    There are basically two ways forward.

    1) move oom_killer_disable after kernel threads are frozen. This has a
    risk that the OOM victim wouldn't be able to finish because it would
    depend on an already frozen kernel thread. This would be really tricky
    to debug.

    2) do not fail __GFP_NOFAIL allocations no matter what, and accept the
    risk that freezable kernel threads will loop and fail the suspend.
    Incidental allocations after kernel threads are frozen will at least
    dump a warning - if we are lucky and the serial console is still
    active, of course...

    This patch implements the latter option because it is safer. We will
    see a warning rather than allocation failures for kernel threads that
    would otherwise blow up, and we have a higher chance of identifying
    __GFP_NOFAIL users in deeper PM code.

    Signed-off-by: Michal Hocko
    Acked-by: David Rientjes
    Cc: Johannes Weiner
    Cc: Tetsuo Handa
    Cc: "Rafael J. Wysocki"
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Michal Hocko
     

01 Mar, 2015

1 commit

  • Historically, !__GFP_FS allocations were not allowed to invoke the OOM
    killer once reclaim had failed, but nevertheless kept looping in the
    allocator.

    Commit 9879de7373fc ("mm: page_alloc: embed OOM killing naturally into
    allocation slowpath"), which should have been a simple cleanup patch,
    accidentally changed the behavior to aborting the allocation at that
    point. This creates problems with filesystem callers (?) that currently
    rely on the allocator waiting for other tasks to intervene.

    Revert the behavior as it shouldn't have been changed as part of a
    cleanup patch.

    Fixes: 9879de7373fc ("mm: page_alloc: embed OOM killing naturally into allocation slowpath")
    Signed-off-by: Johannes Weiner
    Acked-by: Michal Hocko
    Reported-by: Tetsuo Handa
    Cc: Theodore Ts'o
    Cc: Dave Chinner
    Acked-by: David Rientjes
    Cc: Oleg Nesterov
    Cc: Mel Gorman
    Cc: [3.19.x]
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Johannes Weiner
     

14 Feb, 2015

1 commit

  • Add kernel address sanitizer hooks to mark allocated page's addresses as
    accessible in corresponding shadow region. Mark freed pages as
    inaccessible.

    Signed-off-by: Andrey Ryabinin
    Cc: Dmitry Vyukov
    Cc: Konstantin Serebryany
    Cc: Dmitry Chernenkov
    Signed-off-by: Andrey Konovalov
    Cc: Yuri Gribov
    Cc: Konstantin Khlebnikov
    Cc: Sasha Levin
    Cc: Christoph Lameter
    Cc: Joonsoo Kim
    Cc: Dave Hansen
    Cc: Andi Kleen
    Cc: Ingo Molnar
    Cc: Thomas Gleixner
    Cc: "H. Peter Anvin"
    Cc: Christoph Lameter
    Cc: Pekka Enberg
    Cc: David Rientjes
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Andrey Ryabinin
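
    Conceptually the hooks bracket a page's life in the allocator: mark the
    range accessible when the page is handed out and inaccessible again
    when it is freed. The user-space model below uses a one-byte-per-page
    shadow; the hook names follow the upstream KASAN naming, everything
    else is invented.

        /* User-space model of KASAN-style shadow marking; not the kernel hooks. */
        #include <stdio.h>

        #define NR_PAGES 8
        enum shadow_state { INACCESSIBLE = 0, ACCESSIBLE = 1 };
        static unsigned char shadow[NR_PAGES];     /* one shadow byte per page */

        static void kasan_alloc_pages(int page, int order)
        {
            for (int i = 0; i < (1 << order); i++)
                shadow[page + i] = ACCESSIBLE;      /* allocation makes it valid */
        }

        static void kasan_free_pages(int page, int order)
        {
            for (int i = 0; i < (1 << order); i++)
                shadow[page + i] = INACCESSIBLE;    /* use-after-free now detectable */
        }

        static void check_access(int page)
        {
            if (shadow[page] != ACCESSIBLE)
                printf("KASAN-style report: access to freed page %d\n", page);
        }

        int main(void)
        {
            kasan_alloc_pages(0, 2);    /* order-2 allocation: pages 0..3 */
            check_access(1);            /* fine, no report */
            kasan_free_pages(0, 2);
            check_access(1);            /* reported */
            return 0;
        }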
     

13 Feb, 2015

2 commits

  • Add a necessary 'leave'.

    Signed-off-by: Yaowei Bai
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Yaowei Bai
     
  • Pulling the code protected by if (system_state == SYSTEM_BOOTING) into
    its own helper allows us to shrink .text a little. This relies on
    build_all_zonelists already having a __ref annotation. Add a comment
    explaining why so one doesn't have to track it down through git log.

    The real saving comes in 3/5, ("mm/mm_init.c: Mark mminit_verify_zonelist
    as __init"), where we save about 400 bytes

    Signed-off-by: Rasmus Villemoes
    Cc: Vlastimil Babka
    Cc: Rik van Riel
    Cc: Joonsoo Kim
    Cc: David Rientjes
    Cc: Vishnu Pratap Singh
    Cc: Pintu Kumar
    Cc: Michal Nazarewicz
    Cc: Mel Gorman
    Cc: Paul Gortmaker
    Cc: Peter Zijlstra
    Cc: Tim Chen
    Cc: Hugh Dickins
    Cc: Li Zefan
    Cc: Tejun Heo
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Rasmus Villemoes
     

12 Feb, 2015

12 commits

  • When allocation falls back to stealing free pages of another migratetype,
    it can decide to steal extra pages, or even the whole pageblock in order
    to reduce fragmentation, which could happen if further allocation
    fallbacks pick a different pageblock. In try_to_steal_freepages(), one of
    the situations where extra pages are stolen happens when we are trying to
    allocate a MIGRATE_RECLAIMABLE page.

    However, MIGRATE_UNMOVABLE allocations are not treated the same way,
    although spreading such allocation over multiple fallback pageblocks is
    arguably even worse than it is for RECLAIMABLE allocations. To minimize
    fragmentation, we should minimize the number of such fallbacks, and thus
    steal as much as is possible from each fallback pageblock.

    Note that in theory this might put more pressure on movable pageblocks and
    cause movable allocations to steal back from unmovable pageblocks.
    However, movable allocations are not as aggressive with stealing, and do
    not cause permanent fragmentation, so the tradeoff is reasonable, and
    evaluation seems to support the change.

    This patch thus adds a check for MIGRATE_UNMOVABLE to the decision to
    steal extra free pages. When evaluating with stress-highalloc from
    mmtests, this has reduced the number of MIGRATE_UNMOVABLE fallbacks to
    roughly 1/6. The number of these fallbacks stealing from MIGRATE_MOVABLE
    block is reduced to 1/3. There was no observation of growing number of
    unmovable pageblocks over time, and also not of increased movable
    allocation fallbacks.

    Signed-off-by: Vlastimil Babka
    Acked-by: Mel Gorman
    Cc: Zhang Yanfei
    Cc: Minchan Kim
    Cc: David Rientjes
    Cc: Rik van Riel
    Cc: "Aneesh Kumar K.V"
    Cc: "Kirill A. Shutemov"
    Cc: Johannes Weiner
    Cc: Joonsoo Kim
    Cc: Michal Hocko
    Cc: KOSAKI Motohiro
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Vlastimil Babka
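
    The decision being widened can be modelled as a small predicate: steal
    the whole pageblock when the stolen chunk is already large, or when the
    requesting migratetype is one that fragments permanently - previously
    only RECLAIMABLE, now UNMOVABLE as well. The threshold and names below
    are simplified; the real check lives in try_to_steal_freepages().

        /* Sketch of the "steal the whole pageblock?" predicate; simplified. */
        #include <stdbool.h>
        #include <stdio.h>

        enum mt { UNMOVABLE, RECLAIMABLE, MOVABLE };
        #define PAGEBLOCK_ORDER 10

        static bool steal_whole_pageblock(int fallback_order, enum mt start_type)
        {
            if (fallback_order >= PAGEBLOCK_ORDER / 2)
                return true;           /* already taking a big chunk         */
            if (start_type == RECLAIMABLE)
                return true;           /* old rule                           */
            if (start_type == UNMOVABLE)
                return true;           /* rule added by this patch           */
            return false;              /* movable: steal only what is needed */
        }

        int main(void)
        {
            printf("order-2 unmovable: %d\n", steal_whole_pageblock(2, UNMOVABLE));
            printf("order-2 movable:   %d\n", steal_whole_pageblock(2, MOVABLE));
            return 0;
        }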
     
  • When allocation falls back to another migratetype, it will steal a page
    with highest available order, and (depending on this order and desired
    migratetype), it might also steal the rest of free pages from the same
    pageblock.

    Given the preference of highest available order, it is likely that it will
    be higher than the desired order, and result in the stolen buddy page
    being split. The remaining pages after split are currently stolen only
    when the rest of the free pages are stolen. This can however lead to
    situations where for MOVABLE allocations we split e.g. order-4 fallback
    UNMOVABLE page, but steal only order-0 page. Then on the next MOVABLE
    allocation (which may be batched to fill the pcplists) we split another
    order-3 or higher page, etc. By stealing all pages that we have split, we
    can avoid further stealing.

    This patch therefore adjusts the page stealing so that buddy pages created
    by split are always stolen. This has effect only on MOVABLE allocations,
    as RECLAIMABLE and UNMOVABLE allocations already always do that in
    addition to stealing the rest of free pages from the pageblock. The
    change also allows to simplify try_to_steal_freepages() and factor out CMA
    handling.

    According to Mel, it has been intended since the beginning that buddy
    pages after split would be stolen always, but it doesn't seem like it was
    ever the case until commit 47118af076f6 ("mm: mmzone: MIGRATE_CMA
    migration type added"). The commit has unintentionally introduced this
    behavior, but was reverted by commit 0cbef29a7821 ("mm:
    __rmqueue_fallback() should respect pageblock type"). Neither included
    evaluation.

    My evaluation with stress-highalloc from mmtests shows about 2.5x
    reduction of page stealing events for MOVABLE allocations, without
    affecting the page stealing events for other allocation migratetypes.

    Signed-off-by: Vlastimil Babka
    Acked-by: Mel Gorman
    Cc: Zhang Yanfei
    Acked-by: Minchan Kim
    Cc: David Rientjes
    Cc: Rik van Riel
    Cc: "Aneesh Kumar K.V"
    Cc: "Kirill A. Shutemov"
    Cc: Johannes Weiner
    Cc: Joonsoo Kim
    Cc: Michal Hocko
    Cc: KOSAKI Motohiro
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Vlastimil Babka
     
  • When studying page stealing, I noticed some weird looking decisions in
    try_to_steal_freepages(). The first I assume is a bug (Patch 1), the
    following two patches were driven by evaluation.

    Testing was done with stress-highalloc of mmtests, using the
    mm_page_alloc_extfrag tracepoint and postprocessing to get counts of how
    often page stealing occurs for individual migratetypes, and what
    migratetypes are used for fallbacks. Arguably, the worst case of page
    stealing is when UNMOVABLE allocation steals from MOVABLE pageblock.
    RECLAIMABLE allocation stealing from MOVABLE allocation is also not ideal,
    so the goal is to minimize these two cases.

    The evaluation of v2 wasn't always a clear win, and Joonsoo questioned
    the results. Here I used a different baseline which includes the RFC
    compaction improvements from [1]. I found that the compaction
    improvements reduce the variability of stress-highalloc, so there's
    less noise in the data.

    First, let's look at stress-highalloc configured to do sync compaction,
    and how these patches reduce page stealing events during the test. The
    first column is after a fresh reboot, the other two are reiterations of
    the test without reboot. That was all accumulated over 5 re-iterations
    (so the benchmark was run 5x3 times with 5 fresh restarts).

    Baseline:

    3.19-rc4 3.19-rc4 3.19-rc4
    5-nothp-1 5-nothp-2 5-nothp-3
    Page alloc extfrag event 10264225 8702233 10244125
    Extfrag fragmenting 10263271 8701552 10243473
    Extfrag fragmenting for unmovable 13595 17616 15960
    Extfrag fragmenting unmovable placed with movable 7989 12193 8447
    Extfrag fragmenting for reclaimable 658 1840 1817
    Extfrag fragmenting reclaimable placed with movable 558 1677 1679
    Extfrag fragmenting for movable 10249018 8682096 10225696

    With Patch 1:
    3.19-rc4 3.19-rc4 3.19-rc4
    6-nothp-1 6-nothp-2 6-nothp-3
    Page alloc extfrag event 11834954 9877523 9774860
    Extfrag fragmenting 11833993 9876880 9774245
    Extfrag fragmenting for unmovable 7342 16129 11712
    Extfrag fragmenting unmovable placed with movable 4191 10547 6270
    Extfrag fragmenting for reclaimable 373 1130 923
    Extfrag fragmenting reclaimable placed with movable 302 906 738
    Extfrag fragmenting for movable 11826278 9859621 9761610

    With Patch 2:
    3.19-rc4 3.19-rc4 3.19-rc4
    7-nothp-1 7-nothp-2 7-nothp-3
    Page alloc extfrag event 4725990 3668793 3807436
    Extfrag fragmenting 4725104 3668252 3806898
    Extfrag fragmenting for unmovable 6678 7974 7281
    Extfrag fragmenting unmovable placed with movable 2051 3829 4017
    Extfrag fragmenting for reclaimable 429 1208 1278
    Extfrag fragmenting reclaimable placed with movable 369 976 1034
    Extfrag fragmenting for movable 4717997 3659070 3798339

    With Patch 3:
    3.19-rc4 3.19-rc4 3.19-rc4
    8-nothp-1 8-nothp-2 8-nothp-3
    Page alloc extfrag event 5016183 4700142 3850633
    Extfrag fragmenting 5015325 4699613 3850072
    Extfrag fragmenting for unmovable 1312 3154 3088
    Extfrag fragmenting unmovable placed with movable 1115 2777 2714
    Extfrag fragmenting for reclaimable 437 1193 1097
    Extfrag fragmenting reclaimable placed with movable 330 969 879
    Extfrag fragmenting for movable 5013576 4695266 3845887

    In v2 we saw an apparent regression with Patch 1 for unmovable events;
    this is now gone, suggesting it was indeed noise. Here, each patch
    improves the situation for unmovable events. Reclaimable is improved
    by Patch 1 and is then either the same modulo noise, or perhaps
    slightly worse - a small price for the unmovable improvements, IMHO.
    The number of movable allocations falling back to other migratetypes
    is the most noisy, but it is nevertheless halved by Patch 2. These are
    the least critical, as compaction can move them around.

    If we look at success rates, the patches don't affect them, that didn't change.

    Baseline:
    3.19-rc4 3.19-rc4 3.19-rc4
    5-nothp-1 5-nothp-2 5-nothp-3
    Success 1 Min 49.00 ( 0.00%) 42.00 ( 14.29%) 41.00 ( 16.33%)
    Success 1 Mean 51.00 ( 0.00%) 45.00 ( 11.76%) 42.60 ( 16.47%)
    Success 1 Max 55.00 ( 0.00%) 51.00 ( 7.27%) 46.00 ( 16.36%)
    Success 2 Min 53.00 ( 0.00%) 47.00 ( 11.32%) 44.00 ( 16.98%)
    Success 2 Mean 59.60 ( 0.00%) 50.80 ( 14.77%) 48.20 ( 19.13%)
    Success 2 Max 64.00 ( 0.00%) 56.00 ( 12.50%) 52.00 ( 18.75%)
    Success 3 Min 84.00 ( 0.00%) 82.00 ( 2.38%) 78.00 ( 7.14%)
    Success 3 Mean 85.60 ( 0.00%) 82.80 ( 3.27%) 79.40 ( 7.24%)
    Success 3 Max 86.00 ( 0.00%) 83.00 ( 3.49%) 80.00 ( 6.98%)

    Patch 1:
    3.19-rc4 3.19-rc4 3.19-rc4
    6-nothp-1 6-nothp-2 6-nothp-3
    Success 1 Min 49.00 ( 0.00%) 44.00 ( 10.20%) 44.00 ( 10.20%)
    Success 1 Mean 51.80 ( 0.00%) 46.00 ( 11.20%) 45.80 ( 11.58%)
    Success 1 Max 54.00 ( 0.00%) 49.00 ( 9.26%) 49.00 ( 9.26%)
    Success 2 Min 58.00 ( 0.00%) 49.00 ( 15.52%) 48.00 ( 17.24%)
    Success 2 Mean 60.40 ( 0.00%) 51.80 ( 14.24%) 50.80 ( 15.89%)
    Success 2 Max 63.00 ( 0.00%) 54.00 ( 14.29%) 55.00 ( 12.70%)
    Success 3 Min 84.00 ( 0.00%) 81.00 ( 3.57%) 79.00 ( 5.95%)
    Success 3 Mean 85.00 ( 0.00%) 81.60 ( 4.00%) 79.80 ( 6.12%)
    Success 3 Max 86.00 ( 0.00%) 82.00 ( 4.65%) 82.00 ( 4.65%)

    Patch 2:

    3.19-rc4 3.19-rc4 3.19-rc4
    7-nothp-1 7-nothp-2 7-nothp-3
    Success 1 Min 50.00 ( 0.00%) 44.00 ( 12.00%) 39.00 ( 22.00%)
    Success 1 Mean 52.80 ( 0.00%) 45.60 ( 13.64%) 42.40 ( 19.70%)
    Success 1 Max 55.00 ( 0.00%) 46.00 ( 16.36%) 47.00 ( 14.55%)
    Success 2 Min 52.00 ( 0.00%) 48.00 ( 7.69%) 45.00 ( 13.46%)
    Success 2 Mean 53.40 ( 0.00%) 49.80 ( 6.74%) 48.80 ( 8.61%)
    Success 2 Max 57.00 ( 0.00%) 52.00 ( 8.77%) 52.00 ( 8.77%)
    Success 3 Min 84.00 ( 0.00%) 81.00 ( 3.57%) 79.00 ( 5.95%)
    Success 3 Mean 85.00 ( 0.00%) 82.40 ( 3.06%) 79.60 ( 6.35%)
    Success 3 Max 86.00 ( 0.00%) 83.00 ( 3.49%) 80.00 ( 6.98%)

    Patch 3:
    3.19-rc4 3.19-rc4 3.19-rc4
    8-nothp-1 8-nothp-2 8-nothp-3
    Success 1 Min 46.00 ( 0.00%) 44.00 ( 4.35%) 42.00 ( 8.70%)
    Success 1 Mean 50.20 ( 0.00%) 45.60 ( 9.16%) 44.00 ( 12.35%)
    Success 1 Max 52.00 ( 0.00%) 47.00 ( 9.62%) 47.00 ( 9.62%)
    Success 2 Min 53.00 ( 0.00%) 49.00 ( 7.55%) 48.00 ( 9.43%)
    Success 2 Mean 55.80 ( 0.00%) 50.60 ( 9.32%) 49.00 ( 12.19%)
    Success 2 Max 59.00 ( 0.00%) 52.00 ( 11.86%) 51.00 ( 13.56%)
    Success 3 Min 84.00 ( 0.00%) 80.00 ( 4.76%) 79.00 ( 5.95%)
    Success 3 Mean 85.40 ( 0.00%) 81.60 ( 4.45%) 80.40 ( 5.85%)
    Success 3 Max 87.00 ( 0.00%) 83.00 ( 4.60%) 82.00 ( 5.75%)

    While there's no improvement here, I consider reduced fragmentation events
    to be worth on its own. Patch 2 also seems to reduce scanning for free
    pages, and migrations in compaction, suggesting it has somewhat less work
    to do:

    Patch 1:

    Compaction stalls 4153 3959 3978
    Compaction success 1523 1441 1446
    Compaction failures 2630 2517 2531
    Page migrate success 4600827 4943120 5104348
    Page migrate failure 19763 16656 17806
    Compaction pages isolated 9597640 10305617 10653541
    Compaction migrate scanned 77828948 86533283 87137064
    Compaction free scanned 517758295 521312840 521462251
    Compaction cost 5503 5932 6110

    Patch 2:

    Compaction stalls 3800 3450 3518
    Compaction success 1421 1316 1317
    Compaction failures 2379 2134 2201
    Page migrate success 4160421 4502708 4752148
    Page migrate failure 19705 14340 14911
    Compaction pages isolated 8731983 9382374 9910043
    Compaction migrate scanned 98362797 96349194 98609686
    Compaction free scanned 496512560 469502017 480442545
    Compaction cost 5173 5526 5811

    As with v2, /proc/pagetypeinfo appears unaffected with respect to numbers
    of unmovable and reclaimable pageblocks.

    Configuring the benchmark to allocate like a THP page fault (i.e. no
    sync compaction) gives much noisier results for iterations 2 and 3
    after reboot. This is not so surprising given that [1] offers smaller
    improvements in this scenario, due to fewer restarts after deferred
    compaction, which would change the compaction pivot.

    Baseline:
    3.19-rc4 3.19-rc4 3.19-rc4
    5-thp-1 5-thp-2 5-thp-3
    Page alloc extfrag event 8148965 6227815 6646741
    Extfrag fragmenting 8147872 6227130 6646117
    Extfrag fragmenting for unmovable 10324 12942 15975
    Extfrag fragmenting unmovable placed with movable 5972 8495 10907
    Extfrag fragmenting for reclaimable 601 1707 2210
    Extfrag fragmenting reclaimable placed with movable 520 1570 2000
    Extfrag fragmenting for movable 8136947 6212481 6627932

    Patch 1:
    3.19-rc4 3.19-rc4 3.19-rc4
    6-thp-1 6-thp-2 6-thp-3
    Page alloc extfrag event 8345457 7574471 7020419
    Extfrag fragmenting 8343546 7573777 7019718
    Extfrag fragmenting for unmovable 10256 18535 30716
    Extfrag fragmenting unmovable placed with movable 6893 11726 22181
    Extfrag fragmenting for reclaimable 465 1208 1023
    Extfrag fragmenting reclaimable placed with movable 353 996 843
    Extfrag fragmenting for movable 8332825 7554034 6987979

    Patch 2:
    3.19-rc4 3.19-rc4 3.19-rc4
    7-thp-1 7-thp-2 7-thp-3
    Page alloc extfrag event 3512847 3020756 2891625
    Extfrag fragmenting 3511940 3020185 2891059
    Extfrag fragmenting for unmovable 9017 6892 6191
    Extfrag fragmenting unmovable placed with movable 1524 3053 2435
    Extfrag fragmenting for reclaimable 445 1081 1160
    Extfrag fragmenting reclaimable placed with movable 375 918 986
    Extfrag fragmenting for movable 3502478 3012212 2883708

    Patch 3:
    3.19-rc4 3.19-rc4 3.19-rc4
    8-thp-1 8-thp-2 8-thp-3
    Page alloc extfrag event 3181699 3082881 2674164
    Extfrag fragmenting 3180812 3082303 2673611
    Extfrag fragmenting for unmovable 1201 4031 4040
    Extfrag fragmenting unmovable placed with movable 974 3611 3645
    Extfrag fragmenting for reclaimable 478 1165 1294
    Extfrag fragmenting reclaimable placed with movable 387 985 1030
    Extfrag fragmenting for movable 3179133 3077107 2668277

    The improvements for first iteration are clear, the rest is much noisier
    and can appear like regression for Patch 1. Anyway, patch 2 rectifies it.

    Allocation success rates are again unaffected so there's no point in
    making this e-mail any longer.

    [1] http://marc.info/?l=linux-mm&m=142166196321125&w=2

    This patch (of 3):

    When __rmqueue_fallback() is called to allocate a page of order X, it will
    find a page of order Y >= X of a fallback migratetype, which is different
    from the desired migratetype. With the help of try_to_steal_freepages(),
    it may change the migratetype (to the desired one) also of:

    1) all currently free pages in the pageblock containing the fallback page
    2) the fallback pageblock itself
    3) buddy pages created by splitting the fallback page (when Y > X)

    These decisions take the order Y into account, as well as the desired
    migratetype, with the goal of preventing multiple fallback allocations
    that could e.g. distribute UNMOVABLE allocations among multiple
    pageblocks.

    Originally, decision for 1) has implied the decision for 3). Commit
    47118af076f6 ("mm: mmzone: MIGRATE_CMA migration type added") changed that
    (probably unintentionally) so that the buddy pages in case 3) are always
    changed to the desired migratetype, except for CMA pageblocks.

    Commit fef903efcf0c ("mm/page_alloc.c: restructure free-page stealing code
    and fix a bug") did some refactoring and added a comment that the case of
    3) is intended. Commit 0cbef29a7821 ("mm: __rmqueue_fallback() should
    respect pageblock type") removed the comment and tried to restore the
    original behavior where 1) implies 3), but due to the previous
    refactoring, the result is instead that only 2) implies 3) - and the
    conditions for 2) are less frequently met than conditions for 1). This
    may increase fragmentation in situations where the code decides to steal
    all free pages from the pageblock (case 1)), but then gives back the buddy
    pages produced by splitting.

    This patch restores the original intended logic where 1) implies 3).
    During testing with stress-highalloc from mmtests, this has shown to
    decrease the number of events where UNMOVABLE and RECLAIMABLE allocations
    steal from MOVABLE pageblocks, which can lead to permanent fragmentation.
    In some cases it has increased the number of events when MOVABLE
    allocations steal from UNMOVABLE or RECLAIMABLE pageblocks, but these are
    fixable by sync compaction and thus less harmful.

    Note that evaluation has shown that the behavior introduced by
    47118af076f6 for buddy pages in case 3) is actually even better than the
    original logic, so the following patch will introduce it properly once
    again. For stable backports of this patch it makes thus sense to only fix
    versions containing 0cbef29a7821.

    [iamjoonsoo.kim@lge.com: tracepoint fix]
    Signed-off-by: Vlastimil Babka
    Acked-by: Mel Gorman
    Cc: Zhang Yanfei
    Acked-by: Minchan Kim
    Cc: David Rientjes
    Cc: Rik van Riel
    Cc: "Aneesh Kumar K.V"
    Cc: "Kirill A. Shutemov"
    Cc: Johannes Weiner
    Cc: Joonsoo Kim
    Cc: Michal Hocko
    Cc: KOSAKI Motohiro
    Cc: [3.13+ containing 0cbef29a7821]
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Vlastimil Babka
     
  • Commit 5695be142e20 ("OOM, PM: OOM killed task shouldn't escape PM
    suspend") left a race window in which the OOM killer manages to
    note_oom_kill() after freeze_processes() has checked the counter. The
    race window is quite small and really unlikely, and a partial solution
    was deemed sufficient at the time of submission.

    Tejun wasn't happy about this partial solution, though, and insisted
    on a full one. That requires full exclusion between the OOM killer and
    the freezer's task freezing. This is done by this patch, which
    introduces an oom_sem RW lock and turns oom_killer_disable() into a
    full OOM barrier.

    The oom_killer_disabled check is moved from the allocation path into
    the OOM code, and oom_sem is taken for reading around both the check
    and the whole OOM invocation.

    oom_killer_disable() takes oom_sem for writing, so it waits for all
    currently running OOM killer invocations. It then disables all further
    OOMs by setting oom_killer_disabled and checks for any OOM victims.
    Victims are counted via mark_tsk_oom_victim resp. unmark_oom_victim.
    The last victim wakes up all waiters enqueued by oom_killer_disable().
    This function therefore acts as the full OOM barrier.

    The page fault path is covered now as well, although it was assumed to
    be safe before. As per Tejun, "We used to have freezing points deep in
    file system code which may be reachable from page fault," so it is
    better and more robust not to rely on freezing points here. The same
    applies to the memcg OOM killer.

    out_of_memory tells the caller whether the OOM was allowed to trigger and
    the callers are supposed to handle the situation. The page allocation
    path simply fails the allocation same as before. The page fault path will
    retry the fault (more on that later) and Sysrq OOM trigger will simply
    complain to the log.

    Normally there wouldn't be any unfrozen user tasks after
    try_to_freeze_tasks so the function will not block. But if there was an
    OOM killer racing with try_to_freeze_tasks and the OOM victim didn't
    finish yet then we have to wait for it. This should complete in a finite
    time, though, because

    - the victim cannot loop in the page fault handler (it would die
    on the way out from the exception)
    - it cannot loop in the page allocator because all the further
    allocation would fail and __GFP_NOFAIL allocations are not
    acceptable at this stage
    - it shouldn't be blocked on any locks held by frozen tasks
    (try_to_freeze expects lockless context) and kernel threads and
    work queues are not frozen yet

    Signed-off-by: Michal Hocko
    Suggested-by: Tejun Heo
    Cc: David Rientjes
    Cc: Johannes Weiner
    Cc: Oleg Nesterov
    Cc: Cong Wang
    Cc: "Rafael J. Wysocki"
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Michal Hocko
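
    The locking scheme reads naturally as a reader/writer pattern: every
    OOM invocation takes oom_sem for reading and honours the disabled flag,
    while oom_killer_disable() takes it for writing, flips the flag and
    thereby waits out all in-flight invocations. Below is a small
    user-space model with pthreads; the waiting-for-victims part is
    omitted, and the function names simply mirror the description.

        /* User-space model of the oom_sem barrier described above.
         * Build with: cc model.c -lpthread */
        #include <pthread.h>
        #include <stdbool.h>
        #include <stdio.h>

        static pthread_rwlock_t oom_sem = PTHREAD_RWLOCK_INITIALIZER;
        static bool oom_killer_disabled;

        /* Every OOM attempt runs under the read side and honours the flag. */
        static bool out_of_memory(void)
        {
            bool ran = false;

            pthread_rwlock_rdlock(&oom_sem);
            if (!oom_killer_disabled) {
                printf("invoking OOM killer\n");
                ran = true;
            }
            pthread_rwlock_unlock(&oom_sem);
            return ran;
        }

        /* Acts as a barrier: taking the write lock waits for all running
         * readers, and afterwards no further OOM killing can start. */
        static void oom_killer_disable(void)
        {
            pthread_rwlock_wrlock(&oom_sem);
            oom_killer_disabled = true;
            pthread_rwlock_unlock(&oom_sem);
            /* The real code additionally waits for already-selected victims. */
        }

        int main(void)
        {
            printf("before disable: %d\n", out_of_memory());
            oom_killer_disable();
            printf("after disable:  %d\n", out_of_memory());
            return 0;
        }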
     
  • Especially on 32 bit kernels memory node ranges are printed with 32 bit
    wide addresses only. Use u64 types and %llx specifiers to print full
    width of addresses.

    Signed-off-by: Juergen Gross
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Juergen Gross
     
  • Although it was not called, destroy_compound_page() did some potentially
    useful checks. Let's re-introduce them in free_pages_prepare(), where
    they can be actually triggered when CONFIG_DEBUG_VM=y.

    compound_order() assert is already in free_pages_prepare(). We have few
    checks for tail pages left.

    Signed-off-by: Kirill A. Shutemov
    Cc: Vlastimil Babka
    Cc: Andrea Arcangeli
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Kirill A. Shutemov
     
  • The only caller is __free_one_page(). By that point page->flags should
    already have been cleared:

    - for 0-order pages, through the PCP list:
        free_hot_cold_page()
            free_pages_prepare()
                free_pages_check()
                    page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;

        free_pcppages_bulk()
            page =
            __free_one_page(page)

    - for non-0-order pages:
        __free_pages_ok()
            free_pages_prepare()
                free_pages_check()
                    page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
            free_one_page()
                __free_one_page()

    So there's no way PageCompound() will return true in __free_one_page().
    Let's remove dead destroy_compound_page() and put assert for page->flags
    there instead.

    Signed-off-by: Kirill A. Shutemov
    Cc: Vlastimil Babka
    Cc: Andrea Arcangeli
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Kirill A. Shutemov
     
  • Expand the usage of the struct alloc_context introduced in the previous
    patch also for calling try_to_compact_pages(), to reduce the number of its
    parameters. Since the function is in different compilation unit, we need
    to move alloc_context definition in the shared mm/internal.h header.

    With this change we get simpler code and small savings of code size and stack
    usage:

    add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-27 (-27)
    function old new delta
    __alloc_pages_direct_compact 283 256 -27
    add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-13 (-13)
    function old new delta
    try_to_compact_pages 582 569 -13

    Stack usage of __alloc_pages_direct_compact goes from 24 to none (per
    scripts/checkstack.pl).

    Signed-off-by: Vlastimil Babka
    Acked-by: Michal Hocko
    Cc: Mel Gorman
    Cc: Zhang Yanfei
    Cc: Minchan Kim
    Cc: David Rientjes
    Cc: Rik van Riel
    Cc: "Aneesh Kumar K.V"
    Cc: "Kirill A. Shutemov"
    Cc: Johannes Weiner
    Cc: Joonsoo Kim
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Vlastimil Babka
     
  • Introduce struct alloc_context to accumulate the numerous parameters
    passed between the alloc_pages* family of functions and
    get_page_from_freelist(). This excludes gfp_flags and alloc_flags,
    which mutate too much along the way, and the allocation order, which is
    conceptually different.

    The result is shorter function signatures, as well as overall code size
    and stack usage reductions.

    bloat-o-meter:

    add/remove: 0/0 grow/shrink: 1/2 up/down: 127/-310 (-183)
    function old new delta
    get_page_from_freelist 2525 2652 +127
    __alloc_pages_direct_compact 329 283 -46
    __alloc_pages_nodemask 2564 2300 -264

    checkstack.pl:

    function old new
    __alloc_pages_nodemask 248 200
    get_page_from_freelist 168 184
    __alloc_pages_direct_compact 40 24

    Signed-off-by: Vlastimil Babka
    Acked-by: Michal Hocko
    Cc: Mel Gorman
    Cc: Zhang Yanfei
    Cc: Minchan Kim
    Cc: David Rientjes
    Cc: Rik van Riel
    Cc: "Aneesh Kumar K.V"
    Cc: "Kirill A. Shutemov"
    Cc: Johannes Weiner
    Cc: Joonsoo Kim
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Vlastimil Babka
     
  • The possibility of replacing the numerous parameters of alloc_pages*
    functions with a single structure has been discussed when Minchan proposed
    to expand the x86 kernel stack [1]. This series implements the change,
    along with few more cleanups/microoptimizations.

    The series is based on next-20150108 and I used gcc 4.8.3 20140627 on
    openSUSE 13.2 for compiling. The config includes NUMA and COMPACTION.

    The core change is the introduction of a new struct alloc_context, which looks
    like this:

    struct alloc_context {
        struct zonelist *zonelist;
        nodemask_t *nodemask;
        struct zone *preferred_zone;
        int classzone_idx;
        int migratetype;
        enum zone_type high_zoneidx;
    };

    All of the contents are mostly constant, except that
    __alloc_pages_slowpath() changes preferred_zone, classzone_idx and
    potentially zonelist. But that's not a problem if control returns to
    retry_cpuset: in __alloc_pages_nodemask(), those will be reset to their
    initial values again (although it's a bit subtle). On the other hand,
    gfp_flags and alloc_flags mutate so much that it doesn't make sense to
    put them into alloc_context. Still, the result is one parameter instead
    of up to 7. This is all in Patch 2.

    Patch 3 is a step to expand alloc_context usage out of page_alloc.c
    itself. The function try_to_compact_pages() can also much benefit from
    the parameter reduction, but it means the struct definition has to be
    moved to a shared header.

    Patch 1 should IMHO be included even if the rest is deemed not useful
    enough. It improves maintainability and also has some code/stack
    reduction. Patch 4 is OTOH a tiny optimization.

    Overall bloat-o-meter results:

    add/remove: 0/0 grow/shrink: 0/4 up/down: 0/-460 (-460)
    function old new delta
    nr_free_zone_pages 129 115 -14
    __alloc_pages_direct_compact 329 256 -73
    get_page_from_freelist 2670 2576 -94
    __alloc_pages_nodemask 2564 2285 -279
    try_to_compact_pages 582 579 -3

    Overall stack sizes per ./scripts/checkstack.pl:

    old new delta
    get_page_from_freelist: 184 184 0
    __alloc_pages_nodemask 248 200 -48
    __alloc_pages_direct_c 40 - -40
    try_to_compact_pages 72 72 0
    -88

    [1] http://marc.info/?l=linux-mm&m=140142462528257&w=2

    This patch (of 4):

    prep_new_page() sets almost everything in the struct page of the page
    being allocated, except page->pfmemalloc. This is not obvious and has at
    least once led to a bug where page->pfmemalloc was forgotten to be set
    correctly, see commit 8fb74b9fb2b1 ("mm: compaction: partially revert
    capture of suitable high-order page").

    This patch moves the pfmemalloc setting to prep_new_page(), which means it
    needs to gain alloc_flags parameter. The call to prep_new_page is moved
    from buffered_rmqueue() to get_page_from_freelist(), which also leads to
    simpler code. An obsolete comment for buffered_rmqueue() is replaced.

    In addition to better maintainability there is a small reduction of code
    and stack usage for get_page_from_freelist(), which inlines the other
    functions involved.

    add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-145 (-145)
    function old new delta
    get_page_from_freelist 2670 2525 -145

    Stack usage is reduced from 184 to 168 bytes.

    Signed-off-by: Vlastimil Babka
    Acked-by: Michal Hocko
    Cc: Mel Gorman
    Cc: Zhang Yanfei
    Cc: Minchan Kim
    Cc: David Rientjes
    Cc: Rik van Riel
    Cc: "Aneesh Kumar K.V"
    Cc: "Kirill A. Shutemov"
    Cc: Johannes Weiner
    Cc: Joonsoo Kim
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Vlastimil Babka
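
    The core refactoring pattern is simply bundling the near-constant
    parameters into one context struct and passing a pointer down the call
    chain. The toy below copies the field list quoted above; the "before"
    signature is paraphrased from the description and the function body is
    a placeholder.

        /* Toy illustration of the parameter-bundling refactor; not kernel code. */
        #include <stdio.h>

        struct alloc_context {
            const char *zonelist;          /* stand-ins for the real pointer types */
            const char *nodemask;
            const char *preferred_zone;
            int classzone_idx;
            int migratetype;
            int high_zoneidx;
        };

        /* Before (paraphrased): every level repeated the same long list, e.g.
         *   get_page_from_freelist(gfp, nodemask, order, zonelist, high_zoneidx,
         *                          alloc_flags, preferred_zone, classzone_idx,
         *                          migratetype);
         */

        /* After: the values that mutate (gfp, order, alloc_flags) stay as
         * arguments, the rest travels in one pointer. */
        static void get_page_from_freelist(unsigned int gfp, unsigned int order,
                                           unsigned int alloc_flags,
                                           const struct alloc_context *ac)
        {
            printf("order %u, gfp %#x, flags %#x, migratetype %d in zone %s\n",
                   order, gfp, alloc_flags, ac->migratetype, ac->preferred_zone);
        }

        int main(void)
        {
            struct alloc_context ac = {
                .zonelist = "node0", .nodemask = "all",
                .preferred_zone = "Normal",
                .classzone_idx = 2, .migratetype = 2, .high_zoneidx = 2,
            };
            get_page_from_freelist(0x10u, 0, 0x40u, &ac);
            return 0;
        }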
     
  • Now kmemcheck_pagealloc_alloc() is only called by __alloc_pages_slowpath():

        __alloc_pages_nodemask()
            __alloc_pages_slowpath()
                kmemcheck_pagealloc_alloc()

    And the page will not be tracked by kmemcheck in the following path:

        __alloc_pages_nodemask()
            get_page_from_freelist()

    So move kmemcheck_pagealloc_alloc() into __alloc_pages_nodemask(),
    like this:

        __alloc_pages_nodemask()
            ...
            get_page_from_freelist()
            if (!page)
                __alloc_pages_slowpath()
            kmemcheck_pagealloc_alloc()
            ...

    Signed-off-by: Xishi Qiu
    Cc: Vegard Nossum
    Cc: Pekka Enberg
    Cc: Li Zefan
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Xishi Qiu
     
  • __alloc_pages_nodemask() strips __GFP_IO when retrying the page
    allocation. But it does this by altering the function-wide variable
    gfp_mask. This will cause subsequent allocation attempts to inadvertently
    use the modified gfp_mask.

    Also, pass the correct mask (the mask we actually used) into
    trace_mm_page_alloc().

    Cc: Ming Lei
    Cc: Mel Gorman
    Cc: Johannes Weiner
    Reviewed-by: Yasuaki Ishimatsu
    Cc: David Rientjes
    Acked-by: Vlastimil Babka
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Andrew Morton
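
    The fix pattern is to derive a per-attempt mask into a local variable
    (called alloc_mask here) instead of overwriting the function-wide
    gfp_mask, and to hand that same local to the tracepoint. Everything in
    this sketch is illustrative; the flag value and helper bodies are
    invented.

        /* Sketch: keep the caller's gfp_mask intact across retries. */
        #include <stdio.h>

        #define __GFP_IO 0x2u                     /* invented value for the demo */

        static void *try_alloc(unsigned int mask) { (void)mask; return NULL; }
        static void trace_mm_page_alloc(unsigned int mask)
        {
            printf("trace gfp=%#x\n", mask);
        }

        static void *alloc_pages_model(unsigned int gfp_mask)
        {
            unsigned int alloc_mask;              /* local, per-attempt mask */
            void *page;

            /* Fast path uses the caller's mask as-is. */
            alloc_mask = gfp_mask;
            page = try_alloc(alloc_mask);

            if (!page) {
                /* The retry path may strip __GFP_IO, but only in the local
                 * copy, so the caller's mask is never silently modified. */
                alloc_mask = gfp_mask & ~__GFP_IO;
                page = try_alloc(alloc_mask);
            }

            /* Report the mask that was actually used for the last attempt. */
            trace_mm_page_alloc(alloc_mask);
            return page;
        }

        int main(void)
        {
            alloc_pages_model(0x7u);
            return 0;
        }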
     

11 Feb, 2015

1 commit

  • If the freeing page and its buddy page are not in the same zone, the
    zone->lock currently held for the freeing page cannot prevent the buddy
    page from being allocated, so there is a very tiny chance of triggering
    the VM_BUG_ON_PAGE in page_is_buddy(), such as:

        cpu 0:                                cpu 1:
        hold zone_1 lock
        check page and its buddy
        PageBuddy(buddy) is true              hold zone_2 lock
        page_order(buddy) == order is true    alloc buddy
        trigger VM_BUG_ON_PAGE(page_count(buddy) != 0)

    zone_1->lock prevents the freeing page from being allocated and
    zone_2->lock prevents the buddy page from being allocated, but they are
    not the same zone->lock.

    If we can't remove the zone_id check statement, it's better to handle
    this rare race. This patch does so by placing the zone_id check before
    the VM_BUG_ON_PAGE check.

    Signed-off-by: Weijie Yang
    Acked-by: Mel Gorman
    Cc: Johannes Weiner
    Cc: Rik van Riel
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Weijie Yang
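
    The fix amounts to reordering the checks so the cross-zone case bails
    out before the page-count assertion can fire. A compact model, with the
    zone ids and the assertion simulated (the real check is in
    page_is_buddy()):

        /* Model of moving the zone check ahead of the buddy assertion. */
        #include <assert.h>
        #include <stdbool.h>
        #include <stdio.h>

        struct fake_page { int zone_id; int order; int count; bool buddy; };

        static bool page_is_buddy(const struct fake_page *page,
                                  const struct fake_page *buddy, int order)
        {
            if (!buddy->buddy || buddy->order != order)
                return false;

            /* Check this first: a buddy in another zone is protected by a
             * different zone->lock, so its state can change under us. */
            if (page->zone_id != buddy->zone_id)
                return false;

            /* Only now is it safe to insist that the buddy is really free. */
            assert(buddy->count == 0);
            return true;
        }

        int main(void)
        {
            struct fake_page page  = { .zone_id = 1, .order = 3, .count = 0, .buddy = false };
            /* The buddy sits in zone 2 and was just allocated there. */
            struct fake_page buddy = { .zone_id = 2, .order = 3, .count = 1, .buddy = true };

            /* With the old ordering this could have tripped the assertion. */
            printf("coalesce? %d\n", page_is_buddy(&page, &buddy, 3));
            return 0;
        }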
     

27 Jan, 2015

1 commit

  • The OOM killing invocation does a lot of duplicative checks against the
    task's allocation context. Rework it to take advantage of the existing
    checks in the allocator slowpath.

    The OOM killer is invoked when the allocator is unable to reclaim any
    pages but the allocation has to keep looping. Instead of having a check
    for __GFP_NORETRY hidden in oom_gfp_allowed(), just move the OOM
    invocation to the true branch of should_alloc_retry(). The __GFP_FS
    check from oom_gfp_allowed() can then be moved into the OOM avoidance
    branch in __alloc_pages_may_oom(), along with the PF_DUMPCORE test.

    __alloc_pages_may_oom() can then signal to the caller whether the OOM
    killer was invoked, instead of requiring it to duplicate the order and
    high_zoneidx checks to guess this when deciding whether to continue.

    Signed-off-by: Johannes Weiner
    Acked-by: Michal Hocko
    Cc: David Rientjes
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Johannes Weiner
     

19 Dec, 2014

1 commit

  • When the system boots up, in the dmesg logs we can see the memory
    statistics along with total reserved as below. Memory: 458840k/458840k
    available, 65448k reserved, 0K highmem

    When CMA is enabled, still the total reserved memory remains the same.
    However, the CMA memory is not considered as reserved. But, when we see
    /proc/meminfo, the CMA memory is part of free memory. This creates
    confusion. This patch corrects the problem by properly subtracting the
    CMA reserved memory from the total reserved memory in dmesg logs.

    Below is the dmesg snapshot from an arm based device with 512MB RAM and
    12MB single CMA region.

    Before this change:
    Memory: 458840k/458840k available, 65448k reserved, 0K highmem

    After this change:
    Memory: 458840k/458840k available, 53160k reserved, 12288k cma-reserved, 0K highmem

    Signed-off-by: Pintu Kumar
    Signed-off-by: Vishnu Pratap Singh
    Acked-by: Michal Nazarewicz
    Cc: Rafael Aquini
    Cc: Jerome Marchand
    Cc: Marek Szyprowski
    Cc: Joonsoo Kim
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Pintu Kumar
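
    The accounting change amounts to subtracting the CMA pages from the
    "reserved" figure and reporting them separately, which reproduces the
    numbers quoted above:

        /* Reproduces the dmesg numbers quoted above (all values in KiB). */
        #include <stdio.h>

        int main(void)
        {
            unsigned long reserved_old = 65448;   /* before the patch  */
            unsigned long cma_reserved = 12288;   /* 12 MiB CMA region */
            unsigned long reserved_new = reserved_old - cma_reserved;

            printf("reserved: %luk, cma-reserved: %luk\n",
                   reserved_new, cma_reserved);   /* 53160k, 12288k */
            return 0;
        }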
     

14 Dec, 2014

7 commits

  • Since 01cefaef40c4 ("mm: provide more accurate estimation of pages
    occupied by memmap"), the pages for the highmem zones' memmap are
    allocated from lowmem, so there is no need to reserve memmap space for
    highmem.

    A 2G DDR3 for the arm platform:
    On node 0 totalpages: 524288
    free_area_init_node: node 0, pgdat 80ccd380, node_mem_map 80d38000
    DMA zone: 3568 pages used for memmap
    DMA zone: 0 pages reserved
    DMA zone: 456704 pages, LIFO batch:31
    HighMem zone: 528 pages used for memmap
    HighMem zone: 67584 pages, LIFO batch:15

    After this patch:
    On node 0 totalpages: 524288
    free_area_init_node: node 0, pgdat 80cd6f40, node_mem_map 80d42000
    DMA zone: 3568 pages used for memmap
    DMA zone: 0 pages reserved
    DMA zone: 456704 pages, LIFO batch:31
    HighMem zone: 67584 pages, LIFO batch:15
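
    A minimal sketch of the zone-setup change this describes, assuming the
    accounting sits in free_area_init_core(); only the relevant branch is
    shown, the rest is elided, and this is not the exact patch:

        /* Sketch: only charge memmap pages against lowmem zones. */
        memmap_pages = calc_memmap_size(size, realsize);
        if (!is_highmem_idx(j)) {
            if (freesize >= memmap_pages) {
                freesize -= memmap_pages;
                if (memmap_pages)
                    printk(KERN_DEBUG "  %s zone: %lu pages used for memmap\n",
                           zone_names[j], memmap_pages);
            }
        }
        /* highmem zones: the memmap lives in lowmem, so nothing to subtract
         * and nothing to print, as the "after" log above shows */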

    Signed-off-by: Hongbo Zhong
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Zhong Hongbo
     
  • The slab shrinkers are currently invoked from the zonelist walkers in
    kswapd, direct reclaim, and zone reclaim, all of which roughly gauge the
    eligible LRU pages and assemble a nodemask to pass to NUMA-aware
    shrinkers, which then again have to walk over the nodemask. This is
    redundant code, extra runtime work, and fairly inaccurate when it comes to
    the estimation of actually scannable LRU pages. The code duplication will
    only get worse when making the shrinkers cgroup-aware and requiring them
    to have out-of-band cgroup hierarchy walks as well.

    Instead, invoke the shrinkers from shrink_zone(), which is where all
    reclaimers end up, to avoid this duplication.

    Take the count for eligible LRU pages out of get_scan_count(), which
    considers many more factors than just the availability of swap space, like
    zone_reclaimable_pages() currently does. Accumulate the number over all
    visited lruvecs to get the per-zone value.

    Some nodes have multiple zones due to memory addressing restrictions. To
    avoid putting too much pressure on the shrinkers, only invoke them once
    for each such node, using the class zone of the allocation as the pivot
    zone.

    For now, this integrates the slab shrinking better into the reclaim logic
    and gets rid of duplicative invocations from kswapd, direct reclaim, and
    zone reclaim. It also prepares for cgroup-awareness, allowing
    memcg-capable shrinkers to be added at the lruvec level without much
    duplication of both code and runtime work.

    This changes kswapd behavior, which used to invoke the shrinkers for each
    zone, but with scan ratios gathered from the entire node, resulting in
    meaningless pressure quantities on multi-zone nodes.

    Zone reclaim behavior also changes. It used to shrink slabs until the
    same amount of pages were shrunk as were reclaimed from the LRUs. Now it
    merely invokes the shrinkers once with the zone's scan ratio, which makes
    the shrinkers go easier on caches that implement aging and would prefer
    feeding back pressure from recently used slab objects to unused LRU pages.
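
    A rough sketch of what "invoke the shrinkers from shrink_zone(), once
    per node, pivoting on the class zone" amounts to; the bookkeeping is
    elided, the shrink_slab() argument list is abbreviated, and the gate
    condition is paraphrased rather than quoted from the patch:

        /* Sketch: shrink_zone() drives the slab shrinkers itself. */
        static void shrink_zone(struct zone *zone, struct scan_control *sc,
                                bool is_classzone)
        {
            unsigned long zone_lru_pages = 0;

            /* ... walk this zone's lruvecs, reclaim pages and accumulate
             *     the number of eligible LRU pages in zone_lru_pages ... */

            /*
             * Invoke the shrinkers only once per node, pivoting on the
             * class zone of the allocation, so multi-zone nodes do not see
             * multiplied pressure.  (Argument list abbreviated.)
             */
            if (global_reclaim(sc) && is_classzone)
                shrink_slab(sc->gfp_mask, zone_to_nid(zone),
                            sc->nr_scanned, zone_lru_pages);
        }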

    [vdavydov@parallels.com: assure class zone is populated]
    Signed-off-by: Johannes Weiner
    Cc: Dave Chinner
    Signed-off-by: Vladimir Davydov
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Johannes Weiner
     
  • This is the page owner tracking code, which was introduced quite a while
    ago. It has been sitting in Andrew's tree, but nobody tried to upstream
    it, so it has remained as is. Our company uses this feature actively to
    debug memory leaks and to find memory hoggers, so I decided to upstream
    it.

    This functionality helps us to know who allocated a page. When a page is
    allocated, we store some information about the allocation in extra
    memory. Later, if we need to know the status of all pages, we can
    retrieve and analyze it from this stored information.

    In the previous version of this feature, the extra memory was statically
    defined in struct page, but in this version it is allocated outside of
    struct page. This enables us to turn the feature on or off at boot time
    without considerable memory waste.
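
    As a rough illustration of the kind of per-page record kept in that
    extra memory; the field layout below is an assumption for the sketch,
    not taken from the patch:

        /* Hypothetical layout of the per-page record kept in the extension. */
        struct page_owner_info {
            unsigned int order;             /* allocation order */
            gfp_t gfp_mask;                 /* allocation flags */
            unsigned int nr_entries;        /* depth of the saved call stack */
            unsigned long trace_entries[8]; /* callers at allocation time */
        };

        /* Filled in by the page allocator when a page is handed out and
         * dumped later (e.g. through a debugfs file) for analysis. */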

    Although we already have tracepoints for page allocation and freeing,
    using them to analyze page ownership is rather complex. We would need to
    enlarge the trace buffer to prevent it from being overwritten before the
    userspace program launches, and that program would then have to
    continually dump the trace buffer for later analysis. This changes
    system behaviour much more than simply keeping the information in
    memory, which is bad for debugging.

    Moreover, the page_owner feature can be used for various further
    purposes. For example, it is used for the fragmentation statistics
    implemented in this patch, and I also plan to implement a CMA failure
    debugging feature on top of this interface.

    I'd like to give credit to all the developers who contributed to this
    feature, but that isn't easy because I don't know the exact history.
    Sorry about that. Below are the people who have a "Signed-off-by" in the
    patches in Andrew's tree.

    Contributor:
    Alexander Nyberg
    Mel Gorman
    Dave Hansen
    Minchan Kim
    Michal Nazarewicz
    Andrew Morton
    Jungsoo Son

    Signed-off-by: Joonsoo Kim
    Cc: Mel Gorman
    Cc: Johannes Weiner
    Cc: Minchan Kim
    Cc: Dave Hansen
    Cc: Michal Nazarewicz
    Cc: Jungsoo Son
    Cc: Ingo Molnar
    Cc: Joonsoo Kim
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Joonsoo Kim
     
  • Now we have prepared to avoid using debug-pagealloc at boot time. So
    introduce a new kernel parameter to disable debug-pagealloc at boot
    time, and make the related functions do nothing in that case.

    The only non-intuitive part is the change to the guard page functions.
    Because guard pages are only effective when debug-pagealloc is enabled,
    turning them off along with debug-pagealloc is the reasonable thing to
    do.
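
    A minimal sketch of such a boot-time switch, assuming an early_param()
    gating the feature behind a global flag; the parameter name, default and
    accessor here are assumptions, not quoted from the patch:

        static bool _debug_pagealloc_enabled __read_mostly;

        bool debug_pagealloc_enabled(void)
        {
            return _debug_pagealloc_enabled;
        }

        static int __init early_debug_pagealloc(char *buf)
        {
            /* "debug_pagealloc=on" turns the checks on; leaving the
             * parameter out keeps a CONFIG_DEBUG_PAGEALLOC kernel free of
             * the overhead. */
            if (buf && !strcmp(buf, "on"))
                _debug_pagealloc_enabled = true;
            return 0;
        }
        early_param("debug_pagealloc", early_debug_pagealloc);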

    Signed-off-by: Joonsoo Kim
    Cc: Mel Gorman
    Cc: Johannes Weiner
    Cc: Minchan Kim
    Cc: Dave Hansen
    Cc: Michal Nazarewicz
    Cc: Jungsoo Son
    Cc: Ingo Molnar
    Cc: Joonsoo Kim
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Joonsoo Kim
     
  • Until now, debug-pagealloc has needed extra flags in struct page, so the
    whole kernel had to be recompiled whenever we decided to use it. This is
    really painful, because recompiling takes time and sometimes a rebuild
    is not possible at all because of third-party modules that depend on
    struct page. So we can't use this good feature in many cases.

    Now we have the page extension feature, which allows us to keep extra
    flags outside of struct page. This gets rid of the third-party module
    issue mentioned above, and it also allows us to determine at boot time
    whether we need the extra memory for the page extension. With these
    properties, a kernel built with CONFIG_DEBUG_PAGEALLOC can avoid using
    debug-pagealloc at boot time with low computational overhead. This will
    help our development process greatly.

    This patch is the preparation step to achieve the above goal.
    debug-pagealloc originally used an extra field of struct page, but after
    this patch it will use a field of struct page_ext. Because the memory
    for page_ext is allocated later than the page allocator is initialized
    with CONFIG_SPARSEMEM, the debug-pagealloc feature has to stay disabled
    temporarily until page_ext has been initialized. This patch implements
    that.
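
    A rough sketch of what testing the state through page_ext instead of a
    struct page flag looks like; the helper and flag names here are
    illustrative assumptions, and the early-boot behaviour shown is what the
    sketch does, not a quote of the patch:

        /* Sketch: the debug state is looked up in page_ext, not struct page. */
        static inline bool page_debug_flag_set(struct page *page)
        {
            struct page_ext *page_ext = lookup_page_ext(page);

            /* before page_ext is initialised the lookup yields nothing,
             * so the feature simply stays off until then */
            if (!page_ext)
                return false;

            /* PAGE_EXT_DEBUG_FLAG is an illustrative name, not the real one */
            return test_bit(PAGE_EXT_DEBUG_FLAG, &page_ext->flags);
        }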

    Signed-off-by: Joonsoo Kim
    Cc: Mel Gorman
    Cc: Johannes Weiner
    Cc: Minchan Kim
    Cc: Dave Hansen
    Cc: Michal Nazarewicz
    Cc: Jungsoo Son
    Cc: Ingo Molnar
    Cc: Joonsoo Kim
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Joonsoo Kim
     
  • When we debug something, we'd like to attach some information to every
    page. For this purpose, we sometimes modify struct page itself, but this
    has drawbacks. First, it requires a recompile, which makes us hesitate
    to use such a powerful debug feature, so the development process is
    slowed down. Second, it is sometimes impossible to rebuild the kernel
    because of third-party module dependencies. Third, system behaviour can
    change significantly after the recompile, because it changes the size of
    struct page greatly and this structure is accessed by every part of the
    kernel. Keeping struct page as it is makes it much easier to reproduce
    an erroneous situation.

    This feature is intended to overcome the problems mentioned above. It
    allocates memory for extended data per page in a separate place rather
    than in struct page itself, and this memory can be accessed through the
    accessor functions provided by this code. During the boot process, it
    checks whether the allocation of this huge chunk of memory is needed or
    not; if not, it avoids allocating the memory at all. With this
    advantage, we can include the feature in the kernel by default and so
    avoid the rebuild and the related problems.

    Until now, memcg used this technique. But memcg has now decided to embed
    its variable in struct page itself, and its code to extend struct page
    has been removed. I'd like to use this code to develop debug features,
    so this patch resurrects it.

    To help these things work well, this patch introduces two callbacks for
    clients. One is the need callback, which is mandatory if the user wants
    to avoid useless memory allocation at boot time. The other, the init
    callback, is optional and is used to do proper initialization after the
    memory is allocated. A detailed explanation of the purpose of these
    functions is in the code comments; please refer to them.

    Everything else is the same as the previous extension code in memcg.
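
    A hedged sketch of the client interface those two callbacks imply; the
    operations struct mirrors the description above, while the example
    client and its names are hypothetical:

        /* The two callbacks a page-extension client registers. */
        struct page_ext_operations {
            bool (*need)(void);   /* mandatory: is the extra memory needed? */
            void (*init)(void);   /* optional: set up once memory exists */
        };

        /* A hypothetical client: */
        static bool my_debug_requested;     /* e.g. set from a boot parameter */

        static bool my_debug_need(void)
        {
            return my_debug_requested;      /* skip the allocation if unused */
        }

        static void my_debug_init(void)
        {
            /* initialise per-page state now that page_ext is allocated */
        }

        static struct page_ext_operations my_debug_page_ext_ops = {
            .need = my_debug_need,
            .init = my_debug_init,
        };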

    Signed-off-by: Joonsoo Kim
    Cc: Mel Gorman
    Cc: Johannes Weiner
    Cc: Minchan Kim
    Cc: Dave Hansen
    Cc: Michal Nazarewicz
    Cc: Jungsoo Son
    Cc: Ingo Molnar
    Cc: Joonsoo Kim
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Joonsoo Kim
     
  • Page guards are used by the debug-pagealloc feature. Currently the
    handling is open-coded, but I think abstracting it makes the core page
    allocator code more readable.

    There is no functional difference.
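
    A minimal sketch of the abstraction this describes: small helpers that
    hide the open-coded guard-page handling from the buddy allocator. The
    helper names follow the description; the bodies are placeholders, not
    the patch itself:

        /* Sketch: helpers hiding the guard-page bookkeeping. */
        static inline void set_page_guard(struct zone *zone, struct page *page,
                                          unsigned int order, int migratetype)
        {
            /* mark the page as a guard page and take it out of the free
             * counters for its order (details elided) */
        }

        static inline void clear_page_guard(struct zone *zone, struct page *page,
                                            unsigned int order, int migratetype)
        {
            /* undo the above when the guard page is merged back */
        }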

    Signed-off-by: Joonsoo Kim
    Acked-by: Vlastimil Babka
    Cc: Gioh Kim
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Joonsoo Kim
     

12 Dec, 2014

1 commit

  • Pull cgroup update from Tejun Heo:
    "cpuset got simplified a bit. cgroup core got a fix on unified
    hierarchy and grew some effective css related interfaces which will be
    used for blkio support for writeback IO traffic which is currently
    being worked on"

    * 'for-3.19' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
    cgroup: implement cgroup_get_e_css()
    cgroup: add cgroup_subsys->css_e_css_changed()
    cgroup: add cgroup_subsys->css_released()
    cgroup: fix the async css offline wait logic in cgroup_subtree_control_write()
    cgroup: restructure child_subsys_mask handling in cgroup_subtree_control_write()
    cgroup: separate out cgroup_calc_child_subsys_mask() from cgroup_refresh_child_subsys_mask()
    cpuset: lock vs unlock typo
    cpuset: simplify cpuset_node_allowed API
    cpuset: convert callback_mutex to a spinlock

    Linus Torvalds