19 Jan, 2016

1 commit

  • Conflicts:
    arch/arm/boot/dts/Makefile
    arch/arm/boot/dts/imx6qdl-sabreauto.dtsi
    arch/arm/boot/dts/imx6qdl-sabresd.dtsi
    arch/arm/boot/dts/imx6qp-sabresd.dts
    arch/arm/boot/dts/imx6sl-evk.dts
    arch/arm/boot/dts/imx6sl.dtsi
    arch/arm/boot/dts/imx6sx-14x14-arm2.dts
    arch/arm/boot/dts/imx6sx-19x19-arm2.dts
    arch/arm/boot/dts/imx6sx-sabreauto.dts
    arch/arm/boot/dts/imx6sx-sdb-btwifi.dts
    arch/arm/boot/dts/imx6sx-sdb.dtsi
    arch/arm/boot/dts/imx6sx.dtsi
    arch/arm/boot/dts/imx6ul-14x14-evk.dts
    arch/arm/boot/dts/imx6ul-9x9-evk.dts
    arch/arm/boot/dts/imx6ul-evk-btwifi.dtsi
    arch/arm/boot/dts/imx6ul-pinfunc.h
    arch/arm/boot/dts/imx6ul.dtsi
    arch/arm/boot/dts/imx7d-12x12-lpddr3-arm2.dts
    arch/arm/boot/dts/imx7d-pinfunc.h
    arch/arm/boot/dts/imx7d-sdb-epdc.dtsi
    arch/arm/boot/dts/imx7d-sdb-m4.dtsi
    arch/arm/boot/dts/imx7d-sdb-reva-touch.dts
    arch/arm/boot/dts/imx7d-sdb-reva.dts
    arch/arm/boot/dts/imx7d-sdb.dts
    arch/arm/boot/dts/imx7d.dtsi
    arch/arm/configs/imx_v7_defconfig
    arch/arm/configs/imx_v7_mfg_defconfig
    arch/arm/mach-imx/clk-imx6q.c
    arch/arm/mach-imx/clk.h
    arch/arm/mach-imx/cpuidle-imx7d.c
    arch/arm/mach-imx/ddr3_freq_imx7d.S
    arch/arm/mach-imx/gpcv2.c
    arch/arm/mach-imx/imx7d_low_power_idle.S
    arch/arm/mach-imx/lpddr3_freq_imx.S
    arch/arm/mach-imx/mach-imx7d.c
    arch/arm/mach-imx/pm-imx7.c
    arch/arm/mach-imx/suspend-imx7.S
    drivers/ata/ahci_imx.c
    drivers/cpufreq/imx6q-cpufreq.c
    drivers/dma/imx-sdma.c
    drivers/dma/pxp/pxp_dma_v2.c
    drivers/input/touchscreen/ads7846.c
    drivers/media/platform/mxc/capture/ov5640_mipi.c
    drivers/media/platform/mxc/output/mxc_pxp_v4l2.c
    drivers/mmc/core/core.c
    drivers/mmc/core/sd.c
    drivers/mtd/spi-nor/fsl-quadspi.c
    drivers/mxc/gpu-viv/Kbuild
    drivers/mxc/gpu-viv/config
    drivers/mxc/gpu-viv/hal/kernel/arch/gc_hal_kernel_context.c
    drivers/mxc/gpu-viv/hal/kernel/arch/gc_hal_kernel_context.h
    drivers/mxc/gpu-viv/hal/kernel/arch/gc_hal_kernel_hardware.c
    drivers/mxc/gpu-viv/hal/kernel/arch/gc_hal_kernel_hardware.h
    drivers/mxc/gpu-viv/hal/kernel/arch/gc_hal_kernel_recorder.c
    drivers/mxc/gpu-viv/hal/kernel/archvg/gc_hal_kernel_hardware_command_vg.c
    drivers/mxc/gpu-viv/hal/kernel/archvg/gc_hal_kernel_hardware_command_vg.h
    drivers/mxc/gpu-viv/hal/kernel/archvg/gc_hal_kernel_hardware_vg.c
    drivers/mxc/gpu-viv/hal/kernel/archvg/gc_hal_kernel_hardware_vg.h
    drivers/mxc/gpu-viv/hal/kernel/gc_hal_kernel.c
    drivers/mxc/gpu-viv/hal/kernel/gc_hal_kernel.h
    drivers/mxc/gpu-viv/hal/kernel/gc_hal_kernel_command.c
    drivers/mxc/gpu-viv/hal/kernel/gc_hal_kernel_command_vg.c
    drivers/mxc/gpu-viv/hal/kernel/gc_hal_kernel_db.c
    drivers/mxc/gpu-viv/hal/kernel/gc_hal_kernel_debug.c
    drivers/mxc/gpu-viv/hal/kernel/gc_hal_kernel_event.c
    drivers/mxc/gpu-viv/hal/kernel/gc_hal_kernel_heap.c
    drivers/mxc/gpu-viv/hal/kernel/gc_hal_kernel_interrupt_vg.c
    drivers/mxc/gpu-viv/hal/kernel/gc_hal_kernel_mmu.c
    drivers/mxc/gpu-viv/hal/kernel/gc_hal_kernel_mmu_vg.c
    drivers/mxc/gpu-viv/hal/kernel/gc_hal_kernel_power.c
    drivers/mxc/gpu-viv/hal/kernel/gc_hal_kernel_precomp.h
    drivers/mxc/gpu-viv/hal/kernel/gc_hal_kernel_security.c
    drivers/mxc/gpu-viv/hal/kernel/gc_hal_kernel_vg.c
    drivers/mxc/gpu-viv/hal/kernel/gc_hal_kernel_vg.h
    drivers/mxc/gpu-viv/hal/kernel/gc_hal_kernel_video_memory.c
    drivers/mxc/gpu-viv/hal/kernel/inc/gc_hal.h
    drivers/mxc/gpu-viv/hal/kernel/inc/gc_hal_base.h
    drivers/mxc/gpu-viv/hal/kernel/inc/gc_hal_driver.h
    drivers/mxc/gpu-viv/hal/kernel/inc/gc_hal_driver_vg.h
    drivers/mxc/gpu-viv/hal/kernel/inc/gc_hal_dump.h
    drivers/mxc/gpu-viv/hal/kernel/inc/gc_hal_eglplatform.h
    drivers/mxc/gpu-viv/hal/kernel/inc/gc_hal_eglplatform_type.h
    drivers/mxc/gpu-viv/hal/kernel/inc/gc_hal_engine.h
    drivers/mxc/gpu-viv/hal/kernel/inc/gc_hal_engine_vg.h
    drivers/mxc/gpu-viv/hal/kernel/inc/gc_hal_enum.h
    drivers/mxc/gpu-viv/hal/kernel/inc/gc_hal_kernel_buffer.h
    drivers/mxc/gpu-viv/hal/kernel/inc/gc_hal_mem.h
    drivers/mxc/gpu-viv/hal/kernel/inc/gc_hal_options.h
    drivers/mxc/gpu-viv/hal/kernel/inc/gc_hal_profiler.h
    drivers/mxc/gpu-viv/hal/kernel/inc/gc_hal_raster.h
    drivers/mxc/gpu-viv/hal/kernel/inc/gc_hal_rename.h
    drivers/mxc/gpu-viv/hal/kernel/inc/gc_hal_security_interface.h
    drivers/mxc/gpu-viv/hal/kernel/inc/gc_hal_statistics.h
    drivers/mxc/gpu-viv/hal/kernel/inc/gc_hal_types.h
    drivers/mxc/gpu-viv/hal/kernel/inc/gc_hal_version.h
    drivers/mxc/gpu-viv/hal/kernel/inc/gc_hal_vg.h
    drivers/mxc/gpu-viv/hal/os/linux/kernel/allocator/default/gc_hal_kernel_allocator_array.h
    drivers/mxc/gpu-viv/hal/os/linux/kernel/allocator/default/gc_hal_kernel_allocator_dmabuf.c
    drivers/mxc/gpu-viv/hal/os/linux/kernel/allocator/freescale/gc_hal_kernel_allocator_array.h
    drivers/mxc/gpu-viv/hal/os/linux/kernel/allocator/freescale/gc_hal_kernel_allocator_cma.c
    drivers/mxc/gpu-viv/hal/os/linux/kernel/gc_hal_kernel_allocator.c
    drivers/mxc/gpu-viv/hal/os/linux/kernel/gc_hal_kernel_allocator.h
    drivers/mxc/gpu-viv/hal/os/linux/kernel/gc_hal_kernel_debug.h
    drivers/mxc/gpu-viv/hal/os/linux/kernel/gc_hal_kernel_debugfs.c
    drivers/mxc/gpu-viv/hal/os/linux/kernel/gc_hal_kernel_debugfs.h
    drivers/mxc/gpu-viv/hal/os/linux/kernel/gc_hal_kernel_device.c
    drivers/mxc/gpu-viv/hal/os/linux/kernel/gc_hal_kernel_device.h
    drivers/mxc/gpu-viv/hal/os/linux/kernel/gc_hal_kernel_iommu.c
    drivers/mxc/gpu-viv/hal/os/linux/kernel/gc_hal_kernel_linux.c
    drivers/mxc/gpu-viv/hal/os/linux/kernel/gc_hal_kernel_linux.h
    drivers/mxc/gpu-viv/hal/os/linux/kernel/gc_hal_kernel_math.c
    drivers/mxc/gpu-viv/hal/os/linux/kernel/gc_hal_kernel_mutex.h
    drivers/mxc/gpu-viv/hal/os/linux/kernel/gc_hal_kernel_os.c
    drivers/mxc/gpu-viv/hal/os/linux/kernel/gc_hal_kernel_os.h
    drivers/mxc/gpu-viv/hal/os/linux/kernel/gc_hal_kernel_platform.h
    drivers/mxc/gpu-viv/hal/os/linux/kernel/gc_hal_kernel_probe.c
    drivers/mxc/gpu-viv/hal/os/linux/kernel/gc_hal_kernel_security_channel.c
    drivers/mxc/gpu-viv/hal/os/linux/kernel/gc_hal_kernel_sync.c
    drivers/mxc/gpu-viv/hal/os/linux/kernel/gc_hal_kernel_sync.h
    drivers/mxc/gpu-viv/hal/os/linux/kernel/platform/freescale/gc_hal_kernel_platform_imx6q14.c
    drivers/mxc/gpu-viv/hal/os/linux/kernel/platform/freescale/gc_hal_kernel_platform_imx6q14.config
    drivers/mxc/hdmi-cec/mxc_hdmi-cec.c
    drivers/mxc/ipu3/ipu_common.c
    drivers/mxc/mlb/mxc_mlb.c
    drivers/net/ethernet/freescale/fec_main.c
    drivers/net/wireless/bcmdhd/dhd_linux.c
    drivers/net/wireless/bcmdhd/dhd_sdio.c
    drivers/scsi/scsi_error.c
    drivers/spi/spi-imx.c
    drivers/thermal/imx_thermal.c
    drivers/tty/serial/imx.c
    drivers/usb/chipidea/udc.c
    drivers/usb/gadget/configfs.c
    drivers/video/fbdev/mxc/mipi_dsi.c
    drivers/video/fbdev/mxc/mipi_dsi.h
    drivers/video/fbdev/mxc/mipi_dsi_samsung.c
    drivers/video/fbdev/mxc/mxc_edid.c
    drivers/video/fbdev/mxc/mxc_epdc_fb.c
    drivers/video/fbdev/mxc/mxc_epdc_v2_fb.c
    drivers/video/fbdev/mxc/mxc_ipuv3_fb.c
    drivers/video/fbdev/mxc/mxcfb_hx8369_wvga.c
    drivers/video/fbdev/mxsfb.c
    firmware/imx/sdma/sdma-imx6q.bin.ihex
    include/trace/events/cpufreq_interactive.h

    guoyin.chen
     

20 Nov, 2015

2 commits

  • Pass the correct argument to subsys_cgroup_allow_attach(), which
    expects a 'struct cgroup_subsys_state *' argument, but we pass a
    'struct cgroup *' instead, which doesn't seem right.

    This fixes the following 'incompatible pointer type' compiler warning:
    ----------
    CC mm/memcontrol.o
    mm/memcontrol.c: In function ‘mem_cgroup_allow_attach’:
    mm/memcontrol.c:5052:2: warning: passing argument 1 of ‘subsys_cgroup_allow_attach’ from incompatible pointer type [enabled by default]
    In file included from include/linux/memcontrol.h:22:0,
    from mm/memcontrol.c:29:
    include/linux/cgroup.h:953:5: note: expected ‘struct cgroup_subsys_state *’ but argument is of type ‘struct cgroup *’
    ----------
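
    A hedged sketch of the shape of the fix, inferred only from the warning
    above (the second parameter and the handler body are assumptions, not the
    verbatim patch):

    static int mem_cgroup_allow_attach(struct cgroup_subsys_state *css,
                                       struct cgroup_taskset *tset)
    {
            /* pass the css itself rather than a 'struct cgroup *' */
            return subsys_cgroup_allow_attach(css, tset);
    }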

    Signed-off-by: Amit Pundir

    Amit Pundir
     
  • Use the 'allow_attach' handler for the 'mem' cgroup to allow
    non-root processes to add arbitrary processes to a 'mem' cgroup,
    provided the caller has the CAP_SYS_NICE capability set.

    Bug: 18260435
    Change-Id: If7d37bf90c1544024c4db53351adba6a64966250
    Signed-off-by: Rom Lemarchand

    Rom Lemarchand
     

27 Oct, 2015

1 commit

  • commit 424cdc14138088ada1b0e407a2195b2783c6e5ef upstream.

    page_counter_memparse() returns pages for the threshold, while
    mem_cgroup_usage() returns bytes for memory usage. Convert the
    threshold to bytes.
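
    A minimal sketch of the conversion (the surrounding registration code and
    the "-1" token are assumptions, not quoted from the patch):

    /* page_counter_memparse() yields pages, mem_cgroup_usage() yields bytes */
    ret = page_counter_memparse(args, "-1", &threshold);
    if (ret)
            return ret;
    threshold <<= PAGE_SHIFT;       /* convert pages to bytes before comparing */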

    Fixes: 3e32cb2e0a12b6915 ("memcg: rename cgroup_event to mem_cgroup_event").
    Signed-off-by: Shaohua Li
    Cc: Johannes Weiner
    Acked-by: Michal Hocko
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds
    Signed-off-by: Greg Kroah-Hartman

    Shaohua Li
     

11 Jun, 2015

2 commits

  • On -rt, the VM_BUG_ON(!irqs_disabled()) triggers inside the memcg
    swapout path because the spin_lock_irq(&mapping->tree_lock) in the
    caller doesn't actually disable the hardware interrupts - which is fine,
    because on -rt the top halves run in process context and so we are still
    safe from preemption while updating the statistics.

    Remove the VM_BUG_ON() but keep the comment of what we rely on.

    Signed-off-by: Johannes Weiner
    Reported-by: Clark Williams
    Cc: Fernando Lopez-Lezcano
    Cc: Steven Rostedt
    Cc: Thomas Gleixner
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Johannes Weiner
     
  • When trimming memcg consumption excess (see memory.high), we call
    try_to_free_mem_cgroup_pages without checking if we are allowed to sleep
    in the current context, which can result in a deadlock. Fix this.
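
    A conceptual sketch of the guard this calls for (not the verbatim patch;
    the sleepable-context test of that era would have been the __GFP_WAIT
    flag):

    /* only reclaim the memory.high excess synchronously if we may sleep */
    if (gfp_mask & __GFP_WAIT)
            try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);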

    Fixes: 241994ed8649 ("mm: memcontrol: default hierarchy interface for memory")
    Signed-off-by: Vladimir Davydov
    Cc: Johannes Weiner
    Acked-by: Michal Hocko
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Vladimir Davydov
     

16 Apr, 2015

3 commits

  • We converted some of the usages of ACCESS_ONCE to READ_ONCE in the mm/
    tree since ACCESS_ONCE doesn't work reliably on non-scalar types.

    This patch removes the rest of the usages of ACCESS_ONCE and uses the new
    READ_ONCE API for the read accesses. This makes things cleaner, instead
    of using separate/multiple sets of APIs.
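
    A minimal before/after illustration (the field being read is only an
    example):

    /* before */
    limit = ACCESS_ONCE(memcg->soft_limit);

    /* after: READ_ONCE() also works reliably on non-scalar types */
    limit = READ_ONCE(memcg->soft_limit);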

    Signed-off-by: Jason Low
    Acked-by: Michal Hocko
    Acked-by: Davidlohr Bueso
    Acked-by: Rik van Riel
    Reviewed-by: Christian Borntraeger
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Jason Low
     
  • Low and high watermarks, as defined in the TODO to the mem_cgroup
    struct, have already been implemented by Johannes, so remove the stale
    comment.

    Signed-off-by: Vladimir Davydov
    Cc: Johannes Weiner
    Acked-by: Michal Hocko
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Vladimir Davydov
     
  • mem_cgroup_lookup() is a wrapper around mem_cgroup_from_id(), which
    checks that id != 0 before issuing the function call. Today, this
    additional check serves no purpose beyond a micro-optimization, because
    there is no css with id 0 for css_from_id() to return.
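
    For reference, the wrapper being folded away has roughly this shape (a
    sketch, not the verbatim kernel code):

    static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
    {
            /* id 0 means "none"; css_from_id() has no css registered for it */
            if (!id)
                    return NULL;
            return mem_cgroup_from_id(id);
    }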

    Signed-off-by: Vladimir Davydov
    Acked-by: Michal Hocko
    Cc: Johannes Weiner
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Vladimir Davydov
     

15 Apr, 2015

3 commits

  • If the kernel panics due to OOM caused by a cgroup reaching its limit
    while 'compulsory panic_on_oom' is enabled, we will only see that the OOM
    happened because "compulsory panic_on_oom is enabled", but this doesn't
    tell the difference between mempolicy and memcg. And dumping system-wide
    information is plain wrong and more confusing. This patch provides the
    information of the cgroup whose limit triggered the panic.

    Signed-off-by: Balasubramani Vivekanandan
    Acked-by: Michal Hocko
    Cc: Johannes Weiner
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Balasubramani Vivekanandan
     
  • When !MMU, the build reports a warning. The related warning with allmodconfig
    under c6x:

    CC mm/memcontrol.o
    mm/memcontrol.c:2802:12: warning: 'mem_cgroup_move_account' defined but not used [-Wunused-function]
    static int mem_cgroup_move_account(struct page *page,
    ^

    Signed-off-by: Chen Gang
    Acked-by: Michal Hocko
    Acked-by: Johannes Weiner
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Chen Gang
     
  • Add myself to the list of copyright holders.

    Signed-off-by: Johannes Weiner
    Acked-by: Michal Hocko
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Johannes Weiner
     

13 Mar, 2015

1 commit

  • If the memory cgroup controller is initially mounted in the scope of the
    default cgroup hierarchy and then remounted to a legacy hierarchy, it will
    still have hierarchy support enabled, which is incorrect. We should
    disable hierarchy support if bound to the legacy cgroup hierarchy.

    Signed-off-by: Vladimir Davydov
    Signed-off-by: Johannes Weiner
    Acked-by: Michal Hocko
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Vladimir Davydov
     

01 Mar, 2015

2 commits

  • The memcg control knobs indicate the highest possible value using the
    symbolic name "infinity", which is long and awkward to type.

    Switch to the string "max", which is just as descriptive but shorter and
    sweeter.

    This changes a user interface, so do it before the release and before
    the development flag is dropped from the default hierarchy.

    Signed-off-by: Johannes Weiner
    Cc: Michal Hocko
    Cc: Tejun Heo
    Cc: Vladimir Davydov
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Johannes Weiner
     
  • A memcg is considered low limited even when the current usage is equal to
    the low limit. This leads to interesting side effects e.g.
    groups/hierarchies with no memory accounted are considered protected and
    so the reclaim will emit MEMCG_LOW event when encountering them.

    Another and much bigger issue was reported by Joonsoo Kim. He has hit a
    NULL ptr dereference with the legacy cgroup API, which doesn't even have
    the low limit exposed. The limit is 0 by default, but the initial check
    fails for a memcg with 0 consumption, and parent_mem_cgroup() would return
    NULL if use_hierarchy is 0, so page_counter_read would try to dereference
    NULL.

    I suppose that the current implementation is just an oversight because the
    documentation in Documentation/cgroups/unified-hierarchy.txt says:

    "The memory.low boundary on the other hand is a top-down allocated
    reserve. A cgroup enjoys reclaim protection when it and all its
    ancestors are below their low boundaries"

    Fix the usage and the low limit comparison in mem_cgroup_low accordingly.
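
    Conceptually, the corrected check has this shape (a sketch, not the
    verbatim patch):

    /* protected only while usage is strictly below a non-zero low boundary */
    if (page_counter_read(&memcg->memory) >= memcg->low)
            return false;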

    Fixes: 241994ed8649 (mm: memcontrol: default hierarchy interface for memory)
    Reported-by: Joonsoo Kim
    Signed-off-by: Michal Hocko
    Acked-by: Johannes Weiner
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Michal Hocko
     

13 Feb, 2015

8 commits

  • Move memcg_socket_limit_enabled decrement to tcp_destroy_cgroup (called
    from memcg_destroy_kmem -> mem_cgroup_sockets_destroy) and zap a bunch of
    wrapper functions.

    Although this patch moves static keys decrement from __mem_cgroup_free to
    mem_cgroup_css_free, it does not introduce any functional changes, because
    the keys are incremented on setting the limit (tcp or kmem), which can
    only happen after successful mem_cgroup_css_online.

    Signed-off-by: Vladimir Davydov
    Cc: Glauber Costa
    Cc: KAMEZAWA Hiroyuki
    Cc: Eric W. Biederman
    Cc: David S. Miller
    Cc: Johannes Weiner
    Acked-by: Michal Hocko
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Vladimir Davydov
     
  • Now, the only reason to keep kmemcg_id till css free is list_lru, which
    uses it to distribute elements between per-memcg lists. However, it can
    be easily sorted out - we only need to change kmemcg_id of an offline
    cgroup to its parent's id, making further list_lru_add()'s add elements to
    the parent's list, and then move all elements from the offline cgroup's
    list to the one of its parent. It will work, because a racing
    list_lru_del() does not need to know the list it is deleting the element
    from. It can decrement the wrong nr_items counter, but the ongoing
    reparenting will fix it. After list_lru reparenting is done we are free
    to release kmemcg_id saving a valuable slot in a per-memcg array for new
    cgroups.

    Signed-off-by: Vladimir Davydov
    Cc: Johannes Weiner
    Cc: Michal Hocko
    Cc: Tejun Heo
    Cc: Christoph Lameter
    Cc: Pekka Enberg
    Cc: David Rientjes
    Cc: Joonsoo Kim
    Cc: Dave Chinner
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Vladimir Davydov
     
  • We need to look up a kmem_cache in ->memcg_params.memcg_caches arrays only
    on allocations, so there is no need to have the array entries set until
    css free - we can clear them on css offline. This will allow us to reuse
    array entries more efficiently and avoid costly array relocations.

    Signed-off-by: Vladimir Davydov
    Cc: Johannes Weiner
    Cc: Michal Hocko
    Cc: Tejun Heo
    Cc: Christoph Lameter
    Cc: Pekka Enberg
    Cc: David Rientjes
    Cc: Joonsoo Kim
    Cc: Dave Chinner
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Vladimir Davydov
     
  • Currently, kmem_cache stores a pointer to struct memcg_cache_params
    instead of embedding it. The rationale is to save memory when kmem
    accounting is disabled. However, the memcg_cache_params has shrivelled
    drastically since it was first introduced:

    * Initially:

    struct memcg_cache_params {
            bool is_root_cache;
            union {
                    struct kmem_cache *memcg_caches[0];
                    struct {
                            struct mem_cgroup *memcg;
                            struct list_head list;
                            struct kmem_cache *root_cache;
                            bool dead;
                            atomic_t nr_pages;
                            struct work_struct destroy;
                    };
            };
    };

    * Now:

    struct memcg_cache_params {
            bool is_root_cache;
            union {
                    struct {
                            struct rcu_head rcu_head;
                            struct kmem_cache *memcg_caches[0];
                    };
                    struct {
                            struct mem_cgroup *memcg;
                            struct kmem_cache *root_cache;
                    };
            };
    };

    So the memory saving does not seem to be a clear win anymore.

    OTOH, keeping a pointer to memcg_cache_params struct instead of embedding
    it results in touching one more cache line on kmem alloc/free hot paths.
    Besides, it makes linking kmem caches in a list chained by a field of
    struct memcg_cache_params really painful due to a level of indirection,
    while I want to make them linked in the following patch. That said, let
    us embed it.
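
    The resulting change to struct kmem_cache amounts to the following (a
    sketch; other members omitted and exact placement assumed):

    /* before: a pointer, i.e. one more cache line on alloc/free hot paths */
    struct memcg_cache_params *memcg_params;

    /* after: embedded directly in struct kmem_cache, no indirection */
    struct memcg_cache_params memcg_params;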

    Signed-off-by: Vladimir Davydov
    Cc: Johannes Weiner
    Cc: Michal Hocko
    Cc: Tejun Heo
    Cc: Christoph Lameter
    Cc: Pekka Enberg
    Cc: David Rientjes
    Cc: Joonsoo Kim
    Cc: Dave Chinner
    Cc: Dan Carpenter
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Vladimir Davydov
     
  • There are several FS shrinkers, including super_block::s_shrink, that
    keep reclaimable objects in the list_lru structure. Hence to turn them
    to memcg-aware shrinkers, it is enough to make list_lru per-memcg.

    This patch does the trick. It adds an array of lru lists to the
    list_lru_node structure (per-node part of the list_lru), one for each
    kmem-active memcg, and dispatches every item addition or removal to the
    list corresponding to the memcg which the item is accounted to. So now
    the list_lru structure is not just per node, but per node and per memcg.

    Not all list_lrus need this feature, so this patch also adds a new
    method, list_lru_init_memcg, which initializes a list_lru as memcg
    aware. Otherwise (i.e. if initialized with old list_lru_init), the
    list_lru won't have per memcg lists.

    Just like per memcg caches arrays, the arrays of per-memcg lists are
    indexed by memcg_cache_id, so we must grow them whenever
    memcg_nr_cache_ids is increased. So we introduce a callback,
    memcg_update_all_list_lrus, invoked by memcg_alloc_cache_id if the id
    space is full.

    The locking is implemented in a manner similar to lruvecs, i.e. we have
    one lock per node that protects all lists (both global and per cgroup) on
    the node.
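
    A usage sketch under the description above ('obj' and its lru member are
    hypothetical):

    static struct list_lru my_lru;

    if (list_lru_init_memcg(&my_lru))       /* per-memcg lists, vs. list_lru_init() */
            return -ENOMEM;

    list_lru_add(&my_lru, &obj->lru);       /* lands on the list of obj's memcg */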

    Signed-off-by: Vladimir Davydov
    Cc: Dave Chinner
    Cc: Johannes Weiner
    Cc: Michal Hocko
    Cc: Greg Thelen
    Cc: Glauber Costa
    Cc: Alexander Viro
    Cc: Christoph Lameter
    Cc: Pekka Enberg
    Cc: David Rientjes
    Cc: Joonsoo Kim
    Cc: Tejun Heo
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Vladimir Davydov
     
  • We need a stable value of memcg_nr_cache_ids in kmem_cache_create()
    (memcg_alloc_cache_params() wants it for root caches), where we only
    hold the slab_mutex and no memcg-related locks. As a result, we have to
    update memcg_nr_cache_ids under the slab_mutex, which we can only take
    on the slab's side (see memcg_update_array_size). This looks awkward
    and will become even worse when per-memcg list_lru is introduced, which
    also wants stable access to memcg_nr_cache_ids.

    To get rid of this dependency between the memcg_nr_cache_ids and the
    slab_mutex, this patch introduces a special rwsem. The rwsem is held
    for writing during memcg_caches arrays relocation and memcg_nr_cache_ids
    updates. Therefore one can take it for reading to get a stable access
    to memcg_caches arrays and/or memcg_nr_cache_ids.

    Currently the semaphore is taken for reading only from
    kmem_cache_create, right before taking the slab_mutex, so right now
    there's not much point in using an rwsem instead of a mutex. However, once
    list_lru is made per-memcg it will allow list_lru initializations to
    proceed concurrently.
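
    A sketch of the read-side pairing (assuming the helpers are named
    memcg_get_cache_ids()/memcg_put_cache_ids(); usage is illustrative):

    memcg_get_cache_ids();          /* down_read() on the new rwsem */
    nr = memcg_nr_cache_ids;        /* stable until the matching put */
    /* ... size per-memcg arrays or lists using nr ... */
    memcg_put_cache_ids();          /* up_read() */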

    Signed-off-by: Vladimir Davydov
    Cc: Dave Chinner
    Cc: Johannes Weiner
    Cc: Michal Hocko
    Cc: Greg Thelen
    Cc: Glauber Costa
    Cc: Alexander Viro
    Cc: Christoph Lameter
    Cc: Pekka Enberg
    Cc: David Rientjes
    Cc: Joonsoo Kim
    Cc: Tejun Heo
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Vladimir Davydov
     
  • memcg_limited_groups_array_size, which defines the size of memcg_caches
    arrays, sounds rather cumbersome. Also, it doesn't indicate in any way that
    it's related to kmem/caches stuff. So let's rename it to
    memcg_nr_cache_ids. It's concise and points us directly to
    memcg_cache_id.

    Also, rename kmem_limited_groups to memcg_cache_ida.

    Signed-off-by: Vladimir Davydov
    Cc: Dave Chinner
    Cc: Johannes Weiner
    Cc: Michal Hocko
    Cc: Greg Thelen
    Cc: Glauber Costa
    Cc: Alexander Viro
    Cc: Christoph Lameter
    Cc: Pekka Enberg
    Cc: David Rientjes
    Cc: Joonsoo Kim
    Cc: Tejun Heo
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Vladimir Davydov
     
  • This patch adds SHRINKER_MEMCG_AWARE flag. If a shrinker has this flag
    set, it will be called per memory cgroup. The memory cgroup to scan
    objects from is passed in shrink_control->memcg. If the memory cgroup
    is NULL, a memcg aware shrinker is supposed to scan objects from the
    global list. Unaware shrinkers are only called on global pressure with
    memcg=NULL.
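
    A registration sketch (the demo_* callbacks are hypothetical):

    static unsigned long demo_count(struct shrinker *s, struct shrink_control *sc)
    {
            /* sc->memcg selects the cgroup to scan; NULL means the global list */
            return demo_nr_objects(sc->memcg);
    }

    static struct shrinker demo_shrinker = {
            .count_objects  = demo_count,
            .scan_objects   = demo_scan,
            .seeks          = DEFAULT_SEEKS,
            .flags          = SHRINKER_MEMCG_AWARE,
    };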

    Signed-off-by: Vladimir Davydov
    Cc: Dave Chinner
    Cc: Johannes Weiner
    Cc: Michal Hocko
    Cc: Greg Thelen
    Cc: Glauber Costa
    Cc: Alexander Viro
    Cc: Christoph Lameter
    Cc: Pekka Enberg
    Cc: David Rientjes
    Cc: Joonsoo Kim
    Cc: Tejun Heo
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Vladimir Davydov
     

12 Feb, 2015

12 commits

  • pagewalk.c can handle the vma by itself, so we don't have to pass the vma
    via walk->private. Both mem_cgroup_count_precharge() and
    mem_cgroup_move_charge() used to run their own for-each-vma loops, but now
    that's done in pagewalk.c, so let's clean them up.

    Signed-off-by: Naoya Horiguchi
    Acked-by: Johannes Weiner
    Cc: "Kirill A. Shutemov"
    Cc: Andrea Arcangeli
    Cc: Cyrill Gorcunov
    Cc: Dave Hansen
    Cc: Kirill A. Shutemov
    Cc: Pavel Emelyanov
    Cc: Benjamin Herrenschmidt
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Naoya Horiguchi
     
  • The swap controller code is scattered all over the file. Gather all
    the code that isn't directly needed by the memory controller at the
    end of the file in its own CONFIG_MEMCG_SWAP section.

    Signed-off-by: Johannes Weiner
    Cc: Michal Hocko
    Reviewed-by: Vladimir Davydov
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Johannes Weiner
     
  • The initialization code for the per-cpu charge stock and the soft
    limit tree is compact enough to inline it into mem_cgroup_init().

    Signed-off-by: Johannes Weiner
    Acked-by: Michal Hocko
    Reviewed-by: Vladimir Davydov
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Johannes Weiner
     
  • - No need to test the node for N_MEMORY. node_online() is enough for
    node fallback to work in slab, use NUMA_NO_NODE for everything else.

    - Remove the BUG_ON() for allocation failure. A NULL pointer crash is
    just as descriptive, and the absent return value check is obvious.

    - Move local variables to the inner-most blocks.

    - Point to the tree structure after it is initialized, not before; it's
    just more logical that way.

    Signed-off-by: Johannes Weiner
    Cc: Michal Hocko
    Cc: Vladimir Davydov
    Cc: Guenter Roeck
    Cc: Christoph Lameter
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Johannes Weiner
     
  • Commit 5695be142e20 ("OOM, PM: OOM killed task shouldn't escape PM
    suspend") has left a race window when OOM killer manages to
    note_oom_kill after freeze_processes checks the counter. The race
    window is quite small and really unlikely, and a partial solution was deemed
    sufficient at the time of submission.

    Tejun wasn't happy about this partial solution though and insisted on a
    full solution. That requires the full OOM and freezer's task freezing
    exclusion, though. This is done by this patch which introduces oom_sem
    RW lock and turns oom_killer_disable() into a full OOM barrier.

    oom_killer_disabled check is moved from the allocation path to the OOM
    level and we take oom_sem for reading for both the check and the whole
    OOM invocation.

    oom_killer_disable() takes oom_sem for writing so it waits for all
    currently running OOM killer invocations. Then it disables all further
    OOMs by setting oom_killer_disabled and checks for any oom victims.
    Victims are counted via mark_tsk_oom_victim resp. unmark_oom_victim. The
    last victim wakes up all waiters enqueued by oom_killer_disable().
    Therefore this function acts as the full OOM barrier.

    The page fault path is covered now as well although it was assumed to be
    safe before. As per Tejun, "We used to have freezing points deep in file
    system code which may be reachable from page fault." so it would be
    better and more robust to not rely on freezing points here. Same applies
    to the memcg OOM killer.

    out_of_memory tells the caller whether the OOM was allowed to trigger and
    the callers are supposed to handle the situation. The page allocation
    path simply fails the allocation same as before. The page fault path will
    retry the fault (more on that later) and Sysrq OOM trigger will simply
    complain to the log.

    Normally there wouldn't be any unfrozen user tasks after
    try_to_freeze_tasks so the function will not block. But if there was an
    OOM killer racing with try_to_freeze_tasks and the OOM victim didn't
    finish yet then we have to wait for it. This should complete in a finite
    time, though, because

    - the victim cannot loop in the page fault handler (it would die
    on the way out from the exception)
    - it cannot loop in the page allocator because all the further
    allocation would fail and __GFP_NOFAIL allocations are not
    acceptable at this stage
    - it shouldn't be blocked on any locks held by frozen tasks
    (try_to_freeze expects lockless context) and kernel threads and
    work queues are not frozen yet
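
    A sketch of how a suspend path would use the barrier (the boolean success
    return is an assumption here, not stated above):

    if (!oom_killer_disable())      /* waits for in-flight OOM victims */
            return -EBUSY;          /* a victim failed to exit in time */

    /* ... freeze devices with the OOM killer fully quiesced ... */

    oom_killer_enable();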

    Signed-off-by: Michal Hocko
    Suggested-by: Tejun Heo
    Cc: David Rientjes
    Cc: Johannes Weiner
    Cc: Oleg Nesterov
    Cc: Cong Wang
    Cc: "Rafael J. Wysocki"
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Michal Hocko
     
  • This patchset addresses a race which was described in the changelog for
    5695be142e20 ("OOM, PM: OOM killed task shouldn't escape PM suspend"):

    : PM freezer relies on having all tasks frozen by the time devices are
    : getting frozen so that no task will touch them while they are getting
    : frozen. But OOM killer is allowed to kill an already frozen task in order
    : to handle OOM situtation. In order to protect from late wake ups OOM
    : killer is disabled after all tasks are frozen. This, however, still keeps
    : a window open when a killed task didn't manage to die by the time
    : freeze_processes finishes.

    The original patch hasn't closed the race window completely because that
    would require a more complex solution as it can be seen by this patchset.

    The primary motivation was to close the race condition between OOM killer
    and PM freezer _completely_. As Tejun pointed out, even though the race
    condition is unlikely the harder it would be to debug weird bugs deep in
    the PM freezer when the debugging options are reduced considerably. I can
    only speculate what might happen when a task is still runnable
    unexpectedly.

    On a plus side and as a side effect the oom enable/disable has a better
    (full barrier) semantic without polluting hot paths.

    I have tested the series in KVM with 100M RAM:
    - many small tasks (20M anon mmap) which are triggering OOM continually
    - s2ram which resumes automatically is triggered in a loop
    echo processors > /sys/power/pm_test
    while true
    do
            echo mem > /sys/power/state
            sleep 1s
    done
    - simple module which allocates and frees 20M in 8K chunks. If it sees
    freezing(current) then it tries another round of allocation before calling
    try_to_freeze
    - debugging messages of PM stages and OOM killer enable/disable/fail added
    and unmark_oom_victim is delayed by 1s after it clears TIF_MEMDIE and before
    it wakes up waiters.
    - rebased on top of the current mmotm which means some necessary updates
    in mm/oom_kill.c. mark_tsk_oom_victim is now called under task_lock but
    I think this should be OK because __thaw_task shouldn't interfere with any
    locking down wake_up_process. Oleg?

    As expected there are no OOM killed tasks after oom is disabled and
    allocations requested by the kernel thread are failing after all the tasks
    are frozen and OOM disabled. I wasn't able to catch a race where
    oom_killer_disable would really have to wait but I kinda expected the race
    is really unlikely.

    [ 242.609330] Killed process 2992 (mem_eater) total-vm:24412kB, anon-rss:2164kB, file-rss:4kB
    [ 243.628071] Unmarking 2992 OOM victim. oom_victims: 1
    [ 243.636072] (elapsed 2.837 seconds) done.
    [ 243.641985] Trying to disable OOM killer
    [ 243.643032] Waiting for concurent OOM victims
    [ 243.644342] OOM killer disabled
    [ 243.645447] Freezing remaining freezable tasks ... (elapsed 0.005 seconds) done.
    [ 243.652983] Suspending console(s) (use no_console_suspend to debug)
    [ 243.903299] kmem_eater: page allocation failure: order:1, mode:0x204010
    [...]
    [ 243.992600] PM: suspend of devices complete after 336.667 msecs
    [ 243.993264] PM: late suspend of devices complete after 0.660 msecs
    [ 243.994713] PM: noirq suspend of devices complete after 1.446 msecs
    [ 243.994717] ACPI: Preparing to enter system sleep state S3
    [ 243.994795] PM: Saving platform NVS memory
    [ 243.994796] Disabling non-boot CPUs ...

    The first 2 patches are simple cleanups for OOM. They should go in
    regardless the rest IMO.

    Patches 3 and 4 are trivial printk -> pr_info conversion and they should
    go in ditto.

    The main patch is the last one and I would appreciate acks from Tejun and
    Rafael. I think the OOM part should be OK (except for __thaw_task vs.
    task_lock where a look from Oleg would be appreciated) but I am not so sure I
    haven't screwed anything in the freezer code. I have found several
    surprises there.

    This patch (of 5):

    This patch is just a preparatory and it doesn't introduce any functional
    change.

    Note:
    I am utterly unhappy about lowmemory killer abusing TIF_MEMDIE just to
    wait for the oom victim and to prevent from new killing. This is
    just a side effect of the flag. The primary meaning is to give the oom
    victim access to the memory reserves and that shouldn't be necessary
    here.

    Signed-off-by: Michal Hocko
    Cc: Tejun Heo
    Cc: David Rientjes
    Cc: Johannes Weiner
    Cc: Oleg Nesterov
    Cc: Cong Wang
    Cc: "Rafael J. Wysocki"
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Michal Hocko
     
  • Turn the move type enum into flags and give the flags field a shorter
    name. Once that is done, move_anon() and move_file() are simple enough to
    just fold them into the callsites.
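
    A sketch of the resulting flag encoding (values are illustrative):

    #define MOVE_ANON       0x1U
    #define MOVE_FILE       0x2U
    #define MOVE_MASK       (MOVE_ANON | MOVE_FILE)

    /* e.g. instead of move_anon():  if (mc.flags & MOVE_ANON) ... */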

    [akpm@linux-foundation.org: tweak MOVE_MASK definition, per Michal]
    Signed-off-by: Johannes Weiner
    Acked-by: Michal Hocko
    Reviewed-by: Vladimir Davydov
    Cc: Greg Thelen
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Johannes Weiner
     
  • Introduce the basic control files to account, partition, and limit
    memory using cgroups in default hierarchy mode.

    This interface versioning allows us to address fundamental design
    issues in the existing memory cgroup interface, further explained
    below. The old interface will be maintained indefinitely, but a
    clearer model and improved workload performance should encourage
    existing users to switch over to the new one eventually.

    The control files are thus:

    - memory.current shows the current consumption of the cgroup and its
    descendants, in bytes.

    - memory.low configures the lower end of the cgroup's expected
    memory consumption range. The kernel considers memory below that
    boundary to be a reserve - the minimum that the workload needs in
    order to make forward progress - and generally avoids reclaiming
    it, unless there is an imminent risk of entering an OOM situation.

    - memory.high configures the upper end of the cgroup's expected
    memory consumption range. A cgroup whose consumption grows beyond
    this threshold is forced into direct reclaim, to work off the
    excess and to throttle new allocations heavily, but is generally
    allowed to continue and the OOM killer is not invoked.

    - memory.max configures the hard maximum amount of memory that the
    cgroup is allowed to consume before the OOM killer is invoked.

    - memory.events shows event counters that indicate how often the
    cgroup was reclaimed while below memory.low, how often it was
    forced to reclaim excess beyond memory.high, how often it hit
    memory.max, and how often it entered OOM due to memory.max. This
    allows users to identify configuration problems when observing a
    degradation in workload performance. An overcommitted system will
    have an increased rate of low boundary breaches, whereas increased
    rates of high limit breaches, maximum hits, or even OOM situations
    will indicate internally overcommitted cgroups.

    For existing users of memory cgroups, the following deviations from
    the current interface are worth pointing out and explaining:

    - The original lower boundary, the soft limit, is defined as a limit
    that is per default unset. As a result, the set of cgroups that
    global reclaim prefers is opt-in, rather than opt-out. The costs
    for optimizing these mostly negative lookups are so high that the
    implementation, despite its enormous size, does not even provide
    the basic desirable behavior. First off, the soft limit has no
    hierarchical meaning. All configured groups are organized in a
    global rbtree and treated like equal peers, regardless where they
    are located in the hierarchy. This makes subtree delegation
    impossible. Second, the soft limit reclaim pass is so aggressive
    that it not just introduces high allocation latencies into the
    system, but also impacts system performance due to overreclaim, to
    the point where the feature becomes self-defeating.

    The memory.low boundary on the other hand is a top-down allocated
    reserve. A cgroup enjoys reclaim protection when it and all its
    ancestors are below their low boundaries, which makes delegation
    of subtrees possible. Secondly, new cgroups have no reserve per
    default and in the common case most cgroups are eligible for the
    preferred reclaim pass. This allows the new low boundary to be
    efficiently implemented with just a minor addition to the generic
    reclaim code, without the need for out-of-band data structures and
    reclaim passes. Because the generic reclaim code considers all
    cgroups except for the ones running low in the preferred first
    reclaim pass, overreclaim of individual groups is eliminated as
    well, resulting in much better overall workload performance.

    - The original high boundary, the hard limit, is defined as a strict
    limit that can not budge, even if the OOM killer has to be called.
    But this generally goes against the goal of making the most out of
    the available memory. The memory consumption of workloads varies
    during runtime, and that requires users to overcommit. But doing
    that with a strict upper limit requires either a fairly accurate
    prediction of the working set size or adding slack to the limit.
    Since working set size estimation is hard and error prone, and
    getting it wrong results in OOM kills, most users tend to err on
    the side of a looser limit and end up wasting precious resources.

    The memory.high boundary on the other hand can be set much more
    conservatively. When hit, it throttles allocations by forcing
    them into direct reclaim to work off the excess, but it never
    invokes the OOM killer. As a result, a high boundary that is
    chosen too aggressively will not terminate the processes, but
    instead it will lead to gradual performance degradation. The user
    can monitor this and make corrections until the minimal memory
    footprint that still gives acceptable performance is found.

    In extreme cases, with many concurrent allocations and a complete
    breakdown of reclaim progress within the group, the high boundary
    can be exceeded. But even then it's mostly better to satisfy the
    allocation from the slack available in other groups or the rest of
    the system than killing the group. Otherwise, memory.max is there
    to limit this type of spillover and ultimately contain buggy or
    even malicious applications.

    - The original control file names are unwieldy and inconsistent in
    many different ways. For example, the upper boundary hit count is
    exported in the memory.failcnt file, but an OOM event count has to
    be manually counted by listening to memory.oom_control events, and
    lower boundary / soft limit events have to be counted by first
    setting a threshold for that value and then counting those events.
    Also, usage and limit files encode their units in the filename.
    That makes the filenames very long, even though this is not
    information that a user needs to be reminded of every time they
    type out those names.

    To address these naming issues, as well as to signal clearly that
    the new interface carries a new configuration model, the naming
    conventions in it necessarily differ from the old interface.

    - The original limit files indicate the state of an unset limit with
    a very high number, and a configured limit can be unset by echoing
    -1 into those files. But that very high number is implementation
    and architecture dependent and not very descriptive. And while -1
    can be understood as an underflow into the highest possible value,
    -2 or -10M etc. do not work, so it's not consistent.

    memory.low, memory.high, and memory.max will use the string
    "infinity" to indicate and set the highest possible value.

    [akpm@linux-foundation.org: use seq_puts() for basic strings]
    Signed-off-by: Johannes Weiner
    Acked-by: Michal Hocko
    Cc: Vladimir Davydov
    Cc: Greg Thelen
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Johannes Weiner
     
  • The unified hierarchy interface for memory cgroups will no longer use "-1"
    to mean maximum possible resource value. In preparation for this, make
    the string an argument and let the caller supply it.
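
    The resulting call shape, sketched (the exact prototype is inferred from
    the description, not quoted):

    int page_counter_memparse(const char *buf, const char *max,
                              unsigned long *nr_pages);

    /* legacy files keep passing "-1"; the unified hierarchy passes "max" */
    err = page_counter_memparse(buf, "max", &nr_pages);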

    Signed-off-by: Johannes Weiner
    Acked-by: Michal Hocko
    Cc: Vladimir Davydov
    Cc: Greg Thelen
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Johannes Weiner
     
  • Use BUILD_BUG_ON() to compile assert that memcg string tables are in sync
    with corresponding enums. There aren't currently any issues with these
    tables. This is just defensive.
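
    A sketch of the kind of assertion added (table and enum names are taken
    from memcontrol.c of that era and are assumptions here):

    BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_stat_names) != MEM_CGROUP_STAT_NSTATS);
    BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);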

    Signed-off-by: Greg Thelen
    Acked-by: Johannes Weiner
    Acked-by: Michal Hocko
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Greg Thelen
     
  • Since commit b2052564e66d ("mm: memcontrol: continue cache reclaim from
    offlined groups") pages charged to a memory cgroup are not reparented when
    the cgroup is removed. Instead, they are supposed to be reclaimed in a
    regular way, along with pages accounted to online memory cgroups.

    However, an lruvec of an offline memory cgroup will sooner or later get so
    small that it will be scanned only at low scan priorities (see
    get_scan_count()). Therefore, if there are enough reclaimable pages in
    big lruvecs, pages accounted to offline memory cgroups will never be
    scanned at all, wasting memory.

    Fix this by unconditionally forcing scanning dead lruvecs from kswapd.

    [akpm@linux-foundation.org: fix build]
    Signed-off-by: Vladimir Davydov
    Acked-by: Michal Hocko
    Acked-by: Johannes Weiner
    Cc: Tejun Heo
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Vladimir Davydov
     
  • The complexity of memcg page stat synchronization is currently leaking
    into the callsites, forcing them to keep track of the move_lock state and
    the IRQ flags. Simplify the API by tracking it in the memcg.
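
    A sketch of the simplified callsite pattern (function names as they
    appeared around this change; treat the snippet as illustrative):

    /* callers no longer carry the 'locked' and IRQ 'flags' state themselves */
    memcg = mem_cgroup_begin_page_stat(page);
    /* ... update per-memcg page statistics for 'page' ... */
    mem_cgroup_end_page_stat(memcg);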

    Signed-off-by: Johannes Weiner
    Acked-by: Michal Hocko
    Reviewed-by: Vladimir Davydov
    Cc: Wu Fengguang
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Johannes Weiner
     

11 Feb, 2015

4 commits

  • mem_cgroup->memcg_slab_caches is a list of kmem caches corresponding to
    the given cgroup. Currently, it is only used on css free in order to
    destroy all caches corresponding to the memory cgroup being freed. The
    list is protected by memcg_slab_mutex. The mutex is also used to protect
    kmem_cache->memcg_params->memcg_caches arrays and synchronizes
    kmem_cache_destroy vs memcg_unregister_all_caches.

    However, we can perfectly get on without these two. To destroy all caches
    corresponding to a memory cgroup, we can walk over the global list of kmem
    caches, slab_caches, and we can do all the synchronization stuff using the
    slab_mutex instead of the memcg_slab_mutex. This patch therefore gets rid
    of the memcg_slab_caches and memcg_slab_mutex.

    Apart from this nice cleanup, it also:

    - assures that rcu_barrier() is called at most once when a root cache is
    destroyed or a memory cgroup is freed, no matter how many caches have
    SLAB_DESTROY_BY_RCU flag set;

    - fixes the race between kmem_cache_destroy and kmem_cache_create that
    exists, because memcg_cleanup_cache_params, which is called from
    kmem_cache_destroy after checking that kmem_cache->refcount=0,
    releases the slab_mutex, which gives kmem_cache_create a chance to
    make an alias to a cache doomed to be destroyed.

    Signed-off-by: Vladimir Davydov
    Cc: Johannes Weiner
    Cc: Michal Hocko
    Acked-by: Christoph Lameter
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Vladimir Davydov
     
  • Instead of passing the name of the memory cgroup which the cache is
    created for in the memcg_name argument, let's obtain it immediately in
    memcg_create_kmem_cache.

    Signed-off-by: Vladimir Davydov
    Cc: Johannes Weiner
    Cc: Michal Hocko
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Vladimir Davydov
     
  • They are simple wrappers around memcg_{charge,uncharge}_kmem, so let's
    zap them and call these functions directly.

    Signed-off-by: Vladimir Davydov
    Cc: Johannes Weiner
    Cc: Michal Hocko
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Vladimir Davydov
     
  • One bit in ->vm_flags is unused now!

    Signed-off-by: Kirill A. Shutemov
    Cc: Dan Carpenter
    Cc: Michal Hocko
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Kirill A. Shutemov
     

06 Feb, 2015

1 commit

  • It has been reported that 965GM might trigger

    VM_BUG_ON_PAGE(!lrucare && PageLRU(oldpage), oldpage)

    in mem_cgroup_migrate when shmem wants to replace a swap cache page
    because of shmem_should_replace_page (the page is allocated from an
    inappropriate zone). shmem_replace_page expects that the oldpage is not
    on LRU list and calls mem_cgroup_migrate without lrucare. This is
    obviously incorrect because swapcache pages might be on the LRU list
    (e.g. swapin readahead page).

    Fix this by enabling lrucare for the migration in shmem_replace_page.
    Also clarify that lrucare should be used even if one of the pages might
    be on LRU list.

    The BUG_ON will trigger only when CONFIG_DEBUG_VM is enabled, but even
    without that the migration code might leave the old page on an
    inappropriate memcg's LRU, which is not that critical because the page
    would get removed with its last reference, but it is still confusing.
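
    The fix amounts to one argument at the shmem_replace_page() callsite
    (sketched; the three-argument mem_cgroup_migrate() prototype is assumed
    from the description above):

    /* lrucare must be true: a swap cache oldpage may well sit on an LRU */
    mem_cgroup_migrate(oldpage, newpage, true);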

    Fixes: 0a31bc97c80c ("mm: memcontrol: rewrite uncharge API")
    Signed-off-by: Michal Hocko
    Reported-by: Chris Wilson
    Reported-by: Dave Airlie
    Acked-by: Hugh Dickins
    Acked-by: Johannes Weiner
    Cc: [3.17+]
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Michal Hocko