Commit 3567b59aa80ac4417002bf58e35dce5c777d4164
Committed by Al Viro
1 parent acf92b485c
Exists in master and in 6 other branches
vmscan: reduce wind up shrinker->nr when shrinker can't do work
When a shrinker returns -1 to shrink_slab() to indicate it cannot do any work given the current memory reclaim requirements, it adds the entire total_scan count to shrinker->nr. The idea behind this is that when the shrinker is next called and can do work, it will do the work of the previously aborted shrinker call as well. However, if a filesystem is doing lots of allocation with GFP_NOFS set, then we get many, many more aborts from the shrinkers than we do successful calls. The result is that shrinker->nr winds up to its maximum permissible value (twice the current cache size) and then, when the next shrinker call that can do work is issued, it has enough scan count built up to free the entire cache twice over.

This manifests itself in the cache going from full to empty in a matter of seconds, even when only a small part of the cache needs to be emptied to free sufficient memory.

Under metadata-intensive workloads on ext4 and XFS, I'm seeing the VFS caches increase memory consumption up to 75% of memory (no page cache pressure) over a period of 30-60s, and then the shrinker empties them down to zero in the space of 2-3s. This cycle repeats over and over again, with the shrinker completely trashing the inode and dentry caches every minute or so for as long as the workload continues.

This behaviour was made obvious by the shrink_slab tracepoints added earlier in the series, and made worse by the patch that corrected the concurrent accounting of shrinker->nr.

To avoid this problem, stop repeated small increments of the total scan value from winding shrinker->nr up to a value that can cause the entire cache to be freed. We still need to allow it to wind up, so use the delta as the "large scan" threshold check - if the delta is more than a quarter of the entire cache size, then it is a large scan and is allowed to cause lots of windup because we clearly need to free lots of memory.

If it isn't a large scan, then limit the total scan to half the size of the cache so that windup never increases to consume the whole cache. Reducing the total scan limit further does not allow enough wind-up to maintain the current levels of performance, whilst a higher threshold does not prevent the windup from freeing the entire cache under sustained workloads.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
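For reference, below is a minimal, stand-alone userspace sketch of the clamping described above; the helper name, the main() driver and the example numbers are illustrative only and are not part of the kernel patch. Here total_scan is the accumulated scan count, delta the scan work generated by the current call, and max_pass the cache size reported by the shrinker.

#include <stdio.h>

/*
 * Illustrative model of the windup limit: a small delta (less than a
 * quarter of the cache) may not push total_scan beyond half the cache,
 * and total_scan is always capped at twice the cache size.
 */
static unsigned long clamp_total_scan(unsigned long total_scan,
                                      unsigned long delta,
                                      unsigned long max_pass)
{
        if (delta < max_pass / 4 && total_scan > max_pass / 2)
                total_scan = max_pass / 2;   /* limit small-delta windup */
        if (total_scan > max_pass * 2)
                total_scan = max_pass * 2;   /* never scan more than 2x the cache */
        return total_scan;
}

int main(void)
{
        /* 1,000,000 cached objects, 2,000,000 of accumulated windup. */
        printf("%lu\n", clamp_total_scan(2000000UL, 100000UL, 1000000UL));
        /* prints 500000: the delta is small, so windup is limited to max_pass / 2 */
        return 0;
}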
Showing 1 changed file with 15 additions and 0 deletions
mm/vmscan.c
| 1 | /* | 1 | /* |
| 2 | * linux/mm/vmscan.c | 2 | * linux/mm/vmscan.c |
| 3 | * | 3 | * |
| 4 | * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds | 4 | * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds |
| 5 | * | 5 | * |
| 6 | * Swap reorganised 29.12.95, Stephen Tweedie. | 6 | * Swap reorganised 29.12.95, Stephen Tweedie. |
| 7 | * kswapd added: 7.1.96 sct | 7 | * kswapd added: 7.1.96 sct |
| 8 | * Removed kswapd_ctl limits, and swap out as many pages as needed | 8 | * Removed kswapd_ctl limits, and swap out as many pages as needed |
| 9 | * to bring the system back to freepages.high: 2.4.97, Rik van Riel. | 9 | * to bring the system back to freepages.high: 2.4.97, Rik van Riel. |
| 10 | * Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com). | 10 | * Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com). |
| 11 | * Multiqueue VM started 5.8.00, Rik van Riel. | 11 | * Multiqueue VM started 5.8.00, Rik van Riel. |
| 12 | */ | 12 | */ |
| 13 | 13 | ||
| 14 | #include <linux/mm.h> | 14 | #include <linux/mm.h> |
| 15 | #include <linux/module.h> | 15 | #include <linux/module.h> |
| 16 | #include <linux/gfp.h> | 16 | #include <linux/gfp.h> |
| 17 | #include <linux/kernel_stat.h> | 17 | #include <linux/kernel_stat.h> |
| 18 | #include <linux/swap.h> | 18 | #include <linux/swap.h> |
| 19 | #include <linux/pagemap.h> | 19 | #include <linux/pagemap.h> |
| 20 | #include <linux/init.h> | 20 | #include <linux/init.h> |
| 21 | #include <linux/highmem.h> | 21 | #include <linux/highmem.h> |
| 22 | #include <linux/vmstat.h> | 22 | #include <linux/vmstat.h> |
| 23 | #include <linux/file.h> | 23 | #include <linux/file.h> |
| 24 | #include <linux/writeback.h> | 24 | #include <linux/writeback.h> |
| 25 | #include <linux/blkdev.h> | 25 | #include <linux/blkdev.h> |
| 26 | #include <linux/buffer_head.h> /* for try_to_release_page(), | 26 | #include <linux/buffer_head.h> /* for try_to_release_page(), |
| 27 | buffer_heads_over_limit */ | 27 | buffer_heads_over_limit */ |
| 28 | #include <linux/mm_inline.h> | 28 | #include <linux/mm_inline.h> |
| 29 | #include <linux/pagevec.h> | 29 | #include <linux/pagevec.h> |
| 30 | #include <linux/backing-dev.h> | 30 | #include <linux/backing-dev.h> |
| 31 | #include <linux/rmap.h> | 31 | #include <linux/rmap.h> |
| 32 | #include <linux/topology.h> | 32 | #include <linux/topology.h> |
| 33 | #include <linux/cpu.h> | 33 | #include <linux/cpu.h> |
| 34 | #include <linux/cpuset.h> | 34 | #include <linux/cpuset.h> |
| 35 | #include <linux/compaction.h> | 35 | #include <linux/compaction.h> |
| 36 | #include <linux/notifier.h> | 36 | #include <linux/notifier.h> |
| 37 | #include <linux/rwsem.h> | 37 | #include <linux/rwsem.h> |
| 38 | #include <linux/delay.h> | 38 | #include <linux/delay.h> |
| 39 | #include <linux/kthread.h> | 39 | #include <linux/kthread.h> |
| 40 | #include <linux/freezer.h> | 40 | #include <linux/freezer.h> |
| 41 | #include <linux/memcontrol.h> | 41 | #include <linux/memcontrol.h> |
| 42 | #include <linux/delayacct.h> | 42 | #include <linux/delayacct.h> |
| 43 | #include <linux/sysctl.h> | 43 | #include <linux/sysctl.h> |
| 44 | #include <linux/oom.h> | 44 | #include <linux/oom.h> |
| 45 | #include <linux/prefetch.h> | 45 | #include <linux/prefetch.h> |
| 46 | 46 | ||
| 47 | #include <asm/tlbflush.h> | 47 | #include <asm/tlbflush.h> |
| 48 | #include <asm/div64.h> | 48 | #include <asm/div64.h> |
| 49 | 49 | ||
| 50 | #include <linux/swapops.h> | 50 | #include <linux/swapops.h> |
| 51 | 51 | ||
| 52 | #include "internal.h" | 52 | #include "internal.h" |
| 53 | 53 | ||
| 54 | #define CREATE_TRACE_POINTS | 54 | #define CREATE_TRACE_POINTS |
| 55 | #include <trace/events/vmscan.h> | 55 | #include <trace/events/vmscan.h> |
| 56 | 56 | ||
| 57 | /* | 57 | /* |
| 58 | * reclaim_mode determines how the inactive list is shrunk | 58 | * reclaim_mode determines how the inactive list is shrunk |
| 59 | * RECLAIM_MODE_SINGLE: Reclaim only order-0 pages | 59 | * RECLAIM_MODE_SINGLE: Reclaim only order-0 pages |
| 60 | * RECLAIM_MODE_ASYNC: Do not block | 60 | * RECLAIM_MODE_ASYNC: Do not block |
| 61 | * RECLAIM_MODE_SYNC: Allow blocking e.g. call wait_on_page_writeback | 61 | * RECLAIM_MODE_SYNC: Allow blocking e.g. call wait_on_page_writeback |
| 62 | * RECLAIM_MODE_LUMPYRECLAIM: For high-order allocations, take a reference | 62 | * RECLAIM_MODE_LUMPYRECLAIM: For high-order allocations, take a reference |
| 63 | * page from the LRU and reclaim all pages within a | 63 | * page from the LRU and reclaim all pages within a |
| 64 | * naturally aligned range | 64 | * naturally aligned range |
| 65 | * RECLAIM_MODE_COMPACTION: For high-order allocations, reclaim a number of | 65 | * RECLAIM_MODE_COMPACTION: For high-order allocations, reclaim a number of |
| 66 | * order-0 pages and then compact the zone | 66 | * order-0 pages and then compact the zone |
| 67 | */ | 67 | */ |
| 68 | typedef unsigned __bitwise__ reclaim_mode_t; | 68 | typedef unsigned __bitwise__ reclaim_mode_t; |
| 69 | #define RECLAIM_MODE_SINGLE ((__force reclaim_mode_t)0x01u) | 69 | #define RECLAIM_MODE_SINGLE ((__force reclaim_mode_t)0x01u) |
| 70 | #define RECLAIM_MODE_ASYNC ((__force reclaim_mode_t)0x02u) | 70 | #define RECLAIM_MODE_ASYNC ((__force reclaim_mode_t)0x02u) |
| 71 | #define RECLAIM_MODE_SYNC ((__force reclaim_mode_t)0x04u) | 71 | #define RECLAIM_MODE_SYNC ((__force reclaim_mode_t)0x04u) |
| 72 | #define RECLAIM_MODE_LUMPYRECLAIM ((__force reclaim_mode_t)0x08u) | 72 | #define RECLAIM_MODE_LUMPYRECLAIM ((__force reclaim_mode_t)0x08u) |
| 73 | #define RECLAIM_MODE_COMPACTION ((__force reclaim_mode_t)0x10u) | 73 | #define RECLAIM_MODE_COMPACTION ((__force reclaim_mode_t)0x10u) |
| 74 | 74 | ||
| 75 | struct scan_control { | 75 | struct scan_control { |
| 76 | /* Incremented by the number of inactive pages that were scanned */ | 76 | /* Incremented by the number of inactive pages that were scanned */ |
| 77 | unsigned long nr_scanned; | 77 | unsigned long nr_scanned; |
| 78 | 78 | ||
| 79 | /* Number of pages freed so far during a call to shrink_zones() */ | 79 | /* Number of pages freed so far during a call to shrink_zones() */ |
| 80 | unsigned long nr_reclaimed; | 80 | unsigned long nr_reclaimed; |
| 81 | 81 | ||
| 82 | /* How many pages shrink_list() should reclaim */ | 82 | /* How many pages shrink_list() should reclaim */ |
| 83 | unsigned long nr_to_reclaim; | 83 | unsigned long nr_to_reclaim; |
| 84 | 84 | ||
| 85 | unsigned long hibernation_mode; | 85 | unsigned long hibernation_mode; |
| 86 | 86 | ||
| 87 | /* This context's GFP mask */ | 87 | /* This context's GFP mask */ |
| 88 | gfp_t gfp_mask; | 88 | gfp_t gfp_mask; |
| 89 | 89 | ||
| 90 | int may_writepage; | 90 | int may_writepage; |
| 91 | 91 | ||
| 92 | /* Can mapped pages be reclaimed? */ | 92 | /* Can mapped pages be reclaimed? */ |
| 93 | int may_unmap; | 93 | int may_unmap; |
| 94 | 94 | ||
| 95 | /* Can pages be swapped as part of reclaim? */ | 95 | /* Can pages be swapped as part of reclaim? */ |
| 96 | int may_swap; | 96 | int may_swap; |
| 97 | 97 | ||
| 98 | int swappiness; | 98 | int swappiness; |
| 99 | 99 | ||
| 100 | int order; | 100 | int order; |
| 101 | 101 | ||
| 102 | /* | 102 | /* |
| 103 | * Intend to reclaim enough continuous memory rather than reclaim | 103 | * Intend to reclaim enough continuous memory rather than reclaim |
| 104 | * enough amount of memory. i.e, mode for high order allocation. | 104 | * enough amount of memory. i.e, mode for high order allocation. |
| 105 | */ | 105 | */ |
| 106 | reclaim_mode_t reclaim_mode; | 106 | reclaim_mode_t reclaim_mode; |
| 107 | 107 | ||
| 108 | /* Which cgroup do we reclaim from */ | 108 | /* Which cgroup do we reclaim from */ |
| 109 | struct mem_cgroup *mem_cgroup; | 109 | struct mem_cgroup *mem_cgroup; |
| 110 | 110 | ||
| 111 | /* | 111 | /* |
| 112 | * Nodemask of nodes allowed by the caller. If NULL, all nodes | 112 | * Nodemask of nodes allowed by the caller. If NULL, all nodes |
| 113 | * are scanned. | 113 | * are scanned. |
| 114 | */ | 114 | */ |
| 115 | nodemask_t *nodemask; | 115 | nodemask_t *nodemask; |
| 116 | }; | 116 | }; |
| 117 | 117 | ||
| 118 | #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) | 118 | #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) |
| 119 | 119 | ||
| 120 | #ifdef ARCH_HAS_PREFETCH | 120 | #ifdef ARCH_HAS_PREFETCH |
| 121 | #define prefetch_prev_lru_page(_page, _base, _field) \ | 121 | #define prefetch_prev_lru_page(_page, _base, _field) \ |
| 122 | do { \ | 122 | do { \ |
| 123 | if ((_page)->lru.prev != _base) { \ | 123 | if ((_page)->lru.prev != _base) { \ |
| 124 | struct page *prev; \ | 124 | struct page *prev; \ |
| 125 | \ | 125 | \ |
| 126 | prev = lru_to_page(&(_page->lru)); \ | 126 | prev = lru_to_page(&(_page->lru)); \ |
| 127 | prefetch(&prev->_field); \ | 127 | prefetch(&prev->_field); \ |
| 128 | } \ | 128 | } \ |
| 129 | } while (0) | 129 | } while (0) |
| 130 | #else | 130 | #else |
| 131 | #define prefetch_prev_lru_page(_page, _base, _field) do { } while (0) | 131 | #define prefetch_prev_lru_page(_page, _base, _field) do { } while (0) |
| 132 | #endif | 132 | #endif |
| 133 | 133 | ||
| 134 | #ifdef ARCH_HAS_PREFETCHW | 134 | #ifdef ARCH_HAS_PREFETCHW |
| 135 | #define prefetchw_prev_lru_page(_page, _base, _field) \ | 135 | #define prefetchw_prev_lru_page(_page, _base, _field) \ |
| 136 | do { \ | 136 | do { \ |
| 137 | if ((_page)->lru.prev != _base) { \ | 137 | if ((_page)->lru.prev != _base) { \ |
| 138 | struct page *prev; \ | 138 | struct page *prev; \ |
| 139 | \ | 139 | \ |
| 140 | prev = lru_to_page(&(_page->lru)); \ | 140 | prev = lru_to_page(&(_page->lru)); \ |
| 141 | prefetchw(&prev->_field); \ | 141 | prefetchw(&prev->_field); \ |
| 142 | } \ | 142 | } \ |
| 143 | } while (0) | 143 | } while (0) |
| 144 | #else | 144 | #else |
| 145 | #define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0) | 145 | #define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0) |
| 146 | #endif | 146 | #endif |
| 147 | 147 | ||
| 148 | /* | 148 | /* |
| 149 | * From 0 .. 100. Higher means more swappy. | 149 | * From 0 .. 100. Higher means more swappy. |
| 150 | */ | 150 | */ |
| 151 | int vm_swappiness = 60; | 151 | int vm_swappiness = 60; |
| 152 | long vm_total_pages; /* The total number of pages which the VM controls */ | 152 | long vm_total_pages; /* The total number of pages which the VM controls */ |
| 153 | 153 | ||
| 154 | static LIST_HEAD(shrinker_list); | 154 | static LIST_HEAD(shrinker_list); |
| 155 | static DECLARE_RWSEM(shrinker_rwsem); | 155 | static DECLARE_RWSEM(shrinker_rwsem); |
| 156 | 156 | ||
| 157 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | 157 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR |
| 158 | #define scanning_global_lru(sc) (!(sc)->mem_cgroup) | 158 | #define scanning_global_lru(sc) (!(sc)->mem_cgroup) |
| 159 | #else | 159 | #else |
| 160 | #define scanning_global_lru(sc) (1) | 160 | #define scanning_global_lru(sc) (1) |
| 161 | #endif | 161 | #endif |
| 162 | 162 | ||
| 163 | static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone, | 163 | static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone, |
| 164 | struct scan_control *sc) | 164 | struct scan_control *sc) |
| 165 | { | 165 | { |
| 166 | if (!scanning_global_lru(sc)) | 166 | if (!scanning_global_lru(sc)) |
| 167 | return mem_cgroup_get_reclaim_stat(sc->mem_cgroup, zone); | 167 | return mem_cgroup_get_reclaim_stat(sc->mem_cgroup, zone); |
| 168 | 168 | ||
| 169 | return &zone->reclaim_stat; | 169 | return &zone->reclaim_stat; |
| 170 | } | 170 | } |
| 171 | 171 | ||
| 172 | static unsigned long zone_nr_lru_pages(struct zone *zone, | 172 | static unsigned long zone_nr_lru_pages(struct zone *zone, |
| 173 | struct scan_control *sc, enum lru_list lru) | 173 | struct scan_control *sc, enum lru_list lru) |
| 174 | { | 174 | { |
| 175 | if (!scanning_global_lru(sc)) | 175 | if (!scanning_global_lru(sc)) |
| 176 | return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup, zone, lru); | 176 | return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup, zone, lru); |
| 177 | 177 | ||
| 178 | return zone_page_state(zone, NR_LRU_BASE + lru); | 178 | return zone_page_state(zone, NR_LRU_BASE + lru); |
| 179 | } | 179 | } |
| 180 | 180 | ||
| 181 | 181 | ||
| 182 | /* | 182 | /* |
| 183 | * Add a shrinker callback to be called from the vm | 183 | * Add a shrinker callback to be called from the vm |
| 184 | */ | 184 | */ |
| 185 | void register_shrinker(struct shrinker *shrinker) | 185 | void register_shrinker(struct shrinker *shrinker) |
| 186 | { | 186 | { |
| 187 | shrinker->nr = 0; | 187 | shrinker->nr = 0; |
| 188 | down_write(&shrinker_rwsem); | 188 | down_write(&shrinker_rwsem); |
| 189 | list_add_tail(&shrinker->list, &shrinker_list); | 189 | list_add_tail(&shrinker->list, &shrinker_list); |
| 190 | up_write(&shrinker_rwsem); | 190 | up_write(&shrinker_rwsem); |
| 191 | } | 191 | } |
| 192 | EXPORT_SYMBOL(register_shrinker); | 192 | EXPORT_SYMBOL(register_shrinker); |
| 193 | 193 | ||
| 194 | /* | 194 | /* |
| 195 | * Remove one | 195 | * Remove one |
| 196 | */ | 196 | */ |
| 197 | void unregister_shrinker(struct shrinker *shrinker) | 197 | void unregister_shrinker(struct shrinker *shrinker) |
| 198 | { | 198 | { |
| 199 | down_write(&shrinker_rwsem); | 199 | down_write(&shrinker_rwsem); |
| 200 | list_del(&shrinker->list); | 200 | list_del(&shrinker->list); |
| 201 | up_write(&shrinker_rwsem); | 201 | up_write(&shrinker_rwsem); |
| 202 | } | 202 | } |
| 203 | EXPORT_SYMBOL(unregister_shrinker); | 203 | EXPORT_SYMBOL(unregister_shrinker); |
| 204 | 204 | ||
| 205 | static inline int do_shrinker_shrink(struct shrinker *shrinker, | 205 | static inline int do_shrinker_shrink(struct shrinker *shrinker, |
| 206 | struct shrink_control *sc, | 206 | struct shrink_control *sc, |
| 207 | unsigned long nr_to_scan) | 207 | unsigned long nr_to_scan) |
| 208 | { | 208 | { |
| 209 | sc->nr_to_scan = nr_to_scan; | 209 | sc->nr_to_scan = nr_to_scan; |
| 210 | return (*shrinker->shrink)(shrinker, sc); | 210 | return (*shrinker->shrink)(shrinker, sc); |
| 211 | } | 211 | } |
| 212 | 212 | ||
| 213 | #define SHRINK_BATCH 128 | 213 | #define SHRINK_BATCH 128 |
| 214 | /* | 214 | /* |
| 215 | * Call the shrink functions to age shrinkable caches | 215 | * Call the shrink functions to age shrinkable caches |
| 216 | * | 216 | * |
| 217 | * Here we assume it costs one seek to replace a lru page and that it also | 217 | * Here we assume it costs one seek to replace a lru page and that it also |
| 218 | * takes a seek to recreate a cache object. With this in mind we age equal | 218 | * takes a seek to recreate a cache object. With this in mind we age equal |
| 219 | * percentages of the lru and ageable caches. This should balance the seeks | 219 | * percentages of the lru and ageable caches. This should balance the seeks |
| 220 | * generated by these structures. | 220 | * generated by these structures. |
| 221 | * | 221 | * |
| 222 | * If the vm encountered mapped pages on the LRU it increase the pressure on | 222 | * If the vm encountered mapped pages on the LRU it increase the pressure on |
| 223 | * slab to avoid swapping. | 223 | * slab to avoid swapping. |
| 224 | * | 224 | * |
| 225 | * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits. | 225 | * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits. |
| 226 | * | 226 | * |
| 227 | * `lru_pages' represents the number of on-LRU pages in all the zones which | 227 | * `lru_pages' represents the number of on-LRU pages in all the zones which |
| 228 | * are eligible for the caller's allocation attempt. It is used for balancing | 228 | * are eligible for the caller's allocation attempt. It is used for balancing |
| 229 | * slab reclaim versus page reclaim. | 229 | * slab reclaim versus page reclaim. |
| 230 | * | 230 | * |
| 231 | * Returns the number of slab objects which we shrunk. | 231 | * Returns the number of slab objects which we shrunk. |
| 232 | */ | 232 | */ |
| 233 | unsigned long shrink_slab(struct shrink_control *shrink, | 233 | unsigned long shrink_slab(struct shrink_control *shrink, |
| 234 | unsigned long nr_pages_scanned, | 234 | unsigned long nr_pages_scanned, |
| 235 | unsigned long lru_pages) | 235 | unsigned long lru_pages) |
| 236 | { | 236 | { |
| 237 | struct shrinker *shrinker; | 237 | struct shrinker *shrinker; |
| 238 | unsigned long ret = 0; | 238 | unsigned long ret = 0; |
| 239 | 239 | ||
| 240 | if (nr_pages_scanned == 0) | 240 | if (nr_pages_scanned == 0) |
| 241 | nr_pages_scanned = SWAP_CLUSTER_MAX; | 241 | nr_pages_scanned = SWAP_CLUSTER_MAX; |
| 242 | 242 | ||
| 243 | if (!down_read_trylock(&shrinker_rwsem)) { | 243 | if (!down_read_trylock(&shrinker_rwsem)) { |
| 244 | /* Assume we'll be able to shrink next time */ | 244 | /* Assume we'll be able to shrink next time */ |
| 245 | ret = 1; | 245 | ret = 1; |
| 246 | goto out; | 246 | goto out; |
| 247 | } | 247 | } |
| 248 | 248 | ||
| 249 | list_for_each_entry(shrinker, &shrinker_list, list) { | 249 | list_for_each_entry(shrinker, &shrinker_list, list) { |
| 250 | unsigned long long delta; | 250 | unsigned long long delta; |
| 251 | unsigned long total_scan; | 251 | unsigned long total_scan; |
| 252 | unsigned long max_pass; | 252 | unsigned long max_pass; |
| 253 | int shrink_ret = 0; | 253 | int shrink_ret = 0; |
| 254 | long nr; | 254 | long nr; |
| 255 | long new_nr; | 255 | long new_nr; |
| 256 | 256 | ||
| 257 | /* | 257 | /* |
| 258 | * copy the current shrinker scan count into a local variable | 258 | * copy the current shrinker scan count into a local variable |
| 259 | * and zero it so that other concurrent shrinker invocations | 259 | * and zero it so that other concurrent shrinker invocations |
| 260 | * don't also do this scanning work. | 260 | * don't also do this scanning work. |
| 261 | */ | 261 | */ |
| 262 | do { | 262 | do { |
| 263 | nr = shrinker->nr; | 263 | nr = shrinker->nr; |
| 264 | } while (cmpxchg(&shrinker->nr, nr, 0) != nr); | 264 | } while (cmpxchg(&shrinker->nr, nr, 0) != nr); |
| 265 | 265 | ||
| 266 | total_scan = nr; | 266 | total_scan = nr; |
| 267 | max_pass = do_shrinker_shrink(shrinker, shrink, 0); | 267 | max_pass = do_shrinker_shrink(shrinker, shrink, 0); |
| 268 | delta = (4 * nr_pages_scanned) / shrinker->seeks; | 268 | delta = (4 * nr_pages_scanned) / shrinker->seeks; |
| 269 | delta *= max_pass; | 269 | delta *= max_pass; |
| 270 | do_div(delta, lru_pages + 1); | 270 | do_div(delta, lru_pages + 1); |
| 271 | total_scan += delta; | 271 | total_scan += delta; |
| 272 | if (total_scan < 0) { | 272 | if (total_scan < 0) { |
| 273 | printk(KERN_ERR "shrink_slab: %pF negative objects to " | 273 | printk(KERN_ERR "shrink_slab: %pF negative objects to " |
| 274 | "delete nr=%ld\n", | 274 | "delete nr=%ld\n", |
| 275 | shrinker->shrink, total_scan); | 275 | shrinker->shrink, total_scan); |
| 276 | total_scan = max_pass; | 276 | total_scan = max_pass; |
| 277 | } | 277 | } |
| 278 | 278 | ||
| 279 | /* | 279 | /* |
| 280 | * We need to avoid excessive windup on filesystem shrinkers | ||
| 281 | * due to large numbers of GFP_NOFS allocations causing the | ||
| 282 | * shrinkers to return -1 all the time. This results in a large | ||
| 283 | * nr being built up so when a shrink that can do some work | ||
| 284 | * comes along it empties the entire cache due to nr >>> | ||
| 285 | * max_pass. This is bad for sustaining a working set in | ||
| 286 | * memory. | ||
| 287 | * | ||
| 288 | * Hence only allow the shrinker to scan the entire cache when | ||
| 289 | * a large delta change is calculated directly. | ||
| 290 | */ | ||
| 291 | if (delta < max_pass / 4) | ||
| 292 | total_scan = min(total_scan, max_pass / 2); | ||
| 293 | |||
| 294 | /* | ||
| 280 | * Avoid risking looping forever due to too large nr value: | 295 | * Avoid risking looping forever due to too large nr value: |
| 281 | * never try to free more than twice the estimate number of | 296 | * never try to free more than twice the estimate number of |
| 282 | * freeable entries. | 297 | * freeable entries. |
| 283 | */ | 298 | */ |
| 284 | if (total_scan > max_pass * 2) | 299 | if (total_scan > max_pass * 2) |
| 285 | total_scan = max_pass * 2; | 300 | total_scan = max_pass * 2; |
| 286 | 301 | ||
| 287 | trace_mm_shrink_slab_start(shrinker, shrink, nr, | 302 | trace_mm_shrink_slab_start(shrinker, shrink, nr, |
| 288 | nr_pages_scanned, lru_pages, | 303 | nr_pages_scanned, lru_pages, |
| 289 | max_pass, delta, total_scan); | 304 | max_pass, delta, total_scan); |
| 290 | 305 | ||
| 291 | while (total_scan >= SHRINK_BATCH) { | 306 | while (total_scan >= SHRINK_BATCH) { |
| 292 | long this_scan = SHRINK_BATCH; | 307 | long this_scan = SHRINK_BATCH; |
| 293 | int nr_before; | 308 | int nr_before; |
| 294 | 309 | ||
| 295 | nr_before = do_shrinker_shrink(shrinker, shrink, 0); | 310 | nr_before = do_shrinker_shrink(shrinker, shrink, 0); |
| 296 | shrink_ret = do_shrinker_shrink(shrinker, shrink, | 311 | shrink_ret = do_shrinker_shrink(shrinker, shrink, |
| 297 | this_scan); | 312 | this_scan); |
| 298 | if (shrink_ret == -1) | 313 | if (shrink_ret == -1) |
| 299 | break; | 314 | break; |
| 300 | if (shrink_ret < nr_before) | 315 | if (shrink_ret < nr_before) |
| 301 | ret += nr_before - shrink_ret; | 316 | ret += nr_before - shrink_ret; |
| 302 | count_vm_events(SLABS_SCANNED, this_scan); | 317 | count_vm_events(SLABS_SCANNED, this_scan); |
| 303 | total_scan -= this_scan; | 318 | total_scan -= this_scan; |
| 304 | 319 | ||
| 305 | cond_resched(); | 320 | cond_resched(); |
| 306 | } | 321 | } |
| 307 | 322 | ||
| 308 | /* | 323 | /* |
| 309 | * move the unused scan count back into the shrinker in a | 324 | * move the unused scan count back into the shrinker in a |
| 310 | * manner that handles concurrent updates. If we exhausted the | 325 | * manner that handles concurrent updates. If we exhausted the |
| 311 | * scan, there is no need to do an update. | 326 | * scan, there is no need to do an update. |
| 312 | */ | 327 | */ |
| 313 | do { | 328 | do { |
| 314 | nr = shrinker->nr; | 329 | nr = shrinker->nr; |
| 315 | new_nr = total_scan + nr; | 330 | new_nr = total_scan + nr; |
| 316 | if (total_scan <= 0) | 331 | if (total_scan <= 0) |
| 317 | break; | 332 | break; |
| 318 | } while (cmpxchg(&shrinker->nr, nr, new_nr) != nr); | 333 | } while (cmpxchg(&shrinker->nr, nr, new_nr) != nr); |
| 319 | 334 | ||
| 320 | trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr); | 335 | trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr); |
| 321 | } | 336 | } |
| 322 | up_read(&shrinker_rwsem); | 337 | up_read(&shrinker_rwsem); |
| 323 | out: | 338 | out: |
| 324 | cond_resched(); | 339 | cond_resched(); |
| 325 | return ret; | 340 | return ret; |
| 326 | } | 341 | } |
| 327 | 342 | ||
| 328 | static void set_reclaim_mode(int priority, struct scan_control *sc, | 343 | static void set_reclaim_mode(int priority, struct scan_control *sc, |
| 329 | bool sync) | 344 | bool sync) |
| 330 | { | 345 | { |
| 331 | reclaim_mode_t syncmode = sync ? RECLAIM_MODE_SYNC : RECLAIM_MODE_ASYNC; | 346 | reclaim_mode_t syncmode = sync ? RECLAIM_MODE_SYNC : RECLAIM_MODE_ASYNC; |
| 332 | 347 | ||
| 333 | /* | 348 | /* |
| 334 | * Initially assume we are entering either lumpy reclaim or | 349 | * Initially assume we are entering either lumpy reclaim or |
| 335 | * reclaim/compaction.Depending on the order, we will either set the | 350 | * reclaim/compaction.Depending on the order, we will either set the |
| 336 | * sync mode or just reclaim order-0 pages later. | 351 | * sync mode or just reclaim order-0 pages later. |
| 337 | */ | 352 | */ |
| 338 | if (COMPACTION_BUILD) | 353 | if (COMPACTION_BUILD) |
| 339 | sc->reclaim_mode = RECLAIM_MODE_COMPACTION; | 354 | sc->reclaim_mode = RECLAIM_MODE_COMPACTION; |
| 340 | else | 355 | else |
| 341 | sc->reclaim_mode = RECLAIM_MODE_LUMPYRECLAIM; | 356 | sc->reclaim_mode = RECLAIM_MODE_LUMPYRECLAIM; |
| 342 | 357 | ||
| 343 | /* | 358 | /* |
| 344 | * Avoid using lumpy reclaim or reclaim/compaction if possible by | 359 | * Avoid using lumpy reclaim or reclaim/compaction if possible by |
| 345 | * restricting when its set to either costly allocations or when | 360 | * restricting when its set to either costly allocations or when |
| 346 | * under memory pressure | 361 | * under memory pressure |
| 347 | */ | 362 | */ |
| 348 | if (sc->order > PAGE_ALLOC_COSTLY_ORDER) | 363 | if (sc->order > PAGE_ALLOC_COSTLY_ORDER) |
| 349 | sc->reclaim_mode |= syncmode; | 364 | sc->reclaim_mode |= syncmode; |
| 350 | else if (sc->order && priority < DEF_PRIORITY - 2) | 365 | else if (sc->order && priority < DEF_PRIORITY - 2) |
| 351 | sc->reclaim_mode |= syncmode; | 366 | sc->reclaim_mode |= syncmode; |
| 352 | else | 367 | else |
| 353 | sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC; | 368 | sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC; |
| 354 | } | 369 | } |
| 355 | 370 | ||
| 356 | static void reset_reclaim_mode(struct scan_control *sc) | 371 | static void reset_reclaim_mode(struct scan_control *sc) |
| 357 | { | 372 | { |
| 358 | sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC; | 373 | sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC; |
| 359 | } | 374 | } |
| 360 | 375 | ||
| 361 | static inline int is_page_cache_freeable(struct page *page) | 376 | static inline int is_page_cache_freeable(struct page *page) |
| 362 | { | 377 | { |
| 363 | /* | 378 | /* |
| 364 | * A freeable page cache page is referenced only by the caller | 379 | * A freeable page cache page is referenced only by the caller |
| 365 | * that isolated the page, the page cache radix tree and | 380 | * that isolated the page, the page cache radix tree and |
| 366 | * optional buffer heads at page->private. | 381 | * optional buffer heads at page->private. |
| 367 | */ | 382 | */ |
| 368 | return page_count(page) - page_has_private(page) == 2; | 383 | return page_count(page) - page_has_private(page) == 2; |
| 369 | } | 384 | } |
| 370 | 385 | ||
| 371 | static int may_write_to_queue(struct backing_dev_info *bdi, | 386 | static int may_write_to_queue(struct backing_dev_info *bdi, |
| 372 | struct scan_control *sc) | 387 | struct scan_control *sc) |
| 373 | { | 388 | { |
| 374 | if (current->flags & PF_SWAPWRITE) | 389 | if (current->flags & PF_SWAPWRITE) |
| 375 | return 1; | 390 | return 1; |
| 376 | if (!bdi_write_congested(bdi)) | 391 | if (!bdi_write_congested(bdi)) |
| 377 | return 1; | 392 | return 1; |
| 378 | if (bdi == current->backing_dev_info) | 393 | if (bdi == current->backing_dev_info) |
| 379 | return 1; | 394 | return 1; |
| 380 | 395 | ||
| 381 | /* lumpy reclaim for hugepage often need a lot of write */ | 396 | /* lumpy reclaim for hugepage often need a lot of write */ |
| 382 | if (sc->order > PAGE_ALLOC_COSTLY_ORDER) | 397 | if (sc->order > PAGE_ALLOC_COSTLY_ORDER) |
| 383 | return 1; | 398 | return 1; |
| 384 | return 0; | 399 | return 0; |
| 385 | } | 400 | } |
| 386 | 401 | ||
| 387 | /* | 402 | /* |
| 388 | * We detected a synchronous write error writing a page out. Probably | 403 | * We detected a synchronous write error writing a page out. Probably |
| 389 | * -ENOSPC. We need to propagate that into the address_space for a subsequent | 404 | * -ENOSPC. We need to propagate that into the address_space for a subsequent |
| 390 | * fsync(), msync() or close(). | 405 | * fsync(), msync() or close(). |
| 391 | * | 406 | * |
| 392 | * The tricky part is that after writepage we cannot touch the mapping: nothing | 407 | * The tricky part is that after writepage we cannot touch the mapping: nothing |
| 393 | * prevents it from being freed up. But we have a ref on the page and once | 408 | * prevents it from being freed up. But we have a ref on the page and once |
| 394 | * that page is locked, the mapping is pinned. | 409 | * that page is locked, the mapping is pinned. |
| 395 | * | 410 | * |
| 396 | * We're allowed to run sleeping lock_page() here because we know the caller has | 411 | * We're allowed to run sleeping lock_page() here because we know the caller has |
| 397 | * __GFP_FS. | 412 | * __GFP_FS. |
| 398 | */ | 413 | */ |
| 399 | static void handle_write_error(struct address_space *mapping, | 414 | static void handle_write_error(struct address_space *mapping, |
| 400 | struct page *page, int error) | 415 | struct page *page, int error) |
| 401 | { | 416 | { |
| 402 | lock_page(page); | 417 | lock_page(page); |
| 403 | if (page_mapping(page) == mapping) | 418 | if (page_mapping(page) == mapping) |
| 404 | mapping_set_error(mapping, error); | 419 | mapping_set_error(mapping, error); |
| 405 | unlock_page(page); | 420 | unlock_page(page); |
| 406 | } | 421 | } |
| 407 | 422 | ||
| 408 | /* possible outcome of pageout() */ | 423 | /* possible outcome of pageout() */ |
| 409 | typedef enum { | 424 | typedef enum { |
| 410 | /* failed to write page out, page is locked */ | 425 | /* failed to write page out, page is locked */ |
| 411 | PAGE_KEEP, | 426 | PAGE_KEEP, |
| 412 | /* move page to the active list, page is locked */ | 427 | /* move page to the active list, page is locked */ |
| 413 | PAGE_ACTIVATE, | 428 | PAGE_ACTIVATE, |
| 414 | /* page has been sent to the disk successfully, page is unlocked */ | 429 | /* page has been sent to the disk successfully, page is unlocked */ |
| 415 | PAGE_SUCCESS, | 430 | PAGE_SUCCESS, |
| 416 | /* page is clean and locked */ | 431 | /* page is clean and locked */ |
| 417 | PAGE_CLEAN, | 432 | PAGE_CLEAN, |
| 418 | } pageout_t; | 433 | } pageout_t; |
| 419 | 434 | ||
| 420 | /* | 435 | /* |
| 421 | * pageout is called by shrink_page_list() for each dirty page. | 436 | * pageout is called by shrink_page_list() for each dirty page. |
| 422 | * Calls ->writepage(). | 437 | * Calls ->writepage(). |
| 423 | */ | 438 | */ |
| 424 | static pageout_t pageout(struct page *page, struct address_space *mapping, | 439 | static pageout_t pageout(struct page *page, struct address_space *mapping, |
| 425 | struct scan_control *sc) | 440 | struct scan_control *sc) |
| 426 | { | 441 | { |
| 427 | /* | 442 | /* |
| 428 | * If the page is dirty, only perform writeback if that write | 443 | * If the page is dirty, only perform writeback if that write |
| 429 | * will be non-blocking. To prevent this allocation from being | 444 | * will be non-blocking. To prevent this allocation from being |
| 430 | * stalled by pagecache activity. But note that there may be | 445 | * stalled by pagecache activity. But note that there may be |
| 431 | * stalls if we need to run get_block(). We could test | 446 | * stalls if we need to run get_block(). We could test |
| 432 | * PagePrivate for that. | 447 | * PagePrivate for that. |
| 433 | * | 448 | * |
| 434 | * If this process is currently in __generic_file_aio_write() against | 449 | * If this process is currently in __generic_file_aio_write() against |
| 435 | * this page's queue, we can perform writeback even if that | 450 | * this page's queue, we can perform writeback even if that |
| 436 | * will block. | 451 | * will block. |
| 437 | * | 452 | * |
| 438 | * If the page is swapcache, write it back even if that would | 453 | * If the page is swapcache, write it back even if that would |
| 439 | * block, for some throttling. This happens by accident, because | 454 | * block, for some throttling. This happens by accident, because |
| 440 | * swap_backing_dev_info is bust: it doesn't reflect the | 455 | * swap_backing_dev_info is bust: it doesn't reflect the |
| 441 | * congestion state of the swapdevs. Easy to fix, if needed. | 456 | * congestion state of the swapdevs. Easy to fix, if needed. |
| 442 | */ | 457 | */ |
| 443 | if (!is_page_cache_freeable(page)) | 458 | if (!is_page_cache_freeable(page)) |
| 444 | return PAGE_KEEP; | 459 | return PAGE_KEEP; |
| 445 | if (!mapping) { | 460 | if (!mapping) { |
| 446 | /* | 461 | /* |
| 447 | * Some data journaling orphaned pages can have | 462 | * Some data journaling orphaned pages can have |
| 448 | * page->mapping == NULL while being dirty with clean buffers. | 463 | * page->mapping == NULL while being dirty with clean buffers. |
| 449 | */ | 464 | */ |
| 450 | if (page_has_private(page)) { | 465 | if (page_has_private(page)) { |
| 451 | if (try_to_free_buffers(page)) { | 466 | if (try_to_free_buffers(page)) { |
| 452 | ClearPageDirty(page); | 467 | ClearPageDirty(page); |
| 453 | printk("%s: orphaned page\n", __func__); | 468 | printk("%s: orphaned page\n", __func__); |
| 454 | return PAGE_CLEAN; | 469 | return PAGE_CLEAN; |
| 455 | } | 470 | } |
| 456 | } | 471 | } |
| 457 | return PAGE_KEEP; | 472 | return PAGE_KEEP; |
| 458 | } | 473 | } |
| 459 | if (mapping->a_ops->writepage == NULL) | 474 | if (mapping->a_ops->writepage == NULL) |
| 460 | return PAGE_ACTIVATE; | 475 | return PAGE_ACTIVATE; |
| 461 | if (!may_write_to_queue(mapping->backing_dev_info, sc)) | 476 | if (!may_write_to_queue(mapping->backing_dev_info, sc)) |
| 462 | return PAGE_KEEP; | 477 | return PAGE_KEEP; |
| 463 | 478 | ||
| 464 | if (clear_page_dirty_for_io(page)) { | 479 | if (clear_page_dirty_for_io(page)) { |
| 465 | int res; | 480 | int res; |
| 466 | struct writeback_control wbc = { | 481 | struct writeback_control wbc = { |
| 467 | .sync_mode = WB_SYNC_NONE, | 482 | .sync_mode = WB_SYNC_NONE, |
| 468 | .nr_to_write = SWAP_CLUSTER_MAX, | 483 | .nr_to_write = SWAP_CLUSTER_MAX, |
| 469 | .range_start = 0, | 484 | .range_start = 0, |
| 470 | .range_end = LLONG_MAX, | 485 | .range_end = LLONG_MAX, |
| 471 | .for_reclaim = 1, | 486 | .for_reclaim = 1, |
| 472 | }; | 487 | }; |
| 473 | 488 | ||
| 474 | SetPageReclaim(page); | 489 | SetPageReclaim(page); |
| 475 | res = mapping->a_ops->writepage(page, &wbc); | 490 | res = mapping->a_ops->writepage(page, &wbc); |
| 476 | if (res < 0) | 491 | if (res < 0) |
| 477 | handle_write_error(mapping, page, res); | 492 | handle_write_error(mapping, page, res); |
| 478 | if (res == AOP_WRITEPAGE_ACTIVATE) { | 493 | if (res == AOP_WRITEPAGE_ACTIVATE) { |
| 479 | ClearPageReclaim(page); | 494 | ClearPageReclaim(page); |
| 480 | return PAGE_ACTIVATE; | 495 | return PAGE_ACTIVATE; |
| 481 | } | 496 | } |
| 482 | 497 | ||
| 483 | /* | 498 | /* |
| 484 | * Wait on writeback if requested to. This happens when | 499 | * Wait on writeback if requested to. This happens when |
| 485 | * direct reclaiming a large contiguous area and the | 500 | * direct reclaiming a large contiguous area and the |
| 486 | * first attempt to free a range of pages fails. | 501 | * first attempt to free a range of pages fails. |
| 487 | */ | 502 | */ |
| 488 | if (PageWriteback(page) && | 503 | if (PageWriteback(page) && |
| 489 | (sc->reclaim_mode & RECLAIM_MODE_SYNC)) | 504 | (sc->reclaim_mode & RECLAIM_MODE_SYNC)) |
| 490 | wait_on_page_writeback(page); | 505 | wait_on_page_writeback(page); |
| 491 | 506 | ||
| 492 | if (!PageWriteback(page)) { | 507 | if (!PageWriteback(page)) { |
| 493 | /* synchronous write or broken a_ops? */ | 508 | /* synchronous write or broken a_ops? */ |
| 494 | ClearPageReclaim(page); | 509 | ClearPageReclaim(page); |
| 495 | } | 510 | } |
| 496 | trace_mm_vmscan_writepage(page, | 511 | trace_mm_vmscan_writepage(page, |
| 497 | trace_reclaim_flags(page, sc->reclaim_mode)); | 512 | trace_reclaim_flags(page, sc->reclaim_mode)); |
| 498 | inc_zone_page_state(page, NR_VMSCAN_WRITE); | 513 | inc_zone_page_state(page, NR_VMSCAN_WRITE); |
| 499 | return PAGE_SUCCESS; | 514 | return PAGE_SUCCESS; |
| 500 | } | 515 | } |
| 501 | 516 | ||
| 502 | return PAGE_CLEAN; | 517 | return PAGE_CLEAN; |
| 503 | } | 518 | } |
| 504 | 519 | ||
| 505 | /* | 520 | /* |
| 506 | * Same as remove_mapping, but if the page is removed from the mapping, it | 521 | * Same as remove_mapping, but if the page is removed from the mapping, it |
| 507 | * gets returned with a refcount of 0. | 522 | * gets returned with a refcount of 0. |
| 508 | */ | 523 | */ |
| 509 | static int __remove_mapping(struct address_space *mapping, struct page *page) | 524 | static int __remove_mapping(struct address_space *mapping, struct page *page) |
| 510 | { | 525 | { |
| 511 | BUG_ON(!PageLocked(page)); | 526 | BUG_ON(!PageLocked(page)); |
| 512 | BUG_ON(mapping != page_mapping(page)); | 527 | BUG_ON(mapping != page_mapping(page)); |
| 513 | 528 | ||
| 514 | spin_lock_irq(&mapping->tree_lock); | 529 | spin_lock_irq(&mapping->tree_lock); |
| 515 | /* | 530 | /* |
| 516 | * The non racy check for a busy page. | 531 | * The non racy check for a busy page. |
| 517 | * | 532 | * |
| 518 | * Must be careful with the order of the tests. When someone has | 533 | * Must be careful with the order of the tests. When someone has |
| 519 | * a ref to the page, it may be possible that they dirty it then | 534 | * a ref to the page, it may be possible that they dirty it then |
| 520 | * drop the reference. So if PageDirty is tested before page_count | 535 | * drop the reference. So if PageDirty is tested before page_count |
| 521 | * here, then the following race may occur: | 536 | * here, then the following race may occur: |
| 522 | * | 537 | * |
| 523 | * get_user_pages(&page); | 538 | * get_user_pages(&page); |
| 524 | * [user mapping goes away] | 539 | * [user mapping goes away] |
| 525 | * write_to(page); | 540 | * write_to(page); |
| 526 | * !PageDirty(page) [good] | 541 | * !PageDirty(page) [good] |
| 527 | * SetPageDirty(page); | 542 | * SetPageDirty(page); |
| 528 | * put_page(page); | 543 | * put_page(page); |
| 529 | * !page_count(page) [good, discard it] | 544 | * !page_count(page) [good, discard it] |
| 530 | * | 545 | * |
| 531 | * [oops, our write_to data is lost] | 546 | * [oops, our write_to data is lost] |
| 532 | * | 547 | * |
| 533 | * Reversing the order of the tests ensures such a situation cannot | 548 | * Reversing the order of the tests ensures such a situation cannot |
| 534 | * escape unnoticed. The smp_rmb is needed to ensure the page->flags | 549 | * escape unnoticed. The smp_rmb is needed to ensure the page->flags |
| 535 | * load is not satisfied before that of page->_count. | 550 | * load is not satisfied before that of page->_count. |
| 536 | * | 551 | * |
| 537 | * Note that if SetPageDirty is always performed via set_page_dirty, | 552 | * Note that if SetPageDirty is always performed via set_page_dirty, |
| 538 | * and thus under tree_lock, then this ordering is not required. | 553 | * and thus under tree_lock, then this ordering is not required. |
| 539 | */ | 554 | */ |
| 540 | if (!page_freeze_refs(page, 2)) | 555 | if (!page_freeze_refs(page, 2)) |
| 541 | goto cannot_free; | 556 | goto cannot_free; |
| 542 | /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */ | 557 | /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */ |
| 543 | if (unlikely(PageDirty(page))) { | 558 | if (unlikely(PageDirty(page))) { |
| 544 | page_unfreeze_refs(page, 2); | 559 | page_unfreeze_refs(page, 2); |
| 545 | goto cannot_free; | 560 | goto cannot_free; |
| 546 | } | 561 | } |
| 547 | 562 | ||
| 548 | if (PageSwapCache(page)) { | 563 | if (PageSwapCache(page)) { |
| 549 | swp_entry_t swap = { .val = page_private(page) }; | 564 | swp_entry_t swap = { .val = page_private(page) }; |
| 550 | __delete_from_swap_cache(page); | 565 | __delete_from_swap_cache(page); |
| 551 | spin_unlock_irq(&mapping->tree_lock); | 566 | spin_unlock_irq(&mapping->tree_lock); |
| 552 | swapcache_free(swap, page); | 567 | swapcache_free(swap, page); |
| 553 | } else { | 568 | } else { |
| 554 | void (*freepage)(struct page *); | 569 | void (*freepage)(struct page *); |
| 555 | 570 | ||
| 556 | freepage = mapping->a_ops->freepage; | 571 | freepage = mapping->a_ops->freepage; |
| 557 | 572 | ||
| 558 | __delete_from_page_cache(page); | 573 | __delete_from_page_cache(page); |
| 559 | spin_unlock_irq(&mapping->tree_lock); | 574 | spin_unlock_irq(&mapping->tree_lock); |
| 560 | mem_cgroup_uncharge_cache_page(page); | 575 | mem_cgroup_uncharge_cache_page(page); |
| 561 | 576 | ||
| 562 | if (freepage != NULL) | 577 | if (freepage != NULL) |
| 563 | freepage(page); | 578 | freepage(page); |
| 564 | } | 579 | } |
| 565 | 580 | ||
| 566 | return 1; | 581 | return 1; |
| 567 | 582 | ||
| 568 | cannot_free: | 583 | cannot_free: |
| 569 | spin_unlock_irq(&mapping->tree_lock); | 584 | spin_unlock_irq(&mapping->tree_lock); |
| 570 | return 0; | 585 | return 0; |
| 571 | } | 586 | } |
| 572 | 587 | ||
| 573 | /* | 588 | /* |
| 574 | * Attempt to detach a locked page from its ->mapping. If it is dirty or if | 589 | * Attempt to detach a locked page from its ->mapping. If it is dirty or if |
| 575 | * someone else has a ref on the page, abort and return 0. If it was | 590 | * someone else has a ref on the page, abort and return 0. If it was |
| 576 | * successfully detached, return 1. Assumes the caller has a single ref on | 591 | * successfully detached, return 1. Assumes the caller has a single ref on |
| 577 | * this page. | 592 | * this page. |
| 578 | */ | 593 | */ |
| 579 | int remove_mapping(struct address_space *mapping, struct page *page) | 594 | int remove_mapping(struct address_space *mapping, struct page *page) |
| 580 | { | 595 | { |
| 581 | if (__remove_mapping(mapping, page)) { | 596 | if (__remove_mapping(mapping, page)) { |
| 582 | /* | 597 | /* |
| 583 | * Unfreezing the refcount with 1 rather than 2 effectively | 598 | * Unfreezing the refcount with 1 rather than 2 effectively |
| 584 | * drops the pagecache ref for us without requiring another | 599 | * drops the pagecache ref for us without requiring another |
| 585 | * atomic operation. | 600 | * atomic operation. |
| 586 | */ | 601 | */ |
| 587 | page_unfreeze_refs(page, 1); | 602 | page_unfreeze_refs(page, 1); |
| 588 | return 1; | 603 | return 1; |
| 589 | } | 604 | } |
| 590 | return 0; | 605 | return 0; |
| 591 | } | 606 | } |
| 592 | 607 | ||
| 593 | /** | 608 | /** |
| 594 | * putback_lru_page - put previously isolated page onto appropriate LRU list | 609 | * putback_lru_page - put previously isolated page onto appropriate LRU list |
| 595 | * @page: page to be put back to appropriate lru list | 610 | * @page: page to be put back to appropriate lru list |
| 596 | * | 611 | * |
| 597 | * Add previously isolated @page to appropriate LRU list. | 612 | * Add previously isolated @page to appropriate LRU list. |
| 598 | * Page may still be unevictable for other reasons. | 613 | * Page may still be unevictable for other reasons. |
| 599 | * | 614 | * |
| 600 | * lru_lock must not be held, interrupts must be enabled. | 615 | * lru_lock must not be held, interrupts must be enabled. |
| 601 | */ | 616 | */ |
| 602 | void putback_lru_page(struct page *page) | 617 | void putback_lru_page(struct page *page) |
| 603 | { | 618 | { |
| 604 | int lru; | 619 | int lru; |
| 605 | int active = !!TestClearPageActive(page); | 620 | int active = !!TestClearPageActive(page); |
| 606 | int was_unevictable = PageUnevictable(page); | 621 | int was_unevictable = PageUnevictable(page); |
| 607 | 622 | ||
| 608 | VM_BUG_ON(PageLRU(page)); | 623 | VM_BUG_ON(PageLRU(page)); |
| 609 | 624 | ||
| 610 | redo: | 625 | redo: |
| 611 | ClearPageUnevictable(page); | 626 | ClearPageUnevictable(page); |
| 612 | 627 | ||
| 613 | if (page_evictable(page, NULL)) { | 628 | if (page_evictable(page, NULL)) { |
| 614 | /* | 629 | /* |
| 615 | * For evictable pages, we can use the cache. | 630 | * For evictable pages, we can use the cache. |
| 616 | * In event of a race, worst case is we end up with an | 631 | * In event of a race, worst case is we end up with an |
| 617 | * unevictable page on [in]active list. | 632 | * unevictable page on [in]active list. |
| 618 | * We know how to handle that. | 633 | * We know how to handle that. |
| 619 | */ | 634 | */ |
| 620 | lru = active + page_lru_base_type(page); | 635 | lru = active + page_lru_base_type(page); |
| 621 | lru_cache_add_lru(page, lru); | 636 | lru_cache_add_lru(page, lru); |
| 622 | } else { | 637 | } else { |
| 623 | /* | 638 | /* |
| 624 | * Put unevictable pages directly on zone's unevictable | 639 | * Put unevictable pages directly on zone's unevictable |
| 625 | * list. | 640 | * list. |
| 626 | */ | 641 | */ |
| 627 | lru = LRU_UNEVICTABLE; | 642 | lru = LRU_UNEVICTABLE; |
| 628 | add_page_to_unevictable_list(page); | 643 | add_page_to_unevictable_list(page); |
| 629 | /* | 644 | /* |
| 630 | * When racing with an mlock clearing (page is | 645 | * When racing with an mlock clearing (page is |
| 631 | * unlocked), make sure that if the other thread does | 646 | * unlocked), make sure that if the other thread does |
| 632 | * not observe our setting of PG_lru and fails | 647 | * not observe our setting of PG_lru and fails |
| 633 | * isolation, we see PG_mlocked cleared below and move | 648 | * isolation, we see PG_mlocked cleared below and move |
| 634 | * the page back to the evictable list. | 649 | * the page back to the evictable list. |
| 635 | * | 650 | * |
| 636 | * The other side is TestClearPageMlocked(). | 651 | * The other side is TestClearPageMlocked(). |
| 637 | */ | 652 | */ |
| 638 | smp_mb(); | 653 | smp_mb(); |
| 639 | } | 654 | } |
| 640 | 655 | ||
| 641 | /* | 656 | /* |
| 642 | * page's status can change while we move it among lru. If an evictable | 657 | * page's status can change while we move it among lru. If an evictable |
| 643 | * page is on unevictable list, it never be freed. To avoid that, | 658 | * page is on unevictable list, it never be freed. To avoid that, |
| 644 | * check after we added it to the list, again. | 659 | * check after we added it to the list, again. |
| 645 | */ | 660 | */ |
| 646 | if (lru == LRU_UNEVICTABLE && page_evictable(page, NULL)) { | 661 | if (lru == LRU_UNEVICTABLE && page_evictable(page, NULL)) { |
| 647 | if (!isolate_lru_page(page)) { | 662 | if (!isolate_lru_page(page)) { |
| 648 | put_page(page); | 663 | put_page(page); |
| 649 | goto redo; | 664 | goto redo; |
| 650 | } | 665 | } |
| 651 | /* This means someone else dropped this page from LRU | 666 | /* This means someone else dropped this page from LRU |
| 652 | * So, it will be freed or putback to LRU again. There is | 667 | * So, it will be freed or putback to LRU again. There is |
| 653 | * nothing to do here. | 668 | * nothing to do here. |
| 654 | */ | 669 | */ |
| 655 | } | 670 | } |
| 656 | 671 | ||
| 657 | if (was_unevictable && lru != LRU_UNEVICTABLE) | 672 | if (was_unevictable && lru != LRU_UNEVICTABLE) |
| 658 | count_vm_event(UNEVICTABLE_PGRESCUED); | 673 | count_vm_event(UNEVICTABLE_PGRESCUED); |
| 659 | else if (!was_unevictable && lru == LRU_UNEVICTABLE) | 674 | else if (!was_unevictable && lru == LRU_UNEVICTABLE) |
| 660 | count_vm_event(UNEVICTABLE_PGCULLED); | 675 | count_vm_event(UNEVICTABLE_PGCULLED); |
| 661 | 676 | ||
| 662 | put_page(page); /* drop ref from isolate */ | 677 | put_page(page); /* drop ref from isolate */ |
| 663 | } | 678 | } |
| 664 | 679 | ||
| 665 | enum page_references { | 680 | enum page_references { |
| 666 | PAGEREF_RECLAIM, | 681 | PAGEREF_RECLAIM, |
| 667 | PAGEREF_RECLAIM_CLEAN, | 682 | PAGEREF_RECLAIM_CLEAN, |
| 668 | PAGEREF_KEEP, | 683 | PAGEREF_KEEP, |
| 669 | PAGEREF_ACTIVATE, | 684 | PAGEREF_ACTIVATE, |
| 670 | }; | 685 | }; |
| 671 | 686 | ||
| 672 | static enum page_references page_check_references(struct page *page, | 687 | static enum page_references page_check_references(struct page *page, |
| 673 | struct scan_control *sc) | 688 | struct scan_control *sc) |
| 674 | { | 689 | { |
| 675 | int referenced_ptes, referenced_page; | 690 | int referenced_ptes, referenced_page; |
| 676 | unsigned long vm_flags; | 691 | unsigned long vm_flags; |
| 677 | 692 | ||
| 678 | referenced_ptes = page_referenced(page, 1, sc->mem_cgroup, &vm_flags); | 693 | referenced_ptes = page_referenced(page, 1, sc->mem_cgroup, &vm_flags); |
| 679 | referenced_page = TestClearPageReferenced(page); | 694 | referenced_page = TestClearPageReferenced(page); |
| 680 | 695 | ||
| 681 | /* Lumpy reclaim - ignore references */ | 696 | /* Lumpy reclaim - ignore references */ |
| 682 | if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM) | 697 | if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM) |
| 683 | return PAGEREF_RECLAIM; | 698 | return PAGEREF_RECLAIM; |
| 684 | 699 | ||
| 685 | /* | 700 | /* |
| 686 | * Mlock lost the isolation race with us. Let try_to_unmap() | 701 | * Mlock lost the isolation race with us. Let try_to_unmap() |
| 687 | * move the page to the unevictable list. | 702 | * move the page to the unevictable list. |
| 688 | */ | 703 | */ |
| 689 | if (vm_flags & VM_LOCKED) | 704 | if (vm_flags & VM_LOCKED) |
| 690 | return PAGEREF_RECLAIM; | 705 | return PAGEREF_RECLAIM; |
| 691 | 706 | ||
| 692 | if (referenced_ptes) { | 707 | if (referenced_ptes) { |
| 693 | if (PageAnon(page)) | 708 | if (PageAnon(page)) |
| 694 | return PAGEREF_ACTIVATE; | 709 | return PAGEREF_ACTIVATE; |
| 695 | /* | 710 | /* |
| 696 | * All mapped pages start out with page table | 711 | * All mapped pages start out with page table |
| 697 | * references from the instantiating fault, so we need | 712 | * references from the instantiating fault, so we need |
| 698 | * to look twice if a mapped file page is used more | 713 | * to look twice if a mapped file page is used more |
| 699 | * than once. | 714 | * than once. |
| 700 | * | 715 | * |
| 701 | * Mark it and spare it for another trip around the | 716 | * Mark it and spare it for another trip around the |
| 702 | * inactive list. Another page table reference will | 717 | * inactive list. Another page table reference will |
| 703 | * lead to its activation. | 718 | * lead to its activation. |
| 704 | * | 719 | * |
| 705 | * Note: the mark is set for activated pages as well | 720 | * Note: the mark is set for activated pages as well |
| 706 | * so that recently deactivated but used pages are | 721 | * so that recently deactivated but used pages are |
| 707 | * quickly recovered. | 722 | * quickly recovered. |
| 708 | */ | 723 | */ |
| 709 | SetPageReferenced(page); | 724 | SetPageReferenced(page); |
| 710 | 725 | ||
| 711 | if (referenced_page) | 726 | if (referenced_page) |
| 712 | return PAGEREF_ACTIVATE; | 727 | return PAGEREF_ACTIVATE; |
| 713 | 728 | ||
| 714 | return PAGEREF_KEEP; | 729 | return PAGEREF_KEEP; |
| 715 | } | 730 | } |
| 716 | 731 | ||
| 717 | /* Reclaim if clean, defer dirty pages to writeback */ | 732 | /* Reclaim if clean, defer dirty pages to writeback */ |
| 718 | if (referenced_page && !PageSwapBacked(page)) | 733 | if (referenced_page && !PageSwapBacked(page)) |
| 719 | return PAGEREF_RECLAIM_CLEAN; | 734 | return PAGEREF_RECLAIM_CLEAN; |
| 720 | 735 | ||
| 721 | return PAGEREF_RECLAIM; | 736 | return PAGEREF_RECLAIM; |
| 722 | } | 737 | } |
| 723 | 738 | ||
| 724 | static noinline_for_stack void free_page_list(struct list_head *free_pages) | 739 | static noinline_for_stack void free_page_list(struct list_head *free_pages) |
| 725 | { | 740 | { |
| 726 | struct pagevec freed_pvec; | 741 | struct pagevec freed_pvec; |
| 727 | struct page *page, *tmp; | 742 | struct page *page, *tmp; |
| 728 | 743 | ||
| 729 | pagevec_init(&freed_pvec, 1); | 744 | pagevec_init(&freed_pvec, 1); |
| 730 | 745 | ||
| 731 | list_for_each_entry_safe(page, tmp, free_pages, lru) { | 746 | list_for_each_entry_safe(page, tmp, free_pages, lru) { |
| 732 | list_del(&page->lru); | 747 | list_del(&page->lru); |
| 733 | if (!pagevec_add(&freed_pvec, page)) { | 748 | if (!pagevec_add(&freed_pvec, page)) { |
| 734 | __pagevec_free(&freed_pvec); | 749 | __pagevec_free(&freed_pvec); |
| 735 | pagevec_reinit(&freed_pvec); | 750 | pagevec_reinit(&freed_pvec); |
| 736 | } | 751 | } |
| 737 | } | 752 | } |
| 738 | 753 | ||
| 739 | pagevec_free(&freed_pvec); | 754 | pagevec_free(&freed_pvec); |
| 740 | } | 755 | } |
| 741 | 756 | ||
/*
 * shrink_page_list() returns the number of reclaimed pages
 */
static unsigned long shrink_page_list(struct list_head *page_list,
                                      struct zone *zone,
                                      struct scan_control *sc)
{
        LIST_HEAD(ret_pages);
        LIST_HEAD(free_pages);
        int pgactivate = 0;
        unsigned long nr_dirty = 0;
        unsigned long nr_congested = 0;
        unsigned long nr_reclaimed = 0;

        cond_resched();

        while (!list_empty(page_list)) {
                enum page_references references;
                struct address_space *mapping;
                struct page *page;
                int may_enter_fs;

                cond_resched();

                page = lru_to_page(page_list);
                list_del(&page->lru);

                if (!trylock_page(page))
                        goto keep;

                VM_BUG_ON(PageActive(page));
                VM_BUG_ON(page_zone(page) != zone);

                sc->nr_scanned++;

                if (unlikely(!page_evictable(page, NULL)))
                        goto cull_mlocked;

                if (!sc->may_unmap && page_mapped(page))
                        goto keep_locked;

                /* Double the slab pressure for mapped and swapcache pages */
                if (page_mapped(page) || PageSwapCache(page))
                        sc->nr_scanned++;

                may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
                        (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));

                if (PageWriteback(page)) {
                        /*
                         * Synchronous reclaim is performed in two passes,
                         * first an asynchronous pass over the list to
                         * start parallel writeback, and a second synchronous
                         * pass to wait for the IO to complete.  Wait here
                         * for any page for which writeback has already
                         * started.
                         */
                        if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) &&
                            may_enter_fs)
                                wait_on_page_writeback(page);
                        else {
                                unlock_page(page);
                                goto keep_lumpy;
                        }
                }

                references = page_check_references(page, sc);
                switch (references) {
                case PAGEREF_ACTIVATE:
                        goto activate_locked;
                case PAGEREF_KEEP:
                        goto keep_locked;
                case PAGEREF_RECLAIM:
                case PAGEREF_RECLAIM_CLEAN:
                        ; /* try to reclaim the page below */
                }

                /*
                 * Anonymous process memory has backing store?
                 * Try to allocate it some swap space here.
                 */
                if (PageAnon(page) && !PageSwapCache(page)) {
                        if (!(sc->gfp_mask & __GFP_IO))
                                goto keep_locked;
                        if (!add_to_swap(page))
                                goto activate_locked;
                        may_enter_fs = 1;
                }

                mapping = page_mapping(page);

                /*
                 * The page is mapped into the page tables of one or more
                 * processes. Try to unmap it here.
                 */
                if (page_mapped(page) && mapping) {
                        switch (try_to_unmap(page, TTU_UNMAP)) {
                        case SWAP_FAIL:
                                goto activate_locked;
                        case SWAP_AGAIN:
                                goto keep_locked;
                        case SWAP_MLOCK:
                                goto cull_mlocked;
                        case SWAP_SUCCESS:
                                ; /* try to free the page below */
                        }
                }

                if (PageDirty(page)) {
                        nr_dirty++;

                        if (references == PAGEREF_RECLAIM_CLEAN)
                                goto keep_locked;
                        if (!may_enter_fs)
                                goto keep_locked;
                        if (!sc->may_writepage)
                                goto keep_locked;

                        /* Page is dirty, try to write it out here */
                        switch (pageout(page, mapping, sc)) {
                        case PAGE_KEEP:
                                nr_congested++;
                                goto keep_locked;
                        case PAGE_ACTIVATE:
                                goto activate_locked;
                        case PAGE_SUCCESS:
                                if (PageWriteback(page))
                                        goto keep_lumpy;
                                if (PageDirty(page))
                                        goto keep;

                                /*
                                 * A synchronous write - probably a ramdisk.  Go
                                 * ahead and try to reclaim the page.
                                 */
                                if (!trylock_page(page))
                                        goto keep;
                                if (PageDirty(page) || PageWriteback(page))
                                        goto keep_locked;
                                mapping = page_mapping(page);
                        case PAGE_CLEAN:
                                ; /* try to free the page below */
                        }
                }

                /*
                 * If the page has buffers, try to free the buffer mappings
                 * associated with this page. If we succeed we try to free
                 * the page as well.
                 *
                 * We do this even if the page is PageDirty().
                 * try_to_release_page() does not perform I/O, but it is
                 * possible for a page to have PageDirty set, but it is actually
                 * clean (all its buffers are clean).  This happens if the
                 * buffers were written out directly, with submit_bh(). ext3
                 * will do this, as well as the blockdev mapping.
                 * try_to_release_page() will discover that cleanness and will
                 * drop the buffers and mark the page clean - it can be freed.
                 *
                 * Rarely, pages can have buffers and no ->mapping.  These are
                 * the pages which were not successfully invalidated in
                 * truncate_complete_page().  We try to drop those buffers here
                 * and if that worked, and the page is no longer mapped into
                 * process address space (page_count == 1) it can be freed.
                 * Otherwise, leave the page on the LRU so it is swappable.
                 */
                if (page_has_private(page)) {
                        if (!try_to_release_page(page, sc->gfp_mask))
                                goto activate_locked;
                        if (!mapping && page_count(page) == 1) {
                                unlock_page(page);
                                if (put_page_testzero(page))
                                        goto free_it;
                                else {
                                        /*
                                         * rare race with speculative reference.
                                         * the speculative reference will free
                                         * this page shortly, so we may
                                         * increment nr_reclaimed here (and
                                         * leave it off the LRU).
                                         */
                                        nr_reclaimed++;
                                        continue;
                                }
                        }
                }

                if (!mapping || !__remove_mapping(mapping, page))
                        goto keep_locked;

                /*
                 * At this point, we have no other references and there is
                 * no way to pick any more up (removed from LRU, removed
                 * from pagecache). Can use non-atomic bitops now (and
                 * we obviously don't have to worry about waking up a process
                 * waiting on the page lock, because there are no references.
                 */
                __clear_page_locked(page);
free_it:
                nr_reclaimed++;

                /*
                 * Is there need to periodically free_page_list? It would
                 * appear not as the counts should be low
                 */
                list_add(&page->lru, &free_pages);
                continue;

cull_mlocked:
                if (PageSwapCache(page))
                        try_to_free_swap(page);
                unlock_page(page);
                putback_lru_page(page);
                reset_reclaim_mode(sc);
                continue;

activate_locked:
                /* Not a candidate for swapping, so reclaim swap space. */
                if (PageSwapCache(page) && vm_swap_full())
                        try_to_free_swap(page);
                VM_BUG_ON(PageActive(page));
                SetPageActive(page);
                pgactivate++;
keep_locked:
                unlock_page(page);
keep:
                reset_reclaim_mode(sc);
keep_lumpy:
                list_add(&page->lru, &ret_pages);
                VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
        }

        /*
         * Tag a zone as congested if all the dirty pages encountered were
         * backed by a congested BDI. In this case, reclaimers should just
         * back off and wait for congestion to clear because further reclaim
         * will encounter the same problem
         */
        if (nr_dirty && nr_dirty == nr_congested && scanning_global_lru(sc))
                zone_set_flag(zone, ZONE_CONGESTED);

        free_page_list(&free_pages);

        list_splice(&ret_pages, page_list);
        count_vm_events(PGACTIVATE, pgactivate);
        return nr_reclaimed;
}

/*
 * Attempt to remove the specified page from its LRU.  Only take this page
 * if it is of the appropriate PageActive status.  Pages which are being
 * freed elsewhere are also ignored.
 *
 * page:        page to consider
 * mode:        one of the LRU isolation modes defined above
 *
 * returns 0 on success, -ve errno on failure.
 */
int __isolate_lru_page(struct page *page, int mode, int file)
{
        int ret = -EINVAL;

        /* Only take pages on the LRU. */
        if (!PageLRU(page))
                return ret;

        /*
         * When checking the active state, we need to be sure we are
         * dealing with comparable boolean values.  Take the logical not
         * of each.
         */
        if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode))
                return ret;

        if (mode != ISOLATE_BOTH && page_is_file_cache(page) != file)
                return ret;

        /*
         * When this function is being called for lumpy reclaim, we
         * initially look into all LRU pages, active, inactive and
         * unevictable; only give shrink_page_list evictable pages.
         */
        if (PageUnevictable(page))
                return ret;

        ret = -EBUSY;

        if (likely(get_page_unless_zero(page))) {
                /*
                 * Be careful not to clear PageLRU until after we're
                 * sure the page is not being freed elsewhere -- the
                 * page release code relies on it.
                 */
                ClearPageLRU(page);
                ret = 0;
        }

        return ret;
}

/*
 * zone->lru_lock is heavily contended.  Some of the functions that
 * shrink the lists perform better by taking out a batch of pages
 * and working on them outside the LRU lock.
 *
 * For pagecache intensive workloads, this function is the hottest
 * spot in the kernel (apart from copy_*_user functions).
 *
 * Appropriate locks must be held before calling this function.
 *
 * @nr_to_scan: The number of pages to look through on the list.
 * @src:        The LRU list to pull pages off.
 * @dst:        The temp list to put pages on to.
 * @scanned:    The number of pages that were scanned.
 * @order:      The caller's attempted allocation order
 * @mode:       One of the LRU isolation modes
 * @file:       True [1] if isolating file [!anon] pages
 *
 * returns how many pages were moved onto *@dst.
 */
static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
                struct list_head *src, struct list_head *dst,
                unsigned long *scanned, int order, int mode, int file)
{
        unsigned long nr_taken = 0;
        unsigned long nr_lumpy_taken = 0;
        unsigned long nr_lumpy_dirty = 0;
        unsigned long nr_lumpy_failed = 0;
        unsigned long scan;

        for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
                struct page *page;
                unsigned long pfn;
                unsigned long end_pfn;
                unsigned long page_pfn;
                int zone_id;

                page = lru_to_page(src);
                prefetchw_prev_lru_page(page, src, flags);

                VM_BUG_ON(!PageLRU(page));

                switch (__isolate_lru_page(page, mode, file)) {
                case 0:
                        list_move(&page->lru, dst);
                        mem_cgroup_del_lru(page);
                        nr_taken += hpage_nr_pages(page);
                        break;

                case -EBUSY:
                        /* else it is being freed elsewhere */
                        list_move(&page->lru, src);
                        mem_cgroup_rotate_lru_list(page, page_lru(page));
                        continue;

                default:
                        BUG();
                }

                if (!order)
                        continue;

                /*
                 * Attempt to take all pages in the order aligned region
                 * surrounding the tag page.  Only take those pages of
                 * the same active state as that tag page.  We may safely
                 * round the target page pfn down to the requested order
                 * as the mem_map is guaranteed valid out to MAX_ORDER,
                 * where that page is in a different zone we will detect
                 * it from its zone id and abort this block scan.
                 */
                zone_id = page_zone_id(page);
                page_pfn = page_to_pfn(page);
                pfn = page_pfn & ~((1 << order) - 1);
                end_pfn = pfn + (1 << order);
                for (; pfn < end_pfn; pfn++) {
                        struct page *cursor_page;

                        /* The target page is in the block, ignore it. */
                        if (unlikely(pfn == page_pfn))
                                continue;

                        /* Avoid holes within the zone. */
                        if (unlikely(!pfn_valid_within(pfn)))
                                break;

                        cursor_page = pfn_to_page(pfn);

                        /* Check that we have not crossed a zone boundary. */
                        if (unlikely(page_zone_id(cursor_page) != zone_id))
                                break;

                        /*
                         * If we don't have enough swap space, reclaiming
                         * anon pages which don't already have a swap slot is
                         * pointless.
                         */
                        if (nr_swap_pages <= 0 && PageAnon(cursor_page) &&
                            !PageSwapCache(cursor_page))
                                break;

                        if (__isolate_lru_page(cursor_page, mode, file) == 0) {
                                list_move(&cursor_page->lru, dst);
                                mem_cgroup_del_lru(cursor_page);
                                nr_taken += hpage_nr_pages(page);
                                nr_lumpy_taken++;
                                if (PageDirty(cursor_page))
                                        nr_lumpy_dirty++;
                                scan++;
                        } else {
                                /*
                                 * Check if the page is freed already.
                                 *
                                 * We can't use page_count() as that
                                 * requires compound_head and we don't
                                 * have a pin on the page here. If a
                                 * page is tail, we may or may not
                                 * have isolated the head, so assume
                                 * it's not free, it'd be tricky to
                                 * track the head status without a
                                 * page pin.
                                 */
                                if (!PageTail(cursor_page) &&
                                    !atomic_read(&cursor_page->_count))
                                        continue;
                                break;
                        }
                }

                /* If we break out of the loop above, lumpy reclaim failed */
                if (pfn < end_pfn)
                        nr_lumpy_failed++;
        }

        *scanned = scan;

        trace_mm_vmscan_lru_isolate(order,
                        nr_to_scan, scan,
                        nr_taken,
                        nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed,
                        mode);
        return nr_taken;
}

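/*
 * Wrapper for isolate_lru_pages() used when scanning the global LRU:
 * the @active and @file flags select which of the zone's LRU lists to
 * pull pages from.
 */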
static unsigned long isolate_pages_global(unsigned long nr,
                                        struct list_head *dst,
                                        unsigned long *scanned, int order,
                                        int mode, struct zone *z,
                                        int active, int file)
{
        int lru = LRU_BASE;
        if (active)
                lru += LRU_ACTIVE;
        if (file)
                lru += LRU_FILE;
        return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order,
                                                                mode, file);
}

/*
 * clear_active_flags() is a helper for shrink_active_list(), clearing
 * any active bits from the pages in the list.
 */
static unsigned long clear_active_flags(struct list_head *page_list,
                                        unsigned int *count)
{
        int nr_active = 0;
        int lru;
        struct page *page;

        list_for_each_entry(page, page_list, lru) {
                int numpages = hpage_nr_pages(page);
                lru = page_lru_base_type(page);
                if (PageActive(page)) {
                        lru += LRU_ACTIVE;
                        ClearPageActive(page);
                        nr_active += numpages;
                }
                if (count)
                        count[lru] += numpages;
        }

        return nr_active;
}

/**
 * isolate_lru_page - tries to isolate a page from its LRU list
 * @page: page to isolate from its LRU list
 *
 * Isolates a @page from an LRU list, clears PageLRU and adjusts the
 * vmstat statistic corresponding to whatever LRU list the page was on.
 *
 * Returns 0 if the page was removed from an LRU list.
 * Returns -EBUSY if the page was not on an LRU list.
 *
 * The returned page will have PageLRU() cleared.  If it was found on
 * the active list, it will have PageActive set.  If it was found on
 * the unevictable list, it will have the PageUnevictable bit set. That flag
 * may need to be cleared by the caller before letting the page go.
 *
 * The vmstat statistic corresponding to the list on which the page was
 * found will be decremented.
 *
 * Restrictions:
 * (1) Must be called with an elevated refcount on the page. This is a
 *     fundamental difference from isolate_lru_pages (which is called
 *     without a stable reference).
 * (2) the lru_lock must not be held.
 * (3) interrupts must be enabled.
 */
int isolate_lru_page(struct page *page)
{
        int ret = -EBUSY;

        VM_BUG_ON(!page_count(page));

        if (PageLRU(page)) {
                struct zone *zone = page_zone(page);

                spin_lock_irq(&zone->lru_lock);
                if (PageLRU(page)) {
                        int lru = page_lru(page);
                        ret = 0;
                        get_page(page);
                        ClearPageLRU(page);

                        del_page_from_lru_list(zone, page, lru);
                }
                spin_unlock_irq(&zone->lru_lock);
        }
        return ret;
}

/*
 * Are there way too many processes in the direct reclaim path already?
 */
static int too_many_isolated(struct zone *zone, int file,
                struct scan_control *sc)
{
        unsigned long inactive, isolated;

        if (current_is_kswapd())
                return 0;

        if (!scanning_global_lru(sc))
                return 0;

        if (file) {
                inactive = zone_page_state(zone, NR_INACTIVE_FILE);
                isolated = zone_page_state(zone, NR_ISOLATED_FILE);
        } else {
                inactive = zone_page_state(zone, NR_INACTIVE_ANON);
                isolated = zone_page_state(zone, NR_ISOLATED_ANON);
        }

        return isolated > inactive;
}

/*
 * TODO: Try merging with migrations version of putback_lru_pages
 */
static noinline_for_stack void
putback_lru_pages(struct zone *zone, struct scan_control *sc,
                                unsigned long nr_anon, unsigned long nr_file,
                                struct list_head *page_list)
{
        struct page *page;
        struct pagevec pvec;
        struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);

        pagevec_init(&pvec, 1);

        /*
         * Put back any unfreeable pages.
         */
        spin_lock(&zone->lru_lock);
        while (!list_empty(page_list)) {
                int lru;
                page = lru_to_page(page_list);
                VM_BUG_ON(PageLRU(page));
                list_del(&page->lru);
                if (unlikely(!page_evictable(page, NULL))) {
                        spin_unlock_irq(&zone->lru_lock);
                        putback_lru_page(page);
                        spin_lock_irq(&zone->lru_lock);
                        continue;
                }
                SetPageLRU(page);
                lru = page_lru(page);
                add_page_to_lru_list(zone, page, lru);
                if (is_active_lru(lru)) {
                        int file = is_file_lru(lru);
                        int numpages = hpage_nr_pages(page);
                        reclaim_stat->recent_rotated[file] += numpages;
                }
                if (!pagevec_add(&pvec, page)) {
                        spin_unlock_irq(&zone->lru_lock);
                        __pagevec_release(&pvec);
                        spin_lock_irq(&zone->lru_lock);
                }
        }
        __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
        __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);

        spin_unlock_irq(&zone->lru_lock);
        pagevec_release(&pvec);
}

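/*
 * Account for the pages that were just isolated: clear their active bits,
 * adjust the per-LRU and NR_ISOLATED_* zone counters, and return the number
 * of anon and file pages taken via @nr_anon and @nr_file.
 */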
static noinline_for_stack void update_isolated_counts(struct zone *zone,
                                        struct scan_control *sc,
                                        unsigned long *nr_anon,
                                        unsigned long *nr_file,
                                        struct list_head *isolated_list)
{
        unsigned long nr_active;
        unsigned int count[NR_LRU_LISTS] = { 0, };
        struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);

        nr_active = clear_active_flags(isolated_list, count);
        __count_vm_events(PGDEACTIVATE, nr_active);

        __mod_zone_page_state(zone, NR_ACTIVE_FILE,
                              -count[LRU_ACTIVE_FILE]);
        __mod_zone_page_state(zone, NR_INACTIVE_FILE,
                              -count[LRU_INACTIVE_FILE]);
        __mod_zone_page_state(zone, NR_ACTIVE_ANON,
                              -count[LRU_ACTIVE_ANON]);
        __mod_zone_page_state(zone, NR_INACTIVE_ANON,
                              -count[LRU_INACTIVE_ANON]);

        *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
        *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
        __mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon);
        __mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file);

        reclaim_stat->recent_scanned[0] += *nr_anon;
        reclaim_stat->recent_scanned[1] += *nr_file;
}

/*
 * Returns true if the caller should wait to clean dirty/writeback pages.
 *
 * If we are direct reclaiming for contiguous pages and we do not reclaim
 * everything in the list, try again and wait for writeback IO to complete.
 * This will stall high-order allocations noticeably. Only do that when we
 * really need to free the pages under high memory pressure.
 */
static inline bool should_reclaim_stall(unsigned long nr_taken,
                                        unsigned long nr_freed,
                                        int priority,
                                        struct scan_control *sc)
{
        int lumpy_stall_priority;

        /* kswapd should not stall on sync IO */
        if (current_is_kswapd())
                return false;

        /* Only stall on lumpy reclaim */
        if (sc->reclaim_mode & RECLAIM_MODE_SINGLE)
                return false;

        /* If we have reclaimed everything on the isolated list, no stall */
        if (nr_freed == nr_taken)
                return false;

        /*
         * For high-order allocations, there are two stall thresholds.
         * High-cost allocations stall immediately whereas lower
         * order allocations such as stacks require the scanning
         * priority to be much higher before stalling.
         */
        if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
                lumpy_stall_priority = DEF_PRIORITY;
        else
                lumpy_stall_priority = DEF_PRIORITY / 3;

        return priority <= lumpy_stall_priority;
}

/*
 * shrink_inactive_list() is a helper for shrink_zone().  It returns the number
 * of reclaimed pages
 */
static noinline_for_stack unsigned long
shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
                     struct scan_control *sc, int priority, int file)
{
        LIST_HEAD(page_list);
        unsigned long nr_scanned;
        unsigned long nr_reclaimed = 0;
        unsigned long nr_taken;
        unsigned long nr_anon;
        unsigned long nr_file;

        while (unlikely(too_many_isolated(zone, file, sc))) {
                congestion_wait(BLK_RW_ASYNC, HZ/10);

                /* We are about to die and free our memory. Return now. */
                if (fatal_signal_pending(current))
                        return SWAP_CLUSTER_MAX;
        }

        set_reclaim_mode(priority, sc, false);
        lru_add_drain();
        spin_lock_irq(&zone->lru_lock);

        if (scanning_global_lru(sc)) {
                nr_taken = isolate_pages_global(nr_to_scan,
                        &page_list, &nr_scanned, sc->order,
                        sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
                                        ISOLATE_BOTH : ISOLATE_INACTIVE,
                        zone, 0, file);
                zone->pages_scanned += nr_scanned;
                if (current_is_kswapd())
                        __count_zone_vm_events(PGSCAN_KSWAPD, zone,
                                               nr_scanned);
                else
                        __count_zone_vm_events(PGSCAN_DIRECT, zone,
                                               nr_scanned);
        } else {
                nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
                        &page_list, &nr_scanned, sc->order,
                        sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
                                        ISOLATE_BOTH : ISOLATE_INACTIVE,
                        zone, sc->mem_cgroup,
                        0, file);
                /*
                 * mem_cgroup_isolate_pages() keeps track of
                 * scanned pages on its own.
                 */
        }

        if (nr_taken == 0) {
                spin_unlock_irq(&zone->lru_lock);
                return 0;
        }

        update_isolated_counts(zone, sc, &nr_anon, &nr_file, &page_list);

        spin_unlock_irq(&zone->lru_lock);

        nr_reclaimed = shrink_page_list(&page_list, zone, sc);

        /* Check if we should synchronously wait for writeback */
        if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
                set_reclaim_mode(priority, sc, true);
                nr_reclaimed += shrink_page_list(&page_list, zone, sc);
        }

        local_irq_disable();
        if (current_is_kswapd())
                __count_vm_events(KSWAPD_STEAL, nr_reclaimed);
        __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed);

        putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list);

        trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
                zone_idx(zone),
                nr_scanned, nr_reclaimed,
                priority,
                trace_shrink_flags(file, sc->reclaim_mode));
        return nr_reclaimed;
}

/*
 * This moves pages from the active list to the inactive list.
 *
 * We move them the other way if the page is referenced by one or more
 * processes, from rmap.
 *
 * If the pages are mostly unmapped, the processing is fast and it is
 * appropriate to hold zone->lru_lock across the whole operation.  But if
 * the pages are mapped, the processing is slow (page_referenced()) so we
 * should drop zone->lru_lock around each page.  It's impossible to balance
 * this, so instead we remove the pages from the LRU while processing them.
 * It is safe to rely on PG_active against the non-LRU pages in here because
 * nobody will play with that bit on a non-LRU page.
 *
 * The downside is that we have to touch page->_count against each page.
 * But we had to alter page->flags anyway.
 */

static void move_active_pages_to_lru(struct zone *zone,
                                     struct list_head *list,
                                     enum lru_list lru)
{
        unsigned long pgmoved = 0;
        struct pagevec pvec;
        struct page *page;

        pagevec_init(&pvec, 1);

        while (!list_empty(list)) {
                page = lru_to_page(list);

                VM_BUG_ON(PageLRU(page));
                SetPageLRU(page);

                list_move(&page->lru, &zone->lru[lru].list);
                mem_cgroup_add_lru_list(page, lru);
                pgmoved += hpage_nr_pages(page);

                if (!pagevec_add(&pvec, page) || list_empty(list)) {
                        spin_unlock_irq(&zone->lru_lock);
                        if (buffer_heads_over_limit)
                                pagevec_strip(&pvec);
                        __pagevec_release(&pvec);
                        spin_lock_irq(&zone->lru_lock);
                }
        }
        __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
        if (!is_active_lru(lru))
                __count_vm_events(PGDEACTIVATE, pgmoved);
}

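/*
 * Scan @nr_pages of the active list for @zone: referenced, executable
 * file-backed pages get another trip around the active list, everything
 * else is deactivated onto the inactive list.
 */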
static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
                        struct scan_control *sc, int priority, int file)
{
        unsigned long nr_taken;
        unsigned long pgscanned;
        unsigned long vm_flags;
        LIST_HEAD(l_hold);      /* The pages which were snipped off */
        LIST_HEAD(l_active);
        LIST_HEAD(l_inactive);
        struct page *page;
        struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
        unsigned long nr_rotated = 0;

        lru_add_drain();
        spin_lock_irq(&zone->lru_lock);
        if (scanning_global_lru(sc)) {
                nr_taken = isolate_pages_global(nr_pages, &l_hold,
                                                &pgscanned, sc->order,
                                                ISOLATE_ACTIVE, zone,
                                                1, file);
                zone->pages_scanned += pgscanned;
        } else {
                nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold,
                                                &pgscanned, sc->order,
                                                ISOLATE_ACTIVE, zone,
                                                sc->mem_cgroup, 1, file);
                /*
                 * mem_cgroup_isolate_pages() keeps track of
                 * scanned pages on its own.
                 */
        }

        reclaim_stat->recent_scanned[file] += nr_taken;

        __count_zone_vm_events(PGREFILL, zone, pgscanned);
        if (file)
                __mod_zone_page_state(zone, NR_ACTIVE_FILE, -nr_taken);
        else
                __mod_zone_page_state(zone, NR_ACTIVE_ANON, -nr_taken);
        __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
        spin_unlock_irq(&zone->lru_lock);

        while (!list_empty(&l_hold)) {
                cond_resched();
                page = lru_to_page(&l_hold);
                list_del(&page->lru);

                if (unlikely(!page_evictable(page, NULL))) {
                        putback_lru_page(page);
                        continue;
                }

                if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) {
                        nr_rotated += hpage_nr_pages(page);
                        /*
                         * Identify referenced, file-backed active pages and
                         * give them one more trip around the active list, so
                         * that executable code gets better chances to stay in
                         * memory under moderate memory pressure.  Anon pages
                         * are not likely to be evicted by use-once streaming
                         * IO, plus JVM can create lots of anon VM_EXEC pages,
                         * so we ignore them here.
                         */
                        if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) {
                                list_add(&page->lru, &l_active);
                                continue;
                        }
                }

                ClearPageActive(page);  /* we are de-activating */
                list_add(&page->lru, &l_inactive);
        }

        /*
         * Move pages back to the lru list.
         */
        spin_lock_irq(&zone->lru_lock);
| 1635 | /* | 1650 | /* |
| 1636 | * Count referenced pages from currently used mappings as rotated, | 1651 | * Count referenced pages from currently used mappings as rotated, |
| 1637 | * even though only some of them are actually re-activated. This | 1652 | * even though only some of them are actually re-activated. This |
| 1638 | * helps balance scan pressure between file and anonymous pages in | 1653 | * helps balance scan pressure between file and anonymous pages in |
| 1639 | * get_scan_ratio. | 1654 | * get_scan_ratio. |
| 1640 | */ | 1655 | */ |
| 1641 | reclaim_stat->recent_rotated[file] += nr_rotated; | 1656 | reclaim_stat->recent_rotated[file] += nr_rotated; |
| 1642 | 1657 | ||
| 1643 | move_active_pages_to_lru(zone, &l_active, | 1658 | move_active_pages_to_lru(zone, &l_active, |
| 1644 | LRU_ACTIVE + file * LRU_FILE); | 1659 | LRU_ACTIVE + file * LRU_FILE); |
| 1645 | move_active_pages_to_lru(zone, &l_inactive, | 1660 | move_active_pages_to_lru(zone, &l_inactive, |
| 1646 | LRU_BASE + file * LRU_FILE); | 1661 | LRU_BASE + file * LRU_FILE); |
| 1647 | __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); | 1662 | __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); |
| 1648 | spin_unlock_irq(&zone->lru_lock); | 1663 | spin_unlock_irq(&zone->lru_lock); |
| 1649 | } | 1664 | } |
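For readers following shrink_active_list() above, the decision made for each isolated page boils down to a three-way classification: unevictable pages go back via putback_lru_page(), referenced executable file pages (page_referenced() plus VM_EXEC plus page_is_file_cache()) get another trip around the active list, and everything else is deactivated. The sketch below is a standalone userspace illustration of just that decision; the names classify, was_referenced, is_exec and is_file are placeholders for the kernel checks, not real APIs.

#include <stdio.h>
#include <stdbool.h>

enum dest { DEST_UNEVICTABLE, DEST_ACTIVE, DEST_INACTIVE };

/* Mirror of the classification in shrink_active_list(): referenced,
 * executable file pages get another trip around the active list;
 * everything else evictable is deactivated. */
static enum dest classify(bool evictable, bool was_referenced,
                          bool is_exec, bool is_file)
{
        if (!evictable)
                return DEST_UNEVICTABLE;        /* putback_lru_page() path */
        if (was_referenced && is_exec && is_file)
                return DEST_ACTIVE;             /* l_active */
        return DEST_INACTIVE;                   /* l_inactive */
}

int main(void)
{
        /* a referenced anonymous page is still deactivated */
        printf("%d\n", classify(true, true, false, false)); /* prints 2 */
        /* a referenced executable text page stays active */
        printf("%d\n", classify(true, true, true, true));   /* prints 1 */
        return 0;
}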
| 1650 | 1665 | ||
| 1651 | #ifdef CONFIG_SWAP | 1666 | #ifdef CONFIG_SWAP |
| 1652 | static int inactive_anon_is_low_global(struct zone *zone) | 1667 | static int inactive_anon_is_low_global(struct zone *zone) |
| 1653 | { | 1668 | { |
| 1654 | unsigned long active, inactive; | 1669 | unsigned long active, inactive; |
| 1655 | 1670 | ||
| 1656 | active = zone_page_state(zone, NR_ACTIVE_ANON); | 1671 | active = zone_page_state(zone, NR_ACTIVE_ANON); |
| 1657 | inactive = zone_page_state(zone, NR_INACTIVE_ANON); | 1672 | inactive = zone_page_state(zone, NR_INACTIVE_ANON); |
| 1658 | 1673 | ||
| 1659 | if (inactive * zone->inactive_ratio < active) | 1674 | if (inactive * zone->inactive_ratio < active) |
| 1660 | return 1; | 1675 | return 1; |
| 1661 | 1676 | ||
| 1662 | return 0; | 1677 | return 0; |
| 1663 | } | 1678 | } |
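As a quick worked example of the check above: taking zone->inactive_ratio == 3 purely for illustration (the real ratio is computed elsewhere from the zone size), an anon LRU with 3000 active and 900 inactive pages satisfies 900 * 3 < 3000, so the zone is reported as low on inactive anon and the active anon list will be aged. A minimal userspace sketch of the same arithmetic:

#include <stdio.h>

/* Same test as inactive_anon_is_low_global(): the inactive anon list is
 * "low" while inactive * ratio is still smaller than active. */
static int inactive_anon_low(unsigned long active, unsigned long inactive,
                             unsigned int inactive_ratio)
{
        return inactive * inactive_ratio < active;
}

int main(void)
{
        printf("%d\n", inactive_anon_low(3000, 900, 3));  /* 1: deactivate */
        printf("%d\n", inactive_anon_low(3000, 1100, 3)); /* 0: balanced  */
        return 0;
}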
| 1664 | 1679 | ||
| 1665 | /** | 1680 | /** |
| 1666 | * inactive_anon_is_low - check if anonymous pages need to be deactivated | 1681 | * inactive_anon_is_low - check if anonymous pages need to be deactivated |
| 1667 | * @zone: zone to check | 1682 | * @zone: zone to check |
| 1668 | * @sc: scan control of this context | 1683 | * @sc: scan control of this context |
| 1669 | * | 1684 | * |
| 1670 | * Returns true if the zone does not have enough inactive anon pages, | 1685 | * Returns true if the zone does not have enough inactive anon pages, |
| 1671 | * meaning some active anon pages need to be deactivated. | 1686 | * meaning some active anon pages need to be deactivated. |
| 1672 | */ | 1687 | */ |
| 1673 | static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc) | 1688 | static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc) |
| 1674 | { | 1689 | { |
| 1675 | int low; | 1690 | int low; |
| 1676 | 1691 | ||
| 1677 | /* | 1692 | /* |
| 1678 | * If we don't have swap space, anonymous page deactivation | 1693 | * If we don't have swap space, anonymous page deactivation |
| 1679 | * is pointless. | 1694 | * is pointless. |
| 1680 | */ | 1695 | */ |
| 1681 | if (!total_swap_pages) | 1696 | if (!total_swap_pages) |
| 1682 | return 0; | 1697 | return 0; |
| 1683 | 1698 | ||
| 1684 | if (scanning_global_lru(sc)) | 1699 | if (scanning_global_lru(sc)) |
| 1685 | low = inactive_anon_is_low_global(zone); | 1700 | low = inactive_anon_is_low_global(zone); |
| 1686 | else | 1701 | else |
| 1687 | low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup); | 1702 | low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup); |
| 1688 | return low; | 1703 | return low; |
| 1689 | } | 1704 | } |
| 1690 | #else | 1705 | #else |
| 1691 | static inline int inactive_anon_is_low(struct zone *zone, | 1706 | static inline int inactive_anon_is_low(struct zone *zone, |
| 1692 | struct scan_control *sc) | 1707 | struct scan_control *sc) |
| 1693 | { | 1708 | { |
| 1694 | return 0; | 1709 | return 0; |
| 1695 | } | 1710 | } |
| 1696 | #endif | 1711 | #endif |
| 1697 | 1712 | ||
| 1698 | static int inactive_file_is_low_global(struct zone *zone) | 1713 | static int inactive_file_is_low_global(struct zone *zone) |
| 1699 | { | 1714 | { |
| 1700 | unsigned long active, inactive; | 1715 | unsigned long active, inactive; |
| 1701 | 1716 | ||
| 1702 | active = zone_page_state(zone, NR_ACTIVE_FILE); | 1717 | active = zone_page_state(zone, NR_ACTIVE_FILE); |
| 1703 | inactive = zone_page_state(zone, NR_INACTIVE_FILE); | 1718 | inactive = zone_page_state(zone, NR_INACTIVE_FILE); |
| 1704 | 1719 | ||
| 1705 | return (active > inactive); | 1720 | return (active > inactive); |
| 1706 | } | 1721 | } |
| 1707 | 1722 | ||
| 1708 | /** | 1723 | /** |
| 1709 | * inactive_file_is_low - check if file pages need to be deactivated | 1724 | * inactive_file_is_low - check if file pages need to be deactivated |
| 1710 | * @zone: zone to check | 1725 | * @zone: zone to check |
| 1711 | * @sc: scan control of this context | 1726 | * @sc: scan control of this context |
| 1712 | * | 1727 | * |
| 1713 | * When the system is doing streaming IO, memory pressure here | 1728 | * When the system is doing streaming IO, memory pressure here |
| 1714 | * ensures that active file pages get deactivated, until more | 1729 | * ensures that active file pages get deactivated, until more |
| 1715 | * than half of the file pages are on the inactive list. | 1730 | * than half of the file pages are on the inactive list. |
| 1716 | * | 1731 | * |
| 1717 | * Once we get to that situation, protect the system's working | 1732 | * Once we get to that situation, protect the system's working |
| 1718 | * set from being evicted by disabling active file page aging. | 1733 | * set from being evicted by disabling active file page aging. |
| 1719 | * | 1734 | * |
| 1720 | * This uses a different ratio than the anonymous pages, because | 1735 | * This uses a different ratio than the anonymous pages, because |
| 1721 | * the page cache uses a use-once replacement algorithm. | 1736 | * the page cache uses a use-once replacement algorithm. |
| 1722 | */ | 1737 | */ |
| 1723 | static int inactive_file_is_low(struct zone *zone, struct scan_control *sc) | 1738 | static int inactive_file_is_low(struct zone *zone, struct scan_control *sc) |
| 1724 | { | 1739 | { |
| 1725 | int low; | 1740 | int low; |
| 1726 | 1741 | ||
| 1727 | if (scanning_global_lru(sc)) | 1742 | if (scanning_global_lru(sc)) |
| 1728 | low = inactive_file_is_low_global(zone); | 1743 | low = inactive_file_is_low_global(zone); |
| 1729 | else | 1744 | else |
| 1730 | low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup); | 1745 | low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup); |
| 1731 | return low; | 1746 | return low; |
| 1732 | } | 1747 | } |
| 1733 | 1748 | ||
| 1734 | static int inactive_list_is_low(struct zone *zone, struct scan_control *sc, | 1749 | static int inactive_list_is_low(struct zone *zone, struct scan_control *sc, |
| 1735 | int file) | 1750 | int file) |
| 1736 | { | 1751 | { |
| 1737 | if (file) | 1752 | if (file) |
| 1738 | return inactive_file_is_low(zone, sc); | 1753 | return inactive_file_is_low(zone, sc); |
| 1739 | else | 1754 | else |
| 1740 | return inactive_anon_is_low(zone, sc); | 1755 | return inactive_anon_is_low(zone, sc); |
| 1741 | } | 1756 | } |
| 1742 | 1757 | ||
| 1743 | static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, | 1758 | static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, |
| 1744 | struct zone *zone, struct scan_control *sc, int priority) | 1759 | struct zone *zone, struct scan_control *sc, int priority) |
| 1745 | { | 1760 | { |
| 1746 | int file = is_file_lru(lru); | 1761 | int file = is_file_lru(lru); |
| 1747 | 1762 | ||
| 1748 | if (is_active_lru(lru)) { | 1763 | if (is_active_lru(lru)) { |
| 1749 | if (inactive_list_is_low(zone, sc, file)) | 1764 | if (inactive_list_is_low(zone, sc, file)) |
| 1750 | shrink_active_list(nr_to_scan, zone, sc, priority, file); | 1765 | shrink_active_list(nr_to_scan, zone, sc, priority, file); |
| 1751 | return 0; | 1766 | return 0; |
| 1752 | } | 1767 | } |
| 1753 | 1768 | ||
| 1754 | return shrink_inactive_list(nr_to_scan, zone, sc, priority, file); | 1769 | return shrink_inactive_list(nr_to_scan, zone, sc, priority, file); |
| 1755 | } | 1770 | } |
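The dispatch above has one subtlety worth spelling out: scanning an active list never reclaims anything directly, it only demotes pages, so shrink_list() reports 0 reclaimed for active LRUs and only inactive-list scans contribute to nr_reclaimed. A small sketch of that contract, with shrink_active()/shrink_inactive() as made-up stand-in callbacks rather than the real kernel helpers:

#include <stdio.h>

/* Stand-ins for the real list shrinkers: aging frees nothing,
 * inactive scanning returns the number of pages it reclaimed. */
static unsigned long shrink_active(unsigned long nr)   { (void)nr; return 0; }
static unsigned long shrink_inactive(unsigned long nr) { return nr / 2; }

static unsigned long shrink_list_sketch(int is_active, int inactive_is_low,
                                        unsigned long nr_to_scan)
{
        if (is_active) {
                if (inactive_is_low)
                        shrink_active(nr_to_scan); /* demote only */
                return 0;                          /* never counts as reclaimed */
        }
        return shrink_inactive(nr_to_scan);
}

int main(void)
{
        printf("%lu\n", shrink_list_sketch(1, 1, 32)); /* 0  */
        printf("%lu\n", shrink_list_sketch(0, 0, 32)); /* 16 */
        return 0;
}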
| 1756 | 1771 | ||
| 1757 | /* | 1772 | /* |
| 1758 | * Determine how aggressively the anon and file LRU lists should be | 1773 | * Determine how aggressively the anon and file LRU lists should be |
| 1759 | * scanned. The relative value of each set of LRU lists is determined | 1774 | * scanned. The relative value of each set of LRU lists is determined |
| 1760 | * by looking at the fraction of the pages scanned we did rotate back | 1775 | * by looking at the fraction of the pages scanned we did rotate back |
| 1761 | * onto the active list instead of evict. | 1776 | * onto the active list instead of evict. |
| 1762 | * | 1777 | * |
| 1763 | * nr[0] = anon pages to scan; nr[1] = file pages to scan | 1778 | * nr[0] = anon pages to scan; nr[1] = file pages to scan |
| 1764 | */ | 1779 | */ |
| 1765 | static void get_scan_count(struct zone *zone, struct scan_control *sc, | 1780 | static void get_scan_count(struct zone *zone, struct scan_control *sc, |
| 1766 | unsigned long *nr, int priority) | 1781 | unsigned long *nr, int priority) |
| 1767 | { | 1782 | { |
| 1768 | unsigned long anon, file, free; | 1783 | unsigned long anon, file, free; |
| 1769 | unsigned long anon_prio, file_prio; | 1784 | unsigned long anon_prio, file_prio; |
| 1770 | unsigned long ap, fp; | 1785 | unsigned long ap, fp; |
| 1771 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); | 1786 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); |
| 1772 | u64 fraction[2], denominator; | 1787 | u64 fraction[2], denominator; |
| 1773 | enum lru_list l; | 1788 | enum lru_list l; |
| 1774 | int noswap = 0; | 1789 | int noswap = 0; |
| 1775 | int force_scan = 0; | 1790 | int force_scan = 0; |
| 1776 | 1791 | ||
| 1777 | 1792 | ||
| 1778 | anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + | 1793 | anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + |
| 1779 | zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); | 1794 | zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); |
| 1780 | file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + | 1795 | file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + |
| 1781 | zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); | 1796 | zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); |
| 1782 | 1797 | ||
| 1783 | if (((anon + file) >> priority) < SWAP_CLUSTER_MAX) { | 1798 | if (((anon + file) >> priority) < SWAP_CLUSTER_MAX) { |
| 1784 | /* kswapd does zone balancing and needs to scan this zone */ | 1799 | /* kswapd does zone balancing and needs to scan this zone */ |

| 1785 | if (scanning_global_lru(sc) && current_is_kswapd()) | 1800 | if (scanning_global_lru(sc) && current_is_kswapd()) |
| 1786 | force_scan = 1; | 1801 | force_scan = 1; |
| 1787 | /* memcg may have small limit and need to avoid priority drop */ | 1802 | /* memcg may have small limit and need to avoid priority drop */ |
| 1788 | if (!scanning_global_lru(sc)) | 1803 | if (!scanning_global_lru(sc)) |
| 1789 | force_scan = 1; | 1804 | force_scan = 1; |
| 1790 | } | 1805 | } |
| 1791 | 1806 | ||
| 1792 | /* If we have no swap space, do not bother scanning anon pages. */ | 1807 | /* If we have no swap space, do not bother scanning anon pages. */ |
| 1793 | if (!sc->may_swap || (nr_swap_pages <= 0)) { | 1808 | if (!sc->may_swap || (nr_swap_pages <= 0)) { |
| 1794 | noswap = 1; | 1809 | noswap = 1; |
| 1795 | fraction[0] = 0; | 1810 | fraction[0] = 0; |
| 1796 | fraction[1] = 1; | 1811 | fraction[1] = 1; |
| 1797 | denominator = 1; | 1812 | denominator = 1; |
| 1798 | goto out; | 1813 | goto out; |
| 1799 | } | 1814 | } |
| 1800 | 1815 | ||
| 1801 | if (scanning_global_lru(sc)) { | 1816 | if (scanning_global_lru(sc)) { |
| 1802 | free = zone_page_state(zone, NR_FREE_PAGES); | 1817 | free = zone_page_state(zone, NR_FREE_PAGES); |
| 1803 | /* If we have very few page cache pages, | 1818 | /* If we have very few page cache pages, |
| 1804 | force-scan anon pages. */ | 1819 | force-scan anon pages. */ |
| 1805 | if (unlikely(file + free <= high_wmark_pages(zone))) { | 1820 | if (unlikely(file + free <= high_wmark_pages(zone))) { |
| 1806 | fraction[0] = 1; | 1821 | fraction[0] = 1; |
| 1807 | fraction[1] = 0; | 1822 | fraction[1] = 0; |
| 1808 | denominator = 1; | 1823 | denominator = 1; |
| 1809 | goto out; | 1824 | goto out; |
| 1810 | } | 1825 | } |
| 1811 | } | 1826 | } |
| 1812 | 1827 | ||
| 1813 | /* | 1828 | /* |
| 1814 | * With swappiness at 100, anonymous and file have the same priority. | 1829 | * With swappiness at 100, anonymous and file have the same priority. |
| 1815 | * This scanning priority is essentially the inverse of IO cost. | 1830 | * This scanning priority is essentially the inverse of IO cost. |
| 1816 | */ | 1831 | */ |
| 1817 | anon_prio = sc->swappiness; | 1832 | anon_prio = sc->swappiness; |
| 1818 | file_prio = 200 - sc->swappiness; | 1833 | file_prio = 200 - sc->swappiness; |
| 1819 | 1834 | ||
| 1820 | /* | 1835 | /* |
| 1821 | * OK, so we have swap space and a fair amount of page cache | 1836 | * OK, so we have swap space and a fair amount of page cache |
| 1822 | * pages. We use the recently rotated / recently scanned | 1837 | * pages. We use the recently rotated / recently scanned |
| 1823 | * ratios to determine how valuable each cache is. | 1838 | * ratios to determine how valuable each cache is. |
| 1824 | * | 1839 | * |
| 1825 | * Because workloads change over time (and to avoid overflow) | 1840 | * Because workloads change over time (and to avoid overflow) |
| 1826 | * we keep these statistics as a floating average, which ends | 1841 | * we keep these statistics as a floating average, which ends |
| 1827 | * up weighing recent references more than old ones. | 1842 | * up weighing recent references more than old ones. |
| 1828 | * | 1843 | * |
| 1829 | * anon in [0], file in [1] | 1844 | * anon in [0], file in [1] |
| 1830 | */ | 1845 | */ |
| 1831 | spin_lock_irq(&zone->lru_lock); | 1846 | spin_lock_irq(&zone->lru_lock); |
| 1832 | if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { | 1847 | if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { |
| 1833 | reclaim_stat->recent_scanned[0] /= 2; | 1848 | reclaim_stat->recent_scanned[0] /= 2; |
| 1834 | reclaim_stat->recent_rotated[0] /= 2; | 1849 | reclaim_stat->recent_rotated[0] /= 2; |
| 1835 | } | 1850 | } |
| 1836 | 1851 | ||
| 1837 | if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) { | 1852 | if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) { |
| 1838 | reclaim_stat->recent_scanned[1] /= 2; | 1853 | reclaim_stat->recent_scanned[1] /= 2; |
| 1839 | reclaim_stat->recent_rotated[1] /= 2; | 1854 | reclaim_stat->recent_rotated[1] /= 2; |
| 1840 | } | 1855 | } |
| 1841 | 1856 | ||
| 1842 | /* | 1857 | /* |
| 1843 | * The amount of pressure on anon vs file pages is inversely | 1858 | * The amount of pressure on anon vs file pages is inversely |
| 1844 | * proportional to the fraction of recently scanned pages on | 1859 | * proportional to the fraction of recently scanned pages on |
| 1845 | * each list that were recently referenced and in active use. | 1860 | * each list that were recently referenced and in active use. |
| 1846 | */ | 1861 | */ |
| 1847 | ap = (anon_prio + 1) * (reclaim_stat->recent_scanned[0] + 1); | 1862 | ap = (anon_prio + 1) * (reclaim_stat->recent_scanned[0] + 1); |
| 1848 | ap /= reclaim_stat->recent_rotated[0] + 1; | 1863 | ap /= reclaim_stat->recent_rotated[0] + 1; |
| 1849 | 1864 | ||
| 1850 | fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1); | 1865 | fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1); |
| 1851 | fp /= reclaim_stat->recent_rotated[1] + 1; | 1866 | fp /= reclaim_stat->recent_rotated[1] + 1; |
| 1852 | spin_unlock_irq(&zone->lru_lock); | 1867 | spin_unlock_irq(&zone->lru_lock); |
| 1853 | 1868 | ||
| 1854 | fraction[0] = ap; | 1869 | fraction[0] = ap; |
| 1855 | fraction[1] = fp; | 1870 | fraction[1] = fp; |
| 1856 | denominator = ap + fp + 1; | 1871 | denominator = ap + fp + 1; |
| 1857 | out: | 1872 | out: |
| 1858 | for_each_evictable_lru(l) { | 1873 | for_each_evictable_lru(l) { |
| 1859 | int file = is_file_lru(l); | 1874 | int file = is_file_lru(l); |
| 1860 | unsigned long scan; | 1875 | unsigned long scan; |
| 1861 | 1876 | ||
| 1862 | scan = zone_nr_lru_pages(zone, sc, l); | 1877 | scan = zone_nr_lru_pages(zone, sc, l); |
| 1863 | if (priority || noswap) { | 1878 | if (priority || noswap) { |
| 1864 | scan >>= priority; | 1879 | scan >>= priority; |
| 1865 | scan = div64_u64(scan * fraction[file], denominator); | 1880 | scan = div64_u64(scan * fraction[file], denominator); |
| 1866 | } | 1881 | } |
| 1867 | 1882 | ||
| 1868 | /* | 1883 | /* |
| 1869 | * If zone is small or memcg is small, nr[l] can be 0. | 1884 | * If zone is small or memcg is small, nr[l] can be 0. |
| 1870 | * This results in no scanning at this priority and a priority drop. | 1885 | * This results in no scanning at this priority and a priority drop. |
| 1871 | * For global direct reclaim, it can visit next zone and tend | 1886 | * For global direct reclaim, it can visit next zone and tend |
| 1872 | * not to have problems. For global kswapd, it's for zone | 1887 | * not to have problems. For global kswapd, it's for zone |
| 1873 | * balancing and it needs to scan a small amount. When using | 1888 | * balancing and it needs to scan a small amount. When using |
| 1874 | * memcg, priority drop can cause big latency. So, it's better | 1889 | * memcg, priority drop can cause big latency. So, it's better |
| 1875 | * to scan small amount. See may_noscan above. | 1890 | * to scan small amount. See may_noscan above. |
| 1876 | */ | 1891 | */ |
| 1877 | if (!scan && force_scan) { | 1892 | if (!scan && force_scan) { |
| 1878 | if (file) | 1893 | if (file) |
| 1879 | scan = SWAP_CLUSTER_MAX; | 1894 | scan = SWAP_CLUSTER_MAX; |
| 1880 | else if (!noswap) | 1895 | else if (!noswap) |
| 1881 | scan = SWAP_CLUSTER_MAX; | 1896 | scan = SWAP_CLUSTER_MAX; |
| 1882 | } | 1897 | } |
| 1883 | nr[l] = scan; | 1898 | nr[l] = scan; |
| 1884 | } | 1899 | } |
| 1885 | } | 1900 | } |
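The fraction[] arithmetic in get_scan_count() is easiest to see with numbers plugged in. The sketch below is plain userspace C with made-up reclaim_stat counters, a swappiness of 60 picked as an example, and the four LRU lists collapsed into anon/file totals for brevity; it reproduces the ap/fp weighting and the final scan targets for one priority level. It is an illustration of the formula shown above, not the kernel function.

#include <stdio.h>

int main(void)
{
        /* hypothetical zone state */
        unsigned long anon = 200000, file = 600000;
        unsigned long recent_scanned[2] = { 40000, 150000 }; /* anon, file */
        unsigned long recent_rotated[2] = { 30000, 20000 };
        unsigned int swappiness = 60;
        int priority = 10;

        unsigned long anon_prio = swappiness;
        unsigned long file_prio = 200 - swappiness;

        /* same weighting as get_scan_count(): pressure on a list is inversely
         * proportional to how often its scanned pages were rotated back */
        unsigned long long ap = (anon_prio + 1) *
                (unsigned long long)(recent_scanned[0] + 1) / (recent_rotated[0] + 1);
        unsigned long long fp = (file_prio + 1) *
                (unsigned long long)(recent_scanned[1] + 1) / (recent_rotated[1] + 1);
        unsigned long long denominator = ap + fp + 1;

        /* scan = (list size >> priority) * fraction / denominator */
        unsigned long long scan_anon = ((anon >> priority) * ap) / denominator;
        unsigned long long scan_file = ((file >> priority) * fp) / denominator;

        printf("ap=%llu fp=%llu\n", ap, fp);
        printf("anon scan target: %llu\n", scan_anon);
        printf("file scan target: %llu\n", scan_file);
        return 0;
}

With these numbers the file list, which is rotating far fewer of its scanned pages, ends up with a much larger scan target than the anon list, which is the intended bias.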
| 1886 | 1901 | ||
| 1887 | /* | 1902 | /* |
| 1888 | * Reclaim/compaction depends on a number of pages being freed. To avoid | 1903 | * Reclaim/compaction depends on a number of pages being freed. To avoid |
| 1889 | * disruption to the system, a small number of order-0 pages continue to be | 1904 | * disruption to the system, a small number of order-0 pages continue to be |
| 1890 | * rotated and reclaimed in the normal fashion. However, by the time we get | 1905 | * rotated and reclaimed in the normal fashion. However, by the time we get |
| 1891 | * back to the allocator and call try_to_compact_zone(), we ensure that | 1906 | * back to the allocator and call try_to_compact_zone(), we ensure that |
| 1892 | * there are enough free pages for it to be likely successful | 1907 | * there are enough free pages for it to be likely successful |
| 1893 | */ | 1908 | */ |
| 1894 | static inline bool should_continue_reclaim(struct zone *zone, | 1909 | static inline bool should_continue_reclaim(struct zone *zone, |
| 1895 | unsigned long nr_reclaimed, | 1910 | unsigned long nr_reclaimed, |
| 1896 | unsigned long nr_scanned, | 1911 | unsigned long nr_scanned, |
| 1897 | struct scan_control *sc) | 1912 | struct scan_control *sc) |
| 1898 | { | 1913 | { |
| 1899 | unsigned long pages_for_compaction; | 1914 | unsigned long pages_for_compaction; |
| 1900 | unsigned long inactive_lru_pages; | 1915 | unsigned long inactive_lru_pages; |
| 1901 | 1916 | ||
| 1902 | /* If not in reclaim/compaction mode, stop */ | 1917 | /* If not in reclaim/compaction mode, stop */ |
| 1903 | if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION)) | 1918 | if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION)) |
| 1904 | return false; | 1919 | return false; |
| 1905 | 1920 | ||
| 1906 | /* Consider stopping depending on scan and reclaim activity */ | 1921 | /* Consider stopping depending on scan and reclaim activity */ |
| 1907 | if (sc->gfp_mask & __GFP_REPEAT) { | 1922 | if (sc->gfp_mask & __GFP_REPEAT) { |
| 1908 | /* | 1923 | /* |
| 1909 | * For __GFP_REPEAT allocations, stop reclaiming if the | 1924 | * For __GFP_REPEAT allocations, stop reclaiming if the |
| 1910 | * full LRU list has been scanned and we are still failing | 1925 | * full LRU list has been scanned and we are still failing |
| 1911 | * to reclaim pages. This full LRU scan is potentially | 1926 | * to reclaim pages. This full LRU scan is potentially |
| 1912 | * expensive but a __GFP_REPEAT caller really wants to succeed | 1927 | * expensive but a __GFP_REPEAT caller really wants to succeed |
| 1913 | */ | 1928 | */ |
| 1914 | if (!nr_reclaimed && !nr_scanned) | 1929 | if (!nr_reclaimed && !nr_scanned) |
| 1915 | return false; | 1930 | return false; |
| 1916 | } else { | 1931 | } else { |
| 1917 | /* | 1932 | /* |
| 1918 | * For non-__GFP_REPEAT allocations which can presumably | 1933 | * For non-__GFP_REPEAT allocations which can presumably |
| 1919 | * fail without consequence, stop if we failed to reclaim | 1934 | * fail without consequence, stop if we failed to reclaim |
| 1920 | * any pages from the last SWAP_CLUSTER_MAX number of | 1935 | * any pages from the last SWAP_CLUSTER_MAX number of |
| 1921 | * pages that were scanned. This will return to the | 1936 | * pages that were scanned. This will return to the |
| 1922 | * caller faster at the risk reclaim/compaction and | 1937 | * caller faster at the risk reclaim/compaction and |
| 1923 | * the resulting allocation attempt fails | 1938 | * the resulting allocation attempt fails |
| 1924 | */ | 1939 | */ |
| 1925 | if (!nr_reclaimed) | 1940 | if (!nr_reclaimed) |
| 1926 | return false; | 1941 | return false; |
| 1927 | } | 1942 | } |
| 1928 | 1943 | ||
| 1929 | /* | 1944 | /* |
| 1930 | * If we have not reclaimed enough pages for compaction and the | 1945 | * If we have not reclaimed enough pages for compaction and the |
| 1931 | * inactive lists are large enough, continue reclaiming | 1946 | * inactive lists are large enough, continue reclaiming |
| 1932 | */ | 1947 | */ |
| 1933 | pages_for_compaction = (2UL << sc->order); | 1948 | pages_for_compaction = (2UL << sc->order); |
| 1934 | inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) + | 1949 | inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) + |
| 1935 | zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); | 1950 | zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); |
| 1936 | if (sc->nr_reclaimed < pages_for_compaction && | 1951 | if (sc->nr_reclaimed < pages_for_compaction && |
| 1937 | inactive_lru_pages > pages_for_compaction) | 1952 | inactive_lru_pages > pages_for_compaction) |
| 1938 | return true; | 1953 | return true; |
| 1939 | 1954 | ||
| 1940 | /* If compaction would go ahead or the allocation would succeed, stop */ | 1955 | /* If compaction would go ahead or the allocation would succeed, stop */ |
| 1941 | switch (compaction_suitable(zone, sc->order)) { | 1956 | switch (compaction_suitable(zone, sc->order)) { |
| 1942 | case COMPACT_PARTIAL: | 1957 | case COMPACT_PARTIAL: |
| 1943 | case COMPACT_CONTINUE: | 1958 | case COMPACT_CONTINUE: |
| 1944 | return false; | 1959 | return false; |
| 1945 | default: | 1960 | default: |
| 1946 | return true; | 1961 | return true; |
| 1947 | } | 1962 | } |
| 1948 | } | 1963 | } |
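To make the 2UL << sc->order threshold above concrete: an order-3 request asks for at least 16 reclaimed pages before handing over to compaction, and an order-9 request (a 2MB huge page with 4KB base pages) asks for 1024. The tiny sketch below covers just that part of the decision and deliberately leaves out the compaction_suitable() escape hatch; keep_reclaiming is an illustrative name, not a kernel function.

#include <stdio.h>
#include <stdbool.h>

/* Keep reclaiming while we are short of 2 << order pages and the
 * inactive lists could still plausibly supply them. */
static bool keep_reclaiming(unsigned int order, unsigned long nr_reclaimed,
                            unsigned long inactive_lru_pages)
{
        unsigned long pages_for_compaction = 2UL << order;

        return nr_reclaimed < pages_for_compaction &&
               inactive_lru_pages > pages_for_compaction;
}

int main(void)
{
        printf("order 3, 10 reclaimed, 5000 inactive -> %d\n",
               keep_reclaiming(3, 10, 5000));   /* 1: 10 < 16    */
        printf("order 9, 900 reclaimed, 5000 inactive -> %d\n",
               keep_reclaiming(9, 900, 5000));  /* 1: 900 < 1024 */
        printf("order 9, 1100 reclaimed, 5000 inactive -> %d\n",
               keep_reclaiming(9, 1100, 5000)); /* 0             */
        return 0;
}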
| 1949 | 1964 | ||
| 1950 | /* | 1965 | /* |
| 1951 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. | 1966 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. |
| 1952 | */ | 1967 | */ |
| 1953 | static void shrink_zone(int priority, struct zone *zone, | 1968 | static void shrink_zone(int priority, struct zone *zone, |
| 1954 | struct scan_control *sc) | 1969 | struct scan_control *sc) |
| 1955 | { | 1970 | { |
| 1956 | unsigned long nr[NR_LRU_LISTS]; | 1971 | unsigned long nr[NR_LRU_LISTS]; |
| 1957 | unsigned long nr_to_scan; | 1972 | unsigned long nr_to_scan; |
| 1958 | enum lru_list l; | 1973 | enum lru_list l; |
| 1959 | unsigned long nr_reclaimed, nr_scanned; | 1974 | unsigned long nr_reclaimed, nr_scanned; |
| 1960 | unsigned long nr_to_reclaim = sc->nr_to_reclaim; | 1975 | unsigned long nr_to_reclaim = sc->nr_to_reclaim; |
| 1961 | 1976 | ||
| 1962 | restart: | 1977 | restart: |
| 1963 | nr_reclaimed = 0; | 1978 | nr_reclaimed = 0; |
| 1964 | nr_scanned = sc->nr_scanned; | 1979 | nr_scanned = sc->nr_scanned; |
| 1965 | get_scan_count(zone, sc, nr, priority); | 1980 | get_scan_count(zone, sc, nr, priority); |
| 1966 | 1981 | ||
| 1967 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || | 1982 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || |
| 1968 | nr[LRU_INACTIVE_FILE]) { | 1983 | nr[LRU_INACTIVE_FILE]) { |
| 1969 | for_each_evictable_lru(l) { | 1984 | for_each_evictable_lru(l) { |
| 1970 | if (nr[l]) { | 1985 | if (nr[l]) { |
| 1971 | nr_to_scan = min_t(unsigned long, | 1986 | nr_to_scan = min_t(unsigned long, |
| 1972 | nr[l], SWAP_CLUSTER_MAX); | 1987 | nr[l], SWAP_CLUSTER_MAX); |
| 1973 | nr[l] -= nr_to_scan; | 1988 | nr[l] -= nr_to_scan; |
| 1974 | 1989 | ||
| 1975 | nr_reclaimed += shrink_list(l, nr_to_scan, | 1990 | nr_reclaimed += shrink_list(l, nr_to_scan, |
| 1976 | zone, sc, priority); | 1991 | zone, sc, priority); |
| 1977 | } | 1992 | } |
| 1978 | } | 1993 | } |
| 1979 | /* | 1994 | /* |
| 1980 | * On large memory systems, scan >> priority can become | 1995 | * On large memory systems, scan >> priority can become |
| 1981 | * really large. This is fine for the starting priority; | 1996 | * really large. This is fine for the starting priority; |
| 1982 | * we want to put equal scanning pressure on each zone. | 1997 | * we want to put equal scanning pressure on each zone. |
| 1983 | * However, if the VM has a harder time of freeing pages, | 1998 | * However, if the VM has a harder time of freeing pages, |
| 1984 | * with multiple processes reclaiming pages, the total | 1999 | * with multiple processes reclaiming pages, the total |
| 1985 | * freeing target can get unreasonably large. | 2000 | * freeing target can get unreasonably large. |
| 1986 | */ | 2001 | */ |
| 1987 | if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) | 2002 | if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) |
| 1988 | break; | 2003 | break; |
| 1989 | } | 2004 | } |
| 1990 | sc->nr_reclaimed += nr_reclaimed; | 2005 | sc->nr_reclaimed += nr_reclaimed; |
| 1991 | 2006 | ||
| 1992 | /* | 2007 | /* |
| 1993 | * Even if we did not try to evict anon pages at all, we want to | 2008 | * Even if we did not try to evict anon pages at all, we want to |
| 1994 | * rebalance the anon lru active/inactive ratio. | 2009 | * rebalance the anon lru active/inactive ratio. |
| 1995 | */ | 2010 | */ |
| 1996 | if (inactive_anon_is_low(zone, sc)) | 2011 | if (inactive_anon_is_low(zone, sc)) |
| 1997 | shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); | 2012 | shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); |
| 1998 | 2013 | ||
| 1999 | /* reclaim/compaction might need reclaim to continue */ | 2014 | /* reclaim/compaction might need reclaim to continue */ |
| 2000 | if (should_continue_reclaim(zone, nr_reclaimed, | 2015 | if (should_continue_reclaim(zone, nr_reclaimed, |
| 2001 | sc->nr_scanned - nr_scanned, sc)) | 2016 | sc->nr_scanned - nr_scanned, sc)) |
| 2002 | goto restart; | 2017 | goto restart; |
| 2003 | 2018 | ||
| 2004 | throttle_vm_writeout(sc->gfp_mask); | 2019 | throttle_vm_writeout(sc->gfp_mask); |
| 2005 | } | 2020 | } |
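The loop structure in shrink_zone() is a round-robin batcher: each pass takes at most SWAP_CLUSTER_MAX pages off each LRU's remaining quota so no single list monopolises a reclaim pass, and once the zone has met nr_to_reclaim at any priority below DEF_PRIORITY it stops early. Below is a userspace sketch of that batching only; the per-list work is faked as decremented counters, three lists stand in for the evictable LRUs, and the "half got freed" rate is invented for the example.

#include <stdio.h>

#define SWAP_CLUSTER_MAX 32UL
#define NR_LISTS 3      /* inactive anon, active file, inactive file */

int main(void)
{
        unsigned long nr[NR_LISTS] = { 100, 40, 250 }; /* per-list quotas */
        unsigned long nr_to_reclaim = 64, nr_reclaimed = 0;
        int below_def_priority = 1;
        int passes = 0;

        while (nr[0] || nr[1] || nr[2]) {
                for (int l = 0; l < NR_LISTS; l++) {
                        if (!nr[l])
                                continue;
                        unsigned long batch = nr[l] < SWAP_CLUSTER_MAX ?
                                              nr[l] : SWAP_CLUSTER_MAX;
                        nr[l] -= batch;
                        nr_reclaimed += batch / 2;  /* pretend half got freed */
                }
                passes++;
                /* early exit mirrors the check at the bottom of the loop */
                if (nr_reclaimed >= nr_to_reclaim && below_def_priority)
                        break;
        }
        printf("passes=%d reclaimed=%lu leftover=%lu/%lu/%lu\n",
               passes, nr_reclaimed, nr[0], nr[1], nr[2]);
        return 0;
}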
| 2006 | 2021 | ||
| 2007 | /* | 2022 | /* |
| 2008 | * This is the direct reclaim path, for page-allocating processes. We only | 2023 | * This is the direct reclaim path, for page-allocating processes. We only |
| 2009 | * try to reclaim pages from zones which will satisfy the caller's allocation | 2024 | * try to reclaim pages from zones which will satisfy the caller's allocation |
| 2010 | * request. | 2025 | * request. |
| 2011 | * | 2026 | * |
| 2012 | * We reclaim from a zone even if that zone is over high_wmark_pages(zone). | 2027 | * We reclaim from a zone even if that zone is over high_wmark_pages(zone). |
| 2013 | * Because: | 2028 | * Because: |
| 2014 | * a) The caller may be trying to free *extra* pages to satisfy a higher-order | 2029 | * a) The caller may be trying to free *extra* pages to satisfy a higher-order |
| 2015 | * allocation or | 2030 | * allocation or |
| 2016 | * b) The target zone may be at high_wmark_pages(zone) but the lower zones | 2031 | * b) The target zone may be at high_wmark_pages(zone) but the lower zones |
| 2017 | * must go *over* high_wmark_pages(zone) to satisfy the `incremental min' | 2032 | * must go *over* high_wmark_pages(zone) to satisfy the `incremental min' |
| 2018 | * zone defense algorithm. | 2033 | * zone defense algorithm. |
| 2019 | * | 2034 | * |
| 2020 | * If a zone is deemed to be full of pinned pages then just give it a light | 2035 | * If a zone is deemed to be full of pinned pages then just give it a light |
| 2021 | * scan then give up on it. | 2036 | * scan then give up on it. |
| 2022 | */ | 2037 | */ |
| 2023 | static void shrink_zones(int priority, struct zonelist *zonelist, | 2038 | static void shrink_zones(int priority, struct zonelist *zonelist, |
| 2024 | struct scan_control *sc) | 2039 | struct scan_control *sc) |
| 2025 | { | 2040 | { |
| 2026 | struct zoneref *z; | 2041 | struct zoneref *z; |
| 2027 | struct zone *zone; | 2042 | struct zone *zone; |
| 2028 | unsigned long nr_soft_reclaimed; | 2043 | unsigned long nr_soft_reclaimed; |
| 2029 | unsigned long nr_soft_scanned; | 2044 | unsigned long nr_soft_scanned; |
| 2030 | 2045 | ||
| 2031 | for_each_zone_zonelist_nodemask(zone, z, zonelist, | 2046 | for_each_zone_zonelist_nodemask(zone, z, zonelist, |
| 2032 | gfp_zone(sc->gfp_mask), sc->nodemask) { | 2047 | gfp_zone(sc->gfp_mask), sc->nodemask) { |
| 2033 | if (!populated_zone(zone)) | 2048 | if (!populated_zone(zone)) |
| 2034 | continue; | 2049 | continue; |
| 2035 | /* | 2050 | /* |
| 2036 | * Take care memory controller reclaiming has small influence | 2051 | * Take care memory controller reclaiming has small influence |
| 2037 | * to global LRU. | 2052 | * to global LRU. |
| 2038 | */ | 2053 | */ |
| 2039 | if (scanning_global_lru(sc)) { | 2054 | if (scanning_global_lru(sc)) { |
| 2040 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) | 2055 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) |
| 2041 | continue; | 2056 | continue; |
| 2042 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) | 2057 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) |
| 2043 | continue; /* Let kswapd poll it */ | 2058 | continue; /* Let kswapd poll it */ |
| 2044 | /* | 2059 | /* |
| 2045 | * This steals pages from memory cgroups over softlimit | 2060 | * This steals pages from memory cgroups over softlimit |
| 2046 | * and returns the number of reclaimed pages and | 2061 | * and returns the number of reclaimed pages and |
| 2047 | * scanned pages. This works for global memory pressure | 2062 | * scanned pages. This works for global memory pressure |
| 2048 | * and balancing, not for a memcg's limit. | 2063 | * and balancing, not for a memcg's limit. |
| 2049 | */ | 2064 | */ |
| 2050 | nr_soft_scanned = 0; | 2065 | nr_soft_scanned = 0; |
| 2051 | nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, | 2066 | nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, |
| 2052 | sc->order, sc->gfp_mask, | 2067 | sc->order, sc->gfp_mask, |
| 2053 | &nr_soft_scanned); | 2068 | &nr_soft_scanned); |
| 2054 | sc->nr_reclaimed += nr_soft_reclaimed; | 2069 | sc->nr_reclaimed += nr_soft_reclaimed; |
| 2055 | sc->nr_scanned += nr_soft_scanned; | 2070 | sc->nr_scanned += nr_soft_scanned; |
| 2056 | /* need some check to avoid more shrink_zone() */ | 2071 | /* need some check to avoid more shrink_zone() */ |
| 2057 | } | 2072 | } |
| 2058 | 2073 | ||
| 2059 | shrink_zone(priority, zone, sc); | 2074 | shrink_zone(priority, zone, sc); |
| 2060 | } | 2075 | } |
| 2061 | } | 2076 | } |
| 2062 | 2077 | ||
| 2063 | static bool zone_reclaimable(struct zone *zone) | 2078 | static bool zone_reclaimable(struct zone *zone) |
| 2064 | { | 2079 | { |
| 2065 | return zone->pages_scanned < zone_reclaimable_pages(zone) * 6; | 2080 | return zone->pages_scanned < zone_reclaimable_pages(zone) * 6; |
| 2066 | } | 2081 | } |
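The factor of six above is the give-up heuristic: a zone stays "reclaimable" until it has been scanned roughly six times over without enough progress. For example, a zone with 100000 reclaimable pages remains eligible until zone->pages_scanned reaches 600000. A minimal sketch with hypothetical numbers:

#include <stdio.h>
#include <stdbool.h>

/* Same heuristic as zone_reclaimable(): give up after ~6 full scans. */
static bool zone_reclaimable_sketch(unsigned long pages_scanned,
                                    unsigned long reclaimable_pages)
{
        return pages_scanned < reclaimable_pages * 6;
}

int main(void)
{
        printf("%d\n", zone_reclaimable_sketch(450000, 100000)); /* 1 */
        printf("%d\n", zone_reclaimable_sketch(600000, 100000)); /* 0 */
        return 0;
}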
| 2067 | 2082 | ||
| 2068 | /* All zones in zonelist are unreclaimable? */ | 2083 | /* All zones in zonelist are unreclaimable? */ |
| 2069 | static bool all_unreclaimable(struct zonelist *zonelist, | 2084 | static bool all_unreclaimable(struct zonelist *zonelist, |
| 2070 | struct scan_control *sc) | 2085 | struct scan_control *sc) |
| 2071 | { | 2086 | { |
| 2072 | struct zoneref *z; | 2087 | struct zoneref *z; |
| 2073 | struct zone *zone; | 2088 | struct zone *zone; |
| 2074 | 2089 | ||
| 2075 | for_each_zone_zonelist_nodemask(zone, z, zonelist, | 2090 | for_each_zone_zonelist_nodemask(zone, z, zonelist, |
| 2076 | gfp_zone(sc->gfp_mask), sc->nodemask) { | 2091 | gfp_zone(sc->gfp_mask), sc->nodemask) { |
| 2077 | if (!populated_zone(zone)) | 2092 | if (!populated_zone(zone)) |
| 2078 | continue; | 2093 | continue; |
| 2079 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) | 2094 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) |
| 2080 | continue; | 2095 | continue; |
| 2081 | if (!zone->all_unreclaimable) | 2096 | if (!zone->all_unreclaimable) |
| 2082 | return false; | 2097 | return false; |
| 2083 | } | 2098 | } |
| 2084 | 2099 | ||
| 2085 | return true; | 2100 | return true; |
| 2086 | } | 2101 | } |
| 2087 | 2102 | ||
| 2088 | /* | 2103 | /* |
| 2089 | * This is the main entry point to direct page reclaim. | 2104 | * This is the main entry point to direct page reclaim. |
| 2090 | * | 2105 | * |
| 2091 | * If a full scan of the inactive list fails to free enough memory then we | 2106 | * If a full scan of the inactive list fails to free enough memory then we |
| 2092 | * are "out of memory" and something needs to be killed. | 2107 | * are "out of memory" and something needs to be killed. |
| 2093 | * | 2108 | * |
| 2094 | * If the caller is !__GFP_FS then the probability of a failure is reasonably | 2109 | * If the caller is !__GFP_FS then the probability of a failure is reasonably |
| 2095 | * high - the zone may be full of dirty or under-writeback pages, which this | 2110 | * high - the zone may be full of dirty or under-writeback pages, which this |
| 2096 | * caller can't do much about. We kick the writeback threads and take explicit | 2111 | * caller can't do much about. We kick the writeback threads and take explicit |
| 2097 | * naps in the hope that some of these pages can be written. But if the | 2112 | * naps in the hope that some of these pages can be written. But if the |
| 2098 | * allocating task holds filesystem locks which prevent writeout this might not | 2113 | * allocating task holds filesystem locks which prevent writeout this might not |
| 2099 | * work, and the allocation attempt will fail. | 2114 | * work, and the allocation attempt will fail. |
| 2100 | * | 2115 | * |
| 2101 | * returns: 0, if no pages reclaimed | 2116 | * returns: 0, if no pages reclaimed |
| 2102 | * else, the number of pages reclaimed | 2117 | * else, the number of pages reclaimed |
| 2103 | */ | 2118 | */ |
| 2104 | static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | 2119 | static unsigned long do_try_to_free_pages(struct zonelist *zonelist, |
| 2105 | struct scan_control *sc, | 2120 | struct scan_control *sc, |
| 2106 | struct shrink_control *shrink) | 2121 | struct shrink_control *shrink) |
| 2107 | { | 2122 | { |
| 2108 | int priority; | 2123 | int priority; |
| 2109 | unsigned long total_scanned = 0; | 2124 | unsigned long total_scanned = 0; |
| 2110 | struct reclaim_state *reclaim_state = current->reclaim_state; | 2125 | struct reclaim_state *reclaim_state = current->reclaim_state; |
| 2111 | struct zoneref *z; | 2126 | struct zoneref *z; |
| 2112 | struct zone *zone; | 2127 | struct zone *zone; |
| 2113 | unsigned long writeback_threshold; | 2128 | unsigned long writeback_threshold; |
| 2114 | 2129 | ||
| 2115 | get_mems_allowed(); | 2130 | get_mems_allowed(); |
| 2116 | delayacct_freepages_start(); | 2131 | delayacct_freepages_start(); |
| 2117 | 2132 | ||
| 2118 | if (scanning_global_lru(sc)) | 2133 | if (scanning_global_lru(sc)) |
| 2119 | count_vm_event(ALLOCSTALL); | 2134 | count_vm_event(ALLOCSTALL); |
| 2120 | 2135 | ||
| 2121 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { | 2136 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { |
| 2122 | sc->nr_scanned = 0; | 2137 | sc->nr_scanned = 0; |
| 2123 | if (!priority) | 2138 | if (!priority) |
| 2124 | disable_swap_token(sc->mem_cgroup); | 2139 | disable_swap_token(sc->mem_cgroup); |
| 2125 | shrink_zones(priority, zonelist, sc); | 2140 | shrink_zones(priority, zonelist, sc); |
| 2126 | /* | 2141 | /* |
| 2127 | * Don't shrink slabs when reclaiming memory from | 2142 | * Don't shrink slabs when reclaiming memory from |
| 2128 | * over limit cgroups | 2143 | * over limit cgroups |
| 2129 | */ | 2144 | */ |
| 2130 | if (scanning_global_lru(sc)) { | 2145 | if (scanning_global_lru(sc)) { |
| 2131 | unsigned long lru_pages = 0; | 2146 | unsigned long lru_pages = 0; |
| 2132 | for_each_zone_zonelist(zone, z, zonelist, | 2147 | for_each_zone_zonelist(zone, z, zonelist, |
| 2133 | gfp_zone(sc->gfp_mask)) { | 2148 | gfp_zone(sc->gfp_mask)) { |
| 2134 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) | 2149 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) |
| 2135 | continue; | 2150 | continue; |
| 2136 | 2151 | ||
| 2137 | lru_pages += zone_reclaimable_pages(zone); | 2152 | lru_pages += zone_reclaimable_pages(zone); |
| 2138 | } | 2153 | } |
| 2139 | 2154 | ||
| 2140 | shrink_slab(shrink, sc->nr_scanned, lru_pages); | 2155 | shrink_slab(shrink, sc->nr_scanned, lru_pages); |
| 2141 | if (reclaim_state) { | 2156 | if (reclaim_state) { |
| 2142 | sc->nr_reclaimed += reclaim_state->reclaimed_slab; | 2157 | sc->nr_reclaimed += reclaim_state->reclaimed_slab; |
| 2143 | reclaim_state->reclaimed_slab = 0; | 2158 | reclaim_state->reclaimed_slab = 0; |
| 2144 | } | 2159 | } |
| 2145 | } | 2160 | } |
| 2146 | total_scanned += sc->nr_scanned; | 2161 | total_scanned += sc->nr_scanned; |
| 2147 | if (sc->nr_reclaimed >= sc->nr_to_reclaim) | 2162 | if (sc->nr_reclaimed >= sc->nr_to_reclaim) |
| 2148 | goto out; | 2163 | goto out; |
| 2149 | 2164 | ||
| 2150 | /* | 2165 | /* |
| 2151 | * Try to write back as many pages as we just scanned. This | 2166 | * Try to write back as many pages as we just scanned. This |
| 2152 | * tends to cause slow streaming writers to write data to the | 2167 | * tends to cause slow streaming writers to write data to the |
| 2153 | * disk smoothly, at the dirtying rate, which is nice. But | 2168 | * disk smoothly, at the dirtying rate, which is nice. But |
| 2154 | * that's undesirable in laptop mode, where we *want* lumpy | 2169 | * that's undesirable in laptop mode, where we *want* lumpy |
| 2155 | * writeout. So in laptop mode, write out the whole world. | 2170 | * writeout. So in laptop mode, write out the whole world. |
| 2156 | */ | 2171 | */ |
| 2157 | writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2; | 2172 | writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2; |
| 2158 | if (total_scanned > writeback_threshold) { | 2173 | if (total_scanned > writeback_threshold) { |
| 2159 | wakeup_flusher_threads(laptop_mode ? 0 : total_scanned); | 2174 | wakeup_flusher_threads(laptop_mode ? 0 : total_scanned); |
| 2160 | sc->may_writepage = 1; | 2175 | sc->may_writepage = 1; |
| 2161 | } | 2176 | } |
| 2162 | 2177 | ||
| 2163 | /* Take a nap, wait for some writeback to complete */ | 2178 | /* Take a nap, wait for some writeback to complete */ |
| 2164 | if (!sc->hibernation_mode && sc->nr_scanned && | 2179 | if (!sc->hibernation_mode && sc->nr_scanned && |
| 2165 | priority < DEF_PRIORITY - 2) { | 2180 | priority < DEF_PRIORITY - 2) { |
| 2166 | struct zone *preferred_zone; | 2181 | struct zone *preferred_zone; |
| 2167 | 2182 | ||
| 2168 | first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask), | 2183 | first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask), |
| 2169 | &cpuset_current_mems_allowed, | 2184 | &cpuset_current_mems_allowed, |
| 2170 | &preferred_zone); | 2185 | &preferred_zone); |
| 2171 | wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10); | 2186 | wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10); |
| 2172 | } | 2187 | } |
| 2173 | } | 2188 | } |
| 2174 | 2189 | ||
| 2175 | out: | 2190 | out: |
| 2176 | delayacct_freepages_end(); | 2191 | delayacct_freepages_end(); |
| 2177 | put_mems_allowed(); | 2192 | put_mems_allowed(); |
| 2178 | 2193 | ||
| 2179 | if (sc->nr_reclaimed) | 2194 | if (sc->nr_reclaimed) |
| 2180 | return sc->nr_reclaimed; | 2195 | return sc->nr_reclaimed; |
| 2181 | 2196 | ||
| 2182 | /* | 2197 | /* |
| 2183 | * As hibernation is going on, kswapd is frozen so that it can't mark | 2198 | * As hibernation is going on, kswapd is frozen so that it can't mark |
| 2184 | * the zone into all_unreclaimable. Thus bypassing all_unreclaimable | 2199 | * the zone into all_unreclaimable. Thus bypassing all_unreclaimable |
| 2185 | * check. | 2200 | * check. |
| 2186 | */ | 2201 | */ |
| 2187 | if (oom_killer_disabled) | 2202 | if (oom_killer_disabled) |
| 2188 | return 0; | 2203 | return 0; |
| 2189 | 2204 | ||
| 2190 | /* top priority shrink_zones still had more to do? don't OOM, then */ | 2205 | /* top priority shrink_zones still had more to do? don't OOM, then */ |
| 2191 | if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc)) | 2206 | if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc)) |
| 2192 | return 1; | 2207 | return 1; |
| 2193 | 2208 | ||
| 2194 | return 0; | 2209 | return 0; |
| 2195 | } | 2210 | } |
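One number in the priority loop above that is easy to gloss over is writeback_threshold: it is 1.5x the reclaim target, so taking a reclaim target of 32 pages as the example, the flusher threads are only woken once more than 48 pages have been scanned in total. A small sketch of that trigger, ignoring laptop_mode and the congestion nap; should_wake_flushers is an illustrative name only.

#include <stdio.h>
#include <stdbool.h>

static bool should_wake_flushers(unsigned long total_scanned,
                                 unsigned long nr_to_reclaim)
{
        /* writeback_threshold = nr_to_reclaim + nr_to_reclaim / 2 */
        unsigned long writeback_threshold = nr_to_reclaim + nr_to_reclaim / 2;

        return total_scanned > writeback_threshold;
}

int main(void)
{
        printf("%d\n", should_wake_flushers(40, 32)); /* 0: 40 <= 48 */
        printf("%d\n", should_wake_flushers(49, 32)); /* 1: 49 >  48 */
        return 0;
}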
| 2196 | 2211 | ||
| 2197 | unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | 2212 | unsigned long try_to_free_pages(struct zonelist *zonelist, int order, |
| 2198 | gfp_t gfp_mask, nodemask_t *nodemask) | 2213 | gfp_t gfp_mask, nodemask_t *nodemask) |
| 2199 | { | 2214 | { |
| 2200 | unsigned long nr_reclaimed; | 2215 | unsigned long nr_reclaimed; |
| 2201 | struct scan_control sc = { | 2216 | struct scan_control sc = { |
| 2202 | .gfp_mask = gfp_mask, | 2217 | .gfp_mask = gfp_mask, |
| 2203 | .may_writepage = !laptop_mode, | 2218 | .may_writepage = !laptop_mode, |
| 2204 | .nr_to_reclaim = SWAP_CLUSTER_MAX, | 2219 | .nr_to_reclaim = SWAP_CLUSTER_MAX, |
| 2205 | .may_unmap = 1, | 2220 | .may_unmap = 1, |
| 2206 | .may_swap = 1, | 2221 | .may_swap = 1, |
| 2207 | .swappiness = vm_swappiness, | 2222 | .swappiness = vm_swappiness, |
| 2208 | .order = order, | 2223 | .order = order, |
| 2209 | .mem_cgroup = NULL, | 2224 | .mem_cgroup = NULL, |
| 2210 | .nodemask = nodemask, | 2225 | .nodemask = nodemask, |
| 2211 | }; | 2226 | }; |
| 2212 | struct shrink_control shrink = { | 2227 | struct shrink_control shrink = { |
| 2213 | .gfp_mask = sc.gfp_mask, | 2228 | .gfp_mask = sc.gfp_mask, |
| 2214 | }; | 2229 | }; |
| 2215 | 2230 | ||
| 2216 | trace_mm_vmscan_direct_reclaim_begin(order, | 2231 | trace_mm_vmscan_direct_reclaim_begin(order, |
| 2217 | sc.may_writepage, | 2232 | sc.may_writepage, |
| 2218 | gfp_mask); | 2233 | gfp_mask); |
| 2219 | 2234 | ||
| 2220 | nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); | 2235 | nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); |
| 2221 | 2236 | ||
| 2222 | trace_mm_vmscan_direct_reclaim_end(nr_reclaimed); | 2237 | trace_mm_vmscan_direct_reclaim_end(nr_reclaimed); |
| 2223 | 2238 | ||
| 2224 | return nr_reclaimed; | 2239 | return nr_reclaimed; |
| 2225 | } | 2240 | } |
| 2226 | 2241 | ||
| 2227 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | 2242 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR |
| 2228 | 2243 | ||
| 2229 | unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, | 2244 | unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, |
| 2230 | gfp_t gfp_mask, bool noswap, | 2245 | gfp_t gfp_mask, bool noswap, |
| 2231 | unsigned int swappiness, | 2246 | unsigned int swappiness, |
| 2232 | struct zone *zone, | 2247 | struct zone *zone, |
| 2233 | unsigned long *nr_scanned) | 2248 | unsigned long *nr_scanned) |
| 2234 | { | 2249 | { |
| 2235 | struct scan_control sc = { | 2250 | struct scan_control sc = { |
| 2236 | .nr_scanned = 0, | 2251 | .nr_scanned = 0, |
| 2237 | .nr_to_reclaim = SWAP_CLUSTER_MAX, | 2252 | .nr_to_reclaim = SWAP_CLUSTER_MAX, |
| 2238 | .may_writepage = !laptop_mode, | 2253 | .may_writepage = !laptop_mode, |
| 2239 | .may_unmap = 1, | 2254 | .may_unmap = 1, |
| 2240 | .may_swap = !noswap, | 2255 | .may_swap = !noswap, |
| 2241 | .swappiness = swappiness, | 2256 | .swappiness = swappiness, |
| 2242 | .order = 0, | 2257 | .order = 0, |
| 2243 | .mem_cgroup = mem, | 2258 | .mem_cgroup = mem, |
| 2244 | }; | 2259 | }; |
| 2245 | 2260 | ||
| 2246 | sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | | 2261 | sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | |
| 2247 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); | 2262 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); |
| 2248 | 2263 | ||
| 2249 | trace_mm_vmscan_memcg_softlimit_reclaim_begin(0, | 2264 | trace_mm_vmscan_memcg_softlimit_reclaim_begin(0, |
| 2250 | sc.may_writepage, | 2265 | sc.may_writepage, |
| 2251 | sc.gfp_mask); | 2266 | sc.gfp_mask); |
| 2252 | 2267 | ||
| 2253 | /* | 2268 | /* |
| 2254 | * NOTE: Although we can get the priority field, using it | 2269 | * NOTE: Although we can get the priority field, using it |
| 2255 | * here is not a good idea, since it limits the pages we can scan. | 2270 | * here is not a good idea, since it limits the pages we can scan. |
| 2256 | * if we don't reclaim here, the shrink_zone from balance_pgdat | 2271 | * if we don't reclaim here, the shrink_zone from balance_pgdat |
| 2257 | * will pick up pages from other mem cgroup's as well. We hack | 2272 | * will pick up pages from other mem cgroup's as well. We hack |
| 2258 | * the priority and make it zero. | 2273 | * the priority and make it zero. |
| 2259 | */ | 2274 | */ |
| 2260 | shrink_zone(0, zone, &sc); | 2275 | shrink_zone(0, zone, &sc); |
| 2261 | 2276 | ||
| 2262 | trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); | 2277 | trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); |
| 2263 | 2278 | ||
| 2264 | *nr_scanned = sc.nr_scanned; | 2279 | *nr_scanned = sc.nr_scanned; |
| 2265 | return sc.nr_reclaimed; | 2280 | return sc.nr_reclaimed; |
| 2266 | } | 2281 | } |
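The sc.gfp_mask line above is a mask merge: the reclaim-relevant bits are taken from the caller's mask and every other bit comes from GFP_HIGHUSER_MOVABLE. The sketch below only shows the shape of that composition; the FAKE_GFP_* values are invented for the example, since the real GFP_* constants live in kernel headers.

#include <stdio.h>

/* Illustrative bit values only; the real GFP_* flags are kernel-internal. */
#define FAKE_GFP_IO            0x01u
#define FAKE_GFP_FS            0x02u
#define FAKE_GFP_HIGHMEM       0x04u
#define FAKE_GFP_MOVABLE       0x08u

#define FAKE_GFP_RECLAIM_MASK      (FAKE_GFP_IO | FAKE_GFP_FS)
#define FAKE_GFP_HIGHUSER_MOVABLE  (FAKE_GFP_IO | FAKE_GFP_FS | \
                                    FAKE_GFP_HIGHMEM | FAKE_GFP_MOVABLE)

int main(void)
{
        unsigned int caller_mask = FAKE_GFP_IO;       /* e.g. no FS bit set */

        /* keep the caller's reclaim bits, take the rest from the template */
        unsigned int gfp = (caller_mask & FAKE_GFP_RECLAIM_MASK) |
                           (FAKE_GFP_HIGHUSER_MOVABLE & ~FAKE_GFP_RECLAIM_MASK);

        printf("0x%x\n", gfp);   /* 0xd: IO set, FS clear, highmem+movable set */
        return 0;
}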
| 2267 | 2282 | ||
| 2268 | unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | 2283 | unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, |
| 2269 | gfp_t gfp_mask, | 2284 | gfp_t gfp_mask, |
| 2270 | bool noswap, | 2285 | bool noswap, |
| 2271 | unsigned int swappiness) | 2286 | unsigned int swappiness) |
| 2272 | { | 2287 | { |
| 2273 | struct zonelist *zonelist; | 2288 | struct zonelist *zonelist; |
| 2274 | unsigned long nr_reclaimed; | 2289 | unsigned long nr_reclaimed; |
| 2275 | int nid; | 2290 | int nid; |
| 2276 | struct scan_control sc = { | 2291 | struct scan_control sc = { |
| 2277 | .may_writepage = !laptop_mode, | 2292 | .may_writepage = !laptop_mode, |
| 2278 | .may_unmap = 1, | 2293 | .may_unmap = 1, |
| 2279 | .may_swap = !noswap, | 2294 | .may_swap = !noswap, |
| 2280 | .nr_to_reclaim = SWAP_CLUSTER_MAX, | 2295 | .nr_to_reclaim = SWAP_CLUSTER_MAX, |
| 2281 | .swappiness = swappiness, | 2296 | .swappiness = swappiness, |
| 2282 | .order = 0, | 2297 | .order = 0, |
| 2283 | .mem_cgroup = mem_cont, | 2298 | .mem_cgroup = mem_cont, |
| 2284 | .nodemask = NULL, /* we don't care the placement */ | 2299 | .nodemask = NULL, /* we don't care the placement */ |
| 2285 | .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | | 2300 | .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | |
| 2286 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), | 2301 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), |
| 2287 | }; | 2302 | }; |
| 2288 | struct shrink_control shrink = { | 2303 | struct shrink_control shrink = { |
| 2289 | .gfp_mask = sc.gfp_mask, | 2304 | .gfp_mask = sc.gfp_mask, |
| 2290 | }; | 2305 | }; |
| 2291 | 2306 | ||
| 2292 | /* | 2307 | /* |
| 2293 | * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't | 2308 | * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't |
| 2294 | * take care of from where we get pages. So the node where we start the | 2309 | * take care of from where we get pages. So the node where we start the |
| 2295 | * scan does not need to be the current node. | 2310 | * scan does not need to be the current node. |
| 2296 | */ | 2311 | */ |
| 2297 | nid = mem_cgroup_select_victim_node(mem_cont); | 2312 | nid = mem_cgroup_select_victim_node(mem_cont); |
| 2298 | 2313 | ||
| 2299 | zonelist = NODE_DATA(nid)->node_zonelists; | 2314 | zonelist = NODE_DATA(nid)->node_zonelists; |
| 2300 | 2315 | ||
| 2301 | trace_mm_vmscan_memcg_reclaim_begin(0, | 2316 | trace_mm_vmscan_memcg_reclaim_begin(0, |
| 2302 | sc.may_writepage, | 2317 | sc.may_writepage, |
| 2303 | sc.gfp_mask); | 2318 | sc.gfp_mask); |
| 2304 | 2319 | ||
| 2305 | nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); | 2320 | nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); |
| 2306 | 2321 | ||
| 2307 | trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); | 2322 | trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); |
| 2308 | 2323 | ||
| 2309 | return nr_reclaimed; | 2324 | return nr_reclaimed; |
| 2310 | } | 2325 | } |
| 2311 | #endif | 2326 | #endif |
| 2312 | 2327 | ||
| 2313 | /* | 2328 | /* |
| 2314 | * pgdat_balanced is used when checking if a node is balanced for high-order | 2329 | * pgdat_balanced is used when checking if a node is balanced for high-order |
| 2315 | * allocations. Only zones that meet watermarks and are in a zone allowed | 2330 | * allocations. Only zones that meet watermarks and are in a zone allowed |
| 2316 | * by the callers classzone_idx are added to balanced_pages. The total of | 2331 | * by the callers classzone_idx are added to balanced_pages. The total of |
| 2317 | * balanced pages must be at least 25% of the zones allowed by classzone_idx | 2332 | * balanced pages must be at least 25% of the zones allowed by classzone_idx |
| 2318 | * for the node to be considered balanced. Forcing all zones to be balanced | 2333 | * for the node to be considered balanced. Forcing all zones to be balanced |
| 2319 | * for high orders can cause excessive reclaim when there are imbalanced zones. | 2334 | * for high orders can cause excessive reclaim when there are imbalanced zones. |
| 2320 | * The choice of 25% is due to | 2335 | * The choice of 25% is due to |
| 2321 | * o a 16M DMA zone that is balanced will not balance a zone on any | 2336 | * o a 16M DMA zone that is balanced will not balance a zone on any |
| 2322 | * reasonable sized machine | 2337 | * reasonable sized machine |
| 2323 | * o On all other machines, the top zone must be at least a reasonable | 2338 | * o On all other machines, the top zone must be at least a reasonable |
| 2324 | * percentage of the middle zones. For example, on 32-bit x86, highmem | 2339 | * percentage of the middle zones. For example, on 32-bit x86, highmem |
| 2325 | * would need to be at least 256M for it to balance a whole node. | 2340 | * would need to be at least 256M for it to balance a whole node. |
| 2326 | * Similarly, on x86-64 the Normal zone would need to be at least 1G | 2341 | * Similarly, on x86-64 the Normal zone would need to be at least 1G |
| 2327 | * to balance a node on its own. These seemed like reasonable ratios. | 2342 | * to balance a node on its own. These seemed like reasonable ratios. |
| 2328 | */ | 2343 | */ |
| 2329 | static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages, | 2344 | static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages, |
| 2330 | int classzone_idx) | 2345 | int classzone_idx) |
| 2331 | { | 2346 | { |
| 2332 | unsigned long present_pages = 0; | 2347 | unsigned long present_pages = 0; |
| 2333 | int i; | 2348 | int i; |
| 2334 | 2349 | ||
| 2335 | for (i = 0; i <= classzone_idx; i++) | 2350 | for (i = 0; i <= classzone_idx; i++) |
| 2336 | present_pages += pgdat->node_zones[i].present_pages; | 2351 | present_pages += pgdat->node_zones[i].present_pages; |
| 2337 | 2352 | ||
| 2338 | /* A special case here: if zone has no page, we think it's balanced */ | 2353 | /* A special case here: if zone has no page, we think it's balanced */ |
| 2339 | return balanced_pages >= (present_pages >> 2); | 2354 | return balanced_pages >= (present_pages >> 2); |
| 2340 | } | 2355 | } |
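
As a rough standalone illustration of the 25% rule described in the comment above (plain userspace C with made-up zone sizes rather than real vmstat counters), the check boils down to comparing the balanced page count against present_pages >> 2:

        /* Hypothetical userspace sketch of the pgdat_balanced() 25% check. */
        #include <assert.h>

        static int pgdat_balanced_sketch(unsigned long balanced_pages,
                                         const unsigned long *zone_pages, int nzones)
        {
                unsigned long present_pages = 0;
                int i;

                for (i = 0; i < nzones; i++)
                        present_pages += zone_pages[i];

                /* balanced if at least a quarter of the node's pages are balanced */
                return balanced_pages >= (present_pages >> 2);
        }

        int main(void)
        {
                /* e.g. a 16M DMA zone plus a 4G Normal zone, in 4K pages */
                unsigned long zones[] = { 4096, 1048576 };

                assert(!pgdat_balanced_sketch(4096, zones, 2));   /* DMA alone is not enough */
                assert(pgdat_balanced_sketch(300000, zones, 2));  /* more than 25% of the node */
                return 0;
        }
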
| 2341 | 2356 | ||
| 2342 | /* is kswapd sleeping prematurely? */ | 2357 | /* is kswapd sleeping prematurely? */ |
| 2343 | static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, | 2358 | static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, |
| 2344 | int classzone_idx) | 2359 | int classzone_idx) |
| 2345 | { | 2360 | { |
| 2346 | int i; | 2361 | int i; |
| 2347 | unsigned long balanced = 0; | 2362 | unsigned long balanced = 0; |
| 2348 | bool all_zones_ok = true; | 2363 | bool all_zones_ok = true; |
| 2349 | 2364 | ||
| 2350 | /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ | 2365 | /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ |
| 2351 | if (remaining) | 2366 | if (remaining) |
| 2352 | return true; | 2367 | return true; |
| 2353 | 2368 | ||
| 2354 | /* Check the watermark levels */ | 2369 | /* Check the watermark levels */ |
| 2355 | for (i = 0; i <= classzone_idx; i++) { | 2370 | for (i = 0; i <= classzone_idx; i++) { |
| 2356 | struct zone *zone = pgdat->node_zones + i; | 2371 | struct zone *zone = pgdat->node_zones + i; |
| 2357 | 2372 | ||
| 2358 | if (!populated_zone(zone)) | 2373 | if (!populated_zone(zone)) |
| 2359 | continue; | 2374 | continue; |
| 2360 | 2375 | ||
| 2361 | /* | 2376 | /* |
| 2362 | * balance_pgdat() skips over all_unreclaimable after | 2377 | * balance_pgdat() skips over all_unreclaimable after |
| 2363 | * DEF_PRIORITY. Effectively, it considers them balanced so | 2378 | * DEF_PRIORITY. Effectively, it considers them balanced so |
| 2364 | * they must be considered balanced here as well if kswapd | 2379 | * they must be considered balanced here as well if kswapd |
| 2365 | * is to sleep | 2380 | * is to sleep |
| 2366 | */ | 2381 | */ |
| 2367 | if (zone->all_unreclaimable) { | 2382 | if (zone->all_unreclaimable) { |
| 2368 | balanced += zone->present_pages; | 2383 | balanced += zone->present_pages; |
| 2369 | continue; | 2384 | continue; |
| 2370 | } | 2385 | } |
| 2371 | 2386 | ||
| 2372 | if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), | 2387 | if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), |
| 2373 | i, 0)) | 2388 | i, 0)) |
| 2374 | all_zones_ok = false; | 2389 | all_zones_ok = false; |
| 2375 | else | 2390 | else |
| 2376 | balanced += zone->present_pages; | 2391 | balanced += zone->present_pages; |
| 2377 | } | 2392 | } |
| 2378 | 2393 | ||
| 2379 | /* | 2394 | /* |
| 2380 | * For high-order requests, the balanced zones must contain at least | 2395 | * For high-order requests, the balanced zones must contain at least |
| 2381 | * 25% of the node's pages for kswapd to sleep. For order-0, all zones | 2396 | * 25% of the node's pages for kswapd to sleep. For order-0, all zones |
| 2382 | * must be balanced | 2397 | * must be balanced |
| 2383 | */ | 2398 | */ |
| 2384 | if (order) | 2399 | if (order) |
| 2385 | return !pgdat_balanced(pgdat, balanced, classzone_idx); | 2400 | return !pgdat_balanced(pgdat, balanced, classzone_idx); |
| 2386 | else | 2401 | else |
| 2387 | return !all_zones_ok; | 2402 | return !all_zones_ok; |
| 2388 | } | 2403 | } |
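
A minimal sketch of the decision above, assuming the per-zone watermark results have already been folded into two booleans (a simplification, not the kernel's actual data flow):

        /* Simplified decision sketch; all_zones_ok and pgdat_balanced are assumed inputs. */
        #include <stdbool.h>
        #include <stdio.h>

        static bool sleeping_prematurely_sketch(int order, long remaining,
                                                bool all_zones_ok, bool pgdat_balanced)
        {
                /* a direct reclaimer woke kswapd within HZ/10: the sleep was premature */
                if (remaining)
                        return true;

                /* high-order: 25% of the node balanced is enough; order-0: every zone */
                if (order)
                        return !pgdat_balanced;
                return !all_zones_ok;
        }

        int main(void)
        {
                printf("%d\n", sleeping_prematurely_sketch(0, 0, true, false)); /* 0: ok to sleep */
                printf("%d\n", sleeping_prematurely_sketch(3, 0, false, true)); /* 0: ok to sleep */
                printf("%d\n", sleeping_prematurely_sketch(0, 5, true, true));  /* 1: premature */
                return 0;
        }
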
| 2389 | 2404 | ||
| 2390 | /* | 2405 | /* |
| 2391 | * For kswapd, balance_pgdat() will work across all this node's zones until | 2406 | * For kswapd, balance_pgdat() will work across all this node's zones until |
| 2392 | * they are all at high_wmark_pages(zone). | 2407 | * they are all at high_wmark_pages(zone). |
| 2393 | * | 2408 | * |
| 2394 | * Returns the final order kswapd was reclaiming at | 2409 | * Returns the final order kswapd was reclaiming at |
| 2395 | * | 2410 | * |
| 2396 | * There is special handling here for zones which are full of pinned pages. | 2411 | * There is special handling here for zones which are full of pinned pages. |
| 2397 | * This can happen if the pages are all mlocked, or if they are all used by | 2412 | * This can happen if the pages are all mlocked, or if they are all used by |
| 2398 | * device drivers (say, ZONE_DMA). Or if they are all in use by hugetlb. | 2413 | * device drivers (say, ZONE_DMA). Or if they are all in use by hugetlb. |
| 2399 | * What we do is to detect the case where all pages in the zone have been | 2414 | * What we do is to detect the case where all pages in the zone have been |
| 2400 | * scanned twice and there has been zero successful reclaim. Mark the zone as | 2415 | * scanned twice and there has been zero successful reclaim. Mark the zone as |
| 2401 | * dead and from now on, only perform a short scan. Basically we're polling | 2416 | * dead and from now on, only perform a short scan. Basically we're polling |
| 2402 | * the zone for when the problem goes away. | 2417 | * the zone for when the problem goes away. |
| 2403 | * | 2418 | * |
| 2404 | * kswapd scans the zones in the highmem->normal->dma direction. It skips | 2419 | * kswapd scans the zones in the highmem->normal->dma direction. It skips |
| 2405 | * zones which have free_pages > high_wmark_pages(zone), but once a zone is | 2420 | * zones which have free_pages > high_wmark_pages(zone), but once a zone is |
| 2406 | * found to have free_pages <= high_wmark_pages(zone), we scan that zone and the | 2421 | * found to have free_pages <= high_wmark_pages(zone), we scan that zone and the |
| 2407 | * lower zones regardless of the number of free pages in the lower zones. This | 2422 | * lower zones regardless of the number of free pages in the lower zones. This |
| 2408 | * interoperates with the page allocator fallback scheme to ensure that aging | 2423 | * interoperates with the page allocator fallback scheme to ensure that aging |
| 2409 | * of pages is balanced across the zones. | 2424 | * of pages is balanced across the zones. |
| 2410 | */ | 2425 | */ |
| 2411 | static unsigned long balance_pgdat(pg_data_t *pgdat, int order, | 2426 | static unsigned long balance_pgdat(pg_data_t *pgdat, int order, |
| 2412 | int *classzone_idx) | 2427 | int *classzone_idx) |
| 2413 | { | 2428 | { |
| 2414 | int all_zones_ok; | 2429 | int all_zones_ok; |
| 2415 | unsigned long balanced; | 2430 | unsigned long balanced; |
| 2416 | int priority; | 2431 | int priority; |
| 2417 | int i; | 2432 | int i; |
| 2418 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ | 2433 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ |
| 2419 | unsigned long total_scanned; | 2434 | unsigned long total_scanned; |
| 2420 | struct reclaim_state *reclaim_state = current->reclaim_state; | 2435 | struct reclaim_state *reclaim_state = current->reclaim_state; |
| 2421 | unsigned long nr_soft_reclaimed; | 2436 | unsigned long nr_soft_reclaimed; |
| 2422 | unsigned long nr_soft_scanned; | 2437 | unsigned long nr_soft_scanned; |
| 2423 | struct scan_control sc = { | 2438 | struct scan_control sc = { |
| 2424 | .gfp_mask = GFP_KERNEL, | 2439 | .gfp_mask = GFP_KERNEL, |
| 2425 | .may_unmap = 1, | 2440 | .may_unmap = 1, |
| 2426 | .may_swap = 1, | 2441 | .may_swap = 1, |
| 2427 | /* | 2442 | /* |
| 2428 | * kswapd doesn't want to be bailed out while reclaiming, because | 2443 | * kswapd doesn't want to be bailed out while reclaiming, because |
| 2429 | * we want to put equal scanning pressure on each zone. | 2444 | * we want to put equal scanning pressure on each zone. |
| 2430 | */ | 2445 | */ |
| 2431 | .nr_to_reclaim = ULONG_MAX, | 2446 | .nr_to_reclaim = ULONG_MAX, |
| 2432 | .swappiness = vm_swappiness, | 2447 | .swappiness = vm_swappiness, |
| 2433 | .order = order, | 2448 | .order = order, |
| 2434 | .mem_cgroup = NULL, | 2449 | .mem_cgroup = NULL, |
| 2435 | }; | 2450 | }; |
| 2436 | struct shrink_control shrink = { | 2451 | struct shrink_control shrink = { |
| 2437 | .gfp_mask = sc.gfp_mask, | 2452 | .gfp_mask = sc.gfp_mask, |
| 2438 | }; | 2453 | }; |
| 2439 | loop_again: | 2454 | loop_again: |
| 2440 | total_scanned = 0; | 2455 | total_scanned = 0; |
| 2441 | sc.nr_reclaimed = 0; | 2456 | sc.nr_reclaimed = 0; |
| 2442 | sc.may_writepage = !laptop_mode; | 2457 | sc.may_writepage = !laptop_mode; |
| 2443 | count_vm_event(PAGEOUTRUN); | 2458 | count_vm_event(PAGEOUTRUN); |
| 2444 | 2459 | ||
| 2445 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { | 2460 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { |
| 2446 | unsigned long lru_pages = 0; | 2461 | unsigned long lru_pages = 0; |
| 2447 | int has_under_min_watermark_zone = 0; | 2462 | int has_under_min_watermark_zone = 0; |
| 2448 | 2463 | ||
| 2449 | /* The swap token gets in the way of swapout... */ | 2464 | /* The swap token gets in the way of swapout... */ |
| 2450 | if (!priority) | 2465 | if (!priority) |
| 2451 | disable_swap_token(NULL); | 2466 | disable_swap_token(NULL); |
| 2452 | 2467 | ||
| 2453 | all_zones_ok = 1; | 2468 | all_zones_ok = 1; |
| 2454 | balanced = 0; | 2469 | balanced = 0; |
| 2455 | 2470 | ||
| 2456 | /* | 2471 | /* |
| 2457 | * Scan in the highmem->dma direction for the highest | 2472 | * Scan in the highmem->dma direction for the highest |
| 2458 | * zone which needs scanning | 2473 | * zone which needs scanning |
| 2459 | */ | 2474 | */ |
| 2460 | for (i = pgdat->nr_zones - 1; i >= 0; i--) { | 2475 | for (i = pgdat->nr_zones - 1; i >= 0; i--) { |
| 2461 | struct zone *zone = pgdat->node_zones + i; | 2476 | struct zone *zone = pgdat->node_zones + i; |
| 2462 | 2477 | ||
| 2463 | if (!populated_zone(zone)) | 2478 | if (!populated_zone(zone)) |
| 2464 | continue; | 2479 | continue; |
| 2465 | 2480 | ||
| 2466 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) | 2481 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) |
| 2467 | continue; | 2482 | continue; |
| 2468 | 2483 | ||
| 2469 | /* | 2484 | /* |
| 2470 | * Do some background aging of the anon list, to give | 2485 | * Do some background aging of the anon list, to give |
| 2471 | * pages a chance to be referenced before reclaiming. | 2486 | * pages a chance to be referenced before reclaiming. |
| 2472 | */ | 2487 | */ |
| 2473 | if (inactive_anon_is_low(zone, &sc)) | 2488 | if (inactive_anon_is_low(zone, &sc)) |
| 2474 | shrink_active_list(SWAP_CLUSTER_MAX, zone, | 2489 | shrink_active_list(SWAP_CLUSTER_MAX, zone, |
| 2475 | &sc, priority, 0); | 2490 | &sc, priority, 0); |
| 2476 | 2491 | ||
| 2477 | if (!zone_watermark_ok_safe(zone, order, | 2492 | if (!zone_watermark_ok_safe(zone, order, |
| 2478 | high_wmark_pages(zone), 0, 0)) { | 2493 | high_wmark_pages(zone), 0, 0)) { |
| 2479 | end_zone = i; | 2494 | end_zone = i; |
| 2480 | break; | 2495 | break; |
| 2481 | } | 2496 | } |
| 2482 | } | 2497 | } |
| 2483 | if (i < 0) | 2498 | if (i < 0) |
| 2484 | goto out; | 2499 | goto out; |
| 2485 | 2500 | ||
| 2486 | for (i = 0; i <= end_zone; i++) { | 2501 | for (i = 0; i <= end_zone; i++) { |
| 2487 | struct zone *zone = pgdat->node_zones + i; | 2502 | struct zone *zone = pgdat->node_zones + i; |
| 2488 | 2503 | ||
| 2489 | lru_pages += zone_reclaimable_pages(zone); | 2504 | lru_pages += zone_reclaimable_pages(zone); |
| 2490 | } | 2505 | } |
| 2491 | 2506 | ||
| 2492 | /* | 2507 | /* |
| 2493 | * Now scan the zone in the dma->highmem direction, stopping | 2508 | * Now scan the zone in the dma->highmem direction, stopping |
| 2494 | * at the last zone which needs scanning. | 2509 | * at the last zone which needs scanning. |
| 2495 | * | 2510 | * |
| 2496 | * We do this because the page allocator works in the opposite | 2511 | * We do this because the page allocator works in the opposite |
| 2497 | * direction. This prevents the page allocator from allocating | 2512 | * direction. This prevents the page allocator from allocating |
| 2498 | * pages behind kswapd's direction of progress, which would | 2513 | * pages behind kswapd's direction of progress, which would |
| 2499 | * cause too much scanning of the lower zones. | 2514 | * cause too much scanning of the lower zones. |
| 2500 | */ | 2515 | */ |
| 2501 | for (i = 0; i <= end_zone; i++) { | 2516 | for (i = 0; i <= end_zone; i++) { |
| 2502 | struct zone *zone = pgdat->node_zones + i; | 2517 | struct zone *zone = pgdat->node_zones + i; |
| 2503 | int nr_slab; | 2518 | int nr_slab; |
| 2504 | unsigned long balance_gap; | 2519 | unsigned long balance_gap; |
| 2505 | 2520 | ||
| 2506 | if (!populated_zone(zone)) | 2521 | if (!populated_zone(zone)) |
| 2507 | continue; | 2522 | continue; |
| 2508 | 2523 | ||
| 2509 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) | 2524 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) |
| 2510 | continue; | 2525 | continue; |
| 2511 | 2526 | ||
| 2512 | sc.nr_scanned = 0; | 2527 | sc.nr_scanned = 0; |
| 2513 | 2528 | ||
| 2514 | nr_soft_scanned = 0; | 2529 | nr_soft_scanned = 0; |
| 2515 | /* | 2530 | /* |
| 2516 | * Call soft limit reclaim before calling shrink_zone. | 2531 | * Call soft limit reclaim before calling shrink_zone. |
| 2517 | */ | 2532 | */ |
| 2518 | nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, | 2533 | nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, |
| 2519 | order, sc.gfp_mask, | 2534 | order, sc.gfp_mask, |
| 2520 | &nr_soft_scanned); | 2535 | &nr_soft_scanned); |
| 2521 | sc.nr_reclaimed += nr_soft_reclaimed; | 2536 | sc.nr_reclaimed += nr_soft_reclaimed; |
| 2522 | total_scanned += nr_soft_scanned; | 2537 | total_scanned += nr_soft_scanned; |
| 2523 | 2538 | ||
| 2524 | /* | 2539 | /* |
| 2525 | * We put equal pressure on every zone, unless | 2540 | * We put equal pressure on every zone, unless |
| 2526 | * one zone has way too many pages free | 2541 | * one zone has way too many pages free |
| 2527 | * already. The "too many pages" is defined | 2542 | * already. The "too many pages" is defined |
| 2528 | * as the high wmark plus a "gap" where the | 2543 | * as the high wmark plus a "gap" where the |
| 2529 | * gap is either the low watermark or 1% | 2544 | * gap is either the low watermark or 1% |
| 2530 | * of the zone, whichever is smaller. | 2545 | * of the zone, whichever is smaller. |
| 2531 | */ | 2546 | */ |
| 2532 | balance_gap = min(low_wmark_pages(zone), | 2547 | balance_gap = min(low_wmark_pages(zone), |
| 2533 | (zone->present_pages + | 2548 | (zone->present_pages + |
| 2534 | KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / | 2549 | KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / |
| 2535 | KSWAPD_ZONE_BALANCE_GAP_RATIO); | 2550 | KSWAPD_ZONE_BALANCE_GAP_RATIO); |
| 2536 | if (!zone_watermark_ok_safe(zone, order, | 2551 | if (!zone_watermark_ok_safe(zone, order, |
| 2537 | high_wmark_pages(zone) + balance_gap, | 2552 | high_wmark_pages(zone) + balance_gap, |
| 2538 | end_zone, 0)) { | 2553 | end_zone, 0)) { |
| 2539 | shrink_zone(priority, zone, &sc); | 2554 | shrink_zone(priority, zone, &sc); |
| 2540 | 2555 | ||
| 2541 | reclaim_state->reclaimed_slab = 0; | 2556 | reclaim_state->reclaimed_slab = 0; |
| 2542 | nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages); | 2557 | nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages); |
| 2543 | sc.nr_reclaimed += reclaim_state->reclaimed_slab; | 2558 | sc.nr_reclaimed += reclaim_state->reclaimed_slab; |
| 2544 | total_scanned += sc.nr_scanned; | 2559 | total_scanned += sc.nr_scanned; |
| 2545 | 2560 | ||
| 2546 | if (nr_slab == 0 && !zone_reclaimable(zone)) | 2561 | if (nr_slab == 0 && !zone_reclaimable(zone)) |
| 2547 | zone->all_unreclaimable = 1; | 2562 | zone->all_unreclaimable = 1; |
| 2548 | } | 2563 | } |
| 2549 | 2564 | ||
| 2550 | /* | 2565 | /* |
| 2551 | * If we've done a decent amount of scanning and | 2566 | * If we've done a decent amount of scanning and |
| 2552 | * the reclaim ratio is low, start doing writepage | 2567 | * the reclaim ratio is low, start doing writepage |
| 2553 | * even in laptop mode | 2568 | * even in laptop mode |
| 2554 | */ | 2569 | */ |
| 2555 | if (total_scanned > SWAP_CLUSTER_MAX * 2 && | 2570 | if (total_scanned > SWAP_CLUSTER_MAX * 2 && |
| 2556 | total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) | 2571 | total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) |
| 2557 | sc.may_writepage = 1; | 2572 | sc.may_writepage = 1; |
| 2558 | 2573 | ||
| 2559 | if (zone->all_unreclaimable) { | 2574 | if (zone->all_unreclaimable) { |
| 2560 | if (end_zone && end_zone == i) | 2575 | if (end_zone && end_zone == i) |
| 2561 | end_zone--; | 2576 | end_zone--; |
| 2562 | continue; | 2577 | continue; |
| 2563 | } | 2578 | } |
| 2564 | 2579 | ||
| 2565 | if (!zone_watermark_ok_safe(zone, order, | 2580 | if (!zone_watermark_ok_safe(zone, order, |
| 2566 | high_wmark_pages(zone), end_zone, 0)) { | 2581 | high_wmark_pages(zone), end_zone, 0)) { |
| 2567 | all_zones_ok = 0; | 2582 | all_zones_ok = 0; |
| 2568 | /* | 2583 | /* |
| 2569 | * We are still under the min watermark. This | 2584 | * We are still under the min watermark. This |
| 2570 | * means that we have a GFP_ATOMIC allocation | 2585 | * means that we have a GFP_ATOMIC allocation |
| 2571 | * failure risk. Hurry up! | 2586 | * failure risk. Hurry up! |
| 2572 | */ | 2587 | */ |
| 2573 | if (!zone_watermark_ok_safe(zone, order, | 2588 | if (!zone_watermark_ok_safe(zone, order, |
| 2574 | min_wmark_pages(zone), end_zone, 0)) | 2589 | min_wmark_pages(zone), end_zone, 0)) |
| 2575 | has_under_min_watermark_zone = 1; | 2590 | has_under_min_watermark_zone = 1; |
| 2576 | } else { | 2591 | } else { |
| 2577 | /* | 2592 | /* |
| 2578 | * If a zone reaches its high watermark, | 2593 | * If a zone reaches its high watermark, |
| 2579 | * consider it to be no longer congested. It's | 2594 | * consider it to be no longer congested. It's |
| 2580 | * possible there are dirty pages backed by | 2595 | * possible there are dirty pages backed by |
| 2581 | * congested BDIs but as pressure is relieved, | 2596 | * congested BDIs but as pressure is relieved, |
| 2582 | * speculatively avoid congestion waits | 2597 | * speculatively avoid congestion waits |
| 2583 | */ | 2598 | */ |
| 2584 | zone_clear_flag(zone, ZONE_CONGESTED); | 2599 | zone_clear_flag(zone, ZONE_CONGESTED); |
| 2585 | if (i <= *classzone_idx) | 2600 | if (i <= *classzone_idx) |
| 2586 | balanced += zone->present_pages; | 2601 | balanced += zone->present_pages; |
| 2587 | } | 2602 | } |
| 2588 | 2603 | ||
| 2589 | } | 2604 | } |
| 2590 | if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx))) | 2605 | if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx))) |
| 2591 | break; /* kswapd: all done */ | 2606 | break; /* kswapd: all done */ |
| 2592 | /* | 2607 | /* |
| 2593 | * OK, kswapd is getting into trouble. Take a nap, then take | 2608 | * OK, kswapd is getting into trouble. Take a nap, then take |
| 2594 | * another pass across the zones. | 2609 | * another pass across the zones. |
| 2595 | */ | 2610 | */ |
| 2596 | if (total_scanned && (priority < DEF_PRIORITY - 2)) { | 2611 | if (total_scanned && (priority < DEF_PRIORITY - 2)) { |
| 2597 | if (has_under_min_watermark_zone) | 2612 | if (has_under_min_watermark_zone) |
| 2598 | count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT); | 2613 | count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT); |
| 2599 | else | 2614 | else |
| 2600 | congestion_wait(BLK_RW_ASYNC, HZ/10); | 2615 | congestion_wait(BLK_RW_ASYNC, HZ/10); |
| 2601 | } | 2616 | } |
| 2602 | 2617 | ||
| 2603 | /* | 2618 | /* |
| 2604 | * We do this so kswapd doesn't build up large priorities for | 2619 | * We do this so kswapd doesn't build up large priorities for |
| 2605 | * example when it is freeing in parallel with allocators. It | 2620 | * example when it is freeing in parallel with allocators. It |
| 2606 | * matches the direct reclaim path behaviour in terms of impact | 2621 | * matches the direct reclaim path behaviour in terms of impact |
| 2607 | * on zone->*_priority. | 2622 | * on zone->*_priority. |
| 2608 | */ | 2623 | */ |
| 2609 | if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX) | 2624 | if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX) |
| 2610 | break; | 2625 | break; |
| 2611 | } | 2626 | } |
| 2612 | out: | 2627 | out: |
| 2613 | 2628 | ||
| 2614 | /* | 2629 | /* |
| 2615 | * order-0: All zones must meet high watermark for a balanced node | 2630 | * order-0: All zones must meet high watermark for a balanced node |
| 2616 | * high-order: Balanced zones must make up at least 25% of the node | 2631 | * high-order: Balanced zones must make up at least 25% of the node |
| 2617 | * for the node to be balanced | 2632 | * for the node to be balanced |
| 2618 | */ | 2633 | */ |
| 2619 | if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))) { | 2634 | if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))) { |
| 2620 | cond_resched(); | 2635 | cond_resched(); |
| 2621 | 2636 | ||
| 2622 | try_to_freeze(); | 2637 | try_to_freeze(); |
| 2623 | 2638 | ||
| 2624 | /* | 2639 | /* |
| 2625 | * Fragmentation may mean that the system cannot be | 2640 | * Fragmentation may mean that the system cannot be |
| 2626 | * rebalanced for high-order allocations in all zones. | 2641 | * rebalanced for high-order allocations in all zones. |
| 2627 | * At this point, if nr_reclaimed < SWAP_CLUSTER_MAX, | 2642 | * At this point, if nr_reclaimed < SWAP_CLUSTER_MAX, |
| 2628 | * it means the zones have been fully scanned and are still | 2643 | * it means the zones have been fully scanned and are still |
| 2629 | * not balanced. For high-order allocations, there is | 2644 | * not balanced. For high-order allocations, there is |
| 2630 | * little point trying all over again as kswapd may | 2645 | * little point trying all over again as kswapd may |
| 2631 | * infinite loop. | 2646 | * infinite loop. |
| 2632 | * | 2647 | * |
| 2633 | * Instead, recheck all watermarks at order-0 as they | 2648 | * Instead, recheck all watermarks at order-0 as they |
| 2634 | * are the most important. If watermarks are ok, kswapd will go | 2649 | * are the most important. If watermarks are ok, kswapd will go |
| 2635 | * back to sleep. High-order users can still perform direct | 2650 | * back to sleep. High-order users can still perform direct |
| 2636 | * reclaim if they wish. | 2651 | * reclaim if they wish. |
| 2637 | */ | 2652 | */ |
| 2638 | if (sc.nr_reclaimed < SWAP_CLUSTER_MAX) | 2653 | if (sc.nr_reclaimed < SWAP_CLUSTER_MAX) |
| 2639 | order = sc.order = 0; | 2654 | order = sc.order = 0; |
| 2640 | 2655 | ||
| 2641 | goto loop_again; | 2656 | goto loop_again; |
| 2642 | } | 2657 | } |
| 2643 | 2658 | ||
| 2644 | /* | 2659 | /* |
| 2645 | * If kswapd was reclaiming at a higher order, it has the option of | 2660 | * If kswapd was reclaiming at a higher order, it has the option of |
| 2646 | * sleeping without all zones being balanced. Before it does, it must | 2661 | * sleeping without all zones being balanced. Before it does, it must |
| 2647 | * ensure that the watermarks for order-0 on *all* zones are met and | 2662 | * ensure that the watermarks for order-0 on *all* zones are met and |
| 2648 | * that the congestion flags are cleared. The congestion flag must | 2663 | * that the congestion flags are cleared. The congestion flag must |
| 2649 | * be cleared as kswapd is the only mechanism that clears the flag | 2664 | * be cleared as kswapd is the only mechanism that clears the flag |
| 2650 | * and it is potentially going to sleep here. | 2665 | * and it is potentially going to sleep here. |
| 2651 | */ | 2666 | */ |
| 2652 | if (order) { | 2667 | if (order) { |
| 2653 | for (i = 0; i <= end_zone; i++) { | 2668 | for (i = 0; i <= end_zone; i++) { |
| 2654 | struct zone *zone = pgdat->node_zones + i; | 2669 | struct zone *zone = pgdat->node_zones + i; |
| 2655 | 2670 | ||
| 2656 | if (!populated_zone(zone)) | 2671 | if (!populated_zone(zone)) |
| 2657 | continue; | 2672 | continue; |
| 2658 | 2673 | ||
| 2659 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) | 2674 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) |
| 2660 | continue; | 2675 | continue; |
| 2661 | 2676 | ||
| 2662 | /* Confirm the zone is balanced for order-0 */ | 2677 | /* Confirm the zone is balanced for order-0 */ |
| 2663 | if (!zone_watermark_ok(zone, 0, | 2678 | if (!zone_watermark_ok(zone, 0, |
| 2664 | high_wmark_pages(zone), 0, 0)) { | 2679 | high_wmark_pages(zone), 0, 0)) { |
| 2665 | order = sc.order = 0; | 2680 | order = sc.order = 0; |
| 2666 | goto loop_again; | 2681 | goto loop_again; |
| 2667 | } | 2682 | } |
| 2668 | 2683 | ||
| 2669 | /* If balanced, clear the congested flag */ | 2684 | /* If balanced, clear the congested flag */ |
| 2670 | zone_clear_flag(zone, ZONE_CONGESTED); | 2685 | zone_clear_flag(zone, ZONE_CONGESTED); |
| 2671 | } | 2686 | } |
| 2672 | } | 2687 | } |
| 2673 | 2688 | ||
| 2674 | /* | 2689 | /* |
| 2675 | * Return the order we were reclaiming at so sleeping_prematurely() | 2690 | * Return the order we were reclaiming at so sleeping_prematurely() |
| 2676 | * makes a decision on the order we were last reclaiming at. However, | 2691 | * makes a decision on the order we were last reclaiming at. However, |
| 2677 | * if another caller entered the allocator slow path while kswapd | 2692 | * if another caller entered the allocator slow path while kswapd |
| 2678 | * was awake, order will remain at the higher level | 2693 | * was awake, order will remain at the higher level |
| 2679 | */ | 2694 | */ |
| 2680 | *classzone_idx = end_zone; | 2695 | *classzone_idx = end_zone; |
| 2681 | return order; | 2696 | return order; |
| 2682 | } | 2697 | } |
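
One piece of the loop above that benefits from a worked example is the balance_gap calculation: the gap is the smaller of the zone's low watermark and roughly 1% of the zone, taking KSWAPD_ZONE_BALANCE_GAP_RATIO to be 100 as the "1%" wording in the comment implies. A standalone sketch:

        /* Standalone sketch of the balance_gap arithmetic; BALANCE_GAP_RATIO == 100 is
         * an assumption matching the "1%" wording in the comment above. */
        #include <stdio.h>

        #define BALANCE_GAP_RATIO 100UL

        static unsigned long balance_gap_sketch(unsigned long present_pages,
                                                unsigned long low_wmark)
        {
                unsigned long one_percent =
                        (present_pages + BALANCE_GAP_RATIO - 1) / BALANCE_GAP_RATIO;

                return low_wmark < one_percent ? low_wmark : one_percent;
        }

        int main(void)
        {
                /* a 1,000,000-page zone with a 5,000-page low watermark: the watermark wins */
                printf("gap = %lu pages\n", balance_gap_sketch(1000000, 5000)); /* 5000 */
                /* a tiny 4,096-page zone: 1% (41 pages, rounded up) is smaller */
                printf("gap = %lu pages\n", balance_gap_sketch(4096, 128));     /* 41 */
                return 0;
        }
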
| 2683 | 2698 | ||
| 2684 | static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) | 2699 | static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) |
| 2685 | { | 2700 | { |
| 2686 | long remaining = 0; | 2701 | long remaining = 0; |
| 2687 | DEFINE_WAIT(wait); | 2702 | DEFINE_WAIT(wait); |
| 2688 | 2703 | ||
| 2689 | if (freezing(current) || kthread_should_stop()) | 2704 | if (freezing(current) || kthread_should_stop()) |
| 2690 | return; | 2705 | return; |
| 2691 | 2706 | ||
| 2692 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); | 2707 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); |
| 2693 | 2708 | ||
| 2694 | /* Try to sleep for a short interval */ | 2709 | /* Try to sleep for a short interval */ |
| 2695 | if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { | 2710 | if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { |
| 2696 | remaining = schedule_timeout(HZ/10); | 2711 | remaining = schedule_timeout(HZ/10); |
| 2697 | finish_wait(&pgdat->kswapd_wait, &wait); | 2712 | finish_wait(&pgdat->kswapd_wait, &wait); |
| 2698 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); | 2713 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); |
| 2699 | } | 2714 | } |
| 2700 | 2715 | ||
| 2701 | /* | 2716 | /* |
| 2702 | * After a short sleep, check if it was a premature sleep. If not, then | 2717 | * After a short sleep, check if it was a premature sleep. If not, then |
| 2703 | * go fully to sleep until explicitly woken up. | 2718 | * go fully to sleep until explicitly woken up. |
| 2704 | */ | 2719 | */ |
| 2705 | if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { | 2720 | if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { |
| 2706 | trace_mm_vmscan_kswapd_sleep(pgdat->node_id); | 2721 | trace_mm_vmscan_kswapd_sleep(pgdat->node_id); |
| 2707 | 2722 | ||
| 2708 | /* | 2723 | /* |
| 2709 | * vmstat counters are not perfectly accurate and the estimated | 2724 | * vmstat counters are not perfectly accurate and the estimated |
| 2710 | * value for counters such as NR_FREE_PAGES can deviate from the | 2725 | * value for counters such as NR_FREE_PAGES can deviate from the |
| 2711 | * true value by nr_online_cpus * threshold. To avoid the zone | 2726 | * true value by nr_online_cpus * threshold. To avoid the zone |
| 2712 | * watermarks being breached while under pressure, we reduce the | 2727 | * watermarks being breached while under pressure, we reduce the |
| 2713 | * per-cpu vmstat threshold while kswapd is awake and restore | 2728 | * per-cpu vmstat threshold while kswapd is awake and restore |
| 2714 | * them before going back to sleep. | 2729 | * them before going back to sleep. |
| 2715 | */ | 2730 | */ |
| 2716 | set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); | 2731 | set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); |
| 2717 | schedule(); | 2732 | schedule(); |
| 2718 | set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold); | 2733 | set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold); |
| 2719 | } else { | 2734 | } else { |
| 2720 | if (remaining) | 2735 | if (remaining) |
| 2721 | count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); | 2736 | count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); |
| 2722 | else | 2737 | else |
| 2723 | count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY); | 2738 | count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY); |
| 2724 | } | 2739 | } |
| 2725 | finish_wait(&pgdat->kswapd_wait, &wait); | 2740 | finish_wait(&pgdat->kswapd_wait, &wait); |
| 2726 | } | 2741 | } |
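
The vmstat comment above hinges on a small piece of arithmetic: with per-cpu deltas only folded back into the global counter past a threshold, a counter such as NR_FREE_PAGES can drift by up to nr_online_cpus * threshold pages. An illustrative calculation with made-up numbers:

        /* Worst-case vmstat drift: nr_online_cpus * per-cpu threshold. Numbers are examples. */
        #include <stdio.h>

        int main(void)
        {
                unsigned int nr_online_cpus = 16;
                unsigned int threshold = 32;    /* hypothetical per-cpu stat threshold */

                /* 16 * 32 = 512 pages, i.e. 2 MiB of slack with 4K pages */
                printf("NR_FREE_PAGES may be off by up to %u pages\n",
                       nr_online_cpus * threshold);
                return 0;
        }
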
| 2727 | 2742 | ||
| 2728 | /* | 2743 | /* |
| 2729 | * The background pageout daemon, started as a kernel thread | 2744 | * The background pageout daemon, started as a kernel thread |
| 2730 | * from the init process. | 2745 | * from the init process. |
| 2731 | * | 2746 | * |
| 2732 | * This basically trickles out pages so that we have _some_ | 2747 | * This basically trickles out pages so that we have _some_ |
| 2733 | * free memory available even if there is no other activity | 2748 | * free memory available even if there is no other activity |
| 2734 | * that frees anything up. This is needed for things like routing | 2749 | * that frees anything up. This is needed for things like routing |
| 2735 | * etc, where we otherwise might have all activity going on in | 2750 | * etc, where we otherwise might have all activity going on in |
| 2736 | * asynchronous contexts that cannot page things out. | 2751 | * asynchronous contexts that cannot page things out. |
| 2737 | * | 2752 | * |
| 2738 | * If there are applications that are active memory-allocators | 2753 | * If there are applications that are active memory-allocators |
| 2739 | * (most normal use), this basically shouldn't matter. | 2754 | * (most normal use), this basically shouldn't matter. |
| 2740 | */ | 2755 | */ |
| 2741 | static int kswapd(void *p) | 2756 | static int kswapd(void *p) |
| 2742 | { | 2757 | { |
| 2743 | unsigned long order, new_order; | 2758 | unsigned long order, new_order; |
| 2744 | int classzone_idx, new_classzone_idx; | 2759 | int classzone_idx, new_classzone_idx; |
| 2745 | pg_data_t *pgdat = (pg_data_t*)p; | 2760 | pg_data_t *pgdat = (pg_data_t*)p; |
| 2746 | struct task_struct *tsk = current; | 2761 | struct task_struct *tsk = current; |
| 2747 | 2762 | ||
| 2748 | struct reclaim_state reclaim_state = { | 2763 | struct reclaim_state reclaim_state = { |
| 2749 | .reclaimed_slab = 0, | 2764 | .reclaimed_slab = 0, |
| 2750 | }; | 2765 | }; |
| 2751 | const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); | 2766 | const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); |
| 2752 | 2767 | ||
| 2753 | lockdep_set_current_reclaim_state(GFP_KERNEL); | 2768 | lockdep_set_current_reclaim_state(GFP_KERNEL); |
| 2754 | 2769 | ||
| 2755 | if (!cpumask_empty(cpumask)) | 2770 | if (!cpumask_empty(cpumask)) |
| 2756 | set_cpus_allowed_ptr(tsk, cpumask); | 2771 | set_cpus_allowed_ptr(tsk, cpumask); |
| 2757 | current->reclaim_state = &reclaim_state; | 2772 | current->reclaim_state = &reclaim_state; |
| 2758 | 2773 | ||
| 2759 | /* | 2774 | /* |
| 2760 | * Tell the memory management that we're a "memory allocator", | 2775 | * Tell the memory management that we're a "memory allocator", |
| 2761 | * and that if we need more memory we should get access to it | 2776 | * and that if we need more memory we should get access to it |
| 2762 | * regardless (see "__alloc_pages()"). "kswapd" should | 2777 | * regardless (see "__alloc_pages()"). "kswapd" should |
| 2763 | * never get caught in the normal page freeing logic. | 2778 | * never get caught in the normal page freeing logic. |
| 2764 | * | 2779 | * |
| 2765 | * (Kswapd normally doesn't need memory anyway, but sometimes | 2780 | * (Kswapd normally doesn't need memory anyway, but sometimes |
| 2766 | * you need a small amount of memory in order to be able to | 2781 | * you need a small amount of memory in order to be able to |
| 2767 | * page out something else, and this flag essentially protects | 2782 | * page out something else, and this flag essentially protects |
| 2768 | * us from recursively trying to free more memory as we're | 2783 | * us from recursively trying to free more memory as we're |
| 2769 | * trying to free the first piece of memory in the first place). | 2784 | * trying to free the first piece of memory in the first place). |
| 2770 | */ | 2785 | */ |
| 2771 | tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; | 2786 | tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; |
| 2772 | set_freezable(); | 2787 | set_freezable(); |
| 2773 | 2788 | ||
| 2774 | order = new_order = 0; | 2789 | order = new_order = 0; |
| 2775 | classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; | 2790 | classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; |
| 2776 | for ( ; ; ) { | 2791 | for ( ; ; ) { |
| 2777 | int ret; | 2792 | int ret; |
| 2778 | 2793 | ||
| 2779 | /* | 2794 | /* |
| 2780 | * If the last balance_pgdat was unsuccessful it's unlikely a | 2795 | * If the last balance_pgdat was unsuccessful it's unlikely a |
| 2781 | * new request of a similar or harder type will succeed soon | 2796 | * new request of a similar or harder type will succeed soon |
| 2782 | * so consider going to sleep on the basis of the order we just reclaimed at | 2797 | * so consider going to sleep on the basis of the order we just reclaimed at |
| 2783 | */ | 2798 | */ |
| 2784 | if (classzone_idx >= new_classzone_idx && order == new_order) { | 2799 | if (classzone_idx >= new_classzone_idx && order == new_order) { |
| 2785 | new_order = pgdat->kswapd_max_order; | 2800 | new_order = pgdat->kswapd_max_order; |
| 2786 | new_classzone_idx = pgdat->classzone_idx; | 2801 | new_classzone_idx = pgdat->classzone_idx; |
| 2787 | pgdat->kswapd_max_order = 0; | 2802 | pgdat->kswapd_max_order = 0; |
| 2788 | pgdat->classzone_idx = pgdat->nr_zones - 1; | 2803 | pgdat->classzone_idx = pgdat->nr_zones - 1; |
| 2789 | } | 2804 | } |
| 2790 | 2805 | ||
| 2791 | if (order < new_order || classzone_idx > new_classzone_idx) { | 2806 | if (order < new_order || classzone_idx > new_classzone_idx) { |
| 2792 | /* | 2807 | /* |
| 2793 | * Don't sleep if someone wants a larger 'order' | 2808 | * Don't sleep if someone wants a larger 'order' |
| 2794 | * allocation or has tighter zone constraints | 2809 | * allocation or has tighter zone constraints |
| 2795 | */ | 2810 | */ |
| 2796 | order = new_order; | 2811 | order = new_order; |
| 2797 | classzone_idx = new_classzone_idx; | 2812 | classzone_idx = new_classzone_idx; |
| 2798 | } else { | 2813 | } else { |
| 2799 | kswapd_try_to_sleep(pgdat, order, classzone_idx); | 2814 | kswapd_try_to_sleep(pgdat, order, classzone_idx); |
| 2800 | order = pgdat->kswapd_max_order; | 2815 | order = pgdat->kswapd_max_order; |
| 2801 | classzone_idx = pgdat->classzone_idx; | 2816 | classzone_idx = pgdat->classzone_idx; |
| 2802 | pgdat->kswapd_max_order = 0; | 2817 | pgdat->kswapd_max_order = 0; |
| 2803 | pgdat->classzone_idx = pgdat->nr_zones - 1; | 2818 | pgdat->classzone_idx = pgdat->nr_zones - 1; |
| 2804 | } | 2819 | } |
| 2805 | 2820 | ||
| 2806 | ret = try_to_freeze(); | 2821 | ret = try_to_freeze(); |
| 2807 | if (kthread_should_stop()) | 2822 | if (kthread_should_stop()) |
| 2808 | break; | 2823 | break; |
| 2809 | 2824 | ||
| 2810 | /* | 2825 | /* |
| 2811 | * We can speed up thawing tasks if we don't call balance_pgdat | 2826 | * We can speed up thawing tasks if we don't call balance_pgdat |
| 2812 | * after returning from the refrigerator | 2827 | * after returning from the refrigerator |
| 2813 | */ | 2828 | */ |
| 2814 | if (!ret) { | 2829 | if (!ret) { |
| 2815 | trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); | 2830 | trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); |
| 2816 | order = balance_pgdat(pgdat, order, &classzone_idx); | 2831 | order = balance_pgdat(pgdat, order, &classzone_idx); |
| 2817 | } | 2832 | } |
| 2818 | } | 2833 | } |
| 2819 | return 0; | 2834 | return 0; |
| 2820 | } | 2835 | } |
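
A simplified sketch of the wakeup-coalescing decision in the loop above: kswapd only tries to sleep when no pending request is harder (a larger order, or a lower and therefore tighter classzone index) than the one it just reclaimed at. Standalone userspace C, not kernel code:

        /* Standalone sketch of the "don't sleep if a harder request is pending" check. */
        #include <stdbool.h>
        #include <stdio.h>

        static bool keep_reclaiming(unsigned long order, int classzone_idx,
                                    unsigned long new_order, int new_classzone_idx)
        {
                /* harder means a larger order or a lower (tighter) classzone index */
                return order < new_order || classzone_idx > new_classzone_idx;
        }

        int main(void)
        {
                /* was balancing at order 0; someone now wants order 3: keep going */
                printf("%d\n", keep_reclaiming(0, 2, 3, 2));    /* 1 */
                /* pending request is no harder: safe to try sleeping */
                printf("%d\n", keep_reclaiming(3, 1, 0, 2));    /* 0 */
                return 0;
        }
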
| 2821 | 2836 | ||
| 2822 | /* | 2837 | /* |
| 2823 | * A zone is low on free memory, so wake its kswapd task to service it. | 2838 | * A zone is low on free memory, so wake its kswapd task to service it. |
| 2824 | */ | 2839 | */ |
| 2825 | void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) | 2840 | void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) |
| 2826 | { | 2841 | { |
| 2827 | pg_data_t *pgdat; | 2842 | pg_data_t *pgdat; |
| 2828 | 2843 | ||
| 2829 | if (!populated_zone(zone)) | 2844 | if (!populated_zone(zone)) |
| 2830 | return; | 2845 | return; |
| 2831 | 2846 | ||
| 2832 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) | 2847 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) |
| 2833 | return; | 2848 | return; |
| 2834 | pgdat = zone->zone_pgdat; | 2849 | pgdat = zone->zone_pgdat; |
| 2835 | if (pgdat->kswapd_max_order < order) { | 2850 | if (pgdat->kswapd_max_order < order) { |
| 2836 | pgdat->kswapd_max_order = order; | 2851 | pgdat->kswapd_max_order = order; |
| 2837 | pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx); | 2852 | pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx); |
| 2838 | } | 2853 | } |
| 2839 | if (!waitqueue_active(&pgdat->kswapd_wait)) | 2854 | if (!waitqueue_active(&pgdat->kswapd_wait)) |
| 2840 | return; | 2855 | return; |
| 2841 | if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0)) | 2856 | if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0)) |
| 2842 | return; | 2857 | return; |
| 2843 | 2858 | ||
| 2844 | trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); | 2859 | trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); |
| 2845 | wake_up_interruptible(&pgdat->kswapd_wait); | 2860 | wake_up_interruptible(&pgdat->kswapd_wait); |
| 2846 | } | 2861 | } |
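
A condensed sketch of when the wakeup above actually fires: kswapd must be asleep and the zone must be below its low watermark for the requested order. The booleans stand in for the real populated_zone/cpuset/watermark checks:

        /* Condensed sketch of the wakeup conditions; the booleans replace the real checks. */
        #include <stdbool.h>
        #include <stdio.h>

        static bool should_wake_kswapd(bool populated, bool cpuset_allowed,
                                       bool kswapd_asleep, bool low_wmark_ok)
        {
                if (!populated || !cpuset_allowed)
                        return false;
                if (!kswapd_asleep)
                        return false;   /* already running; only kswapd_max_order is raised */
                return !low_wmark_ok;   /* wake only when below the low watermark */
        }

        int main(void)
        {
                printf("%d\n", should_wake_kswapd(true, true, true, false));    /* 1 */
                printf("%d\n", should_wake_kswapd(true, true, true, true));     /* 0 */
                return 0;
        }
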
| 2847 | 2862 | ||
| 2848 | /* | 2863 | /* |
| 2849 | * The reclaimable count would be mostly accurate. | 2864 | * The reclaimable count would be mostly accurate. |
| 2850 | * The less reclaimable pages may be | 2865 | * The less reclaimable pages may be |
| 2851 | * - mlocked pages, which will be moved to unevictable list when encountered | 2866 | * - mlocked pages, which will be moved to unevictable list when encountered |
| 2852 | * - mapped pages, which may require several travels to be reclaimed | 2867 | * - mapped pages, which may require several travels to be reclaimed |
| 2853 | * - dirty pages, which are not "instantly" reclaimable | 2868 | * - dirty pages, which are not "instantly" reclaimable |
| 2854 | */ | 2869 | */ |
| 2855 | unsigned long global_reclaimable_pages(void) | 2870 | unsigned long global_reclaimable_pages(void) |
| 2856 | { | 2871 | { |
| 2857 | int nr; | 2872 | int nr; |
| 2858 | 2873 | ||
| 2859 | nr = global_page_state(NR_ACTIVE_FILE) + | 2874 | nr = global_page_state(NR_ACTIVE_FILE) + |
| 2860 | global_page_state(NR_INACTIVE_FILE); | 2875 | global_page_state(NR_INACTIVE_FILE); |
| 2861 | 2876 | ||
| 2862 | if (nr_swap_pages > 0) | 2877 | if (nr_swap_pages > 0) |
| 2863 | nr += global_page_state(NR_ACTIVE_ANON) + | 2878 | nr += global_page_state(NR_ACTIVE_ANON) + |
| 2864 | global_page_state(NR_INACTIVE_ANON); | 2879 | global_page_state(NR_INACTIVE_ANON); |
| 2865 | 2880 | ||
| 2866 | return nr; | 2881 | return nr; |
| 2867 | } | 2882 | } |
| 2868 | 2883 | ||
| 2869 | unsigned long zone_reclaimable_pages(struct zone *zone) | 2884 | unsigned long zone_reclaimable_pages(struct zone *zone) |
| 2870 | { | 2885 | { |
| 2871 | int nr; | 2886 | int nr; |
| 2872 | 2887 | ||
| 2873 | nr = zone_page_state(zone, NR_ACTIVE_FILE) + | 2888 | nr = zone_page_state(zone, NR_ACTIVE_FILE) + |
| 2874 | zone_page_state(zone, NR_INACTIVE_FILE); | 2889 | zone_page_state(zone, NR_INACTIVE_FILE); |
| 2875 | 2890 | ||
| 2876 | if (nr_swap_pages > 0) | 2891 | if (nr_swap_pages > 0) |
| 2877 | nr += zone_page_state(zone, NR_ACTIVE_ANON) + | 2892 | nr += zone_page_state(zone, NR_ACTIVE_ANON) + |
| 2878 | zone_page_state(zone, NR_INACTIVE_ANON); | 2893 | zone_page_state(zone, NR_INACTIVE_ANON); |
| 2879 | 2894 | ||
| 2880 | return nr; | 2895 | return nr; |
| 2881 | } | 2896 | } |
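
Both helpers above follow the same rule: file LRU pages always count as reclaimable, anon LRU pages only while swap space remains. A standalone sketch with plain numbers in place of the vmstat counters:

        /* Plain-number sketch: file LRUs always count, anon LRUs only with free swap. */
        #include <stdio.h>

        static unsigned long reclaimable_pages_sketch(unsigned long active_file,
                                                      unsigned long inactive_file,
                                                      unsigned long active_anon,
                                                      unsigned long inactive_anon,
                                                      long nr_swap_pages)
        {
                unsigned long nr = active_file + inactive_file;

                if (nr_swap_pages > 0)
                        nr += active_anon + inactive_anon;

                return nr;
        }

        int main(void)
        {
                printf("%lu\n", reclaimable_pages_sketch(100, 200, 50, 75, 0));     /* 300 */
                printf("%lu\n", reclaimable_pages_sketch(100, 200, 50, 75, 1024));  /* 425 */
                return 0;
        }
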
| 2882 | 2897 | ||
| 2883 | #ifdef CONFIG_HIBERNATION | 2898 | #ifdef CONFIG_HIBERNATION |
| 2884 | /* | 2899 | /* |
| 2885 | * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of | 2900 | * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of |
| 2886 | * freed pages. | 2901 | * freed pages. |
| 2887 | * | 2902 | * |
| 2888 | * Rather than trying to age LRUs the aim is to preserve the overall | 2903 | * Rather than trying to age LRUs the aim is to preserve the overall |
| 2889 | * LRU order by reclaiming preferentially | 2904 | * LRU order by reclaiming preferentially |
| 2890 | * inactive > active > active referenced > active mapped | 2905 | * inactive > active > active referenced > active mapped |
| 2891 | */ | 2906 | */ |
| 2892 | unsigned long shrink_all_memory(unsigned long nr_to_reclaim) | 2907 | unsigned long shrink_all_memory(unsigned long nr_to_reclaim) |
| 2893 | { | 2908 | { |
| 2894 | struct reclaim_state reclaim_state; | 2909 | struct reclaim_state reclaim_state; |
| 2895 | struct scan_control sc = { | 2910 | struct scan_control sc = { |
| 2896 | .gfp_mask = GFP_HIGHUSER_MOVABLE, | 2911 | .gfp_mask = GFP_HIGHUSER_MOVABLE, |
| 2897 | .may_swap = 1, | 2912 | .may_swap = 1, |
| 2898 | .may_unmap = 1, | 2913 | .may_unmap = 1, |
| 2899 | .may_writepage = 1, | 2914 | .may_writepage = 1, |
| 2900 | .nr_to_reclaim = nr_to_reclaim, | 2915 | .nr_to_reclaim = nr_to_reclaim, |
| 2901 | .hibernation_mode = 1, | 2916 | .hibernation_mode = 1, |
| 2902 | .swappiness = vm_swappiness, | 2917 | .swappiness = vm_swappiness, |
| 2903 | .order = 0, | 2918 | .order = 0, |
| 2904 | }; | 2919 | }; |
| 2905 | struct shrink_control shrink = { | 2920 | struct shrink_control shrink = { |
| 2906 | .gfp_mask = sc.gfp_mask, | 2921 | .gfp_mask = sc.gfp_mask, |
| 2907 | }; | 2922 | }; |
| 2908 | struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); | 2923 | struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); |
| 2909 | struct task_struct *p = current; | 2924 | struct task_struct *p = current; |
| 2910 | unsigned long nr_reclaimed; | 2925 | unsigned long nr_reclaimed; |
| 2911 | 2926 | ||
| 2912 | p->flags |= PF_MEMALLOC; | 2927 | p->flags |= PF_MEMALLOC; |
| 2913 | lockdep_set_current_reclaim_state(sc.gfp_mask); | 2928 | lockdep_set_current_reclaim_state(sc.gfp_mask); |
| 2914 | reclaim_state.reclaimed_slab = 0; | 2929 | reclaim_state.reclaimed_slab = 0; |
| 2915 | p->reclaim_state = &reclaim_state; | 2930 | p->reclaim_state = &reclaim_state; |
| 2916 | 2931 | ||
| 2917 | nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); | 2932 | nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); |
| 2918 | 2933 | ||
| 2919 | p->reclaim_state = NULL; | 2934 | p->reclaim_state = NULL; |
| 2920 | lockdep_clear_current_reclaim_state(); | 2935 | lockdep_clear_current_reclaim_state(); |
| 2921 | p->flags &= ~PF_MEMALLOC; | 2936 | p->flags &= ~PF_MEMALLOC; |
| 2922 | 2937 | ||
| 2923 | return nr_reclaimed; | 2938 | return nr_reclaimed; |
| 2924 | } | 2939 | } |
| 2925 | #endif /* CONFIG_HIBERNATION */ | 2940 | #endif /* CONFIG_HIBERNATION */ |
| 2926 | 2941 | ||
| 2927 | /* It's optimal to keep kswapds on the same CPUs as their memory, but | 2942 | /* It's optimal to keep kswapds on the same CPUs as their memory, but |
| 2928 | not required for correctness. So if the last cpu in a node goes | 2943 | not required for correctness. So if the last cpu in a node goes |
| 2929 | away, we get changed to run anywhere: as the first one comes back, | 2944 | away, we get changed to run anywhere: as the first one comes back, |
| 2930 | restore their cpu bindings. */ | 2945 | restore their cpu bindings. */ |
| 2931 | static int __devinit cpu_callback(struct notifier_block *nfb, | 2946 | static int __devinit cpu_callback(struct notifier_block *nfb, |
| 2932 | unsigned long action, void *hcpu) | 2947 | unsigned long action, void *hcpu) |
| 2933 | { | 2948 | { |
| 2934 | int nid; | 2949 | int nid; |
| 2935 | 2950 | ||
| 2936 | if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { | 2951 | if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { |
| 2937 | for_each_node_state(nid, N_HIGH_MEMORY) { | 2952 | for_each_node_state(nid, N_HIGH_MEMORY) { |
| 2938 | pg_data_t *pgdat = NODE_DATA(nid); | 2953 | pg_data_t *pgdat = NODE_DATA(nid); |
| 2939 | const struct cpumask *mask; | 2954 | const struct cpumask *mask; |
| 2940 | 2955 | ||
| 2941 | mask = cpumask_of_node(pgdat->node_id); | 2956 | mask = cpumask_of_node(pgdat->node_id); |
| 2942 | 2957 | ||
| 2943 | if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids) | 2958 | if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids) |
| 2944 | /* One of our CPUs online: restore mask */ | 2959 | /* One of our CPUs online: restore mask */ |
| 2945 | set_cpus_allowed_ptr(pgdat->kswapd, mask); | 2960 | set_cpus_allowed_ptr(pgdat->kswapd, mask); |
| 2946 | } | 2961 | } |
| 2947 | } | 2962 | } |
| 2948 | return NOTIFY_OK; | 2963 | return NOTIFY_OK; |
| 2949 | } | 2964 | } |
| 2950 | 2965 | ||
| 2951 | /* | 2966 | /* |
| 2952 | * This kswapd start function will be called by init and node-hot-add. | 2967 | * This kswapd start function will be called by init and node-hot-add. |
| 2953 | * On node-hot-add, kswapd will be moved to the proper cpus if cpus are hot-added. | 2968 | * On node-hot-add, kswapd will be moved to the proper cpus if cpus are hot-added. |
| 2954 | */ | 2969 | */ |
| 2955 | int kswapd_run(int nid) | 2970 | int kswapd_run(int nid) |
| 2956 | { | 2971 | { |
| 2957 | pg_data_t *pgdat = NODE_DATA(nid); | 2972 | pg_data_t *pgdat = NODE_DATA(nid); |
| 2958 | int ret = 0; | 2973 | int ret = 0; |
| 2959 | 2974 | ||
| 2960 | if (pgdat->kswapd) | 2975 | if (pgdat->kswapd) |
| 2961 | return 0; | 2976 | return 0; |
| 2962 | 2977 | ||
| 2963 | pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); | 2978 | pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); |
| 2964 | if (IS_ERR(pgdat->kswapd)) { | 2979 | if (IS_ERR(pgdat->kswapd)) { |
| 2965 | /* failure at boot is fatal */ | 2980 | /* failure at boot is fatal */ |
| 2966 | BUG_ON(system_state == SYSTEM_BOOTING); | 2981 | BUG_ON(system_state == SYSTEM_BOOTING); |
| 2967 | printk("Failed to start kswapd on node %d\n",nid); | 2982 | printk("Failed to start kswapd on node %d\n",nid); |
| 2968 | ret = -1; | 2983 | ret = -1; |
| 2969 | } | 2984 | } |
| 2970 | return ret; | 2985 | return ret; |
| 2971 | } | 2986 | } |
| 2972 | 2987 | ||
| 2973 | /* | 2988 | /* |
| 2974 | * Called by memory hotplug when all memory in a node is offlined. | 2989 | * Called by memory hotplug when all memory in a node is offlined. |
| 2975 | */ | 2990 | */ |
| 2976 | void kswapd_stop(int nid) | 2991 | void kswapd_stop(int nid) |
| 2977 | { | 2992 | { |
| 2978 | struct task_struct *kswapd = NODE_DATA(nid)->kswapd; | 2993 | struct task_struct *kswapd = NODE_DATA(nid)->kswapd; |
| 2979 | 2994 | ||
| 2980 | if (kswapd) | 2995 | if (kswapd) |
| 2981 | kthread_stop(kswapd); | 2996 | kthread_stop(kswapd); |
| 2982 | } | 2997 | } |
| 2983 | 2998 | ||
| 2984 | static int __init kswapd_init(void) | 2999 | static int __init kswapd_init(void) |
| 2985 | { | 3000 | { |
| 2986 | int nid; | 3001 | int nid; |
| 2987 | 3002 | ||
| 2988 | swap_setup(); | 3003 | swap_setup(); |
| 2989 | for_each_node_state(nid, N_HIGH_MEMORY) | 3004 | for_each_node_state(nid, N_HIGH_MEMORY) |
| 2990 | kswapd_run(nid); | 3005 | kswapd_run(nid); |
| 2991 | hotcpu_notifier(cpu_callback, 0); | 3006 | hotcpu_notifier(cpu_callback, 0); |
| 2992 | return 0; | 3007 | return 0; |
| 2993 | } | 3008 | } |
| 2994 | 3009 | ||
| 2995 | module_init(kswapd_init) | 3010 | module_init(kswapd_init) |
| 2996 | 3011 | ||
| 2997 | #ifdef CONFIG_NUMA | 3012 | #ifdef CONFIG_NUMA |
| 2998 | /* | 3013 | /* |
| 2999 | * Zone reclaim mode | 3014 | * Zone reclaim mode |
| 3000 | * | 3015 | * |
| 3001 | * If non-zero call zone_reclaim when the number of free pages falls below | 3016 | * If non-zero call zone_reclaim when the number of free pages falls below |
| 3002 | * the watermarks. | 3017 | * the watermarks. |
| 3003 | */ | 3018 | */ |
| 3004 | int zone_reclaim_mode __read_mostly; | 3019 | int zone_reclaim_mode __read_mostly; |
| 3005 | 3020 | ||
| 3006 | #define RECLAIM_OFF 0 | 3021 | #define RECLAIM_OFF 0 |
| 3007 | #define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */ | 3022 | #define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */ |
| 3008 | #define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ | 3023 | #define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ |
| 3009 | #define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */ | 3024 | #define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */ |
| 3010 | 3025 | ||
| 3011 | /* | 3026 | /* |
| 3012 | * Priority for ZONE_RECLAIM. This determines the fraction of pages | 3027 | * Priority for ZONE_RECLAIM. This determines the fraction of pages |
| 3013 | * of a node considered for each zone_reclaim. 4 scans 1/16th of | 3028 | * of a node considered for each zone_reclaim. 4 scans 1/16th of |
| 3014 | * a zone. | 3029 | * a zone. |
| 3015 | */ | 3030 | */ |
| 3016 | #define ZONE_RECLAIM_PRIORITY 4 | 3031 | #define ZONE_RECLAIM_PRIORITY 4 |
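
A worked example of the comment above: the fraction of a zone considered at a given priority is roughly 1/2^priority, so ZONE_RECLAIM_PRIORITY == 4 looks at about 1/16th of the zone per pass (the zone size below is illustrative):

        /* Illustrative only: pages considered per pass at a given priority. */
        #include <stdio.h>

        int main(void)
        {
                unsigned long zone_pages = 262144;      /* a 1 GiB zone in 4K pages */
                int priority = 4;                       /* ZONE_RECLAIM_PRIORITY */

                printf("pages considered per pass: ~%lu (1/16th of the zone)\n",
                       zone_pages >> priority);         /* 16384 */
                return 0;
        }
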
| 3017 | 3032 | ||
| 3018 | /* | 3033 | /* |
| 3019 | * Percentage of pages in a zone that must be unmapped for zone_reclaim to | 3034 | * Percentage of pages in a zone that must be unmapped for zone_reclaim to |
| 3020 | * occur. | 3035 | * occur. |
| 3021 | */ | 3036 | */ |
| 3022 | int sysctl_min_unmapped_ratio = 1; | 3037 | int sysctl_min_unmapped_ratio = 1; |
| 3023 | 3038 | ||
| 3024 | /* | 3039 | /* |
| 3025 | * If the number of slab pages in a zone grows beyond this percentage then | 3040 | * If the number of slab pages in a zone grows beyond this percentage then |
| 3026 | * slab reclaim needs to occur. | 3041 | * slab reclaim needs to occur. |
| 3027 | */ | 3042 | */ |
| 3028 | int sysctl_min_slab_ratio = 5; | 3043 | int sysctl_min_slab_ratio = 5; |
| 3029 | 3044 | ||
| 3030 | static inline unsigned long zone_unmapped_file_pages(struct zone *zone) | 3045 | static inline unsigned long zone_unmapped_file_pages(struct zone *zone) |
| 3031 | { | 3046 | { |
| 3032 | unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED); | 3047 | unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED); |
| 3033 | unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) + | 3048 | unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) + |
| 3034 | zone_page_state(zone, NR_ACTIVE_FILE); | 3049 | zone_page_state(zone, NR_ACTIVE_FILE); |
| 3035 | 3050 | ||
| 3036 | /* | 3051 | /* |
| 3037 | * It's possible for there to be more file mapped pages than | 3052 | * It's possible for there to be more file mapped pages than |
| 3038 | * accounted for by the pages on the file LRU lists because | 3053 | * accounted for by the pages on the file LRU lists because |
| 3039 | * tmpfs pages accounted for as ANON can also be FILE_MAPPED | 3054 | * tmpfs pages accounted for as ANON can also be FILE_MAPPED |
| 3040 | */ | 3055 | */ |
| 3041 | return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0; | 3056 | return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0; |
| 3042 | } | 3057 | } |
| 3043 | 3058 | ||
| 3044 | /* Work out how many page cache pages we can reclaim in this reclaim_mode */ | 3059 | /* Work out how many page cache pages we can reclaim in this reclaim_mode */ |
| 3045 | static long zone_pagecache_reclaimable(struct zone *zone) | 3060 | static long zone_pagecache_reclaimable(struct zone *zone) |
| 3046 | { | 3061 | { |
| 3047 | long nr_pagecache_reclaimable; | 3062 | long nr_pagecache_reclaimable; |
| 3048 | long delta = 0; | 3063 | long delta = 0; |
| 3049 | 3064 | ||
| 3050 | /* | 3065 | /* |
| 3051 | * If RECLAIM_SWAP is set, then all file pages are considered | 3066 | * If RECLAIM_SWAP is set, then all file pages are considered |
| 3052 | * potentially reclaimable. Otherwise, we have to worry about | 3067 | * potentially reclaimable. Otherwise, we have to worry about |
| 3053 | * pages like swapcache and zone_unmapped_file_pages() provides | 3068 | * pages like swapcache and zone_unmapped_file_pages() provides |
| 3054 | * a better estimate | 3069 | * a better estimate |
| 3055 | */ | 3070 | */ |
| 3056 | if (zone_reclaim_mode & RECLAIM_SWAP) | 3071 | if (zone_reclaim_mode & RECLAIM_SWAP) |
| 3057 | nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES); | 3072 | nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES); |
| 3058 | else | 3073 | else |
| 3059 | nr_pagecache_reclaimable = zone_unmapped_file_pages(zone); | 3074 | nr_pagecache_reclaimable = zone_unmapped_file_pages(zone); |
| 3060 | 3075 | ||
| 3061 | /* If we can't clean pages, remove dirty pages from consideration */ | 3076 | /* If we can't clean pages, remove dirty pages from consideration */ |
| 3062 | if (!(zone_reclaim_mode & RECLAIM_WRITE)) | 3077 | if (!(zone_reclaim_mode & RECLAIM_WRITE)) |
| 3063 | delta += zone_page_state(zone, NR_FILE_DIRTY); | 3078 | delta += zone_page_state(zone, NR_FILE_DIRTY); |
| 3064 | 3079 | ||
| 3065 | /* Watch for any possible underflows due to delta */ | 3080 | /* Watch for any possible underflows due to delta */ |
| 3066 | if (unlikely(delta > nr_pagecache_reclaimable)) | 3081 | if (unlikely(delta > nr_pagecache_reclaimable)) |
| 3067 | delta = nr_pagecache_reclaimable; | 3082 | delta = nr_pagecache_reclaimable; |
| 3068 | 3083 | ||
| 3069 | return nr_pagecache_reclaimable - delta; | 3084 | return nr_pagecache_reclaimable - delta; |
| 3070 | } | 3085 | } |
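
A standalone sketch of the estimate above: start from either all file pages or just the unmapped ones, subtract dirty pages when writeback is not allowed, and clamp the subtraction so the result never goes negative:

        /* Underflow-safe estimate sketch; plain longs stand in for zone_page_state(). */
        #include <stdio.h>

        static long pagecache_reclaimable_sketch(long base_pages, long dirty_pages,
                                                 int may_write)
        {
                long delta = 0;

                if (!may_write)
                        delta += dirty_pages;

                /* never subtract more than we started with */
                if (delta > base_pages)
                        delta = base_pages;

                return base_pages - delta;
        }

        int main(void)
        {
                printf("%ld\n", pagecache_reclaimable_sketch(1000, 300, 0));    /* 700 */
                printf("%ld\n", pagecache_reclaimable_sketch(200, 300, 0));     /* 0, clamped */
                printf("%ld\n", pagecache_reclaimable_sketch(200, 300, 1));     /* 200 */
                return 0;
        }
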
| 3071 | 3086 | ||
| 3072 | /* | 3087 | /* |
| 3073 | * Try to free up some pages from this zone through reclaim. | 3088 | * Try to free up some pages from this zone through reclaim. |
| 3074 | */ | 3089 | */ |
| 3075 | static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | 3090 | static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) |
| 3076 | { | 3091 | { |
| 3077 | /* Minimum pages needed in order to stay on node */ | 3092 | /* Minimum pages needed in order to stay on node */ |
| 3078 | const unsigned long nr_pages = 1 << order; | 3093 | const unsigned long nr_pages = 1 << order; |
| 3079 | struct task_struct *p = current; | 3094 | struct task_struct *p = current; |
| 3080 | struct reclaim_state reclaim_state; | 3095 | struct reclaim_state reclaim_state; |
| 3081 | int priority; | 3096 | int priority; |
| 3082 | struct scan_control sc = { | 3097 | struct scan_control sc = { |
| 3083 | .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), | 3098 | .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), |
| 3084 | .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), | 3099 | .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), |
| 3085 | .may_swap = 1, | 3100 | .may_swap = 1, |
| 3086 | .nr_to_reclaim = max_t(unsigned long, nr_pages, | 3101 | .nr_to_reclaim = max_t(unsigned long, nr_pages, |
| 3087 | SWAP_CLUSTER_MAX), | 3102 | SWAP_CLUSTER_MAX), |
| 3088 | .gfp_mask = gfp_mask, | 3103 | .gfp_mask = gfp_mask, |
| 3089 | .swappiness = vm_swappiness, | 3104 | .swappiness = vm_swappiness, |
| 3090 | .order = order, | 3105 | .order = order, |
| 3091 | }; | 3106 | }; |
| 3092 | struct shrink_control shrink = { | 3107 | struct shrink_control shrink = { |
| 3093 | .gfp_mask = sc.gfp_mask, | 3108 | .gfp_mask = sc.gfp_mask, |
| 3094 | }; | 3109 | }; |
| 3095 | unsigned long nr_slab_pages0, nr_slab_pages1; | 3110 | unsigned long nr_slab_pages0, nr_slab_pages1; |
| 3096 | 3111 | ||
| 3097 | cond_resched(); | 3112 | cond_resched(); |
| 3098 | /* | 3113 | /* |
| 3099 | * We need to be able to allocate from the reserves for RECLAIM_SWAP | 3114 | * We need to be able to allocate from the reserves for RECLAIM_SWAP |
| 3100 | * and we also need to be able to write out pages for RECLAIM_WRITE | 3115 | * and we also need to be able to write out pages for RECLAIM_WRITE |
| 3101 | * and RECLAIM_SWAP. | 3116 | * and RECLAIM_SWAP. |
| 3102 | */ | 3117 | */ |
| 3103 | p->flags |= PF_MEMALLOC | PF_SWAPWRITE; | 3118 | p->flags |= PF_MEMALLOC | PF_SWAPWRITE; |
| 3104 | lockdep_set_current_reclaim_state(gfp_mask); | 3119 | lockdep_set_current_reclaim_state(gfp_mask); |
| 3105 | reclaim_state.reclaimed_slab = 0; | 3120 | reclaim_state.reclaimed_slab = 0; |
| 3106 | p->reclaim_state = &reclaim_state; | 3121 | p->reclaim_state = &reclaim_state; |
| 3107 | 3122 | ||
| 3108 | if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) { | 3123 | if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) { |
| 3109 | /* | 3124 | /* |
| 3110 | * Free memory by calling shrink zone with increasing | 3125 | * Free memory by calling shrink zone with increasing |
| 3111 | * priorities until we have enough memory freed. | 3126 | * priorities until we have enough memory freed. |
| 3112 | */ | 3127 | */ |
| 3113 | priority = ZONE_RECLAIM_PRIORITY; | 3128 | priority = ZONE_RECLAIM_PRIORITY; |
| 3114 | do { | 3129 | do { |
| 3115 | shrink_zone(priority, zone, &sc); | 3130 | shrink_zone(priority, zone, &sc); |
| 3116 | priority--; | 3131 | priority--; |
| 3117 | } while (priority >= 0 && sc.nr_reclaimed < nr_pages); | 3132 | } while (priority >= 0 && sc.nr_reclaimed < nr_pages); |
| 3118 | } | 3133 | } |
| 3119 | 3134 | ||
| 3120 | nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE); | 3135 | nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE); |
| 3121 | if (nr_slab_pages0 > zone->min_slab_pages) { | 3136 | if (nr_slab_pages0 > zone->min_slab_pages) { |
| 3122 | /* | 3137 | /* |
| 3123 | * shrink_slab() does not currently allow us to determine how | 3138 | * shrink_slab() does not currently allow us to determine how |
| 3124 | * many pages were freed in this zone. So we take the current | 3139 | * many pages were freed in this zone. So we take the current |
| 3125 | * number of slab pages and shake the slab until it is reduced | 3140 | * number of slab pages and shake the slab until it is reduced |
| 3126 | * by the same nr_pages that we used for reclaiming unmapped | 3141 | * by the same nr_pages that we used for reclaiming unmapped |
| 3127 | * pages. | 3142 | * pages. |
| 3128 | * | 3143 | * |
| 3129 | * Note that shrink_slab will free memory on all zones and may | 3144 | * Note that shrink_slab will free memory on all zones and may |
| 3130 | * take a long time. | 3145 | * take a long time. |
| 3131 | */ | 3146 | */ |
| 3132 | for (;;) { | 3147 | for (;;) { |
| 3133 | unsigned long lru_pages = zone_reclaimable_pages(zone); | 3148 | unsigned long lru_pages = zone_reclaimable_pages(zone); |
| 3134 | 3149 | ||
| 3135 | /* No reclaimable slab or very low memory pressure */ | 3150 | /* No reclaimable slab or very low memory pressure */ |
| 3136 | if (!shrink_slab(&shrink, sc.nr_scanned, lru_pages)) | 3151 | if (!shrink_slab(&shrink, sc.nr_scanned, lru_pages)) |
| 3137 | break; | 3152 | break; |
| 3138 | 3153 | ||
| 3139 | /* Freed enough memory */ | 3154 | /* Freed enough memory */ |
| 3140 | nr_slab_pages1 = zone_page_state(zone, | 3155 | nr_slab_pages1 = zone_page_state(zone, |
| 3141 | NR_SLAB_RECLAIMABLE); | 3156 | NR_SLAB_RECLAIMABLE); |
| 3142 | if (nr_slab_pages1 + nr_pages <= nr_slab_pages0) | 3157 | if (nr_slab_pages1 + nr_pages <= nr_slab_pages0) |
| 3143 | break; | 3158 | break; |
| 3144 | } | 3159 | } |
| 3145 | 3160 | ||
| 3146 | /* | 3161 | /* |
| 3147 | * Update nr_reclaimed by the number of slab pages we | 3162 | * Update nr_reclaimed by the number of slab pages we |
| 3148 | * reclaimed from this zone. | 3163 | * reclaimed from this zone. |
| 3149 | */ | 3164 | */ |
| 3150 | nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE); | 3165 | nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE); |
| 3151 | if (nr_slab_pages1 < nr_slab_pages0) | 3166 | if (nr_slab_pages1 < nr_slab_pages0) |
| 3152 | sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1; | 3167 | sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1; |
| 3153 | } | 3168 | } |
| 3154 | 3169 | ||
| 3155 | p->reclaim_state = NULL; | 3170 | p->reclaim_state = NULL; |
| 3156 | current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); | 3171 | current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); |
| 3157 | lockdep_clear_current_reclaim_state(); | 3172 | lockdep_clear_current_reclaim_state(); |
| 3158 | return sc.nr_reclaimed >= nr_pages; | 3173 | return sc.nr_reclaimed >= nr_pages; |
| 3159 | } | 3174 | } |
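The RECLAIM_WRITE and RECLAIM_SWAP tests in __zone_reclaim() come from the vm.zone_reclaim_mode sysctl. As a minimal, illustrative sketch (bit values assumed from the conventional layout of this era's kernels; the real definitions live earlier in mm/vmscan.c and are not shown in this hunk):

/* Sketch only -- values assumed, not quoted from this diff. */
#define RECLAIM_OFF	0
#define RECLAIM_ZONE	(1 << 0)	/* run page reclaim against the zone */
#define RECLAIM_WRITE	(1 << 1)	/* allow writing out dirty pages */
#define RECLAIM_SWAP	(1 << 2)	/* allow unmapping/swapping pages */

static int reclaim_allows_writeback(int mode)
{
	/* mirrors the !!(zone_reclaim_mode & RECLAIM_WRITE) test above */
	return !!(mode & RECLAIM_WRITE);
}

Under that assumed layout, vm.zone_reclaim_mode=3 would enable zone reclaim with dirty-page writeback while still leaving mapped pages alone.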
| 3160 | 3175 | ||
| 3161 | int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | 3176 | int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) |
| 3162 | { | 3177 | { |
| 3163 | int node_id; | 3178 | int node_id; |
| 3164 | int ret; | 3179 | int ret; |
| 3165 | 3180 | ||
| 3166 | /* | 3181 | /* |
| 3167 | * Zone reclaim reclaims unmapped file backed pages and | 3182 | * Zone reclaim reclaims unmapped file backed pages and |
| 3168 | * slab pages if we are over the defined limits. | 3183 | * slab pages if we are over the defined limits. |
| 3169 | * | 3184 | * |
| 3170 | * A small portion of unmapped file backed pages is needed for | 3185 | * A small portion of unmapped file backed pages is needed for |
| 3171 | * file I/O otherwise pages read by file I/O will be immediately | 3186 | * file I/O otherwise pages read by file I/O will be immediately |
| 3172 | * thrown out if the zone is overallocated. So we do not reclaim | 3187 | * thrown out if the zone is overallocated. So we do not reclaim |
| 3173 | * if less than a specified percentage of the zone is used by | 3188 | * if less than a specified percentage of the zone is used by |
| 3174 | * unmapped file backed pages. | 3189 | * unmapped file backed pages. |
| 3175 | */ | 3190 | */ |
| 3176 | if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages && | 3191 | if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages && |
| 3177 | zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages) | 3192 | zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages) |
| 3178 | return ZONE_RECLAIM_FULL; | 3193 | return ZONE_RECLAIM_FULL; |
| 3179 | 3194 | ||
| 3180 | if (zone->all_unreclaimable) | 3195 | if (zone->all_unreclaimable) |
| 3181 | return ZONE_RECLAIM_FULL; | 3196 | return ZONE_RECLAIM_FULL; |
| 3182 | 3197 | ||
| 3183 | /* | 3198 | /* |
| 3184 | * Do not scan if the allocation should not be delayed. | 3199 | * Do not scan if the allocation should not be delayed. |
| 3185 | */ | 3200 | */ |
| 3186 | if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC)) | 3201 | if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC)) |
| 3187 | return ZONE_RECLAIM_NOSCAN; | 3202 | return ZONE_RECLAIM_NOSCAN; |
| 3188 | 3203 | ||
| 3189 | /* | 3204 | /* |
| 3190 | * Only run zone reclaim on the local zone or on zones that do not | 3205 | * Only run zone reclaim on the local zone or on zones that do not |
| 3191 | * have associated processors. This will favor the local processor | 3206 | * have associated processors. This will favor the local processor |
| 3192 | * over remote processors and spread off node memory allocations | 3207 | * over remote processors and spread off node memory allocations |
| 3193 | * as wide as possible. | 3208 | * as wide as possible. |
| 3194 | */ | 3209 | */ |
| 3195 | node_id = zone_to_nid(zone); | 3210 | node_id = zone_to_nid(zone); |
| 3196 | if (node_state(node_id, N_CPU) && node_id != numa_node_id()) | 3211 | if (node_state(node_id, N_CPU) && node_id != numa_node_id()) |
| 3197 | return ZONE_RECLAIM_NOSCAN; | 3212 | return ZONE_RECLAIM_NOSCAN; |
| 3198 | 3213 | ||
| 3199 | if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED)) | 3214 | if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED)) |
| 3200 | return ZONE_RECLAIM_NOSCAN; | 3215 | return ZONE_RECLAIM_NOSCAN; |
| 3201 | 3216 | ||
| 3202 | ret = __zone_reclaim(zone, gfp_mask, order); | 3217 | ret = __zone_reclaim(zone, gfp_mask, order); |
| 3203 | zone_clear_flag(zone, ZONE_RECLAIM_LOCKED); | 3218 | zone_clear_flag(zone, ZONE_RECLAIM_LOCKED); |
| 3204 | 3219 | ||
| 3205 | if (!ret) | 3220 | if (!ret) |
| 3206 | count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED); | 3221 | count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED); |
| 3207 | 3222 | ||
| 3208 | return ret; | 3223 | return ret; |
| 3209 | } | 3224 | } |
| 3210 | #endif | 3225 | #endif |
| 3211 | 3226 | ||
| 3212 | /* | 3227 | /* |
| 3213 | * page_evictable - test whether a page is evictable | 3228 | * page_evictable - test whether a page is evictable |
| 3214 | * @page: the page to test | 3229 | * @page: the page to test |
| 3215 | * @vma: the VMA in which the page is or will be mapped, may be NULL | 3230 | * @vma: the VMA in which the page is or will be mapped, may be NULL |
| 3216 | * | 3231 | * |
| 3217 | * Test whether page is evictable--i.e., should be placed on active/inactive | 3232 | * Test whether page is evictable--i.e., should be placed on active/inactive |
| 3218 | * lists vs unevictable list. The vma argument is !NULL when called from the | 3233 | * lists vs unevictable list. The vma argument is !NULL when called from the |
| 3219 | * fault path to determine how to instantiate a new page. | 3234 | * fault path to determine how to instantiate a new page. |
| 3220 | * | 3235 | * |
| 3221 | * Reasons page might not be evictable: | 3236 | * Reasons page might not be evictable: |
| 3222 | * (1) page's mapping marked unevictable | 3237 | * (1) page's mapping marked unevictable |
| 3223 | * (2) page is part of an mlocked VMA | 3238 | * (2) page is part of an mlocked VMA |
| 3224 | * | 3239 | * |
| 3225 | */ | 3240 | */ |
| 3226 | int page_evictable(struct page *page, struct vm_area_struct *vma) | 3241 | int page_evictable(struct page *page, struct vm_area_struct *vma) |
| 3227 | { | 3242 | { |
| 3228 | 3243 | ||
| 3229 | if (mapping_unevictable(page_mapping(page))) | 3244 | if (mapping_unevictable(page_mapping(page))) |
| 3230 | return 0; | 3245 | return 0; |
| 3231 | 3246 | ||
| 3232 | if (PageMlocked(page) || (vma && is_mlocked_vma(vma, page))) | 3247 | if (PageMlocked(page) || (vma && is_mlocked_vma(vma, page))) |
| 3233 | return 0; | 3248 | return 0; |
| 3234 | 3249 | ||
| 3235 | return 1; | 3250 | return 1; |
| 3236 | } | 3251 | } |
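page_evictable() boils down to two tests: the mapping is not marked unevictable and the page is not mlocked. A self-contained toy sketch of the caller-side decision it supports (the types and helpers below are hypothetical stand-ins, not kernel API):

/* Illustrative stand-ins only -- not kernel structures or functions. */
struct example_page {
	int mapping_unevictable;	/* address_space flagged unevictable */
	int mlocked;			/* page lives in an mlocked VMA */
};

static int example_page_evictable(const struct example_page *page)
{
	return !page->mapping_unevictable && !page->mlocked;
}

static const char *example_putback_target(const struct example_page *page)
{
	/* evictable pages go back onto the regular active/inactive LRUs */
	if (example_page_evictable(page))
		return "active/inactive lru";
	/* everything else is parked on the unevictable list */
	return "unevictable lru";
}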
| 3237 | 3252 | ||
| 3238 | /** | 3253 | /** |
| 3239 | * check_move_unevictable_page - check page for evictability and move to appropriate zone lru list | 3254 | * check_move_unevictable_page - check page for evictability and move to appropriate zone lru list |
| 3240 | * @page: page to check evictability and move to appropriate lru list | 3255 | * @page: page to check evictability and move to appropriate lru list |
| 3241 | * @zone: zone page is in | 3256 | * @zone: zone page is in |
| 3242 | * | 3257 | * |
| 3243 | * Checks a page for evictability and moves the page to the appropriate | 3258 | * Checks a page for evictability and moves the page to the appropriate |
| 3244 | * zone lru list. | 3259 | * zone lru list. |
| 3245 | * | 3260 | * |
| 3246 | * Restrictions: zone->lru_lock must be held, page must be on LRU and must | 3261 | * Restrictions: zone->lru_lock must be held, page must be on LRU and must |
| 3247 | * have PageUnevictable set. | 3262 | * have PageUnevictable set. |
| 3248 | */ | 3263 | */ |
| 3249 | static void check_move_unevictable_page(struct page *page, struct zone *zone) | 3264 | static void check_move_unevictable_page(struct page *page, struct zone *zone) |
| 3250 | { | 3265 | { |
| 3251 | VM_BUG_ON(PageActive(page)); | 3266 | VM_BUG_ON(PageActive(page)); |
| 3252 | 3267 | ||
| 3253 | retry: | 3268 | retry: |
| 3254 | ClearPageUnevictable(page); | 3269 | ClearPageUnevictable(page); |
| 3255 | if (page_evictable(page, NULL)) { | 3270 | if (page_evictable(page, NULL)) { |
| 3256 | enum lru_list l = page_lru_base_type(page); | 3271 | enum lru_list l = page_lru_base_type(page); |
| 3257 | 3272 | ||
| 3258 | __dec_zone_state(zone, NR_UNEVICTABLE); | 3273 | __dec_zone_state(zone, NR_UNEVICTABLE); |
| 3259 | list_move(&page->lru, &zone->lru[l].list); | 3274 | list_move(&page->lru, &zone->lru[l].list); |
| 3260 | mem_cgroup_move_lists(page, LRU_UNEVICTABLE, l); | 3275 | mem_cgroup_move_lists(page, LRU_UNEVICTABLE, l); |
| 3261 | __inc_zone_state(zone, NR_INACTIVE_ANON + l); | 3276 | __inc_zone_state(zone, NR_INACTIVE_ANON + l); |
| 3262 | __count_vm_event(UNEVICTABLE_PGRESCUED); | 3277 | __count_vm_event(UNEVICTABLE_PGRESCUED); |
| 3263 | } else { | 3278 | } else { |
| 3264 | /* | 3279 | /* |
| 3265 | * rotate unevictable list | 3280 | * rotate unevictable list |
| 3266 | */ | 3281 | */ |
| 3267 | SetPageUnevictable(page); | 3282 | SetPageUnevictable(page); |
| 3268 | list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list); | 3283 | list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list); |
| 3269 | mem_cgroup_rotate_lru_list(page, LRU_UNEVICTABLE); | 3284 | mem_cgroup_rotate_lru_list(page, LRU_UNEVICTABLE); |
| 3270 | if (page_evictable(page, NULL)) | 3285 | if (page_evictable(page, NULL)) |
| 3271 | goto retry; | 3286 | goto retry; |
| 3272 | } | 3287 | } |
| 3273 | } | 3288 | } |
| 3274 | 3289 | ||
| 3275 | /** | 3290 | /** |
| 3276 | * scan_mapping_unevictable_pages - scan an address space for evictable pages | 3291 | * scan_mapping_unevictable_pages - scan an address space for evictable pages |
| 3277 | * @mapping: struct address_space to scan for evictable pages | 3292 | * @mapping: struct address_space to scan for evictable pages |
| 3278 | * | 3293 | * |
| 3279 | * Scan all pages in mapping. Check unevictable pages for | 3294 | * Scan all pages in mapping. Check unevictable pages for |
| 3280 | * evictability and move them to the appropriate zone lru list. | 3295 | * evictability and move them to the appropriate zone lru list. |
| 3281 | */ | 3296 | */ |
| 3282 | void scan_mapping_unevictable_pages(struct address_space *mapping) | 3297 | void scan_mapping_unevictable_pages(struct address_space *mapping) |
| 3283 | { | 3298 | { |
| 3284 | pgoff_t next = 0; | 3299 | pgoff_t next = 0; |
| 3285 | pgoff_t end = (i_size_read(mapping->host) + PAGE_CACHE_SIZE - 1) >> | 3300 | pgoff_t end = (i_size_read(mapping->host) + PAGE_CACHE_SIZE - 1) >> |
| 3286 | PAGE_CACHE_SHIFT; | 3301 | PAGE_CACHE_SHIFT; |
| 3287 | struct zone *zone; | 3302 | struct zone *zone; |
| 3288 | struct pagevec pvec; | 3303 | struct pagevec pvec; |
| 3289 | 3304 | ||
| 3290 | if (mapping->nrpages == 0) | 3305 | if (mapping->nrpages == 0) |
| 3291 | return; | 3306 | return; |
| 3292 | 3307 | ||
| 3293 | pagevec_init(&pvec, 0); | 3308 | pagevec_init(&pvec, 0); |
| 3294 | while (next < end && | 3309 | while (next < end && |
| 3295 | pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { | 3310 | pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { |
| 3296 | int i; | 3311 | int i; |
| 3297 | int pg_scanned = 0; | 3312 | int pg_scanned = 0; |
| 3298 | 3313 | ||
| 3299 | zone = NULL; | 3314 | zone = NULL; |
| 3300 | 3315 | ||
| 3301 | for (i = 0; i < pagevec_count(&pvec); i++) { | 3316 | for (i = 0; i < pagevec_count(&pvec); i++) { |
| 3302 | struct page *page = pvec.pages[i]; | 3317 | struct page *page = pvec.pages[i]; |
| 3303 | pgoff_t page_index = page->index; | 3318 | pgoff_t page_index = page->index; |
| 3304 | struct zone *pagezone = page_zone(page); | 3319 | struct zone *pagezone = page_zone(page); |
| 3305 | 3320 | ||
| 3306 | pg_scanned++; | 3321 | pg_scanned++; |
| 3307 | if (page_index > next) | 3322 | if (page_index > next) |
| 3308 | next = page_index; | 3323 | next = page_index; |
| 3309 | next++; | 3324 | next++; |
| 3310 | 3325 | ||
| 3311 | if (pagezone != zone) { | 3326 | if (pagezone != zone) { |
| 3312 | if (zone) | 3327 | if (zone) |
| 3313 | spin_unlock_irq(&zone->lru_lock); | 3328 | spin_unlock_irq(&zone->lru_lock); |
| 3314 | zone = pagezone; | 3329 | zone = pagezone; |
| 3315 | spin_lock_irq(&zone->lru_lock); | 3330 | spin_lock_irq(&zone->lru_lock); |
| 3316 | } | 3331 | } |
| 3317 | 3332 | ||
| 3318 | if (PageLRU(page) && PageUnevictable(page)) | 3333 | if (PageLRU(page) && PageUnevictable(page)) |
| 3319 | check_move_unevictable_page(page, zone); | 3334 | check_move_unevictable_page(page, zone); |
| 3320 | } | 3335 | } |
| 3321 | if (zone) | 3336 | if (zone) |
| 3322 | spin_unlock_irq(&zone->lru_lock); | 3337 | spin_unlock_irq(&zone->lru_lock); |
| 3323 | pagevec_release(&pvec); | 3338 | pagevec_release(&pvec); |
| 3324 | 3339 | ||
| 3325 | count_vm_events(UNEVICTABLE_PGSCANNED, pg_scanned); | 3340 | count_vm_events(UNEVICTABLE_PGSCANNED, pg_scanned); |
| 3326 | } | 3341 | } |
| 3327 | 3342 | ||
| 3328 | } | 3343 | } |
| 3329 | 3344 | ||
| 3330 | /** | 3345 | /** |
| 3331 | * scan_zone_unevictable_pages - check unevictable list for evictable pages | 3346 | * scan_zone_unevictable_pages - check unevictable list for evictable pages |
| 3332 | * @zone - zone whose unevictable list is to be scanned | 3347 | * @zone - zone whose unevictable list is to be scanned |
| 3333 | * | 3348 | * |
| 3334 | * Scan @zone's unevictable LRU lists to check for pages that have become | 3349 | * Scan @zone's unevictable LRU lists to check for pages that have become |
| 3335 | * evictable. Move those that have to @zone's inactive list where they | 3350 | * evictable. Move those that have to @zone's inactive list where they |
| 3336 | * become candidates for reclaim, unless shrink_inactive_zone() decides | 3351 | * become candidates for reclaim, unless shrink_inactive_zone() decides |
| 3337 | * to reactivate them. Pages that are still unevictable are rotated | 3352 | * to reactivate them. Pages that are still unevictable are rotated |
| 3338 | * back onto @zone's unevictable list. | 3353 | * back onto @zone's unevictable list. |
| 3339 | */ | 3354 | */ |
| 3340 | #define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */ | 3355 | #define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */ |
| 3341 | static void scan_zone_unevictable_pages(struct zone *zone) | 3356 | static void scan_zone_unevictable_pages(struct zone *zone) |
| 3342 | { | 3357 | { |
| 3343 | struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list; | 3358 | struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list; |
| 3344 | unsigned long scan; | 3359 | unsigned long scan; |
| 3345 | unsigned long nr_to_scan = zone_page_state(zone, NR_UNEVICTABLE); | 3360 | unsigned long nr_to_scan = zone_page_state(zone, NR_UNEVICTABLE); |
| 3346 | 3361 | ||
| 3347 | while (nr_to_scan > 0) { | 3362 | while (nr_to_scan > 0) { |
| 3348 | unsigned long batch_size = min(nr_to_scan, | 3363 | unsigned long batch_size = min(nr_to_scan, |
| 3349 | SCAN_UNEVICTABLE_BATCH_SIZE); | 3364 | SCAN_UNEVICTABLE_BATCH_SIZE); |
| 3350 | 3365 | ||
| 3351 | spin_lock_irq(&zone->lru_lock); | 3366 | spin_lock_irq(&zone->lru_lock); |
| 3352 | for (scan = 0; scan < batch_size; scan++) { | 3367 | for (scan = 0; scan < batch_size; scan++) { |
| 3353 | struct page *page = lru_to_page(l_unevictable); | 3368 | struct page *page = lru_to_page(l_unevictable); |
| 3354 | 3369 | ||
| 3355 | if (!trylock_page(page)) | 3370 | if (!trylock_page(page)) |
| 3356 | continue; | 3371 | continue; |
| 3357 | 3372 | ||
| 3358 | prefetchw_prev_lru_page(page, l_unevictable, flags); | 3373 | prefetchw_prev_lru_page(page, l_unevictable, flags); |
| 3359 | 3374 | ||
| 3360 | if (likely(PageLRU(page) && PageUnevictable(page))) | 3375 | if (likely(PageLRU(page) && PageUnevictable(page))) |
| 3361 | check_move_unevictable_page(page, zone); | 3376 | check_move_unevictable_page(page, zone); |
| 3362 | 3377 | ||
| 3363 | unlock_page(page); | 3378 | unlock_page(page); |
| 3364 | } | 3379 | } |
| 3365 | spin_unlock_irq(&zone->lru_lock); | 3380 | spin_unlock_irq(&zone->lru_lock); |
| 3366 | 3381 | ||
| 3367 | nr_to_scan -= batch_size; | 3382 | nr_to_scan -= batch_size; |
| 3368 | } | 3383 | } |
| 3369 | } | 3384 | } |
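scan_zone_unevictable_pages() never holds zone->lru_lock for more than SCAN_UNEVICTABLE_BATCH_SIZE pages at a time, so a huge unevictable list cannot starve other lru_lock users or keep interrupts disabled for long. The pattern in isolation, as a hedged generic sketch (all names below are hypothetical):

/* Generic batching sketch -- hypothetical scaffolding, not kernel code. */
#define EXAMPLE_BATCH	16UL

static void process_in_batches(unsigned long total,
			       void (*lock)(void), void (*unlock)(void),
			       void (*process_one)(void))
{
	while (total > 0) {
		unsigned long batch = total < EXAMPLE_BATCH ? total : EXAMPLE_BATCH;
		unsigned long i;

		lock();				/* bounded critical section */
		for (i = 0; i < batch; i++)
			process_one();
		unlock();			/* let other lock users run */

		total -= batch;
	}
}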
| 3370 | 3385 | ||
| 3371 | 3386 | ||
| 3372 | /** | 3387 | /** |
| 3373 | * scan_all_zones_unevictable_pages - scan all unevictable lists for evictable pages | 3388 | * scan_all_zones_unevictable_pages - scan all unevictable lists for evictable pages |
| 3374 | * | 3389 | * |
| 3375 | * A really big hammer: scan all zones' unevictable LRU lists to check for | 3390 | * A really big hammer: scan all zones' unevictable LRU lists to check for |
| 3376 | * pages that have become evictable. Move those back to the zones' | 3391 | * pages that have become evictable. Move those back to the zones' |
| 3377 | * inactive list where they become candidates for reclaim. | 3392 | * inactive list where they become candidates for reclaim. |
| 3378 | * This occurs when, e.g., we have unswappable pages on the unevictable lists, | 3393 | * This occurs when, e.g., we have unswappable pages on the unevictable lists, |
| 3379 | * and we add swap to the system. As such, it runs in the context of a task | 3394 | * and we add swap to the system. As such, it runs in the context of a task |
| 3380 | * that has possibly/probably made some previously unevictable pages | 3395 | * that has possibly/probably made some previously unevictable pages |
| 3381 | * evictable. | 3396 | * evictable. |
| 3382 | */ | 3397 | */ |
| 3383 | static void scan_all_zones_unevictable_pages(void) | 3398 | static void scan_all_zones_unevictable_pages(void) |
| 3384 | { | 3399 | { |
| 3385 | struct zone *zone; | 3400 | struct zone *zone; |
| 3386 | 3401 | ||
| 3387 | for_each_zone(zone) { | 3402 | for_each_zone(zone) { |
| 3388 | scan_zone_unevictable_pages(zone); | 3403 | scan_zone_unevictable_pages(zone); |
| 3389 | } | 3404 | } |
| 3390 | } | 3405 | } |
| 3391 | 3406 | ||
| 3392 | /* | 3407 | /* |
| 3393 | * scan_unevictable_pages [vm] sysctl handler. On demand re-scan of | 3408 | * scan_unevictable_pages [vm] sysctl handler. On demand re-scan of |
| 3394 | * all nodes' unevictable lists for evictable pages | 3409 | * all nodes' unevictable lists for evictable pages |
| 3395 | */ | 3410 | */ |
| 3396 | unsigned long scan_unevictable_pages; | 3411 | unsigned long scan_unevictable_pages; |
| 3397 | 3412 | ||
| 3398 | int scan_unevictable_handler(struct ctl_table *table, int write, | 3413 | int scan_unevictable_handler(struct ctl_table *table, int write, |
| 3399 | void __user *buffer, | 3414 | void __user *buffer, |
| 3400 | size_t *length, loff_t *ppos) | 3415 | size_t *length, loff_t *ppos) |
| 3401 | { | 3416 | { |
| 3402 | proc_doulongvec_minmax(table, write, buffer, length, ppos); | 3417 | proc_doulongvec_minmax(table, write, buffer, length, ppos); |
| 3403 | 3418 | ||
| 3404 | if (write && *(unsigned long *)table->data) | 3419 | if (write && *(unsigned long *)table->data) |
| 3405 | scan_all_zones_unevictable_pages(); | 3420 | scan_all_zones_unevictable_pages(); |
| 3406 | 3421 | ||
| 3407 | scan_unevictable_pages = 0; | 3422 | scan_unevictable_pages = 0; |
| 3408 | return 0; | 3423 | return 0; |
| 3409 | } | 3424 | } |
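The handler only calls scan_all_zones_unevictable_pages() when the write lands a non-zero value, and it resets the knob to zero afterwards. A hypothetical userspace trigger, assuming the conventional /proc/sys/vm/scan_unevictable_pages path for this sysctl:

/* Hypothetical trigger for the sysctl handler above; path assumed. */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/vm/scan_unevictable_pages", "w");

	if (!f) {
		perror("scan_unevictable_pages");
		return 1;
	}
	fputs("1\n", f);	/* any non-zero value triggers a full rescan */
	fclose(f);
	return 0;
}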
| 3410 | 3425 | ||
| 3411 | #ifdef CONFIG_NUMA | 3426 | #ifdef CONFIG_NUMA |
| 3412 | /* | 3427 | /* |
| 3413 | * per node 'scan_unevictable_pages' attribute. On demand re-scan of | 3428 | * per node 'scan_unevictable_pages' attribute. On demand re-scan of |
| 3414 | * a specified node's per zone unevictable lists for evictable pages. | 3429 | * a specified node's per zone unevictable lists for evictable pages. |
| 3415 | */ | 3430 | */ |
| 3416 | 3431 | ||
| 3417 | static ssize_t read_scan_unevictable_node(struct sys_device *dev, | 3432 | static ssize_t read_scan_unevictable_node(struct sys_device *dev, |
| 3418 | struct sysdev_attribute *attr, | 3433 | struct sysdev_attribute *attr, |
| 3419 | char *buf) | 3434 | char *buf) |
| 3420 | { | 3435 | { |
| 3421 | return sprintf(buf, "0\n"); /* always zero; should fit... */ | 3436 | return sprintf(buf, "0\n"); /* always zero; should fit... */ |
| 3422 | } | 3437 | } |
| 3423 | 3438 | ||
| 3424 | static ssize_t write_scan_unevictable_node(struct sys_device *dev, | 3439 | static ssize_t write_scan_unevictable_node(struct sys_device *dev, |
| 3425 | struct sysdev_attribute *attr, | 3440 | struct sysdev_attribute *attr, |
| 3426 | const char *buf, size_t count) | 3441 | const char *buf, size_t count) |
| 3427 | { | 3442 | { |
| 3428 | struct zone *node_zones = NODE_DATA(dev->id)->node_zones; | 3443 | struct zone *node_zones = NODE_DATA(dev->id)->node_zones; |
| 3429 | struct zone *zone; | 3444 | struct zone *zone; |
| 3430 | unsigned long res; | 3445 | unsigned long res; |
| 3431 | unsigned long req = strict_strtoul(buf, 10, &res); | 3446 | unsigned long req = strict_strtoul(buf, 10, &res); |
| 3432 | 3447 | ||
| 3433 | if (!req) | 3448 | if (!req) |
| 3434 | return 1; /* zero is no-op */ | 3449 | return 1; /* zero is no-op */ |
| 3435 | 3450 | ||
| 3436 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { | 3451 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { |
| 3437 | if (!populated_zone(zone)) | 3452 | if (!populated_zone(zone)) |
| 3438 | continue; | 3453 | continue; |
| 3439 | scan_zone_unevictable_pages(zone); | 3454 | scan_zone_unevictable_pages(zone); |
| 3440 | } | 3455 | } |
| 3441 | return 1; | 3456 | return 1; |
| 3442 | } | 3457 | } |
| 3443 | 3458 | ||
| 3444 | 3459 | ||
| 3445 | static SYSDEV_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR, | 3460 | static SYSDEV_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR, |
| 3446 | read_scan_unevictable_node, | 3461 | read_scan_unevictable_node, |
| 3447 | write_scan_unevictable_node); | 3462 | write_scan_unevictable_node); |
| 3448 | 3463 | ||
| 3449 | int scan_unevictable_register_node(struct node *node) | 3464 | int scan_unevictable_register_node(struct node *node) |
| 3450 | { | 3465 | { |
| 3451 | return sysdev_create_file(&node->sysdev, &attr_scan_unevictable_pages); | 3466 | return sysdev_create_file(&node->sysdev, &attr_scan_unevictable_pages); |
| 3452 | } | 3467 | } |
| 3453 | 3468 | ||
| 3454 | void scan_unevictable_unregister_node(struct node *node) | 3469 | void scan_unevictable_unregister_node(struct node *node) |
| 3455 | { | 3470 | { |
| 3456 | sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages); | 3471 | sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages); |
| 3457 | } | 3472 | } |
| 3458 | #endif | 3473 | #endif |
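The per-node attribute registered above is the NUMA-scoped version of the same knob; with sysdev_create_file() it would typically appear as /sys/devices/system/node/<nodeN>/scan_unevictable_pages (path assumed, not shown in this diff), and writing a non-zero value there rescans only that node's populated zones, in the same way as the /proc trigger sketched earlier.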
| 3459 | 3474 | ||