Commit 60105e1248f571aa3b895cd63bef072ed9d90c77

Authored by Minchan Kim
Committed by Linus Torvalds
1 parent 6335b19344

mm/zswap: support multiple swap devices

Cai Liu reported that zbud pool page counting is broken when multiple swap
devices are in use, because it counts only one swap device instead of all of
them, so zswap cannot control writeback properly.  The result is unnecessary
writeback, or no writeback when we really should write back.

IOW, it made zswap behave erratically.

Another problem in zswap:

For example, assume we use two swap devices, A and B, with different
priorities, and that A charged 19% of the pool a long time ago.  Now A is
full, so the VM starts using B, which has recently charged 1%.  That means
zswap's total charge (19% + 1%) hits the default 20% limit.  Then, if the VM
wants to swap out more pages to B, zbud_reclaim_page will evict pages from
B's pool, and this repeats continuously.  It is a complete LRU inversion, and
swap thrashing on B will happen.

This patch makes zswap handle multiple swap devices by creating a single zbud
pool shared by all of them, so all zswap pages across the swap devices stay
in LRU order, which prevents both problems above.
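
For illustration, the structural effect of the change (a minimal sketch
distilled from the diff below, not a complete listing):

	/* Before: each swap type carried its own zbud pool, so every
	 * device had its own LRU and its own page accounting. */
	struct zswap_tree {
		struct rb_root rbroot;
		spinlock_t lock;
		struct zbud_pool *pool;
	};

	/* After: one module-wide pool shared by all swap types, giving
	 * a single LRU and a single zswap_pool_pages count. */
	static struct zbud_pool *zswap_pool;

	struct zswap_tree {
		struct rb_root rbroot;
		spinlock_t lock;
	};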

Signed-off-by: Minchan Kim <minchan@kernel.org>
Reported-by: Cai Liu <cai.liu@samsung.com>
Suggested-by: Weijie Yang <weijie.yang.kh@gmail.com>
Cc: Seth Jennings <sjennings@variantweb.net>
Reviewed-by: Bob Liu <bob.liu@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 1 changed file with 33 additions and 31 deletions

1 /* 1 /*
2 * zswap.c - zswap driver file 2 * zswap.c - zswap driver file
3 * 3 *
4 * zswap is a backend for frontswap that takes pages that are in the process 4 * zswap is a backend for frontswap that takes pages that are in the process
5 * of being swapped out and attempts to compress and store them in a 5 * of being swapped out and attempts to compress and store them in a
6 * RAM-based memory pool. This can result in a significant I/O reduction on 6 * RAM-based memory pool. This can result in a significant I/O reduction on
7 * the swap device and, in the case where decompressing from RAM is faster 7 * the swap device and, in the case where decompressing from RAM is faster
8 * than reading from the swap device, can also improve workload performance. 8 * than reading from the swap device, can also improve workload performance.
9 * 9 *
10 * Copyright (C) 2012 Seth Jennings <sjenning@linux.vnet.ibm.com> 10 * Copyright (C) 2012 Seth Jennings <sjenning@linux.vnet.ibm.com>
11 * 11 *
12 * This program is free software; you can redistribute it and/or 12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License 13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version 2 14 * as published by the Free Software Foundation; either version 2
15 * of the License, or (at your option) any later version. 15 * of the License, or (at your option) any later version.
16 * 16 *
17 * This program is distributed in the hope that it will be useful, 17 * This program is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of 18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details. 20 * GNU General Public License for more details.
21 */ 21 */
22 22
23 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 23 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
24 24
25 #include <linux/module.h> 25 #include <linux/module.h>
26 #include <linux/cpu.h> 26 #include <linux/cpu.h>
27 #include <linux/highmem.h> 27 #include <linux/highmem.h>
28 #include <linux/slab.h> 28 #include <linux/slab.h>
29 #include <linux/spinlock.h> 29 #include <linux/spinlock.h>
30 #include <linux/types.h> 30 #include <linux/types.h>
31 #include <linux/atomic.h> 31 #include <linux/atomic.h>
32 #include <linux/frontswap.h> 32 #include <linux/frontswap.h>
33 #include <linux/rbtree.h> 33 #include <linux/rbtree.h>
34 #include <linux/swap.h> 34 #include <linux/swap.h>
35 #include <linux/crypto.h> 35 #include <linux/crypto.h>
36 #include <linux/mempool.h> 36 #include <linux/mempool.h>
37 #include <linux/zbud.h> 37 #include <linux/zbud.h>
38 38
39 #include <linux/mm_types.h> 39 #include <linux/mm_types.h>
40 #include <linux/page-flags.h> 40 #include <linux/page-flags.h>
41 #include <linux/swapops.h> 41 #include <linux/swapops.h>
42 #include <linux/writeback.h> 42 #include <linux/writeback.h>
43 #include <linux/pagemap.h> 43 #include <linux/pagemap.h>
44 44
45 /********************************* 45 /*********************************
46 * statistics 46 * statistics
47 **********************************/ 47 **********************************/
48 /* Number of memory pages used by the compressed pool */ 48 /* Number of memory pages used by the compressed pool */
49 static u64 zswap_pool_pages; 49 static u64 zswap_pool_pages;
50 /* The number of compressed pages currently stored in zswap */ 50 /* The number of compressed pages currently stored in zswap */
51 static atomic_t zswap_stored_pages = ATOMIC_INIT(0); 51 static atomic_t zswap_stored_pages = ATOMIC_INIT(0);
52 52
53 /* 53 /*
54 * The statistics below are not protected from concurrent access for 54 * The statistics below are not protected from concurrent access for
55 * performance reasons so they may not be a 100% accurate. However, 55 * performance reasons so they may not be a 100% accurate. However,
56 * they do provide useful information on roughly how many times a 56 * they do provide useful information on roughly how many times a
57 * certain event is occurring. 57 * certain event is occurring.
58 */ 58 */
59 59
60 /* Pool limit was hit (see zswap_max_pool_percent) */ 60 /* Pool limit was hit (see zswap_max_pool_percent) */
61 static u64 zswap_pool_limit_hit; 61 static u64 zswap_pool_limit_hit;
62 /* Pages written back when pool limit was reached */ 62 /* Pages written back when pool limit was reached */
63 static u64 zswap_written_back_pages; 63 static u64 zswap_written_back_pages;
64 /* Store failed due to a reclaim failure after pool limit was reached */ 64 /* Store failed due to a reclaim failure after pool limit was reached */
65 static u64 zswap_reject_reclaim_fail; 65 static u64 zswap_reject_reclaim_fail;
66 /* Compressed page was too big for the allocator to (optimally) store */ 66 /* Compressed page was too big for the allocator to (optimally) store */
67 static u64 zswap_reject_compress_poor; 67 static u64 zswap_reject_compress_poor;
68 /* Store failed because underlying allocator could not get memory */ 68 /* Store failed because underlying allocator could not get memory */
69 static u64 zswap_reject_alloc_fail; 69 static u64 zswap_reject_alloc_fail;
70 /* Store failed because the entry metadata could not be allocated (rare) */ 70 /* Store failed because the entry metadata could not be allocated (rare) */
71 static u64 zswap_reject_kmemcache_fail; 71 static u64 zswap_reject_kmemcache_fail;
72 /* Duplicate store was encountered (rare) */ 72 /* Duplicate store was encountered (rare) */
73 static u64 zswap_duplicate_entry; 73 static u64 zswap_duplicate_entry;
74 74
75 /********************************* 75 /*********************************
76 * tunables 76 * tunables
77 **********************************/ 77 **********************************/
78 /* Enable/disable zswap (disabled by default, fixed at boot for now) */ 78 /* Enable/disable zswap (disabled by default, fixed at boot for now) */
79 static bool zswap_enabled __read_mostly; 79 static bool zswap_enabled __read_mostly;
80 module_param_named(enabled, zswap_enabled, bool, 0444); 80 module_param_named(enabled, zswap_enabled, bool, 0444);
81 81
82 /* Compressor to be used by zswap (fixed at boot for now) */ 82 /* Compressor to be used by zswap (fixed at boot for now) */
83 #define ZSWAP_COMPRESSOR_DEFAULT "lzo" 83 #define ZSWAP_COMPRESSOR_DEFAULT "lzo"
84 static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT; 84 static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
85 module_param_named(compressor, zswap_compressor, charp, 0444); 85 module_param_named(compressor, zswap_compressor, charp, 0444);
86 86
87 /* The maximum percentage of memory that the compressed pool can occupy */ 87 /* The maximum percentage of memory that the compressed pool can occupy */
88 static unsigned int zswap_max_pool_percent = 20; 88 static unsigned int zswap_max_pool_percent = 20;
89 module_param_named(max_pool_percent, 89 module_param_named(max_pool_percent,
90 zswap_max_pool_percent, uint, 0644); 90 zswap_max_pool_percent, uint, 0644);
91 91
92 /* zbud_pool is shared by all of zswap backend */
93 static struct zbud_pool *zswap_pool;
94
92 /********************************* 95 /*********************************
93 * compression functions 96 * compression functions
94 **********************************/ 97 **********************************/
95 /* per-cpu compression transforms */ 98 /* per-cpu compression transforms */
96 static struct crypto_comp * __percpu *zswap_comp_pcpu_tfms; 99 static struct crypto_comp * __percpu *zswap_comp_pcpu_tfms;
97 100
98 enum comp_op { 101 enum comp_op {
99 ZSWAP_COMPOP_COMPRESS, 102 ZSWAP_COMPOP_COMPRESS,
100 ZSWAP_COMPOP_DECOMPRESS 103 ZSWAP_COMPOP_DECOMPRESS
101 }; 104 };
102 105
103 static int zswap_comp_op(enum comp_op op, const u8 *src, unsigned int slen, 106 static int zswap_comp_op(enum comp_op op, const u8 *src, unsigned int slen,
104 u8 *dst, unsigned int *dlen) 107 u8 *dst, unsigned int *dlen)
105 { 108 {
106 struct crypto_comp *tfm; 109 struct crypto_comp *tfm;
107 int ret; 110 int ret;
108 111
109 tfm = *per_cpu_ptr(zswap_comp_pcpu_tfms, get_cpu()); 112 tfm = *per_cpu_ptr(zswap_comp_pcpu_tfms, get_cpu());
110 switch (op) { 113 switch (op) {
111 case ZSWAP_COMPOP_COMPRESS: 114 case ZSWAP_COMPOP_COMPRESS:
112 ret = crypto_comp_compress(tfm, src, slen, dst, dlen); 115 ret = crypto_comp_compress(tfm, src, slen, dst, dlen);
113 break; 116 break;
114 case ZSWAP_COMPOP_DECOMPRESS: 117 case ZSWAP_COMPOP_DECOMPRESS:
115 ret = crypto_comp_decompress(tfm, src, slen, dst, dlen); 118 ret = crypto_comp_decompress(tfm, src, slen, dst, dlen);
116 break; 119 break;
117 default: 120 default:
118 ret = -EINVAL; 121 ret = -EINVAL;
119 } 122 }
120 123
121 put_cpu(); 124 put_cpu();
122 return ret; 125 return ret;
123 } 126 }
124 127
125 static int __init zswap_comp_init(void) 128 static int __init zswap_comp_init(void)
126 { 129 {
127 if (!crypto_has_comp(zswap_compressor, 0, 0)) { 130 if (!crypto_has_comp(zswap_compressor, 0, 0)) {
128 pr_info("%s compressor not available\n", zswap_compressor); 131 pr_info("%s compressor not available\n", zswap_compressor);
129 /* fall back to default compressor */ 132 /* fall back to default compressor */
130 zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT; 133 zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
131 if (!crypto_has_comp(zswap_compressor, 0, 0)) 134 if (!crypto_has_comp(zswap_compressor, 0, 0))
132 /* can't even load the default compressor */ 135 /* can't even load the default compressor */
133 return -ENODEV; 136 return -ENODEV;
134 } 137 }
135 pr_info("using %s compressor\n", zswap_compressor); 138 pr_info("using %s compressor\n", zswap_compressor);
136 139
137 /* alloc percpu transforms */ 140 /* alloc percpu transforms */
138 zswap_comp_pcpu_tfms = alloc_percpu(struct crypto_comp *); 141 zswap_comp_pcpu_tfms = alloc_percpu(struct crypto_comp *);
139 if (!zswap_comp_pcpu_tfms) 142 if (!zswap_comp_pcpu_tfms)
140 return -ENOMEM; 143 return -ENOMEM;
141 return 0; 144 return 0;
142 } 145 }
143 146
144 static void zswap_comp_exit(void) 147 static void zswap_comp_exit(void)
145 { 148 {
146 /* free percpu transforms */ 149 /* free percpu transforms */
147 if (zswap_comp_pcpu_tfms) 150 if (zswap_comp_pcpu_tfms)
148 free_percpu(zswap_comp_pcpu_tfms); 151 free_percpu(zswap_comp_pcpu_tfms);
149 } 152 }
150 153
151 /********************************* 154 /*********************************
152 * data structures 155 * data structures
153 **********************************/ 156 **********************************/
154 /* 157 /*
155 * struct zswap_entry 158 * struct zswap_entry
156 * 159 *
157 * This structure contains the metadata for tracking a single compressed 160 * This structure contains the metadata for tracking a single compressed
158 * page within zswap. 161 * page within zswap.
159 * 162 *
160 * rbnode - links the entry into red-black tree for the appropriate swap type 163 * rbnode - links the entry into red-black tree for the appropriate swap type
161 * refcount - the number of outstanding reference to the entry. This is needed 164 * refcount - the number of outstanding reference to the entry. This is needed
162 * to protect against premature freeing of the entry by code 165 * to protect against premature freeing of the entry by code
163 * concurrent calls to load, invalidate, and writeback. The lock 166 * concurrent calls to load, invalidate, and writeback. The lock
164 * for the zswap_tree structure that contains the entry must 167 * for the zswap_tree structure that contains the entry must
165 * be held while changing the refcount. Since the lock must 168 * be held while changing the refcount. Since the lock must
166 * be held, there is no reason to also make refcount atomic. 169 * be held, there is no reason to also make refcount atomic.
167 * offset - the swap offset for the entry. Index into the red-black tree. 170 * offset - the swap offset for the entry. Index into the red-black tree.
168 * handle - zbud allocation handle that stores the compressed page data 171 * handle - zbud allocation handle that stores the compressed page data
169 * length - the length in bytes of the compressed page data. Needed during 172 * length - the length in bytes of the compressed page data. Needed during
170 * decompression 173 * decompression
171 */ 174 */
172 struct zswap_entry { 175 struct zswap_entry {
173 struct rb_node rbnode; 176 struct rb_node rbnode;
174 pgoff_t offset; 177 pgoff_t offset;
175 int refcount; 178 int refcount;
176 unsigned int length; 179 unsigned int length;
177 unsigned long handle; 180 unsigned long handle;
178 }; 181 };
179 182
180 struct zswap_header { 183 struct zswap_header {
181 swp_entry_t swpentry; 184 swp_entry_t swpentry;
182 }; 185 };
183 186
184 /* 187 /*
185 * The tree lock in the zswap_tree struct protects a few things: 188 * The tree lock in the zswap_tree struct protects a few things:
186 * - the rbtree 189 * - the rbtree
187 * - the refcount field of each entry in the tree 190 * - the refcount field of each entry in the tree
188 */ 191 */
189 struct zswap_tree { 192 struct zswap_tree {
190 struct rb_root rbroot; 193 struct rb_root rbroot;
191 spinlock_t lock; 194 spinlock_t lock;
192 struct zbud_pool *pool;
193 }; 195 };
194 196
195 static struct zswap_tree *zswap_trees[MAX_SWAPFILES]; 197 static struct zswap_tree *zswap_trees[MAX_SWAPFILES];
196 198
197 /********************************* 199 /*********************************
198 * zswap entry functions 200 * zswap entry functions
199 **********************************/ 201 **********************************/
200 static struct kmem_cache *zswap_entry_cache; 202 static struct kmem_cache *zswap_entry_cache;
201 203
202 static int zswap_entry_cache_create(void) 204 static int zswap_entry_cache_create(void)
203 { 205 {
204 zswap_entry_cache = KMEM_CACHE(zswap_entry, 0); 206 zswap_entry_cache = KMEM_CACHE(zswap_entry, 0);
205 return (zswap_entry_cache == NULL); 207 return (zswap_entry_cache == NULL);
206 } 208 }
207 209
208 static void zswap_entry_cache_destory(void) 210 static void zswap_entry_cache_destory(void)
209 { 211 {
210 kmem_cache_destroy(zswap_entry_cache); 212 kmem_cache_destroy(zswap_entry_cache);
211 } 213 }
212 214
213 static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp) 215 static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp)
214 { 216 {
215 struct zswap_entry *entry; 217 struct zswap_entry *entry;
216 entry = kmem_cache_alloc(zswap_entry_cache, gfp); 218 entry = kmem_cache_alloc(zswap_entry_cache, gfp);
217 if (!entry) 219 if (!entry)
218 return NULL; 220 return NULL;
219 entry->refcount = 1; 221 entry->refcount = 1;
220 RB_CLEAR_NODE(&entry->rbnode); 222 RB_CLEAR_NODE(&entry->rbnode);
221 return entry; 223 return entry;
222 } 224 }
223 225
224 static void zswap_entry_cache_free(struct zswap_entry *entry) 226 static void zswap_entry_cache_free(struct zswap_entry *entry)
225 { 227 {
226 kmem_cache_free(zswap_entry_cache, entry); 228 kmem_cache_free(zswap_entry_cache, entry);
227 } 229 }
228 230
229 /********************************* 231 /*********************************
230 * rbtree functions 232 * rbtree functions
231 **********************************/ 233 **********************************/
232 static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset) 234 static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset)
233 { 235 {
234 struct rb_node *node = root->rb_node; 236 struct rb_node *node = root->rb_node;
235 struct zswap_entry *entry; 237 struct zswap_entry *entry;
236 238
237 while (node) { 239 while (node) {
238 entry = rb_entry(node, struct zswap_entry, rbnode); 240 entry = rb_entry(node, struct zswap_entry, rbnode);
239 if (entry->offset > offset) 241 if (entry->offset > offset)
240 node = node->rb_left; 242 node = node->rb_left;
241 else if (entry->offset < offset) 243 else if (entry->offset < offset)
242 node = node->rb_right; 244 node = node->rb_right;
243 else 245 else
244 return entry; 246 return entry;
245 } 247 }
246 return NULL; 248 return NULL;
247 } 249 }
248 250
249 /* 251 /*
250 * In the case that a entry with the same offset is found, a pointer to 252 * In the case that a entry with the same offset is found, a pointer to
251 * the existing entry is stored in dupentry and the function returns -EEXIST 253 * the existing entry is stored in dupentry and the function returns -EEXIST
252 */ 254 */
253 static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry, 255 static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry,
254 struct zswap_entry **dupentry) 256 struct zswap_entry **dupentry)
255 { 257 {
256 struct rb_node **link = &root->rb_node, *parent = NULL; 258 struct rb_node **link = &root->rb_node, *parent = NULL;
257 struct zswap_entry *myentry; 259 struct zswap_entry *myentry;
258 260
259 while (*link) { 261 while (*link) {
260 parent = *link; 262 parent = *link;
261 myentry = rb_entry(parent, struct zswap_entry, rbnode); 263 myentry = rb_entry(parent, struct zswap_entry, rbnode);
262 if (myentry->offset > entry->offset) 264 if (myentry->offset > entry->offset)
263 link = &(*link)->rb_left; 265 link = &(*link)->rb_left;
264 else if (myentry->offset < entry->offset) 266 else if (myentry->offset < entry->offset)
265 link = &(*link)->rb_right; 267 link = &(*link)->rb_right;
266 else { 268 else {
267 *dupentry = myentry; 269 *dupentry = myentry;
268 return -EEXIST; 270 return -EEXIST;
269 } 271 }
270 } 272 }
271 rb_link_node(&entry->rbnode, parent, link); 273 rb_link_node(&entry->rbnode, parent, link);
272 rb_insert_color(&entry->rbnode, root); 274 rb_insert_color(&entry->rbnode, root);
273 return 0; 275 return 0;
274 } 276 }
275 277
276 static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry) 278 static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
277 { 279 {
278 if (!RB_EMPTY_NODE(&entry->rbnode)) { 280 if (!RB_EMPTY_NODE(&entry->rbnode)) {
279 rb_erase(&entry->rbnode, root); 281 rb_erase(&entry->rbnode, root);
280 RB_CLEAR_NODE(&entry->rbnode); 282 RB_CLEAR_NODE(&entry->rbnode);
281 } 283 }
282 } 284 }
283 285
284 /* 286 /*
285 * Carries out the common pattern of freeing and entry's zbud allocation, 287 * Carries out the common pattern of freeing and entry's zbud allocation,
286 * freeing the entry itself, and decrementing the number of stored pages. 288 * freeing the entry itself, and decrementing the number of stored pages.
287 */ 289 */
288 static void zswap_free_entry(struct zswap_tree *tree, 290 static void zswap_free_entry(struct zswap_entry *entry)
289 struct zswap_entry *entry)
290 { 291 {
291 zbud_free(tree->pool, entry->handle); 292 zbud_free(zswap_pool, entry->handle);
292 zswap_entry_cache_free(entry); 293 zswap_entry_cache_free(entry);
293 atomic_dec(&zswap_stored_pages); 294 atomic_dec(&zswap_stored_pages);
294 zswap_pool_pages = zbud_get_pool_size(tree->pool); 295 zswap_pool_pages = zbud_get_pool_size(zswap_pool);
295 } 296 }
296 297
297 /* caller must hold the tree lock */ 298 /* caller must hold the tree lock */
298 static void zswap_entry_get(struct zswap_entry *entry) 299 static void zswap_entry_get(struct zswap_entry *entry)
299 { 300 {
300 entry->refcount++; 301 entry->refcount++;
301 } 302 }
302 303
303 /* caller must hold the tree lock 304 /* caller must hold the tree lock
304 * remove from the tree and free it, if nobody reference the entry 305 * remove from the tree and free it, if nobody reference the entry
305 */ 306 */
306 static void zswap_entry_put(struct zswap_tree *tree, 307 static void zswap_entry_put(struct zswap_tree *tree,
307 struct zswap_entry *entry) 308 struct zswap_entry *entry)
308 { 309 {
309 int refcount = --entry->refcount; 310 int refcount = --entry->refcount;
310 311
311 BUG_ON(refcount < 0); 312 BUG_ON(refcount < 0);
312 if (refcount == 0) { 313 if (refcount == 0) {
313 zswap_rb_erase(&tree->rbroot, entry); 314 zswap_rb_erase(&tree->rbroot, entry);
314 zswap_free_entry(tree, entry); 315 zswap_free_entry(entry);
315 } 316 }
316 } 317 }
317 318
318 /* caller must hold the tree lock */ 319 /* caller must hold the tree lock */
319 static struct zswap_entry *zswap_entry_find_get(struct rb_root *root, 320 static struct zswap_entry *zswap_entry_find_get(struct rb_root *root,
320 pgoff_t offset) 321 pgoff_t offset)
321 { 322 {
322 struct zswap_entry *entry = NULL; 323 struct zswap_entry *entry = NULL;
323 324
324 entry = zswap_rb_search(root, offset); 325 entry = zswap_rb_search(root, offset);
325 if (entry) 326 if (entry)
326 zswap_entry_get(entry); 327 zswap_entry_get(entry);
327 328
328 return entry; 329 return entry;
329 } 330 }
330 331
331 /********************************* 332 /*********************************
332 * per-cpu code 333 * per-cpu code
333 **********************************/ 334 **********************************/
334 static DEFINE_PER_CPU(u8 *, zswap_dstmem); 335 static DEFINE_PER_CPU(u8 *, zswap_dstmem);
335 336
336 static int __zswap_cpu_notifier(unsigned long action, unsigned long cpu) 337 static int __zswap_cpu_notifier(unsigned long action, unsigned long cpu)
337 { 338 {
338 struct crypto_comp *tfm; 339 struct crypto_comp *tfm;
339 u8 *dst; 340 u8 *dst;
340 341
341 switch (action) { 342 switch (action) {
342 case CPU_UP_PREPARE: 343 case CPU_UP_PREPARE:
343 tfm = crypto_alloc_comp(zswap_compressor, 0, 0); 344 tfm = crypto_alloc_comp(zswap_compressor, 0, 0);
344 if (IS_ERR(tfm)) { 345 if (IS_ERR(tfm)) {
345 pr_err("can't allocate compressor transform\n"); 346 pr_err("can't allocate compressor transform\n");
346 return NOTIFY_BAD; 347 return NOTIFY_BAD;
347 } 348 }
348 *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = tfm; 349 *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = tfm;
349 dst = kmalloc(PAGE_SIZE * 2, GFP_KERNEL); 350 dst = kmalloc(PAGE_SIZE * 2, GFP_KERNEL);
350 if (!dst) { 351 if (!dst) {
351 pr_err("can't allocate compressor buffer\n"); 352 pr_err("can't allocate compressor buffer\n");
352 crypto_free_comp(tfm); 353 crypto_free_comp(tfm);
353 *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = NULL; 354 *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = NULL;
354 return NOTIFY_BAD; 355 return NOTIFY_BAD;
355 } 356 }
356 per_cpu(zswap_dstmem, cpu) = dst; 357 per_cpu(zswap_dstmem, cpu) = dst;
357 break; 358 break;
358 case CPU_DEAD: 359 case CPU_DEAD:
359 case CPU_UP_CANCELED: 360 case CPU_UP_CANCELED:
360 tfm = *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu); 361 tfm = *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu);
361 if (tfm) { 362 if (tfm) {
362 crypto_free_comp(tfm); 363 crypto_free_comp(tfm);
363 *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = NULL; 364 *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = NULL;
364 } 365 }
365 dst = per_cpu(zswap_dstmem, cpu); 366 dst = per_cpu(zswap_dstmem, cpu);
366 kfree(dst); 367 kfree(dst);
367 per_cpu(zswap_dstmem, cpu) = NULL; 368 per_cpu(zswap_dstmem, cpu) = NULL;
368 break; 369 break;
369 default: 370 default:
370 break; 371 break;
371 } 372 }
372 return NOTIFY_OK; 373 return NOTIFY_OK;
373 } 374 }
374 375
375 static int zswap_cpu_notifier(struct notifier_block *nb, 376 static int zswap_cpu_notifier(struct notifier_block *nb,
376 unsigned long action, void *pcpu) 377 unsigned long action, void *pcpu)
377 { 378 {
378 unsigned long cpu = (unsigned long)pcpu; 379 unsigned long cpu = (unsigned long)pcpu;
379 return __zswap_cpu_notifier(action, cpu); 380 return __zswap_cpu_notifier(action, cpu);
380 } 381 }
381 382
382 static struct notifier_block zswap_cpu_notifier_block = { 383 static struct notifier_block zswap_cpu_notifier_block = {
383 .notifier_call = zswap_cpu_notifier 384 .notifier_call = zswap_cpu_notifier
384 }; 385 };
385 386
386 static int zswap_cpu_init(void) 387 static int zswap_cpu_init(void)
387 { 388 {
388 unsigned long cpu; 389 unsigned long cpu;
389 390
390 get_online_cpus(); 391 get_online_cpus();
391 for_each_online_cpu(cpu) 392 for_each_online_cpu(cpu)
392 if (__zswap_cpu_notifier(CPU_UP_PREPARE, cpu) != NOTIFY_OK) 393 if (__zswap_cpu_notifier(CPU_UP_PREPARE, cpu) != NOTIFY_OK)
393 goto cleanup; 394 goto cleanup;
394 register_cpu_notifier(&zswap_cpu_notifier_block); 395 register_cpu_notifier(&zswap_cpu_notifier_block);
395 put_online_cpus(); 396 put_online_cpus();
396 return 0; 397 return 0;
397 398
398 cleanup: 399 cleanup:
399 for_each_online_cpu(cpu) 400 for_each_online_cpu(cpu)
400 __zswap_cpu_notifier(CPU_UP_CANCELED, cpu); 401 __zswap_cpu_notifier(CPU_UP_CANCELED, cpu);
401 put_online_cpus(); 402 put_online_cpus();
402 return -ENOMEM; 403 return -ENOMEM;
403 } 404 }
404 405
405 /********************************* 406 /*********************************
406 * helpers 407 * helpers
407 **********************************/ 408 **********************************/
408 static bool zswap_is_full(void) 409 static bool zswap_is_full(void)
409 { 410 {
410 return (totalram_pages * zswap_max_pool_percent / 100 < 411 return (totalram_pages * zswap_max_pool_percent / 100 <
411 zswap_pool_pages); 412 zswap_pool_pages);
412 } 413 }
413 414
414 /********************************* 415 /*********************************
415 * writeback code 416 * writeback code
416 **********************************/ 417 **********************************/
417 /* return enum for zswap_get_swap_cache_page */ 418 /* return enum for zswap_get_swap_cache_page */
418 enum zswap_get_swap_ret { 419 enum zswap_get_swap_ret {
419 ZSWAP_SWAPCACHE_NEW, 420 ZSWAP_SWAPCACHE_NEW,
420 ZSWAP_SWAPCACHE_EXIST, 421 ZSWAP_SWAPCACHE_EXIST,
421 ZSWAP_SWAPCACHE_FAIL, 422 ZSWAP_SWAPCACHE_FAIL,
422 }; 423 };
423 424
424 /* 425 /*
425 * zswap_get_swap_cache_page 426 * zswap_get_swap_cache_page
426 * 427 *
427 * This is an adaption of read_swap_cache_async() 428 * This is an adaption of read_swap_cache_async()
428 * 429 *
429 * This function tries to find a page with the given swap entry 430 * This function tries to find a page with the given swap entry
430 * in the swapper_space address space (the swap cache). If the page 431 * in the swapper_space address space (the swap cache). If the page
431 * is found, it is returned in retpage. Otherwise, a page is allocated, 432 * is found, it is returned in retpage. Otherwise, a page is allocated,
432 * added to the swap cache, and returned in retpage. 433 * added to the swap cache, and returned in retpage.
433 * 434 *
434 * If success, the swap cache page is returned in retpage 435 * If success, the swap cache page is returned in retpage
435 * Returns ZSWAP_SWAPCACHE_EXIST if page was already in the swap cache 436 * Returns ZSWAP_SWAPCACHE_EXIST if page was already in the swap cache
436 * Returns ZSWAP_SWAPCACHE_NEW if the new page needs to be populated, 437 * Returns ZSWAP_SWAPCACHE_NEW if the new page needs to be populated,
437 * the new page is added to swapcache and locked 438 * the new page is added to swapcache and locked
438 * Returns ZSWAP_SWAPCACHE_FAIL on error 439 * Returns ZSWAP_SWAPCACHE_FAIL on error
439 */ 440 */
440 static int zswap_get_swap_cache_page(swp_entry_t entry, 441 static int zswap_get_swap_cache_page(swp_entry_t entry,
441 struct page **retpage) 442 struct page **retpage)
442 { 443 {
443 struct page *found_page, *new_page = NULL; 444 struct page *found_page, *new_page = NULL;
444 struct address_space *swapper_space = swap_address_space(entry); 445 struct address_space *swapper_space = swap_address_space(entry);
445 int err; 446 int err;
446 447
447 *retpage = NULL; 448 *retpage = NULL;
448 do { 449 do {
449 /* 450 /*
450 * First check the swap cache. Since this is normally 451 * First check the swap cache. Since this is normally
451 * called after lookup_swap_cache() failed, re-calling 452 * called after lookup_swap_cache() failed, re-calling
452 * that would confuse statistics. 453 * that would confuse statistics.
453 */ 454 */
454 found_page = find_get_page(swapper_space, entry.val); 455 found_page = find_get_page(swapper_space, entry.val);
455 if (found_page) 456 if (found_page)
456 break; 457 break;
457 458
458 /* 459 /*
459 * Get a new page to read into from swap. 460 * Get a new page to read into from swap.
460 */ 461 */
461 if (!new_page) { 462 if (!new_page) {
462 new_page = alloc_page(GFP_KERNEL); 463 new_page = alloc_page(GFP_KERNEL);
463 if (!new_page) 464 if (!new_page)
464 break; /* Out of memory */ 465 break; /* Out of memory */
465 } 466 }
466 467
467 /* 468 /*
468 * call radix_tree_preload() while we can wait. 469 * call radix_tree_preload() while we can wait.
469 */ 470 */
470 err = radix_tree_preload(GFP_KERNEL); 471 err = radix_tree_preload(GFP_KERNEL);
471 if (err) 472 if (err)
472 break; 473 break;
473 474
474 /* 475 /*
475 * Swap entry may have been freed since our caller observed it. 476 * Swap entry may have been freed since our caller observed it.
476 */ 477 */
477 err = swapcache_prepare(entry); 478 err = swapcache_prepare(entry);
478 if (err == -EEXIST) { /* seems racy */ 479 if (err == -EEXIST) { /* seems racy */
479 radix_tree_preload_end(); 480 radix_tree_preload_end();
480 continue; 481 continue;
481 } 482 }
482 if (err) { /* swp entry is obsolete ? */ 483 if (err) { /* swp entry is obsolete ? */
483 radix_tree_preload_end(); 484 radix_tree_preload_end();
484 break; 485 break;
485 } 486 }
486 487
487 /* May fail (-ENOMEM) if radix-tree node allocation failed. */ 488 /* May fail (-ENOMEM) if radix-tree node allocation failed. */
488 __set_page_locked(new_page); 489 __set_page_locked(new_page);
489 SetPageSwapBacked(new_page); 490 SetPageSwapBacked(new_page);
490 err = __add_to_swap_cache(new_page, entry); 491 err = __add_to_swap_cache(new_page, entry);
491 if (likely(!err)) { 492 if (likely(!err)) {
492 radix_tree_preload_end(); 493 radix_tree_preload_end();
493 lru_cache_add_anon(new_page); 494 lru_cache_add_anon(new_page);
494 *retpage = new_page; 495 *retpage = new_page;
495 return ZSWAP_SWAPCACHE_NEW; 496 return ZSWAP_SWAPCACHE_NEW;
496 } 497 }
497 radix_tree_preload_end(); 498 radix_tree_preload_end();
498 ClearPageSwapBacked(new_page); 499 ClearPageSwapBacked(new_page);
499 __clear_page_locked(new_page); 500 __clear_page_locked(new_page);
500 /* 501 /*
501 * add_to_swap_cache() doesn't return -EEXIST, so we can safely 502 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
502 * clear SWAP_HAS_CACHE flag. 503 * clear SWAP_HAS_CACHE flag.
503 */ 504 */
504 swapcache_free(entry, NULL); 505 swapcache_free(entry, NULL);
505 } while (err != -ENOMEM); 506 } while (err != -ENOMEM);
506 507
507 if (new_page) 508 if (new_page)
508 page_cache_release(new_page); 509 page_cache_release(new_page);
509 if (!found_page) 510 if (!found_page)
510 return ZSWAP_SWAPCACHE_FAIL; 511 return ZSWAP_SWAPCACHE_FAIL;
511 *retpage = found_page; 512 *retpage = found_page;
512 return ZSWAP_SWAPCACHE_EXIST; 513 return ZSWAP_SWAPCACHE_EXIST;
513 } 514 }
514 515
515 /* 516 /*
516 * Attempts to free an entry by adding a page to the swap cache, 517 * Attempts to free an entry by adding a page to the swap cache,
517 * decompressing the entry data into the page, and issuing a 518 * decompressing the entry data into the page, and issuing a
518 * bio write to write the page back to the swap device. 519 * bio write to write the page back to the swap device.
519 * 520 *
520 * This can be thought of as a "resumed writeback" of the page 521 * This can be thought of as a "resumed writeback" of the page
521 * to the swap device. We are basically resuming the same swap 522 * to the swap device. We are basically resuming the same swap
522 * writeback path that was intercepted with the frontswap_store() 523 * writeback path that was intercepted with the frontswap_store()
523 * in the first place. After the page has been decompressed into 524 * in the first place. After the page has been decompressed into
524 * the swap cache, the compressed version stored by zswap can be 525 * the swap cache, the compressed version stored by zswap can be
525 * freed. 526 * freed.
526 */ 527 */
527 static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle) 528 static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle)
528 { 529 {
529 struct zswap_header *zhdr; 530 struct zswap_header *zhdr;
530 swp_entry_t swpentry; 531 swp_entry_t swpentry;
531 struct zswap_tree *tree; 532 struct zswap_tree *tree;
532 pgoff_t offset; 533 pgoff_t offset;
533 struct zswap_entry *entry; 534 struct zswap_entry *entry;
534 struct page *page; 535 struct page *page;
535 u8 *src, *dst; 536 u8 *src, *dst;
536 unsigned int dlen; 537 unsigned int dlen;
537 int ret; 538 int ret;
538 struct writeback_control wbc = { 539 struct writeback_control wbc = {
539 .sync_mode = WB_SYNC_NONE, 540 .sync_mode = WB_SYNC_NONE,
540 }; 541 };
541 542
542 /* extract swpentry from data */ 543 /* extract swpentry from data */
543 zhdr = zbud_map(pool, handle); 544 zhdr = zbud_map(pool, handle);
544 swpentry = zhdr->swpentry; /* here */ 545 swpentry = zhdr->swpentry; /* here */
545 zbud_unmap(pool, handle); 546 zbud_unmap(pool, handle);
546 tree = zswap_trees[swp_type(swpentry)]; 547 tree = zswap_trees[swp_type(swpentry)];
547 offset = swp_offset(swpentry); 548 offset = swp_offset(swpentry);
548 BUG_ON(pool != tree->pool);
549 549
550 /* find and ref zswap entry */ 550 /* find and ref zswap entry */
551 spin_lock(&tree->lock); 551 spin_lock(&tree->lock);
552 entry = zswap_entry_find_get(&tree->rbroot, offset); 552 entry = zswap_entry_find_get(&tree->rbroot, offset);
553 if (!entry) { 553 if (!entry) {
554 /* entry was invalidated */ 554 /* entry was invalidated */
555 spin_unlock(&tree->lock); 555 spin_unlock(&tree->lock);
556 return 0; 556 return 0;
557 } 557 }
558 spin_unlock(&tree->lock); 558 spin_unlock(&tree->lock);
559 BUG_ON(offset != entry->offset); 559 BUG_ON(offset != entry->offset);
560 560
561 /* try to allocate swap cache page */ 561 /* try to allocate swap cache page */
562 switch (zswap_get_swap_cache_page(swpentry, &page)) { 562 switch (zswap_get_swap_cache_page(swpentry, &page)) {
563 case ZSWAP_SWAPCACHE_FAIL: /* no memory or invalidate happened */ 563 case ZSWAP_SWAPCACHE_FAIL: /* no memory or invalidate happened */
564 ret = -ENOMEM; 564 ret = -ENOMEM;
565 goto fail; 565 goto fail;
566 566
567 case ZSWAP_SWAPCACHE_EXIST: 567 case ZSWAP_SWAPCACHE_EXIST:
568 /* page is already in the swap cache, ignore for now */ 568 /* page is already in the swap cache, ignore for now */
569 page_cache_release(page); 569 page_cache_release(page);
570 ret = -EEXIST; 570 ret = -EEXIST;
571 goto fail; 571 goto fail;
572 572
573 case ZSWAP_SWAPCACHE_NEW: /* page is locked */ 573 case ZSWAP_SWAPCACHE_NEW: /* page is locked */
574 /* decompress */ 574 /* decompress */
575 dlen = PAGE_SIZE; 575 dlen = PAGE_SIZE;
576 src = (u8 *)zbud_map(tree->pool, entry->handle) + 576 src = (u8 *)zbud_map(zswap_pool, entry->handle) +
577 sizeof(struct zswap_header); 577 sizeof(struct zswap_header);
578 dst = kmap_atomic(page); 578 dst = kmap_atomic(page);
579 ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, 579 ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src,
580 entry->length, dst, &dlen); 580 entry->length, dst, &dlen);
581 kunmap_atomic(dst); 581 kunmap_atomic(dst);
582 zbud_unmap(tree->pool, entry->handle); 582 zbud_unmap(zswap_pool, entry->handle);
583 BUG_ON(ret); 583 BUG_ON(ret);
584 BUG_ON(dlen != PAGE_SIZE); 584 BUG_ON(dlen != PAGE_SIZE);
585 585
586 /* page is up to date */ 586 /* page is up to date */
587 SetPageUptodate(page); 587 SetPageUptodate(page);
588 } 588 }
589 589
590 /* move it to the tail of the inactive list after end_writeback */ 590 /* move it to the tail of the inactive list after end_writeback */
591 SetPageReclaim(page); 591 SetPageReclaim(page);
592 592
593 /* start writeback */ 593 /* start writeback */
594 __swap_writepage(page, &wbc, end_swap_bio_write); 594 __swap_writepage(page, &wbc, end_swap_bio_write);
595 page_cache_release(page); 595 page_cache_release(page);
596 zswap_written_back_pages++; 596 zswap_written_back_pages++;
597 597
598 spin_lock(&tree->lock); 598 spin_lock(&tree->lock);
599 /* drop local reference */ 599 /* drop local reference */
600 zswap_entry_put(tree, entry); 600 zswap_entry_put(tree, entry);
601 601
602 /* 602 /*
603 * There are two possible situations for entry here: 603 * There are two possible situations for entry here:
604 * (1) refcount is 1(normal case), entry is valid and on the tree 604 * (1) refcount is 1(normal case), entry is valid and on the tree
605 * (2) refcount is 0, entry is freed and not on the tree 605 * (2) refcount is 0, entry is freed and not on the tree
606 * because invalidate happened during writeback 606 * because invalidate happened during writeback
607 * search the tree and free the entry if find entry 607 * search the tree and free the entry if find entry
608 */ 608 */
609 if (entry == zswap_rb_search(&tree->rbroot, offset)) 609 if (entry == zswap_rb_search(&tree->rbroot, offset))
610 zswap_entry_put(tree, entry); 610 zswap_entry_put(tree, entry);
611 spin_unlock(&tree->lock); 611 spin_unlock(&tree->lock);
612 612
613 goto end; 613 goto end;
614 614
615 /* 615 /*
616 * if we get here due to ZSWAP_SWAPCACHE_EXIST 616 * if we get here due to ZSWAP_SWAPCACHE_EXIST
617 * a load may happening concurrently 617 * a load may happening concurrently
618 * it is safe and okay to not free the entry 618 * it is safe and okay to not free the entry
619 * if we free the entry in the following put 619 * if we free the entry in the following put
620 * it it either okay to return !0 620 * it it either okay to return !0
621 */ 621 */
622 fail: 622 fail:
623 spin_lock(&tree->lock); 623 spin_lock(&tree->lock);
624 zswap_entry_put(tree, entry); 624 zswap_entry_put(tree, entry);
625 spin_unlock(&tree->lock); 625 spin_unlock(&tree->lock);
626 626
627 end: 627 end:
628 return ret; 628 return ret;
629 } 629 }
630 630
631 /********************************* 631 /*********************************
632 * frontswap hooks 632 * frontswap hooks
633 **********************************/ 633 **********************************/
634 /* attempts to compress and store an single page */ 634 /* attempts to compress and store an single page */
635 static int zswap_frontswap_store(unsigned type, pgoff_t offset, 635 static int zswap_frontswap_store(unsigned type, pgoff_t offset,
636 struct page *page) 636 struct page *page)
637 { 637 {
638 struct zswap_tree *tree = zswap_trees[type]; 638 struct zswap_tree *tree = zswap_trees[type];
639 struct zswap_entry *entry, *dupentry; 639 struct zswap_entry *entry, *dupentry;
640 int ret; 640 int ret;
641 unsigned int dlen = PAGE_SIZE, len; 641 unsigned int dlen = PAGE_SIZE, len;
642 unsigned long handle; 642 unsigned long handle;
643 char *buf; 643 char *buf;
644 u8 *src, *dst; 644 u8 *src, *dst;
645 struct zswap_header *zhdr; 645 struct zswap_header *zhdr;
646 646
647 if (!tree) { 647 if (!tree) {
648 ret = -ENODEV; 648 ret = -ENODEV;
649 goto reject; 649 goto reject;
650 } 650 }
651 651
652 /* reclaim space if needed */ 652 /* reclaim space if needed */
653 if (zswap_is_full()) { 653 if (zswap_is_full()) {
654 zswap_pool_limit_hit++; 654 zswap_pool_limit_hit++;
655 if (zbud_reclaim_page(tree->pool, 8)) { 655 if (zbud_reclaim_page(zswap_pool, 8)) {
656 zswap_reject_reclaim_fail++; 656 zswap_reject_reclaim_fail++;
657 ret = -ENOMEM; 657 ret = -ENOMEM;
658 goto reject; 658 goto reject;
659 } 659 }
660 } 660 }
661 661
662 /* allocate entry */ 662 /* allocate entry */
663 entry = zswap_entry_cache_alloc(GFP_KERNEL); 663 entry = zswap_entry_cache_alloc(GFP_KERNEL);
664 if (!entry) { 664 if (!entry) {
665 zswap_reject_kmemcache_fail++; 665 zswap_reject_kmemcache_fail++;
666 ret = -ENOMEM; 666 ret = -ENOMEM;
667 goto reject; 667 goto reject;
668 } 668 }
669 669
670 /* compress */ 670 /* compress */
671 dst = get_cpu_var(zswap_dstmem); 671 dst = get_cpu_var(zswap_dstmem);
672 src = kmap_atomic(page); 672 src = kmap_atomic(page);
673 ret = zswap_comp_op(ZSWAP_COMPOP_COMPRESS, src, PAGE_SIZE, dst, &dlen); 673 ret = zswap_comp_op(ZSWAP_COMPOP_COMPRESS, src, PAGE_SIZE, dst, &dlen);
674 kunmap_atomic(src); 674 kunmap_atomic(src);
675 if (ret) { 675 if (ret) {
676 ret = -EINVAL; 676 ret = -EINVAL;
677 goto freepage; 677 goto freepage;
678 } 678 }
679 679
680 /* store */ 680 /* store */
681 len = dlen + sizeof(struct zswap_header); 681 len = dlen + sizeof(struct zswap_header);
682 ret = zbud_alloc(tree->pool, len, __GFP_NORETRY | __GFP_NOWARN, 682 ret = zbud_alloc(zswap_pool, len, __GFP_NORETRY | __GFP_NOWARN,
683 &handle); 683 &handle);
684 if (ret == -ENOSPC) { 684 if (ret == -ENOSPC) {
685 zswap_reject_compress_poor++; 685 zswap_reject_compress_poor++;
686 goto freepage; 686 goto freepage;
687 } 687 }
688 if (ret) { 688 if (ret) {
689 zswap_reject_alloc_fail++; 689 zswap_reject_alloc_fail++;
690 goto freepage; 690 goto freepage;
691 } 691 }
692 zhdr = zbud_map(tree->pool, handle); 692 zhdr = zbud_map(zswap_pool, handle);
693 zhdr->swpentry = swp_entry(type, offset); 693 zhdr->swpentry = swp_entry(type, offset);
694 buf = (u8 *)(zhdr + 1); 694 buf = (u8 *)(zhdr + 1);
695 memcpy(buf, dst, dlen); 695 memcpy(buf, dst, dlen);
696 zbud_unmap(tree->pool, handle); 696 zbud_unmap(zswap_pool, handle);
697 put_cpu_var(zswap_dstmem); 697 put_cpu_var(zswap_dstmem);
698 698
699 /* populate entry */ 699 /* populate entry */
700 entry->offset = offset; 700 entry->offset = offset;
701 entry->handle = handle; 701 entry->handle = handle;
702 entry->length = dlen; 702 entry->length = dlen;
703 703
704 /* map */ 704 /* map */
705 spin_lock(&tree->lock); 705 spin_lock(&tree->lock);
706 do { 706 do {
707 ret = zswap_rb_insert(&tree->rbroot, entry, &dupentry); 707 ret = zswap_rb_insert(&tree->rbroot, entry, &dupentry);
708 if (ret == -EEXIST) { 708 if (ret == -EEXIST) {
709 zswap_duplicate_entry++; 709 zswap_duplicate_entry++;
710 /* remove from rbtree */ 710 /* remove from rbtree */
711 zswap_rb_erase(&tree->rbroot, dupentry); 711 zswap_rb_erase(&tree->rbroot, dupentry);
712 zswap_entry_put(tree, dupentry); 712 zswap_entry_put(tree, dupentry);
713 } 713 }
714 } while (ret == -EEXIST); 714 } while (ret == -EEXIST);
715 spin_unlock(&tree->lock); 715 spin_unlock(&tree->lock);
716 716
717 /* update stats */ 717 /* update stats */
718 atomic_inc(&zswap_stored_pages); 718 atomic_inc(&zswap_stored_pages);
719 zswap_pool_pages = zbud_get_pool_size(tree->pool); 719 zswap_pool_pages = zbud_get_pool_size(zswap_pool);
720 720
721 return 0; 721 return 0;
722 722
723 freepage: 723 freepage:
724 put_cpu_var(zswap_dstmem); 724 put_cpu_var(zswap_dstmem);
725 zswap_entry_cache_free(entry); 725 zswap_entry_cache_free(entry);
726 reject: 726 reject:
727 return ret; 727 return ret;
728 } 728 }
729 729
730 /* 730 /*
731 * returns 0 if the page was successfully decompressed 731 * returns 0 if the page was successfully decompressed
732 * return -1 on entry not found or error 732 * return -1 on entry not found or error
733 */ 733 */
734 static int zswap_frontswap_load(unsigned type, pgoff_t offset, 734 static int zswap_frontswap_load(unsigned type, pgoff_t offset,
735 struct page *page) 735 struct page *page)
736 { 736 {
737 struct zswap_tree *tree = zswap_trees[type]; 737 struct zswap_tree *tree = zswap_trees[type];
738 struct zswap_entry *entry; 738 struct zswap_entry *entry;
739 u8 *src, *dst; 739 u8 *src, *dst;
740 unsigned int dlen; 740 unsigned int dlen;
741 int ret; 741 int ret;
742 742
743 /* find */ 743 /* find */
744 spin_lock(&tree->lock); 744 spin_lock(&tree->lock);
745 entry = zswap_entry_find_get(&tree->rbroot, offset); 745 entry = zswap_entry_find_get(&tree->rbroot, offset);
746 if (!entry) { 746 if (!entry) {
747 /* entry was written back */ 747 /* entry was written back */
748 spin_unlock(&tree->lock); 748 spin_unlock(&tree->lock);
749 return -1; 749 return -1;
750 } 750 }
751 spin_unlock(&tree->lock); 751 spin_unlock(&tree->lock);
752 752
753 /* decompress */ 753 /* decompress */
754 dlen = PAGE_SIZE; 754 dlen = PAGE_SIZE;
755 src = (u8 *)zbud_map(tree->pool, entry->handle) + 755 src = (u8 *)zbud_map(zswap_pool, entry->handle) +
756 sizeof(struct zswap_header); 756 sizeof(struct zswap_header);
757 dst = kmap_atomic(page); 757 dst = kmap_atomic(page);
758 ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length, 758 ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length,
759 dst, &dlen); 759 dst, &dlen);
760 kunmap_atomic(dst); 760 kunmap_atomic(dst);
761 zbud_unmap(tree->pool, entry->handle); 761 zbud_unmap(zswap_pool, entry->handle);
762 BUG_ON(ret); 762 BUG_ON(ret);
763 763
764 spin_lock(&tree->lock); 764 spin_lock(&tree->lock);
765 zswap_entry_put(tree, entry); 765 zswap_entry_put(tree, entry);
766 spin_unlock(&tree->lock); 766 spin_unlock(&tree->lock);
767 767
768 return 0; 768 return 0;
769 } 769 }
770 770
771 /* frees an entry in zswap */ 771 /* frees an entry in zswap */
772 static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset) 772 static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset)
773 { 773 {
774 struct zswap_tree *tree = zswap_trees[type]; 774 struct zswap_tree *tree = zswap_trees[type];
775 struct zswap_entry *entry; 775 struct zswap_entry *entry;
776 776
777 /* find */ 777 /* find */
778 spin_lock(&tree->lock); 778 spin_lock(&tree->lock);
779 entry = zswap_rb_search(&tree->rbroot, offset); 779 entry = zswap_rb_search(&tree->rbroot, offset);
780 if (!entry) { 780 if (!entry) {
781 /* entry was written back */ 781 /* entry was written back */
782 spin_unlock(&tree->lock); 782 spin_unlock(&tree->lock);
783 return; 783 return;
784 } 784 }
785 785
786 /* remove from rbtree */ 786 /* remove from rbtree */
787 zswap_rb_erase(&tree->rbroot, entry); 787 zswap_rb_erase(&tree->rbroot, entry);
788 788
789 /* drop the initial reference from entry creation */ 789 /* drop the initial reference from entry creation */
790 zswap_entry_put(tree, entry); 790 zswap_entry_put(tree, entry);
791 791
792 spin_unlock(&tree->lock); 792 spin_unlock(&tree->lock);
793 } 793 }
794 794
795 /* frees all zswap entries for the given swap type */ 795 /* frees all zswap entries for the given swap type */
796 static void zswap_frontswap_invalidate_area(unsigned type) 796 static void zswap_frontswap_invalidate_area(unsigned type)
797 { 797 {
798 struct zswap_tree *tree = zswap_trees[type]; 798 struct zswap_tree *tree = zswap_trees[type];
799 struct zswap_entry *entry, *n; 799 struct zswap_entry *entry, *n;
800 800
801 if (!tree) 801 if (!tree)
802 return; 802 return;
803 803
804 /* walk the tree and free everything */ 804 /* walk the tree and free everything */
805 spin_lock(&tree->lock); 805 spin_lock(&tree->lock);
806 rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode) 806 rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode)
807 zswap_free_entry(tree, entry); 807 zswap_free_entry(entry);
808 tree->rbroot = RB_ROOT; 808 tree->rbroot = RB_ROOT;
809 spin_unlock(&tree->lock); 809 spin_unlock(&tree->lock);
810
811 zbud_destroy_pool(tree->pool);
812 kfree(tree); 810 kfree(tree);
813 zswap_trees[type] = NULL; 811 zswap_trees[type] = NULL;
814 } 812 }
815 813
816 static struct zbud_ops zswap_zbud_ops = { 814 static struct zbud_ops zswap_zbud_ops = {
817 .evict = zswap_writeback_entry 815 .evict = zswap_writeback_entry
818 }; 816 };
819 817
820 static void zswap_frontswap_init(unsigned type) 818 static void zswap_frontswap_init(unsigned type)
821 { 819 {
822 struct zswap_tree *tree; 820 struct zswap_tree *tree;
823 821
824 tree = kzalloc(sizeof(struct zswap_tree), GFP_KERNEL); 822 tree = kzalloc(sizeof(struct zswap_tree), GFP_KERNEL);
825 if (!tree) 823 if (!tree) {
826 goto err; 824 pr_err("alloc failed, zswap disabled for swap type %d\n", type);
827 tree->pool = zbud_create_pool(GFP_KERNEL, &zswap_zbud_ops); 825 return;
828 if (!tree->pool) 826 }
829 goto freetree; 827
830 tree->rbroot = RB_ROOT; 828 tree->rbroot = RB_ROOT;
831 spin_lock_init(&tree->lock); 829 spin_lock_init(&tree->lock);
832 zswap_trees[type] = tree; 830 zswap_trees[type] = tree;
833 return;
834
835 freetree:
836 kfree(tree);
837 err:
838 pr_err("alloc failed, zswap disabled for swap type %d\n", type);
839 } 831 }
840 832
841 static struct frontswap_ops zswap_frontswap_ops = { 833 static struct frontswap_ops zswap_frontswap_ops = {
842 .store = zswap_frontswap_store, 834 .store = zswap_frontswap_store,
843 .load = zswap_frontswap_load, 835 .load = zswap_frontswap_load,
844 .invalidate_page = zswap_frontswap_invalidate_page, 836 .invalidate_page = zswap_frontswap_invalidate_page,
845 .invalidate_area = zswap_frontswap_invalidate_area, 837 .invalidate_area = zswap_frontswap_invalidate_area,
846 .init = zswap_frontswap_init 838 .init = zswap_frontswap_init
847 }; 839 };
848 840
849 /********************************* 841 /*********************************
850 * debugfs functions 842 * debugfs functions
851 **********************************/ 843 **********************************/
852 #ifdef CONFIG_DEBUG_FS 844 #ifdef CONFIG_DEBUG_FS
853 #include <linux/debugfs.h> 845 #include <linux/debugfs.h>
854 846
855 static struct dentry *zswap_debugfs_root; 847 static struct dentry *zswap_debugfs_root;
856 848
857 static int __init zswap_debugfs_init(void) 849 static int __init zswap_debugfs_init(void)
858 { 850 {
859 if (!debugfs_initialized()) 851 if (!debugfs_initialized())
860 return -ENODEV; 852 return -ENODEV;
861 853
862 zswap_debugfs_root = debugfs_create_dir("zswap", NULL); 854 zswap_debugfs_root = debugfs_create_dir("zswap", NULL);
863 if (!zswap_debugfs_root) 855 if (!zswap_debugfs_root)
864 return -ENOMEM; 856 return -ENOMEM;
865 857
866 debugfs_create_u64("pool_limit_hit", S_IRUGO, 858 debugfs_create_u64("pool_limit_hit", S_IRUGO,
867 zswap_debugfs_root, &zswap_pool_limit_hit); 859 zswap_debugfs_root, &zswap_pool_limit_hit);
868 debugfs_create_u64("reject_reclaim_fail", S_IRUGO, 860 debugfs_create_u64("reject_reclaim_fail", S_IRUGO,
869 zswap_debugfs_root, &zswap_reject_reclaim_fail); 861 zswap_debugfs_root, &zswap_reject_reclaim_fail);
870 debugfs_create_u64("reject_alloc_fail", S_IRUGO, 862 debugfs_create_u64("reject_alloc_fail", S_IRUGO,
871 zswap_debugfs_root, &zswap_reject_alloc_fail); 863 zswap_debugfs_root, &zswap_reject_alloc_fail);
872 debugfs_create_u64("reject_kmemcache_fail", S_IRUGO, 864 debugfs_create_u64("reject_kmemcache_fail", S_IRUGO,
873 zswap_debugfs_root, &zswap_reject_kmemcache_fail); 865 zswap_debugfs_root, &zswap_reject_kmemcache_fail);
874 debugfs_create_u64("reject_compress_poor", S_IRUGO, 866 debugfs_create_u64("reject_compress_poor", S_IRUGO,
875 zswap_debugfs_root, &zswap_reject_compress_poor); 867 zswap_debugfs_root, &zswap_reject_compress_poor);
876 debugfs_create_u64("written_back_pages", S_IRUGO, 868 debugfs_create_u64("written_back_pages", S_IRUGO,
877 zswap_debugfs_root, &zswap_written_back_pages); 869 zswap_debugfs_root, &zswap_written_back_pages);
878 debugfs_create_u64("duplicate_entry", S_IRUGO, 870 debugfs_create_u64("duplicate_entry", S_IRUGO,
879 zswap_debugfs_root, &zswap_duplicate_entry); 871 zswap_debugfs_root, &zswap_duplicate_entry);
880 debugfs_create_u64("pool_pages", S_IRUGO, 872 debugfs_create_u64("pool_pages", S_IRUGO,
881 zswap_debugfs_root, &zswap_pool_pages); 873 zswap_debugfs_root, &zswap_pool_pages);
882 debugfs_create_atomic_t("stored_pages", S_IRUGO, 874 debugfs_create_atomic_t("stored_pages", S_IRUGO,
883 zswap_debugfs_root, &zswap_stored_pages); 875 zswap_debugfs_root, &zswap_stored_pages);
884 876
885 return 0; 877 return 0;
886 } 878 }
887 879
888 static void __exit zswap_debugfs_exit(void) 880 static void __exit zswap_debugfs_exit(void)
889 { 881 {
890 debugfs_remove_recursive(zswap_debugfs_root); 882 debugfs_remove_recursive(zswap_debugfs_root);
891 } 883 }
892 #else 884 #else
893 static int __init zswap_debugfs_init(void) 885 static int __init zswap_debugfs_init(void)
894 { 886 {
895 return 0; 887 return 0;
896 } 888 }
897 889
898 static void __exit zswap_debugfs_exit(void) { } 890 static void __exit zswap_debugfs_exit(void) { }
899 #endif 891 #endif
900 892
901 /********************************* 893 /*********************************
902 * module init and exit 894 * module init and exit
903 **********************************/ 895 **********************************/
904 static int __init init_zswap(void) 896 static int __init init_zswap(void)
905 { 897 {
906 if (!zswap_enabled) 898 if (!zswap_enabled)
907 return 0; 899 return 0;
908 900
909 pr_info("loading zswap\n"); 901 pr_info("loading zswap\n");
902
903 zswap_pool = zbud_create_pool(GFP_KERNEL, &zswap_zbud_ops);
904 if (!zswap_pool) {
905 pr_err("zbud pool creation failed\n");
906 goto error;
907 }
908
910 if (zswap_entry_cache_create()) { 909 if (zswap_entry_cache_create()) {
911 pr_err("entry cache creation failed\n"); 910 pr_err("entry cache creation failed\n");
912 goto error; 911 goto cachefail;
913 } 912 }
914 if (zswap_comp_init()) { 913 if (zswap_comp_init()) {
915 pr_err("compressor initialization failed\n"); 914 pr_err("compressor initialization failed\n");
916 goto compfail; 915 goto compfail;
917 } 916 }
918 if (zswap_cpu_init()) { 917 if (zswap_cpu_init()) {
919 pr_err("per-cpu initialization failed\n"); 918 pr_err("per-cpu initialization failed\n");
920 goto pcpufail; 919 goto pcpufail;
921 } 920 }
921
922 frontswap_register_ops(&zswap_frontswap_ops); 922 frontswap_register_ops(&zswap_frontswap_ops);
923 if (zswap_debugfs_init()) 923 if (zswap_debugfs_init())
924 pr_warn("debugfs initialization failed\n"); 924 pr_warn("debugfs initialization failed\n");
925 return 0; 925 return 0;
926 pcpufail: 926 pcpufail:
927 zswap_comp_exit(); 927 zswap_comp_exit();
928 compfail: 928 compfail:
929 zswap_entry_cache_destory(); 929 zswap_entry_cache_destory();
930 cachefail: