Eric Lee / smarc-fsl-linux-kernel

Blame view

mm/vmpressure.c 14.3 KB

70ddf637e Anton Vorontsov memcg: add memory...	1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21	/* * Linux VM pressure * * Copyright 2012 Linaro Ltd. * Anton Vorontsov <anton.vorontsov@linaro.org> * * Based on ideas from Andrew Morton, David Rientjes, KOSAKI Motohiro, * Leonid Moiseichuk, Mel Gorman, Minchan Kim and Pekka Enberg. * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 as published * by the Free Software Foundation. */ #include <linux/cgroup.h> #include <linux/fs.h> #include <linux/log2.h> #include <linux/sched.h> #include <linux/mm.h> #include <linux/vmstat.h> #include <linux/eventfd.h>
1ff6bbfd1 Tejun Heo arm, pm, vmpressu...	22	#include <linux/slab.h>
70ddf637e Anton Vorontsov memcg: add memory...	23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76	#include <linux/swap.h> #include <linux/printk.h> #include <linux/vmpressure.h> /* * The window size (vmpressure_win) is the number of scanned pages before * we try to analyze scanned/reclaimed ratio. So the window is used as a * rate-limit tunable for the "low" level notification, and also for * averaging the ratio for medium/critical levels. Using small window * sizes can cause lot of false positives, but too big window size will * delay the notifications. * * As the vmscan reclaimer logic works with chunks which are multiple of * SWAP_CLUSTER_MAX, it makes sense to use it for the window size as well. * * TODO: Make the window size depend on machine size, as we do for vmstat * thresholds. Currently we set it to 512 pages (2MB for 4KB pages). / static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX 16; /* * These thresholds are used when we account memory pressure through * scanned/reclaimed ratio. The current values were chosen empirically. In * essence, they are percents: the higher the value, the more number * unsuccessful reclaims there were. / static const unsigned int vmpressure_level_med = 60; static const unsigned int vmpressure_level_critical = 95; / * When there are too little pages left to scan, vmpressure() may miss the * critical pressure as number of pages will be less than "window size". * However, in that case the vmscan priority will raise fast as the * reclaimer will try to scan LRUs more deeply. * * The vmscan logic considers these special priorities: * * prio == DEF_PRIORITY (12): reclaimer starts with that value * prio <= DEF_PRIORITY - 2 : kswapd becomes somewhat overwhelmed * prio == 0 : close to OOM, kernel scans every page in an lru * * Any value in this range is acceptable for this tunable (i.e. from 12 to * 0). 
Current value for the vmpressure_level_critical_prio is chosen * empirically, but the number, in essence, means that we consider * critical level when scanning depth is ~10% of the lru size (vmscan * scans 'lru_size >> prio' pages, so it is actually 12.5%, or one * eights). / static const unsigned int vmpressure_level_critical_prio = ilog2(100 / 10); static struct vmpressure work_to_vmpressure(struct work_struct *work) { return container_of(work, struct vmpressure, work); }
/*
 * Return the vmpressure structure of the parent memory cgroup of @vmpr,
 * or NULL when parent_mem_cgroup() reports no parent.
 */
static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr)
{
	struct cgroup_subsys_state *css = vmpressure_to_css(vmpr);
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);

	memcg = parent_mem_cgroup(memcg);
	if (!memcg)
		return NULL;
	return memcg_to_vmpressure(memcg);
}

/*
 * Pressure levels, ordered by increasing severity. The numeric order
 * matters: listeners are notified when the measured level is greater
 * than or equal to the level they registered for.
 */
enum vmpressure_levels {
	VMPRESSURE_LOW = 0,
	VMPRESSURE_MEDIUM,
	VMPRESSURE_CRITICAL,
	VMPRESSURE_NUM_LEVELS,	/* number of levels, not a level itself */
};
/*
 * Notification modes a listener can register with. They control which
 * listeners vmpressure_event() skips while an event is propagated from
 * the originating group up through its ancestors.
 */
enum vmpressure_modes {
	/* Skipped once some listener has already been signalled. */
	VMPRESSURE_NO_PASSTHROUGH = 0,
	/* Never skipped because of its mode. */
	VMPRESSURE_HIERARCHY,
	/* Skipped when the event is being delivered on an ancestor. */
	VMPRESSURE_LOCAL,
	VMPRESSURE_NUM_MODES,	/* number of modes, not a mode itself */
};
70ddf637e Anton Vorontsov memcg: add memory...	100 101 102 103 104	static const char * const vmpressure_str_levels[] = { [VMPRESSURE_LOW] = "low", [VMPRESSURE_MEDIUM] = "medium", [VMPRESSURE_CRITICAL] = "critical", };
b6bb98114 David Rientjes mm, vmpressure: p...	105 106 107 108 109	static const char * const vmpressure_str_modes[] = { [VMPRESSURE_NO_PASSTHROUGH] = "default", [VMPRESSURE_HIERARCHY] = "hierarchy", [VMPRESSURE_LOCAL] = "local", };
70ddf637e Anton Vorontsov memcg: add memory...	110 111 112 113 114 115 116 117 118 119 120 121 122	static enum vmpressure_levels vmpressure_level(unsigned long pressure) { if (pressure >= vmpressure_level_critical) return VMPRESSURE_CRITICAL; else if (pressure >= vmpressure_level_med) return VMPRESSURE_MEDIUM; return VMPRESSURE_LOW; } static enum vmpressure_levels vmpressure_calc_level(unsigned long scanned, unsigned long reclaimed) { unsigned long scale = scanned + reclaimed;
e1587a494 Vinayak Menon mm: vmpressure: f...	123	unsigned long pressure = 0;
70ddf637e Anton Vorontsov memcg: add memory...	124 125	/*
d7143e312 zhongjiang mm: correct the c...	126 127 128	* reclaimed can be greater than scanned for things such as reclaimed * slab pages. shrink_node() just adds reclaimed pages without a * related increment to scanned pages.
e1587a494 Vinayak Menon mm: vmpressure: f...	129 130 131 132	/ if (reclaimed >= scanned) goto out; /
70ddf637e Anton Vorontsov memcg: add memory...	133 134 135 136 137 138 139 140	* We calculate the ratio (in percents) of how many pages were * scanned vs. reclaimed in a given time frame (window). Note that * time is in VM reclaimer's "ticks", i.e. number of pages * scanned. This makes it possible to set desired reaction time * and serves as a ratelimit. / pressure = scale - (reclaimed scale / scanned); pressure = pressure * 100 / scale;
e1587a494 Vinayak Menon mm: vmpressure: f...	141	out:
70ddf637e Anton Vorontsov memcg: add memory...	142 143 144 145 146 147 148 149 150 151	pr_debug("%s: %3lu (s: %lu r: %lu) ", __func__, pressure, scanned, reclaimed); return vmpressure_level(pressure); } struct vmpressure_event { struct eventfd_ctx *efd; enum vmpressure_levels level;
b6bb98114 David Rientjes mm, vmpressure: p...	152	enum vmpressure_modes mode;
70ddf637e Anton Vorontsov memcg: add memory...	153 154 155 156	struct list_head node; }; static bool vmpressure_event(struct vmpressure *vmpr,
b6bb98114 David Rientjes mm, vmpressure: p...	157 158	const enum vmpressure_levels level, bool ancestor, bool signalled)
70ddf637e Anton Vorontsov memcg: add memory...	159 160	{ struct vmpressure_event *ev;
b6bb98114 David Rientjes mm, vmpressure: p...	161	bool ret = false;
70ddf637e Anton Vorontsov memcg: add memory...	162
70ddf637e Anton Vorontsov memcg: add memory...	163	mutex_lock(&vmpr->events_lock);
70ddf637e Anton Vorontsov memcg: add memory...	164	list_for_each_entry(ev, &vmpr->events, node) {
b6bb98114 David Rientjes mm, vmpressure: p...	165 166 167 168 169 170 171 172	if (ancestor && ev->mode == VMPRESSURE_LOCAL) continue; if (signalled && ev->mode == VMPRESSURE_NO_PASSTHROUGH) continue; if (level < ev->level) continue; eventfd_signal(ev->efd, 1); ret = true;
70ddf637e Anton Vorontsov memcg: add memory...	173	}
70ddf637e Anton Vorontsov memcg: add memory...	174	mutex_unlock(&vmpr->events_lock);
b6bb98114 David Rientjes mm, vmpressure: p...	175	return ret;
70ddf637e Anton Vorontsov memcg: add memory...	176 177 178 179 180 181 182	} static void vmpressure_work_fn(struct work_struct work) { struct vmpressure vmpr = work_to_vmpressure(work); unsigned long scanned; unsigned long reclaimed;
8e8ae6452 Johannes Weiner mm: memcontrol: h...	183	enum vmpressure_levels level;
b6bb98114 David Rientjes mm, vmpressure: p...	184 185	bool ancestor = false; bool signalled = false;
70ddf637e Anton Vorontsov memcg: add memory...	186
91b57191c Andrew Morton mm/vmpressure.c: ...	187	spin_lock(&vmpr->sr_lock);
70ddf637e Anton Vorontsov memcg: add memory...	188 189 190 191 192 193 194 195	/* * Several contexts might be calling vmpressure(), so it is * possible that the work was rescheduled again before the old * work context cleared the counters. In that case we will run * just after the old work returns, but then scanned might be zero * here. No need for any locks here since we don't care if * vmpr->reclaimed is in sync. */
8e8ae6452 Johannes Weiner mm: memcontrol: h...	196	scanned = vmpr->tree_scanned;
91b57191c Andrew Morton mm/vmpressure.c: ...	197 198	if (!scanned) { spin_unlock(&vmpr->sr_lock);
70ddf637e Anton Vorontsov memcg: add memory...	199	return;
91b57191c Andrew Morton mm/vmpressure.c: ...	200	}
70ddf637e Anton Vorontsov memcg: add memory...	201
8e8ae6452 Johannes Weiner mm: memcontrol: h...	202 203 204	reclaimed = vmpr->tree_reclaimed; vmpr->tree_scanned = 0; vmpr->tree_reclaimed = 0;
22f2020f8 Michal Hocko vmpressure: chang...	205	spin_unlock(&vmpr->sr_lock);
70ddf637e Anton Vorontsov memcg: add memory...	206
8e8ae6452 Johannes Weiner mm: memcontrol: h...	207	level = vmpressure_calc_level(scanned, reclaimed);
70ddf637e Anton Vorontsov memcg: add memory...	208	do {
b6bb98114 David Rientjes mm, vmpressure: p...	209 210 211	if (vmpressure_event(vmpr, level, ancestor, signalled)) signalled = true; ancestor = true;
70ddf637e Anton Vorontsov memcg: add memory...	212 213 214 215 216 217 218	} while ((vmpr = vmpressure_parent(vmpr))); } /** * vmpressure() - Account memory pressure through scanned/reclaimed ratio * @gfp: reclaimer's gfp mask * @memcg: cgroup memory controller handle
8e8ae6452 Johannes Weiner mm: memcontrol: h...	219	* @tree: legacy subtree mode
70ddf637e Anton Vorontsov memcg: add memory...	220 221 222 223 224 225 226	* @scanned: number of pages scanned * @reclaimed: number of pages reclaimed * * This function should be called from the vmscan reclaim path to account * "instantaneous" memory pressure (scanned/reclaimed ratio). The raw * pressure index is then further refined and averaged over time. *
8e8ae6452 Johannes Weiner mm: memcontrol: h...	227 228 229 230 231 232 233	* If @tree is set, vmpressure is in traditional userspace reporting * mode: @memcg is considered the pressure root and userspace is * notified of the entire subtree's reclaim efficiency. * * If @tree is not set, reclaim efficiency is recorded for @memcg, and * only in-kernel users are notified. *
70ddf637e Anton Vorontsov memcg: add memory...	234 235	* This function does not return any value. */
8e8ae6452 Johannes Weiner mm: memcontrol: h...	236	void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
70ddf637e Anton Vorontsov memcg: add memory...	237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264	unsigned long scanned, unsigned long reclaimed) { struct vmpressure vmpr = memcg_to_vmpressure(memcg); / * Here we only want to account pressure that userland is able to * help us with. For example, suppose that DMA zone is under * pressure; if we notify userland about that kind of pressure, * then it will be mostly a waste as it will trigger unnecessary * freeing of memory by userland (since userland is more likely to * have HIGHMEM/MOVABLE pages instead of the DMA fallback). That * is why we include only movable, highmem and FS/IO pages. * Indirect reclaim (kswapd) sets sc->gfp_mask to GFP_KERNEL, so * we account it too. / if (!(gfp & (__GFP_HIGHMEM \| __GFP_MOVABLE \| __GFP_IO \| __GFP_FS))) return; / * If we got here with no pages scanned, then that is an indicator * that reclaimer was unable to find any shrinkable LRUs at the * current scanning depth. But it does not mean that we should * report the critical pressure, yet. If the scanning priority * (scanning depth) goes too high (deep), we will be notified * through vmpressure_prio(). But so far, keep calm. */ if (!scanned) return;
8e8ae6452 Johannes Weiner mm: memcontrol: h...	265 266	if (tree) { spin_lock(&vmpr->sr_lock);
3c1da7bee Vladimir Davydov mm/vmpressure.c: ...	267	scanned = vmpr->tree_scanned += scanned;
8e8ae6452 Johannes Weiner mm: memcontrol: h...	268	vmpr->tree_reclaimed += reclaimed;
8e8ae6452 Johannes Weiner mm: memcontrol: h...	269	spin_unlock(&vmpr->sr_lock);
70ddf637e Anton Vorontsov memcg: add memory...	270
8e8ae6452 Johannes Weiner mm: memcontrol: h...	271 272 273 274 275 276 277	if (scanned < vmpressure_win) return; schedule_work(&vmpr->work); } else { enum vmpressure_levels level; /* For now, no users for root-level efficiency */
686739f6a Hugh Dickins memcg: avoid vmpr...	278	if (!memcg \|\| memcg == root_mem_cgroup)
8e8ae6452 Johannes Weiner mm: memcontrol: h...	279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304	return; spin_lock(&vmpr->sr_lock); scanned = vmpr->scanned += scanned; reclaimed = vmpr->reclaimed += reclaimed; if (scanned < vmpressure_win) { spin_unlock(&vmpr->sr_lock); return; } vmpr->scanned = vmpr->reclaimed = 0; spin_unlock(&vmpr->sr_lock); level = vmpressure_calc_level(scanned, reclaimed); if (level > VMPRESSURE_LOW) { /* * Let the socket buffer allocator know that * we are having trouble reclaiming LRU pages. * * For hysteresis keep the pressure state * asserted for a second in which subsequent * pressure events can occur. */ memcg->socket_pressure = jiffies + HZ; } }
70ddf637e Anton Vorontsov memcg: add memory...	305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333	} /** * vmpressure_prio() - Account memory pressure through reclaimer priority level * @gfp: reclaimer's gfp mask * @memcg: cgroup memory controller handle * @prio: reclaimer's priority * * This function should be called from the reclaim path every time when * the vmscan's reclaiming priority (scanning depth) changes. * * This function does not return any value. / void vmpressure_prio(gfp_t gfp, struct mem_cgroup memcg, int prio) { /* * We only use prio for accounting critical level. For more info * see comment for vmpressure_level_critical_prio variable above. / if (prio > vmpressure_level_critical_prio) return; / * OK, the prio is below the threshold, updating vmpressure * information before shrinker dives into long shrinking of long * range vmscan. Passing scanned = vmpressure_win, reclaimed = 0 * to the vmpressure() basically means that we signal 'critical' * level. */
8e8ae6452 Johannes Weiner mm: memcontrol: h...	334	vmpressure(gfp, memcg, true, vmpressure_win, 0);
70ddf637e Anton Vorontsov memcg: add memory...	335	}
b6bb98114 David Rientjes mm, vmpressure: p...	336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356	static enum vmpressure_levels str_to_level(const char arg) { enum vmpressure_levels level; for (level = 0; level < VMPRESSURE_NUM_LEVELS; level++) if (!strcmp(vmpressure_str_levels[level], arg)) return level; return -1; } static enum vmpressure_modes str_to_mode(const char arg) { enum vmpressure_modes mode; for (mode = 0; mode < VMPRESSURE_NUM_MODES; mode++) if (!strcmp(vmpressure_str_modes[mode], arg)) return mode; return -1; } #define MAX_VMPRESSURE_ARGS_LEN (strlen("critical") + strlen("hierarchy") + 2)
70ddf637e Anton Vorontsov memcg: add memory...	357 358	/** * vmpressure_register_event() - Bind vmpressure notifications to an eventfd
59b6f8734 Tejun Heo memcg: make cgrou...	359	* @memcg: memcg that is interested in vmpressure notifications
70ddf637e Anton Vorontsov memcg: add memory...	360	* @eventfd: eventfd context to link notifications with
b6bb98114 David Rientjes mm, vmpressure: p...	361	* @args: event arguments (pressure level threshold, optional mode)
70ddf637e Anton Vorontsov memcg: add memory...	362 363 364	* * This function associates eventfd context with the vmpressure * infrastructure, so that the notifications will be delivered to the
b6bb98114 David Rientjes mm, vmpressure: p...	365 366 367 368	* @eventfd. The @args parameter is a comma-delimited string that denotes a * pressure level threshold (one of vmpressure_str_levels, i.e. "low", "medium", * or "critical") and an optional mode (one of vmpressure_str_modes, i.e. * "hierarchy" or "local").
70ddf637e Anton Vorontsov memcg: add memory...	369	*
347c4a874 Tejun Heo memcg: remove cgr...	370	* To be used as memcg event method.
70ddf637e Anton Vorontsov memcg: add memory...	371	*/
59b6f8734 Tejun Heo memcg: make cgrou...	372	int vmpressure_register_event(struct mem_cgroup *memcg,
347c4a874 Tejun Heo memcg: remove cgr...	373	struct eventfd_ctx eventfd, const char args)
70ddf637e Anton Vorontsov memcg: add memory...	374	{
59b6f8734 Tejun Heo memcg: make cgrou...	375	struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
70ddf637e Anton Vorontsov memcg: add memory...	376	struct vmpressure_event *ev;
b6bb98114 David Rientjes mm, vmpressure: p...	377 378 379 380 381 382 383 384 385 386 387 388	enum vmpressure_modes mode = VMPRESSURE_NO_PASSTHROUGH; enum vmpressure_levels level = -1; char spec, spec_orig; char *token; int ret = 0; spec_orig = spec = kzalloc(MAX_VMPRESSURE_ARGS_LEN + 1, GFP_KERNEL); if (!spec) { ret = -ENOMEM; goto out; } strncpy(spec, args, MAX_VMPRESSURE_ARGS_LEN);
70ddf637e Anton Vorontsov memcg: add memory...	389
b6bb98114 David Rientjes mm, vmpressure: p...	390 391 392 393 394 395	/* Find required level */ token = strsep(&spec, ","); level = str_to_level(token); if (level == -1) { ret = -EINVAL; goto out;
70ddf637e Anton Vorontsov memcg: add memory...	396	}
b6bb98114 David Rientjes mm, vmpressure: p...	397 398 399 400 401 402 403 404 405	/* Find optional mode */ token = strsep(&spec, ","); if (token) { mode = str_to_mode(token); if (mode == -1) { ret = -EINVAL; goto out; } }
70ddf637e Anton Vorontsov memcg: add memory...	406 407	ev = kzalloc(sizeof(*ev), GFP_KERNEL);
b6bb98114 David Rientjes mm, vmpressure: p...	408 409 410 411	if (!ev) { ret = -ENOMEM; goto out; }
70ddf637e Anton Vorontsov memcg: add memory...	412 413 414	ev->efd = eventfd; ev->level = level;
b6bb98114 David Rientjes mm, vmpressure: p...	415	ev->mode = mode;
70ddf637e Anton Vorontsov memcg: add memory...	416 417 418 419	mutex_lock(&vmpr->events_lock); list_add(&ev->node, &vmpr->events); mutex_unlock(&vmpr->events_lock);
b6bb98114 David Rientjes mm, vmpressure: p...	420 421 422	out: kfree(spec_orig); return ret;
70ddf637e Anton Vorontsov memcg: add memory...	423 424 425 426	} /** * vmpressure_unregister_event() - Unbind eventfd from vmpressure
59b6f8734 Tejun Heo memcg: make cgrou...	427	* @memcg: memcg handle
70ddf637e Anton Vorontsov memcg: add memory...	428 429 430 431 432 433	* @eventfd: eventfd context that was used to link vmpressure with the @cg * * This function does internal manipulations to detach the @eventfd from * the vmpressure notifications, and then frees internal resources * associated with the @eventfd (but the @eventfd itself is not freed). *
347c4a874 Tejun Heo memcg: remove cgr...	434	* To be used as memcg event method.
70ddf637e Anton Vorontsov memcg: add memory...	435	*/
59b6f8734 Tejun Heo memcg: make cgrou...	436	void vmpressure_unregister_event(struct mem_cgroup *memcg,
70ddf637e Anton Vorontsov memcg: add memory...	437 438	struct eventfd_ctx *eventfd) {
59b6f8734 Tejun Heo memcg: make cgrou...	439	struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
70ddf637e Anton Vorontsov memcg: add memory...	440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461	struct vmpressure_event ev; mutex_lock(&vmpr->events_lock); list_for_each_entry(ev, &vmpr->events, node) { if (ev->efd != eventfd) continue; list_del(&ev->node); kfree(ev); break; } mutex_unlock(&vmpr->events_lock); } /* * vmpressure_init() - Initialize vmpressure control structure * @vmpr: Structure to be initialized * * This function should be called on every allocated vmpressure structure * before any usage. / void vmpressure_init(struct vmpressure vmpr) {
22f2020f8 Michal Hocko vmpressure: chang...	462	spin_lock_init(&vmpr->sr_lock);
70ddf637e Anton Vorontsov memcg: add memory...	463 464 465 466	mutex_init(&vmpr->events_lock); INIT_LIST_HEAD(&vmpr->events); INIT_WORK(&vmpr->work, vmpressure_work_fn); }
33cb876e9 Michal Hocko vmpressure: make ...	467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482	/** * vmpressure_cleanup() - shuts down vmpressure control structure * @vmpr: Structure to be cleaned up * * This function should be called before the structure in which it is * embedded is cleaned up. / void vmpressure_cleanup(struct vmpressure vmpr) { /* * Make sure there is no pending work before eventfd infrastructure * goes away. */ flush_work(&vmpr->work); }