Blame view

mm/memcontrol.c 27.5 KB
  /* memcontrol.c - Memory Controller
   *
   * Copyright IBM Corporation, 2007
   * Author Balbir Singh <balbir@linux.vnet.ibm.com>
   *
   * Copyright 2007 OpenVZ SWsoft Inc
   * Author: Pavel Emelianov <xemul@openvz.org>
   *
   * This program is free software; you can redistribute it and/or modify
   * it under the terms of the GNU General Public License as published by
   * the Free Software Foundation; either version 2 of the License, or
   * (at your option) any later version.
   *
   * This program is distributed in the hope that it will be useful,
   * but WITHOUT ANY WARRANTY; without even the implied warranty of
   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   * GNU General Public License for more details.
   */
  
  #include <linux/res_counter.h>
  #include <linux/memcontrol.h>
  #include <linux/cgroup.h>
  #include <linux/mm.h>
  #include <linux/smp.h>
  #include <linux/page-flags.h>
  #include <linux/backing-dev.h>
  #include <linux/bit_spinlock.h>
  #include <linux/rcupdate.h>
  #include <linux/slab.h>
  #include <linux/swap.h>
  #include <linux/spinlock.h>
  #include <linux/fs.h>
  #include <linux/seq_file.h>
  #include <linux/vmalloc.h>

  #include <asm/uaccess.h>
  struct cgroup_subsys mem_cgroup_subsys;
  static const int MEM_CGROUP_RECLAIM_RETRIES = 5;
  static struct kmem_cache *page_cgroup_cache;
  
  /*
   * Statistics for memory cgroup.
   */
  enum mem_cgroup_stat_index {
  	/*
  	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
  	 */
  	MEM_CGROUP_STAT_CACHE, 	   /* # of pages charged as cache */
  	MEM_CGROUP_STAT_RSS,	   /* # of pages charged as rss */
  	MEM_CGROUP_STAT_PGPGIN_COUNT,	/* # of pages paged in */
  	MEM_CGROUP_STAT_PGPGOUT_COUNT,	/* # of pages paged out */
  
  	MEM_CGROUP_STAT_NSTATS,
  };
  
  struct mem_cgroup_stat_cpu {
  	s64 count[MEM_CGROUP_STAT_NSTATS];
  } ____cacheline_aligned_in_smp;
  
  struct mem_cgroup_stat {
  	struct mem_cgroup_stat_cpu cpustat[NR_CPUS];
  };
  
  /*
   * For accounting under irq disable, no need for increment preempt count.
   */
  static void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat *stat,
  		enum mem_cgroup_stat_index idx, int val)
  {
  	int cpu = smp_processor_id();
  	stat->cpustat[cpu].count[idx] += val;
  }
  
  static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
  		enum mem_cgroup_stat_index idx)
  {
  	int cpu;
  	s64 ret = 0;
  	for_each_possible_cpu(cpu)
  		ret += stat->cpustat[cpu].count[idx];
  	return ret;
  }
  
  /*
   * per-zone information in memory controller.
   */
  
  enum mem_cgroup_zstat_index {
  	MEM_CGROUP_ZSTAT_ACTIVE,
  	MEM_CGROUP_ZSTAT_INACTIVE,
  
  	NR_MEM_CGROUP_ZSTAT,
  };
  
  struct mem_cgroup_per_zone {
  	/*
  	 * spin_lock to protect the per cgroup LRU
  	 */
  	spinlock_t		lru_lock;
  	struct list_head	active_list;
  	struct list_head	inactive_list;
  	unsigned long count[NR_MEM_CGROUP_ZSTAT];
  };
  /* Macro for accessing counter */
  #define MEM_CGROUP_ZSTAT(mz, idx)	((mz)->count[(idx)])
  
  struct mem_cgroup_per_node {
  	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
  };
  
  struct mem_cgroup_lru_info {
  	struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
  };
  
  /*
   * The memory controller data structure. The memory controller controls both
   * page cache and RSS per cgroup. We would eventually like to provide
   * statistics based on the statistics developed by Rik Van Riel for clock-pro,
   * to help the administrator determine what knobs to tune.
   *
   * TODO: Add a water mark for the memory controller. Reclaim will begin when
   * we hit the water mark. Maybe even add a low water mark, such that
   * no reclaim occurs from a cgroup at its low water mark; this is
   * a feature that will be implemented much later in the future.
   */
  struct mem_cgroup {
  	struct cgroup_subsys_state css;
  	/*
  	 * the counter to account for memory usage
  	 */
  	struct res_counter res;
  	/*
  	 * Per cgroup active and inactive list, similar to the
  	 * per zone LRU lists.
  	 */
  	struct mem_cgroup_lru_info info;

  	int	prev_priority;	/* for recording reclaim priority */
  	/*
  	 * statistics.
  	 */
  	struct mem_cgroup_stat stat;
  };
  static struct mem_cgroup init_mem_cgroup;
  
  /*
   * We use the lower bit of the page->page_cgroup pointer as a bit spin
   * lock.  We need to ensure that page->page_cgroup is at least two
   * byte aligned (based on comments from Nick Piggin).  But since
   * bit_spin_lock doesn't actually set that lock bit in a non-debug
   * uniprocessor kernel, we should avoid setting it here too.
   */
  #define PAGE_CGROUP_LOCK_BIT 	0x0
  #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
  #define PAGE_CGROUP_LOCK 	(1 << PAGE_CGROUP_LOCK_BIT)
  #else
  #define PAGE_CGROUP_LOCK	0x0
  #endif
  
  /*
   * A page_cgroup page is associated with every page descriptor. The
   * page_cgroup helps us identify information about the cgroup
   */
  struct page_cgroup {
  	struct list_head lru;		/* per cgroup LRU list */
  	struct page *page;
  	struct mem_cgroup *mem_cgroup;
  	int ref_cnt;			/* cached, mapped, migrating */
  	int flags;
  };
  #define PAGE_CGROUP_FLAG_CACHE	(0x1)	/* charged as cache */
  #define PAGE_CGROUP_FLAG_ACTIVE (0x2)	/* page is active in this cgroup */

  static int page_cgroup_nid(struct page_cgroup *pc)
  {
  	return page_to_nid(pc->page);
  }
  static enum zone_type page_cgroup_zid(struct page_cgroup *pc)
  {
  	return page_zonenum(pc->page);
  }
  enum charge_type {
  	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
  	MEM_CGROUP_CHARGE_TYPE_MAPPED,
  };
  /*
   * Always modified under the lru lock, so there is no need to preempt_disable()
   */
  static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, int flags,
  					bool charge)
  {
  	int val = (charge)? 1 : -1;
  	struct mem_cgroup_stat *stat = &mem->stat;

  	VM_BUG_ON(!irqs_disabled());
  	if (flags & PAGE_CGROUP_FLAG_CACHE)
  		__mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_CACHE, val);
  	else
  		__mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_RSS, val);
  
  	if (charge)
  		__mem_cgroup_stat_add_safe(stat,
  				MEM_CGROUP_STAT_PGPGIN_COUNT, 1);
  	else
  		__mem_cgroup_stat_add_safe(stat,
  				MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
  }
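  /* Look up the per-zone accounting structure for (memcg, node, zone). */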
  static struct mem_cgroup_per_zone *
  mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
  {
  	return &mem->info.nodeinfo[nid]->zoneinfo[zid];
  }
  static struct mem_cgroup_per_zone *
  page_cgroup_zoneinfo(struct page_cgroup *pc)
  {
  	struct mem_cgroup *mem = pc->mem_cgroup;
  	int nid = page_cgroup_nid(pc);
  	int zid = page_cgroup_zid(pc);

  	return mem_cgroup_zoneinfo(mem, nid, zid);
  }
  
  static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem,
  					enum mem_cgroup_zstat_index idx)
  {
  	int nid, zid;
  	struct mem_cgroup_per_zone *mz;
  	u64 total = 0;
  
  	for_each_online_node(nid)
  		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
  			mz = mem_cgroup_zoneinfo(mem, nid, zid);
  			total += MEM_CGROUP_ZSTAT(mz, idx);
  		}
  	return total;
  }
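  /* Map a cgroup to the mem_cgroup embedded in its subsystem state. */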
  static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
  {
  	return container_of(cgroup_subsys_state(cont,
  				mem_cgroup_subsys_id), struct mem_cgroup,
  				css);
  }
  struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
  {
  	return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
  				struct mem_cgroup, css);
  }
  static inline int page_cgroup_locked(struct page *page)
  {
  	return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
  }
  static void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)
  {
  	VM_BUG_ON(!page_cgroup_locked(page));
  	page->page_cgroup = ((unsigned long)pc | PAGE_CGROUP_LOCK);
  }
  
  struct page_cgroup *page_get_page_cgroup(struct page *page)
  {
  	return (struct page_cgroup *) (page->page_cgroup & ~PAGE_CGROUP_LOCK);
  }
  static void lock_page_cgroup(struct page *page)
  {
  	bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
  }
  static int try_lock_page_cgroup(struct page *page)
  {
  	return bit_spin_trylock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
  }
  static void unlock_page_cgroup(struct page *page)
  {
  	bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
  }
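  /*
   * Per-zone LRU bookkeeping helpers. Callers must hold the zone's
   * mz->lru_lock.
   */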
  static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
  			struct page_cgroup *pc)
  {
  	int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
  
  	if (from)
  		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1;
  	else
  		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1;
  
  	mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, false);
  	list_del_init(&pc->lru);
  }
  static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz,
  				struct page_cgroup *pc)
  {
  	int to = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
  
  	if (!to) {
  		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1;
  		list_add(&pc->lru, &mz->inactive_list);
  	} else {
  		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1;
  		list_add(&pc->lru, &mz->active_list);
  	}
  	mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, true);
  }
  static void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
  {
  	int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
  	struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);
  
  	if (from)
  		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1;
  	else
  		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1;
  	if (active) {
  		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1;
  		pc->flags |= PAGE_CGROUP_FLAG_ACTIVE;
  		list_move(&pc->lru, &mz->active_list);
  	} else {
  		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1;
  		pc->flags &= ~PAGE_CGROUP_FLAG_ACTIVE;
  		list_move(&pc->lru, &mz->inactive_list);
  	}
  }
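  /* Return non-zero if @task's mm is accounted to the given mem_cgroup. */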
  int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
  {
  	int ret;
  
  	task_lock(task);
  	ret = task->mm && mm_match_cgroup(task->mm, mem);
  	task_unlock(task);
  	return ret;
  }
  /*
   * This routine assumes that the appropriate zone's lru lock is already held
   */
  void mem_cgroup_move_lists(struct page *page, bool active)
  {
  	struct page_cgroup *pc;
  	struct mem_cgroup_per_zone *mz;
  	unsigned long flags;
  	/*
  	 * We cannot lock_page_cgroup while holding zone's lru_lock,
  	 * because other holders of lock_page_cgroup can be interrupted
  	 * with an attempt to rotate_reclaimable_page.  But we cannot
  	 * safely get to page_cgroup without it, so just try_lock it:
  	 * mem_cgroup_isolate_pages allows for page left on wrong list.
  	 */
  	if (!try_lock_page_cgroup(page))
  		return;
  	pc = page_get_page_cgroup(page);
  	if (pc) {
  		mz = page_cgroup_zoneinfo(pc);
  		spin_lock_irqsave(&mz->lru_lock, flags);
  		__mem_cgroup_move_lists(pc, active);
  		spin_unlock_irqrestore(&mz->lru_lock, flags);
  	}
  	unlock_page_cgroup(page);
  }
  /*
   * Calculate mapped_ratio under memory controller. This will be used in
   * vmscan.c for determining whether we have to reclaim mapped pages.
   */
  int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem)
  {
  	long total, rss;
  
  	/*
  	 * usage is recorded in bytes. But, here, we assume the number of
  	 * physical pages can be represented by "long" on any arch.
  	 */
  	total = (long) (mem->res.usage >> PAGE_SHIFT) + 1L;
  	rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
  	return (int)((rss * 100L) / total);
  }

  /*
   * This function is called from vmscan.c. In the page reclaim loop, the balance
   * between the active and inactive lists is calculated. For memory controller
   * page reclaim, we should use the mem_cgroup's imbalance rather than the
   * zone's global lru imbalance.
   */
  long mem_cgroup_reclaim_imbalance(struct mem_cgroup *mem)
  {
  	unsigned long active, inactive;
  	/* active and inactive are the number of pages. 'long' is ok.*/
  	active = mem_cgroup_get_all_zonestat(mem, MEM_CGROUP_ZSTAT_ACTIVE);
  	inactive = mem_cgroup_get_all_zonestat(mem, MEM_CGROUP_ZSTAT_INACTIVE);
  	return (long) (active / (inactive + 1));
  }

  /*
   * prev_priority control...this will be used in memory reclaim path.
   */
  int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
  {
  	return mem->prev_priority;
  }
  
  void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority)
  {
  	if (priority < mem->prev_priority)
  		mem->prev_priority = priority;
  }
  
  void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
  {
  	mem->prev_priority = priority;
  }
  /*
   * Calculate # of pages to be scanned in this priority/zone.
   * See also vmscan.c
   *
   * priority starts from "DEF_PRIORITY" and is decremented in each loop.
   * (see include/linux/mmzone.h)
   */
  
  long mem_cgroup_calc_reclaim_active(struct mem_cgroup *mem,
  				   struct zone *zone, int priority)
  {
  	long nr_active;
  	int nid = zone->zone_pgdat->node_id;
  	int zid = zone_idx(zone);
  	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid);
  
  	nr_active = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE);
  	return (nr_active >> priority);
  }
  
  long mem_cgroup_calc_reclaim_inactive(struct mem_cgroup *mem,
  					struct zone *zone, int priority)
  {
  	long nr_inactive;
  	int nid = zone->zone_pgdat->node_id;
  	int zid = zone_idx(zone);
  	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid);
  
  	nr_inactive = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE);
  	return (nr_inactive >> priority);
  }
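  /*
   * Isolate pages for reclaim from this cgroup's per-zone LRU list:
   * up to nr_to_scan entries are examined, isolated pages are moved to
   * @dst, and the number of pages taken is returned.
   */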
  unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
  					struct list_head *dst,
  					unsigned long *scanned, int order,
  					int mode, struct zone *z,
  					struct mem_cgroup *mem_cont,
  					int active)
  {
  	unsigned long nr_taken = 0;
  	struct page *page;
  	unsigned long scan;
  	LIST_HEAD(pc_list);
  	struct list_head *src;
  	struct page_cgroup *pc, *tmp;
  	int nid = z->zone_pgdat->node_id;
  	int zid = zone_idx(z);
  	struct mem_cgroup_per_zone *mz;

  	BUG_ON(!mem_cont);
  	mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
  	if (active)
  		src = &mz->active_list;
  	else
  		src = &mz->inactive_list;

  	spin_lock(&mz->lru_lock);
  	scan = 0;
  	list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
  		if (scan >= nr_to_scan)
  			break;
  		page = pc->page;

  		if (unlikely(!PageLRU(page)))
  			continue;

  		if (PageActive(page) && !active) {
  			__mem_cgroup_move_lists(pc, true);
  			continue;
  		}
  		if (!PageActive(page) && active) {
  			__mem_cgroup_move_lists(pc, false);
  			continue;
  		}
  		scan++;
  		list_move(&pc->lru, &pc_list);
  
  		if (__isolate_lru_page(page, mode) == 0) {
  			list_move(&page->lru, dst);
  			nr_taken++;
  		}
  	}
  
  	list_splice(&pc_list, src);
  	spin_unlock(&mz->lru_lock);
  
  	*scanned = scan;
  	return nr_taken;
  }
  /*
   * Charge the memory controller for page usage.
   * Return
   * 0 if the charge was successful
   * < 0 if the cgroup is over its limit
   */
  static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
  				gfp_t gfp_mask, enum charge_type ctype)
  {
  	struct mem_cgroup *mem;
  	struct page_cgroup *pc;
  	unsigned long flags;
  	unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
  	struct mem_cgroup_per_zone *mz;

  	if (mem_cgroup_subsys.disabled)
  		return 0;
  	/*
  	 * Should page_cgroup's go to their own slab?
  	 * One could optimize the performance of the charging routine
  	 * by saving a bit in the page_flags and using it as a lock
  	 * to see if the cgroup page already has a page_cgroup associated
  	 * with it
  	 */
  retry:
  	lock_page_cgroup(page);
  	pc = page_get_page_cgroup(page);
  	/*
  	 * The page_cgroup exists and
  	 * the page has already been accounted.
  	 */
  	if (pc) {
  		VM_BUG_ON(pc->page != page);
  		VM_BUG_ON(pc->ref_cnt <= 0);
  
  		pc->ref_cnt++;
  		unlock_page_cgroup(page);
  		goto done;
  	}
  	unlock_page_cgroup(page);

  	pc = kmem_cache_zalloc(page_cgroup_cache, gfp_mask);
  	if (pc == NULL)
  		goto err;
  	/*
  	 * We always charge the cgroup the mm_struct belongs to.
  	 * The mm_struct's mem_cgroup changes on task migration if the
  	 * thread group leader migrates. It's possible that mm is not
  	 * set, if so charge the init_mm (happens for pagecache usage).
  	 */
  	if (!mm)
  		mm = &init_mm;
  	rcu_read_lock();
  	mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
  	/*
  	 * For every charge from the cgroup, increment reference count
  	 */
  	css_get(&mem->css);
  	rcu_read_unlock();
  	while (res_counter_charge(&mem->res, PAGE_SIZE)) {
  		if (!(gfp_mask & __GFP_WAIT))
  			goto out;
  
  		if (try_to_free_mem_cgroup_pages(mem, gfp_mask))
  			continue;
  
  		/*
  		 * try_to_free_mem_cgroup_pages() might not give us a full
  		 * picture of reclaim. Some pages are reclaimed and might be
  		 * moved to swap cache or just unmapped from the cgroup.
  		 * Check the limit again to see if the reclaim reduced the
  		 * current usage of the cgroup before giving up
  		 */
  		if (res_counter_check_under_limit(&mem->res))
  			continue;
  
  		if (!nr_retries--) {
  			mem_cgroup_out_of_memory(mem, gfp_mask);
  			goto out;
  		}
  	}
  	pc->ref_cnt = 1;
  	pc->mem_cgroup = mem;
  	pc->page = page;
  	pc->flags = PAGE_CGROUP_FLAG_ACTIVE;
  	if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE)
  		pc->flags = PAGE_CGROUP_FLAG_CACHE;

  	lock_page_cgroup(page);
  	if (page_get_page_cgroup(page)) {
  		unlock_page_cgroup(page);
  		/*
  		 * Another charge has been added to this page already.
  		 * We take lock_page_cgroup(page) again and read
  		 * page->cgroup, increment refcnt.... just retry is OK.
  		 */
  		res_counter_uncharge(&mem->res, PAGE_SIZE);
  		css_put(&mem->css);
  		kmem_cache_free(page_cgroup_cache, pc);
  		goto retry;
  	}
  	page_assign_page_cgroup(page, pc);

  	mz = page_cgroup_zoneinfo(pc);
  	spin_lock_irqsave(&mz->lru_lock, flags);
  	__mem_cgroup_add_list(mz, pc);
  	spin_unlock_irqrestore(&mz->lru_lock, flags);

  	unlock_page_cgroup(page);
  done:
  	return 0;
  out:
  	css_put(&mem->css);
  	kmem_cache_free(page_cgroup_cache, pc);
  err:
  	return -ENOMEM;
  }
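  /* Charge a page to @mm's cgroup, accounted as a mapped (rss) page. */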
  int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
  {
  	return mem_cgroup_charge_common(page, mm, gfp_mask,
  				MEM_CGROUP_CHARGE_TYPE_MAPPED);
  }
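  /* Charge a page-cache page; a NULL @mm is charged to init_mm. */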
  int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
  				gfp_t gfp_mask)
  {
  	if (!mm)
  		mm = &init_mm;
  	return mem_cgroup_charge_common(page, mm, gfp_mask,
  				MEM_CGROUP_CHARGE_TYPE_CACHE);
  }
  
  /*
   * Uncharging is always a welcome operation; we never complain, we simply
   * uncharge.
   */
  void mem_cgroup_uncharge_page(struct page *page)
  {
  	struct page_cgroup *pc;
  	struct mem_cgroup *mem;
  	struct mem_cgroup_per_zone *mz;
  	unsigned long flags;

  	if (mem_cgroup_subsys.disabled)
  		return;
  	/*
  	 * Check if our page_cgroup is valid
  	 */
  	lock_page_cgroup(page);
  	pc = page_get_page_cgroup(page);
  	if (!pc)
  		goto unlock;

  	VM_BUG_ON(pc->page != page);
  	VM_BUG_ON(pc->ref_cnt <= 0);
  
  	if (--(pc->ref_cnt) == 0) {
  		mz = page_cgroup_zoneinfo(pc);
  		spin_lock_irqsave(&mz->lru_lock, flags);
  		__mem_cgroup_remove_list(mz, pc);
  		spin_unlock_irqrestore(&mz->lru_lock, flags);
  		page_assign_page_cgroup(page, NULL);
  		unlock_page_cgroup(page);
  		mem = pc->mem_cgroup;
  		res_counter_uncharge(&mem->res, PAGE_SIZE);
  		css_put(&mem->css);
  		kmem_cache_free(page_cgroup_cache, pc);
  		return;
  	}

  unlock:
  	unlock_page_cgroup(page);
  }
  /*
   * Returns non-zero if a page (under migration) has a valid page_cgroup member.
   * Refcnt of page_cgroup is incremented.
   */
  int mem_cgroup_prepare_migration(struct page *page)
  {
  	struct page_cgroup *pc;

  	if (mem_cgroup_subsys.disabled)
  		return 0;
  	lock_page_cgroup(page);
  	pc = page_get_page_cgroup(page);
  	if (pc)
  		pc->ref_cnt++;
  	unlock_page_cgroup(page);
  	return pc != NULL;
  }
  
  void mem_cgroup_end_migration(struct page *page)
  {
  	mem_cgroup_uncharge_page(page);
  }

  /*
   * We know both *page* and *newpage* are now not-on-LRU and PG_locked.
   * And no race with uncharge() routines because page_cgroup for *page*
   * has one extra reference taken by mem_cgroup_prepare_migration.
   */
  void mem_cgroup_page_migration(struct page *page, struct page *newpage)
  {
  	struct page_cgroup *pc;
  	struct mem_cgroup_per_zone *mz;
  	unsigned long flags;

  	lock_page_cgroup(page);
  	pc = page_get_page_cgroup(page);
  	if (!pc) {
  		unlock_page_cgroup(page);
  		return;
  	}

  	mz = page_cgroup_zoneinfo(pc);
  	spin_lock_irqsave(&mz->lru_lock, flags);
  	__mem_cgroup_remove_list(mz, pc);
  	spin_unlock_irqrestore(&mz->lru_lock, flags);
  	page_assign_page_cgroup(page, NULL);
  	unlock_page_cgroup(page);
  	pc->page = newpage;
  	lock_page_cgroup(newpage);
  	page_assign_page_cgroup(newpage, pc);

  	mz = page_cgroup_zoneinfo(pc);
  	spin_lock_irqsave(&mz->lru_lock, flags);
  	__mem_cgroup_add_list(mz, pc);
  	spin_unlock_irqrestore(&mz->lru_lock, flags);
  
  	unlock_page_cgroup(newpage);
  }

  /*
   * This routine traverses the page_cgroups on the given list and drops them all.
   * This routine ignores page_cgroup->ref_cnt.
   * *And* this routine doesn't reclaim the page itself, it just removes the page_cgroup.
   */
  #define FORCE_UNCHARGE_BATCH	(128)
  static void mem_cgroup_force_empty_list(struct mem_cgroup *mem,
  			    struct mem_cgroup_per_zone *mz,
  			    int active)
  {
  	struct page_cgroup *pc;
  	struct page *page;
  	int count = FORCE_UNCHARGE_BATCH;
  	unsigned long flags;
  	struct list_head *list;
  
  	if (active)
  		list = &mz->active_list;
  	else
  		list = &mz->inactive_list;

  	spin_lock_irqsave(&mz->lru_lock, flags);
  	while (!list_empty(list)) {
  		pc = list_entry(list->prev, struct page_cgroup, lru);
  		page = pc->page;
  		get_page(page);
  		spin_unlock_irqrestore(&mz->lru_lock, flags);
  		mem_cgroup_uncharge_page(page);
  		put_page(page);
  		if (--count <= 0) {
  			count = FORCE_UNCHARGE_BATCH;
  			cond_resched();
  		}
  		spin_lock_irqsave(&mz->lru_lock, flags);
  	}
  	spin_unlock_irqrestore(&mz->lru_lock, flags);
  }
  
  /*
   * make mem_cgroup's charge 0 if there is no task.
   * This enables deleting this mem_cgroup.
   */
  static int mem_cgroup_force_empty(struct mem_cgroup *mem)
  {
  	int ret = -EBUSY;
  	int node, zid;

  	if (mem_cgroup_subsys.disabled)
  		return 0;
  	css_get(&mem->css);
  	/*
  	 * page reclaim code (kswapd etc.) will move pages between
  	 * active_list <-> inactive_list while we don't take a lock.
  	 * So, we have to loop here until all lists are empty.
  	 */
  	while (mem->res.usage > 0) {
  		if (atomic_read(&mem->css.cgroup->count) > 0)
  			goto out;
  		for_each_node_state(node, N_POSSIBLE)
  			for (zid = 0; zid < MAX_NR_ZONES; zid++) {
  				struct mem_cgroup_per_zone *mz;
  				mz = mem_cgroup_zoneinfo(mem, node, zid);
  				/* drop all page_cgroup in active_list */
  				mem_cgroup_force_empty_list(mem, mz, 1);
  				/* drop all page_cgroup in inactive_list */
  				mem_cgroup_force_empty_list(mem, mz, 0);
  			}
  	}
  	ret = 0;
  out:
  	css_put(&mem->css);
  	return ret;
  }
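  /* Parse a user-supplied size string and round it up to a whole page. */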
  static int mem_cgroup_write_strategy(char *buf, unsigned long long *tmp)
  {
  	*tmp = memparse(buf, &buf);
  	if (*buf != '\0')
  		return -EINVAL;
  
  	/*
  	 * Round up the value to the closest page size
  	 */
  	*tmp = ((*tmp + PAGE_SIZE - 1) >> PAGE_SHIFT) << PAGE_SHIFT;
  	return 0;
  }
  static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
  {
  	return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res,
  				    cft->private);
  }
  
  static ssize_t mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
  				struct file *file, const char __user *userbuf,
  				size_t nbytes, loff_t *ppos)
  {
  	return res_counter_write(&mem_cgroup_from_cont(cont)->res,
  				cft->private, userbuf, nbytes, ppos,
  				mem_cgroup_write_strategy);
  }
  static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
  {
  	struct mem_cgroup *mem;
  
  	mem = mem_cgroup_from_cont(cont);
  	switch (event) {
  	case RES_MAX_USAGE:
  		res_counter_reset_max(&mem->res);
  		break;
  	case RES_FAILCNT:
  		res_counter_reset_failcnt(&mem->res);
  		break;
  	}
  	return 0;
  }
  static int mem_force_empty_write(struct cgroup *cont, unsigned int event)
  {
  	return mem_cgroup_force_empty(mem_cgroup_from_cont(cont));
  }
  static const struct mem_cgroup_stat_desc {
  	const char *msg;
  	u64 unit;
  } mem_cgroup_stat_desc[] = {
  	[MEM_CGROUP_STAT_CACHE] = { "cache", PAGE_SIZE, },
  	[MEM_CGROUP_STAT_RSS] = { "rss", PAGE_SIZE, },
  	[MEM_CGROUP_STAT_PGPGIN_COUNT] = {"pgpgin", 1, },
  	[MEM_CGROUP_STAT_PGPGOUT_COUNT] = {"pgpgout", 1, },
  };
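  /* Emit the counters reported through the per-cgroup "stat" control file. */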
  static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
  				 struct cgroup_map_cb *cb)
  {
  	struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
  	struct mem_cgroup_stat *stat = &mem_cont->stat;
  	int i;
  
  	for (i = 0; i < ARRAY_SIZE(stat->cpustat[0].count); i++) {
  		s64 val;
  
  		val = mem_cgroup_read_stat(stat, i);
  		val *= mem_cgroup_stat_desc[i].unit;
  		cb->fill(cb, mem_cgroup_stat_desc[i].msg, val);
  	}
  	/* showing # of active pages */
  	{
  		unsigned long active, inactive;
  
  		inactive = mem_cgroup_get_all_zonestat(mem_cont,
  						MEM_CGROUP_ZSTAT_INACTIVE);
  		active = mem_cgroup_get_all_zonestat(mem_cont,
  						MEM_CGROUP_ZSTAT_ACTIVE);
  		cb->fill(cb, "active", (active) * PAGE_SIZE);
  		cb->fill(cb, "inactive", (inactive) * PAGE_SIZE);
  	}
  	return 0;
  }
  static struct cftype mem_cgroup_files[] = {
  	{
  		.name = "usage_in_bytes",
  		.private = RES_USAGE,
  		.read_u64 = mem_cgroup_read,
  	},
  	{
  		.name = "max_usage_in_bytes",
  		.private = RES_MAX_USAGE,
  		.trigger = mem_cgroup_reset,
  		.read_u64 = mem_cgroup_read,
  	},
  	{
  		.name = "limit_in_bytes",
  		.private = RES_LIMIT,
  		.write = mem_cgroup_write,
  		.read_u64 = mem_cgroup_read,
  	},
  	{
  		.name = "failcnt",
  		.private = RES_FAILCNT,
  		.trigger = mem_cgroup_reset,
  		.read_u64 = mem_cgroup_read,
  	},
  	{
  		.name = "force_empty",
  		.trigger = mem_force_empty_write,
  	},
  	{
  		.name = "stat",
  		.read_map = mem_control_stat_show,
  	},
  };
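  /* Allocate and initialise the per-zone LRU lists and lock for one node. */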
  static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
  {
  	struct mem_cgroup_per_node *pn;
  	struct mem_cgroup_per_zone *mz;
  	int zone, tmp = node;
  	/*
  	 * This routine is called against possible nodes.
  	 * But it's a BUG to call kmalloc() against an offline node.
  	 *
  	 * TODO: this routine can waste much memory for nodes which will
  	 *       never be onlined. It's better to use memory hotplug callback
  	 *       function.
  	 */
  	if (!node_state(node, N_NORMAL_MEMORY))
  		tmp = -1;
  	pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
  	if (!pn)
  		return 1;

  	mem->info.nodeinfo[node] = pn;
  	memset(pn, 0, sizeof(*pn));
  
  	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
  		mz = &pn->zoneinfo[zone];
  		INIT_LIST_HEAD(&mz->active_list);
  		INIT_LIST_HEAD(&mz->inactive_list);
  		spin_lock_init(&mz->lru_lock);
  	}
  	return 0;
  }
  static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
  {
  	kfree(mem->info.nodeinfo[node]);
  }
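  /*
   * struct mem_cgroup can be larger than a page because of its per-cpu
   * statistics, so allocate it with kmalloc() when it fits in a page and
   * fall back to vmalloc() otherwise.
   */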
  static struct mem_cgroup *mem_cgroup_alloc(void)
  {
  	struct mem_cgroup *mem;
  
  	if (sizeof(*mem) < PAGE_SIZE)
  		mem = kmalloc(sizeof(*mem), GFP_KERNEL);
  	else
  		mem = vmalloc(sizeof(*mem));
  
  	if (mem)
  		memset(mem, 0, sizeof(*mem));
  	return mem;
  }
  
  static void mem_cgroup_free(struct mem_cgroup *mem)
  {
  	if (sizeof(*mem) < PAGE_SIZE)
  		kfree(mem);
  	else
  		vfree(mem);
  }
  static struct cgroup_subsys_state *
  mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
  {
  	struct mem_cgroup *mem;
  	int node;

  	if (unlikely((cont->parent) == NULL)) {
  		mem = &init_mem_cgroup;
  		page_cgroup_cache = KMEM_CACHE(page_cgroup, SLAB_PANIC);
  	} else {
  		mem = mem_cgroup_alloc();
  		if (!mem)
  			return ERR_PTR(-ENOMEM);
  	}

  	res_counter_init(&mem->res);

  	for_each_node_state(node, N_POSSIBLE)
  		if (alloc_mem_cgroup_per_zone_info(mem, node))
  			goto free_out;
  	return &mem->css;
  free_out:
  	for_each_node_state(node, N_POSSIBLE)
  		free_mem_cgroup_per_zone_info(mem, node);
  	if (cont->parent != NULL)
  		mem_cgroup_free(mem);
  	return ERR_PTR(-ENOMEM);
  }
  static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
  					struct cgroup *cont)
  {
  	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
  	mem_cgroup_force_empty(mem);
  }
  static void mem_cgroup_destroy(struct cgroup_subsys *ss,
  				struct cgroup *cont)
  {
  	int node;
  	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
  
  	for_each_node_state(node, N_POSSIBLE)
  		free_mem_cgroup_per_zone_info(mem, node);

  	mem_cgroup_free(mem_cgroup_from_cont(cont));
  }
  
  static int mem_cgroup_populate(struct cgroup_subsys *ss,
  				struct cgroup *cont)
  {
  	if (mem_cgroup_subsys.disabled)
  		return 0;
  	return cgroup_add_files(cont, ss, mem_cgroup_files,
  					ARRAY_SIZE(mem_cgroup_files));
  }
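  /* cgroup .attach callback, invoked when a task moves into this cgroup. */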
  static void mem_cgroup_move_task(struct cgroup_subsys *ss,
  				struct cgroup *cont,
  				struct cgroup *old_cont,
  				struct task_struct *p)
  {
  	struct mm_struct *mm;
  	struct mem_cgroup *mem, *old_mem;
  	if (mem_cgroup_subsys.disabled)
  		return;
  	mm = get_task_mm(p);
  	if (mm == NULL)
  		return;
  
  	mem = mem_cgroup_from_cont(cont);
  	old_mem = mem_cgroup_from_cont(old_cont);
  
  	if (mem == old_mem)
  		goto out;
  
  	/*
  	 * Only thread group leaders are allowed to migrate; the mm_struct is
  	 * in effect owned by the leader
  	 */
  	if (!thread_group_leader(p))
  		goto out;
  out:
  	mmput(mm);
  }
  struct cgroup_subsys mem_cgroup_subsys = {
  	.name = "memory",
  	.subsys_id = mem_cgroup_subsys_id,
  	.create = mem_cgroup_create,
  	.pre_destroy = mem_cgroup_pre_destroy,
  	.destroy = mem_cgroup_destroy,
  	.populate = mem_cgroup_populate,
  	.attach = mem_cgroup_move_task,
  	.early_init = 0,
  };