Commit 6b3ae58efca06623c197fd6d91ded4aa3a8fe039
Committed by
Linus Torvalds
1 parent
5564e88ba6
Exists in
master
and in
7 other branches
memcg: remove direct page_cgroup-to-page pointer
In struct page_cgroup, we have a full word for flags but only a few are reserved. Use the remaining upper bits to encode, depending on configuration, the node or the section, to enable page_cgroup-to-page lookups without a direct pointer.

This saves a full word for every page in a system with memory cgroups enabled.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Showing 4 changed files with 117 additions and 55 deletions Side-by-side Diff
include/linux/page_cgroup.h
1 | 1 | #ifndef __LINUX_PAGE_CGROUP_H |
2 | 2 | #define __LINUX_PAGE_CGROUP_H |
3 | 3 | |
4 | +enum { | |
5 | + /* flags for mem_cgroup */ | |
6 | + PCG_LOCK, /* Lock for pc->mem_cgroup and following bits. */ | |
7 | + PCG_CACHE, /* charged as cache */ | |
8 | + PCG_USED, /* this object is in use. */ | |
9 | + PCG_MIGRATION, /* under page migration */ | |
10 | + /* flags for mem_cgroup and file and I/O status */ | |
11 | + PCG_MOVE_LOCK, /* For race between move_account v.s. following bits */ | |
12 | + PCG_FILE_MAPPED, /* page is accounted as "mapped" */ | |
13 | + /* No lock in page_cgroup */ | |
14 | + PCG_ACCT_LRU, /* page has been accounted for (under lru_lock) */ | |
15 | + __NR_PCG_FLAGS, | |
16 | +}; | |
17 | + | |
18 | +#ifndef __GENERATING_BOUNDS_H | |
19 | +#include <generated/bounds.h> | |
20 | + | |
4 | 21 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR |
5 | 22 | #include <linux/bit_spinlock.h> |
23 | + | |
6 | 24 | /* |
7 | 25 | * Page Cgroup can be considered as an extended mem_map. |
8 | 26 | * A page_cgroup page is associated with every page descriptor. The |
... | ... | @@ -13,7 +31,6 @@ |
13 | 31 | struct page_cgroup { |
14 | 32 | unsigned long flags; |
15 | 33 | struct mem_cgroup *mem_cgroup; |
16 | - struct page *page; | |
17 | 34 | struct list_head lru; /* per cgroup LRU list */ |
18 | 35 | }; |
19 | 36 | |
20 | 37 | |
... | ... | @@ -32,20 +49,8 @@ |
32 | 49 | #endif |
33 | 50 | |
34 | 51 | struct page_cgroup *lookup_page_cgroup(struct page *page); |
52 | +struct page *lookup_cgroup_page(struct page_cgroup *pc); | |
35 | 53 | |
36 | -enum { | |
37 | - /* flags for mem_cgroup */ | |
38 | - PCG_LOCK, /* Lock for pc->mem_cgroup and following bits. */ | |
39 | - PCG_CACHE, /* charged as cache */ | |
40 | - PCG_USED, /* this object is in use. */ | |
41 | - PCG_MIGRATION, /* under page migration */ | |
42 | - /* flags for mem_cgroup and file and I/O status */ | |
43 | - PCG_MOVE_LOCK, /* For race between move_account v.s. following bits */ | |
44 | - PCG_FILE_MAPPED, /* page is accounted as "mapped" */ | |
45 | - /* No lock in page_cgroup */ | |
46 | - PCG_ACCT_LRU, /* page has been accounted for (under lru_lock) */ | |
47 | -}; | |
48 | - | |
49 | 54 | #define TESTPCGFLAG(uname, lname) \ |
50 | 55 | static inline int PageCgroup##uname(struct page_cgroup *pc) \ |
51 | 56 | { return test_bit(PCG_##lname, &pc->flags); } |
... | ... | @@ -117,6 +122,39 @@ |
117 | 122 | local_irq_restore(*flags); |
118 | 123 | } |
119 | 124 | |
125 | +#ifdef CONFIG_SPARSEMEM | |
126 | +#define PCG_ARRAYID_WIDTH SECTIONS_SHIFT | |
127 | +#else | |
128 | +#define PCG_ARRAYID_WIDTH NODES_SHIFT | |
129 | +#endif | |
130 | + | |
131 | +#if (PCG_ARRAYID_WIDTH > BITS_PER_LONG - NR_PCG_FLAGS) | |
132 | +#error Not enough space left in pc->flags to store page_cgroup array IDs | |
133 | +#endif | |
134 | + | |
135 | +/* pc->flags: ARRAY-ID | FLAGS */ | |
136 | + | |
137 | +#define PCG_ARRAYID_MASK ((1UL << PCG_ARRAYID_WIDTH) - 1) | |
138 | + | |
139 | +#define PCG_ARRAYID_OFFSET (BITS_PER_LONG - PCG_ARRAYID_WIDTH) | |
140 | +/* | |
141 | + * Zero the shift count for non-existant fields, to prevent compiler | |
142 | + * warnings and ensure references are optimized away. | |
143 | + */ | |
144 | +#define PCG_ARRAYID_SHIFT (PCG_ARRAYID_OFFSET * (PCG_ARRAYID_WIDTH != 0)) | |
145 | + | |
146 | +static inline void set_page_cgroup_array_id(struct page_cgroup *pc, | |
147 | + unsigned long id) | |
148 | +{ | |
149 | + pc->flags &= ~(PCG_ARRAYID_MASK << PCG_ARRAYID_SHIFT); | |
150 | + pc->flags |= (id & PCG_ARRAYID_MASK) << PCG_ARRAYID_SHIFT; | |
151 | +} | |
152 | + | |
153 | +static inline unsigned long page_cgroup_array_id(struct page_cgroup *pc) | |
154 | +{ | |
155 | + return (pc->flags >> PCG_ARRAYID_SHIFT) & PCG_ARRAYID_MASK; | |
156 | +} | |
157 | + | |
120 | 158 | #else /* CONFIG_CGROUP_MEM_RES_CTLR */ |
121 | 159 | struct page_cgroup; |
122 | 160 | |
... | ... | @@ -137,7 +175,7 @@ |
137 | 175 | { |
138 | 176 | } |
139 | 177 | |
140 | -#endif | |
178 | +#endif /* CONFIG_CGROUP_MEM_RES_CTLR */ | |
141 | 179 | |
142 | 180 | #include <linux/swap.h> |
143 | 181 | |
... | ... | @@ -173,6 +211,9 @@ |
173 | 211 | return; |
174 | 212 | } |
175 | 213 | |
176 | -#endif | |
177 | -#endif | |
214 | +#endif /* CONFIG_CGROUP_MEM_RES_CTLR_SWAP */ | |
215 | + | |
216 | +#endif /* !__GENERATING_BOUNDS_H */ | |
217 | + | |
218 | +#endif /* __LINUX_PAGE_CGROUP_H */ |
kernel/bounds.c
... | ... | @@ -9,12 +9,14 @@ |
9 | 9 | #include <linux/page-flags.h> |
10 | 10 | #include <linux/mmzone.h> |
11 | 11 | #include <linux/kbuild.h> |
12 | +#include <linux/page_cgroup.h> | |
12 | 13 | |
13 | 14 | void foo(void) |
14 | 15 | { |
15 | 16 | /* The enum constants to put into include/generated/bounds.h */ |
16 | 17 | DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); |
17 | 18 | DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); |
19 | + DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS); | |
18 | 20 | /* End of constants */ |
19 | 21 | } |
mm/memcontrol.c
... | ... | @@ -1080,7 +1080,7 @@ |
1080 | 1080 | if (unlikely(!PageCgroupUsed(pc))) |
1081 | 1081 | continue; |
1082 | 1082 | |
1083 | - page = pc->page; | |
1083 | + page = lookup_cgroup_page(pc); | |
1084 | 1084 | |
1085 | 1085 | if (unlikely(!PageLRU(page))) |
1086 | 1086 | continue; |
... | ... | @@ -3344,7 +3344,7 @@ |
3344 | 3344 | } |
3345 | 3345 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
3346 | 3346 | |
3347 | - page = pc->page; | |
3347 | + page = lookup_cgroup_page(pc); | |
3348 | 3348 | |
3349 | 3349 | ret = mem_cgroup_move_parent(page, pc, mem, GFP_KERNEL); |
3350 | 3350 | if (ret == -ENOMEM) |
mm/page_cgroup.c
... | ... | @@ -11,12 +11,11 @@ |
11 | 11 | #include <linux/swapops.h> |
12 | 12 | #include <linux/kmemleak.h> |
13 | 13 | |
14 | -static void __meminit | |
15 | -__init_page_cgroup(struct page_cgroup *pc, unsigned long pfn) | |
14 | +static void __meminit init_page_cgroup(struct page_cgroup *pc, unsigned long id) | |
16 | 15 | { |
17 | 16 | pc->flags = 0; |
17 | + set_page_cgroup_array_id(pc, id); | |
18 | 18 | pc->mem_cgroup = NULL; |
19 | - pc->page = pfn_to_page(pfn); | |
20 | 19 | INIT_LIST_HEAD(&pc->lru); |
21 | 20 | } |
22 | 21 | static unsigned long total_usage; |
... | ... | @@ -43,6 +42,19 @@ |
43 | 42 | return base + offset; |
44 | 43 | } |
45 | 44 | |
45 | +struct page *lookup_cgroup_page(struct page_cgroup *pc) | |
46 | +{ | |
47 | + unsigned long pfn; | |
48 | + struct page *page; | |
49 | + pg_data_t *pgdat; | |
50 | + | |
51 | + pgdat = NODE_DATA(page_cgroup_array_id(pc)); | |
52 | + pfn = pc - pgdat->node_page_cgroup + pgdat->node_start_pfn; | |
53 | + page = pfn_to_page(pfn); | |
54 | + VM_BUG_ON(pc != lookup_page_cgroup(page)); | |
55 | + return page; | |
56 | +} | |
57 | + | |
46 | 58 | static int __init alloc_node_page_cgroup(int nid) |
47 | 59 | { |
48 | 60 | struct page_cgroup *base, *pc; |
... | ... | @@ -63,7 +75,7 @@ |
63 | 75 | return -ENOMEM; |
64 | 76 | for (index = 0; index < nr_pages; index++) { |
65 | 77 | pc = base + index; |
66 | - __init_page_cgroup(pc, start_pfn + index); | |
78 | + init_page_cgroup(pc, nid); | |
67 | 79 | } |
68 | 80 | NODE_DATA(nid)->node_page_cgroup = base; |
69 | 81 | total_usage += table_size; |
70 | 82 | |
71 | 83 | |
72 | 84 | |
73 | 85 | |
74 | 86 | |
75 | 87 | |
... | ... | @@ -105,46 +117,53 @@ |
105 | 117 | return section->page_cgroup + pfn; |
106 | 118 | } |
107 | 119 | |
120 | +struct page *lookup_cgroup_page(struct page_cgroup *pc) | |
121 | +{ | |
122 | + struct mem_section *section; | |
123 | + struct page *page; | |
124 | + unsigned long nr; | |
125 | + | |
126 | + nr = page_cgroup_array_id(pc); | |
127 | + section = __nr_to_section(nr); | |
128 | + page = pfn_to_page(pc - section->page_cgroup); | |
129 | + VM_BUG_ON(pc != lookup_page_cgroup(page)); | |
130 | + return page; | |
131 | +} | |
132 | + | |
108 | 133 | /* __alloc_bootmem...() is protected by !slab_available() */ |
109 | 134 | static int __init_refok init_section_page_cgroup(unsigned long pfn) |
110 | 135 | { |
111 | - struct mem_section *section = __pfn_to_section(pfn); | |
112 | 136 | struct page_cgroup *base, *pc; |
137 | + struct mem_section *section; | |
113 | 138 | unsigned long table_size; |
139 | + unsigned long nr; | |
114 | 140 | int nid, index; |
115 | 141 | |
116 | - if (!section->page_cgroup) { | |
117 | - nid = page_to_nid(pfn_to_page(pfn)); | |
118 | - table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; | |
119 | - VM_BUG_ON(!slab_is_available()); | |
120 | - if (node_state(nid, N_HIGH_MEMORY)) { | |
121 | - base = kmalloc_node(table_size, | |
122 | - GFP_KERNEL | __GFP_NOWARN, nid); | |
123 | - if (!base) | |
124 | - base = vmalloc_node(table_size, nid); | |
125 | - } else { | |
126 | - base = kmalloc(table_size, GFP_KERNEL | __GFP_NOWARN); | |
127 | - if (!base) | |
128 | - base = vmalloc(table_size); | |
129 | - } | |
130 | - /* | |
131 | - * The value stored in section->page_cgroup is (base - pfn) | |
132 | - * and it does not point to the memory block allocated above, | |
133 | - * causing kmemleak false positives. | |
134 | - */ | |
135 | - kmemleak_not_leak(base); | |
142 | + nr = pfn_to_section_nr(pfn); | |
143 | + section = __nr_to_section(nr); | |
144 | + | |
145 | + if (section->page_cgroup) | |
146 | + return 0; | |
147 | + | |
148 | + nid = page_to_nid(pfn_to_page(pfn)); | |
149 | + table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; | |
150 | + VM_BUG_ON(!slab_is_available()); | |
151 | + if (node_state(nid, N_HIGH_MEMORY)) { | |
152 | + base = kmalloc_node(table_size, | |
153 | + GFP_KERNEL | __GFP_NOWARN, nid); | |
154 | + if (!base) | |
155 | + base = vmalloc_node(table_size, nid); | |
136 | 156 | } else { |
137 | - /* | |
138 | - * We don't have to allocate page_cgroup again, but | |
139 | - * address of memmap may be changed. So, we have to initialize | |
140 | - * again. | |
141 | - */ | |
142 | - base = section->page_cgroup + pfn; | |
143 | - table_size = 0; | |
144 | - /* check address of memmap is changed or not. */ | |
145 | - if (base->page == pfn_to_page(pfn)) | |
146 | - return 0; | |
157 | + base = kmalloc(table_size, GFP_KERNEL | __GFP_NOWARN); | |
158 | + if (!base) | |
159 | + base = vmalloc(table_size); | |
147 | 160 | } |
161 | + /* | |
162 | + * The value stored in section->page_cgroup is (base - pfn) | |
163 | + * and it does not point to the memory block allocated above, | |
164 | + * causing kmemleak false positives. | |
165 | + */ | |
166 | + kmemleak_not_leak(base); | |
148 | 167 | |
149 | 168 | if (!base) { |
150 | 169 | printk(KERN_ERR "page cgroup allocation failure\n"); |
... | ... | @@ -153,7 +172,7 @@ |
153 | 172 | |
154 | 173 | for (index = 0; index < PAGES_PER_SECTION; index++) { |
155 | 174 | pc = base + index; |
156 | - __init_page_cgroup(pc, pfn + index); | |
175 | + init_page_cgroup(pc, nr); | |
157 | 176 | } |
158 | 177 | |
159 | 178 | section->page_cgroup = base - pfn; |