Commit 4c21e2f2441dc5fbb957b030333f5a3f2d02dea7

Authored by Hugh Dickins
Committed by Linus Torvalds
1 parent b38c6845b6

[PATCH] mm: split page table lock

Christoph Lameter demonstrated very poor scalability on the SGI 512-way, with
a many-threaded application which concurrently initializes different parts of
a large anonymous area.

This patch corrects that, by using a separate spinlock per page table page, to
guard the page table entries in that page, instead of using the mm's single
page_table_lock.  (But even then, page_table_lock is still used to guard page
table allocation, and anon_vma allocation.)
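
For illustration, the new locking pattern for code that walks page tables
looks like this (a simplified sketch based on the pte_lockptr() and
pte_offset_map_lock() hunks later in this diff; the function name here is
made up):

  /* Sketch only: touch a pte under the per-page-table-page lock. */
  static void example_touch_pte(struct mm_struct *mm, pmd_t *pmd,
                                unsigned long address)
  {
          spinlock_t *ptl;
          pte_t *pte;

          /* Previously this would be spin_lock(&mm->page_table_lock). */
          pte = pte_offset_map_lock(mm, pmd, address, &ptl);
          /* ... examine or modify *pte under that page's lock ... */
          pte_unmap_unlock(pte, ptl);
  }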

In this implementation, the spinlock is tucked inside the struct page of the
page table page: with a BUILD_BUG_ON in case it overflows - which it would in
the case of 32-bit PA-RISC with spinlock debugging enabled.
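
BUILD_BUG_ON() turns that size assumption into a compile-time failure rather
than a silent overflow.  A minimal sketch of the idea (the macro is the
include/linux/kernel.h definition of this era; the condition and the function
holding it are only illustrative, not the exact check added by this patch):

  /* If "condition" is true, the array has negative size: compile error. */
  #define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))

  static void example_ptlock_check(void)  /* hypothetical placement */
  {
          /* e.g. a debug spinlock_t wider than the unsigned long it shares
           * a union slot with would grow struct page, as on 32-bit PA-RISC
           * with spinlock debugging enabled. */
          BUILD_BUG_ON(sizeof(spinlock_t) > sizeof(unsigned long));
  }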

Splitting the lock is not quite for free: another cacheline access.  Ideally,
I suppose we would use split ptlock only for multi-threaded processes on
multi-cpu machines; but deciding that dynamically would have its own costs.
So for now enable it by config, at some number of cpus: since the Kconfig
language doesn't support inequalities, let the preprocessor compare that with
NR_CPUS.  But I don't think it's worth being user-configurable: for good
testing of both split and unsplit configs, split now at 4 cpus, and perhaps
change that to 8 later.
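
Concretely the pairing looks like this (both halves appear later in this
diff, in the Kconfig hunk and in the include/linux/mm.h hunk): Kconfig only
supplies a number, and the preprocessor does the comparison Kconfig cannot
express.

  /*
   * mm/Kconfig sets CONFIG_SPLIT_PTLOCK_CPUS to a plain number: "4" by
   * default, or "4096" to effectively disable splitting on ARM without
   * CPU_CACHE_VIPT and on 32-bit PA-RISC with DEBUG_SPINLOCK.  The
   * preprocessor then chooses which lock pte_lockptr() hands back.
   */
  #if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
  #define pte_lockptr(mm, pmd)  ({(void)(mm); __pte_lockptr(pmd_page(*(pmd)));})
  #else
  #define pte_lockptr(mm, pmd)  ({(void)(pmd); &(mm)->page_table_lock;})
  #endif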

There is a benefit even for singly threaded processes: kswapd can be attacking
one part of the mm while another part is busy faulting.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

Showing 23 changed files with 138 additions and 79 deletions

arch/arm/mm/mm-armv.c
... ... @@ -229,6 +229,7 @@
229 229 pte = pmd_page(*pmd);
230 230 pmd_clear(pmd);
231 231 dec_page_state(nr_page_table_pages);
  232 + pte_lock_deinit(pte);
232 233 pte_free(pte);
233 234 pmd_free(pmd);
234 235 free:
arch/frv/mm/pgalloc.c
... ... @@ -87,14 +87,14 @@
87 87 if (pgd_list)
88 88 pgd_list->private = (unsigned long) &page->index;
89 89 pgd_list = page;
90   - page->private = (unsigned long) &pgd_list;
  90 + set_page_private(page, (unsigned long)&pgd_list);
91 91 }
92 92  
93 93 static inline void pgd_list_del(pgd_t *pgd)
94 94 {
95 95 struct page *next, **pprev, *page = virt_to_page(pgd);
96 96 next = (struct page *) page->index;
97   - pprev = (struct page **) page->private;
  97 + pprev = (struct page **)page_private(page);
98 98 *pprev = next;
99 99 if (next)
100 100 next->private = (unsigned long) pprev;
arch/i386/mm/pgtable.c
... ... @@ -188,19 +188,19 @@
188 188 struct page *page = virt_to_page(pgd);
189 189 page->index = (unsigned long)pgd_list;
190 190 if (pgd_list)
191   - pgd_list->private = (unsigned long)&page->index;
  191 + set_page_private(pgd_list, (unsigned long)&page->index);
192 192 pgd_list = page;
193   - page->private = (unsigned long)&pgd_list;
  193 + set_page_private(page, (unsigned long)&pgd_list);
194 194 }
195 195  
196 196 static inline void pgd_list_del(pgd_t *pgd)
197 197 {
198 198 struct page *next, **pprev, *page = virt_to_page(pgd);
199 199 next = (struct page *)page->index;
200   - pprev = (struct page **)page->private;
  200 + pprev = (struct page **)page_private(page);
201 201 *pprev = next;
202 202 if (next)
203   - next->private = (unsigned long)pprev;
  203 + set_page_private(next, (unsigned long)pprev);
204 204 }
205 205  
206 206 void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused)
arch/um/kernel/skas/mmu.c
... ... @@ -144,6 +144,7 @@
144 144  
145 145 if(!proc_mm || !ptrace_faultinfo){
146 146 free_page(mmu->id.stack);
  147 + pte_lock_deinit(virt_to_page(mmu->last_page_table));
147 148 pte_free_kernel((pte_t *) mmu->last_page_table);
148 149 dec_page_state(nr_page_table_pages);
149 150 #ifdef CONFIG_3_LEVEL_PGTABLES
fs/afs/file.c
... ... @@ -291,8 +291,8 @@
291 291 cachefs_uncache_page(vnode->cache, page);
292 292 #endif
293 293  
294   - pageio = (struct cachefs_page *) page->private;
295   - page->private = 0;
  294 + pageio = (struct cachefs_page *) page_private(page);
  295 + set_page_private(page, 0);
296 296 ClearPagePrivate(page);
297 297  
298 298 if (pageio)
fs/buffer.c
... ... @@ -96,7 +96,7 @@
96 96 __clear_page_buffers(struct page *page)
97 97 {
98 98 ClearPagePrivate(page);
99   - page->private = 0;
  99 + set_page_private(page, 0);
100 100 page_cache_release(page);
101 101 }
102 102  
fs/jfs/jfs_metapage.c
... ... @@ -86,7 +86,7 @@
86 86 atomic_t io_count;
87 87 struct metapage *mp[MPS_PER_PAGE];
88 88 };
89   -#define mp_anchor(page) ((struct meta_anchor *)page->private)
  89 +#define mp_anchor(page) ((struct meta_anchor *)page_private(page))
90 90  
91 91 static inline struct metapage *page_to_mp(struct page *page, uint offset)
92 92 {
... ... @@ -108,7 +108,7 @@
108 108 if (!a)
109 109 return -ENOMEM;
110 110 memset(a, 0, sizeof(struct meta_anchor));
111   - page->private = (unsigned long)a;
  111 + set_page_private(page, (unsigned long)a);
112 112 SetPagePrivate(page);
113 113 kmap(page);
114 114 }
... ... @@ -136,7 +136,7 @@
136 136 a->mp[index] = NULL;
137 137 if (--a->mp_count == 0) {
138 138 kfree(a);
139   - page->private = 0;
  139 + set_page_private(page, 0);
140 140 ClearPagePrivate(page);
141 141 kunmap(page);
142 142 }
143 143  
... ... @@ -156,13 +156,13 @@
156 156 #else
157 157 static inline struct metapage *page_to_mp(struct page *page, uint offset)
158 158 {
159   - return PagePrivate(page) ? (struct metapage *)page->private : NULL;
  159 + return PagePrivate(page) ? (struct metapage *)page_private(page) : NULL;
160 160 }
161 161  
162 162 static inline int insert_metapage(struct page *page, struct metapage *mp)
163 163 {
164 164 if (mp) {
165   - page->private = (unsigned long)mp;
  165 + set_page_private(page, (unsigned long)mp);
166 166 SetPagePrivate(page);
167 167 kmap(page);
168 168 }
... ... @@ -171,7 +171,7 @@
171 171  
172 172 static inline void remove_metapage(struct page *page, struct metapage *mp)
173 173 {
174   - page->private = 0;
  174 + set_page_private(page, 0);
175 175 ClearPagePrivate(page);
176 176 kunmap(page);
177 177 }
fs/xfs/linux-2.6/xfs_buf.c
... ... @@ -181,8 +181,9 @@
181 181 size_t offset,
182 182 size_t length)
183 183 {
184   - page->private |= page_region_mask(offset, length);
185   - if (page->private == ~0UL)
  184 + set_page_private(page,
  185 + page_private(page) | page_region_mask(offset, length));
  186 + if (page_private(page) == ~0UL)
186 187 SetPageUptodate(page);
187 188 }
188 189  
... ... @@ -194,7 +195,7 @@
194 195 {
195 196 unsigned long mask = page_region_mask(offset, length);
196 197  
197   - return (mask && (page->private & mask) == mask);
  198 + return (mask && (page_private(page) & mask) == mask);
198 199 }
199 200  
200 201 /*
include/linux/buffer_head.h
... ... @@ -126,8 +126,8 @@
126 126 /* If we *know* page->private refers to buffer_heads */
127 127 #define page_buffers(page) \
128 128 ({ \
129   - BUG_ON(!PagePrivate(page)); \
130   - ((struct buffer_head *)(page)->private); \
  129 + BUG_ON(!PagePrivate(page)); \
  130 + ((struct buffer_head *)page_private(page)); \
131 131 })
132 132 #define page_has_buffers(page) PagePrivate(page)
133 133  
... ... @@ -219,7 +219,7 @@
219 219 {
220 220 page_cache_get(page);
221 221 SetPagePrivate(page);
222   - page->private = (unsigned long)head;
  222 + set_page_private(page, (unsigned long)head);
223 223 }
224 224  
225 225 static inline void get_bh(struct buffer_head *bh)
include/linux/mm.h
... ... @@ -226,13 +226,18 @@
226 226 * to show when page is mapped
227 227 * & limit reverse map searches.
228 228 */
229   - unsigned long private; /* Mapping-private opaque data:
  229 + union {
  230 + unsigned long private; /* Mapping-private opaque data:
230 231 * usually used for buffer_heads
231 232 * if PagePrivate set; used for
232 233 * swp_entry_t if PageSwapCache
233 234 * When page is free, this indicates
234 235 * order in the buddy system.
235 236 */
  237 +#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
  238 + spinlock_t ptl;
  239 +#endif
  240 + } u;
236 241 struct address_space *mapping; /* If low bit clear, points to
237 242 * inode address_space, or NULL.
238 243 * If page mapped as anonymous
... ... @@ -260,6 +265,9 @@
260 265 #endif /* WANT_PAGE_VIRTUAL */
261 266 };
262 267  
  268 +#define page_private(page) ((page)->u.private)
  269 +#define set_page_private(page, v) ((page)->u.private = (v))
  270 +
263 271 /*
264 272 * FIXME: take this include out, include page-flags.h in
265 273 * files which need it (119 of them)
266 274  
267 275  
... ... @@ -311,17 +319,17 @@
311 319  
312 320 #ifdef CONFIG_HUGETLB_PAGE
313 321  
314   -static inline int page_count(struct page *p)
  322 +static inline int page_count(struct page *page)
315 323 {
316   - if (PageCompound(p))
317   - p = (struct page *)p->private;
318   - return atomic_read(&(p)->_count) + 1;
  324 + if (PageCompound(page))
  325 + page = (struct page *)page_private(page);
  326 + return atomic_read(&page->_count) + 1;
319 327 }
320 328  
321 329 static inline void get_page(struct page *page)
322 330 {
323 331 if (unlikely(PageCompound(page)))
324   - page = (struct page *)page->private;
  332 + page = (struct page *)page_private(page);
325 333 atomic_inc(&page->_count);
326 334 }
327 335  
... ... @@ -587,7 +595,7 @@
587 595 static inline pgoff_t page_index(struct page *page)
588 596 {
589 597 if (unlikely(PageSwapCache(page)))
590   - return page->private;
  598 + return page_private(page);
591 599 return page->index;
592 600 }
593 601  
594 602  
... ... @@ -779,9 +787,31 @@
779 787 }
780 788 #endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */
781 789  
  790 +#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
  791 +/*
  792 + * We tuck a spinlock to guard each pagetable page into its struct page,
  793 + * at page->private, with BUILD_BUG_ON to make sure that this will not
  794 + * overflow into the next struct page (as it might with DEBUG_SPINLOCK).
  795 + * When freeing, reset page->mapping so free_pages_check won't complain.
  796 + */
  797 +#define __pte_lockptr(page) &((page)->u.ptl)
  798 +#define pte_lock_init(_page) do { \
  799 + spin_lock_init(__pte_lockptr(_page)); \
  800 +} while (0)
  801 +#define pte_lock_deinit(page) ((page)->mapping = NULL)
  802 +#define pte_lockptr(mm, pmd) ({(void)(mm); __pte_lockptr(pmd_page(*(pmd)));})
  803 +#else
  804 +/*
  805 + * We use mm->page_table_lock to guard all pagetable pages of the mm.
  806 + */
  807 +#define pte_lock_init(page) do {} while (0)
  808 +#define pte_lock_deinit(page) do {} while (0)
  809 +#define pte_lockptr(mm, pmd) ({(void)(pmd); &(mm)->page_table_lock;})
  810 +#endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */
  811 +
782 812 #define pte_offset_map_lock(mm, pmd, address, ptlp) \
783 813 ({ \
784   - spinlock_t *__ptl = &(mm)->page_table_lock; \
  814 + spinlock_t *__ptl = pte_lockptr(mm, pmd); \
785 815 pte_t *__pte = pte_offset_map(pmd, address); \
786 816 *(ptlp) = __ptl; \
787 817 spin_lock(__ptl); \
kernel/kexec.c
... ... @@ -334,7 +334,7 @@
334 334 if (pages) {
335 335 unsigned int count, i;
336 336 pages->mapping = NULL;
337   - pages->private = order;
  337 + set_page_private(pages, order);
338 338 count = 1 << order;
339 339 for (i = 0; i < count; i++)
340 340 SetPageReserved(pages + i);
... ... @@ -347,7 +347,7 @@
347 347 {
348 348 unsigned int order, count, i;
349 349  
350   - order = page->private;
  350 + order = page_private(page);
351 351 count = 1 << order;
352 352 for (i = 0; i < count; i++)
353 353 ClearPageReserved(page + i);
mm/Kconfig
... ... @@ -111,4 +111,17 @@
111 111 config SPARSEMEM_EXTREME
112 112 def_bool y
113 113 depends on SPARSEMEM && !SPARSEMEM_STATIC
  114 +
  115 +# Heavily threaded applications may benefit from splitting the mm-wide
  116 +# page_table_lock, so that faults on different parts of the user address
  117 +# space can be handled with less contention: split it at this NR_CPUS.
  118 +# Default to 4 for wider testing, though 8 might be more appropriate.
  119 +# ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock.
  120 +# PA-RISC's debug spinlock_t is too large for the 32-bit struct page.
  121 +#
  122 +config SPLIT_PTLOCK_CPUS
  123 + int
  124 + default "4096" if ARM && !CPU_CACHE_VIPT
  125 + default "4096" if PARISC && DEBUG_SPINLOCK && !64BIT
  126 + default "4"
mm/filemap.c
... ... @@ -152,7 +152,7 @@
152 152 * in the ->sync_page() methods make essential use of the
153 153 * page_mapping(), merely passing the page down to the backing
154 154 * device's unplug functions when it's non-NULL, which in turn
155   - * ignore it for all cases but swap, where only page->private is
  155 + * ignore it for all cases but swap, where only page_private(page) is
156 156 * of interest. When page_mapping() does go NULL, the entire
157 157 * call stack gracefully ignores the page and returns.
158 158 * -- wli
mm/memory.c
... ... @@ -114,6 +114,7 @@
114 114 {
115 115 struct page *page = pmd_page(*pmd);
116 116 pmd_clear(pmd);
  117 + pte_lock_deinit(page);
117 118 pte_free_tlb(tlb, page);
118 119 dec_page_state(nr_page_table_pages);
119 120 tlb->mm->nr_ptes--;
120 121  
121 122  
... ... @@ -294,10 +295,12 @@
294 295 if (!new)
295 296 return -ENOMEM;
296 297  
  298 + pte_lock_init(new);
297 299 spin_lock(&mm->page_table_lock);
298   - if (pmd_present(*pmd)) /* Another has populated it */
  300 + if (pmd_present(*pmd)) { /* Another has populated it */
  301 + pte_lock_deinit(new);
299 302 pte_free(new);
300   - else {
  303 + } else {
301 304 mm->nr_ptes++;
302 305 inc_page_state(nr_page_table_pages);
303 306 pmd_populate(mm, pmd, new);
... ... @@ -432,7 +435,7 @@
432 435 if (!dst_pte)
433 436 return -ENOMEM;
434 437 src_pte = pte_offset_map_nested(src_pmd, addr);
435   - src_ptl = &src_mm->page_table_lock;
  438 + src_ptl = pte_lockptr(src_mm, src_pmd);
436 439 spin_lock(src_ptl);
437 440  
438 441 do {
439 442  
440 443  
... ... @@ -1194,15 +1197,16 @@
1194 1197 * (but do_wp_page is only called after already making such a check;
1195 1198 * and do_anonymous_page and do_no_page can safely check later on).
1196 1199 */
1197   -static inline int pte_unmap_same(struct mm_struct *mm,
  1200 +static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
1198 1201 pte_t *page_table, pte_t orig_pte)
1199 1202 {
1200 1203 int same = 1;
1201 1204 #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
1202 1205 if (sizeof(pte_t) > sizeof(unsigned long)) {
1203   - spin_lock(&mm->page_table_lock);
  1206 + spinlock_t *ptl = pte_lockptr(mm, pmd);
  1207 + spin_lock(ptl);
1204 1208 same = pte_same(*page_table, orig_pte);
1205   - spin_unlock(&mm->page_table_lock);
  1209 + spin_unlock(ptl);
1206 1210 }
1207 1211 #endif
1208 1212 pte_unmap(page_table);
... ... @@ -1655,7 +1659,7 @@
1655 1659 pte_t pte;
1656 1660 int ret = VM_FAULT_MINOR;
1657 1661  
1658   - if (!pte_unmap_same(mm, page_table, orig_pte))
  1662 + if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
1659 1663 goto out;
1660 1664  
1661 1665 entry = pte_to_swp_entry(orig_pte);
... ... @@ -1773,7 +1777,7 @@
1773 1777 page_cache_get(page);
1774 1778 entry = mk_pte(page, vma->vm_page_prot);
1775 1779  
1776   - ptl = &mm->page_table_lock;
  1780 + ptl = pte_lockptr(mm, pmd);
1777 1781 spin_lock(ptl);
1778 1782 if (!pte_none(*page_table))
1779 1783 goto release;
... ... @@ -1934,7 +1938,7 @@
1934 1938 pgoff_t pgoff;
1935 1939 int err;
1936 1940  
1937   - if (!pte_unmap_same(mm, page_table, orig_pte))
  1941 + if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
1938 1942 return VM_FAULT_MINOR;
1939 1943  
1940 1944 if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
... ... @@ -1992,7 +1996,7 @@
1992 1996 pte, pmd, write_access, entry);
1993 1997 }
1994 1998  
1995   - ptl = &mm->page_table_lock;
  1999 + ptl = pte_lockptr(mm, pmd);
1996 2000 spin_lock(ptl);
1997 2001 if (unlikely(!pte_same(*pte, entry)))
1998 2002 goto unlock;
mm/mremap.c
... ... @@ -72,7 +72,7 @@
72 72 struct address_space *mapping = NULL;
73 73 struct mm_struct *mm = vma->vm_mm;
74 74 pte_t *old_pte, *new_pte, pte;
75   - spinlock_t *old_ptl;
  75 + spinlock_t *old_ptl, *new_ptl;
76 76  
77 77 if (vma->vm_file) {
78 78 /*
79 79  
... ... @@ -88,8 +88,15 @@
88 88 new_vma->vm_truncate_count = 0;
89 89 }
90 90  
  91 + /*
  92 + * We don't have to worry about the ordering of src and dst
  93 + * pte locks because exclusive mmap_sem prevents deadlock.
  94 + */
91 95 old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
92 96 new_pte = pte_offset_map_nested(new_pmd, new_addr);
  97 + new_ptl = pte_lockptr(mm, new_pmd);
  98 + if (new_ptl != old_ptl)
  99 + spin_lock(new_ptl);
93 100  
94 101 for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
95 102 new_pte++, new_addr += PAGE_SIZE) {
... ... @@ -101,6 +108,8 @@
101 108 set_pte_at(mm, new_addr, new_pte, pte);
102 109 }
103 110  
  111 + if (new_ptl != old_ptl)
  112 + spin_unlock(new_ptl);
104 113 pte_unmap_nested(new_pte - 1);
105 114 pte_unmap_unlock(old_pte - 1, old_ptl);
106 115 if (mapping)
mm/page_alloc.c
... ... @@ -154,7 +154,7 @@
154 154 struct page *p = page + i;
155 155  
156 156 SetPageCompound(p);
157   - p->private = (unsigned long)page;
  157 + set_page_private(p, (unsigned long)page);
158 158 }
159 159 }
160 160  
... ... @@ -174,7 +174,7 @@
174 174  
175 175 if (!PageCompound(p))
176 176 bad_page(__FUNCTION__, page);
177   - if (p->private != (unsigned long)page)
  177 + if (page_private(p) != (unsigned long)page)
178 178 bad_page(__FUNCTION__, page);
179 179 ClearPageCompound(p);
180 180 }
181 181  
182 182  
... ... @@ -187,18 +187,18 @@
187 187 * So, we don't need atomic page->flags operations here.
188 188 */
189 189 static inline unsigned long page_order(struct page *page) {
190   - return page->private;
  190 + return page_private(page);
191 191 }
192 192  
193 193 static inline void set_page_order(struct page *page, int order) {
194   - page->private = order;
  194 + set_page_private(page, order);
195 195 __SetPagePrivate(page);
196 196 }
197 197  
198 198 static inline void rmv_page_order(struct page *page)
199 199 {
200 200 __ClearPagePrivate(page);
201   - page->private = 0;
  201 + set_page_private(page, 0);
202 202 }
203 203  
204 204 /*
... ... @@ -238,7 +238,7 @@
238 238 * (a) the buddy is free &&
239 239 * (b) the buddy is on the buddy system &&
240 240 * (c) a page and its buddy have the same order.
241   - * for recording page's order, we use page->private and PG_private.
  241 + * for recording page's order, we use page_private(page) and PG_private.
242 242 *
243 243 */
244 244 static inline int page_is_buddy(struct page *page, int order)
... ... @@ -264,7 +264,7 @@
264 264 * parts of the VM system.
265 265 * At each level, we keep a list of pages, which are heads of continuous
266 266 * free pages of length of (1 << order) and marked with PG_Private.Page's
267   - * order is recorded in page->private field.
  267 + * order is recorded in page_private(page) field.
268 268 * So when we are allocating or freeing one, we can derive the state of the
269 269 * other. That is, if we allocate a small block, and both were
270 270 * free, the remainder of the region must be split into blocks.
... ... @@ -463,7 +463,7 @@
463 463 page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
464 464 1 << PG_referenced | 1 << PG_arch_1 |
465 465 1 << PG_checked | 1 << PG_mappedtodisk);
466   - page->private = 0;
  466 + set_page_private(page, 0);
467 467 set_page_refs(page, order);
468 468 kernel_map_pages(page, 1 << order, 1);
469 469 }
mm/page_io.c
... ... @@ -91,7 +91,8 @@
91 91 unlock_page(page);
92 92 goto out;
93 93 }
94   - bio = get_swap_bio(GFP_NOIO, page->private, page, end_swap_bio_write);
  94 + bio = get_swap_bio(GFP_NOIO, page_private(page), page,
  95 + end_swap_bio_write);
95 96 if (bio == NULL) {
96 97 set_page_dirty(page);
97 98 unlock_page(page);
... ... @@ -115,7 +116,8 @@
115 116  
116 117 BUG_ON(!PageLocked(page));
117 118 ClearPageUptodate(page);
118   - bio = get_swap_bio(GFP_KERNEL, page->private, page, end_swap_bio_read);
  119 + bio = get_swap_bio(GFP_KERNEL, page_private(page), page,
  120 + end_swap_bio_read);
119 121 if (bio == NULL) {
120 122 unlock_page(page);
121 123 ret = -ENOMEM;
mm/rmap.c
... ... @@ -274,7 +274,7 @@
274 274 return NULL;
275 275 }
276 276  
277   - ptl = &mm->page_table_lock;
  277 + ptl = pte_lockptr(mm, pmd);
278 278 spin_lock(ptl);
279 279 if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) {
280 280 *ptlp = ptl;
... ... @@ -550,7 +550,7 @@
550 550 update_hiwater_rss(mm);
551 551  
552 552 if (PageAnon(page)) {
553   - swp_entry_t entry = { .val = page->private };
  553 + swp_entry_t entry = { .val = page_private(page) };
554 554 /*
555 555 * Store the swap location in the pte.
556 556 * See handle_pte_fault() ...
mm/shmem.c
... ... @@ -71,9 +71,6 @@
71 71 /* Pretend that each entry is of this size in directory's i_size */
72 72 #define BOGO_DIRENT_SIZE 20
73 73  
74   -/* Keep swapped page count in private field of indirect struct page */
75   -#define nr_swapped private
76   -
77 74 /* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */
78 75 enum sgp_type {
79 76 SGP_QUICK, /* don't try more than file page cache lookup */
... ... @@ -324,8 +321,10 @@
324 321  
325 322 entry->val = value;
326 323 info->swapped += incdec;
327   - if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT)
328   - kmap_atomic_to_page(entry)->nr_swapped += incdec;
  324 + if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT) {
  325 + struct page *page = kmap_atomic_to_page(entry);
  326 + set_page_private(page, page_private(page) + incdec);
  327 + }
329 328 }
330 329  
331 330 /*
... ... @@ -368,9 +367,8 @@
368 367  
369 368 spin_unlock(&info->lock);
370 369 page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping) | __GFP_ZERO);
371   - if (page) {
372   - page->nr_swapped = 0;
373   - }
  370 + if (page)
  371 + set_page_private(page, 0);
374 372 spin_lock(&info->lock);
375 373  
376 374 if (!page) {
... ... @@ -561,7 +559,7 @@
561 559 diroff = 0;
562 560 }
563 561 subdir = dir[diroff];
564   - if (subdir && subdir->nr_swapped) {
  562 + if (subdir && page_private(subdir)) {
565 563 size = limit - idx;
566 564 if (size > ENTRIES_PER_PAGE)
567 565 size = ENTRIES_PER_PAGE;
568 566  
... ... @@ -572,10 +570,10 @@
572 570 nr_swaps_freed += freed;
573 571 if (offset)
574 572 spin_lock(&info->lock);
575   - subdir->nr_swapped -= freed;
  573 + set_page_private(subdir, page_private(subdir) - freed);
576 574 if (offset)
577 575 spin_unlock(&info->lock);
578   - BUG_ON(subdir->nr_swapped > offset);
  576 + BUG_ON(page_private(subdir) > offset);
579 577 }
580 578 if (offset)
581 579 offset = 0;
... ... @@ -743,7 +741,7 @@
743 741 dir = shmem_dir_map(subdir);
744 742 }
745 743 subdir = *dir;
746   - if (subdir && subdir->nr_swapped) {
  744 + if (subdir && page_private(subdir)) {
747 745 ptr = shmem_swp_map(subdir);
748 746 size = limit - idx;
749 747 if (size > ENTRIES_PER_PAGE)
mm/swap.c
... ... @@ -39,7 +39,7 @@
39 39 void put_page(struct page *page)
40 40 {
41 41 if (unlikely(PageCompound(page))) {
42   - page = (struct page *)page->private;
  42 + page = (struct page *)page_private(page);
43 43 if (put_page_testzero(page)) {
44 44 void (*dtor)(struct page *page);
45 45  
mm/swap_state.c
... ... @@ -83,7 +83,7 @@
83 83 page_cache_get(page);
84 84 SetPageLocked(page);
85 85 SetPageSwapCache(page);
86   - page->private = entry.val;
  86 + set_page_private(page, entry.val);
87 87 total_swapcache_pages++;
88 88 pagecache_acct(1);
89 89 }
... ... @@ -126,8 +126,8 @@
126 126 BUG_ON(PageWriteback(page));
127 127 BUG_ON(PagePrivate(page));
128 128  
129   - radix_tree_delete(&swapper_space.page_tree, page->private);
130   - page->private = 0;
  129 + radix_tree_delete(&swapper_space.page_tree, page_private(page));
  130 + set_page_private(page, 0);
131 131 ClearPageSwapCache(page);
132 132 total_swapcache_pages--;
133 133 pagecache_acct(-1);
... ... @@ -197,7 +197,7 @@
197 197 {
198 198 swp_entry_t entry;
199 199  
200   - entry.val = page->private;
  200 + entry.val = page_private(page);
201 201  
202 202 write_lock_irq(&swapper_space.tree_lock);
203 203 __delete_from_swap_cache(page);
mm/swapfile.c
... ... @@ -61,7 +61,7 @@
61 61 swp_entry_t entry;
62 62  
63 63 down_read(&swap_unplug_sem);
64   - entry.val = page->private;
  64 + entry.val = page_private(page);
65 65 if (PageSwapCache(page)) {
66 66 struct block_device *bdev = swap_info[swp_type(entry)].bdev;
67 67 struct backing_dev_info *bdi;
... ... @@ -69,8 +69,8 @@
69 69 /*
70 70 * If the page is removed from swapcache from under us (with a
71 71 * racy try_to_unuse/swapoff) we need an additional reference
72   - * count to avoid reading garbage from page->private above. If
73   - * the WARN_ON triggers during a swapoff it maybe the race
  72 + * count to avoid reading garbage from page_private(page) above.
  73 + * If the WARN_ON triggers during a swapoff it maybe the race
74 74 * condition and it's harmless. However if it triggers without
75 75 * swapoff it signals a problem.
76 76 */
... ... @@ -294,7 +294,7 @@
294 294 struct swap_info_struct *p;
295 295 swp_entry_t entry;
296 296  
297   - entry.val = page->private;
  297 + entry.val = page_private(page);
298 298 p = swap_info_get(entry);
299 299 if (p) {
300 300 /* Subtract the 1 for the swap cache itself */
... ... @@ -339,7 +339,7 @@
339 339 if (page_count(page) != 2) /* 2: us + cache */
340 340 return 0;
341 341  
342   - entry.val = page->private;
  342 + entry.val = page_private(page);
343 343 p = swap_info_get(entry);
344 344 if (!p)
345 345 return 0;
... ... @@ -1042,7 +1042,7 @@
1042 1042 BUG_ON(!PageLocked(page)); /* It pins the swap_info_struct */
1043 1043  
1044 1044 if (PageSwapCache(page)) {
1045   - swp_entry_t entry = { .val = page->private };
  1045 + swp_entry_t entry = { .val = page_private(page) };
1046 1046 struct swap_info_struct *sis;
1047 1047  
1048 1048 sis = get_swap_info_struct(swp_type(entry));
mm/vmscan.c
... ... @@ -521,7 +521,7 @@
521 521  
522 522 #ifdef CONFIG_SWAP
523 523 if (PageSwapCache(page)) {
524   - swp_entry_t swap = { .val = page->private };
  524 + swp_entry_t swap = { .val = page_private(page) };
525 525 __delete_from_swap_cache(page);
526 526 write_unlock_irq(&mapping->tree_lock);
527 527 swap_free(swap);