Commit 7e675137a8e1a4d45822746456dd389b65745bf6

Authored by Nick Piggin
Committed by Linus Torvalds
1 parent b379d79019

mm: introduce pte_special pte bit

s390, for one, cannot implement VM_MIXEDMAP with pfn_valid, due to its memory
model (which is more dynamic than most).  Instead, the s390 people had
proposed to implement it with an additional path through vm_normal_page(),
using a bit in the pte to determine whether or not the page should be
refcounted:

vm_normal_page()
{
	...
	if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
		if (vma->vm_flags & VM_MIXEDMAP) {
#ifdef s390
			if (!mixedmap_refcount_pte(pte))
				return NULL;
#else
			if (!pfn_valid(pfn))
				return NULL;
#endif
			goto out;
		}
	...
}

This is fine; however, if we are allowed to use a bit in the pte to determine
refcountedness, we can use that bit to _completely_ replace all the vma-based
schemes.  So instead of adding more cases to the already complex vma-based
scheme, we can have a clearly separate and simple pte-based scheme (and get
slightly better code generation in the process):

vm_normal_page()
{
#ifdef s390
	if (!mixedmap_refcount_pte(pte))
		return NULL;
	return pte_page(pte);
#else
	...
#endif
}

And finally, rather than making this s390-only, we may as well make the
concept usable by any architecture, so implement a new type of pte state for
it.  Unfortunately the old vma-based code must stay, because some
architectures may not be able to spare a pte bit.  This makes vm_normal_page()
a little uglier than we would like, but the two cases are clearly separate.

So introduce a pte_special pte state, and use it in mm/memory.c.  It is
currently a no-op for all architectures, so this doesn't actually result in
any compiled code changes to mm/memory.o.
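
Concretely, the per-architecture hunks below add a no-op pair of accessors
everywhere; a representative form (the exact spelling varies between #defines
and inline functions across architectures) is:

	static inline int pte_special(pte_t pte) { return 0; }
	static inline pte_t pte_mkspecial(pte_t pte) { return pte; }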

BTW:
I haven't put vm_normal_page() into arch code, as per an earlier suggestion.
The reason is that, regardless of where vm_normal_page is actually
implemented, the *abstraction* is still exactly the same.  Also, while its
implementation depends on whether the architecture has pte_special or not,
those are the only two possible cases, and it really isn't an arch-specific
function -- the role of the arch code should be to provide primitive functions
and accessors with which to build the core code; pte_special does that.  We do
not want architectures to know or care about vm_normal_page itself, and we
definitely don't want them being able to invent something new there out of
sight of mm/ code.  If we made vm_normal_page an arch function, then we would
have to make vm_insert_mixed (next patch) an arch function too.  So I don't
think moving it to arch code fundamentally improves any abstractions, while it
does practically make the code more difficult to follow, for both mm and arch
developers, and easier to misuse.
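
For illustration only, an architecture with a spare software bit would define
__HAVE_ARCH_PTE_SPECIAL and implement the accessors against that bit.  A
minimal sketch, assuming a free pte bit named _PAGE_SPECIAL (no architecture
defines such a bit in this patch):

	#define __HAVE_ARCH_PTE_SPECIAL

	/* _PAGE_SPECIAL is an assumed free software bit, named here
	 * purely for illustration. */
	static inline int pte_special(pte_t pte)
	{
		return pte_val(pte) & _PAGE_SPECIAL;
	}

	static inline pte_t pte_mkspecial(pte_t pte)
	{
		pte_val(pte) |= _PAGE_SPECIAL;
		return pte;
	}

With that, vm_normal_page() reduces to the pte_special() test at the top of
the new implementation in mm/memory.c below.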

[akpm@linux-foundation.org: build fix]
Signed-off-by: Nick Piggin <npiggin@suse.de>
Acked-by: Carsten Otte <cotte@de.ibm.com>
Cc: Jared Hulbert <jaredeh@gmail.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

25 changed files with 168 additions and 49 deletions

include/asm-alpha/pgtable.h
... ... @@ -268,6 +268,7 @@
268 268 extern inline int pte_dirty(pte_t pte) { return pte_val(pte) & _PAGE_DIRTY; }
269 269 extern inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED; }
270 270 extern inline int pte_file(pte_t pte) { return pte_val(pte) & _PAGE_FILE; }
  271 +extern inline int pte_special(pte_t pte) { return 0; }
271 272  
272 273 extern inline pte_t pte_wrprotect(pte_t pte) { pte_val(pte) |= _PAGE_FOW; return pte; }
273 274 extern inline pte_t pte_mkclean(pte_t pte) { pte_val(pte) &= ~(__DIRTY_BITS); return pte; }
... ... @@ -275,6 +276,7 @@
275 276 extern inline pte_t pte_mkwrite(pte_t pte) { pte_val(pte) &= ~_PAGE_FOW; return pte; }
276 277 extern inline pte_t pte_mkdirty(pte_t pte) { pte_val(pte) |= __DIRTY_BITS; return pte; }
277 278 extern inline pte_t pte_mkyoung(pte_t pte) { pte_val(pte) |= __ACCESS_BITS; return pte; }
  279 +extern inline pte_t pte_mkspecial(pte_t pte) { return pte; }
278 280  
279 281 #define PAGE_DIR_OFFSET(tsk,address) pgd_offset((tsk),(address))
280 282  
include/asm-arm/pgtable.h
... ... @@ -260,6 +260,7 @@
260 260 #define pte_write(pte) (pte_val(pte) & L_PTE_WRITE)
261 261 #define pte_dirty(pte) (pte_val(pte) & L_PTE_DIRTY)
262 262 #define pte_young(pte) (pte_val(pte) & L_PTE_YOUNG)
  263 +#define pte_special(pte) (0)
263 264  
264 265 /*
265 266 * The following only works if pte_present() is not true.
... ... @@ -279,6 +280,8 @@
279 280 PTE_BIT_FUNC(mkdirty, |= L_PTE_DIRTY);
280 281 PTE_BIT_FUNC(mkold, &= ~L_PTE_YOUNG);
281 282 PTE_BIT_FUNC(mkyoung, |= L_PTE_YOUNG);
  283 +
  284 +static inline pte_t pte_mkspecial(pte_t pte) { return pte; }
282 285  
283 286 /*
284 287 * Mark the prot value as uncacheable and unbufferable.
include/asm-avr32/pgtable.h
... ... @@ -212,6 +212,10 @@
212 212 {
213 213 return pte_val(pte) & _PAGE_ACCESSED;
214 214 }
  215 +static inline int pte_special(pte_t pte)
  216 +{
  217 + return 0;
  218 +}
215 219  
216 220 /*
217 221 * The following only work if pte_present() is not true.
... ... @@ -250,6 +254,10 @@
250 254 static inline pte_t pte_mkyoung(pte_t pte)
251 255 {
252 256 set_pte(&pte, __pte(pte_val(pte) | _PAGE_ACCESSED));
  257 + return pte;
  258 +}
  259 +static inline pte_t pte_mkspecial(pte_t pte)
  260 +{
253 261 return pte;
254 262 }
255 263  
include/asm-cris/pgtable.h
... ... @@ -115,6 +115,7 @@
115 115 static inline int pte_dirty(pte_t pte) { return pte_val(pte) & _PAGE_MODIFIED; }
116 116 static inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED; }
117 117 static inline int pte_file(pte_t pte) { return pte_val(pte) & _PAGE_FILE; }
  118 +static inline int pte_special(pte_t pte) { return 0; }
118 119  
119 120 static inline pte_t pte_wrprotect(pte_t pte)
120 121 {
... ... @@ -162,6 +163,7 @@
162 163 }
163 164 return pte;
164 165 }
  166 +static inline pte_t pte_mkspecial(pte_t pte) { return pte; }
165 167  
166 168 /*
167 169 * Conversion functions: convert a page and protection to a page entry,
include/asm-frv/pgtable.h
... ... @@ -380,6 +380,7 @@
380 380 static inline int pte_dirty(pte_t pte) { return (pte).pte & _PAGE_DIRTY; }
381 381 static inline int pte_young(pte_t pte) { return (pte).pte & _PAGE_ACCESSED; }
382 382 static inline int pte_write(pte_t pte) { return !((pte).pte & _PAGE_WP); }
  383 +static inline int pte_special(pte_t pte) { return 0; }
383 384  
384 385 static inline pte_t pte_mkclean(pte_t pte) { (pte).pte &= ~_PAGE_DIRTY; return pte; }
385 386 static inline pte_t pte_mkold(pte_t pte) { (pte).pte &= ~_PAGE_ACCESSED; return pte; }
... ... @@ -387,6 +388,7 @@
387 388 static inline pte_t pte_mkdirty(pte_t pte) { (pte).pte |= _PAGE_DIRTY; return pte; }
388 389 static inline pte_t pte_mkyoung(pte_t pte) { (pte).pte |= _PAGE_ACCESSED; return pte; }
389 390 static inline pte_t pte_mkwrite(pte_t pte) { (pte).pte &= ~_PAGE_WP; return pte; }
  391 +static inline pte_t pte_mkspecial(pte_t pte) { return pte; }
390 392  
391 393 static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
392 394 {
include/asm-ia64/pgtable.h
... ... @@ -302,6 +302,8 @@
302 302 #define pte_dirty(pte) ((pte_val(pte) & _PAGE_D) != 0)
303 303 #define pte_young(pte) ((pte_val(pte) & _PAGE_A) != 0)
304 304 #define pte_file(pte) ((pte_val(pte) & _PAGE_FILE) != 0)
  305 +#define pte_special(pte) 0
  306 +
305 307 /*
306 308 * Note: we convert AR_RWX to AR_RX and AR_RW to AR_R by clearing the 2nd bit in the
307 309 * access rights:
... ... @@ -313,6 +315,7 @@
313 315 #define pte_mkclean(pte) (__pte(pte_val(pte) & ~_PAGE_D))
314 316 #define pte_mkdirty(pte) (__pte(pte_val(pte) | _PAGE_D))
315 317 #define pte_mkhuge(pte) (__pte(pte_val(pte)))
  318 +#define pte_mkspecial(pte) (pte)
316 319  
317 320 /*
318 321 * Because ia64's Icache and Dcache is not coherent (on a cpu), we need to
include/asm-m32r/pgtable.h
... ... @@ -214,6 +214,11 @@
214 214 return pte_val(pte) & _PAGE_FILE;
215 215 }
216 216  
  217 +static inline int pte_special(pte_t pte)
  218 +{
  219 + return 0;
  220 +}
  221 +
217 222 static inline pte_t pte_mkclean(pte_t pte)
218 223 {
219 224 pte_val(pte) &= ~_PAGE_DIRTY;
... ... @@ -247,6 +252,11 @@
247 252 static inline pte_t pte_mkwrite(pte_t pte)
248 253 {
249 254 pte_val(pte) |= _PAGE_WRITE;
  255 + return pte;
  256 +}
  257 +
  258 +static inline pte_t pte_mkspecial(pte_t pte)
  259 +{
250 260 return pte;
251 261 }
252 262  
include/asm-m68k/motorola_pgtable.h
... ... @@ -168,6 +168,7 @@
168 168 static inline int pte_dirty(pte_t pte) { return pte_val(pte) & _PAGE_DIRTY; }
169 169 static inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED; }
170 170 static inline int pte_file(pte_t pte) { return pte_val(pte) & _PAGE_FILE; }
  171 +static inline int pte_special(pte_t pte) { return 0; }
171 172  
172 173 static inline pte_t pte_wrprotect(pte_t pte) { pte_val(pte) |= _PAGE_RONLY; return pte; }
173 174 static inline pte_t pte_mkclean(pte_t pte) { pte_val(pte) &= ~_PAGE_DIRTY; return pte; }
... ... @@ -185,6 +186,7 @@
185 186 pte_val(pte) = (pte_val(pte) & _CACHEMASK040) | m68k_supervisor_cachemode;
186 187 return pte;
187 188 }
  189 +static inline pte_t pte_mkspecial(pte_t pte) { return pte; }
188 190  
189 191 #define PAGE_DIR_OFFSET(tsk,address) pgd_offset((tsk),(address))
190 192  
include/asm-m68k/sun3_pgtable.h
... ... @@ -169,6 +169,7 @@
169 169 static inline int pte_dirty(pte_t pte) { return pte_val(pte) & SUN3_PAGE_MODIFIED; }
170 170 static inline int pte_young(pte_t pte) { return pte_val(pte) & SUN3_PAGE_ACCESSED; }
171 171 static inline int pte_file(pte_t pte) { return pte_val(pte) & SUN3_PAGE_ACCESSED; }
  172 +static inline int pte_special(pte_t pte) { return 0; }
172 173  
173 174 static inline pte_t pte_wrprotect(pte_t pte) { pte_val(pte) &= ~SUN3_PAGE_WRITEABLE; return pte; }
174 175 static inline pte_t pte_mkclean(pte_t pte) { pte_val(pte) &= ~SUN3_PAGE_MODIFIED; return pte; }
... ... @@ -181,6 +182,7 @@
181 182 //static inline pte_t pte_mkcache(pte_t pte) { pte_val(pte) &= SUN3_PAGE_NOCACHE; return pte; }
182 183 // until then, use:
183 184 static inline pte_t pte_mkcache(pte_t pte) { return pte; }
  185 +static inline pte_t pte_mkspecial(pte_t pte) { return pte; }
184 186  
185 187 extern pgd_t swapper_pg_dir[PTRS_PER_PGD];
186 188 extern pgd_t kernel_pg_dir[PTRS_PER_PGD];
include/asm-mips/pgtable.h
... ... @@ -285,6 +285,8 @@
285 285 return pte;
286 286 }
287 287 #endif
  288 +static inline int pte_special(pte_t pte) { return 0; }
  289 +static inline pte_t pte_mkspecial(pte_t pte) { return pte; }
288 290  
289 291 /*
290 292 * Macro to make mark a page protection value as "uncacheable". Note
include/asm-mn10300/pgtable.h
... ... @@ -224,6 +224,7 @@
224 224 static inline int pte_dirty(pte_t pte) { return pte_val(pte) & _PAGE_DIRTY; }
225 225 static inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED; }
226 226 static inline int pte_write(pte_t pte) { return pte_val(pte) & __PAGE_PROT_WRITE; }
  227 +static inline int pte_special(pte_t pte){ return 0; }
227 228  
228 229 /*
229 230 * The following only works if pte_present() is not true.
... ... @@ -264,6 +265,8 @@
264 265 pte_val(pte) |= __PAGE_PROT_UWAUX;
265 266 return pte;
266 267 }
  268 +
  269 +static inline pte_t pte_mkspecial(pte_t pte) { return pte; }
267 270  
268 271 #define pte_ERROR(e) \
269 272 printk(KERN_ERR "%s:%d: bad pte %08lx.\n", \
include/asm-parisc/pgtable.h
... ... @@ -323,6 +323,7 @@
323 323 static inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED; }
324 324 static inline int pte_write(pte_t pte) { return pte_val(pte) & _PAGE_WRITE; }
325 325 static inline int pte_file(pte_t pte) { return pte_val(pte) & _PAGE_FILE; }
  326 +static inline int pte_special(pte_t pte) { return 0; }
326 327  
327 328 static inline pte_t pte_mkclean(pte_t pte) { pte_val(pte) &= ~_PAGE_DIRTY; return pte; }
328 329 static inline pte_t pte_mkold(pte_t pte) { pte_val(pte) &= ~_PAGE_ACCESSED; return pte; }
... ... @@ -330,6 +331,7 @@
330 331 static inline pte_t pte_mkdirty(pte_t pte) { pte_val(pte) |= _PAGE_DIRTY; return pte; }
331 332 static inline pte_t pte_mkyoung(pte_t pte) { pte_val(pte) |= _PAGE_ACCESSED; return pte; }
332 333 static inline pte_t pte_mkwrite(pte_t pte) { pte_val(pte) |= _PAGE_WRITE; return pte; }
  334 +static inline pte_t pte_mkspecial(pte_t pte) { return pte; }
333 335  
334 336 /*
335 337 * Conversion functions: convert a page and protection to a page entry,
include/asm-powerpc/pgtable-ppc32.h
... ... @@ -504,6 +504,7 @@
504 504 static inline int pte_dirty(pte_t pte) { return pte_val(pte) & _PAGE_DIRTY; }
505 505 static inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED; }
506 506 static inline int pte_file(pte_t pte) { return pte_val(pte) & _PAGE_FILE; }
  507 +static inline int pte_special(pte_t pte) { return 0; }
507 508  
508 509 static inline void pte_uncache(pte_t pte) { pte_val(pte) |= _PAGE_NO_CACHE; }
509 510 static inline void pte_cache(pte_t pte) { pte_val(pte) &= ~_PAGE_NO_CACHE; }
... ... @@ -521,6 +522,8 @@
521 522 pte_val(pte) |= _PAGE_DIRTY; return pte; }
522 523 static inline pte_t pte_mkyoung(pte_t pte) {
523 524 pte_val(pte) |= _PAGE_ACCESSED; return pte; }
  525 +static inline pte_t pte_mkspecial(pte_t pte) {
  526 + return pte; }
524 527  
525 528 static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
526 529 {
include/asm-powerpc/pgtable-ppc64.h
... ... @@ -239,6 +239,7 @@
239 239 static inline int pte_dirty(pte_t pte) { return pte_val(pte) & _PAGE_DIRTY;}
240 240 static inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED;}
241 241 static inline int pte_file(pte_t pte) { return pte_val(pte) & _PAGE_FILE;}
  242 +static inline int pte_special(pte_t pte) { return 0; }
242 243  
243 244 static inline void pte_uncache(pte_t pte) { pte_val(pte) |= _PAGE_NO_CACHE; }
244 245 static inline void pte_cache(pte_t pte) { pte_val(pte) &= ~_PAGE_NO_CACHE; }
... ... @@ -256,6 +257,8 @@
256 257 static inline pte_t pte_mkyoung(pte_t pte) {
257 258 pte_val(pte) |= _PAGE_ACCESSED; return pte; }
258 259 static inline pte_t pte_mkhuge(pte_t pte) {
  260 + return pte; }
  261 +static inline pte_t pte_mkspecial(pte_t pte) {
259 262 return pte; }
260 263  
261 264 /* Atomic PTE updates */
include/asm-ppc/pgtable.h
... ... @@ -483,6 +483,7 @@
483 483 static inline int pte_dirty(pte_t pte) { return pte_val(pte) & _PAGE_DIRTY; }
484 484 static inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED; }
485 485 static inline int pte_file(pte_t pte) { return pte_val(pte) & _PAGE_FILE; }
  486 +static inline int pte_special(pte_t pte) { return 0; }
486 487  
487 488 static inline void pte_uncache(pte_t pte) { pte_val(pte) |= _PAGE_NO_CACHE; }
488 489 static inline void pte_cache(pte_t pte) { pte_val(pte) &= ~_PAGE_NO_CACHE; }
... ... @@ -500,6 +501,8 @@
500 501 pte_val(pte) |= _PAGE_DIRTY; return pte; }
501 502 static inline pte_t pte_mkyoung(pte_t pte) {
502 503 pte_val(pte) |= _PAGE_ACCESSED; return pte; }
  504 +static inline pte_t pte_mkspecial(pte_t pte) {
  505 + return pte; }
503 506  
504 507 static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
505 508 {
include/asm-s390/pgtable.h
... ... @@ -518,6 +518,11 @@
518 518 return (pte_val(pte) & mask) == _PAGE_TYPE_FILE;
519 519 }
520 520  
  521 +static inline int pte_special(pte_t pte)
  522 +{
  523 + return 0;
  524 +}
  525 +
521 526 #define __HAVE_ARCH_PTE_SAME
522 527 #define pte_same(a,b) (pte_val(a) == pte_val(b))
523 528  
... ... @@ -712,6 +717,11 @@
712 717 /* S/390 doesn't keep its dirty/referenced bit in the pte.
713 718 * There is no point in setting the real referenced bit.
714 719 */
  720 + return pte;
  721 +}
  722 +
  723 +static inline pte_t pte_mkspecial(pte_t pte)
  724 +{
715 725 return pte;
716 726 }
717 727  
include/asm-sh/pgtable_32.h
... ... @@ -326,6 +326,7 @@
326 326 #define pte_dirty(pte) ((pte).pte_low & _PAGE_DIRTY)
327 327 #define pte_young(pte) ((pte).pte_low & _PAGE_ACCESSED)
328 328 #define pte_file(pte) ((pte).pte_low & _PAGE_FILE)
  329 +#define pte_special(pte) (0)
329 330  
330 331 #ifdef CONFIG_X2TLB
331 332 #define pte_write(pte) ((pte).pte_high & _PAGE_EXT_USER_WRITE)
... ... @@ -355,6 +356,8 @@
355 356 PTE_BIT_FUNC(low, mkdirty, |= _PAGE_DIRTY);
356 357 PTE_BIT_FUNC(low, mkold, &= ~_PAGE_ACCESSED);
357 358 PTE_BIT_FUNC(low, mkyoung, |= _PAGE_ACCESSED);
  359 +
  360 +static inline pte_t pte_mkspecial(pte_t pte) { return pte; }
358 361  
359 362 /*
360 363 * Macro and implementation to make a page protection as uncachable.
include/asm-sh/pgtable_64.h
... ... @@ -254,10 +254,11 @@
254 254 /*
255 255 * The following have defined behavior only work if pte_present() is true.
256 256 */
257   -static inline int pte_dirty(pte_t pte){ return pte_val(pte) & _PAGE_DIRTY; }
258   -static inline int pte_young(pte_t pte){ return pte_val(pte) & _PAGE_ACCESSED; }
259   -static inline int pte_file(pte_t pte) { return pte_val(pte) & _PAGE_FILE; }
260   -static inline int pte_write(pte_t pte){ return pte_val(pte) & _PAGE_WRITE; }
  257 +static inline int pte_dirty(pte_t pte) { return pte_val(pte) & _PAGE_DIRTY; }
  258 +static inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED; }
  259 +static inline int pte_file(pte_t pte) { return pte_val(pte) & _PAGE_FILE; }
  260 +static inline int pte_write(pte_t pte) { return pte_val(pte) & _PAGE_WRITE; }
  261 +static inline int pte_special(pte_t pte){ return 0; }
261 262  
262 263 static inline pte_t pte_wrprotect(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_WRITE)); return pte; }
263 264 static inline pte_t pte_mkclean(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_DIRTY)); return pte; }
... ... @@ -266,6 +267,7 @@
266 267 static inline pte_t pte_mkdirty(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_DIRTY)); return pte; }
267 268 static inline pte_t pte_mkyoung(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_ACCESSED)); return pte; }
268 269 static inline pte_t pte_mkhuge(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_SZHUGE)); return pte; }
  270 +static inline pte_t pte_mkspecial(pte_t pte) { return pte; }
269 271  
270 272  
271 273 /*
include/asm-sparc/pgtable.h
... ... @@ -219,6 +219,11 @@
219 219 return pte_val(pte) & BTFIXUP_HALF(pte_filei);
220 220 }
221 221  
  222 +static inline int pte_special(pte_t pte)
  223 +{
  224 + return 0;
  225 +}
  226 +
222 227 /*
223 228 */
224 229 BTFIXUPDEF_HALF(pte_wrprotecti)
... ... @@ -250,6 +255,8 @@
250 255 #define pte_mkwrite(pte) BTFIXUP_CALL(pte_mkwrite)(pte)
251 256 #define pte_mkdirty(pte) BTFIXUP_CALL(pte_mkdirty)(pte)
252 257 #define pte_mkyoung(pte) BTFIXUP_CALL(pte_mkyoung)(pte)
  258 +
  259 +#define pte_mkspecial(pte) (pte)
253 260  
254 261 #define pfn_pte(pfn, prot) mk_pte(pfn_to_page(pfn), prot)
255 262  
include/asm-sparc64/pgtable.h
... ... @@ -506,6 +506,11 @@
506 506 return __pte(pte_val(pte) | mask);
507 507 }
508 508  
  509 +static inline pte_t pte_mkspecial(pte_t pte)
  510 +{
  511 + return pte;
  512 +}
  513 +
509 514 static inline unsigned long pte_young(pte_t pte)
510 515 {
511 516 unsigned long mask;
... ... @@ -606,6 +611,11 @@
606 611 : "0" (val), "i" (_PAGE_PRESENT_4U), "i" (_PAGE_PRESENT_4V));
607 612  
608 613 return val;
  614 +}
  615 +
  616 +static inline int pte_special(pte_t pte)
  617 +{
  618 + return 0;
609 619 }
610 620  
611 621 #define pmd_set(pmdp, ptep) \
include/asm-um/pgtable.h
... ... @@ -173,6 +173,11 @@
173 173 return(pte_present(pte) && (pte_get_bits(pte, _PAGE_NEWPROT)));
174 174 }
175 175  
  176 +static inline int pte_special(pte_t pte)
  177 +{
  178 + return 0;
  179 +}
  180 +
176 181 /*
177 182 * =================================
178 183 * Flags setting section.
... ... @@ -238,6 +243,11 @@
238 243 static inline pte_t pte_mknewpage(pte_t pte)
239 244 {
240 245 pte_set_bits(pte, _PAGE_NEWPAGE);
  246 + return(pte);
  247 +}
  248 +
  249 +static inline pte_t pte_mkspecial(pte_t pte)
  250 +{
241 251 return(pte);
242 252 }
243 253  
include/asm-x86/pgtable.h
... ... @@ -195,6 +195,11 @@
195 195 return !(pte_val(pte) & _PAGE_NX);
196 196 }
197 197  
  198 +static inline int pte_special(pte_t pte)
  199 +{
  200 + return 0;
  201 +}
  202 +
198 203 static inline int pmd_large(pmd_t pte)
199 204 {
200 205 return (pmd_val(pte) & (_PAGE_PSE | _PAGE_PRESENT)) ==
... ... @@ -254,6 +259,11 @@
254 259 static inline pte_t pte_clrglobal(pte_t pte)
255 260 {
256 261 return __pte(pte_val(pte) & ~(pteval_t)_PAGE_GLOBAL);
  262 +}
  263 +
  264 +static inline pte_t pte_mkspecial(pte_t pte)
  265 +{
  266 + return pte;
257 267 }
258 268  
259 269 extern pteval_t __supported_pte_mask;
include/asm-xtensa/pgtable.h
... ... @@ -210,6 +210,8 @@
210 210 static inline int pte_dirty(pte_t pte) { return pte_val(pte) & _PAGE_DIRTY; }
211 211 static inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED; }
212 212 static inline int pte_file(pte_t pte) { return pte_val(pte) & _PAGE_FILE; }
  213 +static inline int pte_special(pte_t pte) { return 0; }
  214 +
213 215 static inline pte_t pte_wrprotect(pte_t pte)
214 216 { pte_val(pte) &= ~(_PAGE_WRITABLE | _PAGE_HW_WRITE); return pte; }
215 217 static inline pte_t pte_mkclean(pte_t pte)
... ... @@ -222,6 +224,8 @@
222 224 { pte_val(pte) |= _PAGE_ACCESSED; return pte; }
223 225 static inline pte_t pte_mkwrite(pte_t pte)
224 226 { pte_val(pte) |= _PAGE_WRITABLE; return pte; }
  227 +static inline pte_t pte_mkspecial(pte_t pte)
  228 + { return pte; }
225 229  
226 230 /*
227 231 * Conversion functions: convert a page and protection to a page entry,
include/linux/mm.h
... ... @@ -721,7 +721,9 @@
721 721 unsigned long truncate_count; /* Compare vm_truncate_count */
722 722 };
723 723  
724   -struct page *vm_normal_page(struct vm_area_struct *, unsigned long, pte_t);
  724 +struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
  725 + pte_t pte);
  726 +
725 727 unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
726 728 unsigned long size, struct zap_details *);
727 729 unsigned long unmap_vmas(struct mmu_gather **tlb,
mm/memory.c
... ... @@ -371,34 +371,38 @@
371 371 }
372 372  
373 373 /*
374   - * This function gets the "struct page" associated with a pte or returns
375   - * NULL if no "struct page" is associated with the pte.
  374 + * vm_normal_page -- This function gets the "struct page" associated with a pte.
376 375 *
377   - * A raw VM_PFNMAP mapping (ie. one that is not COWed) may not have any "struct
378   - * page" backing, and even if they do, they are not refcounted. COWed pages of
379   - * a VM_PFNMAP do always have a struct page, and they are normally refcounted
380   - * (they are _normal_ pages).
  376 + * "Special" mappings do not wish to be associated with a "struct page" (either
  377 + * it doesn't exist, or it exists but they don't want to touch it). In this
  378 + * case, NULL is returned here. "Normal" mappings do have a struct page.
381 379 *
382   - * So a raw PFNMAP mapping will have each page table entry just pointing
383   - * to a page frame number, and as far as the VM layer is concerned, those do
384   - * not have pages associated with them - even if the PFN might point to memory
385   - * that otherwise is perfectly fine and has a "struct page".
  380 + * There are 2 broad cases. Firstly, an architecture may define a pte_special()
  381 + * pte bit, in which case this function is trivial. Secondly, an architecture
  382 + * may not have a spare pte bit, which requires a more complicated scheme,
  383 + * described below.
386 384 *
  385 + * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a
  386 + * special mapping (even if there are underlying and valid "struct pages").
  387 + * COWed pages of a VM_PFNMAP are always normal.
  388 + *
387 389 * The way we recognize COWed pages within VM_PFNMAP mappings is through the
388 390 * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit
389   - * set, and the vm_pgoff will point to the first PFN mapped: thus every
390   - * page that is a raw mapping will always honor the rule
  391 + * set, and the vm_pgoff will point to the first PFN mapped: thus every special
  392 + * mapping will always honor the rule
391 393 *
392 394 * pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
393 395 *
394   - * A call to vm_normal_page() will return NULL for such a page.
  396 + * And for normal mappings this is false.
395 397 *
396   - * If the page doesn't follow the "remap_pfn_range()" rule in a VM_PFNMAP
397   - * then the page has been COW'ed. A COW'ed page _does_ have a "struct page"
398   - * associated with it even if it is in a VM_PFNMAP range. Calling
399   - * vm_normal_page() on such a page will therefore return the "struct page".
  398 + * This restricts such mappings to be a linear translation from virtual address
  399 + * to pfn. To get around this restriction, we allow arbitrary mappings so long
  400 + * as the vma is not a COW mapping; in that case, we know that all ptes are
  401 + * special (because none can have been COWed).
400 402 *
401 403 *
  404 + * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP.
  405 + *
402 406 * VM_MIXEDMAP mappings can likewise contain memory with or without "struct
403 407 * page" backing, however the difference is that _all_ pages with a struct
404 408 * page (that is, those where pfn_valid is true) are refcounted and considered
405 409  
406 410  
407 411  
408 412  
... ... @@ -407,24 +411,38 @@
407 411 * advantage is that we don't have to follow the strict linearity rule of
408 412 * PFNMAP mappings in order to support COWable mappings.
409 413 *
410   - * A call to vm_normal_page() with a VM_MIXEDMAP mapping will return the
411   - * associated "struct page" or NULL for memory not backed by a "struct page".
412   - *
413   - *
414   - * All other mappings should have a valid struct page, which will be
415   - * returned by a call to vm_normal_page().
416 414 */
417   -struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte)
  415 +#ifdef __HAVE_ARCH_PTE_SPECIAL
  416 +# define HAVE_PTE_SPECIAL 1
  417 +#else
  418 +# define HAVE_PTE_SPECIAL 0
  419 +#endif
  420 +struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
  421 + pte_t pte)
418 422 {
419   - unsigned long pfn = pte_pfn(pte);
  423 + unsigned long pfn;
420 424  
  425 + if (HAVE_PTE_SPECIAL) {
  426 + if (likely(!pte_special(pte))) {
  427 + VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
  428 + return pte_page(pte);
  429 + }
  430 + VM_BUG_ON(!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)));
  431 + return NULL;
  432 + }
  433 +
  434 + /* !HAVE_PTE_SPECIAL case follows: */
  435 +
  436 + pfn = pte_pfn(pte);
  437 +
421 438 if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
422 439 if (vma->vm_flags & VM_MIXEDMAP) {
423 440 if (!pfn_valid(pfn))
424 441 return NULL;
425 442 goto out;
426 443 } else {
427   - unsigned long off = (addr-vma->vm_start) >> PAGE_SHIFT;
  444 + unsigned long off;
  445 + off = (addr - vma->vm_start) >> PAGE_SHIFT;
428 446 if (pfn == vma->vm_pgoff + off)
429 447 return NULL;
430 448 if (!is_cow_mapping(vma->vm_flags))
431 449  
432 450  
... ... @@ -432,25 +450,12 @@
432 450 }
433 451 }
434 452  
435   -#ifdef CONFIG_DEBUG_VM
436   - /*
437   - * Add some anal sanity checks for now. Eventually,
438   - * we should just do "return pfn_to_page(pfn)", but
439   - * in the meantime we check that we get a valid pfn,
440   - * and that the resulting page looks ok.
441   - */
442   - if (unlikely(!pfn_valid(pfn))) {
443   - print_bad_pte(vma, pte, addr);
444   - return NULL;
445   - }
446   -#endif
  453 + VM_BUG_ON(!pfn_valid(pfn));
447 454  
448 455 /*
449   - * NOTE! We still have PageReserved() pages in the page
450   - * tables.
  456 + * NOTE! We still have PageReserved() pages in the page tables.
451 457 *
452   - * The PAGE_ZERO() pages and various VDSO mappings can
453   - * cause them to exist.
  458 + * eg. VDSO mappings can cause them to exist.
454 459 */
455 460 out:
456 461 return pfn_to_page(pfn);
... ... @@ -1263,6 +1268,12 @@
1263 1268 pte_t *pte, entry;
1264 1269 spinlock_t *ptl;
1265 1270  
  1271 + /*
  1272 + * Technically, architectures with pte_special can avoid all these
  1273 + * restrictions (same for remap_pfn_range). However we would like
  1274 + * consistency in testing and feature parity among all, so we should
  1275 + * try to keep these invariants in place for everybody.
  1276 + */
1266 1277 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
1267 1278 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
1268 1279 (VM_PFNMAP|VM_MIXEDMAP));
... ... @@ -1278,7 +1289,7 @@
1278 1289 goto out_unlock;
1279 1290  
1280 1291 /* Ok, finally just insert the thing.. */
1281   - entry = pfn_pte(pfn, vma->vm_page_prot);
  1292 + entry = pte_mkspecial(pfn_pte(pfn, vma->vm_page_prot));
1282 1293 set_pte_at(mm, addr, pte, entry);
1283 1294 update_mmu_cache(vma, addr, entry);
1284 1295  
... ... @@ -1309,7 +1320,7 @@
1309 1320 arch_enter_lazy_mmu_mode();
1310 1321 do {
1311 1322 BUG_ON(!pte_none(*pte));
1312   - set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
  1323 + set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
1313 1324 pfn++;
1314 1325 } while (pte++, addr += PAGE_SIZE, addr != end);
1315 1326 arch_leave_lazy_mmu_mode();