Commit c3b86a29429dac1033e3f602f51fa8d00006a8eb
Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
  x86-32, percpu: Correct the ordering of the percpu readmostly section
  x86, mm: Enable ARCH_DMA_ADDR_T_64BIT with X86_64 || HIGHMEM64G
  x86: Spread tlb flush vector between nodes
  percpu: Introduce a read-mostly percpu API
  x86, mm: Fix incorrect data type in vmalloc_sync_all()
  x86, mm: Hold mm->page_table_lock while doing vmalloc_sync
  x86, mm: Fix bogus whitespace in sync_global_pgds()
  x86-32: Fix sparse warning for the __PHYSICAL_MASK calculation
  x86, mm: Add RESERVE_BRK_ARRAY() helper
  mm, x86: Saving vmcore with non-lazy freeing of vmas
  x86, kdump: Change copy_oldmem_page() to use cached addressing
  x86, mm: fix uninitialized addr in kernel_physical_mapping_init()
  x86, kmemcheck: Remove double test
  x86, mm: Make spurious_fault check explicitly check the PRESENT bit
  x86-64, mem: Update all PGDs for direct mapping and vmemmap mapping changes
  x86, mm: Separate x86_64 vmalloc_sync_all() into separate functions
  x86, mm: Avoid unnecessary TLB flush
17 changed files:
- arch/x86/Kconfig
- arch/x86/include/asm/io.h
- arch/x86/include/asm/page_types.h
- arch/x86/include/asm/pgtable.h
- arch/x86/include/asm/pgtable_64.h
- arch/x86/include/asm/setup.h
- arch/x86/kernel/crash_dump_64.c
- arch/x86/mm/fault.c
- arch/x86/mm/init_64.c
- arch/x86/mm/kmemcheck/opcode.c
- arch/x86/mm/pgtable.c
- arch/x86/mm/tlb.c
- include/asm-generic/pgtable.h
- include/asm-generic/vmlinux.lds.h
- include/linux/percpu-defs.h
- mm/memory.c
- mm/vmalloc.c
arch/x86/Kconfig
... | ... | @@ -1163,6 +1163,9 @@ |
1163 | 1163 | config ARCH_PHYS_ADDR_T_64BIT |
1164 | 1164 | def_bool X86_64 || X86_PAE |
1165 | 1165 | |
1166 | +config ARCH_DMA_ADDR_T_64BIT | |
1167 | + def_bool X86_64 || HIGHMEM64G | |
1168 | + | |
1166 | 1169 | config DIRECT_GBPAGES |
1167 | 1170 | bool "Enable 1GB pages for kernel pagetables" if EMBEDDED |
1168 | 1171 | default y |
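
With ARCH_DMA_ADDR_T_64BIT selected on X86_64 or HIGHMEM64G, dma_addr_t becomes 64 bits wide. A minimal standalone sketch of how a config symbol like this typically drives the typedef, modeled on the kernel's include/linux/types.h; the userspace main() is illustrative only:

/* Sketch: a CONFIG_ symbol selecting the dma_addr_t width.
 * Compile with -DCONFIG_ARCH_DMA_ADDR_T_64BIT for the 64-bit variant. */
#include <stdint.h>
#include <stdio.h>

#ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
typedef uint64_t dma_addr_t;   /* X86_64 or HIGHMEM64G: bus addresses can exceed 4GB */
#else
typedef uint32_t dma_addr_t;   /* 32-bit bus addresses suffice otherwise */
#endif

int main(void)
{
	printf("sizeof(dma_addr_t) = %zu\n", sizeof(dma_addr_t));
	return 0;
}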
arch/x86/include/asm/io.h
arch/x86/include/asm/page_types.h
... | ... | @@ -8,7 +8,7 @@ |
8 | 8 | #define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT) |
9 | 9 | #define PAGE_MASK (~(PAGE_SIZE-1)) |
10 | 10 | |
11 | -#define __PHYSICAL_MASK ((phys_addr_t)(1ULL << __PHYSICAL_MASK_SHIFT) - 1) | |
11 | +#define __PHYSICAL_MASK ((phys_addr_t)((1ULL << __PHYSICAL_MASK_SHIFT) - 1)) | |
12 | 12 | #define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1) |
13 | 13 | |
14 | 14 | /* Cast PAGE_MASK to a signed type so that it is sign-extended if |
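
The sparse fix moves the `- 1` inside the cast. A standalone sketch of the difference, assuming the 32-bit non-PAE case where phys_addr_t is 32 bits and __PHYSICAL_MASK_SHIFT is 32: the old form casts 1ULL << 32 down to 32 bits first (a truncating cast, which sparse flags), and only unsigned wraparound of 0 - 1 rescues the result; the new form subtracts in 64 bits and then casts a value that fits:

/* Standalone model of the __PHYSICAL_MASK parenthesization fix. */
#include <stdint.h>
#include <stdio.h>

typedef uint32_t phys_addr_t;          /* stand-in for the !PAE case */
#define SHIFT 32

/* old: the cast binds before the subtraction, truncating 0x100000000
 * to 0; 0u - 1 then wraps to the right mask, but sparse warns. */
#define MASK_OLD ((phys_addr_t)(1ULL << SHIFT) - 1)

/* new: subtract first, then cast a value that fits; same result,
 * no truncating cast for sparse to complain about. */
#define MASK_NEW ((phys_addr_t)((1ULL << SHIFT) - 1))

int main(void)
{
	printf("old: %#x  new: %#x\n", (unsigned)MASK_OLD, (unsigned)MASK_NEW);
	return 0;
}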
arch/x86/include/asm/pgtable.h
... | ... | @@ -28,6 +28,8 @@ |
28 | 28 | extern spinlock_t pgd_lock; |
29 | 29 | extern struct list_head pgd_list; |
30 | 30 | |
31 | +extern struct mm_struct *pgd_page_get_mm(struct page *page); | |
32 | + | |
31 | 33 | #ifdef CONFIG_PARAVIRT |
32 | 34 | #include <asm/paravirt.h> |
33 | 35 | #else /* !CONFIG_PARAVIRT */ |
... | ... | @@ -602,6 +604,8 @@ |
602 | 604 | clear_bit(_PAGE_BIT_RW, (unsigned long *)&ptep->pte); |
603 | 605 | pte_update(mm, addr, ptep); |
604 | 606 | } |
607 | + | |
608 | +#define flush_tlb_fix_spurious_fault(vma, address) | |
605 | 609 | |
606 | 610 | /* |
607 | 611 | * clone_pgd_range(pgd_t *dst, pgd_t *src, int count); |
arch/x86/include/asm/pgtable_64.h
... | ... | @@ -102,6 +102,8 @@ |
102 | 102 | native_set_pgd(pgd, native_make_pgd(0)); |
103 | 103 | } |
104 | 104 | |
105 | +extern void sync_global_pgds(unsigned long start, unsigned long end); | |
106 | + | |
105 | 107 | /* |
106 | 108 | * Conversion functions: convert a page and protection to a page entry, |
107 | 109 | * and a page entry and page directory to the page they refer to. |
arch/x86/include/asm/setup.h
... | ... | @@ -93,6 +93,11 @@ |
93 | 93 | : : "i" (sz)); \ |
94 | 94 | } |
95 | 95 | |
96 | +/* Helper for reserving space for arrays of things */ | |
97 | +#define RESERVE_BRK_ARRAY(type, name, entries) \ | |
98 | + type *name; \ | |
99 | + RESERVE_BRK(name, sizeof(type) * entries) | |
100 | + | |
96 | 101 | #ifdef __i386__ |
97 | 102 | |
98 | 103 | void __init i386_start_kernel(void); |
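
RESERVE_BRK_ARRAY() declares a pointer and reserves matching space in the brk area; the pointer itself is filled in later, in the kernel typically from extend_brk(). A standalone model in which RESERVE_BRK is replaced by a static buffer so the pattern compiles and runs in userspace; the type, name, and size below are made up:

/* Standalone model of RESERVE_BRK_ARRAY: the real RESERVE_BRK emits
 * asm to grow the .brk section; a static buffer stands in here. */
#include <stdio.h>

#define RESERVE_BRK(name, sz) \
	static unsigned char __brk_##name[sz] __attribute__((aligned(16)))

#define RESERVE_BRK_ARRAY(type, name, entries)	\
	type *name;				\
	RESERVE_BRK(name, sizeof(type) * (entries))

struct pte { unsigned long val; };

RESERVE_BRK_ARRAY(struct pte, early_ptes, 16);

int main(void)
{
	/* the kernel assigns the pointer from extend_brk(); this model
	 * reuses the reserved buffer directly */
	early_ptes = (struct pte *)__brk_early_ptes;
	early_ptes[0].val = 0x1;
	printf("reserved %zu bytes\n", sizeof(__brk_early_ptes));
	return 0;
}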
arch/x86/kernel/crash_dump_64.c
... | ... | @@ -34,7 +34,7 @@ |
34 | 34 | if (!csize) |
35 | 35 | return 0; |
36 | 36 | |
37 | - vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE); | |
37 | + vaddr = ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE); | |
38 | 38 | if (!vaddr) |
39 | 39 | return -ENOMEM; |
40 | 40 | |
... | ... | @@ -46,6 +46,7 @@ |
46 | 46 | } else |
47 | 47 | memcpy(buf, vaddr + offset, csize); |
48 | 48 | |
49 | + set_iounmap_nonlazy(); | |
49 | 50 | iounmap(vaddr); |
50 | 51 | return csize; |
51 | 52 | } |
arch/x86/mm/fault.c
... | ... | @@ -229,7 +229,16 @@ |
229 | 229 | |
230 | 230 | spin_lock_irqsave(&pgd_lock, flags); |
231 | 231 | list_for_each_entry(page, &pgd_list, lru) { |
232 | - if (!vmalloc_sync_one(page_address(page), address)) | |
232 | + spinlock_t *pgt_lock; | |
233 | + pmd_t *ret; | |
234 | + | |
235 | + pgt_lock = &pgd_page_get_mm(page)->page_table_lock; | |
236 | + | |
237 | + spin_lock(pgt_lock); | |
238 | + ret = vmalloc_sync_one(page_address(page), address); | |
239 | + spin_unlock(pgt_lock); | |
240 | + | |
241 | + if (!ret) | |
233 | 242 | break; |
234 | 243 | } |
235 | 244 | spin_unlock_irqrestore(&pgd_lock, flags); |
... | ... | @@ -328,29 +337,7 @@ |
328 | 337 | |
329 | 338 | void vmalloc_sync_all(void) |
330 | 339 | { |
331 | - unsigned long address; | |
332 | - | |
333 | - for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END; | |
334 | - address += PGDIR_SIZE) { | |
335 | - | |
336 | - const pgd_t *pgd_ref = pgd_offset_k(address); | |
337 | - unsigned long flags; | |
338 | - struct page *page; | |
339 | - | |
340 | - if (pgd_none(*pgd_ref)) | |
341 | - continue; | |
342 | - | |
343 | - spin_lock_irqsave(&pgd_lock, flags); | |
344 | - list_for_each_entry(page, &pgd_list, lru) { | |
345 | - pgd_t *pgd; | |
346 | - pgd = (pgd_t *)page_address(page) + pgd_index(address); | |
347 | - if (pgd_none(*pgd)) | |
348 | - set_pgd(pgd, *pgd_ref); | |
349 | - else | |
350 | - BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); | |
351 | - } | |
352 | - spin_unlock_irqrestore(&pgd_lock, flags); | |
353 | - } | |
340 | + sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END); | |
354 | 341 | } |
355 | 342 | |
356 | 343 | /* |
357 | 344 | |
... | ... | @@ -898,8 +885,14 @@ |
898 | 885 | if (pmd_large(*pmd)) |
899 | 886 | return spurious_fault_check(error_code, (pte_t *) pmd); |
900 | 887 | |
888 | + /* | |
889 | + * Note: don't use pte_present() here, since it returns true | |
890 | + * if the _PAGE_PROTNONE bit is set. However, this aliases the | |
891 | + * _PAGE_GLOBAL bit, which for kernel pages give false positives | |
892 | + * when CONFIG_DEBUG_PAGEALLOC is used. | |
893 | + */ | |
901 | 894 | pte = pte_offset_kernel(pmd, address); |
902 | - if (!pte_present(*pte)) | |
895 | + if (!(pte_flags(*pte) & _PAGE_PRESENT)) | |
903 | 896 | return 0; |
904 | 897 | |
905 | 898 | ret = spurious_fault_check(error_code, pte); |
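
The PRESENT-bit change matters because pte_present() also accepts _PAGE_PROTNONE, and that bit shares a position with _PAGE_GLOBAL, which kernel mappings set; with CONFIG_DEBUG_PAGEALLOC a genuinely not-present kernel PTE can therefore look present. A standalone sketch, with bit positions mirroring pgtable_types.h:

/* Why spurious_fault must test _PAGE_PRESENT directly. */
#include <stdio.h>

#define _PAGE_PRESENT  (1UL << 0)
#define _PAGE_GLOBAL   (1UL << 8)
#define _PAGE_PROTNONE (1UL << 8)   /* aliases _PAGE_GLOBAL */

static int pte_present(unsigned long flags)
{
	return (flags & (_PAGE_PRESENT | _PAGE_PROTNONE)) != 0;
}

int main(void)
{
	/* a kernel page unmapped by DEBUG_PAGEALLOC: PRESENT cleared,
	 * GLOBAL still set */
	unsigned long flags = _PAGE_GLOBAL;

	printf("pte_present():         %d (false positive)\n", pte_present(flags));
	printf("flags & _PAGE_PRESENT: %d (correct)\n",
	       !!(flags & _PAGE_PRESENT));
	return 0;
}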
arch/x86/mm/init_64.c
... | ... | @@ -98,6 +98,43 @@ |
98 | 98 | __setup("noexec32=", nonx32_setup); |
99 | 99 | |
100 | 100 | /* |
101 | + * When memory was added/removed make sure all the processes MM have | |
102 | + * suitable PGD entries in the local PGD level page. | |
103 | + */ | |
104 | +void sync_global_pgds(unsigned long start, unsigned long end) | |
105 | +{ | |
106 | + unsigned long address; | |
107 | + | |
108 | + for (address = start; address <= end; address += PGDIR_SIZE) { | |
109 | + const pgd_t *pgd_ref = pgd_offset_k(address); | |
110 | + unsigned long flags; | |
111 | + struct page *page; | |
112 | + | |
113 | + if (pgd_none(*pgd_ref)) | |
114 | + continue; | |
115 | + | |
116 | + spin_lock_irqsave(&pgd_lock, flags); | |
117 | + list_for_each_entry(page, &pgd_list, lru) { | |
118 | + pgd_t *pgd; | |
119 | + spinlock_t *pgt_lock; | |
120 | + | |
121 | + pgd = (pgd_t *)page_address(page) + pgd_index(address); | |
122 | + pgt_lock = &pgd_page_get_mm(page)->page_table_lock; | |
123 | + spin_lock(pgt_lock); | |
124 | + | |
125 | + if (pgd_none(*pgd)) | |
126 | + set_pgd(pgd, *pgd_ref); | |
127 | + else | |
128 | + BUG_ON(pgd_page_vaddr(*pgd) | |
129 | + != pgd_page_vaddr(*pgd_ref)); | |
130 | + | |
131 | + spin_unlock(pgt_lock); | |
132 | + } | |
133 | + spin_unlock_irqrestore(&pgd_lock, flags); | |
134 | + } | |
135 | +} | |
136 | + | |
137 | +/* | |
101 | 138 | * NOTE: This function is marked __ref because it calls __init function |
102 | 139 | * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0. |
103 | 140 | */ |
104 | 141 | |
105 | 142 | |
... | ... | @@ -534,11 +571,13 @@ |
534 | 571 | unsigned long end, |
535 | 572 | unsigned long page_size_mask) |
536 | 573 | { |
537 | - | |
574 | + bool pgd_changed = false; | |
538 | 575 | unsigned long next, last_map_addr = end; |
576 | + unsigned long addr; | |
539 | 577 | |
540 | 578 | start = (unsigned long)__va(start); |
541 | 579 | end = (unsigned long)__va(end); |
580 | + addr = start; | |
542 | 581 | |
543 | 582 | for (; start < end; start = next) { |
544 | 583 | pgd_t *pgd = pgd_offset_k(start); |
545 | 584 | |
... | ... | @@ -563,7 +602,12 @@ |
563 | 602 | spin_lock(&init_mm.page_table_lock); |
564 | 603 | pgd_populate(&init_mm, pgd, __va(pud_phys)); |
565 | 604 | spin_unlock(&init_mm.page_table_lock); |
605 | + pgd_changed = true; | |
566 | 606 | } |
607 | + | |
608 | + if (pgd_changed) | |
609 | + sync_global_pgds(addr, end); | |
610 | + | |
567 | 611 | __flush_tlb_all(); |
568 | 612 | |
569 | 613 | return last_map_addr; |
... | ... | @@ -1003,6 +1047,7 @@ |
1003 | 1047 | } |
1004 | 1048 | |
1005 | 1049 | } |
1050 | + sync_global_pgds((unsigned long)start_page, end); | |
1006 | 1051 | return 0; |
1007 | 1052 | } |
1008 | 1053 |
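
sync_global_pgds() propagates top-level kernel entries from init_mm's PGD into every PGD on pgd_list, so mappings added for hotplug memory or vmemmap become visible to all processes. A standalone model of the same walk-and-copy logic, with toy table sizes, index-based iteration instead of PGDIR_SIZE address steps, and the locking elided:

/* Standalone model of the sync_global_pgds() walk. */
#include <stdio.h>
#include <assert.h>

#define PTRS_PER_PGD 8			/* toy size; x86-64 uses 512 */

typedef unsigned long pgd_t;

static pgd_t init_pgd[PTRS_PER_PGD];	  /* reference (init_mm) table */
static pgd_t proc_pgd[2][PTRS_PER_PGD];	  /* two "process" tables */

static void sync_global_pgds(int start_idx, int end_idx)
{
	for (int i = start_idx; i <= end_idx; i++) {
		if (!init_pgd[i])
			continue;		/* nothing to propagate */
		for (int p = 0; p < 2; p++) {
			if (!proc_pgd[p][i])
				proc_pgd[p][i] = init_pgd[i];
			else			/* BUG_ON() analogue */
				assert(proc_pgd[p][i] == init_pgd[i]);
		}
	}
}

int main(void)
{
	init_pgd[3] = 0xabc;		/* a new kernel mapping appears */
	sync_global_pgds(0, PTRS_PER_PGD - 1);
	printf("proc0 slot3 = %#lx\n", proc_pgd[0][3]);
	return 0;
}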
arch/x86/mm/kmemcheck/opcode.c
arch/x86/mm/pgtable.c
... | ... | @@ -87,8 +87,20 @@ |
87 | 87 | #define UNSHARED_PTRS_PER_PGD \ |
88 | 88 | (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD) |
89 | 89 | |
90 | -static void pgd_ctor(pgd_t *pgd) | |
90 | + | |
91 | +static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm) | |
91 | 92 | { |
93 | + BUILD_BUG_ON(sizeof(virt_to_page(pgd)->index) < sizeof(mm)); | |
94 | + virt_to_page(pgd)->index = (pgoff_t)mm; | |
95 | +} | |
96 | + | |
97 | +struct mm_struct *pgd_page_get_mm(struct page *page) | |
98 | +{ | |
99 | + return (struct mm_struct *)page->index; | |
100 | +} | |
101 | + | |
102 | +static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd) | |
103 | +{ | |
92 | 104 | /* If the pgd points to a shared pagetable level (either the |
93 | 105 | ptes in non-PAE, or shared PMD in PAE), then just copy the |
94 | 106 | references from swapper_pg_dir. */ |
95 | 107 | |
... | ... | @@ -105,8 +117,10 @@ |
105 | 117 | } |
106 | 118 | |
107 | 119 | /* list required to sync kernel mapping updates */ |
108 | - if (!SHARED_KERNEL_PMD) | |
120 | + if (!SHARED_KERNEL_PMD) { | |
121 | + pgd_set_mm(pgd, mm); | |
109 | 122 | pgd_list_add(pgd); |
123 | + } | |
110 | 124 | } |
111 | 125 | |
112 | 126 | static void pgd_dtor(pgd_t *pgd) |
... | ... | @@ -272,7 +286,7 @@ |
272 | 286 | */ |
273 | 287 | spin_lock_irqsave(&pgd_lock, flags); |
274 | 288 | |
275 | - pgd_ctor(pgd); | |
289 | + pgd_ctor(mm, pgd); | |
276 | 290 | pgd_prepopulate_pmd(mm, pgd, pmds); |
277 | 291 | |
278 | 292 | spin_unlock_irqrestore(&pgd_lock, flags); |
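
pgd_set_mm() works by stashing the mm pointer in the PGD page's otherwise-unused page->index field, guarded by a compile-time width check. A standalone sketch of the trick, using _Static_assert as the userspace analogue of BUILD_BUG_ON (compile with -std=c11):

/* Stash a back-pointer in an otherwise-unused struct field. */
#include <stdio.h>

struct mm_struct { int id; };

struct page {
	unsigned long index;	/* unused for PGD pages, so reuse it */
};

_Static_assert(sizeof(unsigned long) >= sizeof(struct mm_struct *),
	       "index field too narrow to hold a pointer");

static void pgd_set_mm(struct page *pg, struct mm_struct *mm)
{
	pg->index = (unsigned long)mm;
}

static struct mm_struct *pgd_page_get_mm(struct page *pg)
{
	return (struct mm_struct *)pg->index;
}

int main(void)
{
	struct mm_struct mm = { .id = 42 };
	struct page pg;

	pgd_set_mm(&pg, &mm);
	printf("mm id via page: %d\n", pgd_page_get_mm(&pg)->id);
	return 0;
}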
arch/x86/mm/tlb.c
... | ... | @@ -5,6 +5,7 @@ |
5 | 5 | #include <linux/smp.h> |
6 | 6 | #include <linux/interrupt.h> |
7 | 7 | #include <linux/module.h> |
8 | +#include <linux/cpu.h> | |
8 | 9 | |
9 | 10 | #include <asm/tlbflush.h> |
10 | 11 | #include <asm/mmu_context.h> |
... | ... | @@ -52,6 +53,8 @@ |
52 | 53 | want false sharing in the per cpu data segment. */ |
53 | 54 | static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS]; |
54 | 55 | |
56 | +static DEFINE_PER_CPU_READ_MOSTLY(int, tlb_vector_offset); | |
57 | + | |
55 | 58 | /* |
56 | 59 | * We cannot call mmdrop() because we are in interrupt context, |
57 | 60 | * instead update mm->cpu_vm_mask. |
... | ... | @@ -173,7 +176,7 @@ |
173 | 176 | union smp_flush_state *f; |
174 | 177 | |
175 | 178 | /* Caller has disabled preemption */ |
176 | - sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS; | |
179 | + sender = this_cpu_read(tlb_vector_offset); | |
177 | 180 | f = &flush_state[sender]; |
178 | 181 | |
179 | 182 | /* |
... | ... | @@ -218,6 +221,47 @@ |
218 | 221 | flush_tlb_others_ipi(cpumask, mm, va); |
219 | 222 | } |
220 | 223 | |
224 | +static void __cpuinit calculate_tlb_offset(void) | |
225 | +{ | |
226 | + int cpu, node, nr_node_vecs; | |
227 | + /* | |
228 | + * we are changing tlb_vector_offset for each CPU in runtime, but this | |
229 | + * will not cause inconsistency, as the write is atomic under X86. we | |
230 | + * might see more lock contentions in a short time, but after all CPU's | |
231 | + * tlb_vector_offset are changed, everything should go normal | |
232 | + * | |
233 | + * Note: if NUM_INVALIDATE_TLB_VECTORS % nr_online_nodes !=0, we might | |
234 | + * waste some vectors. | |
235 | + **/ | |
236 | + if (nr_online_nodes > NUM_INVALIDATE_TLB_VECTORS) | |
237 | + nr_node_vecs = 1; | |
238 | + else | |
239 | + nr_node_vecs = NUM_INVALIDATE_TLB_VECTORS/nr_online_nodes; | |
240 | + | |
241 | + for_each_online_node(node) { | |
242 | + int node_offset = (node % NUM_INVALIDATE_TLB_VECTORS) * | |
243 | + nr_node_vecs; | |
244 | + int cpu_offset = 0; | |
245 | + for_each_cpu(cpu, cpumask_of_node(node)) { | |
246 | + per_cpu(tlb_vector_offset, cpu) = node_offset + | |
247 | + cpu_offset; | |
248 | + cpu_offset++; | |
249 | + cpu_offset = cpu_offset % nr_node_vecs; | |
250 | + } | |
251 | + } | |
252 | +} | |
253 | + | |
254 | +static int tlb_cpuhp_notify(struct notifier_block *n, | |
255 | + unsigned long action, void *hcpu) | |
256 | +{ | |
257 | + switch (action & 0xf) { | |
258 | + case CPU_ONLINE: | |
259 | + case CPU_DEAD: | |
260 | + calculate_tlb_offset(); | |
261 | + } | |
262 | + return NOTIFY_OK; | |
263 | +} | |
264 | + | |
221 | 265 | static int __cpuinit init_smp_flush(void) |
222 | 266 | { |
223 | 267 | int i; |
... | ... | @@ -225,6 +269,8 @@ |
225 | 269 | for (i = 0; i < ARRAY_SIZE(flush_state); i++) |
226 | 270 | raw_spin_lock_init(&flush_state[i].tlbstate_lock); |
227 | 271 | |
272 | + calculate_tlb_offset(); | |
273 | + hotcpu_notifier(tlb_cpuhp_notify, 0); | |
228 | 274 | return 0; |
229 | 275 | } |
230 | 276 | core_initcall(init_smp_flush); |
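
calculate_tlb_offset() divides the invalidate vectors evenly among online nodes and round-robins each node's CPUs over that node's share, so CPUs on different nodes stop contending for the same flush_state lock. A standalone model of the same arithmetic with a made-up two-node topology:

/* Standalone model of calculate_tlb_offset(). */
#include <stdio.h>

#define NUM_INVALIDATE_TLB_VECTORS 8
#define NR_CPUS 8

static const int cpu_node[NR_CPUS] = { 0, 0, 0, 0, 1, 1, 1, 1 };
static int tlb_vector_offset[NR_CPUS];

static void calculate_tlb_offset(int nr_online_nodes)
{
	int nr_node_vecs;

	if (nr_online_nodes > NUM_INVALIDATE_TLB_VECTORS)
		nr_node_vecs = 1;	/* more nodes than vectors: share */
	else
		nr_node_vecs = NUM_INVALIDATE_TLB_VECTORS / nr_online_nodes;

	for (int node = 0; node < nr_online_nodes; node++) {
		int node_offset = (node % NUM_INVALIDATE_TLB_VECTORS) *
			nr_node_vecs;
		int cpu_offset = 0;
		for (int cpu = 0; cpu < NR_CPUS; cpu++) {
			if (cpu_node[cpu] != node)
				continue;
			tlb_vector_offset[cpu] = node_offset + cpu_offset;
			cpu_offset = (cpu_offset + 1) % nr_node_vecs;
		}
	}
}

int main(void)
{
	calculate_tlb_offset(2);
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		printf("cpu%d -> vector %d\n", cpu, tlb_vector_offset[cpu]);
	return 0;
}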
include/asm-generic/pgtable.h
... | ... | @@ -129,6 +129,10 @@ |
129 | 129 | #define move_pte(pte, prot, old_addr, new_addr) (pte) |
130 | 130 | #endif |
131 | 131 | |
132 | +#ifndef flush_tlb_fix_spurious_fault | |
133 | +#define flush_tlb_fix_spurious_fault(vma, address) flush_tlb_page(vma, address) | |
134 | +#endif | |
135 | + | |
132 | 136 | #ifndef pgprot_noncached |
133 | 137 | #define pgprot_noncached(prot) (prot) |
134 | 138 | #endif |
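
The generic header supplies the flush_tlb_page() default only when the architecture has not already defined flush_tlb_fix_spurious_fault(); x86 defines it empty (see the pgtable.h hunk earlier) because a spurious fault there needs no flush. A minimal standalone illustration of the #ifndef-override pattern:

/* "arch" header: uncomment to mimic x86's empty override */
/* #define flush_tlb_fix_spurious_fault(vma, address) */

#include <stdio.h>

/* "generic" header: default used only when no override exists */
#ifndef flush_tlb_fix_spurious_fault
#define flush_tlb_fix_spurious_fault(vma, address) \
	printf("flushing TLB for %s @ %#lx\n", (vma), (address))
#endif

int main(void)
{
	flush_tlb_fix_spurious_fault("vma0", 0x1000UL);
	return 0;
}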
include/asm-generic/vmlinux.lds.h
... | ... | @@ -687,7 +687,9 @@ |
687 | 687 | - LOAD_OFFSET) { \ |
688 | 688 | VMLINUX_SYMBOL(__per_cpu_start) = .; \ |
689 | 689 | *(.data..percpu..first) \ |
690 | + . = ALIGN(PAGE_SIZE); \ | |
690 | 691 | *(.data..percpu..page_aligned) \ |
692 | + *(.data..percpu..readmostly) \ | |
691 | 693 | *(.data..percpu) \ |
692 | 694 | *(.data..percpu..shared_aligned) \ |
693 | 695 | VMLINUX_SYMBOL(__per_cpu_end) = .; \ |
694 | 696 | |
... | ... | @@ -713,7 +715,9 @@ |
713 | 715 | VMLINUX_SYMBOL(__per_cpu_load) = .; \ |
714 | 716 | VMLINUX_SYMBOL(__per_cpu_start) = .; \ |
715 | 717 | *(.data..percpu..first) \ |
718 | + . = ALIGN(PAGE_SIZE); \ | |
716 | 719 | *(.data..percpu..page_aligned) \ |
720 | + *(.data..percpu..readmostly) \ | |
717 | 721 | *(.data..percpu) \ |
718 | 722 | *(.data..percpu..shared_aligned) \ |
719 | 723 | VMLINUX_SYMBOL(__per_cpu_end) = .; \ |
include/linux/percpu-defs.h
... | ... | @@ -139,6 +139,15 @@ |
139 | 139 | __aligned(PAGE_SIZE) |
140 | 140 | |
141 | 141 | /* |
142 | + * Declaration/definition used for per-CPU variables that must be read mostly. | |
143 | + */ | |
144 | +#define DECLARE_PER_CPU_READ_MOSTLY(type, name) \ | |
145 | + DECLARE_PER_CPU_SECTION(type, name, "..readmostly") | |
146 | + | |
147 | +#define DEFINE_PER_CPU_READ_MOSTLY(type, name) \ | |
148 | + DEFINE_PER_CPU_SECTION(type, name, "..readmostly") | |
149 | + | |
150 | +/* | |
142 | 151 | * Intermodule exports for per-CPU variables. sparse forgets about |
143 | 152 | * address space across EXPORT_SYMBOL(), change EXPORT_SYMBOL() to |
144 | 153 | * noop if __CHECKER__. |
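
Read-mostly per-CPU data gets its own input section so the linker can group it away from frequently written per-CPU variables and avoid false sharing on their cache lines; the vmlinux.lds.h hunks above place that section and page-align what follows the ..first section. A standalone sketch of the underlying mechanism using GCC's section attribute; the section name here is illustrative, not the kernel's:

/* Group rarely-written data into its own section. */
#include <stdio.h>

static int mostly_read __attribute__((section(".data.readmostly"))) = 42;
static int hot_counter;		/* frequently written */

int main(void)
{
	hot_counter++;
	printf("mostly_read=%d at %p, hot_counter at %p\n",
	       mostly_read, (void *)&mostly_read, (void *)&hot_counter);
	return 0;
}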
mm/memory.c
mm/vmalloc.c
... | ... | @@ -517,6 +517,15 @@ |
517 | 517 | static void purge_fragmented_blocks_allcpus(void); |
518 | 518 | |
519 | 519 | /* |
520 | + * called before a call to iounmap() if the caller wants vm_area_struct's | |
521 | + * immediately freed. | |
522 | + */ | |
523 | +void set_iounmap_nonlazy(void) | |
524 | +{ | |
525 | + atomic_set(&vmap_lazy_nr, lazy_max_pages()+1); | |
526 | +} | |
527 | + | |
528 | +/* | |
520 | 529 | * Purges all lazily-freed vmap areas. |
521 | 530 | * |
522 | 531 | * If sync is 0 then don't purge if there is already a purge in progress. |
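
vunmap work is normally batched: freed areas accumulate in vmap_lazy_nr until it crosses lazy_max_pages(). set_iounmap_nonlazy() pushes the counter past the threshold so the next iounmap(), as in the kdump copy path in crash_dump_64.c above, purges immediately instead of letting stale areas pile up. A standalone model with illustrative numbers:

/* Standalone model of the lazy-purge threshold. */
#include <stdio.h>

static int vmap_lazy_nr;

static int lazy_max_pages(void)
{
	return 32;			/* the kernel scales this with memory size */
}

static void set_iounmap_nonlazy(void)
{
	vmap_lazy_nr = lazy_max_pages() + 1;
}

static void free_lazy_area(int pages)
{
	vmap_lazy_nr += pages;
	if (vmap_lazy_nr > lazy_max_pages()) {
		printf("purging %d lazy pages now\n", vmap_lazy_nr);
		vmap_lazy_nr = 0;
	} else {
		printf("deferring purge (%d/%d pages)\n",
		       vmap_lazy_nr, lazy_max_pages());
	}
}

int main(void)
{
	free_lazy_area(4);		/* normally deferred */
	set_iounmap_nonlazy();		/* e.g. before iounmap() in kdump copy */
	free_lazy_area(4);		/* now purged immediately */
	return 0;
}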