Commit c3b86a29429dac1033e3f602f51fa8d00006a8eb

Authored by Linus Torvalds

Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip

* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
  x86-32, percpu: Correct the ordering of the percpu readmostly section
  x86, mm: Enable ARCH_DMA_ADDR_T_64BIT with X86_64 || HIGHMEM64G
  x86: Spread tlb flush vector between nodes
  percpu: Introduce a read-mostly percpu API
  x86, mm: Fix incorrect data type in vmalloc_sync_all()
  x86, mm: Hold mm->page_table_lock while doing vmalloc_sync
  x86, mm: Fix bogus whitespace in sync_global_pgds()
  x86-32: Fix sparse warning for the __PHYSICAL_MASK calculation
  x86, mm: Add RESERVE_BRK_ARRAY() helper
  mm, x86: Saving vmcore with non-lazy freeing of vmas
  x86, kdump: Change copy_oldmem_page() to use cached addressing
  x86, mm: fix uninitialized addr in kernel_physical_mapping_init()
  x86, kmemcheck: Remove double test
  x86, mm: Make spurious_fault check explicitly check the PRESENT bit
  x86-64, mem: Update all PGDs for direct mapping and vmemmap mapping changes
  x86, mm: Separate x86_64 vmalloc_sync_all() into separate functions
  x86, mm: Avoid unnecessary TLB flush

Showing 17 changed files

arch/x86/Kconfig
... ... @@ -1163,6 +1163,9 @@
1163 1163 config ARCH_PHYS_ADDR_T_64BIT
1164 1164 def_bool X86_64 || X86_PAE
1165 1165  
  1166 +config ARCH_DMA_ADDR_T_64BIT
  1167 + def_bool X86_64 || HIGHMEM64G
  1168 +
1166 1169 config DIRECT_GBPAGES
1167 1170 bool "Enable 1GB pages for kernel pagetables" if EMBEDDED
1168 1171 default y
arch/x86/include/asm/io.h
... ... @@ -206,6 +206,7 @@
206 206  
207 207 extern void iounmap(volatile void __iomem *addr);
208 208  
  209 +extern void set_iounmap_nonlazy(void);
209 210  
210 211 #ifdef __KERNEL__
211 212  
arch/x86/include/asm/page_types.h
... ... @@ -8,7 +8,7 @@
8 8 #define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
9 9 #define PAGE_MASK (~(PAGE_SIZE-1))
10 10  
11   -#define __PHYSICAL_MASK ((phys_addr_t)(1ULL << __PHYSICAL_MASK_SHIFT) - 1)
  11 +#define __PHYSICAL_MASK ((phys_addr_t)((1ULL << __PHYSICAL_MASK_SHIFT) - 1))
12 12 #define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1)
13 13  
14 14 /* Cast PAGE_MASK to a signed type so that it is sign-extended if
arch/x86/include/asm/pgtable.h
... ... @@ -28,6 +28,8 @@
28 28 extern spinlock_t pgd_lock;
29 29 extern struct list_head pgd_list;
30 30  
  31 +extern struct mm_struct *pgd_page_get_mm(struct page *page);
  32 +
31 33 #ifdef CONFIG_PARAVIRT
32 34 #include <asm/paravirt.h>
33 35 #else /* !CONFIG_PARAVIRT */
... ... @@ -602,6 +604,8 @@
602 604 clear_bit(_PAGE_BIT_RW, (unsigned long *)&ptep->pte);
603 605 pte_update(mm, addr, ptep);
604 606 }
  607 +
  608 +#define flush_tlb_fix_spurious_fault(vma, address)
605 609  
606 610 /*
607 611 * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
arch/x86/include/asm/pgtable_64.h
... ... @@ -102,6 +102,8 @@
102 102 native_set_pgd(pgd, native_make_pgd(0));
103 103 }
104 104  
  105 +extern void sync_global_pgds(unsigned long start, unsigned long end);
  106 +
105 107 /*
106 108 * Conversion functions: convert a page and protection to a page entry,
107 109 * and a page entry and page directory to the page they refer to.
arch/x86/include/asm/setup.h
... ... @@ -93,6 +93,11 @@
93 93 : : "i" (sz)); \
94 94 }
95 95  
  96 +/* Helper for reserving space for arrays of things */
  97 +#define RESERVE_BRK_ARRAY(type, name, entries) \
  98 + type *name; \
  99 + RESERVE_BRK(name, sizeof(type) * entries)
  100 +
96 101 #ifdef __i386__
97 102  
98 103 void __init i386_start_kernel(void);
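For context on the new RESERVE_BRK_ARRAY() helper above, a minimal usage sketch follows; the array name, element count, and init function are hypothetical, not part of this series. The macro declares a pointer and reserves matching brk space; early boot code then points the pointer at memory obtained with extend_brk():

	/* File scope: declare a pointer and reserve brk space for 16 pmd_t */
	RESERVE_BRK_ARRAY(pmd_t, example_pmds, 16);

	static void __init example_setup(void)
	{
		/* Populate the pointer from the brk area during early boot */
		example_pmds = extend_brk(sizeof(pmd_t) * 16, PAGE_SIZE);
		memset(example_pmds, 0, sizeof(pmd_t) * 16);
	}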
arch/x86/kernel/crash_dump_64.c
... ... @@ -34,7 +34,7 @@
34 34 if (!csize)
35 35 return 0;
36 36  
37   - vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE);
  37 + vaddr = ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE);
38 38 if (!vaddr)
39 39 return -ENOMEM;
40 40  
... ... @@ -46,6 +46,7 @@
46 46 } else
47 47 memcpy(buf, vaddr + offset, csize);
48 48  
  49 + set_iounmap_nonlazy();
49 50 iounmap(vaddr);
50 51 return csize;
51 52 }
arch/x86/mm/fault.c
... ... @@ -229,7 +229,16 @@
229 229  
230 230 spin_lock_irqsave(&pgd_lock, flags);
231 231 list_for_each_entry(page, &pgd_list, lru) {
232   - if (!vmalloc_sync_one(page_address(page), address))
  232 + spinlock_t *pgt_lock;
  233 + pmd_t *ret;
  234 +
  235 + pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
  236 +
  237 + spin_lock(pgt_lock);
  238 + ret = vmalloc_sync_one(page_address(page), address);
  239 + spin_unlock(pgt_lock);
  240 +
  241 + if (!ret)
233 242 break;
234 243 }
235 244 spin_unlock_irqrestore(&pgd_lock, flags);
... ... @@ -328,29 +337,7 @@
328 337  
329 338 void vmalloc_sync_all(void)
330 339 {
331   - unsigned long address;
332   -
333   - for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END;
334   - address += PGDIR_SIZE) {
335   -
336   - const pgd_t *pgd_ref = pgd_offset_k(address);
337   - unsigned long flags;
338   - struct page *page;
339   -
340   - if (pgd_none(*pgd_ref))
341   - continue;
342   -
343   - spin_lock_irqsave(&pgd_lock, flags);
344   - list_for_each_entry(page, &pgd_list, lru) {
345   - pgd_t *pgd;
346   - pgd = (pgd_t *)page_address(page) + pgd_index(address);
347   - if (pgd_none(*pgd))
348   - set_pgd(pgd, *pgd_ref);
349   - else
350   - BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
351   - }
352   - spin_unlock_irqrestore(&pgd_lock, flags);
353   - }
  340 + sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END);
354 341 }
355 342  
356 343 /*
357 344  
... ... @@ -898,8 +885,14 @@
898 885 if (pmd_large(*pmd))
899 886 return spurious_fault_check(error_code, (pte_t *) pmd);
900 887  
  888 + /*
  889 + * Note: don't use pte_present() here, since it returns true
  890 + * if the _PAGE_PROTNONE bit is set. However, this aliases the
  891 + * _PAGE_GLOBAL bit, which for kernel pages give false positives
  892 + * when CONFIG_DEBUG_PAGEALLOC is used.
  893 + */
901 894 pte = pte_offset_kernel(pmd, address);
902   - if (!pte_present(*pte))
  895 + if (!(pte_flags(*pte) & _PAGE_PRESENT))
903 896 return 0;
904 897  
905 898 ret = spurious_fault_check(error_code, pte);
arch/x86/mm/init_64.c
... ... @@ -98,6 +98,43 @@
98 98 __setup("noexec32=", nonx32_setup);
99 99  
100 100 /*
  101 + * When memory was added/removed make sure all the processes MM have
  102 + * suitable PGD entries in the local PGD level page.
  103 + */
  104 +void sync_global_pgds(unsigned long start, unsigned long end)
  105 +{
  106 + unsigned long address;
  107 +
  108 + for (address = start; address <= end; address += PGDIR_SIZE) {
  109 + const pgd_t *pgd_ref = pgd_offset_k(address);
  110 + unsigned long flags;
  111 + struct page *page;
  112 +
  113 + if (pgd_none(*pgd_ref))
  114 + continue;
  115 +
  116 + spin_lock_irqsave(&pgd_lock, flags);
  117 + list_for_each_entry(page, &pgd_list, lru) {
  118 + pgd_t *pgd;
  119 + spinlock_t *pgt_lock;
  120 +
  121 + pgd = (pgd_t *)page_address(page) + pgd_index(address);
  122 + pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
  123 + spin_lock(pgt_lock);
  124 +
  125 + if (pgd_none(*pgd))
  126 + set_pgd(pgd, *pgd_ref);
  127 + else
  128 + BUG_ON(pgd_page_vaddr(*pgd)
  129 + != pgd_page_vaddr(*pgd_ref));
  130 +
  131 + spin_unlock(pgt_lock);
  132 + }
  133 + spin_unlock_irqrestore(&pgd_lock, flags);
  134 + }
  135 +}
  136 +
  137 +/*
101 138 * NOTE: This function is marked __ref because it calls __init function
102 139 * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
103 140 */
104 141  
105 142  
... ... @@ -534,11 +571,13 @@
534 571 unsigned long end,
535 572 unsigned long page_size_mask)
536 573 {
537   -
  574 + bool pgd_changed = false;
538 575 unsigned long next, last_map_addr = end;
  576 + unsigned long addr;
539 577  
540 578 start = (unsigned long)__va(start);
541 579 end = (unsigned long)__va(end);
  580 + addr = start;
542 581  
543 582 for (; start < end; start = next) {
544 583 pgd_t *pgd = pgd_offset_k(start);
545 584  
... ... @@ -563,7 +602,12 @@
563 602 spin_lock(&init_mm.page_table_lock);
564 603 pgd_populate(&init_mm, pgd, __va(pud_phys));
565 604 spin_unlock(&init_mm.page_table_lock);
  605 + pgd_changed = true;
566 606 }
  607 +
  608 + if (pgd_changed)
  609 + sync_global_pgds(addr, end);
  610 +
567 611 __flush_tlb_all();
568 612  
569 613 return last_map_addr;
... ... @@ -1003,6 +1047,7 @@
1003 1047 }
1004 1048  
1005 1049 }
  1050 + sync_global_pgds((unsigned long)start_page, end);
1006 1051 return 0;
1007 1052 }
1008 1053  
arch/x86/mm/kmemcheck/opcode.c
... ... @@ -9,7 +9,7 @@
9 9 b == 0xf0 || b == 0xf2 || b == 0xf3
10 10 /* Group 2 */
11 11 || b == 0x2e || b == 0x36 || b == 0x3e || b == 0x26
12   - || b == 0x64 || b == 0x65 || b == 0x2e || b == 0x3e
  12 + || b == 0x64 || b == 0x65
13 13 /* Group 3 */
14 14 || b == 0x66
15 15 /* Group 4 */
arch/x86/mm/pgtable.c
... ... @@ -87,8 +87,20 @@
87 87 #define UNSHARED_PTRS_PER_PGD \
88 88 (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
89 89  
90   -static void pgd_ctor(pgd_t *pgd)
  90 +
  91 +static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
91 92 {
  93 + BUILD_BUG_ON(sizeof(virt_to_page(pgd)->index) < sizeof(mm));
  94 + virt_to_page(pgd)->index = (pgoff_t)mm;
  95 +}
  96 +
  97 +struct mm_struct *pgd_page_get_mm(struct page *page)
  98 +{
  99 + return (struct mm_struct *)page->index;
  100 +}
  101 +
  102 +static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
  103 +{
92 104 /* If the pgd points to a shared pagetable level (either the
93 105 ptes in non-PAE, or shared PMD in PAE), then just copy the
94 106 references from swapper_pg_dir. */
95 107  
... ... @@ -105,8 +117,10 @@
105 117 }
106 118  
107 119 /* list required to sync kernel mapping updates */
108   - if (!SHARED_KERNEL_PMD)
  120 + if (!SHARED_KERNEL_PMD) {
  121 + pgd_set_mm(pgd, mm);
109 122 pgd_list_add(pgd);
  123 + }
110 124 }
111 125  
112 126 static void pgd_dtor(pgd_t *pgd)
... ... @@ -272,7 +286,7 @@
272 286 */
273 287 spin_lock_irqsave(&pgd_lock, flags);
274 288  
275   - pgd_ctor(pgd);
  289 + pgd_ctor(mm, pgd);
276 290 pgd_prepopulate_pmd(mm, pgd, pmds);
277 291  
278 292 spin_unlock_irqrestore(&pgd_lock, flags);
arch/x86/mm/tlb.c
... ... @@ -5,6 +5,7 @@
5 5 #include <linux/smp.h>
6 6 #include <linux/interrupt.h>
7 7 #include <linux/module.h>
  8 +#include <linux/cpu.h>
8 9  
9 10 #include <asm/tlbflush.h>
10 11 #include <asm/mmu_context.h>
... ... @@ -52,6 +53,8 @@
52 53 want false sharing in the per cpu data segment. */
53 54 static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS];
54 55  
  56 +static DEFINE_PER_CPU_READ_MOSTLY(int, tlb_vector_offset);
  57 +
55 58 /*
56 59 * We cannot call mmdrop() because we are in interrupt context,
57 60 * instead update mm->cpu_vm_mask.
... ... @@ -173,7 +176,7 @@
173 176 union smp_flush_state *f;
174 177  
175 178 /* Caller has disabled preemption */
176   - sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
  179 + sender = this_cpu_read(tlb_vector_offset);
177 180 f = &flush_state[sender];
178 181  
179 182 /*
... ... @@ -218,6 +221,47 @@
218 221 flush_tlb_others_ipi(cpumask, mm, va);
219 222 }
220 223  
  224 +static void __cpuinit calculate_tlb_offset(void)
  225 +{
  226 + int cpu, node, nr_node_vecs;
  227 + /*
  228 + * we are changing tlb_vector_offset for each CPU in runtime, but this
  229 + * will not cause inconsistency, as the write is atomic under X86. we
  230 + * might see more lock contentions in a short time, but after all CPU's
  231 + * tlb_vector_offset are changed, everything should go normal
  232 + *
  233 + * Note: if NUM_INVALIDATE_TLB_VECTORS % nr_online_nodes !=0, we might
  234 + * waste some vectors.
  235 + **/
  236 + if (nr_online_nodes > NUM_INVALIDATE_TLB_VECTORS)
  237 + nr_node_vecs = 1;
  238 + else
  239 + nr_node_vecs = NUM_INVALIDATE_TLB_VECTORS/nr_online_nodes;
  240 +
  241 + for_each_online_node(node) {
  242 + int node_offset = (node % NUM_INVALIDATE_TLB_VECTORS) *
  243 + nr_node_vecs;
  244 + int cpu_offset = 0;
  245 + for_each_cpu(cpu, cpumask_of_node(node)) {
  246 + per_cpu(tlb_vector_offset, cpu) = node_offset +
  247 + cpu_offset;
  248 + cpu_offset++;
  249 + cpu_offset = cpu_offset % nr_node_vecs;
  250 + }
  251 + }
  252 +}
  253 +
  254 +static int tlb_cpuhp_notify(struct notifier_block *n,
  255 + unsigned long action, void *hcpu)
  256 +{
  257 + switch (action & 0xf) {
  258 + case CPU_ONLINE:
  259 + case CPU_DEAD:
  260 + calculate_tlb_offset();
  261 + }
  262 + return NOTIFY_OK;
  263 +}
  264 +
221 265 static int __cpuinit init_smp_flush(void)
222 266 {
223 267 int i;
... ... @@ -225,6 +269,8 @@
225 269 for (i = 0; i < ARRAY_SIZE(flush_state); i++)
226 270 raw_spin_lock_init(&flush_state[i].tlbstate_lock);
227 271  
  272 + calculate_tlb_offset();
  273 + hotcpu_notifier(tlb_cpuhp_notify, 0);
228 274 return 0;
229 275 }
230 276 core_initcall(init_smp_flush);
include/asm-generic/pgtable.h
... ... @@ -129,6 +129,10 @@
129 129 #define move_pte(pte, prot, old_addr, new_addr) (pte)
130 130 #endif
131 131  
  132 +#ifndef flush_tlb_fix_spurious_fault
  133 +#define flush_tlb_fix_spurious_fault(vma, address) flush_tlb_page(vma, address)
  134 +#endif
  135 +
132 136 #ifndef pgprot_noncached
133 137 #define pgprot_noncached(prot) (prot)
134 138 #endif
include/asm-generic/vmlinux.lds.h
... ... @@ -687,7 +687,9 @@
687 687 - LOAD_OFFSET) { \
688 688 VMLINUX_SYMBOL(__per_cpu_start) = .; \
689 689 *(.data..percpu..first) \
  690 + . = ALIGN(PAGE_SIZE); \
690 691 *(.data..percpu..page_aligned) \
  692 + *(.data..percpu..readmostly) \
691 693 *(.data..percpu) \
692 694 *(.data..percpu..shared_aligned) \
693 695 VMLINUX_SYMBOL(__per_cpu_end) = .; \
694 696  
... ... @@ -713,7 +715,9 @@
713 715 VMLINUX_SYMBOL(__per_cpu_load) = .; \
714 716 VMLINUX_SYMBOL(__per_cpu_start) = .; \
715 717 *(.data..percpu..first) \
  718 + . = ALIGN(PAGE_SIZE); \
716 719 *(.data..percpu..page_aligned) \
  720 + *(.data..percpu..readmostly) \
717 721 *(.data..percpu) \
718 722 *(.data..percpu..shared_aligned) \
719 723 VMLINUX_SYMBOL(__per_cpu_end) = .; \
include/linux/percpu-defs.h
... ... @@ -139,6 +139,15 @@
139 139 __aligned(PAGE_SIZE)
140 140  
141 141 /*
  142 + * Declaration/definition used for per-CPU variables that must be read mostly.
  143 + */
  144 +#define DECLARE_PER_CPU_READ_MOSTLY(type, name) \
  145 + DECLARE_PER_CPU_SECTION(type, name, "..readmostly")
  146 +
  147 +#define DEFINE_PER_CPU_READ_MOSTLY(type, name) \
  148 + DEFINE_PER_CPU_SECTION(type, name, "..readmostly")
  149 +
  150 +/*
142 151 * Intermodule exports for per-CPU variables. sparse forgets about
143 152 * address space across EXPORT_SYMBOL(), change EXPORT_SYMBOL() to
144 153 * noop if __CHECKER__.
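For context on the read-mostly percpu API introduced above, a minimal usage sketch follows; the variable and function names are hypothetical (the in-tree user added by this series is tlb_vector_offset in arch/x86/mm/tlb.c). Variables defined this way land in .data..percpu..readmostly, keeping frequently read, rarely written per-CPU data away from per-CPU data that is written on hot paths:

	#include <linux/percpu.h>

	/* Hypothetical per-CPU value: written rarely, read on hot paths */
	static DEFINE_PER_CPU_READ_MOSTLY(int, example_node_hint);

	static int example_get_hint(void)
	{
		/* Cheap read of this CPU's copy */
		return this_cpu_read(example_node_hint);
	}

	static void example_set_hint(int cpu, int hint)
	{
		/* Infrequent update, e.g. from a CPU hotplug notifier */
		per_cpu(example_node_hint, cpu) = hint;
	}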
mm/memory.c
... ... @@ -3185,7 +3185,7 @@
3185 3185 * with threads.
3186 3186 */
3187 3187 if (flags & FAULT_FLAG_WRITE)
3188   - flush_tlb_page(vma, address);
  3188 + flush_tlb_fix_spurious_fault(vma, address);
3189 3189 }
3190 3190 unlock:
3191 3191 pte_unmap_unlock(pte, ptl);
mm/vmalloc.c
... ... @@ -517,6 +517,15 @@
517 517 static void purge_fragmented_blocks_allcpus(void);
518 518  
519 519 /*
  520 + * called before a call to iounmap() if the caller wants vm_area_struct's
  521 + * immediately freed.
  522 + */
  523 +void set_iounmap_nonlazy(void)
  524 +{
  525 + atomic_set(&vmap_lazy_nr, lazy_max_pages()+1);
  526 +}
  527 +
  528 +/*
520 529 * Purges all lazily-freed vmap areas.
521 530 *
522 531 * If sync is 0 then don't purge if there is already a purge in progress.