Commit 505569d208e61ab14f4b87957be0970ab33eb319

Authored by Linus Torvalds

Merge branch 'x86-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 fixes from Ingo Molnar:
 "Misc fixes: two vdso fixes, two kbuild fixes and a boot failure fix
  with certain odd memory mappings"

* 'x86-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86, vdso: Use asm volatile in __getcpu
  x86/build: Clean auto-generated processor feature files
  x86: Fix mkcapflags.sh bash-ism
  x86: Fix step size adjustment during initial memory mapping
  x86_64, vdso: Fix the vdso address randomization algorithm

Showing 6 changed files (inline diff)

arch/x86/boot/Makefile
1 # 1 #
2 # arch/x86/boot/Makefile 2 # arch/x86/boot/Makefile
3 # 3 #
4 # This file is subject to the terms and conditions of the GNU General Public 4 # This file is subject to the terms and conditions of the GNU General Public
5 # License. See the file "COPYING" in the main directory of this archive 5 # License. See the file "COPYING" in the main directory of this archive
6 # for more details. 6 # for more details.
7 # 7 #
8 # Copyright (C) 1994 by Linus Torvalds 8 # Copyright (C) 1994 by Linus Torvalds
9 # Changed by many, many contributors over the years. 9 # Changed by many, many contributors over the years.
10 # 10 #
11 11
12 # If you want to preset the SVGA mode, uncomment the next line and 12 # If you want to preset the SVGA mode, uncomment the next line and
13 # set SVGA_MODE to whatever number you want. 13 # set SVGA_MODE to whatever number you want.
14 # Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode. 14 # Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode.
15 # The number is the same as you would ordinarily press at bootup. 15 # The number is the same as you would ordinarily press at bootup.
16 16
17 SVGA_MODE := -DSVGA_MODE=NORMAL_VGA 17 SVGA_MODE := -DSVGA_MODE=NORMAL_VGA
18 18
19 targets := vmlinux.bin setup.bin setup.elf bzImage 19 targets := vmlinux.bin setup.bin setup.elf bzImage
20 targets += fdimage fdimage144 fdimage288 image.iso mtools.conf 20 targets += fdimage fdimage144 fdimage288 image.iso mtools.conf
21 subdir- := compressed 21 subdir- := compressed
22 22
23 setup-y += a20.o bioscall.o cmdline.o copy.o cpu.o cpuflags.o cpucheck.o 23 setup-y += a20.o bioscall.o cmdline.o copy.o cpu.o cpuflags.o cpucheck.o
24 setup-y += early_serial_console.o edd.o header.o main.o mca.o memory.o 24 setup-y += early_serial_console.o edd.o header.o main.o mca.o memory.o
25 setup-y += pm.o pmjump.o printf.o regs.o string.o tty.o video.o 25 setup-y += pm.o pmjump.o printf.o regs.o string.o tty.o video.o
26 setup-y += video-mode.o version.o 26 setup-y += video-mode.o version.o
27 setup-$(CONFIG_X86_APM_BOOT) += apm.o 27 setup-$(CONFIG_X86_APM_BOOT) += apm.o
28 28
29 # The link order of the video-*.o modules can matter. In particular, 29 # The link order of the video-*.o modules can matter. In particular,
30 # video-vga.o *must* be listed first, followed by video-vesa.o. 30 # video-vga.o *must* be listed first, followed by video-vesa.o.
31 # Hardware-specific drivers should follow in the order they should be 31 # Hardware-specific drivers should follow in the order they should be
32 # probed, and video-bios.o should typically be last. 32 # probed, and video-bios.o should typically be last.
33 setup-y += video-vga.o 33 setup-y += video-vga.o
34 setup-y += video-vesa.o 34 setup-y += video-vesa.o
35 setup-y += video-bios.o 35 setup-y += video-bios.o
36 36
37 targets += $(setup-y) 37 targets += $(setup-y)
38 hostprogs-y := tools/build 38 hostprogs-y := tools/build
39 hostprogs-$(CONFIG_X86_FEATURE_NAMES) += mkcpustr 39 hostprogs-$(CONFIG_X86_FEATURE_NAMES) += mkcpustr
40 40
41 HOST_EXTRACFLAGS += -I$(srctree)/tools/include \ 41 HOST_EXTRACFLAGS += -I$(srctree)/tools/include \
42 -include include/generated/autoconf.h \ 42 -include include/generated/autoconf.h \
43 -D__EXPORTED_HEADERS__ 43 -D__EXPORTED_HEADERS__
44 44
45 ifdef CONFIG_X86_FEATURE_NAMES 45 ifdef CONFIG_X86_FEATURE_NAMES
46 $(obj)/cpu.o: $(obj)/cpustr.h 46 $(obj)/cpu.o: $(obj)/cpustr.h
47 47
48 quiet_cmd_cpustr = CPUSTR $@ 48 quiet_cmd_cpustr = CPUSTR $@
49 cmd_cpustr = $(obj)/mkcpustr > $@ 49 cmd_cpustr = $(obj)/mkcpustr > $@
50 targets += cpustr.h 50 targets += cpustr.h
51 $(obj)/cpustr.h: $(obj)/mkcpustr FORCE 51 $(obj)/cpustr.h: $(obj)/mkcpustr FORCE
52 $(call if_changed,cpustr) 52 $(call if_changed,cpustr)
53 endif 53 endif
54 clean-files += cpustr.h
54 55
55 # --------------------------------------------------------------------------- 56 # ---------------------------------------------------------------------------
56 57
57 KBUILD_CFLAGS := $(USERINCLUDE) $(REALMODE_CFLAGS) -D_SETUP 58 KBUILD_CFLAGS := $(USERINCLUDE) $(REALMODE_CFLAGS) -D_SETUP
58 KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ 59 KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
59 GCOV_PROFILE := n 60 GCOV_PROFILE := n
60 61
61 $(obj)/bzImage: asflags-y := $(SVGA_MODE) 62 $(obj)/bzImage: asflags-y := $(SVGA_MODE)
62 63
63 quiet_cmd_image = BUILD $@ 64 quiet_cmd_image = BUILD $@
64 cmd_image = $(obj)/tools/build $(obj)/setup.bin $(obj)/vmlinux.bin \ 65 cmd_image = $(obj)/tools/build $(obj)/setup.bin $(obj)/vmlinux.bin \
65 $(obj)/zoffset.h $@ 66 $(obj)/zoffset.h $@
66 67
67 $(obj)/bzImage: $(obj)/setup.bin $(obj)/vmlinux.bin $(obj)/tools/build FORCE 68 $(obj)/bzImage: $(obj)/setup.bin $(obj)/vmlinux.bin $(obj)/tools/build FORCE
68 $(call if_changed,image) 69 $(call if_changed,image)
69 @echo 'Kernel: $@ is ready' ' (#'`cat .version`')' 70 @echo 'Kernel: $@ is ready' ' (#'`cat .version`')'
70 71
71 OBJCOPYFLAGS_vmlinux.bin := -O binary -R .note -R .comment -S 72 OBJCOPYFLAGS_vmlinux.bin := -O binary -R .note -R .comment -S
72 $(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE 73 $(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE
73 $(call if_changed,objcopy) 74 $(call if_changed,objcopy)
74 75
75 SETUP_OBJS = $(addprefix $(obj)/,$(setup-y)) 76 SETUP_OBJS = $(addprefix $(obj)/,$(setup-y))
76 77
77 sed-voffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(_text\|_end\)$$/\#define VO_\2 0x\1/p' 78 sed-voffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(_text\|_end\)$$/\#define VO_\2 0x\1/p'
78 79
79 quiet_cmd_voffset = VOFFSET $@ 80 quiet_cmd_voffset = VOFFSET $@
80 cmd_voffset = $(NM) $< | sed -n $(sed-voffset) > $@ 81 cmd_voffset = $(NM) $< | sed -n $(sed-voffset) > $@
81 82
82 targets += voffset.h 83 targets += voffset.h
83 $(obj)/voffset.h: vmlinux FORCE 84 $(obj)/voffset.h: vmlinux FORCE
84 $(call if_changed,voffset) 85 $(call if_changed,voffset)
85 86
86 sed-zoffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(startup_32\|startup_64\|efi32_stub_entry\|efi64_stub_entry\|efi_pe_entry\|input_data\|_end\|z_.*\)$$/\#define ZO_\2 0x\1/p' 87 sed-zoffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(startup_32\|startup_64\|efi32_stub_entry\|efi64_stub_entry\|efi_pe_entry\|input_data\|_end\|z_.*\)$$/\#define ZO_\2 0x\1/p'
87 88
88 quiet_cmd_zoffset = ZOFFSET $@ 89 quiet_cmd_zoffset = ZOFFSET $@
89 cmd_zoffset = $(NM) $< | sed -n $(sed-zoffset) > $@ 90 cmd_zoffset = $(NM) $< | sed -n $(sed-zoffset) > $@
90 91
91 targets += zoffset.h 92 targets += zoffset.h
92 $(obj)/zoffset.h: $(obj)/compressed/vmlinux FORCE 93 $(obj)/zoffset.h: $(obj)/compressed/vmlinux FORCE
93 $(call if_changed,zoffset) 94 $(call if_changed,zoffset)
94 95
95 96
96 AFLAGS_header.o += -I$(obj) 97 AFLAGS_header.o += -I$(obj)
97 $(obj)/header.o: $(obj)/voffset.h $(obj)/zoffset.h 98 $(obj)/header.o: $(obj)/voffset.h $(obj)/zoffset.h
98 99
99 LDFLAGS_setup.elf := -T 100 LDFLAGS_setup.elf := -T
100 $(obj)/setup.elf: $(src)/setup.ld $(SETUP_OBJS) FORCE 101 $(obj)/setup.elf: $(src)/setup.ld $(SETUP_OBJS) FORCE
101 $(call if_changed,ld) 102 $(call if_changed,ld)
102 103
103 OBJCOPYFLAGS_setup.bin := -O binary 104 OBJCOPYFLAGS_setup.bin := -O binary
104 $(obj)/setup.bin: $(obj)/setup.elf FORCE 105 $(obj)/setup.bin: $(obj)/setup.elf FORCE
105 $(call if_changed,objcopy) 106 $(call if_changed,objcopy)
106 107
107 $(obj)/compressed/vmlinux: FORCE 108 $(obj)/compressed/vmlinux: FORCE
108 $(Q)$(MAKE) $(build)=$(obj)/compressed $@ 109 $(Q)$(MAKE) $(build)=$(obj)/compressed $@
109 110
110 # Set this if you want to pass append arguments to the 111 # Set this if you want to pass append arguments to the
111 # bzdisk/fdimage/isoimage kernel 112 # bzdisk/fdimage/isoimage kernel
112 FDARGS = 113 FDARGS =
113 # Set this if you want an initrd included with the 114 # Set this if you want an initrd included with the
114 # bzdisk/fdimage/isoimage kernel 115 # bzdisk/fdimage/isoimage kernel
115 FDINITRD = 116 FDINITRD =
116 117
117 image_cmdline = default linux $(FDARGS) $(if $(FDINITRD),initrd=initrd.img,) 118 image_cmdline = default linux $(FDARGS) $(if $(FDINITRD),initrd=initrd.img,)
118 119
119 $(obj)/mtools.conf: $(src)/mtools.conf.in 120 $(obj)/mtools.conf: $(src)/mtools.conf.in
120 sed -e 's|@OBJ@|$(obj)|g' < $< > $@ 121 sed -e 's|@OBJ@|$(obj)|g' < $< > $@
121 122
122 # This requires write access to /dev/fd0 123 # This requires write access to /dev/fd0
123 bzdisk: $(obj)/bzImage $(obj)/mtools.conf 124 bzdisk: $(obj)/bzImage $(obj)/mtools.conf
124 MTOOLSRC=$(obj)/mtools.conf mformat a: ; sync 125 MTOOLSRC=$(obj)/mtools.conf mformat a: ; sync
125 syslinux /dev/fd0 ; sync 126 syslinux /dev/fd0 ; sync
126 echo '$(image_cmdline)' | \ 127 echo '$(image_cmdline)' | \
127 MTOOLSRC=$(src)/mtools.conf mcopy - a:syslinux.cfg 128 MTOOLSRC=$(src)/mtools.conf mcopy - a:syslinux.cfg
128 if [ -f '$(FDINITRD)' ] ; then \ 129 if [ -f '$(FDINITRD)' ] ; then \
129 MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' a:initrd.img ; \ 130 MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' a:initrd.img ; \
130 fi 131 fi
131 MTOOLSRC=$(obj)/mtools.conf mcopy $(obj)/bzImage a:linux ; sync 132 MTOOLSRC=$(obj)/mtools.conf mcopy $(obj)/bzImage a:linux ; sync
132 133
133 # These require being root or having syslinux 2.02 or higher installed 134 # These require being root or having syslinux 2.02 or higher installed
134 fdimage fdimage144: $(obj)/bzImage $(obj)/mtools.conf 135 fdimage fdimage144: $(obj)/bzImage $(obj)/mtools.conf
135 dd if=/dev/zero of=$(obj)/fdimage bs=1024 count=1440 136 dd if=/dev/zero of=$(obj)/fdimage bs=1024 count=1440
136 MTOOLSRC=$(obj)/mtools.conf mformat v: ; sync 137 MTOOLSRC=$(obj)/mtools.conf mformat v: ; sync
137 syslinux $(obj)/fdimage ; sync 138 syslinux $(obj)/fdimage ; sync
138 echo '$(image_cmdline)' | \ 139 echo '$(image_cmdline)' | \
139 MTOOLSRC=$(obj)/mtools.conf mcopy - v:syslinux.cfg 140 MTOOLSRC=$(obj)/mtools.conf mcopy - v:syslinux.cfg
140 if [ -f '$(FDINITRD)' ] ; then \ 141 if [ -f '$(FDINITRD)' ] ; then \
141 MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' v:initrd.img ; \ 142 MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' v:initrd.img ; \
142 fi 143 fi
143 MTOOLSRC=$(obj)/mtools.conf mcopy $(obj)/bzImage v:linux ; sync 144 MTOOLSRC=$(obj)/mtools.conf mcopy $(obj)/bzImage v:linux ; sync
144 145
145 fdimage288: $(obj)/bzImage $(obj)/mtools.conf 146 fdimage288: $(obj)/bzImage $(obj)/mtools.conf
146 dd if=/dev/zero of=$(obj)/fdimage bs=1024 count=2880 147 dd if=/dev/zero of=$(obj)/fdimage bs=1024 count=2880
147 MTOOLSRC=$(obj)/mtools.conf mformat w: ; sync 148 MTOOLSRC=$(obj)/mtools.conf mformat w: ; sync
148 syslinux $(obj)/fdimage ; sync 149 syslinux $(obj)/fdimage ; sync
149 echo '$(image_cmdline)' | \ 150 echo '$(image_cmdline)' | \
150 MTOOLSRC=$(obj)/mtools.conf mcopy - w:syslinux.cfg 151 MTOOLSRC=$(obj)/mtools.conf mcopy - w:syslinux.cfg
151 if [ -f '$(FDINITRD)' ] ; then \ 152 if [ -f '$(FDINITRD)' ] ; then \
152 MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' w:initrd.img ; \ 153 MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' w:initrd.img ; \
153 fi 154 fi
154 MTOOLSRC=$(obj)/mtools.conf mcopy $(obj)/bzImage w:linux ; sync 155 MTOOLSRC=$(obj)/mtools.conf mcopy $(obj)/bzImage w:linux ; sync
155 156
156 isoimage: $(obj)/bzImage 157 isoimage: $(obj)/bzImage
157 -rm -rf $(obj)/isoimage 158 -rm -rf $(obj)/isoimage
158 mkdir $(obj)/isoimage 159 mkdir $(obj)/isoimage
159 for i in lib lib64 share end ; do \ 160 for i in lib lib64 share end ; do \
160 if [ -f /usr/$$i/syslinux/isolinux.bin ] ; then \ 161 if [ -f /usr/$$i/syslinux/isolinux.bin ] ; then \
161 cp /usr/$$i/syslinux/isolinux.bin $(obj)/isoimage ; \ 162 cp /usr/$$i/syslinux/isolinux.bin $(obj)/isoimage ; \
162 break ; \ 163 break ; \
163 fi ; \ 164 fi ; \
164 if [ $$i = end ] ; then exit 1 ; fi ; \ 165 if [ $$i = end ] ; then exit 1 ; fi ; \
165 done 166 done
166 cp $(obj)/bzImage $(obj)/isoimage/linux 167 cp $(obj)/bzImage $(obj)/isoimage/linux
167 echo '$(image_cmdline)' > $(obj)/isoimage/isolinux.cfg 168 echo '$(image_cmdline)' > $(obj)/isoimage/isolinux.cfg
168 if [ -f '$(FDINITRD)' ] ; then \ 169 if [ -f '$(FDINITRD)' ] ; then \
169 cp '$(FDINITRD)' $(obj)/isoimage/initrd.img ; \ 170 cp '$(FDINITRD)' $(obj)/isoimage/initrd.img ; \
170 fi 171 fi
171 mkisofs -J -r -o $(obj)/image.iso -b isolinux.bin -c boot.cat \ 172 mkisofs -J -r -o $(obj)/image.iso -b isolinux.bin -c boot.cat \
172 -no-emul-boot -boot-load-size 4 -boot-info-table \ 173 -no-emul-boot -boot-load-size 4 -boot-info-table \
173 $(obj)/isoimage 174 $(obj)/isoimage
174 isohybrid $(obj)/image.iso 2>/dev/null || true 175 isohybrid $(obj)/image.iso 2>/dev/null || true
175 rm -rf $(obj)/isoimage 176 rm -rf $(obj)/isoimage
176 177
177 bzlilo: $(obj)/bzImage 178 bzlilo: $(obj)/bzImage
178 if [ -f $(INSTALL_PATH)/vmlinuz ]; then mv $(INSTALL_PATH)/vmlinuz $(INSTALL_PATH)/vmlinuz.old; fi 179 if [ -f $(INSTALL_PATH)/vmlinuz ]; then mv $(INSTALL_PATH)/vmlinuz $(INSTALL_PATH)/vmlinuz.old; fi
179 if [ -f $(INSTALL_PATH)/System.map ]; then mv $(INSTALL_PATH)/System.map $(INSTALL_PATH)/System.old; fi 180 if [ -f $(INSTALL_PATH)/System.map ]; then mv $(INSTALL_PATH)/System.map $(INSTALL_PATH)/System.old; fi
180 cat $(obj)/bzImage > $(INSTALL_PATH)/vmlinuz 181 cat $(obj)/bzImage > $(INSTALL_PATH)/vmlinuz
181 cp System.map $(INSTALL_PATH)/ 182 cp System.map $(INSTALL_PATH)/
182 if [ -x /sbin/lilo ]; then /sbin/lilo; else /etc/lilo/install; fi 183 if [ -x /sbin/lilo ]; then /sbin/lilo; else /etc/lilo/install; fi
183 184
184 install: 185 install:
185 sh $(srctree)/$(src)/install.sh $(KERNELRELEASE) $(obj)/bzImage \ 186 sh $(srctree)/$(src)/install.sh $(KERNELRELEASE) $(obj)/bzImage \
186 System.map "$(INSTALL_PATH)" 187 System.map "$(INSTALL_PATH)"
187 188
arch/x86/include/asm/vgtod.h
1 #ifndef _ASM_X86_VGTOD_H 1 #ifndef _ASM_X86_VGTOD_H
2 #define _ASM_X86_VGTOD_H 2 #define _ASM_X86_VGTOD_H
3 3
4 #include <linux/compiler.h> 4 #include <linux/compiler.h>
5 #include <linux/clocksource.h> 5 #include <linux/clocksource.h>
6 6
7 #ifdef BUILD_VDSO32_64 7 #ifdef BUILD_VDSO32_64
8 typedef u64 gtod_long_t; 8 typedef u64 gtod_long_t;
9 #else 9 #else
10 typedef unsigned long gtod_long_t; 10 typedef unsigned long gtod_long_t;
11 #endif 11 #endif
12 /* 12 /*
13 * vsyscall_gtod_data will be accessed by 32 and 64 bit code at the same time 13 * vsyscall_gtod_data will be accessed by 32 and 64 bit code at the same time
14 * so be careful when modifying this structure. 14 * so be careful when modifying this structure.
15 */ 15 */
16 struct vsyscall_gtod_data { 16 struct vsyscall_gtod_data {
17 unsigned seq; 17 unsigned seq;
18 18
19 int vclock_mode; 19 int vclock_mode;
20 cycle_t cycle_last; 20 cycle_t cycle_last;
21 cycle_t mask; 21 cycle_t mask;
22 u32 mult; 22 u32 mult;
23 u32 shift; 23 u32 shift;
24 24
25 /* open coded 'struct timespec' */ 25 /* open coded 'struct timespec' */
26 u64 wall_time_snsec; 26 u64 wall_time_snsec;
27 gtod_long_t wall_time_sec; 27 gtod_long_t wall_time_sec;
28 gtod_long_t monotonic_time_sec; 28 gtod_long_t monotonic_time_sec;
29 u64 monotonic_time_snsec; 29 u64 monotonic_time_snsec;
30 gtod_long_t wall_time_coarse_sec; 30 gtod_long_t wall_time_coarse_sec;
31 gtod_long_t wall_time_coarse_nsec; 31 gtod_long_t wall_time_coarse_nsec;
32 gtod_long_t monotonic_time_coarse_sec; 32 gtod_long_t monotonic_time_coarse_sec;
33 gtod_long_t monotonic_time_coarse_nsec; 33 gtod_long_t monotonic_time_coarse_nsec;
34 34
35 int tz_minuteswest; 35 int tz_minuteswest;
36 int tz_dsttime; 36 int tz_dsttime;
37 }; 37 };
38 extern struct vsyscall_gtod_data vsyscall_gtod_data; 38 extern struct vsyscall_gtod_data vsyscall_gtod_data;
39 39
40 static inline unsigned gtod_read_begin(const struct vsyscall_gtod_data *s) 40 static inline unsigned gtod_read_begin(const struct vsyscall_gtod_data *s)
41 { 41 {
42 unsigned ret; 42 unsigned ret;
43 43
44 repeat: 44 repeat:
45 ret = ACCESS_ONCE(s->seq); 45 ret = ACCESS_ONCE(s->seq);
46 if (unlikely(ret & 1)) { 46 if (unlikely(ret & 1)) {
47 cpu_relax(); 47 cpu_relax();
48 goto repeat; 48 goto repeat;
49 } 49 }
50 smp_rmb(); 50 smp_rmb();
51 return ret; 51 return ret;
52 } 52 }
53 53
54 static inline int gtod_read_retry(const struct vsyscall_gtod_data *s, 54 static inline int gtod_read_retry(const struct vsyscall_gtod_data *s,
55 unsigned start) 55 unsigned start)
56 { 56 {
57 smp_rmb(); 57 smp_rmb();
58 return unlikely(s->seq != start); 58 return unlikely(s->seq != start);
59 } 59 }
60 60
61 static inline void gtod_write_begin(struct vsyscall_gtod_data *s) 61 static inline void gtod_write_begin(struct vsyscall_gtod_data *s)
62 { 62 {
63 ++s->seq; 63 ++s->seq;
64 smp_wmb(); 64 smp_wmb();
65 } 65 }
66 66
67 static inline void gtod_write_end(struct vsyscall_gtod_data *s) 67 static inline void gtod_write_end(struct vsyscall_gtod_data *s)
68 { 68 {
69 smp_wmb(); 69 smp_wmb();
70 ++s->seq; 70 ++s->seq;
71 } 71 }
72 72
73 #ifdef CONFIG_X86_64 73 #ifdef CONFIG_X86_64
74 74
75 #define VGETCPU_CPU_MASK 0xfff 75 #define VGETCPU_CPU_MASK 0xfff
76 76
77 static inline unsigned int __getcpu(void) 77 static inline unsigned int __getcpu(void)
78 { 78 {
79 unsigned int p; 79 unsigned int p;
80 80
81 /* 81 /*
82 * Load per CPU data from GDT. LSL is faster than RDTSCP and 82 * Load per CPU data from GDT. LSL is faster than RDTSCP and
83 * works on all CPUs. 83 * works on all CPUs. This is volatile so that it orders
84 * correctly wrt barrier() and to keep gcc from cleverly
85 * hoisting it out of the calling function.
84 */ 86 */
85 asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG)); 87 asm volatile ("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
86 88
87 return p; 89 return p;
88 } 90 }
89 91
90 #endif /* CONFIG_X86_64 */ 92 #endif /* CONFIG_X86_64 */
91 93
92 #endif /* _ASM_X86_VGTOD_H */ 94 #endif /* _ASM_X86_VGTOD_H */
93 95
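As an aside on the __getcpu change above: LSL reads the segment limit of the per-CPU GDT entry, which the kernel fills with the CPU number in the low 12 bits (VGETCPU_CPU_MASK) and the node number above them, and "asm volatile" keeps gcc from caching or hoisting that read across barrier(). Below is a minimal user-space sketch of that decoding, not the kernel's vDSO source; the PER_CPU_SELECTOR constant stands in for __PER_CPU_SEG and is an assumption made for illustration only.

    #include <stdio.h>

    #define VGETCPU_CPU_MASK 0xfff
    #define PER_CPU_SELECTOR ((15 * 8) + 3)  /* assumed __PER_CPU_SEG value */

    static inline unsigned int getcpu_raw(void)
    {
        unsigned int p;

        /* LSL loads the per-CPU segment limit: (node << 12) | cpu.
         * "volatile" prevents the compiler from CSE-ing or hoisting this
         * asm out of the caller, which is what the patch above addresses. */
        asm volatile ("lsl %1,%0" : "=r" (p) : "r" (PER_CPU_SELECTOR));
        return p;
    }

    int main(void)
    {
        unsigned int p = getcpu_raw();

        printf("cpu=%u node=%u\n", p & VGETCPU_CPU_MASK, p >> 12);
        return 0;
    }
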
arch/x86/kernel/cpu/Makefile
1 # 1 #
2 # Makefile for x86-compatible CPU details, features and quirks 2 # Makefile for x86-compatible CPU details, features and quirks
3 # 3 #
4 4
5 # Don't trace early stages of a secondary CPU boot 5 # Don't trace early stages of a secondary CPU boot
6 ifdef CONFIG_FUNCTION_TRACER 6 ifdef CONFIG_FUNCTION_TRACER
7 CFLAGS_REMOVE_common.o = -pg 7 CFLAGS_REMOVE_common.o = -pg
8 CFLAGS_REMOVE_perf_event.o = -pg 8 CFLAGS_REMOVE_perf_event.o = -pg
9 endif 9 endif
10 10
11 # Make sure load_percpu_segment has no stackprotector 11 # Make sure load_percpu_segment has no stackprotector
12 nostackp := $(call cc-option, -fno-stack-protector) 12 nostackp := $(call cc-option, -fno-stack-protector)
13 CFLAGS_common.o := $(nostackp) 13 CFLAGS_common.o := $(nostackp)
14 14
15 obj-y := intel_cacheinfo.o scattered.o topology.o 15 obj-y := intel_cacheinfo.o scattered.o topology.o
16 obj-y += common.o 16 obj-y += common.o
17 obj-y += rdrand.o 17 obj-y += rdrand.o
18 obj-y += match.o 18 obj-y += match.o
19 19
20 obj-$(CONFIG_PROC_FS) += proc.o 20 obj-$(CONFIG_PROC_FS) += proc.o
21 obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o 21 obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o
22 22
23 obj-$(CONFIG_X86_32) += bugs.o 23 obj-$(CONFIG_X86_32) += bugs.o
24 obj-$(CONFIG_X86_64) += bugs_64.o 24 obj-$(CONFIG_X86_64) += bugs_64.o
25 25
26 obj-$(CONFIG_CPU_SUP_INTEL) += intel.o 26 obj-$(CONFIG_CPU_SUP_INTEL) += intel.o
27 obj-$(CONFIG_CPU_SUP_AMD) += amd.o 27 obj-$(CONFIG_CPU_SUP_AMD) += amd.o
28 obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o 28 obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o
29 obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o 29 obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o
30 obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o 30 obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
31 obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o 31 obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
32 32
33 obj-$(CONFIG_PERF_EVENTS) += perf_event.o 33 obj-$(CONFIG_PERF_EVENTS) += perf_event.o
34 34
35 ifdef CONFIG_PERF_EVENTS 35 ifdef CONFIG_PERF_EVENTS
36 obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd.o perf_event_amd_uncore.o 36 obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd.o perf_event_amd_uncore.o
37 ifdef CONFIG_AMD_IOMMU 37 ifdef CONFIG_AMD_IOMMU
38 obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd_iommu.o 38 obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd_iommu.o
39 endif 39 endif
40 obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o 40 obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o
41 obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o 41 obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o
42 obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_rapl.o 42 obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_rapl.o
43 43
44 obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += perf_event_intel_uncore.o \ 44 obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += perf_event_intel_uncore.o \
45 perf_event_intel_uncore_snb.o \ 45 perf_event_intel_uncore_snb.o \
46 perf_event_intel_uncore_snbep.o \ 46 perf_event_intel_uncore_snbep.o \
47 perf_event_intel_uncore_nhmex.o 47 perf_event_intel_uncore_nhmex.o
48 endif 48 endif
49 49
50 50
51 obj-$(CONFIG_X86_MCE) += mcheck/ 51 obj-$(CONFIG_X86_MCE) += mcheck/
52 obj-$(CONFIG_MTRR) += mtrr/ 52 obj-$(CONFIG_MTRR) += mtrr/
53 obj-$(CONFIG_MICROCODE) += microcode/ 53 obj-$(CONFIG_MICROCODE) += microcode/
54 54
55 obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o perf_event_amd_ibs.o 55 obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o perf_event_amd_ibs.o
56 56
57 obj-$(CONFIG_HYPERVISOR_GUEST) += vmware.o hypervisor.o mshyperv.o 57 obj-$(CONFIG_HYPERVISOR_GUEST) += vmware.o hypervisor.o mshyperv.o
58 58
59 ifdef CONFIG_X86_FEATURE_NAMES 59 ifdef CONFIG_X86_FEATURE_NAMES
60 quiet_cmd_mkcapflags = MKCAP $@ 60 quiet_cmd_mkcapflags = MKCAP $@
61 cmd_mkcapflags = $(CONFIG_SHELL) $(srctree)/$(src)/mkcapflags.sh $< $@ 61 cmd_mkcapflags = $(CONFIG_SHELL) $(srctree)/$(src)/mkcapflags.sh $< $@
62 62
63 cpufeature = $(src)/../../include/asm/cpufeature.h 63 cpufeature = $(src)/../../include/asm/cpufeature.h
64 64
65 targets += capflags.c 65 targets += capflags.c
66 $(obj)/capflags.c: $(cpufeature) $(src)/mkcapflags.sh FORCE 66 $(obj)/capflags.c: $(cpufeature) $(src)/mkcapflags.sh FORCE
67 $(call if_changed,mkcapflags) 67 $(call if_changed,mkcapflags)
68 endif 68 endif
69 clean-files += capflags.c
69 70
arch/x86/kernel/cpu/mkcapflags.sh
1 #!/bin/sh 1 #!/bin/sh
2 # 2 #
3 # Generate the x86_cap/bug_flags[] arrays from include/asm/cpufeature.h 3 # Generate the x86_cap/bug_flags[] arrays from include/asm/cpufeature.h
4 # 4 #
5 5
6 IN=$1 6 IN=$1
7 OUT=$2 7 OUT=$2
8 8
9 function dump_array() 9 function dump_array()
10 { 10 {
11 ARRAY=$1 11 ARRAY=$1
12 SIZE=$2 12 SIZE=$2
13 PFX=$3 13 PFX=$3
14 POSTFIX=$4 14 POSTFIX=$4
15 15
16 PFX_SZ=$(echo $PFX | wc -c) 16 PFX_SZ=$(echo $PFX | wc -c)
17 TABS="$(printf '\t\t\t\t\t')" 17 TABS="$(printf '\t\t\t\t\t')"
18 18
19 echo "const char * const $ARRAY[$SIZE] = {" 19 echo "const char * const $ARRAY[$SIZE] = {"
20 20
21 # Iterate through any input lines starting with #define $PFX 21 # Iterate through any input lines starting with #define $PFX
22 sed -n -e 's/\t/ /g' -e "s/^ *# *define *$PFX//p" $IN | 22 sed -n -e 's/\t/ /g' -e "s/^ *# *define *$PFX//p" $IN |
23 while read i 23 while read i
24 do 24 do
25 # Name is everything up to the first whitespace 25 # Name is everything up to the first whitespace
26 NAME="$(echo "$i" | sed 's/ .*//')" 26 NAME="$(echo "$i" | sed 's/ .*//')"
27 27
28 # If the /* comment */ starts with a quote string, grab that. 28 # If the /* comment */ starts with a quote string, grab that.
29 VALUE="$(echo "$i" | sed -n 's@.*/\* *\("[^"]*"\).*\*/@\1@p')" 29 VALUE="$(echo "$i" | sed -n 's@.*/\* *\("[^"]*"\).*\*/@\1@p')"
30 [ -z "$VALUE" ] && VALUE="\"$NAME\"" 30 [ -z "$VALUE" ] && VALUE="\"$NAME\""
31 [ "$VALUE" == '""' ] && continue 31 [ "$VALUE" = '""' ] && continue
32 32
33 # Name is uppercase, VALUE is all lowercase 33 # Name is uppercase, VALUE is all lowercase
34 VALUE="$(echo "$VALUE" | tr A-Z a-z)" 34 VALUE="$(echo "$VALUE" | tr A-Z a-z)"
35 35
36 if [ -n "$POSTFIX" ]; then 36 if [ -n "$POSTFIX" ]; then
37 T=$(( $PFX_SZ + $(echo $POSTFIX | wc -c) + 2 )) 37 T=$(( $PFX_SZ + $(echo $POSTFIX | wc -c) + 2 ))
38 TABS="$(printf '\t\t\t\t\t\t')" 38 TABS="$(printf '\t\t\t\t\t\t')"
39 TABCOUNT=$(( ( 6*8 - ($T + 1) - $(echo "$NAME" | wc -c) ) / 8 )) 39 TABCOUNT=$(( ( 6*8 - ($T + 1) - $(echo "$NAME" | wc -c) ) / 8 ))
40 printf "\t[%s - %s]%.*s = %s,\n" "$PFX$NAME" "$POSTFIX" "$TABCOUNT" "$TABS" "$VALUE" 40 printf "\t[%s - %s]%.*s = %s,\n" "$PFX$NAME" "$POSTFIX" "$TABCOUNT" "$TABS" "$VALUE"
41 else 41 else
42 TABCOUNT=$(( ( 5*8 - ($PFX_SZ + 1) - $(echo "$NAME" | wc -c) ) / 8 )) 42 TABCOUNT=$(( ( 5*8 - ($PFX_SZ + 1) - $(echo "$NAME" | wc -c) ) / 8 ))
43 printf "\t[%s]%.*s = %s,\n" "$PFX$NAME" "$TABCOUNT" "$TABS" "$VALUE" 43 printf "\t[%s]%.*s = %s,\n" "$PFX$NAME" "$TABCOUNT" "$TABS" "$VALUE"
44 fi 44 fi
45 done 45 done
46 echo "};" 46 echo "};"
47 } 47 }
48 48
49 trap 'rm "$OUT"' EXIT 49 trap 'rm "$OUT"' EXIT
50 50
51 ( 51 (
52 echo "#ifndef _ASM_X86_CPUFEATURE_H" 52 echo "#ifndef _ASM_X86_CPUFEATURE_H"
53 echo "#include <asm/cpufeature.h>" 53 echo "#include <asm/cpufeature.h>"
54 echo "#endif" 54 echo "#endif"
55 echo "" 55 echo ""
56 56
57 dump_array "x86_cap_flags" "NCAPINTS*32" "X86_FEATURE_" "" 57 dump_array "x86_cap_flags" "NCAPINTS*32" "X86_FEATURE_" ""
58 echo "" 58 echo ""
59 59
60 dump_array "x86_bug_flags" "NBUGINTS*32" "X86_BUG_" "NCAPINTS*32" 60 dump_array "x86_bug_flags" "NBUGINTS*32" "X86_BUG_" "NCAPINTS*32"
61 61
62 ) > $OUT 62 ) > $OUT
63 63
64 trap - EXIT 64 trap - EXIT
65 65
arch/x86/mm/init.c
1 #include <linux/gfp.h> 1 #include <linux/gfp.h>
2 #include <linux/initrd.h> 2 #include <linux/initrd.h>
3 #include <linux/ioport.h> 3 #include <linux/ioport.h>
4 #include <linux/swap.h> 4 #include <linux/swap.h>
5 #include <linux/memblock.h> 5 #include <linux/memblock.h>
6 #include <linux/bootmem.h> /* for max_low_pfn */ 6 #include <linux/bootmem.h> /* for max_low_pfn */
7 7
8 #include <asm/cacheflush.h> 8 #include <asm/cacheflush.h>
9 #include <asm/e820.h> 9 #include <asm/e820.h>
10 #include <asm/init.h> 10 #include <asm/init.h>
11 #include <asm/page.h> 11 #include <asm/page.h>
12 #include <asm/page_types.h> 12 #include <asm/page_types.h>
13 #include <asm/sections.h> 13 #include <asm/sections.h>
14 #include <asm/setup.h> 14 #include <asm/setup.h>
15 #include <asm/tlbflush.h> 15 #include <asm/tlbflush.h>
16 #include <asm/tlb.h> 16 #include <asm/tlb.h>
17 #include <asm/proto.h> 17 #include <asm/proto.h>
18 #include <asm/dma.h> /* for MAX_DMA_PFN */ 18 #include <asm/dma.h> /* for MAX_DMA_PFN */
19 #include <asm/microcode.h> 19 #include <asm/microcode.h>
20 20
21 /* 21 /*
22 * We need to define the tracepoints somewhere, and tlb.c 22 * We need to define the tracepoints somewhere, and tlb.c
23 * is only compiled when SMP=y. 23 * is only compiled when SMP=y.
24 */ 24 */
25 #define CREATE_TRACE_POINTS 25 #define CREATE_TRACE_POINTS
26 #include <trace/events/tlb.h> 26 #include <trace/events/tlb.h>
27 27
28 #include "mm_internal.h" 28 #include "mm_internal.h"
29 29
30 /* 30 /*
31 * Tables translating between page_cache_type_t and pte encoding. 31 * Tables translating between page_cache_type_t and pte encoding.
32 * Minimal supported modes are defined statically, modified if more supported 32 * Minimal supported modes are defined statically, modified if more supported
33 * cache modes are available. 33 * cache modes are available.
34 * Index into __cachemode2pte_tbl is the cachemode. 34 * Index into __cachemode2pte_tbl is the cachemode.
35 * Index into __pte2cachemode_tbl are the caching attribute bits of the pte 35 * Index into __pte2cachemode_tbl are the caching attribute bits of the pte
36 * (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT) at index bit positions 0, 1, 2. 36 * (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT) at index bit positions 0, 1, 2.
37 */ 37 */
38 uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = { 38 uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = {
39 [_PAGE_CACHE_MODE_WB] = 0, 39 [_PAGE_CACHE_MODE_WB] = 0,
40 [_PAGE_CACHE_MODE_WC] = _PAGE_PWT, 40 [_PAGE_CACHE_MODE_WC] = _PAGE_PWT,
41 [_PAGE_CACHE_MODE_UC_MINUS] = _PAGE_PCD, 41 [_PAGE_CACHE_MODE_UC_MINUS] = _PAGE_PCD,
42 [_PAGE_CACHE_MODE_UC] = _PAGE_PCD | _PAGE_PWT, 42 [_PAGE_CACHE_MODE_UC] = _PAGE_PCD | _PAGE_PWT,
43 [_PAGE_CACHE_MODE_WT] = _PAGE_PCD, 43 [_PAGE_CACHE_MODE_WT] = _PAGE_PCD,
44 [_PAGE_CACHE_MODE_WP] = _PAGE_PCD, 44 [_PAGE_CACHE_MODE_WP] = _PAGE_PCD,
45 }; 45 };
46 EXPORT_SYMBOL_GPL(__cachemode2pte_tbl); 46 EXPORT_SYMBOL_GPL(__cachemode2pte_tbl);
47 uint8_t __pte2cachemode_tbl[8] = { 47 uint8_t __pte2cachemode_tbl[8] = {
48 [__pte2cm_idx(0)] = _PAGE_CACHE_MODE_WB, 48 [__pte2cm_idx(0)] = _PAGE_CACHE_MODE_WB,
49 [__pte2cm_idx(_PAGE_PWT)] = _PAGE_CACHE_MODE_WC, 49 [__pte2cm_idx(_PAGE_PWT)] = _PAGE_CACHE_MODE_WC,
50 [__pte2cm_idx(_PAGE_PCD)] = _PAGE_CACHE_MODE_UC_MINUS, 50 [__pte2cm_idx(_PAGE_PCD)] = _PAGE_CACHE_MODE_UC_MINUS,
51 [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD)] = _PAGE_CACHE_MODE_UC, 51 [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD)] = _PAGE_CACHE_MODE_UC,
52 [__pte2cm_idx(_PAGE_PAT)] = _PAGE_CACHE_MODE_WB, 52 [__pte2cm_idx(_PAGE_PAT)] = _PAGE_CACHE_MODE_WB,
53 [__pte2cm_idx(_PAGE_PWT | _PAGE_PAT)] = _PAGE_CACHE_MODE_WC, 53 [__pte2cm_idx(_PAGE_PWT | _PAGE_PAT)] = _PAGE_CACHE_MODE_WC,
54 [__pte2cm_idx(_PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS, 54 [__pte2cm_idx(_PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS,
55 [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC, 55 [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC,
56 }; 56 };
57 EXPORT_SYMBOL_GPL(__pte2cachemode_tbl); 57 EXPORT_SYMBOL_GPL(__pte2cachemode_tbl);
58 58
59 static unsigned long __initdata pgt_buf_start; 59 static unsigned long __initdata pgt_buf_start;
60 static unsigned long __initdata pgt_buf_end; 60 static unsigned long __initdata pgt_buf_end;
61 static unsigned long __initdata pgt_buf_top; 61 static unsigned long __initdata pgt_buf_top;
62 62
63 static unsigned long min_pfn_mapped; 63 static unsigned long min_pfn_mapped;
64 64
65 static bool __initdata can_use_brk_pgt = true; 65 static bool __initdata can_use_brk_pgt = true;
66 66
67 /* 67 /*
68 * Pages returned are already directly mapped. 68 * Pages returned are already directly mapped.
69 * 69 *
70 * Changing that is likely to break Xen, see commit: 70 * Changing that is likely to break Xen, see commit:
71 * 71 *
72 * 279b706 x86,xen: introduce x86_init.mapping.pagetable_reserve 72 * 279b706 x86,xen: introduce x86_init.mapping.pagetable_reserve
73 * 73 *
74 * for detailed information. 74 * for detailed information.
75 */ 75 */
76 __ref void *alloc_low_pages(unsigned int num) 76 __ref void *alloc_low_pages(unsigned int num)
77 { 77 {
78 unsigned long pfn; 78 unsigned long pfn;
79 int i; 79 int i;
80 80
81 if (after_bootmem) { 81 if (after_bootmem) {
82 unsigned int order; 82 unsigned int order;
83 83
84 order = get_order((unsigned long)num << PAGE_SHIFT); 84 order = get_order((unsigned long)num << PAGE_SHIFT);
85 return (void *)__get_free_pages(GFP_ATOMIC | __GFP_NOTRACK | 85 return (void *)__get_free_pages(GFP_ATOMIC | __GFP_NOTRACK |
86 __GFP_ZERO, order); 86 __GFP_ZERO, order);
87 } 87 }
88 88
89 if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) { 89 if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) {
90 unsigned long ret; 90 unsigned long ret;
91 if (min_pfn_mapped >= max_pfn_mapped) 91 if (min_pfn_mapped >= max_pfn_mapped)
92 panic("alloc_low_pages: ran out of memory"); 92 panic("alloc_low_pages: ran out of memory");
93 ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT, 93 ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT,
94 max_pfn_mapped << PAGE_SHIFT, 94 max_pfn_mapped << PAGE_SHIFT,
95 PAGE_SIZE * num , PAGE_SIZE); 95 PAGE_SIZE * num , PAGE_SIZE);
96 if (!ret) 96 if (!ret)
97 panic("alloc_low_pages: can not alloc memory"); 97 panic("alloc_low_pages: can not alloc memory");
98 memblock_reserve(ret, PAGE_SIZE * num); 98 memblock_reserve(ret, PAGE_SIZE * num);
99 pfn = ret >> PAGE_SHIFT; 99 pfn = ret >> PAGE_SHIFT;
100 } else { 100 } else {
101 pfn = pgt_buf_end; 101 pfn = pgt_buf_end;
102 pgt_buf_end += num; 102 pgt_buf_end += num;
103 printk(KERN_DEBUG "BRK [%#010lx, %#010lx] PGTABLE\n", 103 printk(KERN_DEBUG "BRK [%#010lx, %#010lx] PGTABLE\n",
104 pfn << PAGE_SHIFT, (pgt_buf_end << PAGE_SHIFT) - 1); 104 pfn << PAGE_SHIFT, (pgt_buf_end << PAGE_SHIFT) - 1);
105 } 105 }
106 106
107 for (i = 0; i < num; i++) { 107 for (i = 0; i < num; i++) {
108 void *adr; 108 void *adr;
109 109
110 adr = __va((pfn + i) << PAGE_SHIFT); 110 adr = __va((pfn + i) << PAGE_SHIFT);
111 clear_page(adr); 111 clear_page(adr);
112 } 112 }
113 113
114 return __va(pfn << PAGE_SHIFT); 114 return __va(pfn << PAGE_SHIFT);
115 } 115 }
116 116
117 /* need 3 4k for initial PMD_SIZE, 3 4k for 0-ISA_END_ADDRESS */ 117 /* need 3 4k for initial PMD_SIZE, 3 4k for 0-ISA_END_ADDRESS */
118 #define INIT_PGT_BUF_SIZE (6 * PAGE_SIZE) 118 #define INIT_PGT_BUF_SIZE (6 * PAGE_SIZE)
119 RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE); 119 RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE);
120 void __init early_alloc_pgt_buf(void) 120 void __init early_alloc_pgt_buf(void)
121 { 121 {
122 unsigned long tables = INIT_PGT_BUF_SIZE; 122 unsigned long tables = INIT_PGT_BUF_SIZE;
123 phys_addr_t base; 123 phys_addr_t base;
124 124
125 base = __pa(extend_brk(tables, PAGE_SIZE)); 125 base = __pa(extend_brk(tables, PAGE_SIZE));
126 126
127 pgt_buf_start = base >> PAGE_SHIFT; 127 pgt_buf_start = base >> PAGE_SHIFT;
128 pgt_buf_end = pgt_buf_start; 128 pgt_buf_end = pgt_buf_start;
129 pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT); 129 pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT);
130 } 130 }
131 131
132 int after_bootmem; 132 int after_bootmem;
133 133
134 int direct_gbpages 134 int direct_gbpages
135 #ifdef CONFIG_DIRECT_GBPAGES 135 #ifdef CONFIG_DIRECT_GBPAGES
136 = 1 136 = 1
137 #endif 137 #endif
138 ; 138 ;
139 139
140 static void __init init_gbpages(void) 140 static void __init init_gbpages(void)
141 { 141 {
142 #ifdef CONFIG_X86_64 142 #ifdef CONFIG_X86_64
143 if (direct_gbpages && cpu_has_gbpages) 143 if (direct_gbpages && cpu_has_gbpages)
144 printk(KERN_INFO "Using GB pages for direct mapping\n"); 144 printk(KERN_INFO "Using GB pages for direct mapping\n");
145 else 145 else
146 direct_gbpages = 0; 146 direct_gbpages = 0;
147 #endif 147 #endif
148 } 148 }
149 149
150 struct map_range { 150 struct map_range {
151 unsigned long start; 151 unsigned long start;
152 unsigned long end; 152 unsigned long end;
153 unsigned page_size_mask; 153 unsigned page_size_mask;
154 }; 154 };
155 155
156 static int page_size_mask; 156 static int page_size_mask;
157 157
158 static void __init probe_page_size_mask(void) 158 static void __init probe_page_size_mask(void)
159 { 159 {
160 init_gbpages(); 160 init_gbpages();
161 161
162 #if !defined(CONFIG_DEBUG_PAGEALLOC) && !defined(CONFIG_KMEMCHECK) 162 #if !defined(CONFIG_DEBUG_PAGEALLOC) && !defined(CONFIG_KMEMCHECK)
163 /* 163 /*
164 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. 164 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
165 * This will simplify cpa(), which otherwise needs to support splitting 165 * This will simplify cpa(), which otherwise needs to support splitting
166 * large pages into small in interrupt context, etc. 166 * large pages into small in interrupt context, etc.
167 */ 167 */
168 if (direct_gbpages) 168 if (direct_gbpages)
169 page_size_mask |= 1 << PG_LEVEL_1G; 169 page_size_mask |= 1 << PG_LEVEL_1G;
170 if (cpu_has_pse) 170 if (cpu_has_pse)
171 page_size_mask |= 1 << PG_LEVEL_2M; 171 page_size_mask |= 1 << PG_LEVEL_2M;
172 #endif 172 #endif
173 173
174 /* Enable PSE if available */ 174 /* Enable PSE if available */
175 if (cpu_has_pse) 175 if (cpu_has_pse)
176 set_in_cr4(X86_CR4_PSE); 176 set_in_cr4(X86_CR4_PSE);
177 177
178 /* Enable PGE if available */ 178 /* Enable PGE if available */
179 if (cpu_has_pge) { 179 if (cpu_has_pge) {
180 set_in_cr4(X86_CR4_PGE); 180 set_in_cr4(X86_CR4_PGE);
181 __supported_pte_mask |= _PAGE_GLOBAL; 181 __supported_pte_mask |= _PAGE_GLOBAL;
182 } 182 }
183 } 183 }
184 184
185 #ifdef CONFIG_X86_32 185 #ifdef CONFIG_X86_32
186 #define NR_RANGE_MR 3 186 #define NR_RANGE_MR 3
187 #else /* CONFIG_X86_64 */ 187 #else /* CONFIG_X86_64 */
188 #define NR_RANGE_MR 5 188 #define NR_RANGE_MR 5
189 #endif 189 #endif
190 190
191 static int __meminit save_mr(struct map_range *mr, int nr_range, 191 static int __meminit save_mr(struct map_range *mr, int nr_range,
192 unsigned long start_pfn, unsigned long end_pfn, 192 unsigned long start_pfn, unsigned long end_pfn,
193 unsigned long page_size_mask) 193 unsigned long page_size_mask)
194 { 194 {
195 if (start_pfn < end_pfn) { 195 if (start_pfn < end_pfn) {
196 if (nr_range >= NR_RANGE_MR) 196 if (nr_range >= NR_RANGE_MR)
197 panic("run out of range for init_memory_mapping\n"); 197 panic("run out of range for init_memory_mapping\n");
198 mr[nr_range].start = start_pfn<<PAGE_SHIFT; 198 mr[nr_range].start = start_pfn<<PAGE_SHIFT;
199 mr[nr_range].end = end_pfn<<PAGE_SHIFT; 199 mr[nr_range].end = end_pfn<<PAGE_SHIFT;
200 mr[nr_range].page_size_mask = page_size_mask; 200 mr[nr_range].page_size_mask = page_size_mask;
201 nr_range++; 201 nr_range++;
202 } 202 }
203 203
204 return nr_range; 204 return nr_range;
205 } 205 }
206 206
207 /* 207 /*
208 * adjust the page_size_mask for a small range to use a 208 * adjust the page_size_mask for a small range to use a
209 * big page size instead of a small one if the nearby ranges are RAM too. 209 * big page size instead of a small one if the nearby ranges are RAM too.
210 */ 210 */
211 static void __init_refok adjust_range_page_size_mask(struct map_range *mr, 211 static void __init_refok adjust_range_page_size_mask(struct map_range *mr,
212 int nr_range) 212 int nr_range)
213 { 213 {
214 int i; 214 int i;
215 215
216 for (i = 0; i < nr_range; i++) { 216 for (i = 0; i < nr_range; i++) {
217 if ((page_size_mask & (1<<PG_LEVEL_2M)) && 217 if ((page_size_mask & (1<<PG_LEVEL_2M)) &&
218 !(mr[i].page_size_mask & (1<<PG_LEVEL_2M))) { 218 !(mr[i].page_size_mask & (1<<PG_LEVEL_2M))) {
219 unsigned long start = round_down(mr[i].start, PMD_SIZE); 219 unsigned long start = round_down(mr[i].start, PMD_SIZE);
220 unsigned long end = round_up(mr[i].end, PMD_SIZE); 220 unsigned long end = round_up(mr[i].end, PMD_SIZE);
221 221
222 #ifdef CONFIG_X86_32 222 #ifdef CONFIG_X86_32
223 if ((end >> PAGE_SHIFT) > max_low_pfn) 223 if ((end >> PAGE_SHIFT) > max_low_pfn)
224 continue; 224 continue;
225 #endif 225 #endif
226 226
227 if (memblock_is_region_memory(start, end - start)) 227 if (memblock_is_region_memory(start, end - start))
228 mr[i].page_size_mask |= 1<<PG_LEVEL_2M; 228 mr[i].page_size_mask |= 1<<PG_LEVEL_2M;
229 } 229 }
230 if ((page_size_mask & (1<<PG_LEVEL_1G)) && 230 if ((page_size_mask & (1<<PG_LEVEL_1G)) &&
231 !(mr[i].page_size_mask & (1<<PG_LEVEL_1G))) { 231 !(mr[i].page_size_mask & (1<<PG_LEVEL_1G))) {
232 unsigned long start = round_down(mr[i].start, PUD_SIZE); 232 unsigned long start = round_down(mr[i].start, PUD_SIZE);
233 unsigned long end = round_up(mr[i].end, PUD_SIZE); 233 unsigned long end = round_up(mr[i].end, PUD_SIZE);
234 234
235 if (memblock_is_region_memory(start, end - start)) 235 if (memblock_is_region_memory(start, end - start))
236 mr[i].page_size_mask |= 1<<PG_LEVEL_1G; 236 mr[i].page_size_mask |= 1<<PG_LEVEL_1G;
237 } 237 }
238 } 238 }
239 } 239 }
240 240
241 static int __meminit split_mem_range(struct map_range *mr, int nr_range, 241 static int __meminit split_mem_range(struct map_range *mr, int nr_range,
242 unsigned long start, 242 unsigned long start,
243 unsigned long end) 243 unsigned long end)
244 { 244 {
245 unsigned long start_pfn, end_pfn, limit_pfn; 245 unsigned long start_pfn, end_pfn, limit_pfn;
246 unsigned long pfn; 246 unsigned long pfn;
247 int i; 247 int i;
248 248
249 limit_pfn = PFN_DOWN(end); 249 limit_pfn = PFN_DOWN(end);
250 250
251 /* head if not big page alignment ? */ 251 /* head if not big page alignment ? */
252 pfn = start_pfn = PFN_DOWN(start); 252 pfn = start_pfn = PFN_DOWN(start);
253 #ifdef CONFIG_X86_32 253 #ifdef CONFIG_X86_32
254 /* 254 /*
255 * Don't use a large page for the first 2/4MB of memory 255 * Don't use a large page for the first 2/4MB of memory
256 * because there are often fixed size MTRRs in there 256 * because there are often fixed size MTRRs in there
257 * and overlapping MTRRs into large pages can cause 257 * and overlapping MTRRs into large pages can cause
258 * slowdowns. 258 * slowdowns.
259 */ 259 */
260 if (pfn == 0) 260 if (pfn == 0)
261 end_pfn = PFN_DOWN(PMD_SIZE); 261 end_pfn = PFN_DOWN(PMD_SIZE);
262 else 262 else
263 end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE)); 263 end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
264 #else /* CONFIG_X86_64 */ 264 #else /* CONFIG_X86_64 */
265 end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE)); 265 end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
266 #endif 266 #endif
267 if (end_pfn > limit_pfn) 267 if (end_pfn > limit_pfn)
268 end_pfn = limit_pfn; 268 end_pfn = limit_pfn;
269 if (start_pfn < end_pfn) { 269 if (start_pfn < end_pfn) {
270 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); 270 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
271 pfn = end_pfn; 271 pfn = end_pfn;
272 } 272 }
273 273
274 /* big page (2M) range */ 274 /* big page (2M) range */
275 start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE)); 275 start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
276 #ifdef CONFIG_X86_32 276 #ifdef CONFIG_X86_32
277 end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE)); 277 end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
278 #else /* CONFIG_X86_64 */ 278 #else /* CONFIG_X86_64 */
279 end_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE)); 279 end_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE));
280 if (end_pfn > round_down(limit_pfn, PFN_DOWN(PMD_SIZE))) 280 if (end_pfn > round_down(limit_pfn, PFN_DOWN(PMD_SIZE)))
281 end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE)); 281 end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
282 #endif 282 #endif
283 283
284 if (start_pfn < end_pfn) { 284 if (start_pfn < end_pfn) {
285 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 285 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
286 page_size_mask & (1<<PG_LEVEL_2M)); 286 page_size_mask & (1<<PG_LEVEL_2M));
287 pfn = end_pfn; 287 pfn = end_pfn;
288 } 288 }
289 289
290 #ifdef CONFIG_X86_64 290 #ifdef CONFIG_X86_64
291 /* big page (1G) range */ 291 /* big page (1G) range */
292 start_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE)); 292 start_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE));
293 end_pfn = round_down(limit_pfn, PFN_DOWN(PUD_SIZE)); 293 end_pfn = round_down(limit_pfn, PFN_DOWN(PUD_SIZE));
294 if (start_pfn < end_pfn) { 294 if (start_pfn < end_pfn) {
295 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 295 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
296 page_size_mask & 296 page_size_mask &
297 ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G))); 297 ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
298 pfn = end_pfn; 298 pfn = end_pfn;
299 } 299 }
300 300
301 /* tail is not big page (1G) alignment */ 301 /* tail is not big page (1G) alignment */
302 start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE)); 302 start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
303 end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE)); 303 end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
304 if (start_pfn < end_pfn) { 304 if (start_pfn < end_pfn) {
305 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 305 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
306 page_size_mask & (1<<PG_LEVEL_2M)); 306 page_size_mask & (1<<PG_LEVEL_2M));
307 pfn = end_pfn; 307 pfn = end_pfn;
308 } 308 }
309 #endif 309 #endif
310 310
311 /* tail is not big page (2M) alignment */ 311 /* tail is not big page (2M) alignment */
312 start_pfn = pfn; 312 start_pfn = pfn;
313 end_pfn = limit_pfn; 313 end_pfn = limit_pfn;
314 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); 314 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
315 315
316 if (!after_bootmem) 316 if (!after_bootmem)
317 adjust_range_page_size_mask(mr, nr_range); 317 adjust_range_page_size_mask(mr, nr_range);
318 318
319 /* try to merge same page size and continuous */ 319 /* try to merge same page size and continuous */
320 for (i = 0; nr_range > 1 && i < nr_range - 1; i++) { 320 for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
321 unsigned long old_start; 321 unsigned long old_start;
322 if (mr[i].end != mr[i+1].start || 322 if (mr[i].end != mr[i+1].start ||
323 mr[i].page_size_mask != mr[i+1].page_size_mask) 323 mr[i].page_size_mask != mr[i+1].page_size_mask)
324 continue; 324 continue;
325 /* move it */ 325 /* move it */
326 old_start = mr[i].start; 326 old_start = mr[i].start;
327 memmove(&mr[i], &mr[i+1], 327 memmove(&mr[i], &mr[i+1],
328 (nr_range - 1 - i) * sizeof(struct map_range)); 328 (nr_range - 1 - i) * sizeof(struct map_range));
329 mr[i--].start = old_start; 329 mr[i--].start = old_start;
330 nr_range--; 330 nr_range--;
331 } 331 }
332 332
333 for (i = 0; i < nr_range; i++) 333 for (i = 0; i < nr_range; i++)
334 printk(KERN_DEBUG " [mem %#010lx-%#010lx] page %s\n", 334 printk(KERN_DEBUG " [mem %#010lx-%#010lx] page %s\n",
335 mr[i].start, mr[i].end - 1, 335 mr[i].start, mr[i].end - 1,
336 (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":( 336 (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
337 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k")); 337 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
338 338
339 return nr_range; 339 return nr_range;
340 } 340 }
341 341
342 struct range pfn_mapped[E820_X_MAX]; 342 struct range pfn_mapped[E820_X_MAX];
343 int nr_pfn_mapped; 343 int nr_pfn_mapped;
344 344
345 static void add_pfn_range_mapped(unsigned long start_pfn, unsigned long end_pfn) 345 static void add_pfn_range_mapped(unsigned long start_pfn, unsigned long end_pfn)
346 { 346 {
347 nr_pfn_mapped = add_range_with_merge(pfn_mapped, E820_X_MAX, 347 nr_pfn_mapped = add_range_with_merge(pfn_mapped, E820_X_MAX,
348 nr_pfn_mapped, start_pfn, end_pfn); 348 nr_pfn_mapped, start_pfn, end_pfn);
349 nr_pfn_mapped = clean_sort_range(pfn_mapped, E820_X_MAX); 349 nr_pfn_mapped = clean_sort_range(pfn_mapped, E820_X_MAX);
350 350
351 max_pfn_mapped = max(max_pfn_mapped, end_pfn); 351 max_pfn_mapped = max(max_pfn_mapped, end_pfn);
352 352
353 if (start_pfn < (1UL<<(32-PAGE_SHIFT))) 353 if (start_pfn < (1UL<<(32-PAGE_SHIFT)))
354 max_low_pfn_mapped = max(max_low_pfn_mapped, 354 max_low_pfn_mapped = max(max_low_pfn_mapped,
355 min(end_pfn, 1UL<<(32-PAGE_SHIFT))); 355 min(end_pfn, 1UL<<(32-PAGE_SHIFT)));
356 } 356 }
357 357
358 bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn) 358 bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn)
359 { 359 {
360 int i; 360 int i;
361 361
362 for (i = 0; i < nr_pfn_mapped; i++) 362 for (i = 0; i < nr_pfn_mapped; i++)
363 if ((start_pfn >= pfn_mapped[i].start) && 363 if ((start_pfn >= pfn_mapped[i].start) &&
364 (end_pfn <= pfn_mapped[i].end)) 364 (end_pfn <= pfn_mapped[i].end))
365 return true; 365 return true;
366 366
367 return false; 367 return false;
368 } 368 }
369 369
370 /* 370 /*
371 * Setup the direct mapping of the physical memory at PAGE_OFFSET. 371 * Setup the direct mapping of the physical memory at PAGE_OFFSET.
372 * This runs before bootmem is initialized and gets pages directly from 372 * This runs before bootmem is initialized and gets pages directly from
373 * the physical memory. To access them they are temporarily mapped. 373 * the physical memory. To access them they are temporarily mapped.
374 */ 374 */
375 unsigned long __init_refok init_memory_mapping(unsigned long start, 375 unsigned long __init_refok init_memory_mapping(unsigned long start,
376 unsigned long end) 376 unsigned long end)
377 { 377 {
378 struct map_range mr[NR_RANGE_MR]; 378 struct map_range mr[NR_RANGE_MR];
379 unsigned long ret = 0; 379 unsigned long ret = 0;
380 int nr_range, i; 380 int nr_range, i;
381 381
382 pr_info("init_memory_mapping: [mem %#010lx-%#010lx]\n", 382 pr_info("init_memory_mapping: [mem %#010lx-%#010lx]\n",
383 start, end - 1); 383 start, end - 1);
384 384
385 memset(mr, 0, sizeof(mr)); 385 memset(mr, 0, sizeof(mr));
386 nr_range = split_mem_range(mr, 0, start, end); 386 nr_range = split_mem_range(mr, 0, start, end);
387 387
388 for (i = 0; i < nr_range; i++) 388 for (i = 0; i < nr_range; i++)
389 ret = kernel_physical_mapping_init(mr[i].start, mr[i].end, 389 ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
390 mr[i].page_size_mask); 390 mr[i].page_size_mask);
391 391
392 add_pfn_range_mapped(start >> PAGE_SHIFT, ret >> PAGE_SHIFT); 392 add_pfn_range_mapped(start >> PAGE_SHIFT, ret >> PAGE_SHIFT);
393 393
394 return ret >> PAGE_SHIFT; 394 return ret >> PAGE_SHIFT;
395 } 395 }
396 396
397 /* 397 /*
398 * We need to iterate through the E820 memory map and create direct mappings 398 * We need to iterate through the E820 memory map and create direct mappings
399 * for only E820_RAM and E820_KERN_RESERVED regions. We cannot simply 399 * for only E820_RAM and E820_KERN_RESERVED regions. We cannot simply
400 * create direct mappings for all pfns from [0 to max_low_pfn) and 400 * create direct mappings for all pfns from [0 to max_low_pfn) and
401 * [4GB to max_pfn) because of possible memory holes in high addresses 401 * [4GB to max_pfn) because of possible memory holes in high addresses
402 * that cannot be marked as UC by fixed/variable range MTRRs. 402 * that cannot be marked as UC by fixed/variable range MTRRs.
403 * Depending on the alignment of E820 ranges, this may possibly result 403 * Depending on the alignment of E820 ranges, this may possibly result
404 * in using smaller size (i.e. 4K instead of 2M or 1G) page tables. 404 * in using smaller size (i.e. 4K instead of 2M or 1G) page tables.
405 * 405 *
406 * init_mem_mapping() calls init_range_memory_mapping() with big range. 406 * init_mem_mapping() calls init_range_memory_mapping() with big range.
407 * That range would have hole in the middle or ends, and only ram parts 407 * That range would have hole in the middle or ends, and only ram parts
408 * will be mapped in init_range_memory_mapping(). 408 * will be mapped in init_range_memory_mapping().
409 */ 409 */
410 static unsigned long __init init_range_memory_mapping( 410 static unsigned long __init init_range_memory_mapping(
411 unsigned long r_start, 411 unsigned long r_start,
412 unsigned long r_end) 412 unsigned long r_end)
413 { 413 {
414 unsigned long start_pfn, end_pfn; 414 unsigned long start_pfn, end_pfn;
415 unsigned long mapped_ram_size = 0; 415 unsigned long mapped_ram_size = 0;
416 int i; 416 int i;
417 417
418 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { 418 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
419 u64 start = clamp_val(PFN_PHYS(start_pfn), r_start, r_end); 419 u64 start = clamp_val(PFN_PHYS(start_pfn), r_start, r_end);
420 u64 end = clamp_val(PFN_PHYS(end_pfn), r_start, r_end); 420 u64 end = clamp_val(PFN_PHYS(end_pfn), r_start, r_end);
421 if (start >= end) 421 if (start >= end)
422 continue; 422 continue;
423 423
424 /* 424 /*
425 * if it is overlapping with brk pgt, we need to 425 * if it is overlapping with brk pgt, we need to
426 * alloc pgt buf from memblock instead. 426 * alloc pgt buf from memblock instead.
427 */ 427 */
428 can_use_brk_pgt = max(start, (u64)pgt_buf_end<<PAGE_SHIFT) >= 428 can_use_brk_pgt = max(start, (u64)pgt_buf_end<<PAGE_SHIFT) >=
429 min(end, (u64)pgt_buf_top<<PAGE_SHIFT); 429 min(end, (u64)pgt_buf_top<<PAGE_SHIFT);
430 init_memory_mapping(start, end); 430 init_memory_mapping(start, end);
431 mapped_ram_size += end - start; 431 mapped_ram_size += end - start;
432 can_use_brk_pgt = true; 432 can_use_brk_pgt = true;
433 } 433 }
434 434
435 return mapped_ram_size; 435 return mapped_ram_size;
436 } 436 }
437 437
438 static unsigned long __init get_new_step_size(unsigned long step_size) 438 static unsigned long __init get_new_step_size(unsigned long step_size)
439 { 439 {
440 /* 440 /*
441 * Explain why we shift by 5 and why we don't have to worry about 441 * Initial mapped size is PMD_SIZE (2M).
442 * 'step_size << 5' overflowing:
443 *
444 * initial mapped size is PMD_SIZE (2M).
445 * We can not set step_size to be PUD_SIZE (1G) yet. 442 * We can not set step_size to be PUD_SIZE (1G) yet.
446 * In the worst case, when we cross the 1G boundary, and 443 * In the worst case, when we cross the 1G boundary, and
447 * PG_LEVEL_2M is not set, we will need 1+1+512 pages (2M + 8k) 444 * PG_LEVEL_2M is not set, we will need 1+1+512 pages (2M + 8k)
448 * to map 1G range with PTE. Use 5 as shift for now. 445 * to map 1G range with PTE. Hence we use one less than the
446 * difference of page table level shifts.
449 * 447 *
450 * Don't need to worry about overflow, on 32bit, when step_size 448 * Don't need to worry about overflow in the top-down case, on 32bit,
451 * is 0, round_down() returns 0 for start, and that turns it 449 * when step_size is 0, round_down() returns 0 for start, and that
452 * into 0x100000000ULL. 450 * turns it into 0x100000000ULL.
451 * In the bottom-up case, round_up(x, 0) returns 0 though too, which
452 * needs to be taken into consideration by the code below.
453 */ 453 */
454 return step_size << 5; 454 return step_size << (PMD_SHIFT - PAGE_SHIFT - 1);
455 } 455 }
456 456
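As an aside on the step-size change above: with the usual x86_64 values (PMD_SHIFT = 21, PAGE_SHIFT = 12) the new shift grows step_size by a factor of 256 per round instead of the old fixed factor of 32, since each 4k page of PTEs maps 2M and one level of headroom is kept for the extra PMD/PUD pages the comment mentions. A small self-contained sketch of that arithmetic, using those shift values as an assumption:

    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define PMD_SHIFT  21                 /* x86_64: 2M large pages */
    #define PMD_SIZE   (1UL << PMD_SHIFT)

    int main(void)
    {
        unsigned long step = PMD_SIZE;                 /* initial step: 2M */
        unsigned long old_factor = 1UL << 5;           /* previous code: x32 */
        unsigned long new_factor = 1UL << (PMD_SHIFT - PAGE_SHIFT - 1); /* x256 */

        /* One 4k page of PTEs maps 512 * 4k = 2M, so N bytes of page-table
         * space can map roughly N * 512 bytes with 4k pages; growing by the
         * full 512 would leave no room for the extra PMD/PUD pages, hence
         * the "one less" in the shift. */
        printf("old growth factor: %lu, new growth factor: %lu\n",
               old_factor, new_factor);
        printf("after one round the step grows from %luM to %luM\n",
               step >> 20, (step * new_factor) >> 20);
        return 0;
    }
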
457 /** 457 /**
458 * memory_map_top_down - Map [map_start, map_end) top down 458 * memory_map_top_down - Map [map_start, map_end) top down
459 * @map_start: start address of the target memory range 459 * @map_start: start address of the target memory range
460 * @map_end: end address of the target memory range 460 * @map_end: end address of the target memory range
461 * 461 *
462 * This function will setup direct mapping for memory range 462 * This function will setup direct mapping for memory range
463 * [map_start, map_end) in top-down. That said, the page tables 463 * [map_start, map_end) in top-down. That said, the page tables
464 * will be allocated at the end of the memory, and we map the 464 * will be allocated at the end of the memory, and we map the
465 * memory in top-down. 465 * memory in top-down.
466 */ 466 */
467 static void __init memory_map_top_down(unsigned long map_start, 467 static void __init memory_map_top_down(unsigned long map_start,
468 unsigned long map_end) 468 unsigned long map_end)
469 { 469 {
470 unsigned long real_end, start, last_start; 470 unsigned long real_end, start, last_start;
471 unsigned long step_size; 471 unsigned long step_size;
472 unsigned long addr; 472 unsigned long addr;
473 unsigned long mapped_ram_size = 0; 473 unsigned long mapped_ram_size = 0;
474 unsigned long new_mapped_ram_size;
475 474
476 /* xen has big range in reserved near end of ram, skip it at first.*/ 475 /* xen has big range in reserved near end of ram, skip it at first.*/
477 addr = memblock_find_in_range(map_start, map_end, PMD_SIZE, PMD_SIZE); 476 addr = memblock_find_in_range(map_start, map_end, PMD_SIZE, PMD_SIZE);
478 real_end = addr + PMD_SIZE; 477 real_end = addr + PMD_SIZE;
479 478
480 /* step_size need to be small so pgt_buf from BRK could cover it */ 479 /* step_size need to be small so pgt_buf from BRK could cover it */
481 step_size = PMD_SIZE; 480 step_size = PMD_SIZE;
482 max_pfn_mapped = 0; /* will get exact value next */ 481 max_pfn_mapped = 0; /* will get exact value next */
483 min_pfn_mapped = real_end >> PAGE_SHIFT; 482 min_pfn_mapped = real_end >> PAGE_SHIFT;
484 last_start = start = real_end; 483 last_start = start = real_end;
485 484
486 /* 485 /*
487 * We start from the top (end of memory) and go to the bottom. 486 * We start from the top (end of memory) and go to the bottom.
488 * The memblock_find_in_range() gets us a block of RAM from the 487 * The memblock_find_in_range() gets us a block of RAM from the
489 * end of RAM in [min_pfn_mapped, max_pfn_mapped) used as new pages 488 * end of RAM in [min_pfn_mapped, max_pfn_mapped) used as new pages
490 * for page table. 489 * for page table.
491 */ 490 */
492 while (last_start > map_start) { 491 while (last_start > map_start) {
493 if (last_start > step_size) { 492 if (last_start > step_size) {
494 start = round_down(last_start - 1, step_size); 493 start = round_down(last_start - 1, step_size);
495 if (start < map_start) 494 if (start < map_start)
496 start = map_start; 495 start = map_start;
497 } else 496 } else
498 start = map_start; 497 start = map_start;
499 new_mapped_ram_size = init_range_memory_mapping(start, 498 mapped_ram_size += init_range_memory_mapping(start,
500 last_start); 499 last_start);
501 last_start = start; 500 last_start = start;
502 min_pfn_mapped = last_start >> PAGE_SHIFT; 501 min_pfn_mapped = last_start >> PAGE_SHIFT;
503 /* only increase step_size after a big range gets mapped */ 502 if (mapped_ram_size >= step_size)
504 if (new_mapped_ram_size > mapped_ram_size)
505 step_size = get_new_step_size(step_size); 503 step_size = get_new_step_size(step_size);
506 mapped_ram_size += new_mapped_ram_size;
507 } 504 }
508 505
509 if (real_end < map_end) 506 if (real_end < map_end)
510 init_range_memory_mapping(real_end, map_end); 507 init_range_memory_mapping(real_end, map_end);
511 } 508 }
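To see how the rewritten loop walks the address space, here is a small user-space simulation of the chunking only: no actual mapping, no min_pfn_mapped or Xen handling, and the memory layout values are made up. round_down() is redefined locally since the kernel macro is not available; build on a 64-bit host:

#include <stdio.h>

#define PAGE_SHIFT	12
#define PMD_SHIFT	21
#define PMD_SIZE	(1UL << PMD_SHIFT)
#define round_down(x, y)	((x) & ~((y) - 1))	/* power-of-two y only */

static unsigned long get_new_step_size(unsigned long step_size)
{
	return step_size << (PMD_SHIFT - PAGE_SHIFT - 1);
}

int main(void)
{
	unsigned long map_start = 1UL << 20;	/* ISA_END_ADDRESS */
	unsigned long real_end = 4UL << 30;	/* pretend usable RAM ends at 4G */
	unsigned long last_start = real_end, start;
	unsigned long step_size = PMD_SIZE, mapped = 0;

	while (last_start > map_start) {
		if (last_start > step_size) {
			start = round_down(last_start - 1, step_size);
			if (start < map_start)
				start = map_start;
		} else
			start = map_start;

		printf("map [%#14lx, %#14lx)\n", start, last_start);
		mapped += last_start - start;
		last_start = start;
		/* same growth condition as the new kernel code */
		if (mapped >= step_size)
			step_size = get_new_step_size(step_size);
	}
	return 0;
}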
512 509
513 /** 510 /**
514 * memory_map_bottom_up - Map [map_start, map_end) bottom up 511 * memory_map_bottom_up - Map [map_start, map_end) bottom up
515 * @map_start: start address of the target memory range 512 * @map_start: start address of the target memory range
516 * @map_end: end address of the target memory range 513 * @map_end: end address of the target memory range
517 * 514 *
518 * This function will set up the direct mapping for the memory range 515 * This function will set up the direct mapping for the memory range
519 * [map_start, map_end) bottom-up. Since we have limited the 516 * [map_start, map_end) bottom-up. Since we have limited the
520 * bottom-up allocation to the area above the kernel, the page tables 517 * bottom-up allocation to the area above the kernel, the page tables
521 * will be allocated just above the kernel and we map the memory 518 * will be allocated just above the kernel and we map the memory
522 * in [map_start, map_end) bottom-up. 519 * in [map_start, map_end) bottom-up.
523 */ 520 */
524 static void __init memory_map_bottom_up(unsigned long map_start, 521 static void __init memory_map_bottom_up(unsigned long map_start,
525 unsigned long map_end) 522 unsigned long map_end)
526 { 523 {
527 unsigned long next, new_mapped_ram_size, start; 524 unsigned long next, start;
528 unsigned long mapped_ram_size = 0; 525 unsigned long mapped_ram_size = 0;
529 /* step_size needs to be small so the pgt_buf from BRK can cover it */ 526 /* step_size needs to be small so the pgt_buf from BRK can cover it */
530 unsigned long step_size = PMD_SIZE; 527 unsigned long step_size = PMD_SIZE;
531 528
532 start = map_start; 529 start = map_start;
533 min_pfn_mapped = start >> PAGE_SHIFT; 530 min_pfn_mapped = start >> PAGE_SHIFT;
534 531
535 /* 532 /*
536 * We start from the bottom (@map_start) and go to the top (@map_end). 533 * We start from the bottom (@map_start) and go to the top (@map_end).
537 * memblock_find_in_range() gets us a block of RAM from the 534 * memblock_find_in_range() gets us a block of RAM from the
538 * end of RAM in [min_pfn_mapped, max_pfn_mapped) to use as new pages 535 * end of RAM in [min_pfn_mapped, max_pfn_mapped) to use as new pages
539 * for the page tables. 536 * for the page tables.
540 */ 537 */
541 while (start < map_end) { 538 while (start < map_end) {
542 if (map_end - start > step_size) { 539 if (step_size && map_end - start > step_size) {
543 next = round_up(start + 1, step_size); 540 next = round_up(start + 1, step_size);
544 if (next > map_end) 541 if (next > map_end)
545 next = map_end; 542 next = map_end;
546 } else 543 } else {
547 next = map_end; 544 next = map_end;
545 }
548 546
549 new_mapped_ram_size = init_range_memory_mapping(start, next); 547 mapped_ram_size += init_range_memory_mapping(start, next);
550 start = next; 548 start = next;
551 549
552 if (new_mapped_ram_size > mapped_ram_size) 550 if (mapped_ram_size >= step_size)
553 step_size = get_new_step_size(step_size); 551 step_size = get_new_step_size(step_size);
554 mapped_ram_size += new_mapped_ram_size;
555 } 552 }
556 } 553 }
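The new step_size && guard in the loop above matters because, as the updated comment notes, step_size can overflow to 0 on 32-bit and round_up(x, 0) then yields 0. A small sketch of that failure mode, assuming the usual power-of-two round_up() macro shape and a 32-bit unsigned int standing in for a 32-bit unsigned long (on non-PAE 32-bit the growth shift is 9 rather than 8, but the wraparound happens either way):

#include <stdio.h>

/* same shape as the kernel's power-of-two round_up() */
#define round_up(x, y)	((((x) - 1) | ((y) - 1)) + 1)

int main(void)
{
	unsigned int step = 1u << 21;	/* PMD_SIZE on a 32-bit PAE build */
	unsigned int start = 0x12345000u;
	int i;

	for (i = 0; i < 4; i++) {
		printf("step_size %#10x  round_up(start + 1, step_size) = %#10x\n",
		       step, round_up(start + 1, step));
		step <<= 8;	/* grows by PMD_SHIFT - PAGE_SHIFT - 1 each iteration */
	}
	return 0;
}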
557 554
558 void __init init_mem_mapping(void) 555 void __init init_mem_mapping(void)
559 { 556 {
560 unsigned long end; 557 unsigned long end;
561 558
562 probe_page_size_mask(); 559 probe_page_size_mask();
563 560
564 #ifdef CONFIG_X86_64 561 #ifdef CONFIG_X86_64
565 end = max_pfn << PAGE_SHIFT; 562 end = max_pfn << PAGE_SHIFT;
566 #else 563 #else
567 end = max_low_pfn << PAGE_SHIFT; 564 end = max_low_pfn << PAGE_SHIFT;
568 #endif 565 #endif
569 566
570 /* the ISA range is always mapped regardless of memory holes */ 567 /* the ISA range is always mapped regardless of memory holes */
571 init_memory_mapping(0, ISA_END_ADDRESS); 568 init_memory_mapping(0, ISA_END_ADDRESS);
572 569
573 /* 570 /*
574 * If memblock is allocating bottom-up, we set up the direct mapping 571 * If memblock is allocating bottom-up, we set up the direct mapping
575 * bottom-up as well; otherwise we set it up top-down. 572 * bottom-up as well; otherwise we set it up top-down.
576 */ 573 */
577 if (memblock_bottom_up()) { 574 if (memblock_bottom_up()) {
578 unsigned long kernel_end = __pa_symbol(_end); 575 unsigned long kernel_end = __pa_symbol(_end);
579 576
580 /* 577 /*
581 * We need two separate calls here because we want to allocate 578 * We need two separate calls here because we want to allocate
582 * the page tables above the kernel. So we first map 579 * the page tables above the kernel. So we first map
583 * [kernel_end, end) so that memory above the kernel is mapped 580 * [kernel_end, end) so that memory above the kernel is mapped
584 * as soon as possible, and then use page tables allocated above 581 * as soon as possible, and then use page tables allocated above
585 * the kernel to map [ISA_END_ADDRESS, kernel_end). 582 * the kernel to map [ISA_END_ADDRESS, kernel_end).
586 */ 583 */
587 memory_map_bottom_up(kernel_end, end); 584 memory_map_bottom_up(kernel_end, end);
588 memory_map_bottom_up(ISA_END_ADDRESS, kernel_end); 585 memory_map_bottom_up(ISA_END_ADDRESS, kernel_end);
589 } else { 586 } else {
590 memory_map_top_down(ISA_END_ADDRESS, end); 587 memory_map_top_down(ISA_END_ADDRESS, end);
591 } 588 }
592 589
593 #ifdef CONFIG_X86_64 590 #ifdef CONFIG_X86_64
594 if (max_pfn > max_low_pfn) { 591 if (max_pfn > max_low_pfn) {
595 /* can we preserve max_low_pfn? */ 592 /* can we preserve max_low_pfn? */
596 max_low_pfn = max_pfn; 593 max_low_pfn = max_pfn;
597 } 594 }
598 #else 595 #else
599 early_ioremap_page_table_range_init(); 596 early_ioremap_page_table_range_init();
600 #endif 597 #endif
601 598
602 load_cr3(swapper_pg_dir); 599 load_cr3(swapper_pg_dir);
603 __flush_tlb_all(); 600 __flush_tlb_all();
604 601
605 early_memtest(0, max_pfn_mapped << PAGE_SHIFT); 602 early_memtest(0, max_pfn_mapped << PAGE_SHIFT);
606 } 603 }
607 604
608 /* 605 /*
609 * devmem_is_allowed() checks to see if /dev/mem access to a certain address 606 * devmem_is_allowed() checks to see if /dev/mem access to a certain address
610 * is valid. The argument is a physical page number. 607 * is valid. The argument is a physical page number.
611 * 608 *
612 * 609 *
613 * On x86, access has to be given to the first megabyte of RAM because that area 610 * On x86, access has to be given to the first megabyte of RAM because that area
614 * contains BIOS code and data regions used by X and dosemu and similar apps. 611 * contains BIOS code and data regions used by X and dosemu and similar apps.
615 * Access has to be given to non-kernel-RAM areas as well; these contain the PCI 612 * Access has to be given to non-kernel-RAM areas as well; these contain the PCI
616 * MMIO resources as well as potential BIOS/ACPI data regions. 613 * MMIO resources as well as potential BIOS/ACPI data regions.
617 */ 614 */
618 int devmem_is_allowed(unsigned long pagenr) 615 int devmem_is_allowed(unsigned long pagenr)
619 { 616 {
620 if (pagenr < 256) 617 if (pagenr < 256)
621 return 1; 618 return 1;
622 if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) 619 if (iomem_is_exclusive(pagenr << PAGE_SHIFT))
623 return 0; 620 return 0;
624 if (!page_is_ram(pagenr)) 621 if (!page_is_ram(pagenr))
625 return 1; 622 return 1;
626 return 0; 623 return 0;
627 } 624 }
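As a quick illustration of the pagenr < 256 test above: with 4K pages, page frame 256 is exactly the 1 MB boundary, so anything below it is always allowed. The addresses below are arbitrary sample values, not taken from the source:

#include <stdio.h>

#define PAGE_SHIFT	12

int main(void)
{
	unsigned long addrs[] = { 0x0, 0x9f000, 0xfffff, 0x100000, 0xc0000000 };
	unsigned int i;

	for (i = 0; i < sizeof(addrs) / sizeof(addrs[0]); i++) {
		unsigned long pagenr = addrs[i] >> PAGE_SHIFT;
		printf("phys %#10lx -> pfn %#6lx -> %s\n", addrs[i], pagenr,
		       pagenr < 256 ? "always allowed (first 1 MB)"
				    : "depends on iomem/RAM checks");
	}
	return 0;
}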
628 625
629 void free_init_pages(char *what, unsigned long begin, unsigned long end) 626 void free_init_pages(char *what, unsigned long begin, unsigned long end)
630 { 627 {
631 unsigned long begin_aligned, end_aligned; 628 unsigned long begin_aligned, end_aligned;
632 629
633 /* Make sure boundaries are page aligned */ 630 /* Make sure boundaries are page aligned */
634 begin_aligned = PAGE_ALIGN(begin); 631 begin_aligned = PAGE_ALIGN(begin);
635 end_aligned = end & PAGE_MASK; 632 end_aligned = end & PAGE_MASK;
636 633
637 if (WARN_ON(begin_aligned != begin || end_aligned != end)) { 634 if (WARN_ON(begin_aligned != begin || end_aligned != end)) {
638 begin = begin_aligned; 635 begin = begin_aligned;
639 end = end_aligned; 636 end = end_aligned;
640 } 637 }
641 638
642 if (begin >= end) 639 if (begin >= end)
643 return; 640 return;
644 641
645 /* 642 /*
646 * If debugging page accesses then do not free this memory but 643 * If debugging page accesses then do not free this memory but
647 * mark them not present - any buggy init-section access will 644 * mark them not present - any buggy init-section access will
648 * create a kernel page fault: 645 * create a kernel page fault:
649 */ 646 */
650 #ifdef CONFIG_DEBUG_PAGEALLOC 647 #ifdef CONFIG_DEBUG_PAGEALLOC
651 printk(KERN_INFO "debug: unmapping init [mem %#010lx-%#010lx]\n", 648 printk(KERN_INFO "debug: unmapping init [mem %#010lx-%#010lx]\n",
652 begin, end - 1); 649 begin, end - 1);
653 set_memory_np(begin, (end - begin) >> PAGE_SHIFT); 650 set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
654 #else 651 #else
655 /* 652 /*
656 * We just marked the kernel text read-only above; now that 653 * We just marked the kernel text read-only above; now that
657 * we are going to free part of it, we need to make it 654 * we are going to free part of it, we need to make it
658 * writable and non-executable first. 655 * writable and non-executable first.
659 */ 656 */
660 set_memory_nx(begin, (end - begin) >> PAGE_SHIFT); 657 set_memory_nx(begin, (end - begin) >> PAGE_SHIFT);
661 set_memory_rw(begin, (end - begin) >> PAGE_SHIFT); 658 set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
662 659
663 free_reserved_area((void *)begin, (void *)end, POISON_FREE_INITMEM, what); 660 free_reserved_area((void *)begin, (void *)end, POISON_FREE_INITMEM, what);
664 #endif 661 #endif
665 } 662 }
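The alignment logic at the top of free_init_pages() rounds begin up and end down, so only pages lying entirely inside [begin, end) are ever freed. A tiny stand-alone sketch of just that arithmetic, with made-up addresses:

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PAGE_MASK	(~(PAGE_SIZE - 1))
#define PAGE_ALIGN(x)	(((x) + PAGE_SIZE - 1) & PAGE_MASK)

int main(void)
{
	unsigned long begin = 0x100234, end = 0x105800;	/* hypothetical range */

	printf("begin %#lx -> PAGE_ALIGN(begin)  = %#lx (rounded up)\n",
	       begin, PAGE_ALIGN(begin));
	printf("end   %#lx -> end & PAGE_MASK    = %#lx (rounded down)\n",
	       end, end & PAGE_MASK);
	/* only the whole pages between the two results would be freed */
	return 0;
}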
666 663
667 void free_initmem(void) 664 void free_initmem(void)
668 { 665 {
669 free_init_pages("unused kernel", 666 free_init_pages("unused kernel",
670 (unsigned long)(&__init_begin), 667 (unsigned long)(&__init_begin),
671 (unsigned long)(&__init_end)); 668 (unsigned long)(&__init_end));
672 } 669 }
673 670
674 #ifdef CONFIG_BLK_DEV_INITRD 671 #ifdef CONFIG_BLK_DEV_INITRD
675 void __init free_initrd_mem(unsigned long start, unsigned long end) 672 void __init free_initrd_mem(unsigned long start, unsigned long end)
676 { 673 {
677 #ifdef CONFIG_MICROCODE_EARLY 674 #ifdef CONFIG_MICROCODE_EARLY
678 /* 675 /*
679 * Remember, initrd memory may contain microcode or other useful things. 676 * Remember, initrd memory may contain microcode or other useful things.
680 * Before we lose initrd mem, we need to find a place to hold them 677 * Before we lose initrd mem, we need to find a place to hold them
681 * now that normal virtual memory is enabled. 678 * now that normal virtual memory is enabled.
682 */ 679 */
683 save_microcode_in_initrd(); 680 save_microcode_in_initrd();
684 #endif 681 #endif
685 682
686 /* 683 /*
687 * end might not be aligned, and we cannot align it here; the 684 * end might not be aligned, and we cannot align it here; the
688 * decompressor could be confused by an aligned initrd_end. 685 * decompressor could be confused by an aligned initrd_end.
689 * We already reserved the trailing partial page earlier in 686 * We already reserved the trailing partial page earlier in
690 * - i386_start_kernel() 687 * - i386_start_kernel()
691 * - x86_64_start_kernel() 688 * - x86_64_start_kernel()
692 * - relocate_initrd() 689 * - relocate_initrd()
693 * So doing PAGE_ALIGN() here is safe and frees that partial page too. 690 * So doing PAGE_ALIGN() here is safe and frees that partial page too.
694 */ 691 */
695 free_init_pages("initrd", start, PAGE_ALIGN(end)); 692 free_init_pages("initrd", start, PAGE_ALIGN(end));
696 } 693 }
697 #endif 694 #endif
698 695
699 void __init zone_sizes_init(void) 696 void __init zone_sizes_init(void)
700 { 697 {
701 unsigned long max_zone_pfns[MAX_NR_ZONES]; 698 unsigned long max_zone_pfns[MAX_NR_ZONES];
702 699
703 memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); 700 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
704 701
705 #ifdef CONFIG_ZONE_DMA 702 #ifdef CONFIG_ZONE_DMA
706 max_zone_pfns[ZONE_DMA] = min(MAX_DMA_PFN, max_low_pfn); 703 max_zone_pfns[ZONE_DMA] = min(MAX_DMA_PFN, max_low_pfn);
707 #endif 704 #endif
708 #ifdef CONFIG_ZONE_DMA32 705 #ifdef CONFIG_ZONE_DMA32
709 max_zone_pfns[ZONE_DMA32] = min(MAX_DMA32_PFN, max_low_pfn); 706 max_zone_pfns[ZONE_DMA32] = min(MAX_DMA32_PFN, max_low_pfn);
710 #endif 707 #endif
711 max_zone_pfns[ZONE_NORMAL] = max_low_pfn; 708 max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
712 #ifdef CONFIG_HIGHMEM 709 #ifdef CONFIG_HIGHMEM
713 max_zone_pfns[ZONE_HIGHMEM] = max_pfn; 710 max_zone_pfns[ZONE_HIGHMEM] = max_pfn;
714 #endif 711 #endif
715 712
716 free_area_init_nodes(max_zone_pfns); 713 free_area_init_nodes(max_zone_pfns);
717 } 714 }
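zone_sizes_init() only records the upper pfn of each zone; the min() calls clamp the DMA zones on machines with little memory. A user-space sketch of the same arithmetic for a hypothetical 8 GB 64-bit box, with MAX_DMA_PFN and MAX_DMA32_PFN spelled out as the usual 16 MB and 4 GB limits (assumed values, not copied from the headers):

#include <stdio.h>

#define PAGE_SHIFT	12
#define MAX_DMA_PFN	(16UL << (20 - PAGE_SHIFT))	/* 16 MB */
#define MAX_DMA32_PFN	(4UL << (30 - PAGE_SHIFT))	/* 4 GB  */
#define min(a, b)	((a) < (b) ? (a) : (b))

int main(void)
{
	unsigned long max_low_pfn = 8UL << (30 - PAGE_SHIFT);	/* pretend 8 GB of RAM */

	printf("ZONE_DMA    up to pfn %#lx\n", min(MAX_DMA_PFN, max_low_pfn));
	printf("ZONE_DMA32  up to pfn %#lx\n", min(MAX_DMA32_PFN, max_low_pfn));
	printf("ZONE_NORMAL up to pfn %#lx\n", max_low_pfn);
	return 0;
}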
718 715
719 void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache) 716 void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache)
720 { 717 {
721 /* entry 0 MUST be WB (hardwired to speed up translations) */ 718 /* entry 0 MUST be WB (hardwired to speed up translations) */
722 BUG_ON(!entry && cache != _PAGE_CACHE_MODE_WB); 719 BUG_ON(!entry && cache != _PAGE_CACHE_MODE_WB);
723 720
arch/x86/vdso/vma.c
1 /* 1 /*
2 * Copyright 2007 Andi Kleen, SUSE Labs. 2 * Copyright 2007 Andi Kleen, SUSE Labs.
3 * Subject to the GPL, v.2 3 * Subject to the GPL, v.2
4 * 4 *
5 * This contains most of the x86 vDSO kernel-side code. 5 * This contains most of the x86 vDSO kernel-side code.
6 */ 6 */
7 #include <linux/mm.h> 7 #include <linux/mm.h>
8 #include <linux/err.h> 8 #include <linux/err.h>
9 #include <linux/sched.h> 9 #include <linux/sched.h>
10 #include <linux/slab.h> 10 #include <linux/slab.h>
11 #include <linux/init.h> 11 #include <linux/init.h>
12 #include <linux/random.h> 12 #include <linux/random.h>
13 #include <linux/elf.h> 13 #include <linux/elf.h>
14 #include <linux/cpu.h> 14 #include <linux/cpu.h>
15 #include <asm/vgtod.h> 15 #include <asm/vgtod.h>
16 #include <asm/proto.h> 16 #include <asm/proto.h>
17 #include <asm/vdso.h> 17 #include <asm/vdso.h>
18 #include <asm/vvar.h> 18 #include <asm/vvar.h>
19 #include <asm/page.h> 19 #include <asm/page.h>
20 #include <asm/hpet.h> 20 #include <asm/hpet.h>
21 #include <asm/desc.h> 21 #include <asm/desc.h>
22 22
23 #if defined(CONFIG_X86_64) 23 #if defined(CONFIG_X86_64)
24 unsigned int __read_mostly vdso64_enabled = 1; 24 unsigned int __read_mostly vdso64_enabled = 1;
25 #endif 25 #endif
26 26
27 void __init init_vdso_image(const struct vdso_image *image) 27 void __init init_vdso_image(const struct vdso_image *image)
28 { 28 {
29 int i; 29 int i;
30 int npages = (image->size) / PAGE_SIZE; 30 int npages = (image->size) / PAGE_SIZE;
31 31
32 BUG_ON(image->size % PAGE_SIZE != 0); 32 BUG_ON(image->size % PAGE_SIZE != 0);
33 for (i = 0; i < npages; i++) 33 for (i = 0; i < npages; i++)
34 image->text_mapping.pages[i] = 34 image->text_mapping.pages[i] =
35 virt_to_page(image->data + i*PAGE_SIZE); 35 virt_to_page(image->data + i*PAGE_SIZE);
36 36
37 apply_alternatives((struct alt_instr *)(image->data + image->alt), 37 apply_alternatives((struct alt_instr *)(image->data + image->alt),
38 (struct alt_instr *)(image->data + image->alt + 38 (struct alt_instr *)(image->data + image->alt +
39 image->alt_len)); 39 image->alt_len));
40 } 40 }
41 41
42 struct linux_binprm; 42 struct linux_binprm;
43 43
44 /* Put the vdso above the (randomized) stack with another randomized offset. 44 /*
45 This way there is no hole in the middle of address space. 45 * Put the vdso above the (randomized) stack with another randomized
46 To save memory make sure it is still in the same PTE as the stack top. 46 * offset. This way there is no hole in the middle of address space.
47 This doesn't give that many random bits. 47 * To save memory make sure it is still in the same PTE as the stack
48 48 * top. This doesn't give that many random bits.
49 Only used for the 64-bit and x32 vdsos. */ 49 *
50 * Note that this algorithm is imperfect: the distribution of the vdso
51 * start address within a PMD is biased toward the end.
52 *
53 * Only used for the 64-bit and x32 vdsos.
54 */
50 static unsigned long vdso_addr(unsigned long start, unsigned len) 55 static unsigned long vdso_addr(unsigned long start, unsigned len)
51 { 56 {
52 #ifdef CONFIG_X86_32 57 #ifdef CONFIG_X86_32
53 return 0; 58 return 0;
54 #else 59 #else
55 unsigned long addr, end; 60 unsigned long addr, end;
56 unsigned offset; 61 unsigned offset;
57 end = (start + PMD_SIZE - 1) & PMD_MASK; 62
63 /*
64 * Round up the start address. It can start out unaligned as a result
65 * of stack start randomization.
66 */
67 start = PAGE_ALIGN(start);
68
69 /* Round the lowest possible end address up to a PMD boundary. */
70 end = (start + len + PMD_SIZE - 1) & PMD_MASK;
58 if (end >= TASK_SIZE_MAX) 71 if (end >= TASK_SIZE_MAX)
59 end = TASK_SIZE_MAX; 72 end = TASK_SIZE_MAX;
60 end -= len; 73 end -= len;
61 /* This loses some more bits than a modulo, but is cheaper */
62 offset = get_random_int() & (PTRS_PER_PTE - 1);
63 addr = start + (offset << PAGE_SHIFT);
64 if (addr >= end)
65 addr = end;
66 74
75 if (end > start) {
76 offset = get_random_int() % (((end - start) >> PAGE_SHIFT) + 1);
77 addr = start + (offset << PAGE_SHIFT);
78 } else {
79 addr = start;
80 }
81
67 /* 82 /*
68 * page-align it here so that get_unmapped_area doesn't 83 * Forcibly align the final address in case we have a hardware
69 * align it wrongfully again to the next page. addr can come in 4K 84 * issue that requires alignment for performance reasons.
70 * unaligned here as a result of stack start randomization.
71 */ 85 */
72 addr = PAGE_ALIGN(addr);
73 addr = align_vdso_addr(addr); 86 addr = align_vdso_addr(addr);
74 87
75 return addr; 88 return addr;
76 #endif 89 #endif
77 } 90 }
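The hunk above is the core of the "Fix the vdso address randomization algorithm" change: the old code picked a 9-bit page offset and clamped it to end, which biased the result toward end and, with an unaligned start, could also push the address past end after the later PAGE_ALIGN(). The new code aligns start first and then chooses uniformly among every page in [start, end]. The sketch below replays both versions in user space; rand() stands in for get_random_int(), and the TASK_SIZE_MAX clamp and align_vdso_addr() are left out, so treat it purely as an illustration of the distribution:

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PMD_SIZE	(1UL << 21)
#define PMD_MASK	(~(PMD_SIZE - 1))
#define PAGE_ALIGN(x)	(((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
#define PTRS_PER_PTE	512

/* old scheme: fixed-width offset, clamped to end, aligned afterwards */
static unsigned long old_vdso_addr(unsigned long start, unsigned long len)
{
	unsigned long end = (start + PMD_SIZE - 1) & PMD_MASK;
	unsigned long offset, addr;

	end -= len;
	offset = (unsigned long)rand() & (PTRS_PER_PTE - 1);
	addr = start + (offset << PAGE_SHIFT);
	if (addr >= end)
		addr = end;
	return PAGE_ALIGN(addr);
}

/* new scheme: align start, then pick uniformly from the valid pages */
static unsigned long new_vdso_addr(unsigned long start, unsigned long len)
{
	unsigned long end, offset, addr;

	start = PAGE_ALIGN(start);
	end = (start + len + PMD_SIZE - 1) & PMD_MASK;
	end -= len;

	if (end > start) {
		offset = (unsigned long)rand() % (((end - start) >> PAGE_SHIFT) + 1);
		addr = start + (offset << PAGE_SHIFT);
	} else {
		addr = start;
	}
	return addr;
}

int main(void)
{
	unsigned long start = 0x7ffffffde123UL;	/* made-up unaligned stack top */
	unsigned long len = 3 * PAGE_SIZE;	/* made-up vdso+vvar size */
	int i;

	srand(1);
	for (i = 0; i < 5; i++)
		printf("old %#lx   new %#lx\n",
		       old_vdso_addr(start, len), new_vdso_addr(start, len));
	return 0;
}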
78 91
79 static int map_vdso(const struct vdso_image *image, bool calculate_addr) 92 static int map_vdso(const struct vdso_image *image, bool calculate_addr)
80 { 93 {
81 struct mm_struct *mm = current->mm; 94 struct mm_struct *mm = current->mm;
82 struct vm_area_struct *vma; 95 struct vm_area_struct *vma;
83 unsigned long addr, text_start; 96 unsigned long addr, text_start;
84 int ret = 0; 97 int ret = 0;
85 static struct page *no_pages[] = {NULL}; 98 static struct page *no_pages[] = {NULL};
86 static struct vm_special_mapping vvar_mapping = { 99 static struct vm_special_mapping vvar_mapping = {
87 .name = "[vvar]", 100 .name = "[vvar]",
88 .pages = no_pages, 101 .pages = no_pages,
89 }; 102 };
90 103
91 if (calculate_addr) { 104 if (calculate_addr) {
92 addr = vdso_addr(current->mm->start_stack, 105 addr = vdso_addr(current->mm->start_stack,
93 image->size - image->sym_vvar_start); 106 image->size - image->sym_vvar_start);
94 } else { 107 } else {
95 addr = 0; 108 addr = 0;
96 } 109 }
97 110
98 down_write(&mm->mmap_sem); 111 down_write(&mm->mmap_sem);
99 112
100 addr = get_unmapped_area(NULL, addr, 113 addr = get_unmapped_area(NULL, addr,
101 image->size - image->sym_vvar_start, 0, 0); 114 image->size - image->sym_vvar_start, 0, 0);
102 if (IS_ERR_VALUE(addr)) { 115 if (IS_ERR_VALUE(addr)) {
103 ret = addr; 116 ret = addr;
104 goto up_fail; 117 goto up_fail;
105 } 118 }
106 119
107 text_start = addr - image->sym_vvar_start; 120 text_start = addr - image->sym_vvar_start;
108 current->mm->context.vdso = (void __user *)text_start; 121 current->mm->context.vdso = (void __user *)text_start;
109 122
110 /* 123 /*
111 * MAYWRITE to allow gdb to COW and set breakpoints 124 * MAYWRITE to allow gdb to COW and set breakpoints
112 */ 125 */
113 vma = _install_special_mapping(mm, 126 vma = _install_special_mapping(mm,
114 text_start, 127 text_start,
115 image->size, 128 image->size,
116 VM_READ|VM_EXEC| 129 VM_READ|VM_EXEC|
117 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, 130 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
118 &image->text_mapping); 131 &image->text_mapping);
119 132
120 if (IS_ERR(vma)) { 133 if (IS_ERR(vma)) {
121 ret = PTR_ERR(vma); 134 ret = PTR_ERR(vma);
122 goto up_fail; 135 goto up_fail;
123 } 136 }
124 137
125 vma = _install_special_mapping(mm, 138 vma = _install_special_mapping(mm,
126 addr, 139 addr,
127 -image->sym_vvar_start, 140 -image->sym_vvar_start,
128 VM_READ|VM_MAYREAD, 141 VM_READ|VM_MAYREAD,
129 &vvar_mapping); 142 &vvar_mapping);
130 143
131 if (IS_ERR(vma)) { 144 if (IS_ERR(vma)) {
132 ret = PTR_ERR(vma); 145 ret = PTR_ERR(vma);
133 goto up_fail; 146 goto up_fail;
134 } 147 }
135 148
136 if (image->sym_vvar_page) 149 if (image->sym_vvar_page)
137 ret = remap_pfn_range(vma, 150 ret = remap_pfn_range(vma,
138 text_start + image->sym_vvar_page, 151 text_start + image->sym_vvar_page,
139 __pa_symbol(&__vvar_page) >> PAGE_SHIFT, 152 __pa_symbol(&__vvar_page) >> PAGE_SHIFT,
140 PAGE_SIZE, 153 PAGE_SIZE,
141 PAGE_READONLY); 154 PAGE_READONLY);
142 155
143 if (ret) 156 if (ret)
144 goto up_fail; 157 goto up_fail;
145 158
146 #ifdef CONFIG_HPET_TIMER 159 #ifdef CONFIG_HPET_TIMER
147 if (hpet_address && image->sym_hpet_page) { 160 if (hpet_address && image->sym_hpet_page) {
148 ret = io_remap_pfn_range(vma, 161 ret = io_remap_pfn_range(vma,
149 text_start + image->sym_hpet_page, 162 text_start + image->sym_hpet_page,
150 hpet_address >> PAGE_SHIFT, 163 hpet_address >> PAGE_SHIFT,
151 PAGE_SIZE, 164 PAGE_SIZE,
152 pgprot_noncached(PAGE_READONLY)); 165 pgprot_noncached(PAGE_READONLY));
153 166
154 if (ret) 167 if (ret)
155 goto up_fail; 168 goto up_fail;
156 } 169 }
157 #endif 170 #endif
158 171
159 up_fail: 172 up_fail:
160 if (ret) 173 if (ret)
161 current->mm->context.vdso = NULL; 174 current->mm->context.vdso = NULL;
162 175
163 up_write(&mm->mmap_sem); 176 up_write(&mm->mmap_sem);
164 return ret; 177 return ret;
165 } 178 }
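The address bookkeeping in map_vdso() hinges on sym_vvar_start being negative: get_unmapped_area() returns the start of the whole block (vvar pages first), and subtracting the negative offset yields the start of the vdso text above them. A short sketch with made-up sizes; the real offsets come from the vdso image:

#include <stdio.h>

int main(void)
{
	/* hypothetical layout: two text pages, three vvar/hpet pages below them */
	long sym_vvar_start = -3L * 4096;
	unsigned long image_size = 2UL * 4096;
	unsigned long addr = 0x7ffff7ffa000UL;	/* pretend get_unmapped_area() result */

	unsigned long text_start = addr - sym_vvar_start;	/* as in map_vdso() */

	printf("whole mapping: %#lx, size %#lx\n", addr, image_size - sym_vvar_start);
	printf("vvar area:     %#lx, size %#lx\n", addr, (unsigned long)-sym_vvar_start);
	printf("vdso text:     %#lx, size %#lx\n", text_start, image_size);
	return 0;
}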
166 179
167 #if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT) 180 #if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT)
168 static int load_vdso32(void) 181 static int load_vdso32(void)
169 { 182 {
170 int ret; 183 int ret;
171 184
172 if (vdso32_enabled != 1) /* Other values all mean "disabled" */ 185 if (vdso32_enabled != 1) /* Other values all mean "disabled" */
173 return 0; 186 return 0;
174 187
175 ret = map_vdso(selected_vdso32, false); 188 ret = map_vdso(selected_vdso32, false);
176 if (ret) 189 if (ret)
177 return ret; 190 return ret;
178 191
179 if (selected_vdso32->sym_VDSO32_SYSENTER_RETURN) 192 if (selected_vdso32->sym_VDSO32_SYSENTER_RETURN)
180 current_thread_info()->sysenter_return = 193 current_thread_info()->sysenter_return =
181 current->mm->context.vdso + 194 current->mm->context.vdso +
182 selected_vdso32->sym_VDSO32_SYSENTER_RETURN; 195 selected_vdso32->sym_VDSO32_SYSENTER_RETURN;
183 196
184 return 0; 197 return 0;
185 } 198 }
186 #endif 199 #endif
187 200
188 #ifdef CONFIG_X86_64 201 #ifdef CONFIG_X86_64
189 int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) 202 int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
190 { 203 {
191 if (!vdso64_enabled) 204 if (!vdso64_enabled)
192 return 0; 205 return 0;
193 206
194 return map_vdso(&vdso_image_64, true); 207 return map_vdso(&vdso_image_64, true);
195 } 208 }
196 209
197 #ifdef CONFIG_COMPAT 210 #ifdef CONFIG_COMPAT
198 int compat_arch_setup_additional_pages(struct linux_binprm *bprm, 211 int compat_arch_setup_additional_pages(struct linux_binprm *bprm,
199 int uses_interp) 212 int uses_interp)
200 { 213 {
201 #ifdef CONFIG_X86_X32_ABI 214 #ifdef CONFIG_X86_X32_ABI
202 if (test_thread_flag(TIF_X32)) { 215 if (test_thread_flag(TIF_X32)) {
203 if (!vdso64_enabled) 216 if (!vdso64_enabled)
204 return 0; 217 return 0;
205 218
206 return map_vdso(&vdso_image_x32, true); 219 return map_vdso(&vdso_image_x32, true);
207 } 220 }
208 #endif 221 #endif
209 222
210 return load_vdso32(); 223 return load_vdso32();
211 } 224 }
212 #endif 225 #endif
213 #else 226 #else
214 int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) 227 int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
215 { 228 {
216 return load_vdso32(); 229 return load_vdso32();
217 } 230 }
218 #endif 231 #endif
219 232
220 #ifdef CONFIG_X86_64 233 #ifdef CONFIG_X86_64
221 static __init int vdso_setup(char *s) 234 static __init int vdso_setup(char *s)
222 { 235 {
223 vdso64_enabled = simple_strtoul(s, NULL, 0); 236 vdso64_enabled = simple_strtoul(s, NULL, 0);
224 return 0; 237 return 0;
225 } 238 }
226 __setup("vdso=", vdso_setup); 239 __setup("vdso=", vdso_setup);
227 #endif 240 #endif
228 241
229 #ifdef CONFIG_X86_64 242 #ifdef CONFIG_X86_64
230 static void vgetcpu_cpu_init(void *arg) 243 static void vgetcpu_cpu_init(void *arg)
231 { 244 {
232 int cpu = smp_processor_id(); 245 int cpu = smp_processor_id();
233 struct desc_struct d = { }; 246 struct desc_struct d = { };
234 unsigned long node = 0; 247 unsigned long node = 0;
235 #ifdef CONFIG_NUMA 248 #ifdef CONFIG_NUMA
236 node = cpu_to_node(cpu); 249 node = cpu_to_node(cpu);
237 #endif 250 #endif
238 if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP)) 251 if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP))
239 write_rdtscp_aux((node << 12) | cpu); 252 write_rdtscp_aux((node << 12) | cpu);
240 253
241 /* 254 /*
242 * Store cpu number in limit so that it can be loaded 255 * Store cpu number in limit so that it can be loaded
243 * quickly in user space in vgetcpu. (12 bits for the CPU 256 * quickly in user space in vgetcpu. (12 bits for the CPU
244 * and 8 bits for the node) 257 * and 8 bits for the node)
245 */ 258 */
246 d.limit0 = cpu | ((node & 0xf) << 12); 259 d.limit0 = cpu | ((node & 0xf) << 12);
247 d.limit = node >> 4; 260 d.limit = node >> 4;
248 d.type = 5; /* RO data, expand down, accessed */ 261 d.type = 5; /* RO data, expand down, accessed */
249 d.dpl = 3; /* Visible to user code */ 262 d.dpl = 3; /* Visible to user code */
250 d.s = 1; /* Not a system segment */ 263 d.s = 1; /* Not a system segment */
251 d.p = 1; /* Present */ 264 d.p = 1; /* Present */
252 d.d = 1; /* 32-bit */ 265 d.d = 1; /* 32-bit */
253 266
254 write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S); 267 write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
255 } 268 }
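The descriptor set up above encodes the CPU and node numbers in the 20-bit segment limit that LSL returns, which is what the vdso's __getcpu()/vgetcpu path reads back. A stand-alone sketch of the packing and unpacking, with arbitrary example values:

#include <stdio.h>

int main(void)
{
	unsigned int cpu = 5, node = 3;	/* example values */

	/* pack as vgetcpu_cpu_init() does: low 12 bits CPU, upper 8 bits node */
	unsigned int limit0 = (cpu & 0xfff) | ((node & 0xf) << 12);	/* bits 0..15 of the limit */
	unsigned int limit = (node >> 4) & 0xf;				/* bits 16..19 of the limit */

	/* what an LSL on that segment would hand back to user space */
	unsigned int p = (limit << 16) | limit0;

	printf("segment limit %#07x -> cpu %u, node %u\n", p, p & 0xfff, p >> 12);
	return 0;
}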
256 269
257 static int 270 static int
258 vgetcpu_cpu_notifier(struct notifier_block *n, unsigned long action, void *arg) 271 vgetcpu_cpu_notifier(struct notifier_block *n, unsigned long action, void *arg)
259 { 272 {
260 long cpu = (long)arg; 273 long cpu = (long)arg;
261 274
262 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) 275 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
263 smp_call_function_single(cpu, vgetcpu_cpu_init, NULL, 1); 276 smp_call_function_single(cpu, vgetcpu_cpu_init, NULL, 1);
264 277
265 return NOTIFY_DONE; 278 return NOTIFY_DONE;
266 } 279 }
267 280
268 static int __init init_vdso(void) 281 static int __init init_vdso(void)
269 { 282 {
270 init_vdso_image(&vdso_image_64); 283 init_vdso_image(&vdso_image_64);
271 284
272 #ifdef CONFIG_X86_X32_ABI 285 #ifdef CONFIG_X86_X32_ABI
273 init_vdso_image(&vdso_image_x32); 286 init_vdso_image(&vdso_image_x32);
274 #endif 287 #endif
275 288
276 cpu_notifier_register_begin(); 289 cpu_notifier_register_begin();
277 290
278 on_each_cpu(vgetcpu_cpu_init, NULL, 1); 291 on_each_cpu(vgetcpu_cpu_init, NULL, 1);
279 /* notifier priority > KVM */ 292 /* notifier priority > KVM */
280 __hotcpu_notifier(vgetcpu_cpu_notifier, 30); 293 __hotcpu_notifier(vgetcpu_cpu_notifier, 30);
281 294