Commit c40f6f8bbc4cbd2902671aacd587400ddca62627
Exists in
master
and in
4 other branches
Merge git://git.kernel.org/pub/scm/linux/kernel/git/dhowells/linux-2.6-nommu
* git://git.kernel.org/pub/scm/linux/kernel/git/dhowells/linux-2.6-nommu: NOMMU: Support XIP on initramfs NOMMU: Teach kobjsize() about VMA regions. FLAT: Don't attempt to expand the userspace stack to fill the space allocated FDPIC: Don't attempt to expand the userspace stack to fill the space allocated NOMMU: Improve procfs output using per-MM VMAs NOMMU: Make mmap allocation page trimming behaviour configurable. NOMMU: Make VMAs per MM as for MMU-mode linux NOMMU: Delete askedalloc and realalloc variables NOMMU: Rename ARM's struct vm_region NOMMU: Fix cleanup handling in ramfs_nommu_get_umapped_area()
Showing 29 changed files Inline Diff
- Documentation/nommu-mmap.txt
- Documentation/sysctl/vm.txt
- arch/arm/include/asm/mmu.h
- arch/arm/mm/dma-mapping.c
- arch/blackfin/include/asm/mmu.h
- arch/blackfin/kernel/ptrace.c
- arch/blackfin/kernel/traps.c
- arch/frv/kernel/ptrace.c
- arch/h8300/include/asm/mmu.h
- arch/m68knommu/include/asm/mmu.h
- arch/sh/include/asm/mmu.h
- fs/binfmt_elf_fdpic.c
- fs/binfmt_flat.c
- fs/proc/internal.h
- fs/proc/meminfo.c
- fs/proc/nommu.c
- fs/proc/task_nommu.c
- fs/ramfs/file-nommu.c
- include/asm-frv/mmu.h
- include/asm-m32r/mmu.h
- include/linux/mm.h
- include/linux/mm_types.h
- init/initramfs.c
- ipc/shm.c
- kernel/fork.c
- kernel/sysctl.c
- lib/Kconfig.debug
- mm/mmap.c
- mm/nommu.c
Documentation/nommu-mmap.txt
1 | ============================= | 1 | ============================= |
2 | NO-MMU MEMORY MAPPING SUPPORT | 2 | NO-MMU MEMORY MAPPING SUPPORT |
3 | ============================= | 3 | ============================= |
4 | 4 | ||
5 | The kernel has limited support for memory mapping under no-MMU conditions, such | 5 | The kernel has limited support for memory mapping under no-MMU conditions, such |
6 | as are used in uClinux environments. From the userspace point of view, memory | 6 | as are used in uClinux environments. From the userspace point of view, memory |
7 | mapping is made use of in conjunction with the mmap() system call, the shmat() | 7 | mapping is made use of in conjunction with the mmap() system call, the shmat() |
8 | call and the execve() system call. From the kernel's point of view, execve() | 8 | call and the execve() system call. From the kernel's point of view, execve() |
9 | mapping is actually performed by the binfmt drivers, which call back into the | 9 | mapping is actually performed by the binfmt drivers, which call back into the |
10 | mmap() routines to do the actual work. | 10 | mmap() routines to do the actual work. |
11 | 11 | ||
12 | Memory mapping behaviour also involves the way fork(), vfork(), clone() and | 12 | Memory mapping behaviour also involves the way fork(), vfork(), clone() and |
13 | ptrace() work. Under uClinux there is no fork(), and clone() must be supplied | 13 | ptrace() work. Under uClinux there is no fork(), and clone() must be supplied |
14 | the CLONE_VM flag. | 14 | the CLONE_VM flag. |
15 | 15 | ||
16 | The behaviour is similar between the MMU and no-MMU cases, but not identical; | 16 | The behaviour is similar between the MMU and no-MMU cases, but not identical; |
17 | and it's also much more restricted in the latter case: | 17 | and it's also much more restricted in the latter case: |
18 | 18 | ||
19 | (*) Anonymous mapping, MAP_PRIVATE | 19 | (*) Anonymous mapping, MAP_PRIVATE |
20 | 20 | ||
21 | In the MMU case: VM regions backed by arbitrary pages; copy-on-write | 21 | In the MMU case: VM regions backed by arbitrary pages; copy-on-write |
22 | across fork. | 22 | across fork. |
23 | 23 | ||
24 | In the no-MMU case: VM regions backed by arbitrary contiguous runs of | 24 | In the no-MMU case: VM regions backed by arbitrary contiguous runs of |
25 | pages. | 25 | pages. |
26 | 26 | ||
27 | (*) Anonymous mapping, MAP_SHARED | 27 | (*) Anonymous mapping, MAP_SHARED |
28 | 28 | ||
29 | These behave very much like private mappings, except that they're | 29 | These behave very much like private mappings, except that they're |
30 | shared across fork() or clone() without CLONE_VM in the MMU case. Since | 30 | shared across fork() or clone() without CLONE_VM in the MMU case. Since |
31 | the no-MMU case doesn't support these, behaviour is identical to | 31 | the no-MMU case doesn't support these, behaviour is identical to |
32 | MAP_PRIVATE there. | 32 | MAP_PRIVATE there. |
33 | 33 | ||
34 | (*) File, MAP_PRIVATE, PROT_READ / PROT_EXEC, !PROT_WRITE | 34 | (*) File, MAP_PRIVATE, PROT_READ / PROT_EXEC, !PROT_WRITE |
35 | 35 | ||
36 | In the MMU case: VM regions backed by pages read from file; changes to | 36 | In the MMU case: VM regions backed by pages read from file; changes to |
37 | the underlying file are reflected in the mapping; copied across fork. | 37 | the underlying file are reflected in the mapping; copied across fork. |
38 | 38 | ||
39 | In the no-MMU case: | 39 | In the no-MMU case: |
40 | 40 | ||
41 | - If one exists, the kernel will re-use an existing mapping to the | 41 | - If one exists, the kernel will re-use an existing mapping to the |
42 | same segment of the same file if that has compatible permissions, | 42 | same segment of the same file if that has compatible permissions, |
43 | even if this was created by another process. | 43 | even if this was created by another process. |
44 | 44 | ||
45 | - If possible, the file mapping will be directly on the backing device | 45 | - If possible, the file mapping will be directly on the backing device |
46 | if the backing device has the BDI_CAP_MAP_DIRECT capability and | 46 | if the backing device has the BDI_CAP_MAP_DIRECT capability and |
47 | appropriate mapping protection capabilities. Ramfs, romfs, cramfs | 47 | appropriate mapping protection capabilities. Ramfs, romfs, cramfs |
48 | and mtd might all permit this. | 48 | and mtd might all permit this. |
49 | 49 | ||
50 | - If the backing device can't or won't permit direct sharing, | 50 | - If the backing device can't or won't permit direct sharing, |
51 | but does have the BDI_CAP_MAP_COPY capability, then a copy of the | 51 | but does have the BDI_CAP_MAP_COPY capability, then a copy of the |
52 | appropriate bit of the file will be read into a contiguous bit of | 52 | appropriate bit of the file will be read into a contiguous bit of |
53 | memory and any extraneous space beyond the EOF will be cleared. | 53 | memory and any extraneous space beyond the EOF will be cleared. |
54 | 54 | ||
55 | - Writes to the file do not affect the mapping; writes to the mapping | 55 | - Writes to the file do not affect the mapping; writes to the mapping |
56 | are visible in other processes (no MMU protection), but should not | 56 | are visible in other processes (no MMU protection), but should not |
57 | happen. | 57 | happen. |
58 | 58 | ||
59 | (*) File, MAP_PRIVATE, PROT_READ / PROT_EXEC, PROT_WRITE | 59 | (*) File, MAP_PRIVATE, PROT_READ / PROT_EXEC, PROT_WRITE |
60 | 60 | ||
61 | In the MMU case: like the non-PROT_WRITE case, except that the pages in | 61 | In the MMU case: like the non-PROT_WRITE case, except that the pages in |
62 | question get copied before the write actually happens. From that point | 62 | question get copied before the write actually happens. From that point |
63 | on writes to the file underneath that page no longer get reflected into | 63 | on writes to the file underneath that page no longer get reflected into |
64 | the mapping's backing pages. The page is then backed by swap instead. | 64 | the mapping's backing pages. The page is then backed by swap instead. |
65 | 65 | ||
66 | In the no-MMU case: works much like the non-PROT_WRITE case, except | 66 | In the no-MMU case: works much like the non-PROT_WRITE case, except |
67 | that a copy is always taken and never shared. | 67 | that a copy is always taken and never shared. |
68 | 68 | ||
69 | (*) Regular file / blockdev, MAP_SHARED, PROT_READ / PROT_EXEC / PROT_WRITE | 69 | (*) Regular file / blockdev, MAP_SHARED, PROT_READ / PROT_EXEC / PROT_WRITE |
70 | 70 | ||
71 | In the MMU case: VM regions backed by pages read from file; changes to | 71 | In the MMU case: VM regions backed by pages read from file; changes to |
72 | pages written back to file; writes to file reflected into pages backing | 72 | pages written back to file; writes to file reflected into pages backing |
73 | mapping; shared across fork. | 73 | mapping; shared across fork. |
74 | 74 | ||
75 | In the no-MMU case: not supported. | 75 | In the no-MMU case: not supported. |
76 | 76 | ||
77 | (*) Memory backed regular file, MAP_SHARED, PROT_READ / PROT_EXEC / PROT_WRITE | 77 | (*) Memory backed regular file, MAP_SHARED, PROT_READ / PROT_EXEC / PROT_WRITE |
78 | 78 | ||
79 | In the MMU case: As for ordinary regular files. | 79 | In the MMU case: As for ordinary regular files. |
80 | 80 | ||
81 | In the no-MMU case: The filesystem providing the memory-backed file | 81 | In the no-MMU case: The filesystem providing the memory-backed file |
82 | (such as ramfs or tmpfs) may choose to honour an open, truncate, mmap | 82 | (such as ramfs or tmpfs) may choose to honour an open, truncate, mmap |
83 | sequence by providing a contiguous sequence of pages to map. In that | 83 | sequence by providing a contiguous sequence of pages to map. In that |
84 | case, a shared-writable memory mapping will be possible. It will work | 84 | case, a shared-writable memory mapping will be possible. It will work |
85 | as for the MMU case. If the filesystem does not provide any such | 85 | as for the MMU case. If the filesystem does not provide any such |
86 | support, then the mapping request will be denied. | 86 | support, then the mapping request will be denied. |
87 | 87 | ||
88 | (*) Memory backed blockdev, MAP_SHARED, PROT_READ / PROT_EXEC / PROT_WRITE | 88 | (*) Memory backed blockdev, MAP_SHARED, PROT_READ / PROT_EXEC / PROT_WRITE |
89 | 89 | ||
90 | In the MMU case: As for ordinary regular files. | 90 | In the MMU case: As for ordinary regular files. |
91 | 91 | ||
92 | In the no-MMU case: As for memory backed regular files, but the | 92 | In the no-MMU case: As for memory backed regular files, but the |
93 | blockdev must be able to provide a contiguous run of pages without | 93 | blockdev must be able to provide a contiguous run of pages without |
94 | truncate being called. The ramdisk driver could do this if it allocated | 94 | truncate being called. The ramdisk driver could do this if it allocated |
95 | all its memory as a contiguous array upfront. | 95 | all its memory as a contiguous array upfront. |
96 | 96 | ||
97 | (*) Memory backed chardev, MAP_SHARED, PROT_READ / PROT_EXEC / PROT_WRITE | 97 | (*) Memory backed chardev, MAP_SHARED, PROT_READ / PROT_EXEC / PROT_WRITE |
98 | 98 | ||
99 | In the MMU case: As for ordinary regular files. | 99 | In the MMU case: As for ordinary regular files. |
100 | 100 | ||
101 | In the no-MMU case: The character device driver may choose to honour | 101 | In the no-MMU case: The character device driver may choose to honour |
102 | the mmap() by providing direct access to the underlying device if it | 102 | the mmap() by providing direct access to the underlying device if it |
103 | provides memory or quasi-memory that can be accessed directly. Examples | 103 | provides memory or quasi-memory that can be accessed directly. Examples |
104 | of such are frame buffers and flash devices. If the driver does not | 104 | of such are frame buffers and flash devices. If the driver does not |
105 | provide any such support, then the mapping request will be denied. | 105 | provide any such support, then the mapping request will be denied. |
106 | 106 | ||
107 | 107 | ||
108 | ============================ | 108 | ============================ |
109 | FURTHER NOTES ON NO-MMU MMAP | 109 | FURTHER NOTES ON NO-MMU MMAP |
110 | ============================ | 110 | ============================ |
111 | 111 | ||
112 | (*) A request for a private mapping of less than a page in size may not return | 112 | (*) A request for a private mapping of a file may return a buffer that is not |
113 | a page-aligned buffer. This is because the kernel calls kmalloc() to | 113 | page-aligned. This is because XIP may take place, and the data may not be |
114 | allocate the buffer, not get_free_page(). | 114 | page aligned in the backing store. |
115 | 115 | ||
116 | (*) A list of all the mappings on the system is visible through /proc/maps in | 116 | (*) A request for an anonymous mapping will always be page aligned. If |
117 | no-MMU mode. | 117 | possible the size of the request should be a power of two otherwise some |
118 | of the space may be wasted as the kernel must allocate a power-of-2 | ||
119 | granule but will only discard the excess if appropriately configured as | ||
120 | this has an effect on fragmentation. | ||
118 | 121 | ||
122 | (*) A list of all the private copy and anonymous mappings on the system is | ||
123 | visible through /proc/maps in no-MMU mode. | ||
124 | |||
119 | (*) A list of all the mappings in use by a process is visible through | 125 | (*) A list of all the mappings in use by a process is visible through |
120 | /proc/<pid>/maps in no-MMU mode. | 126 | /proc/<pid>/maps in no-MMU mode. |
121 | 127 | ||
122 | (*) Supplying MAP_FIXED or requesting a particular mapping address will | 128 | (*) Supplying MAP_FIXED or requesting a particular mapping address will |
123 | result in an error. | 129 | result in an error. |
124 | 130 | ||
125 | (*) Files mapped privately usually have to have a read method provided by the | 131 | (*) Files mapped privately usually have to have a read method provided by the |
126 | driver or filesystem so that the contents can be read into the memory | 132 | driver or filesystem so that the contents can be read into the memory |
127 | allocated if mmap() chooses not to map the backing device directly. An | 133 | allocated if mmap() chooses not to map the backing device directly. An |
128 | error will result if they don't. This is most likely to be encountered | 134 | error will result if they don't. This is most likely to be encountered |
129 | with character device files, pipes, fifos and sockets. | 135 | with character device files, pipes, fifos and sockets. |
130 | 136 | ||
131 | 137 | ||
132 | ========================== | 138 | ========================== |
133 | INTERPROCESS SHARED MEMORY | 139 | INTERPROCESS SHARED MEMORY |
134 | ========================== | 140 | ========================== |
135 | 141 | ||
136 | Both SYSV IPC SHM shared memory and POSIX shared memory are supported in NOMMU | 142 | Both SYSV IPC SHM shared memory and POSIX shared memory are supported in NOMMU |
137 | mode. The former through the usual mechanism, the latter through files created | 143 | mode. The former through the usual mechanism, the latter through files created |
138 | on ramfs or tmpfs mounts. | 144 | on ramfs or tmpfs mounts. |
139 | 145 | ||
140 | 146 | ||
141 | ======= | 147 | ======= |
142 | FUTEXES | 148 | FUTEXES |
143 | ======= | 149 | ======= |
144 | 150 | ||
145 | Futexes are supported in NOMMU mode if the arch supports them. An error will | 151 | Futexes are supported in NOMMU mode if the arch supports them. An error will |
146 | be given if an address passed to the futex system call lies outside the | 152 | be given if an address passed to the futex system call lies outside the |
147 | mappings made by a process or if the mapping in which the address lies does not | 153 | mappings made by a process or if the mapping in which the address lies does not |
148 | support futexes (such as an I/O chardev mapping). | 154 | support futexes (such as an I/O chardev mapping). |
149 | 155 | ||
150 | 156 | ||
151 | ============= | 157 | ============= |
152 | NO-MMU MREMAP | 158 | NO-MMU MREMAP |
153 | ============= | 159 | ============= |
154 | 160 | ||
155 | The mremap() function is partially supported. It may change the size of a | 161 | The mremap() function is partially supported. It may change the size of a |
156 | mapping, and may move it[*] if MREMAP_MAYMOVE is specified and if the new size | 162 | mapping, and may move it[*] if MREMAP_MAYMOVE is specified and if the new size |
157 | of the mapping exceeds the size of the slab object currently occupied by the | 163 | of the mapping exceeds the size of the slab object currently occupied by the |
158 | memory to which the mapping refers, or if a smaller slab object could be used. | 164 | memory to which the mapping refers, or if a smaller slab object could be used. |
159 | 165 | ||
160 | MREMAP_FIXED is not supported, though it is ignored if there's no change of | 166 | MREMAP_FIXED is not supported, though it is ignored if there's no change of |
161 | address and the object does not need to be moved. | 167 | address and the object does not need to be moved. |
162 | 168 | ||
163 | Shared mappings may not be moved. Shareable mappings may not be moved either, | 169 | Shared mappings may not be moved. Shareable mappings may not be moved either, |
164 | even if they are not currently shared. | 170 | even if they are not currently shared. |
165 | 171 | ||
166 | The mremap() function must be given an exact match for base address and size of | 172 | The mremap() function must be given an exact match for base address and size of |
167 | a previously mapped object. It may not be used to create holes in existing | 173 | a previously mapped object. It may not be used to create holes in existing |
168 | mappings, move parts of existing mappings or resize parts of mappings. It must | 174 | mappings, move parts of existing mappings or resize parts of mappings. It must |
169 | act on a complete mapping. | 175 | act on a complete mapping. |
170 | 176 | ||
171 | [*] Not currently supported. | 177 | [*] Not currently supported. |
172 | 178 | ||
173 | 179 | ||
174 | ============================================ | 180 | ============================================ |
175 | PROVIDING SHAREABLE CHARACTER DEVICE SUPPORT | 181 | PROVIDING SHAREABLE CHARACTER DEVICE SUPPORT |
176 | ============================================ | 182 | ============================================ |
177 | 183 | ||
178 | To provide shareable character device support, a driver must provide a | 184 | To provide shareable character device support, a driver must provide a |
179 | file->f_op->get_unmapped_area() operation. The mmap() routines will call this | 185 | file->f_op->get_unmapped_area() operation. The mmap() routines will call this |
180 | to get a proposed address for the mapping. This may return an error if it | 186 | to get a proposed address for the mapping. This may return an error if it |
181 | doesn't wish to honour the mapping because it's too long, at a weird offset, | 187 | doesn't wish to honour the mapping because it's too long, at a weird offset, |
182 | under some unsupported combination of flags or whatever. | 188 | under some unsupported combination of flags or whatever. |
183 | 189 | ||
184 | The driver should also provide backing device information with capabilities set | 190 | The driver should also provide backing device information with capabilities set |
185 | to indicate the permitted types of mapping on such devices. The default is | 191 | to indicate the permitted types of mapping on such devices. The default is |
186 | assumed to be readable and writable, not executable, and only shareable | 192 | assumed to be readable and writable, not executable, and only shareable |
187 | directly (can't be copied). | 193 | directly (can't be copied). |
188 | 194 | ||
189 | The file->f_op->mmap() operation will be called to actually inaugurate the | 195 | The file->f_op->mmap() operation will be called to actually inaugurate the |
190 | mapping. It can be rejected at that point. Returning the ENOSYS error will | 196 | mapping. It can be rejected at that point. Returning the ENOSYS error will |
191 | cause the mapping to be copied instead if BDI_CAP_MAP_COPY is specified. | 197 | cause the mapping to be copied instead if BDI_CAP_MAP_COPY is specified. |
192 | 198 | ||
193 | The vm_ops->close() routine will be invoked when the last mapping on a chardev | 199 | The vm_ops->close() routine will be invoked when the last mapping on a chardev |
194 | is removed. An existing mapping will be shared, partially or not, if possible | 200 | is removed. An existing mapping will be shared, partially or not, if possible |
195 | without notifying the driver. | 201 | without notifying the driver. |
196 | 202 | ||
197 | It is permitted also for the file->f_op->get_unmapped_area() operation to | 203 | It is permitted also for the file->f_op->get_unmapped_area() operation to |
198 | return -ENOSYS. This will be taken to mean that this operation just doesn't | 204 | return -ENOSYS. This will be taken to mean that this operation just doesn't |
199 | want to handle it, despite the fact it's got an operation. For instance, it | 205 | want to handle it, despite the fact it's got an operation. For instance, it |
200 | might try directing the call to a secondary driver which turns out not to | 206 | might try directing the call to a secondary driver which turns out not to |
201 | implement it. Such is the case for the framebuffer driver which attempts to | 207 | implement it. Such is the case for the framebuffer driver which attempts to |
202 | direct the call to the device-specific driver. Under such circumstances, the | 208 | direct the call to the device-specific driver. Under such circumstances, the |
203 | mapping request will be rejected if BDI_CAP_MAP_COPY is not specified, and a | 209 | mapping request will be rejected if BDI_CAP_MAP_COPY is not specified, and a |
204 | copy mapped otherwise. | 210 | copy mapped otherwise. |
205 | 211 | ||
206 | IMPORTANT NOTE: | 212 | IMPORTANT NOTE: |
207 | 213 | ||
208 | Some types of device may present a different appearance to anyone | 214 | Some types of device may present a different appearance to anyone |
209 | looking at them in certain modes. Flash chips can be like this; for | 215 | looking at them in certain modes. Flash chips can be like this; for |
210 | instance if they're in programming or erase mode, you might see the | 216 | instance if they're in programming or erase mode, you might see the |
211 | status reflected in the mapping, instead of the data. | 217 | status reflected in the mapping, instead of the data. |
212 | 218 | ||
213 | In such a case, care must be taken lest userspace see a shared or a | 219 | In such a case, care must be taken lest userspace see a shared or a |
214 | private mapping showing such information when the driver is busy | 220 | private mapping showing such information when the driver is busy |
215 | controlling the device. Remember especially: private executable | 221 | controlling the device. Remember especially: private executable |
216 | mappings may still be mapped directly off the device under some | 222 | mappings may still be mapped directly off the device under some |
217 | circumstances! | 223 | circumstances! |
218 | 224 | ||
219 | 225 | ||
220 | ============================================== | 226 | ============================================== |
221 | PROVIDING SHAREABLE MEMORY-BACKED FILE SUPPORT | 227 | PROVIDING SHAREABLE MEMORY-BACKED FILE SUPPORT |
222 | ============================================== | 228 | ============================================== |
223 | 229 | ||
224 | Provision of shared mappings on memory backed files is similar to the provision | 230 | Provision of shared mappings on memory backed files is similar to the provision |
225 | of support for shared mapped character devices. The main difference is that the | 231 | of support for shared mapped character devices. The main difference is that the |
226 | filesystem providing the service will probably allocate a contiguous collection | 232 | filesystem providing the service will probably allocate a contiguous collection |
227 | of pages and permit mappings to be made on that. | 233 | of pages and permit mappings to be made on that. |
228 | 234 | ||
229 | It is recommended that a truncate operation applied to such a file that | 235 | It is recommended that a truncate operation applied to such a file that |
230 | increases the file size, if that file is empty, be taken as a request to gather | 236 | increases the file size, if that file is empty, be taken as a request to gather |
231 | enough pages to honour a mapping. This is required to support POSIX shared | 237 | enough pages to honour a mapping. This is required to support POSIX shared |
232 | memory. | 238 | memory. |
233 | 239 | ||
234 | Memory backed devices are indicated by the mapping's backing device info having | 240 | Memory backed devices are indicated by the mapping's backing device info having |
235 | the memory_backed flag set. | 241 | the memory_backed flag set. |
236 | 242 | ||
237 | 243 | ||
238 | ======================================== | 244 | ======================================== |
239 | PROVIDING SHAREABLE BLOCK DEVICE SUPPORT | 245 | PROVIDING SHAREABLE BLOCK DEVICE SUPPORT |
240 | ======================================== | 246 | ======================================== |
241 | 247 | ||
242 | Provision of shared mappings on block device files is exactly the same as for | 248 | Provision of shared mappings on block device files is exactly the same as for |
243 | character devices. If there isn't a real device underneath, then the driver | 249 | character devices. If there isn't a real device underneath, then the driver |
244 | should allocate sufficient contiguous memory to honour any supported mapping. | 250 | should allocate sufficient contiguous memory to honour any supported mapping. |
251 | |||
252 | |||
253 | ================================= | ||
254 | ADJUSTING PAGE TRIMMING BEHAVIOUR | ||
255 | ================================= | ||
256 | |||
257 | NOMMU mmap automatically rounds up to the nearest power-of-2 number of pages | ||
258 | when performing an allocation. This can have adverse effects on memory | ||
259 | fragmentation, and as such, is left configurable. The default behaviour is to | ||
260 | aggressively trim allocations and discard any excess pages back in to the page | ||
261 | allocator. In order to retain finer-grained control over fragmentation, this | ||
262 | behaviour can either be disabled completely, or bumped up to a higher page | ||
263 | watermark where trimming begins. | ||
264 | |||
265 | Page trimming behaviour is configurable via the sysctl `vm.nr_trim_pages'. | ||
245 | 266 |
Documentation/sysctl/vm.txt
1 | Documentation for /proc/sys/vm/* kernel version 2.2.10 | 1 | Documentation for /proc/sys/vm/* kernel version 2.2.10 |
2 | (c) 1998, 1999, Rik van Riel <riel@nl.linux.org> | 2 | (c) 1998, 1999, Rik van Riel <riel@nl.linux.org> |
3 | 3 | ||
4 | For general info and legal blurb, please look in README. | 4 | For general info and legal blurb, please look in README. |
5 | 5 | ||
6 | ============================================================== | 6 | ============================================================== |
7 | 7 | ||
8 | This file contains the documentation for the sysctl files in | 8 | This file contains the documentation for the sysctl files in |
9 | /proc/sys/vm and is valid for Linux kernel version 2.2. | 9 | /proc/sys/vm and is valid for Linux kernel version 2.2. |
10 | 10 | ||
11 | The files in this directory can be used to tune the operation | 11 | The files in this directory can be used to tune the operation |
12 | of the virtual memory (VM) subsystem of the Linux kernel and | 12 | of the virtual memory (VM) subsystem of the Linux kernel and |
13 | the writeout of dirty data to disk. | 13 | the writeout of dirty data to disk. |
14 | 14 | ||
15 | Default values and initialization routines for most of these | 15 | Default values and initialization routines for most of these |
16 | files can be found in mm/swap.c. | 16 | files can be found in mm/swap.c. |
17 | 17 | ||
18 | Currently, these files are in /proc/sys/vm: | 18 | Currently, these files are in /proc/sys/vm: |
19 | - overcommit_memory | 19 | - overcommit_memory |
20 | - page-cluster | 20 | - page-cluster |
21 | - dirty_ratio | 21 | - dirty_ratio |
22 | - dirty_background_ratio | 22 | - dirty_background_ratio |
23 | - dirty_expire_centisecs | 23 | - dirty_expire_centisecs |
24 | - dirty_writeback_centisecs | 24 | - dirty_writeback_centisecs |
25 | - highmem_is_dirtyable (only if CONFIG_HIGHMEM set) | 25 | - highmem_is_dirtyable (only if CONFIG_HIGHMEM set) |
26 | - max_map_count | 26 | - max_map_count |
27 | - min_free_kbytes | 27 | - min_free_kbytes |
28 | - laptop_mode | 28 | - laptop_mode |
29 | - block_dump | 29 | - block_dump |
30 | - drop-caches | 30 | - drop-caches |
31 | - zone_reclaim_mode | 31 | - zone_reclaim_mode |
32 | - min_unmapped_ratio | 32 | - min_unmapped_ratio |
33 | - min_slab_ratio | 33 | - min_slab_ratio |
34 | - panic_on_oom | 34 | - panic_on_oom |
35 | - oom_dump_tasks | 35 | - oom_dump_tasks |
36 | - oom_kill_allocating_task | 36 | - oom_kill_allocating_task |
37 | - mmap_min_addr | 37 | - mmap_min_addr |
38 | - numa_zonelist_order | 38 | - numa_zonelist_order |
39 | - nr_hugepages | 39 | - nr_hugepages |
40 | - nr_overcommit_hugepages | 40 | - nr_overcommit_hugepages |
41 | - nr_trim_pages (only if CONFIG_MMU=n) | ||
41 | 42 | ||
42 | ============================================================== | 43 | ============================================================== |
43 | 44 | ||
44 | dirty_bytes, dirty_ratio, dirty_background_bytes, | 45 | dirty_bytes, dirty_ratio, dirty_background_bytes, |
45 | dirty_background_ratio, dirty_expire_centisecs, | 46 | dirty_background_ratio, dirty_expire_centisecs, |
46 | dirty_writeback_centisecs, highmem_is_dirtyable, | 47 | dirty_writeback_centisecs, highmem_is_dirtyable, |
47 | vfs_cache_pressure, laptop_mode, block_dump, swap_token_timeout, | 48 | vfs_cache_pressure, laptop_mode, block_dump, swap_token_timeout, |
48 | drop-caches, hugepages_treat_as_movable: | 49 | drop-caches, hugepages_treat_as_movable: |
49 | 50 | ||
50 | See Documentation/filesystems/proc.txt | 51 | See Documentation/filesystems/proc.txt |
51 | 52 | ||
52 | ============================================================== | 53 | ============================================================== |
53 | 54 | ||
54 | overcommit_memory: | 55 | overcommit_memory: |
55 | 56 | ||
56 | This value contains a flag that enables memory overcommitment. | 57 | This value contains a flag that enables memory overcommitment. |
57 | 58 | ||
58 | When this flag is 0, the kernel attempts to estimate the amount | 59 | When this flag is 0, the kernel attempts to estimate the amount |
59 | of free memory left when userspace requests more memory. | 60 | of free memory left when userspace requests more memory. |
60 | 61 | ||
61 | When this flag is 1, the kernel pretends there is always enough | 62 | When this flag is 1, the kernel pretends there is always enough |
62 | memory until it actually runs out. | 63 | memory until it actually runs out. |
63 | 64 | ||
64 | When this flag is 2, the kernel uses a "never overcommit" | 65 | When this flag is 2, the kernel uses a "never overcommit" |
65 | policy that attempts to prevent any overcommit of memory. | 66 | policy that attempts to prevent any overcommit of memory. |
66 | 67 | ||
67 | This feature can be very useful because there are a lot of | 68 | This feature can be very useful because there are a lot of |
68 | programs that malloc() huge amounts of memory "just-in-case" | 69 | programs that malloc() huge amounts of memory "just-in-case" |
69 | and don't use much of it. | 70 | and don't use much of it. |
70 | 71 | ||
71 | The default value is 0. | 72 | The default value is 0. |
72 | 73 | ||
73 | See Documentation/vm/overcommit-accounting and | 74 | See Documentation/vm/overcommit-accounting and |
74 | security/commoncap.c::cap_vm_enough_memory() for more information. | 75 | security/commoncap.c::cap_vm_enough_memory() for more information. |
75 | 76 | ||
76 | ============================================================== | 77 | ============================================================== |
77 | 78 | ||
78 | overcommit_ratio: | 79 | overcommit_ratio: |
79 | 80 | ||
80 | When overcommit_memory is set to 2, the committed address | 81 | When overcommit_memory is set to 2, the committed address |
81 | space is not permitted to exceed swap plus this percentage | 82 | space is not permitted to exceed swap plus this percentage |
82 | of physical RAM. See above. | 83 | of physical RAM. See above. |
83 | 84 | ||
84 | ============================================================== | 85 | ============================================================== |
85 | 86 | ||
86 | page-cluster: | 87 | page-cluster: |
87 | 88 | ||
88 | The Linux VM subsystem avoids excessive disk seeks by reading | 89 | The Linux VM subsystem avoids excessive disk seeks by reading |
89 | multiple pages on a page fault. The number of pages it reads | 90 | multiple pages on a page fault. The number of pages it reads |
90 | is dependent on the amount of memory in your machine. | 91 | is dependent on the amount of memory in your machine. |
91 | 92 | ||
92 | The number of pages the kernel reads in at once is equal to | 93 | The number of pages the kernel reads in at once is equal to |
93 | 2 ^ page-cluster. Values above 2 ^ 5 don't make much sense | 94 | 2 ^ page-cluster. Values above 2 ^ 5 don't make much sense |
94 | for swap because we only cluster swap data in 32-page groups. | 95 | for swap because we only cluster swap data in 32-page groups. |
95 | 96 | ||
96 | ============================================================== | 97 | ============================================================== |
97 | 98 | ||
98 | max_map_count: | 99 | max_map_count: |
99 | 100 | ||
100 | This file contains the maximum number of memory map areas a process | 101 | This file contains the maximum number of memory map areas a process |
101 | may have. Memory map areas are used as a side-effect of calling | 102 | may have. Memory map areas are used as a side-effect of calling |
102 | malloc, directly by mmap and mprotect, and also when loading shared | 103 | malloc, directly by mmap and mprotect, and also when loading shared |
103 | libraries. | 104 | libraries. |
104 | 105 | ||
105 | While most applications need less than a thousand maps, certain | 106 | While most applications need less than a thousand maps, certain |
106 | programs, particularly malloc debuggers, may consume lots of them, | 107 | programs, particularly malloc debuggers, may consume lots of them, |
107 | e.g., up to one or two maps per allocation. | 108 | e.g., up to one or two maps per allocation. |
108 | 109 | ||
109 | The default value is 65536. | 110 | The default value is 65536. |
110 | 111 | ||
111 | ============================================================== | 112 | ============================================================== |
112 | 113 | ||
113 | min_free_kbytes: | 114 | min_free_kbytes: |
114 | 115 | ||
115 | This is used to force the Linux VM to keep a minimum number | 116 | This is used to force the Linux VM to keep a minimum number |
116 | of kilobytes free. The VM uses this number to compute a pages_min | 117 | of kilobytes free. The VM uses this number to compute a pages_min |
117 | value for each lowmem zone in the system. Each lowmem zone gets | 118 | value for each lowmem zone in the system. Each lowmem zone gets |
118 | a number of reserved free pages based proportionally on its size. | 119 | a number of reserved free pages based proportionally on its size. |
119 | 120 | ||
120 | Some minimal amount of memory is needed to satisfy PF_MEMALLOC | 121 | Some minimal amount of memory is needed to satisfy PF_MEMALLOC |
121 | allocations; if you set this to lower than 1024KB, your system will | 122 | allocations; if you set this to lower than 1024KB, your system will |
122 | become subtly broken, and prone to deadlock under high loads. | 123 | become subtly broken, and prone to deadlock under high loads. |
123 | 124 | ||
124 | Setting this too high will OOM your machine instantly. | 125 | Setting this too high will OOM your machine instantly. |
125 | 126 | ||
126 | ============================================================== | 127 | ============================================================== |
127 | 128 | ||
128 | percpu_pagelist_fraction | 129 | percpu_pagelist_fraction |
129 | 130 | ||
130 | This is the fraction of pages at most (high mark pcp->high) in each zone that | 131 | This is the fraction of pages at most (high mark pcp->high) in each zone that |
131 | are allocated for each per cpu page list. The min value for this is 8. It | 132 | are allocated for each per cpu page list. The min value for this is 8. It |
132 | means that we don't allow more than 1/8th of pages in each zone to be | 133 | means that we don't allow more than 1/8th of pages in each zone to be |
133 | allocated in any single per_cpu_pagelist. This entry only changes the value | 134 | allocated in any single per_cpu_pagelist. This entry only changes the value |
134 | of hot per cpu pagelists. User can specify a number like 100 to allocate | 135 | of hot per cpu pagelists. User can specify a number like 100 to allocate |
135 | 1/100th of each zone to each per cpu page list. | 136 | 1/100th of each zone to each per cpu page list. |
136 | 137 | ||
137 | The batch value of each per cpu pagelist is also updated as a result. It is | 138 | The batch value of each per cpu pagelist is also updated as a result. It is |
138 | set to pcp->high/4. The upper limit of batch is (PAGE_SHIFT * 8) | 139 | set to pcp->high/4. The upper limit of batch is (PAGE_SHIFT * 8) |
139 | 140 | ||
140 | The initial value is zero. Kernel does not use this value at boot time to set | 141 | The initial value is zero. Kernel does not use this value at boot time to set |
141 | the high water marks for each per cpu page list. | 142 | the high water marks for each per cpu page list. |
142 | 143 | ||
143 | =============================================================== | 144 | =============================================================== |
144 | 145 | ||
145 | zone_reclaim_mode: | 146 | zone_reclaim_mode: |
146 | 147 | ||
147 | Zone_reclaim_mode allows someone to set more or less aggressive approaches to | 148 | Zone_reclaim_mode allows someone to set more or less aggressive approaches to |
148 | reclaim memory when a zone runs out of memory. If it is set to zero then no | 149 | reclaim memory when a zone runs out of memory. If it is set to zero then no |
149 | zone reclaim occurs. Allocations will be satisfied from other zones / nodes | 150 | zone reclaim occurs. Allocations will be satisfied from other zones / nodes |
150 | in the system. | 151 | in the system. |
151 | 152 | ||
152 | This is a value ORed together of | 153 | This is a value ORed together of |
153 | 154 | ||
154 | 1 = Zone reclaim on | 155 | 1 = Zone reclaim on |
155 | 2 = Zone reclaim writes dirty pages out | 156 | 2 = Zone reclaim writes dirty pages out |
156 | 4 = Zone reclaim swaps pages | 157 | 4 = Zone reclaim swaps pages |
157 | 158 | ||
158 | zone_reclaim_mode is set during bootup to 1 if it is determined that pages | 159 | zone_reclaim_mode is set during bootup to 1 if it is determined that pages |
159 | from remote zones will cause a measurable performance reduction. The | 160 | from remote zones will cause a measurable performance reduction. The |
160 | page allocator will then reclaim easily reusable pages (those page | 161 | page allocator will then reclaim easily reusable pages (those page |
161 | cache pages that are currently not used) before allocating off node pages. | 162 | cache pages that are currently not used) before allocating off node pages. |
162 | 163 | ||
163 | It may be beneficial to switch off zone reclaim if the system is | 164 | It may be beneficial to switch off zone reclaim if the system is |
164 | used for a file server and all of memory should be used for caching files | 165 | used for a file server and all of memory should be used for caching files |
165 | from disk. In that case the caching effect is more important than | 166 | from disk. In that case the caching effect is more important than |
166 | data locality. | 167 | data locality. |
167 | 168 | ||
168 | Allowing zone reclaim to write out pages stops processes that are | 169 | Allowing zone reclaim to write out pages stops processes that are |
169 | writing large amounts of data from dirtying pages on other nodes. Zone | 170 | writing large amounts of data from dirtying pages on other nodes. Zone |
170 | reclaim will write out dirty pages if a zone fills up and so effectively | 171 | reclaim will write out dirty pages if a zone fills up and so effectively |
171 | throttle the process. This may decrease the performance of a single process | 172 | throttle the process. This may decrease the performance of a single process |
172 | since it cannot use all of system memory to buffer the outgoing writes | 173 | since it cannot use all of system memory to buffer the outgoing writes |
173 | anymore but it preserves the memory on other nodes so that the performance | 174 | anymore but it preserves the memory on other nodes so that the performance |
174 | of other processes running on other nodes will not be affected. | 175 | of other processes running on other nodes will not be affected. |
175 | 176 | ||
176 | Allowing regular swap effectively restricts allocations to the local | 177 | Allowing regular swap effectively restricts allocations to the local |
177 | node unless explicitly overridden by memory policies or cpuset | 178 | node unless explicitly overridden by memory policies or cpuset |
178 | configurations. | 179 | configurations. |
179 | 180 | ||
180 | ============================================================= | 181 | ============================================================= |
181 | 182 | ||
182 | min_unmapped_ratio: | 183 | min_unmapped_ratio: |
183 | 184 | ||
184 | This is available only on NUMA kernels. | 185 | This is available only on NUMA kernels. |
185 | 186 | ||
186 | A percentage of the total pages in each zone. Zone reclaim will only | 187 | A percentage of the total pages in each zone. Zone reclaim will only |
187 | occur if more than this percentage of pages are file backed and unmapped. | 188 | occur if more than this percentage of pages are file backed and unmapped. |
188 | This is to ensure that a minimal amount of local pages is still available for | 189 | This is to ensure that a minimal amount of local pages is still available for |
189 | file I/O even if the node is overallocated. | 190 | file I/O even if the node is overallocated. |
190 | 191 | ||
191 | The default is 1 percent. | 192 | The default is 1 percent. |
192 | 193 | ||
193 | ============================================================= | 194 | ============================================================= |
194 | 195 | ||
195 | min_slab_ratio: | 196 | min_slab_ratio: |
196 | 197 | ||
197 | This is available only on NUMA kernels. | 198 | This is available only on NUMA kernels. |
198 | 199 | ||
199 | A percentage of the total pages in each zone. On Zone reclaim | 200 | A percentage of the total pages in each zone. On Zone reclaim |
200 | (fallback from the local zone occurs) slabs will be reclaimed if more | 201 | (fallback from the local zone occurs) slabs will be reclaimed if more |
201 | than this percentage of pages in a zone are reclaimable slab pages. | 202 | than this percentage of pages in a zone are reclaimable slab pages. |
202 | This ensures that the slab growth stays under control even in NUMA | 203 | This ensures that the slab growth stays under control even in NUMA |
203 | systems that rarely perform global reclaim. | 204 | systems that rarely perform global reclaim. |
204 | 205 | ||
205 | The default is 5 percent. | 206 | The default is 5 percent. |
206 | 207 | ||
207 | Note that slab reclaim is triggered in a per zone / node fashion. | 208 | Note that slab reclaim is triggered in a per zone / node fashion. |
208 | The process of reclaiming slab memory is currently not node specific | 209 | The process of reclaiming slab memory is currently not node specific |
209 | and may not be fast. | 210 | and may not be fast. |
210 | 211 | ||
211 | ============================================================= | 212 | ============================================================= |
212 | 213 | ||
213 | panic_on_oom | 214 | panic_on_oom |
214 | 215 | ||
215 | This enables or disables panic on out-of-memory feature. | 216 | This enables or disables panic on out-of-memory feature. |
216 | 217 | ||
217 | If this is set to 0, the kernel will kill some rogue process, | 218 | If this is set to 0, the kernel will kill some rogue process, |
218 | called oom_killer. Usually, oom_killer can kill rogue processes and | 219 | called oom_killer. Usually, oom_killer can kill rogue processes and |
219 | system will survive. | 220 | system will survive. |
220 | 221 | ||
221 | If this is set to 1, the kernel panics when out-of-memory happens. | 222 | If this is set to 1, the kernel panics when out-of-memory happens. |
222 | However, if a process limits using nodes by mempolicy/cpusets, | 223 | However, if a process limits using nodes by mempolicy/cpusets, |
223 | and those nodes become memory exhaustion status, one process | 224 | and those nodes become memory exhaustion status, one process |
224 | may be killed by oom-killer. No panic occurs in this case. | 225 | may be killed by oom-killer. No panic occurs in this case. |
225 | Because other nodes' memory may be free. This means system total status | 226 | Because other nodes' memory may be free. This means system total status |
226 | may be not fatal yet. | 227 | may be not fatal yet. |
227 | 228 | ||
228 | If this is set to 2, the kernel panics compulsorily even on the | 229 | If this is set to 2, the kernel panics compulsorily even on the |
229 | above-mentioned. | 230 | above-mentioned. |
230 | 231 | ||
231 | The default value is 0. | 232 | The default value is 0. |
232 | 1 and 2 are for failover of clustering. Please select either | 233 | 1 and 2 are for failover of clustering. Please select either |
233 | according to your policy of failover. | 234 | according to your policy of failover. |
234 | 235 | ||
235 | ============================================================= | 236 | ============================================================= |
236 | 237 | ||
237 | oom_dump_tasks | 238 | oom_dump_tasks |
238 | 239 | ||
239 | Enables a system-wide task dump (excluding kernel threads) to be | 240 | Enables a system-wide task dump (excluding kernel threads) to be |
240 | produced when the kernel performs an OOM-killing and includes such | 241 | produced when the kernel performs an OOM-killing and includes such |
241 | information as pid, uid, tgid, vm size, rss, cpu, oom_adj score, and | 242 | information as pid, uid, tgid, vm size, rss, cpu, oom_adj score, and |
242 | name. This is helpful to determine why the OOM killer was invoked | 243 | name. This is helpful to determine why the OOM killer was invoked |
243 | and to identify the rogue task that caused it. | 244 | and to identify the rogue task that caused it. |
244 | 245 | ||
245 | If this is set to zero, this information is suppressed. On very | 246 | If this is set to zero, this information is suppressed. On very |
246 | large systems with thousands of tasks it may not be feasible to dump | 247 | large systems with thousands of tasks it may not be feasible to dump |
247 | the memory state information for each one. Such systems should not | 248 | the memory state information for each one. Such systems should not |
248 | be forced to incur a performance penalty in OOM conditions when the | 249 | be forced to incur a performance penalty in OOM conditions when the |
249 | information may not be desired. | 250 | information may not be desired. |
250 | 251 | ||
251 | If this is set to non-zero, this information is shown whenever the | 252 | If this is set to non-zero, this information is shown whenever the |
252 | OOM killer actually kills a memory-hogging task. | 253 | OOM killer actually kills a memory-hogging task. |
253 | 254 | ||
254 | The default value is 0. | 255 | The default value is 0. |
255 | 256 | ||
256 | ============================================================= | 257 | ============================================================= |
257 | 258 | ||
258 | oom_kill_allocating_task | 259 | oom_kill_allocating_task |
259 | 260 | ||
260 | This enables or disables killing the OOM-triggering task in | 261 | This enables or disables killing the OOM-triggering task in |
261 | out-of-memory situations. | 262 | out-of-memory situations. |
262 | 263 | ||
263 | If this is set to zero, the OOM killer will scan through the entire | 264 | If this is set to zero, the OOM killer will scan through the entire |
264 | tasklist and select a task based on heuristics to kill. This normally | 265 | tasklist and select a task based on heuristics to kill. This normally |
265 | selects a rogue memory-hogging task that frees up a large amount of | 266 | selects a rogue memory-hogging task that frees up a large amount of |
266 | memory when killed. | 267 | memory when killed. |
267 | 268 | ||
268 | If this is set to non-zero, the OOM killer simply kills the task that | 269 | If this is set to non-zero, the OOM killer simply kills the task that |
269 | triggered the out-of-memory condition. This avoids the expensive | 270 | triggered the out-of-memory condition. This avoids the expensive |
270 | tasklist scan. | 271 | tasklist scan. |
271 | 272 | ||
272 | If panic_on_oom is selected, it takes precedence over whatever value | 273 | If panic_on_oom is selected, it takes precedence over whatever value |
273 | is used in oom_kill_allocating_task. | 274 | is used in oom_kill_allocating_task. |
274 | 275 | ||
275 | The default value is 0. | 276 | The default value is 0. |
276 | 277 | ||
277 | ============================================================== | 278 | ============================================================== |
278 | 279 | ||
279 | mmap_min_addr | 280 | mmap_min_addr |
280 | 281 | ||
281 | This file indicates the amount of address space which a user process will | 282 | This file indicates the amount of address space which a user process will |
282 | be restricted from mmaping. Since kernel null dereference bugs could | 283 | be restricted from mmaping. Since kernel null dereference bugs could |
283 | accidentally operate based on the information in the first couple of pages | 284 | accidentally operate based on the information in the first couple of pages |
284 | of memory userspace processes should not be allowed to write to them. By | 285 | of memory userspace processes should not be allowed to write to them. By |
285 | default this value is set to 0 and no protections will be enforced by the | 286 | default this value is set to 0 and no protections will be enforced by the |
286 | security module. Setting this value to something like 64k will allow the | 287 | security module. Setting this value to something like 64k will allow the |
287 | vast majority of applications to work correctly and provide defense in depth | 288 | vast majority of applications to work correctly and provide defense in depth |
288 | against future potential kernel bugs. | 289 | against future potential kernel bugs. |
289 | 290 | ||
290 | ============================================================== | 291 | ============================================================== |
291 | 292 | ||
292 | numa_zonelist_order | 293 | numa_zonelist_order |
293 | 294 | ||
294 | This sysctl is only for NUMA. | 295 | This sysctl is only for NUMA. |
295 | 'where the memory is allocated from' is controlled by zonelists. | 296 | 'where the memory is allocated from' is controlled by zonelists. |
296 | (This documentation ignores ZONE_HIGHMEM/ZONE_DMA32 for simple explanation. | 297 | (This documentation ignores ZONE_HIGHMEM/ZONE_DMA32 for simple explanation. |
297 | you may be able to read ZONE_DMA as ZONE_DMA32...) | 298 | you may be able to read ZONE_DMA as ZONE_DMA32...) |
298 | 299 | ||
299 | In non-NUMA case, a zonelist for GFP_KERNEL is ordered as following. | 300 | In non-NUMA case, a zonelist for GFP_KERNEL is ordered as following. |
300 | ZONE_NORMAL -> ZONE_DMA | 301 | ZONE_NORMAL -> ZONE_DMA |
301 | This means that a memory allocation request for GFP_KERNEL will | 302 | This means that a memory allocation request for GFP_KERNEL will |
302 | get memory from ZONE_DMA only when ZONE_NORMAL is not available. | 303 | get memory from ZONE_DMA only when ZONE_NORMAL is not available. |
303 | 304 | ||
304 | In NUMA case, you can think of following 2 types of order. | 305 | In NUMA case, you can think of following 2 types of order. |
305 | Assume 2 node NUMA and below is zonelist of Node(0)'s GFP_KERNEL | 306 | Assume 2 node NUMA and below is zonelist of Node(0)'s GFP_KERNEL |
306 | 307 | ||
307 | (A) Node(0) ZONE_NORMAL -> Node(0) ZONE_DMA -> Node(1) ZONE_NORMAL | 308 | (A) Node(0) ZONE_NORMAL -> Node(0) ZONE_DMA -> Node(1) ZONE_NORMAL |
308 | (B) Node(0) ZONE_NORMAL -> Node(1) ZONE_NORMAL -> Node(0) ZONE_DMA. | 309 | (B) Node(0) ZONE_NORMAL -> Node(1) ZONE_NORMAL -> Node(0) ZONE_DMA. |
309 | 310 | ||
310 | Type(A) offers the best locality for processes on Node(0), but ZONE_DMA | 311 | Type(A) offers the best locality for processes on Node(0), but ZONE_DMA |
311 | will be used before ZONE_NORMAL exhaustion. This increases possibility of | 312 | will be used before ZONE_NORMAL exhaustion. This increases possibility of |
312 | out-of-memory(OOM) of ZONE_DMA because ZONE_DMA tends to be small. | 313 | out-of-memory(OOM) of ZONE_DMA because ZONE_DMA tends to be small. |
313 | 314 | ||
314 | Type(B) cannot offer the best locality but is more robust against OOM of | 315 | Type(B) cannot offer the best locality but is more robust against OOM of |
315 | the DMA zone. | 316 | the DMA zone. |
316 | 317 | ||
317 | Type(A) is called "Node" order. Type (B) is "Zone" order. | 318 | Type(A) is called "Node" order. Type (B) is "Zone" order. |
318 | 319 | ||
319 | "Node order" orders the zonelists by node, then by zone within each node. | 320 | "Node order" orders the zonelists by node, then by zone within each node. |
320 | Specify "[Nn]ode" for node order | 321 | Specify "[Nn]ode" for node order |
321 | 322 | ||
322 | "Zone Order" orders the zonelists by zone type, then by node within each | 323 | "Zone Order" orders the zonelists by zone type, then by node within each |
323 | zone. Specify "[Zz]one" for zone order. | 324 | zone. Specify "[Zz]one" for zone order. |
324 | 325 | ||
325 | Specify "[Dd]efault" to request automatic configuration. Autoconfiguration | 326 | Specify "[Dd]efault" to request automatic configuration. Autoconfiguration |
326 | will select "node" order in following case. | 327 | will select "node" order in following case. |
327 | (1) if the DMA zone does not exist or | 328 | (1) if the DMA zone does not exist or |
328 | (2) if the DMA zone comprises greater than 50% of the available memory or | 329 | (2) if the DMA zone comprises greater than 50% of the available memory or |
329 | (3) if any node's DMA zone comprises greater than 60% of its local memory and | 330 | (3) if any node's DMA zone comprises greater than 60% of its local memory and |
330 | the amount of local memory is big enough. | 331 | the amount of local memory is big enough. |
331 | 332 | ||
332 | Otherwise, "zone" order will be selected. Default order is recommended unless | 333 | Otherwise, "zone" order will be selected. Default order is recommended unless |
333 | this is causing problems for your system/application. | 334 | this is causing problems for your system/application. |
334 | 335 | ||
335 | ============================================================== | 336 | ============================================================== |
336 | 337 | ||
337 | nr_hugepages | 338 | nr_hugepages |
338 | 339 | ||
339 | Change the minimum size of the hugepage pool. | 340 | Change the minimum size of the hugepage pool. |
340 | 341 | ||
341 | See Documentation/vm/hugetlbpage.txt | 342 | See Documentation/vm/hugetlbpage.txt |
342 | 343 | ||
343 | ============================================================== | 344 | ============================================================== |
344 | 345 | ||
345 | nr_overcommit_hugepages | 346 | nr_overcommit_hugepages |
346 | 347 | ||
347 | Change the maximum size of the hugepage pool. The maximum is | 348 | Change the maximum size of the hugepage pool. The maximum is |
348 | nr_hugepages + nr_overcommit_hugepages. | 349 | nr_hugepages + nr_overcommit_hugepages. |
349 | 350 | ||
350 | See Documentation/vm/hugetlbpage.txt | 351 | See Documentation/vm/hugetlbpage.txt |
352 | |||
353 | ============================================================== | ||
354 | |||
355 | nr_trim_pages | ||
356 | |||
357 | This is available only on NOMMU kernels. | ||
358 | |||
359 | This value adjusts the excess page trimming behaviour of power-of-2 aligned | ||
360 | NOMMU mmap allocations. | ||
361 | |||
362 | A value of 0 disables trimming of allocations entirely, while a value of 1 | ||
363 | trims excess pages aggressively. Any value >= 1 acts as the watermark where | ||
364 | trimming of allocations is initiated. | ||
365 | |||
366 | The default value is 1. | ||
367 | |||
368 | See Documentation/nommu-mmap.txt for more information. | ||
351 | 369 |
arch/arm/include/asm/mmu.h
1 | #ifndef __ARM_MMU_H | 1 | #ifndef __ARM_MMU_H |
2 | #define __ARM_MMU_H | 2 | #define __ARM_MMU_H |
3 | 3 | ||
4 | #ifdef CONFIG_MMU | 4 | #ifdef CONFIG_MMU |
5 | 5 | ||
6 | typedef struct { | 6 | typedef struct { |
7 | #ifdef CONFIG_CPU_HAS_ASID | 7 | #ifdef CONFIG_CPU_HAS_ASID |
8 | unsigned int id; | 8 | unsigned int id; |
9 | #endif | 9 | #endif |
10 | unsigned int kvm_seq; | 10 | unsigned int kvm_seq; |
11 | } mm_context_t; | 11 | } mm_context_t; |
12 | 12 | ||
13 | #ifdef CONFIG_CPU_HAS_ASID | 13 | #ifdef CONFIG_CPU_HAS_ASID |
14 | #define ASID(mm) ((mm)->context.id & 255) | 14 | #define ASID(mm) ((mm)->context.id & 255) |
15 | #else | 15 | #else |
16 | #define ASID(mm) (0) | 16 | #define ASID(mm) (0) |
17 | #endif | 17 | #endif |
18 | 18 | ||
19 | #else | 19 | #else |
20 | 20 | ||
21 | /* | 21 | /* |
22 | * From nommu.h: | 22 | * From nommu.h: |
23 | * Copyright (C) 2002, David McCullough <davidm@snapgear.com> | 23 | * Copyright (C) 2002, David McCullough <davidm@snapgear.com> |
24 | * modified for 2.6 by Hyok S. Choi <hyok.choi@samsung.com> | 24 | * modified for 2.6 by Hyok S. Choi <hyok.choi@samsung.com> |
25 | */ | 25 | */ |
26 | typedef struct { | 26 | typedef struct { |
27 | struct vm_list_struct *vmlist; | ||
28 | unsigned long end_brk; | 27 | unsigned long end_brk; |
29 | } mm_context_t; | 28 | } mm_context_t; |
30 | 29 | ||
31 | #endif | 30 | #endif |
32 | 31 | ||
33 | #endif | 32 | #endif |
34 | 33 |
arch/arm/mm/dma-mapping.c
1 | /* | 1 | /* |
2 | * linux/arch/arm/mm/dma-mapping.c | 2 | * linux/arch/arm/mm/dma-mapping.c |
3 | * | 3 | * |
4 | * Copyright (C) 2000-2004 Russell King | 4 | * Copyright (C) 2000-2004 Russell King |
5 | * | 5 | * |
6 | * This program is free software; you can redistribute it and/or modify | 6 | * This program is free software; you can redistribute it and/or modify |
7 | * it under the terms of the GNU General Public License version 2 as | 7 | * it under the terms of the GNU General Public License version 2 as |
8 | * published by the Free Software Foundation. | 8 | * published by the Free Software Foundation. |
9 | * | 9 | * |
10 | * DMA uncached mapping support. | 10 | * DMA uncached mapping support. |
11 | */ | 11 | */ |
12 | #include <linux/module.h> | 12 | #include <linux/module.h> |
13 | #include <linux/mm.h> | 13 | #include <linux/mm.h> |
14 | #include <linux/slab.h> | 14 | #include <linux/slab.h> |
15 | #include <linux/errno.h> | 15 | #include <linux/errno.h> |
16 | #include <linux/list.h> | 16 | #include <linux/list.h> |
17 | #include <linux/init.h> | 17 | #include <linux/init.h> |
18 | #include <linux/device.h> | 18 | #include <linux/device.h> |
19 | #include <linux/dma-mapping.h> | 19 | #include <linux/dma-mapping.h> |
20 | 20 | ||
21 | #include <asm/memory.h> | 21 | #include <asm/memory.h> |
22 | #include <asm/cacheflush.h> | 22 | #include <asm/cacheflush.h> |
23 | #include <asm/tlbflush.h> | 23 | #include <asm/tlbflush.h> |
24 | #include <asm/sizes.h> | 24 | #include <asm/sizes.h> |
25 | 25 | ||
26 | /* Sanity check size */ | 26 | /* Sanity check size */ |
27 | #if (CONSISTENT_DMA_SIZE % SZ_2M) | 27 | #if (CONSISTENT_DMA_SIZE % SZ_2M) |
28 | #error "CONSISTENT_DMA_SIZE must be multiple of 2MiB" | 28 | #error "CONSISTENT_DMA_SIZE must be multiple of 2MiB" |
29 | #endif | 29 | #endif |
30 | 30 | ||
31 | #define CONSISTENT_END (0xffe00000) | 31 | #define CONSISTENT_END (0xffe00000) |
32 | #define CONSISTENT_BASE (CONSISTENT_END - CONSISTENT_DMA_SIZE) | 32 | #define CONSISTENT_BASE (CONSISTENT_END - CONSISTENT_DMA_SIZE) |
33 | 33 | ||
34 | #define CONSISTENT_OFFSET(x) (((unsigned long)(x) - CONSISTENT_BASE) >> PAGE_SHIFT) | 34 | #define CONSISTENT_OFFSET(x) (((unsigned long)(x) - CONSISTENT_BASE) >> PAGE_SHIFT) |
35 | #define CONSISTENT_PTE_INDEX(x) (((unsigned long)(x) - CONSISTENT_BASE) >> PGDIR_SHIFT) | 35 | #define CONSISTENT_PTE_INDEX(x) (((unsigned long)(x) - CONSISTENT_BASE) >> PGDIR_SHIFT) |
36 | #define NUM_CONSISTENT_PTES (CONSISTENT_DMA_SIZE >> PGDIR_SHIFT) | 36 | #define NUM_CONSISTENT_PTES (CONSISTENT_DMA_SIZE >> PGDIR_SHIFT) |
37 | 37 | ||
38 | 38 | ||
39 | /* | 39 | /* |
40 | * These are the page tables (2MB each) covering uncached, DMA consistent allocations | 40 | * These are the page tables (2MB each) covering uncached, DMA consistent allocations |
41 | */ | 41 | */ |
42 | static pte_t *consistent_pte[NUM_CONSISTENT_PTES]; | 42 | static pte_t *consistent_pte[NUM_CONSISTENT_PTES]; |
43 | static DEFINE_SPINLOCK(consistent_lock); | 43 | static DEFINE_SPINLOCK(consistent_lock); |
44 | 44 | ||
45 | /* | 45 | /* |
46 | * VM region handling support. | 46 | * VM region handling support. |
47 | * | 47 | * |
48 | * This should become something generic, handling VM region allocations for | 48 | * This should become something generic, handling VM region allocations for |
49 | * vmalloc and similar (ioremap, module space, etc). | 49 | * vmalloc and similar (ioremap, module space, etc). |
50 | * | 50 | * |
51 | * I envisage vmalloc()'s supporting vm_struct becoming: | 51 | * I envisage vmalloc()'s supporting vm_struct becoming: |
52 | * | 52 | * |
53 | * struct vm_struct { | 53 | * struct vm_struct { |
54 | * struct vm_region region; | 54 | * struct vm_region region; |
55 | * unsigned long flags; | 55 | * unsigned long flags; |
56 | * struct page **pages; | 56 | * struct page **pages; |
57 | * unsigned int nr_pages; | 57 | * unsigned int nr_pages; |
58 | * unsigned long phys_addr; | 58 | * unsigned long phys_addr; |
59 | * }; | 59 | * }; |
60 | * | 60 | * |
61 | * get_vm_area() would then call vm_region_alloc with an appropriate | 61 | * get_vm_area() would then call vm_region_alloc with an appropriate |
62 | * struct vm_region head (eg): | 62 | * struct vm_region head (eg): |
63 | * | 63 | * |
64 | * struct vm_region vmalloc_head = { | 64 | * struct vm_region vmalloc_head = { |
65 | * .vm_list = LIST_HEAD_INIT(vmalloc_head.vm_list), | 65 | * .vm_list = LIST_HEAD_INIT(vmalloc_head.vm_list), |
66 | * .vm_start = VMALLOC_START, | 66 | * .vm_start = VMALLOC_START, |
67 | * .vm_end = VMALLOC_END, | 67 | * .vm_end = VMALLOC_END, |
68 | * }; | 68 | * }; |
69 | * | 69 | * |
70 | * However, vmalloc_head.vm_start is variable (typically, it is dependent on | 70 | * However, vmalloc_head.vm_start is variable (typically, it is dependent on |
71 | * the amount of RAM found at boot time.) I would imagine that get_vm_area() | 71 | * the amount of RAM found at boot time.) I would imagine that get_vm_area() |
72 | * would have to initialise this each time prior to calling vm_region_alloc(). | 72 | * would have to initialise this each time prior to calling vm_region_alloc(). |
73 | */ | 73 | */ |
74 | struct vm_region { | 74 | struct arm_vm_region { |
75 | struct list_head vm_list; | 75 | struct list_head vm_list; |
76 | unsigned long vm_start; | 76 | unsigned long vm_start; |
77 | unsigned long vm_end; | 77 | unsigned long vm_end; |
78 | struct page *vm_pages; | 78 | struct page *vm_pages; |
79 | int vm_active; | 79 | int vm_active; |
80 | }; | 80 | }; |
81 | 81 | ||
82 | static struct vm_region consistent_head = { | 82 | static struct arm_vm_region consistent_head = { |
83 | .vm_list = LIST_HEAD_INIT(consistent_head.vm_list), | 83 | .vm_list = LIST_HEAD_INIT(consistent_head.vm_list), |
84 | .vm_start = CONSISTENT_BASE, | 84 | .vm_start = CONSISTENT_BASE, |
85 | .vm_end = CONSISTENT_END, | 85 | .vm_end = CONSISTENT_END, |
86 | }; | 86 | }; |
87 | 87 | ||
88 | static struct vm_region * | 88 | static struct arm_vm_region * |
89 | vm_region_alloc(struct vm_region *head, size_t size, gfp_t gfp) | 89 | arm_vm_region_alloc(struct arm_vm_region *head, size_t size, gfp_t gfp) |
90 | { | 90 | { |
91 | unsigned long addr = head->vm_start, end = head->vm_end - size; | 91 | unsigned long addr = head->vm_start, end = head->vm_end - size; |
92 | unsigned long flags; | 92 | unsigned long flags; |
93 | struct vm_region *c, *new; | 93 | struct arm_vm_region *c, *new; |
94 | 94 | ||
95 | new = kmalloc(sizeof(struct vm_region), gfp); | 95 | new = kmalloc(sizeof(struct arm_vm_region), gfp); |
96 | if (!new) | 96 | if (!new) |
97 | goto out; | 97 | goto out; |
98 | 98 | ||
99 | spin_lock_irqsave(&consistent_lock, flags); | 99 | spin_lock_irqsave(&consistent_lock, flags); |
100 | 100 | ||
101 | list_for_each_entry(c, &head->vm_list, vm_list) { | 101 | list_for_each_entry(c, &head->vm_list, vm_list) { |
102 | if ((addr + size) < addr) | 102 | if ((addr + size) < addr) |
103 | goto nospc; | 103 | goto nospc; |
104 | if ((addr + size) <= c->vm_start) | 104 | if ((addr + size) <= c->vm_start) |
105 | goto found; | 105 | goto found; |
106 | addr = c->vm_end; | 106 | addr = c->vm_end; |
107 | if (addr > end) | 107 | if (addr > end) |
108 | goto nospc; | 108 | goto nospc; |
109 | } | 109 | } |
110 | 110 | ||
111 | found: | 111 | found: |
112 | /* | 112 | /* |
113 | * Insert this entry _before_ the one we found. | 113 | * Insert this entry _before_ the one we found. |
114 | */ | 114 | */ |
115 | list_add_tail(&new->vm_list, &c->vm_list); | 115 | list_add_tail(&new->vm_list, &c->vm_list); |
116 | new->vm_start = addr; | 116 | new->vm_start = addr; |
117 | new->vm_end = addr + size; | 117 | new->vm_end = addr + size; |
118 | new->vm_active = 1; | 118 | new->vm_active = 1; |
119 | 119 | ||
120 | spin_unlock_irqrestore(&consistent_lock, flags); | 120 | spin_unlock_irqrestore(&consistent_lock, flags); |
121 | return new; | 121 | return new; |
122 | 122 | ||
123 | nospc: | 123 | nospc: |
124 | spin_unlock_irqrestore(&consistent_lock, flags); | 124 | spin_unlock_irqrestore(&consistent_lock, flags); |
125 | kfree(new); | 125 | kfree(new); |
126 | out: | 126 | out: |
127 | return NULL; | 127 | return NULL; |
128 | } | 128 | } |
129 | 129 | ||
130 | static struct vm_region *vm_region_find(struct vm_region *head, unsigned long addr) | 130 | static struct arm_vm_region *arm_vm_region_find(struct arm_vm_region *head, unsigned long addr) |
131 | { | 131 | { |
132 | struct vm_region *c; | 132 | struct arm_vm_region *c; |
133 | 133 | ||
134 | list_for_each_entry(c, &head->vm_list, vm_list) { | 134 | list_for_each_entry(c, &head->vm_list, vm_list) { |
135 | if (c->vm_active && c->vm_start == addr) | 135 | if (c->vm_active && c->vm_start == addr) |
136 | goto out; | 136 | goto out; |
137 | } | 137 | } |
138 | c = NULL; | 138 | c = NULL; |
139 | out: | 139 | out: |
140 | return c; | 140 | return c; |
141 | } | 141 | } |
142 | 142 | ||
143 | #ifdef CONFIG_HUGETLB_PAGE | 143 | #ifdef CONFIG_HUGETLB_PAGE |
144 | #error ARM Coherent DMA allocator does not (yet) support huge TLB | 144 | #error ARM Coherent DMA allocator does not (yet) support huge TLB |
145 | #endif | 145 | #endif |
146 | 146 | ||
147 | static void * | 147 | static void * |
148 | __dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp, | 148 | __dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp, |
149 | pgprot_t prot) | 149 | pgprot_t prot) |
150 | { | 150 | { |
151 | struct page *page; | 151 | struct page *page; |
152 | struct vm_region *c; | 152 | struct arm_vm_region *c; |
153 | unsigned long order; | 153 | unsigned long order; |
154 | u64 mask = ISA_DMA_THRESHOLD, limit; | 154 | u64 mask = ISA_DMA_THRESHOLD, limit; |
155 | 155 | ||
156 | if (!consistent_pte[0]) { | 156 | if (!consistent_pte[0]) { |
157 | printk(KERN_ERR "%s: not initialised\n", __func__); | 157 | printk(KERN_ERR "%s: not initialised\n", __func__); |
158 | dump_stack(); | 158 | dump_stack(); |
159 | return NULL; | 159 | return NULL; |
160 | } | 160 | } |
161 | 161 | ||
162 | if (dev) { | 162 | if (dev) { |
163 | mask = dev->coherent_dma_mask; | 163 | mask = dev->coherent_dma_mask; |
164 | 164 | ||
165 | /* | 165 | /* |
166 | * Sanity check the DMA mask - it must be non-zero, and | 166 | * Sanity check the DMA mask - it must be non-zero, and |
167 | * must be able to be satisfied by a DMA allocation. | 167 | * must be able to be satisfied by a DMA allocation. |
168 | */ | 168 | */ |
169 | if (mask == 0) { | 169 | if (mask == 0) { |
170 | dev_warn(dev, "coherent DMA mask is unset\n"); | 170 | dev_warn(dev, "coherent DMA mask is unset\n"); |
171 | goto no_page; | 171 | goto no_page; |
172 | } | 172 | } |
173 | 173 | ||
174 | if ((~mask) & ISA_DMA_THRESHOLD) { | 174 | if ((~mask) & ISA_DMA_THRESHOLD) { |
175 | dev_warn(dev, "coherent DMA mask %#llx is smaller " | 175 | dev_warn(dev, "coherent DMA mask %#llx is smaller " |
176 | "than system GFP_DMA mask %#llx\n", | 176 | "than system GFP_DMA mask %#llx\n", |
177 | mask, (unsigned long long)ISA_DMA_THRESHOLD); | 177 | mask, (unsigned long long)ISA_DMA_THRESHOLD); |
178 | goto no_page; | 178 | goto no_page; |
179 | } | 179 | } |
180 | } | 180 | } |
181 | 181 | ||
182 | /* | 182 | /* |
183 | * Sanity check the allocation size. | 183 | * Sanity check the allocation size. |
184 | */ | 184 | */ |
185 | size = PAGE_ALIGN(size); | 185 | size = PAGE_ALIGN(size); |
186 | limit = (mask + 1) & ~mask; | 186 | limit = (mask + 1) & ~mask; |
187 | if ((limit && size >= limit) || | 187 | if ((limit && size >= limit) || |
188 | size >= (CONSISTENT_END - CONSISTENT_BASE)) { | 188 | size >= (CONSISTENT_END - CONSISTENT_BASE)) { |
189 | printk(KERN_WARNING "coherent allocation too big " | 189 | printk(KERN_WARNING "coherent allocation too big " |
190 | "(requested %#x mask %#llx)\n", size, mask); | 190 | "(requested %#x mask %#llx)\n", size, mask); |
191 | goto no_page; | 191 | goto no_page; |
192 | } | 192 | } |
193 | 193 | ||
194 | order = get_order(size); | 194 | order = get_order(size); |
195 | 195 | ||
196 | if (mask != 0xffffffff) | 196 | if (mask != 0xffffffff) |
197 | gfp |= GFP_DMA; | 197 | gfp |= GFP_DMA; |
198 | 198 | ||
199 | page = alloc_pages(gfp, order); | 199 | page = alloc_pages(gfp, order); |
200 | if (!page) | 200 | if (!page) |
201 | goto no_page; | 201 | goto no_page; |
202 | 202 | ||
203 | /* | 203 | /* |
204 | * Invalidate any data that might be lurking in the | 204 | * Invalidate any data that might be lurking in the |
205 | * kernel direct-mapped region for device DMA. | 205 | * kernel direct-mapped region for device DMA. |
206 | */ | 206 | */ |
207 | { | 207 | { |
208 | void *ptr = page_address(page); | 208 | void *ptr = page_address(page); |
209 | memset(ptr, 0, size); | 209 | memset(ptr, 0, size); |
210 | dmac_flush_range(ptr, ptr + size); | 210 | dmac_flush_range(ptr, ptr + size); |
211 | outer_flush_range(__pa(ptr), __pa(ptr) + size); | 211 | outer_flush_range(__pa(ptr), __pa(ptr) + size); |
212 | } | 212 | } |
213 | 213 | ||
214 | /* | 214 | /* |
215 | * Allocate a virtual address in the consistent mapping region. | 215 | * Allocate a virtual address in the consistent mapping region. |
216 | */ | 216 | */ |
217 | c = vm_region_alloc(&consistent_head, size, | 217 | c = arm_vm_region_alloc(&consistent_head, size, |
218 | gfp & ~(__GFP_DMA | __GFP_HIGHMEM)); | 218 | gfp & ~(__GFP_DMA | __GFP_HIGHMEM)); |
219 | if (c) { | 219 | if (c) { |
220 | pte_t *pte; | 220 | pte_t *pte; |
221 | struct page *end = page + (1 << order); | 221 | struct page *end = page + (1 << order); |
222 | int idx = CONSISTENT_PTE_INDEX(c->vm_start); | 222 | int idx = CONSISTENT_PTE_INDEX(c->vm_start); |
223 | u32 off = CONSISTENT_OFFSET(c->vm_start) & (PTRS_PER_PTE-1); | 223 | u32 off = CONSISTENT_OFFSET(c->vm_start) & (PTRS_PER_PTE-1); |
224 | 224 | ||
225 | pte = consistent_pte[idx] + off; | 225 | pte = consistent_pte[idx] + off; |
226 | c->vm_pages = page; | 226 | c->vm_pages = page; |
227 | 227 | ||
228 | split_page(page, order); | 228 | split_page(page, order); |
229 | 229 | ||
230 | /* | 230 | /* |
231 | * Set the "dma handle" | 231 | * Set the "dma handle" |
232 | */ | 232 | */ |
233 | *handle = page_to_dma(dev, page); | 233 | *handle = page_to_dma(dev, page); |
234 | 234 | ||
235 | do { | 235 | do { |
236 | BUG_ON(!pte_none(*pte)); | 236 | BUG_ON(!pte_none(*pte)); |
237 | 237 | ||
238 | /* | 238 | /* |
239 | * x86 does not mark the pages reserved... | 239 | * x86 does not mark the pages reserved... |
240 | */ | 240 | */ |
241 | SetPageReserved(page); | 241 | SetPageReserved(page); |
242 | set_pte_ext(pte, mk_pte(page, prot), 0); | 242 | set_pte_ext(pte, mk_pte(page, prot), 0); |
243 | page++; | 243 | page++; |
244 | pte++; | 244 | pte++; |
245 | off++; | 245 | off++; |
246 | if (off >= PTRS_PER_PTE) { | 246 | if (off >= PTRS_PER_PTE) { |
247 | off = 0; | 247 | off = 0; |
248 | pte = consistent_pte[++idx]; | 248 | pte = consistent_pte[++idx]; |
249 | } | 249 | } |
250 | } while (size -= PAGE_SIZE); | 250 | } while (size -= PAGE_SIZE); |
251 | 251 | ||
252 | /* | 252 | /* |
253 | * Free the otherwise unused pages. | 253 | * Free the otherwise unused pages. |
254 | */ | 254 | */ |
255 | while (page < end) { | 255 | while (page < end) { |
256 | __free_page(page); | 256 | __free_page(page); |
257 | page++; | 257 | page++; |
258 | } | 258 | } |
259 | 259 | ||
260 | return (void *)c->vm_start; | 260 | return (void *)c->vm_start; |
261 | } | 261 | } |
262 | 262 | ||
263 | if (page) | 263 | if (page) |
264 | __free_pages(page, order); | 264 | __free_pages(page, order); |
265 | no_page: | 265 | no_page: |
266 | *handle = ~0; | 266 | *handle = ~0; |
267 | return NULL; | 267 | return NULL; |
268 | } | 268 | } |
269 | 269 | ||
270 | /* | 270 | /* |
271 | * Allocate DMA-coherent memory space and return both the kernel remapped | 271 | * Allocate DMA-coherent memory space and return both the kernel remapped |
272 | * virtual and bus address for that space. | 272 | * virtual and bus address for that space. |
273 | */ | 273 | */ |
274 | void * | 274 | void * |
275 | dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp) | 275 | dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp) |
276 | { | 276 | { |
277 | void *memory; | 277 | void *memory; |
278 | 278 | ||
279 | if (dma_alloc_from_coherent(dev, size, handle, &memory)) | 279 | if (dma_alloc_from_coherent(dev, size, handle, &memory)) |
280 | return memory; | 280 | return memory; |
281 | 281 | ||
282 | if (arch_is_coherent()) { | 282 | if (arch_is_coherent()) { |
283 | void *virt; | 283 | void *virt; |
284 | 284 | ||
285 | virt = kmalloc(size, gfp); | 285 | virt = kmalloc(size, gfp); |
286 | if (!virt) | 286 | if (!virt) |
287 | return NULL; | 287 | return NULL; |
288 | *handle = virt_to_dma(dev, virt); | 288 | *handle = virt_to_dma(dev, virt); |
289 | 289 | ||
290 | return virt; | 290 | return virt; |
291 | } | 291 | } |
292 | 292 | ||
293 | return __dma_alloc(dev, size, handle, gfp, | 293 | return __dma_alloc(dev, size, handle, gfp, |
294 | pgprot_noncached(pgprot_kernel)); | 294 | pgprot_noncached(pgprot_kernel)); |
295 | } | 295 | } |
296 | EXPORT_SYMBOL(dma_alloc_coherent); | 296 | EXPORT_SYMBOL(dma_alloc_coherent); |
297 | 297 | ||
298 | /* | 298 | /* |
299 | * Allocate a writecombining region, in much the same way as | 299 | * Allocate a writecombining region, in much the same way as |
300 | * dma_alloc_coherent above. | 300 | * dma_alloc_coherent above. |
301 | */ | 301 | */ |
302 | void * | 302 | void * |
303 | dma_alloc_writecombine(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp) | 303 | dma_alloc_writecombine(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp) |
304 | { | 304 | { |
305 | return __dma_alloc(dev, size, handle, gfp, | 305 | return __dma_alloc(dev, size, handle, gfp, |
306 | pgprot_writecombine(pgprot_kernel)); | 306 | pgprot_writecombine(pgprot_kernel)); |
307 | } | 307 | } |
308 | EXPORT_SYMBOL(dma_alloc_writecombine); | 308 | EXPORT_SYMBOL(dma_alloc_writecombine); |
309 | 309 | ||
310 | static int dma_mmap(struct device *dev, struct vm_area_struct *vma, | 310 | static int dma_mmap(struct device *dev, struct vm_area_struct *vma, |
311 | void *cpu_addr, dma_addr_t dma_addr, size_t size) | 311 | void *cpu_addr, dma_addr_t dma_addr, size_t size) |
312 | { | 312 | { |
313 | unsigned long flags, user_size, kern_size; | 313 | unsigned long flags, user_size, kern_size; |
314 | struct vm_region *c; | 314 | struct arm_vm_region *c; |
315 | int ret = -ENXIO; | 315 | int ret = -ENXIO; |
316 | 316 | ||
317 | user_size = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; | 317 | user_size = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; |
318 | 318 | ||
319 | spin_lock_irqsave(&consistent_lock, flags); | 319 | spin_lock_irqsave(&consistent_lock, flags); |
320 | c = vm_region_find(&consistent_head, (unsigned long)cpu_addr); | 320 | c = arm_vm_region_find(&consistent_head, (unsigned long)cpu_addr); |
321 | spin_unlock_irqrestore(&consistent_lock, flags); | 321 | spin_unlock_irqrestore(&consistent_lock, flags); |
322 | 322 | ||
323 | if (c) { | 323 | if (c) { |
324 | unsigned long off = vma->vm_pgoff; | 324 | unsigned long off = vma->vm_pgoff; |
325 | 325 | ||
326 | kern_size = (c->vm_end - c->vm_start) >> PAGE_SHIFT; | 326 | kern_size = (c->vm_end - c->vm_start) >> PAGE_SHIFT; |
327 | 327 | ||
328 | if (off < kern_size && | 328 | if (off < kern_size && |
329 | user_size <= (kern_size - off)) { | 329 | user_size <= (kern_size - off)) { |
330 | ret = remap_pfn_range(vma, vma->vm_start, | 330 | ret = remap_pfn_range(vma, vma->vm_start, |
331 | page_to_pfn(c->vm_pages) + off, | 331 | page_to_pfn(c->vm_pages) + off, |
332 | user_size << PAGE_SHIFT, | 332 | user_size << PAGE_SHIFT, |
333 | vma->vm_page_prot); | 333 | vma->vm_page_prot); |
334 | } | 334 | } |
335 | } | 335 | } |
336 | 336 | ||
337 | return ret; | 337 | return ret; |
338 | } | 338 | } |
339 | 339 | ||
340 | int dma_mmap_coherent(struct device *dev, struct vm_area_struct *vma, | 340 | int dma_mmap_coherent(struct device *dev, struct vm_area_struct *vma, |
341 | void *cpu_addr, dma_addr_t dma_addr, size_t size) | 341 | void *cpu_addr, dma_addr_t dma_addr, size_t size) |
342 | { | 342 | { |
343 | vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); | 343 | vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); |
344 | return dma_mmap(dev, vma, cpu_addr, dma_addr, size); | 344 | return dma_mmap(dev, vma, cpu_addr, dma_addr, size); |
345 | } | 345 | } |
346 | EXPORT_SYMBOL(dma_mmap_coherent); | 346 | EXPORT_SYMBOL(dma_mmap_coherent); |
347 | 347 | ||
348 | int dma_mmap_writecombine(struct device *dev, struct vm_area_struct *vma, | 348 | int dma_mmap_writecombine(struct device *dev, struct vm_area_struct *vma, |
349 | void *cpu_addr, dma_addr_t dma_addr, size_t size) | 349 | void *cpu_addr, dma_addr_t dma_addr, size_t size) |
350 | { | 350 | { |
351 | vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); | 351 | vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); |
352 | return dma_mmap(dev, vma, cpu_addr, dma_addr, size); | 352 | return dma_mmap(dev, vma, cpu_addr, dma_addr, size); |
353 | } | 353 | } |
354 | EXPORT_SYMBOL(dma_mmap_writecombine); | 354 | EXPORT_SYMBOL(dma_mmap_writecombine); |
355 | 355 | ||
356 | /* | 356 | /* |
357 | * free a page as defined by the above mapping. | 357 | * free a page as defined by the above mapping. |
358 | * Must not be called with IRQs disabled. | 358 | * Must not be called with IRQs disabled. |
359 | */ | 359 | */ |
360 | void dma_free_coherent(struct device *dev, size_t size, void *cpu_addr, dma_addr_t handle) | 360 | void dma_free_coherent(struct device *dev, size_t size, void *cpu_addr, dma_addr_t handle) |
361 | { | 361 | { |
362 | struct vm_region *c; | 362 | struct arm_vm_region *c; |
363 | unsigned long flags, addr; | 363 | unsigned long flags, addr; |
364 | pte_t *ptep; | 364 | pte_t *ptep; |
365 | int idx; | 365 | int idx; |
366 | u32 off; | 366 | u32 off; |
367 | 367 | ||
368 | WARN_ON(irqs_disabled()); | 368 | WARN_ON(irqs_disabled()); |
369 | 369 | ||
370 | if (dma_release_from_coherent(dev, get_order(size), cpu_addr)) | 370 | if (dma_release_from_coherent(dev, get_order(size), cpu_addr)) |
371 | return; | 371 | return; |
372 | 372 | ||
373 | if (arch_is_coherent()) { | 373 | if (arch_is_coherent()) { |
374 | kfree(cpu_addr); | 374 | kfree(cpu_addr); |
375 | return; | 375 | return; |
376 | } | 376 | } |
377 | 377 | ||
378 | size = PAGE_ALIGN(size); | 378 | size = PAGE_ALIGN(size); |
379 | 379 | ||
380 | spin_lock_irqsave(&consistent_lock, flags); | 380 | spin_lock_irqsave(&consistent_lock, flags); |
381 | c = vm_region_find(&consistent_head, (unsigned long)cpu_addr); | 381 | c = arm_vm_region_find(&consistent_head, (unsigned long)cpu_addr); |
382 | if (!c) | 382 | if (!c) |
383 | goto no_area; | 383 | goto no_area; |
384 | 384 | ||
385 | c->vm_active = 0; | 385 | c->vm_active = 0; |
386 | spin_unlock_irqrestore(&consistent_lock, flags); | 386 | spin_unlock_irqrestore(&consistent_lock, flags); |
387 | 387 | ||
388 | if ((c->vm_end - c->vm_start) != size) { | 388 | if ((c->vm_end - c->vm_start) != size) { |
389 | printk(KERN_ERR "%s: freeing wrong coherent size (%ld != %d)\n", | 389 | printk(KERN_ERR "%s: freeing wrong coherent size (%ld != %d)\n", |
390 | __func__, c->vm_end - c->vm_start, size); | 390 | __func__, c->vm_end - c->vm_start, size); |
391 | dump_stack(); | 391 | dump_stack(); |
392 | size = c->vm_end - c->vm_start; | 392 | size = c->vm_end - c->vm_start; |
393 | } | 393 | } |
394 | 394 | ||
395 | idx = CONSISTENT_PTE_INDEX(c->vm_start); | 395 | idx = CONSISTENT_PTE_INDEX(c->vm_start); |
396 | off = CONSISTENT_OFFSET(c->vm_start) & (PTRS_PER_PTE-1); | 396 | off = CONSISTENT_OFFSET(c->vm_start) & (PTRS_PER_PTE-1); |
397 | ptep = consistent_pte[idx] + off; | 397 | ptep = consistent_pte[idx] + off; |
398 | addr = c->vm_start; | 398 | addr = c->vm_start; |
399 | do { | 399 | do { |
400 | pte_t pte = ptep_get_and_clear(&init_mm, addr, ptep); | 400 | pte_t pte = ptep_get_and_clear(&init_mm, addr, ptep); |
401 | unsigned long pfn; | 401 | unsigned long pfn; |
402 | 402 | ||
403 | ptep++; | 403 | ptep++; |
404 | addr += PAGE_SIZE; | 404 | addr += PAGE_SIZE; |
405 | off++; | 405 | off++; |
406 | if (off >= PTRS_PER_PTE) { | 406 | if (off >= PTRS_PER_PTE) { |
407 | off = 0; | 407 | off = 0; |
408 | ptep = consistent_pte[++idx]; | 408 | ptep = consistent_pte[++idx]; |
409 | } | 409 | } |
410 | 410 | ||
411 | if (!pte_none(pte) && pte_present(pte)) { | 411 | if (!pte_none(pte) && pte_present(pte)) { |
412 | pfn = pte_pfn(pte); | 412 | pfn = pte_pfn(pte); |
413 | 413 | ||
414 | if (pfn_valid(pfn)) { | 414 | if (pfn_valid(pfn)) { |
415 | struct page *page = pfn_to_page(pfn); | 415 | struct page *page = pfn_to_page(pfn); |
416 | 416 | ||
417 | /* | 417 | /* |
418 | * x86 does not mark the pages reserved... | 418 | * x86 does not mark the pages reserved... |
419 | */ | 419 | */ |
420 | ClearPageReserved(page); | 420 | ClearPageReserved(page); |
421 | 421 | ||
422 | __free_page(page); | 422 | __free_page(page); |
423 | continue; | 423 | continue; |
424 | } | 424 | } |
425 | } | 425 | } |
426 | 426 | ||
427 | printk(KERN_CRIT "%s: bad page in kernel page table\n", | 427 | printk(KERN_CRIT "%s: bad page in kernel page table\n", |
428 | __func__); | 428 | __func__); |
429 | } while (size -= PAGE_SIZE); | 429 | } while (size -= PAGE_SIZE); |
430 | 430 | ||
431 | flush_tlb_kernel_range(c->vm_start, c->vm_end); | 431 | flush_tlb_kernel_range(c->vm_start, c->vm_end); |
432 | 432 | ||
433 | spin_lock_irqsave(&consistent_lock, flags); | 433 | spin_lock_irqsave(&consistent_lock, flags); |
434 | list_del(&c->vm_list); | 434 | list_del(&c->vm_list); |
435 | spin_unlock_irqrestore(&consistent_lock, flags); | 435 | spin_unlock_irqrestore(&consistent_lock, flags); |
436 | 436 | ||
437 | kfree(c); | 437 | kfree(c); |
438 | return; | 438 | return; |
439 | 439 | ||
440 | no_area: | 440 | no_area: |
441 | spin_unlock_irqrestore(&consistent_lock, flags); | 441 | spin_unlock_irqrestore(&consistent_lock, flags); |
442 | printk(KERN_ERR "%s: trying to free invalid coherent area: %p\n", | 442 | printk(KERN_ERR "%s: trying to free invalid coherent area: %p\n", |
443 | __func__, cpu_addr); | 443 | __func__, cpu_addr); |
444 | dump_stack(); | 444 | dump_stack(); |
445 | } | 445 | } |
446 | EXPORT_SYMBOL(dma_free_coherent); | 446 | EXPORT_SYMBOL(dma_free_coherent); |
447 | 447 | ||
448 | /* | 448 | /* |
449 | * Initialise the consistent memory allocation. | 449 | * Initialise the consistent memory allocation. |
450 | */ | 450 | */ |
451 | static int __init consistent_init(void) | 451 | static int __init consistent_init(void) |
452 | { | 452 | { |
453 | pgd_t *pgd; | 453 | pgd_t *pgd; |
454 | pmd_t *pmd; | 454 | pmd_t *pmd; |
455 | pte_t *pte; | 455 | pte_t *pte; |
456 | int ret = 0, i = 0; | 456 | int ret = 0, i = 0; |
457 | u32 base = CONSISTENT_BASE; | 457 | u32 base = CONSISTENT_BASE; |
458 | 458 | ||
459 | do { | 459 | do { |
460 | pgd = pgd_offset(&init_mm, base); | 460 | pgd = pgd_offset(&init_mm, base); |
461 | pmd = pmd_alloc(&init_mm, pgd, base); | 461 | pmd = pmd_alloc(&init_mm, pgd, base); |
462 | if (!pmd) { | 462 | if (!pmd) { |
463 | printk(KERN_ERR "%s: no pmd tables\n", __func__); | 463 | printk(KERN_ERR "%s: no pmd tables\n", __func__); |
464 | ret = -ENOMEM; | 464 | ret = -ENOMEM; |
465 | break; | 465 | break; |
466 | } | 466 | } |
467 | WARN_ON(!pmd_none(*pmd)); | 467 | WARN_ON(!pmd_none(*pmd)); |
468 | 468 | ||
469 | pte = pte_alloc_kernel(pmd, base); | 469 | pte = pte_alloc_kernel(pmd, base); |
470 | if (!pte) { | 470 | if (!pte) { |
471 | printk(KERN_ERR "%s: no pte tables\n", __func__); | 471 | printk(KERN_ERR "%s: no pte tables\n", __func__); |
472 | ret = -ENOMEM; | 472 | ret = -ENOMEM; |
473 | break; | 473 | break; |
474 | } | 474 | } |
475 | 475 | ||
476 | consistent_pte[i++] = pte; | 476 | consistent_pte[i++] = pte; |
477 | base += (1 << PGDIR_SHIFT); | 477 | base += (1 << PGDIR_SHIFT); |
478 | } while (base < CONSISTENT_END); | 478 | } while (base < CONSISTENT_END); |
479 | 479 | ||
480 | return ret; | 480 | return ret; |
481 | } | 481 | } |
482 | 482 | ||
483 | core_initcall(consistent_init); | 483 | core_initcall(consistent_init); |
484 | 484 | ||
485 | /* | 485 | /* |
486 | * Make an area consistent for devices. | 486 | * Make an area consistent for devices. |
487 | * Note: Drivers should NOT use this function directly, as it will break | 487 | * Note: Drivers should NOT use this function directly, as it will break |
488 | * platforms with CONFIG_DMABOUNCE. | 488 | * platforms with CONFIG_DMABOUNCE. |
489 | * Use the driver DMA support - see dma-mapping.h (dma_sync_*) | 489 | * Use the driver DMA support - see dma-mapping.h (dma_sync_*) |
490 | */ | 490 | */ |
491 | void dma_cache_maint(const void *start, size_t size, int direction) | 491 | void dma_cache_maint(const void *start, size_t size, int direction) |
492 | { | 492 | { |
493 | const void *end = start + size; | 493 | const void *end = start + size; |
494 | 494 | ||
495 | BUG_ON(!virt_addr_valid(start) || !virt_addr_valid(end - 1)); | 495 | BUG_ON(!virt_addr_valid(start) || !virt_addr_valid(end - 1)); |
496 | 496 | ||
497 | switch (direction) { | 497 | switch (direction) { |
498 | case DMA_FROM_DEVICE: /* invalidate only */ | 498 | case DMA_FROM_DEVICE: /* invalidate only */ |
499 | dmac_inv_range(start, end); | 499 | dmac_inv_range(start, end); |
500 | outer_inv_range(__pa(start), __pa(end)); | 500 | outer_inv_range(__pa(start), __pa(end)); |
501 | break; | 501 | break; |
502 | case DMA_TO_DEVICE: /* writeback only */ | 502 | case DMA_TO_DEVICE: /* writeback only */ |
503 | dmac_clean_range(start, end); | 503 | dmac_clean_range(start, end); |
504 | outer_clean_range(__pa(start), __pa(end)); | 504 | outer_clean_range(__pa(start), __pa(end)); |
505 | break; | 505 | break; |
506 | case DMA_BIDIRECTIONAL: /* writeback and invalidate */ | 506 | case DMA_BIDIRECTIONAL: /* writeback and invalidate */ |
507 | dmac_flush_range(start, end); | 507 | dmac_flush_range(start, end); |
508 | outer_flush_range(__pa(start), __pa(end)); | 508 | outer_flush_range(__pa(start), __pa(end)); |
509 | break; | 509 | break; |
510 | default: | 510 | default: |
511 | BUG(); | 511 | BUG(); |
512 | } | 512 | } |
513 | } | 513 | } |
514 | EXPORT_SYMBOL(dma_cache_maint); | 514 | EXPORT_SYMBOL(dma_cache_maint); |
515 | 515 | ||
516 | /** | 516 | /** |
517 | * dma_map_sg - map a set of SG buffers for streaming mode DMA | 517 | * dma_map_sg - map a set of SG buffers for streaming mode DMA |
518 | * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices | 518 | * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices |
519 | * @sg: list of buffers | 519 | * @sg: list of buffers |
520 | * @nents: number of buffers to map | 520 | * @nents: number of buffers to map |
521 | * @dir: DMA transfer direction | 521 | * @dir: DMA transfer direction |
522 | * | 522 | * |
523 | * Map a set of buffers described by scatterlist in streaming mode for DMA. | 523 | * Map a set of buffers described by scatterlist in streaming mode for DMA. |
524 | * This is the scatter-gather version of the dma_map_single interface. | 524 | * This is the scatter-gather version of the dma_map_single interface. |
525 | * Here the scatter gather list elements are each tagged with the | 525 | * Here the scatter gather list elements are each tagged with the |
526 | * appropriate dma address and length. They are obtained via | 526 | * appropriate dma address and length. They are obtained via |
527 | * sg_dma_{address,length}. | 527 | * sg_dma_{address,length}. |
528 | * | 528 | * |
529 | * Device ownership issues as mentioned for dma_map_single are the same | 529 | * Device ownership issues as mentioned for dma_map_single are the same |
530 | * here. | 530 | * here. |
531 | */ | 531 | */ |
532 | int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, | 532 | int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, |
533 | enum dma_data_direction dir) | 533 | enum dma_data_direction dir) |
534 | { | 534 | { |
535 | struct scatterlist *s; | 535 | struct scatterlist *s; |
536 | int i, j; | 536 | int i, j; |
537 | 537 | ||
538 | for_each_sg(sg, s, nents, i) { | 538 | for_each_sg(sg, s, nents, i) { |
539 | s->dma_address = dma_map_page(dev, sg_page(s), s->offset, | 539 | s->dma_address = dma_map_page(dev, sg_page(s), s->offset, |
540 | s->length, dir); | 540 | s->length, dir); |
541 | if (dma_mapping_error(dev, s->dma_address)) | 541 | if (dma_mapping_error(dev, s->dma_address)) |
542 | goto bad_mapping; | 542 | goto bad_mapping; |
543 | } | 543 | } |
544 | return nents; | 544 | return nents; |
545 | 545 | ||
546 | bad_mapping: | 546 | bad_mapping: |
547 | for_each_sg(sg, s, i, j) | 547 | for_each_sg(sg, s, i, j) |
548 | dma_unmap_page(dev, sg_dma_address(s), sg_dma_len(s), dir); | 548 | dma_unmap_page(dev, sg_dma_address(s), sg_dma_len(s), dir); |
549 | return 0; | 549 | return 0; |
550 | } | 550 | } |
551 | EXPORT_SYMBOL(dma_map_sg); | 551 | EXPORT_SYMBOL(dma_map_sg); |
552 | 552 | ||
553 | /** | 553 | /** |
554 | * dma_unmap_sg - unmap a set of SG buffers mapped by dma_map_sg | 554 | * dma_unmap_sg - unmap a set of SG buffers mapped by dma_map_sg |
555 | * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices | 555 | * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices |
556 | * @sg: list of buffers | 556 | * @sg: list of buffers |
557 | * @nents: number of buffers to unmap (returned from dma_map_sg) | 557 | * @nents: number of buffers to unmap (returned from dma_map_sg) |
558 | * @dir: DMA transfer direction (same as was passed to dma_map_sg) | 558 | * @dir: DMA transfer direction (same as was passed to dma_map_sg) |
559 | * | 559 | * |
560 | * Unmap a set of streaming mode DMA translations. Again, CPU access | 560 | * Unmap a set of streaming mode DMA translations. Again, CPU access |
561 | * rules concerning calls here are the same as for dma_unmap_single(). | 561 | * rules concerning calls here are the same as for dma_unmap_single(). |
562 | */ | 562 | */ |
563 | void dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, | 563 | void dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, |
564 | enum dma_data_direction dir) | 564 | enum dma_data_direction dir) |
565 | { | 565 | { |
566 | struct scatterlist *s; | 566 | struct scatterlist *s; |
567 | int i; | 567 | int i; |
568 | 568 | ||
569 | for_each_sg(sg, s, nents, i) | 569 | for_each_sg(sg, s, nents, i) |
570 | dma_unmap_page(dev, sg_dma_address(s), sg_dma_len(s), dir); | 570 | dma_unmap_page(dev, sg_dma_address(s), sg_dma_len(s), dir); |
571 | } | 571 | } |
572 | EXPORT_SYMBOL(dma_unmap_sg); | 572 | EXPORT_SYMBOL(dma_unmap_sg); |
573 | 573 | ||
574 | /** | 574 | /** |
575 | * dma_sync_sg_for_cpu | 575 | * dma_sync_sg_for_cpu |
576 | * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices | 576 | * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices |
577 | * @sg: list of buffers | 577 | * @sg: list of buffers |
578 | * @nents: number of buffers to map (returned from dma_map_sg) | 578 | * @nents: number of buffers to map (returned from dma_map_sg) |
579 | * @dir: DMA transfer direction (same as was passed to dma_map_sg) | 579 | * @dir: DMA transfer direction (same as was passed to dma_map_sg) |
580 | */ | 580 | */ |
581 | void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, | 581 | void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, |
582 | int nents, enum dma_data_direction dir) | 582 | int nents, enum dma_data_direction dir) |
583 | { | 583 | { |
584 | struct scatterlist *s; | 584 | struct scatterlist *s; |
585 | int i; | 585 | int i; |
586 | 586 | ||
587 | for_each_sg(sg, s, nents, i) { | 587 | for_each_sg(sg, s, nents, i) { |
588 | dmabounce_sync_for_cpu(dev, sg_dma_address(s), 0, | 588 | dmabounce_sync_for_cpu(dev, sg_dma_address(s), 0, |
589 | sg_dma_len(s), dir); | 589 | sg_dma_len(s), dir); |
590 | } | 590 | } |
591 | } | 591 | } |
592 | EXPORT_SYMBOL(dma_sync_sg_for_cpu); | 592 | EXPORT_SYMBOL(dma_sync_sg_for_cpu); |
593 | 593 | ||
594 | /** | 594 | /** |
595 | * dma_sync_sg_for_device | 595 | * dma_sync_sg_for_device |
596 | * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices | 596 | * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices |
597 | * @sg: list of buffers | 597 | * @sg: list of buffers |
598 | * @nents: number of buffers to map (returned from dma_map_sg) | 598 | * @nents: number of buffers to map (returned from dma_map_sg) |
599 | * @dir: DMA transfer direction (same as was passed to dma_map_sg) | 599 | * @dir: DMA transfer direction (same as was passed to dma_map_sg) |
600 | */ | 600 | */ |
601 | void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, | 601 | void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, |
602 | int nents, enum dma_data_direction dir) | 602 | int nents, enum dma_data_direction dir) |
603 | { | 603 | { |
604 | struct scatterlist *s; | 604 | struct scatterlist *s; |
605 | int i; | 605 | int i; |
606 | 606 | ||
607 | for_each_sg(sg, s, nents, i) { | 607 | for_each_sg(sg, s, nents, i) { |
608 | if (!dmabounce_sync_for_device(dev, sg_dma_address(s), 0, | 608 | if (!dmabounce_sync_for_device(dev, sg_dma_address(s), 0, |
609 | sg_dma_len(s), dir)) | 609 | sg_dma_len(s), dir)) |
610 | continue; | 610 | continue; |
611 | 611 | ||
612 | if (!arch_is_coherent()) | 612 | if (!arch_is_coherent()) |
613 | dma_cache_maint(sg_virt(s), s->length, dir); | 613 | dma_cache_maint(sg_virt(s), s->length, dir); |
614 | } | 614 | } |
615 | } | 615 | } |
616 | EXPORT_SYMBOL(dma_sync_sg_for_device); | 616 | EXPORT_SYMBOL(dma_sync_sg_for_device); |
617 | 617 |
arch/blackfin/include/asm/mmu.h
1 | #ifndef __MMU_H | 1 | #ifndef __MMU_H |
2 | #define __MMU_H | 2 | #define __MMU_H |
3 | 3 | ||
4 | /* Copyright (C) 2002, David McCullough <davidm@snapgear.com> */ | 4 | /* Copyright (C) 2002, David McCullough <davidm@snapgear.com> */ |
5 | 5 | ||
6 | struct sram_list_struct { | 6 | struct sram_list_struct { |
7 | struct sram_list_struct *next; | 7 | struct sram_list_struct *next; |
8 | void *addr; | 8 | void *addr; |
9 | size_t length; | 9 | size_t length; |
10 | }; | 10 | }; |
11 | 11 | ||
12 | typedef struct { | 12 | typedef struct { |
13 | struct vm_list_struct *vmlist; | ||
14 | unsigned long end_brk; | 13 | unsigned long end_brk; |
15 | unsigned long stack_start; | 14 | unsigned long stack_start; |
16 | 15 | ||
17 | /* Points to the location in SDRAM where the L1 stack is normally | 16 | /* Points to the location in SDRAM where the L1 stack is normally |
18 | saved, or NULL if the stack is always in SDRAM. */ | 17 | saved, or NULL if the stack is always in SDRAM. */ |
19 | void *l1_stack_save; | 18 | void *l1_stack_save; |
20 | 19 | ||
21 | struct sram_list_struct *sram_list; | 20 | struct sram_list_struct *sram_list; |
22 | 21 | ||
23 | #ifdef CONFIG_BINFMT_ELF_FDPIC | 22 | #ifdef CONFIG_BINFMT_ELF_FDPIC |
24 | unsigned long exec_fdpic_loadmap; | 23 | unsigned long exec_fdpic_loadmap; |
25 | unsigned long interp_fdpic_loadmap; | 24 | unsigned long interp_fdpic_loadmap; |
26 | #endif | 25 | #endif |
27 | #ifdef CONFIG_MPU | 26 | #ifdef CONFIG_MPU |
28 | unsigned long *page_rwx_mask; | 27 | unsigned long *page_rwx_mask; |
29 | #endif | 28 | #endif |
30 | } mm_context_t; | 29 | } mm_context_t; |
31 | 30 | ||
32 | #endif | 31 | #endif |
33 | 32 |
arch/blackfin/kernel/ptrace.c
1 | /* | 1 | /* |
2 | * File: arch/blackfin/kernel/ptrace.c | 2 | * File: arch/blackfin/kernel/ptrace.c |
3 | * Based on: Taken from linux/kernel/ptrace.c | 3 | * Based on: Taken from linux/kernel/ptrace.c |
4 | * Author: linux/kernel/ptrace.c is by Ross Biro 1/23/92, edited by Linus Torvalds | 4 | * Author: linux/kernel/ptrace.c is by Ross Biro 1/23/92, edited by Linus Torvalds |
5 | * | 5 | * |
6 | * Created: 1/23/92 | 6 | * Created: 1/23/92 |
7 | * Description: | 7 | * Description: |
8 | * | 8 | * |
9 | * Modified: | 9 | * Modified: |
10 | * Copyright 2004-2006 Analog Devices Inc. | 10 | * Copyright 2004-2006 Analog Devices Inc. |
11 | * | 11 | * |
12 | * Bugs: Enter bugs at http://blackfin.uclinux.org/ | 12 | * Bugs: Enter bugs at http://blackfin.uclinux.org/ |
13 | * | 13 | * |
14 | * This program is free software; you can redistribute it and/or modify | 14 | * This program is free software; you can redistribute it and/or modify |
15 | * it under the terms of the GNU General Public License as published by | 15 | * it under the terms of the GNU General Public License as published by |
16 | * the Free Software Foundation; either version 2 of the License, or | 16 | * the Free Software Foundation; either version 2 of the License, or |
17 | * (at your option) any later version. | 17 | * (at your option) any later version. |
18 | * | 18 | * |
19 | * This program is distributed in the hope that it will be useful, | 19 | * This program is distributed in the hope that it will be useful, |
20 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | 20 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
21 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 21 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
22 | * GNU General Public License for more details. | 22 | * GNU General Public License for more details. |
23 | * | 23 | * |
24 | * You should have received a copy of the GNU General Public License | 24 | * You should have received a copy of the GNU General Public License |
25 | * along with this program; if not, see the file COPYING, or write | 25 | * along with this program; if not, see the file COPYING, or write |
26 | * to the Free Software Foundation, Inc., | 26 | * to the Free Software Foundation, Inc., |
27 | * 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | 27 | * 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA |
28 | */ | 28 | */ |
29 | 29 | ||
30 | #include <linux/kernel.h> | 30 | #include <linux/kernel.h> |
31 | #include <linux/sched.h> | 31 | #include <linux/sched.h> |
32 | #include <linux/mm.h> | 32 | #include <linux/mm.h> |
33 | #include <linux/smp.h> | 33 | #include <linux/smp.h> |
34 | #include <linux/smp_lock.h> | 34 | #include <linux/smp_lock.h> |
35 | #include <linux/errno.h> | 35 | #include <linux/errno.h> |
36 | #include <linux/ptrace.h> | 36 | #include <linux/ptrace.h> |
37 | #include <linux/user.h> | 37 | #include <linux/user.h> |
38 | #include <linux/signal.h> | 38 | #include <linux/signal.h> |
39 | #include <linux/uaccess.h> | 39 | #include <linux/uaccess.h> |
40 | 40 | ||
41 | #include <asm/page.h> | 41 | #include <asm/page.h> |
42 | #include <asm/pgtable.h> | 42 | #include <asm/pgtable.h> |
43 | #include <asm/system.h> | 43 | #include <asm/system.h> |
44 | #include <asm/processor.h> | 44 | #include <asm/processor.h> |
45 | #include <asm/asm-offsets.h> | 45 | #include <asm/asm-offsets.h> |
46 | #include <asm/dma.h> | 46 | #include <asm/dma.h> |
47 | #include <asm/fixed_code.h> | 47 | #include <asm/fixed_code.h> |
48 | #include <asm/mem_map.h> | 48 | #include <asm/mem_map.h> |
49 | 49 | ||
50 | #define TEXT_OFFSET 0 | 50 | #define TEXT_OFFSET 0 |
51 | /* | 51 | /* |
52 | * does not yet catch signals sent when the child dies. | 52 | * does not yet catch signals sent when the child dies. |
53 | * in exit.c or in signal.c. | 53 | * in exit.c or in signal.c. |
54 | */ | 54 | */ |
55 | 55 | ||
56 | /* determines which bits in the SYSCFG reg the user has access to. */ | 56 | /* determines which bits in the SYSCFG reg the user has access to. */ |
57 | /* 1 = access 0 = no access */ | 57 | /* 1 = access 0 = no access */ |
58 | #define SYSCFG_MASK 0x0007 /* SYSCFG reg */ | 58 | #define SYSCFG_MASK 0x0007 /* SYSCFG reg */ |
59 | /* sets the trace bits. */ | 59 | /* sets the trace bits. */ |
60 | #define TRACE_BITS 0x0001 | 60 | #define TRACE_BITS 0x0001 |
61 | 61 | ||
62 | /* Find the stack offset for a register, relative to thread.esp0. */ | 62 | /* Find the stack offset for a register, relative to thread.esp0. */ |
63 | #define PT_REG(reg) ((long)&((struct pt_regs *)0)->reg) | 63 | #define PT_REG(reg) ((long)&((struct pt_regs *)0)->reg) |
64 | 64 | ||
65 | /* | 65 | /* |
66 | * Get the address of the live pt_regs for the specified task. | 66 | * Get the address of the live pt_regs for the specified task. |
67 | * These are saved onto the top kernel stack when the process | 67 | * These are saved onto the top kernel stack when the process |
68 | * is not running. | 68 | * is not running. |
69 | * | 69 | * |
70 | * Note: if a user thread is execve'd from kernel space, the | 70 | * Note: if a user thread is execve'd from kernel space, the |
71 | * kernel stack will not be empty on entry to the kernel, so | 71 | * kernel stack will not be empty on entry to the kernel, so |
72 | * ptracing these tasks will fail. | 72 | * ptracing these tasks will fail. |
73 | */ | 73 | */ |
74 | static inline struct pt_regs *get_user_regs(struct task_struct *task) | 74 | static inline struct pt_regs *get_user_regs(struct task_struct *task) |
75 | { | 75 | { |
76 | return (struct pt_regs *) | 76 | return (struct pt_regs *) |
77 | ((unsigned long)task_stack_page(task) + | 77 | ((unsigned long)task_stack_page(task) + |
78 | (THREAD_SIZE - sizeof(struct pt_regs))); | 78 | (THREAD_SIZE - sizeof(struct pt_regs))); |
79 | } | 79 | } |
80 | 80 | ||
81 | /* | 81 | /* |
82 | * Get all user integer registers. | 82 | * Get all user integer registers. |
83 | */ | 83 | */ |
84 | static inline int ptrace_getregs(struct task_struct *tsk, void __user *uregs) | 84 | static inline int ptrace_getregs(struct task_struct *tsk, void __user *uregs) |
85 | { | 85 | { |
86 | struct pt_regs regs; | 86 | struct pt_regs regs; |
87 | memcpy(®s, get_user_regs(tsk), sizeof(regs)); | 87 | memcpy(®s, get_user_regs(tsk), sizeof(regs)); |
88 | regs.usp = tsk->thread.usp; | 88 | regs.usp = tsk->thread.usp; |
89 | return copy_to_user(uregs, ®s, sizeof(struct pt_regs)) ? -EFAULT : 0; | 89 | return copy_to_user(uregs, ®s, sizeof(struct pt_regs)) ? -EFAULT : 0; |
90 | } | 90 | } |
91 | 91 | ||
92 | /* Mapping from PT_xxx to the stack offset at which the register is | 92 | /* Mapping from PT_xxx to the stack offset at which the register is |
93 | * saved. Notice that usp has no stack-slot and needs to be treated | 93 | * saved. Notice that usp has no stack-slot and needs to be treated |
94 | * specially (see get_reg/put_reg below). | 94 | * specially (see get_reg/put_reg below). |
95 | */ | 95 | */ |
96 | 96 | ||
97 | /* | 97 | /* |
98 | * Get contents of register REGNO in task TASK. | 98 | * Get contents of register REGNO in task TASK. |
99 | */ | 99 | */ |
100 | static inline long get_reg(struct task_struct *task, int regno) | 100 | static inline long get_reg(struct task_struct *task, int regno) |
101 | { | 101 | { |
102 | unsigned char *reg_ptr; | 102 | unsigned char *reg_ptr; |
103 | 103 | ||
104 | struct pt_regs *regs = | 104 | struct pt_regs *regs = |
105 | (struct pt_regs *)((unsigned long)task_stack_page(task) + | 105 | (struct pt_regs *)((unsigned long)task_stack_page(task) + |
106 | (THREAD_SIZE - sizeof(struct pt_regs))); | 106 | (THREAD_SIZE - sizeof(struct pt_regs))); |
107 | reg_ptr = (char *)regs; | 107 | reg_ptr = (char *)regs; |
108 | 108 | ||
109 | switch (regno) { | 109 | switch (regno) { |
110 | case PT_USP: | 110 | case PT_USP: |
111 | return task->thread.usp; | 111 | return task->thread.usp; |
112 | default: | 112 | default: |
113 | if (regno <= 216) | 113 | if (regno <= 216) |
114 | return *(long *)(reg_ptr + regno); | 114 | return *(long *)(reg_ptr + regno); |
115 | } | 115 | } |
116 | /* slight mystery ... never seems to come here but kernel misbehaves without this code! */ | 116 | /* slight mystery ... never seems to come here but kernel misbehaves without this code! */ |
117 | 117 | ||
118 | printk(KERN_WARNING "Request to get for unknown register %d\n", regno); | 118 | printk(KERN_WARNING "Request to get for unknown register %d\n", regno); |
119 | return 0; | 119 | return 0; |
120 | } | 120 | } |
121 | 121 | ||
122 | /* | 122 | /* |
123 | * Write contents of register REGNO in task TASK. | 123 | * Write contents of register REGNO in task TASK. |
124 | */ | 124 | */ |
125 | static inline int | 125 | static inline int |
126 | put_reg(struct task_struct *task, int regno, unsigned long data) | 126 | put_reg(struct task_struct *task, int regno, unsigned long data) |
127 | { | 127 | { |
128 | char *reg_ptr; | 128 | char *reg_ptr; |
129 | 129 | ||
130 | struct pt_regs *regs = | 130 | struct pt_regs *regs = |
131 | (struct pt_regs *)((unsigned long)task_stack_page(task) + | 131 | (struct pt_regs *)((unsigned long)task_stack_page(task) + |
132 | (THREAD_SIZE - sizeof(struct pt_regs))); | 132 | (THREAD_SIZE - sizeof(struct pt_regs))); |
133 | reg_ptr = (char *)regs; | 133 | reg_ptr = (char *)regs; |
134 | 134 | ||
135 | switch (regno) { | 135 | switch (regno) { |
136 | case PT_PC: | 136 | case PT_PC: |
137 | /*********************************************************************/ | 137 | /*********************************************************************/ |
138 | /* At this point the kernel is most likely in exception. */ | 138 | /* At this point the kernel is most likely in exception. */ |
139 | /* The RETX register will be used to populate the pc of the process. */ | 139 | /* The RETX register will be used to populate the pc of the process. */ |
140 | /*********************************************************************/ | 140 | /*********************************************************************/ |
141 | regs->retx = data; | 141 | regs->retx = data; |
142 | regs->pc = data; | 142 | regs->pc = data; |
143 | break; | 143 | break; |
144 | case PT_RETX: | 144 | case PT_RETX: |
145 | break; /* regs->retx = data; break; */ | 145 | break; /* regs->retx = data; break; */ |
146 | case PT_USP: | 146 | case PT_USP: |
147 | regs->usp = data; | 147 | regs->usp = data; |
148 | task->thread.usp = data; | 148 | task->thread.usp = data; |
149 | break; | 149 | break; |
150 | default: | 150 | default: |
151 | if (regno <= 216) | 151 | if (regno <= 216) |
152 | *(long *)(reg_ptr + regno) = data; | 152 | *(long *)(reg_ptr + regno) = data; |
153 | } | 153 | } |
154 | return 0; | 154 | return 0; |
155 | } | 155 | } |
156 | 156 | ||
157 | /* | 157 | /* |
158 | * check that an address falls within the bounds of the target process's memory mappings | 158 | * check that an address falls within the bounds of the target process's memory mappings |
159 | */ | 159 | */ |
160 | static inline int is_user_addr_valid(struct task_struct *child, | 160 | static inline int is_user_addr_valid(struct task_struct *child, |
161 | unsigned long start, unsigned long len) | 161 | unsigned long start, unsigned long len) |
162 | { | 162 | { |
163 | struct vm_list_struct *vml; | 163 | struct vm_area_struct *vma; |
164 | struct sram_list_struct *sraml; | 164 | struct sram_list_struct *sraml; |
165 | 165 | ||
166 | /* overflow */ | 166 | /* overflow */ |
167 | if (start + len < start) | 167 | if (start + len < start) |
168 | return -EIO; | 168 | return -EIO; |
169 | 169 | ||
170 | for (vml = child->mm->context.vmlist; vml; vml = vml->next) | 170 | vma = find_vma(child->mm, start); |
171 | if (start >= vml->vma->vm_start && start + len < vml->vma->vm_end) | 171 | if (vma && start >= vma->vm_start && start + len <= vma->vm_end) |
172 | return 0; | 172 | return 0; |
173 | 173 | ||
174 | for (sraml = child->mm->context.sram_list; sraml; sraml = sraml->next) | 174 | for (sraml = child->mm->context.sram_list; sraml; sraml = sraml->next) |
175 | if (start >= (unsigned long)sraml->addr | 175 | if (start >= (unsigned long)sraml->addr |
176 | && start + len < (unsigned long)sraml->addr + sraml->length) | 176 | && start + len < (unsigned long)sraml->addr + sraml->length) |
177 | return 0; | 177 | return 0; |
178 | 178 | ||
179 | if (start >= FIXED_CODE_START && start + len < FIXED_CODE_END) | 179 | if (start >= FIXED_CODE_START && start + len < FIXED_CODE_END) |
180 | return 0; | 180 | return 0; |
181 | 181 | ||
182 | return -EIO; | 182 | return -EIO; |
183 | } | 183 | } |
184 | 184 | ||
185 | void ptrace_enable(struct task_struct *child) | 185 | void ptrace_enable(struct task_struct *child) |
186 | { | 186 | { |
187 | unsigned long tmp; | 187 | unsigned long tmp; |
188 | tmp = get_reg(child, PT_SYSCFG) | (TRACE_BITS); | 188 | tmp = get_reg(child, PT_SYSCFG) | (TRACE_BITS); |
189 | put_reg(child, PT_SYSCFG, tmp); | 189 | put_reg(child, PT_SYSCFG, tmp); |
190 | } | 190 | } |
191 | 191 | ||
192 | /* | 192 | /* |
193 | * Called by kernel/ptrace.c when detaching.. | 193 | * Called by kernel/ptrace.c when detaching.. |
194 | * | 194 | * |
195 | * Make sure the single step bit is not set. | 195 | * Make sure the single step bit is not set. |
196 | */ | 196 | */ |
197 | void ptrace_disable(struct task_struct *child) | 197 | void ptrace_disable(struct task_struct *child) |
198 | { | 198 | { |
199 | unsigned long tmp; | 199 | unsigned long tmp; |
200 | /* make sure the single step bit is not set. */ | 200 | /* make sure the single step bit is not set. */ |
201 | tmp = get_reg(child, PT_SYSCFG) & ~TRACE_BITS; | 201 | tmp = get_reg(child, PT_SYSCFG) & ~TRACE_BITS; |
202 | put_reg(child, PT_SYSCFG, tmp); | 202 | put_reg(child, PT_SYSCFG, tmp); |
203 | } | 203 | } |
204 | 204 | ||
205 | long arch_ptrace(struct task_struct *child, long request, long addr, long data) | 205 | long arch_ptrace(struct task_struct *child, long request, long addr, long data) |
206 | { | 206 | { |
207 | int ret; | 207 | int ret; |
208 | unsigned long __user *datap = (unsigned long __user *)data; | 208 | unsigned long __user *datap = (unsigned long __user *)data; |
209 | 209 | ||
210 | switch (request) { | 210 | switch (request) { |
211 | /* when I and D space are separate, these will need to be fixed. */ | 211 | /* when I and D space are separate, these will need to be fixed. */ |
212 | case PTRACE_PEEKDATA: | 212 | case PTRACE_PEEKDATA: |
213 | pr_debug("ptrace: PEEKDATA\n"); | 213 | pr_debug("ptrace: PEEKDATA\n"); |
214 | /* fall through */ | 214 | /* fall through */ |
215 | case PTRACE_PEEKTEXT: /* read word at location addr. */ | 215 | case PTRACE_PEEKTEXT: /* read word at location addr. */ |
216 | { | 216 | { |
217 | unsigned long tmp = 0; | 217 | unsigned long tmp = 0; |
218 | int copied; | 218 | int copied; |
219 | 219 | ||
220 | ret = -EIO; | 220 | ret = -EIO; |
221 | pr_debug("ptrace: PEEKTEXT at addr 0x%08lx + %ld\n", addr, sizeof(data)); | 221 | pr_debug("ptrace: PEEKTEXT at addr 0x%08lx + %ld\n", addr, sizeof(data)); |
222 | if (is_user_addr_valid(child, addr, sizeof(tmp)) < 0) | 222 | if (is_user_addr_valid(child, addr, sizeof(tmp)) < 0) |
223 | break; | 223 | break; |
224 | pr_debug("ptrace: user address is valid\n"); | 224 | pr_debug("ptrace: user address is valid\n"); |
225 | 225 | ||
226 | if (L1_CODE_LENGTH != 0 && addr >= get_l1_code_start() | 226 | if (L1_CODE_LENGTH != 0 && addr >= get_l1_code_start() |
227 | && addr + sizeof(tmp) <= get_l1_code_start() + L1_CODE_LENGTH) { | 227 | && addr + sizeof(tmp) <= get_l1_code_start() + L1_CODE_LENGTH) { |
228 | safe_dma_memcpy (&tmp, (const void *)(addr), sizeof(tmp)); | 228 | safe_dma_memcpy (&tmp, (const void *)(addr), sizeof(tmp)); |
229 | copied = sizeof(tmp); | 229 | copied = sizeof(tmp); |
230 | 230 | ||
231 | } else if (L1_DATA_A_LENGTH != 0 && addr >= L1_DATA_A_START | 231 | } else if (L1_DATA_A_LENGTH != 0 && addr >= L1_DATA_A_START |
232 | && addr + sizeof(tmp) <= L1_DATA_A_START + L1_DATA_A_LENGTH) { | 232 | && addr + sizeof(tmp) <= L1_DATA_A_START + L1_DATA_A_LENGTH) { |
233 | memcpy(&tmp, (const void *)(addr), sizeof(tmp)); | 233 | memcpy(&tmp, (const void *)(addr), sizeof(tmp)); |
234 | copied = sizeof(tmp); | 234 | copied = sizeof(tmp); |
235 | 235 | ||
236 | } else if (L1_DATA_B_LENGTH != 0 && addr >= L1_DATA_B_START | 236 | } else if (L1_DATA_B_LENGTH != 0 && addr >= L1_DATA_B_START |
237 | && addr + sizeof(tmp) <= L1_DATA_B_START + L1_DATA_B_LENGTH) { | 237 | && addr + sizeof(tmp) <= L1_DATA_B_START + L1_DATA_B_LENGTH) { |
238 | memcpy(&tmp, (const void *)(addr), sizeof(tmp)); | 238 | memcpy(&tmp, (const void *)(addr), sizeof(tmp)); |
239 | copied = sizeof(tmp); | 239 | copied = sizeof(tmp); |
240 | 240 | ||
241 | } else if (addr >= FIXED_CODE_START | 241 | } else if (addr >= FIXED_CODE_START |
242 | && addr + sizeof(tmp) <= FIXED_CODE_END) { | 242 | && addr + sizeof(tmp) <= FIXED_CODE_END) { |
243 | memcpy(&tmp, (const void *)(addr), sizeof(tmp)); | 243 | memcpy(&tmp, (const void *)(addr), sizeof(tmp)); |
244 | copied = sizeof(tmp); | 244 | copied = sizeof(tmp); |
245 | 245 | ||
246 | } else | 246 | } else |
247 | copied = access_process_vm(child, addr, &tmp, | 247 | copied = access_process_vm(child, addr, &tmp, |
248 | sizeof(tmp), 0); | 248 | sizeof(tmp), 0); |
249 | 249 | ||
250 | pr_debug("ptrace: copied size %d [0x%08lx]\n", copied, tmp); | 250 | pr_debug("ptrace: copied size %d [0x%08lx]\n", copied, tmp); |
251 | if (copied != sizeof(tmp)) | 251 | if (copied != sizeof(tmp)) |
252 | break; | 252 | break; |
253 | ret = put_user(tmp, datap); | 253 | ret = put_user(tmp, datap); |
254 | break; | 254 | break; |
255 | } | 255 | } |
256 | 256 | ||
257 | /* read the word at location addr in the USER area. */ | 257 | /* read the word at location addr in the USER area. */ |
258 | case PTRACE_PEEKUSR: | 258 | case PTRACE_PEEKUSR: |
259 | { | 259 | { |
260 | unsigned long tmp; | 260 | unsigned long tmp; |
261 | ret = -EIO; | 261 | ret = -EIO; |
262 | tmp = 0; | 262 | tmp = 0; |
263 | if ((addr & 3) || (addr > (sizeof(struct pt_regs) + 16))) { | 263 | if ((addr & 3) || (addr > (sizeof(struct pt_regs) + 16))) { |
264 | printk(KERN_WARNING "ptrace error : PEEKUSR : temporarily returning " | 264 | printk(KERN_WARNING "ptrace error : PEEKUSR : temporarily returning " |
265 | "0 - %x sizeof(pt_regs) is %lx\n", | 265 | "0 - %x sizeof(pt_regs) is %lx\n", |
266 | (int)addr, sizeof(struct pt_regs)); | 266 | (int)addr, sizeof(struct pt_regs)); |
267 | break; | 267 | break; |
268 | } | 268 | } |
269 | if (addr == sizeof(struct pt_regs)) { | 269 | if (addr == sizeof(struct pt_regs)) { |
270 | /* PT_TEXT_ADDR */ | 270 | /* PT_TEXT_ADDR */ |
271 | tmp = child->mm->start_code + TEXT_OFFSET; | 271 | tmp = child->mm->start_code + TEXT_OFFSET; |
272 | } else if (addr == (sizeof(struct pt_regs) + 4)) { | 272 | } else if (addr == (sizeof(struct pt_regs) + 4)) { |
273 | /* PT_TEXT_END_ADDR */ | 273 | /* PT_TEXT_END_ADDR */ |
274 | tmp = child->mm->end_code; | 274 | tmp = child->mm->end_code; |
275 | } else if (addr == (sizeof(struct pt_regs) + 8)) { | 275 | } else if (addr == (sizeof(struct pt_regs) + 8)) { |
276 | /* PT_DATA_ADDR */ | 276 | /* PT_DATA_ADDR */ |
277 | tmp = child->mm->start_data; | 277 | tmp = child->mm->start_data; |
278 | #ifdef CONFIG_BINFMT_ELF_FDPIC | 278 | #ifdef CONFIG_BINFMT_ELF_FDPIC |
279 | } else if (addr == (sizeof(struct pt_regs) + 12)) { | 279 | } else if (addr == (sizeof(struct pt_regs) + 12)) { |
280 | tmp = child->mm->context.exec_fdpic_loadmap; | 280 | tmp = child->mm->context.exec_fdpic_loadmap; |
281 | } else if (addr == (sizeof(struct pt_regs) + 16)) { | 281 | } else if (addr == (sizeof(struct pt_regs) + 16)) { |
282 | tmp = child->mm->context.interp_fdpic_loadmap; | 282 | tmp = child->mm->context.interp_fdpic_loadmap; |
283 | #endif | 283 | #endif |
284 | } else { | 284 | } else { |
285 | tmp = get_reg(child, addr); | 285 | tmp = get_reg(child, addr); |
286 | } | 286 | } |
287 | ret = put_user(tmp, datap); | 287 | ret = put_user(tmp, datap); |
288 | break; | 288 | break; |
289 | } | 289 | } |
290 | 290 | ||
291 | /* when I and D space are separate, this will have to be fixed. */ | 291 | /* when I and D space are separate, this will have to be fixed. */ |
292 | case PTRACE_POKEDATA: | 292 | case PTRACE_POKEDATA: |
293 | pr_debug("ptrace: PTRACE_PEEKDATA\n"); | 293 | pr_debug("ptrace: PTRACE_PEEKDATA\n"); |
294 | /* fall through */ | 294 | /* fall through */ |
295 | case PTRACE_POKETEXT: /* write the word at location addr. */ | 295 | case PTRACE_POKETEXT: /* write the word at location addr. */ |
296 | { | 296 | { |
297 | int copied; | 297 | int copied; |
298 | 298 | ||
299 | ret = -EIO; | 299 | ret = -EIO; |
300 | pr_debug("ptrace: POKETEXT at addr 0x%08lx + %ld bytes %lx\n", | 300 | pr_debug("ptrace: POKETEXT at addr 0x%08lx + %ld bytes %lx\n", |
301 | addr, sizeof(data), data); | 301 | addr, sizeof(data), data); |
302 | if (is_user_addr_valid(child, addr, sizeof(data)) < 0) | 302 | if (is_user_addr_valid(child, addr, sizeof(data)) < 0) |
303 | break; | 303 | break; |
304 | pr_debug("ptrace: user address is valid\n"); | 304 | pr_debug("ptrace: user address is valid\n"); |
305 | 305 | ||
306 | if (L1_CODE_LENGTH != 0 && addr >= get_l1_code_start() | 306 | if (L1_CODE_LENGTH != 0 && addr >= get_l1_code_start() |
307 | && addr + sizeof(data) <= get_l1_code_start() + L1_CODE_LENGTH) { | 307 | && addr + sizeof(data) <= get_l1_code_start() + L1_CODE_LENGTH) { |
308 | safe_dma_memcpy ((void *)(addr), &data, sizeof(data)); | 308 | safe_dma_memcpy ((void *)(addr), &data, sizeof(data)); |
309 | copied = sizeof(data); | 309 | copied = sizeof(data); |
310 | 310 | ||
311 | } else if (L1_DATA_A_LENGTH != 0 && addr >= L1_DATA_A_START | 311 | } else if (L1_DATA_A_LENGTH != 0 && addr >= L1_DATA_A_START |
312 | && addr + sizeof(data) <= L1_DATA_A_START + L1_DATA_A_LENGTH) { | 312 | && addr + sizeof(data) <= L1_DATA_A_START + L1_DATA_A_LENGTH) { |
313 | memcpy((void *)(addr), &data, sizeof(data)); | 313 | memcpy((void *)(addr), &data, sizeof(data)); |
314 | copied = sizeof(data); | 314 | copied = sizeof(data); |
315 | 315 | ||
316 | } else if (L1_DATA_B_LENGTH != 0 && addr >= L1_DATA_B_START | 316 | } else if (L1_DATA_B_LENGTH != 0 && addr >= L1_DATA_B_START |
317 | && addr + sizeof(data) <= L1_DATA_B_START + L1_DATA_B_LENGTH) { | 317 | && addr + sizeof(data) <= L1_DATA_B_START + L1_DATA_B_LENGTH) { |
318 | memcpy((void *)(addr), &data, sizeof(data)); | 318 | memcpy((void *)(addr), &data, sizeof(data)); |
319 | copied = sizeof(data); | 319 | copied = sizeof(data); |
320 | 320 | ||
321 | } else if (addr >= FIXED_CODE_START | 321 | } else if (addr >= FIXED_CODE_START |
322 | && addr + sizeof(data) <= FIXED_CODE_END) { | 322 | && addr + sizeof(data) <= FIXED_CODE_END) { |
323 | memcpy((void *)(addr), &data, sizeof(data)); | 323 | memcpy((void *)(addr), &data, sizeof(data)); |
324 | copied = sizeof(data); | 324 | copied = sizeof(data); |
325 | 325 | ||
326 | } else | 326 | } else |
327 | copied = access_process_vm(child, addr, &data, | 327 | copied = access_process_vm(child, addr, &data, |
328 | sizeof(data), 1); | 328 | sizeof(data), 1); |
329 | 329 | ||
330 | pr_debug("ptrace: copied size %d\n", copied); | 330 | pr_debug("ptrace: copied size %d\n", copied); |
331 | if (copied != sizeof(data)) | 331 | if (copied != sizeof(data)) |
332 | break; | 332 | break; |
333 | ret = 0; | 333 | ret = 0; |
334 | break; | 334 | break; |
335 | } | 335 | } |
336 | 336 | ||
337 | case PTRACE_POKEUSR: /* write the word at location addr in the USER area */ | 337 | case PTRACE_POKEUSR: /* write the word at location addr in the USER area */ |
338 | ret = -EIO; | 338 | ret = -EIO; |
339 | if ((addr & 3) || (addr > (sizeof(struct pt_regs) + 16))) { | 339 | if ((addr & 3) || (addr > (sizeof(struct pt_regs) + 16))) { |
340 | printk(KERN_WARNING "ptrace error : POKEUSR: temporarily returning 0\n"); | 340 | printk(KERN_WARNING "ptrace error : POKEUSR: temporarily returning 0\n"); |
341 | break; | 341 | break; |
342 | } | 342 | } |
343 | 343 | ||
344 | if (addr >= (sizeof(struct pt_regs))) { | 344 | if (addr >= (sizeof(struct pt_regs))) { |
345 | ret = 0; | 345 | ret = 0; |
346 | break; | 346 | break; |
347 | } | 347 | } |
348 | if (addr == PT_SYSCFG) { | 348 | if (addr == PT_SYSCFG) { |
349 | data &= SYSCFG_MASK; | 349 | data &= SYSCFG_MASK; |
350 | data |= get_reg(child, PT_SYSCFG); | 350 | data |= get_reg(child, PT_SYSCFG); |
351 | } | 351 | } |
352 | ret = put_reg(child, addr, data); | 352 | ret = put_reg(child, addr, data); |
353 | break; | 353 | break; |
354 | 354 | ||
355 | case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */ | 355 | case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */ |
356 | case PTRACE_CONT: /* restart after signal. */ | 356 | case PTRACE_CONT: /* restart after signal. */ |
357 | pr_debug("ptrace: syscall/cont\n"); | 357 | pr_debug("ptrace: syscall/cont\n"); |
358 | 358 | ||
359 | ret = -EIO; | 359 | ret = -EIO; |
360 | if (!valid_signal(data)) | 360 | if (!valid_signal(data)) |
361 | break; | 361 | break; |
362 | if (request == PTRACE_SYSCALL) | 362 | if (request == PTRACE_SYSCALL) |
363 | set_tsk_thread_flag(child, TIF_SYSCALL_TRACE); | 363 | set_tsk_thread_flag(child, TIF_SYSCALL_TRACE); |
364 | else | 364 | else |
365 | clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); | 365 | clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); |
366 | child->exit_code = data; | 366 | child->exit_code = data; |
367 | ptrace_disable(child); | 367 | ptrace_disable(child); |
368 | pr_debug("ptrace: before wake_up_process\n"); | 368 | pr_debug("ptrace: before wake_up_process\n"); |
369 | wake_up_process(child); | 369 | wake_up_process(child); |
370 | ret = 0; | 370 | ret = 0; |
371 | break; | 371 | break; |
372 | 372 | ||
373 | /* | 373 | /* |
374 | * make the child exit. Best I can do is send it a sigkill. | 374 | * make the child exit. Best I can do is send it a sigkill. |
375 | * perhaps it should be put in the status that it wants to | 375 | * perhaps it should be put in the status that it wants to |
376 | * exit. | 376 | * exit. |
377 | */ | 377 | */ |
378 | case PTRACE_KILL: | 378 | case PTRACE_KILL: |
379 | ret = 0; | 379 | ret = 0; |
380 | if (child->exit_state == EXIT_ZOMBIE) /* already dead */ | 380 | if (child->exit_state == EXIT_ZOMBIE) /* already dead */ |
381 | break; | 381 | break; |
382 | child->exit_code = SIGKILL; | 382 | child->exit_code = SIGKILL; |
383 | ptrace_disable(child); | 383 | ptrace_disable(child); |
384 | wake_up_process(child); | 384 | wake_up_process(child); |
385 | break; | 385 | break; |
386 | 386 | ||
387 | case PTRACE_SINGLESTEP: /* set the trap flag. */ | 387 | case PTRACE_SINGLESTEP: /* set the trap flag. */ |
388 | pr_debug("ptrace: single step\n"); | 388 | pr_debug("ptrace: single step\n"); |
389 | ret = -EIO; | 389 | ret = -EIO; |
390 | if (!valid_signal(data)) | 390 | if (!valid_signal(data)) |
391 | break; | 391 | break; |
392 | clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); | 392 | clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); |
393 | ptrace_enable(child); | 393 | ptrace_enable(child); |
394 | child->exit_code = data; | 394 | child->exit_code = data; |
395 | wake_up_process(child); | 395 | wake_up_process(child); |
396 | ret = 0; | 396 | ret = 0; |
397 | break; | 397 | break; |
398 | 398 | ||
399 | case PTRACE_GETREGS: | 399 | case PTRACE_GETREGS: |
400 | /* Get all gp regs from the child. */ | 400 | /* Get all gp regs from the child. */ |
401 | ret = ptrace_getregs(child, datap); | 401 | ret = ptrace_getregs(child, datap); |
402 | break; | 402 | break; |
403 | 403 | ||
404 | case PTRACE_SETREGS: | 404 | case PTRACE_SETREGS: |
405 | printk(KERN_WARNING "ptrace: SETREGS: **** NOT IMPLEMENTED ***\n"); | 405 | printk(KERN_WARNING "ptrace: SETREGS: **** NOT IMPLEMENTED ***\n"); |
406 | /* Set all gp regs in the child. */ | 406 | /* Set all gp regs in the child. */ |
407 | ret = 0; | 407 | ret = 0; |
408 | break; | 408 | break; |
409 | 409 | ||
410 | default: | 410 | default: |
411 | ret = ptrace_request(child, request, addr, data); | 411 | ret = ptrace_request(child, request, addr, data); |
412 | break; | 412 | break; |
413 | } | 413 | } |
414 | 414 | ||
415 | return ret; | 415 | return ret; |
416 | } | 416 | } |
417 | 417 | ||
418 | asmlinkage void syscall_trace(void) | 418 | asmlinkage void syscall_trace(void) |
419 | { | 419 | { |
420 | if (!test_thread_flag(TIF_SYSCALL_TRACE)) | 420 | if (!test_thread_flag(TIF_SYSCALL_TRACE)) |
421 | return; | 421 | return; |
422 | 422 | ||
423 | if (!(current->ptrace & PT_PTRACED)) | 423 | if (!(current->ptrace & PT_PTRACED)) |
424 | return; | 424 | return; |
425 | 425 | ||
426 | /* the 0x80 provides a way for the tracing parent to distinguish | 426 | /* the 0x80 provides a way for the tracing parent to distinguish |
427 | * between a syscall stop and SIGTRAP delivery | 427 | * between a syscall stop and SIGTRAP delivery |
428 | */ | 428 | */ |
429 | ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) | 429 | ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) |
430 | ? 0x80 : 0)); | 430 | ? 0x80 : 0)); |
431 | 431 | ||
432 | /* | 432 | /* |
433 | * this isn't the same as continuing with a signal, but it will do | 433 | * this isn't the same as continuing with a signal, but it will do |
434 | * for normal use. strace only continues with a signal if the | 434 | * for normal use. strace only continues with a signal if the |
435 | * stopping signal is not SIGTRAP. -brl | 435 | * stopping signal is not SIGTRAP. -brl |
436 | */ | 436 | */ |
437 | if (current->exit_code) { | 437 | if (current->exit_code) { |
438 | send_sig(current->exit_code, current, 1); | 438 | send_sig(current->exit_code, current, 1); |
439 | current->exit_code = 0; | 439 | current->exit_code = 0; |
440 | } | 440 | } |
441 | } | 441 | } |
442 | 442 |
arch/blackfin/kernel/traps.c
1 | /* | 1 | /* |
2 | * File: arch/blackfin/kernel/traps.c | 2 | * File: arch/blackfin/kernel/traps.c |
3 | * Based on: | 3 | * Based on: |
4 | * Author: Hamish Macdonald | 4 | * Author: Hamish Macdonald |
5 | * | 5 | * |
6 | * Created: | 6 | * Created: |
7 | * Description: uses S/W interrupt 15 for the system calls | 7 | * Description: uses S/W interrupt 15 for the system calls |
8 | * | 8 | * |
9 | * Modified: | 9 | * Modified: |
10 | * Copyright 2004-2006 Analog Devices Inc. | 10 | * Copyright 2004-2006 Analog Devices Inc. |
11 | * | 11 | * |
12 | * Bugs: Enter bugs at http://blackfin.uclinux.org/ | 12 | * Bugs: Enter bugs at http://blackfin.uclinux.org/ |
13 | * | 13 | * |
14 | * This program is free software; you can redistribute it and/or modify | 14 | * This program is free software; you can redistribute it and/or modify |
15 | * it under the terms of the GNU General Public License as published by | 15 | * it under the terms of the GNU General Public License as published by |
16 | * the Free Software Foundation; either version 2 of the License, or | 16 | * the Free Software Foundation; either version 2 of the License, or |
17 | * (at your option) any later version. | 17 | * (at your option) any later version. |
18 | * | 18 | * |
19 | * This program is distributed in the hope that it will be useful, | 19 | * This program is distributed in the hope that it will be useful, |
20 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | 20 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
21 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 21 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
22 | * GNU General Public License for more details. | 22 | * GNU General Public License for more details. |
23 | * | 23 | * |
24 | * You should have received a copy of the GNU General Public License | 24 | * You should have received a copy of the GNU General Public License |
25 | * along with this program; if not, see the file COPYING, or write | 25 | * along with this program; if not, see the file COPYING, or write |
26 | * to the Free Software Foundation, Inc., | 26 | * to the Free Software Foundation, Inc., |
27 | * 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | 27 | * 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA |
28 | */ | 28 | */ |
29 | 29 | ||
30 | #include <linux/uaccess.h> | 30 | #include <linux/uaccess.h> |
31 | #include <linux/interrupt.h> | 31 | #include <linux/interrupt.h> |
32 | #include <linux/module.h> | 32 | #include <linux/module.h> |
33 | #include <linux/kallsyms.h> | 33 | #include <linux/kallsyms.h> |
34 | #include <linux/fs.h> | 34 | #include <linux/fs.h> |
35 | #include <linux/rbtree.h> | ||
35 | #include <asm/traps.h> | 36 | #include <asm/traps.h> |
36 | #include <asm/cacheflush.h> | 37 | #include <asm/cacheflush.h> |
37 | #include <asm/cplb.h> | 38 | #include <asm/cplb.h> |
38 | #include <asm/blackfin.h> | 39 | #include <asm/blackfin.h> |
39 | #include <asm/irq_handler.h> | 40 | #include <asm/irq_handler.h> |
40 | #include <linux/irq.h> | 41 | #include <linux/irq.h> |
41 | #include <asm/trace.h> | 42 | #include <asm/trace.h> |
42 | #include <asm/fixed_code.h> | 43 | #include <asm/fixed_code.h> |
43 | 44 | ||
44 | #ifdef CONFIG_KGDB | 45 | #ifdef CONFIG_KGDB |
45 | # include <linux/kgdb.h> | 46 | # include <linux/kgdb.h> |
46 | 47 | ||
47 | # define CHK_DEBUGGER_TRAP() \ | 48 | # define CHK_DEBUGGER_TRAP() \ |
48 | do { \ | 49 | do { \ |
49 | kgdb_handle_exception(trapnr, sig, info.si_code, fp); \ | 50 | kgdb_handle_exception(trapnr, sig, info.si_code, fp); \ |
50 | } while (0) | 51 | } while (0) |
51 | # define CHK_DEBUGGER_TRAP_MAYBE() \ | 52 | # define CHK_DEBUGGER_TRAP_MAYBE() \ |
52 | do { \ | 53 | do { \ |
53 | if (kgdb_connected) \ | 54 | if (kgdb_connected) \ |
54 | CHK_DEBUGGER_TRAP(); \ | 55 | CHK_DEBUGGER_TRAP(); \ |
55 | } while (0) | 56 | } while (0) |
56 | #else | 57 | #else |
57 | # define CHK_DEBUGGER_TRAP() do { } while (0) | 58 | # define CHK_DEBUGGER_TRAP() do { } while (0) |
58 | # define CHK_DEBUGGER_TRAP_MAYBE() do { } while (0) | 59 | # define CHK_DEBUGGER_TRAP_MAYBE() do { } while (0) |
59 | #endif | 60 | #endif |
60 | 61 | ||
61 | 62 | ||
62 | #ifdef CONFIG_DEBUG_VERBOSE | 63 | #ifdef CONFIG_DEBUG_VERBOSE |
63 | #define verbose_printk(fmt, arg...) \ | 64 | #define verbose_printk(fmt, arg...) \ |
64 | printk(fmt, ##arg) | 65 | printk(fmt, ##arg) |
65 | #else | 66 | #else |
66 | #define verbose_printk(fmt, arg...) \ | 67 | #define verbose_printk(fmt, arg...) \ |
67 | ({ if (0) printk(fmt, ##arg); 0; }) | 68 | ({ if (0) printk(fmt, ##arg); 0; }) |
68 | #endif | 69 | #endif |
69 | 70 | ||
70 | /* Initiate the event table handler */ | 71 | /* Initiate the event table handler */ |
71 | void __init trap_init(void) | 72 | void __init trap_init(void) |
72 | { | 73 | { |
73 | CSYNC(); | 74 | CSYNC(); |
74 | bfin_write_EVT3(trap); | 75 | bfin_write_EVT3(trap); |
75 | CSYNC(); | 76 | CSYNC(); |
76 | } | 77 | } |
77 | 78 | ||
78 | static void decode_address(char *buf, unsigned long address) | 79 | static void decode_address(char *buf, unsigned long address) |
79 | { | 80 | { |
80 | #ifdef CONFIG_DEBUG_VERBOSE | 81 | #ifdef CONFIG_DEBUG_VERBOSE |
81 | struct vm_list_struct *vml; | 82 | struct vm_list_struct *vml; |
82 | struct task_struct *p; | 83 | struct task_struct *p; |
83 | struct mm_struct *mm; | 84 | struct mm_struct *mm; |
84 | unsigned long flags, offset; | 85 | unsigned long flags, offset; |
85 | unsigned char in_atomic = (bfin_read_IPEND() & 0x10) || in_atomic(); | 86 | unsigned char in_atomic = (bfin_read_IPEND() & 0x10) || in_atomic(); |
87 | struct rb_node *n; | ||
86 | 88 | ||
87 | #ifdef CONFIG_KALLSYMS | 89 | #ifdef CONFIG_KALLSYMS |
88 | unsigned long symsize; | 90 | unsigned long symsize; |
89 | const char *symname; | 91 | const char *symname; |
90 | char *modname; | 92 | char *modname; |
91 | char *delim = ":"; | 93 | char *delim = ":"; |
92 | char namebuf[128]; | 94 | char namebuf[128]; |
93 | 95 | ||
94 | /* look up the address and see if we are in kernel space */ | 96 | /* look up the address and see if we are in kernel space */ |
95 | symname = kallsyms_lookup(address, &symsize, &offset, &modname, namebuf); | 97 | symname = kallsyms_lookup(address, &symsize, &offset, &modname, namebuf); |
96 | 98 | ||
97 | if (symname) { | 99 | if (symname) { |
98 | /* yeah! kernel space! */ | 100 | /* yeah! kernel space! */ |
99 | if (!modname) | 101 | if (!modname) |
100 | modname = delim = ""; | 102 | modname = delim = ""; |
101 | sprintf(buf, "<0x%p> { %s%s%s%s + 0x%lx }", | 103 | sprintf(buf, "<0x%p> { %s%s%s%s + 0x%lx }", |
102 | (void *)address, delim, modname, delim, symname, | 104 | (void *)address, delim, modname, delim, symname, |
103 | (unsigned long)offset); | 105 | (unsigned long)offset); |
104 | return; | 106 | return; |
105 | 107 | ||
106 | } | 108 | } |
107 | #endif | 109 | #endif |
108 | 110 | ||
109 | /* Problem in fixed code section? */ | 111 | /* Problem in fixed code section? */ |
110 | if (address >= FIXED_CODE_START && address < FIXED_CODE_END) { | 112 | if (address >= FIXED_CODE_START && address < FIXED_CODE_END) { |
111 | sprintf(buf, "<0x%p> /* Maybe fixed code section */", (void *)address); | 113 | sprintf(buf, "<0x%p> /* Maybe fixed code section */", (void *)address); |
112 | return; | 114 | return; |
113 | } | 115 | } |
114 | 116 | ||
115 | /* Problem somewhere before the kernel start address */ | 117 | /* Problem somewhere before the kernel start address */ |
116 | if (address < CONFIG_BOOT_LOAD) { | 118 | if (address < CONFIG_BOOT_LOAD) { |
117 | sprintf(buf, "<0x%p> /* Maybe null pointer? */", (void *)address); | 119 | sprintf(buf, "<0x%p> /* Maybe null pointer? */", (void *)address); |
118 | return; | 120 | return; |
119 | } | 121 | } |
120 | 122 | ||
121 | /* looks like we're off in user-land, so let's walk all the | 123 | /* looks like we're off in user-land, so let's walk all the |
122 | * mappings of all our processes and see if we can't be a whee | 124 | * mappings of all our processes and see if we can't be a whee |
123 | * bit more specific | 125 | * bit more specific |
124 | */ | 126 | */ |
125 | write_lock_irqsave(&tasklist_lock, flags); | 127 | write_lock_irqsave(&tasklist_lock, flags); |
126 | for_each_process(p) { | 128 | for_each_process(p) { |
127 | mm = (in_atomic ? p->mm : get_task_mm(p)); | 129 | mm = (in_atomic ? p->mm : get_task_mm(p)); |
128 | if (!mm) | 130 | if (!mm) |
129 | continue; | 131 | continue; |
130 | 132 | ||
131 | vml = mm->context.vmlist; | 133 | for (n = rb_first(&mm->mm_rb); n; n = rb_next(n)) { |
132 | while (vml) { | 134 | struct vm_area_struct *vma; |
133 | struct vm_area_struct *vma = vml->vma; | ||
134 | 135 | ||
136 | vma = rb_entry(n, struct vm_area_struct, vm_rb); | ||
137 | |||
135 | if (address >= vma->vm_start && address < vma->vm_end) { | 138 | if (address >= vma->vm_start && address < vma->vm_end) { |
136 | char _tmpbuf[256]; | 139 | char _tmpbuf[256]; |
137 | char *name = p->comm; | 140 | char *name = p->comm; |
138 | struct file *file = vma->vm_file; | 141 | struct file *file = vma->vm_file; |
139 | 142 | ||
140 | if (file) { | 143 | if (file) { |
141 | char *d_name = d_path(&file->f_path, _tmpbuf, | 144 | char *d_name = d_path(&file->f_path, _tmpbuf, |
142 | sizeof(_tmpbuf)); | 145 | sizeof(_tmpbuf)); |
143 | if (!IS_ERR(d_name)) | 146 | if (!IS_ERR(d_name)) |
144 | name = d_name; | 147 | name = d_name; |
145 | } | 148 | } |
146 | 149 | ||
147 | /* FLAT does not have its text aligned to the start of | 150 | /* FLAT does not have its text aligned to the start of |
148 | * the map while FDPIC ELF does ... | 151 | * the map while FDPIC ELF does ... |
149 | */ | 152 | */ |
150 | 153 | ||
151 | /* before we can check flat/fdpic, we need to | 154 | /* before we can check flat/fdpic, we need to |
152 | * make sure current is valid | 155 | * make sure current is valid |
153 | */ | 156 | */ |
154 | if ((unsigned long)current >= FIXED_CODE_START && | 157 | if ((unsigned long)current >= FIXED_CODE_START && |
155 | !((unsigned long)current & 0x3)) { | 158 | !((unsigned long)current & 0x3)) { |
156 | if (current->mm && | 159 | if (current->mm && |
157 | (address > current->mm->start_code) && | 160 | (address > current->mm->start_code) && |
158 | (address < current->mm->end_code)) | 161 | (address < current->mm->end_code)) |
159 | offset = address - current->mm->start_code; | 162 | offset = address - current->mm->start_code; |
160 | else | 163 | else |
161 | offset = (address - vma->vm_start) + | 164 | offset = (address - vma->vm_start) + |
162 | (vma->vm_pgoff << PAGE_SHIFT); | 165 | (vma->vm_pgoff << PAGE_SHIFT); |
163 | 166 | ||
164 | sprintf(buf, "<0x%p> [ %s + 0x%lx ]", | 167 | sprintf(buf, "<0x%p> [ %s + 0x%lx ]", |
165 | (void *)address, name, offset); | 168 | (void *)address, name, offset); |
166 | } else | 169 | } else |
167 | sprintf(buf, "<0x%p> [ %s vma:0x%lx-0x%lx]", | 170 | sprintf(buf, "<0x%p> [ %s vma:0x%lx-0x%lx]", |
168 | (void *)address, name, | 171 | (void *)address, name, |
169 | vma->vm_start, vma->vm_end); | 172 | vma->vm_start, vma->vm_end); |
170 | 173 | ||
171 | if (!in_atomic) | 174 | if (!in_atomic) |
172 | mmput(mm); | 175 | mmput(mm); |
173 | 176 | ||
174 | if (!strlen(buf)) | 177 | if (!strlen(buf)) |
175 | sprintf(buf, "<0x%p> [ %s ] dynamic memory", (void *)address, name); | 178 | sprintf(buf, "<0x%p> [ %s ] dynamic memory", (void *)address, name); |
176 | 179 | ||
177 | goto done; | 180 | goto done; |
178 | } | 181 | } |
179 | |||
180 | vml = vml->next; | ||
181 | } | 182 | } |
182 | if (!in_atomic) | 183 | if (!in_atomic) |
183 | mmput(mm); | 184 | mmput(mm); |
184 | } | 185 | } |
185 | 186 | ||
186 | /* we were unable to find this address anywhere */ | 187 | /* we were unable to find this address anywhere */ |
187 | sprintf(buf, "<0x%p> /* kernel dynamic memory */", (void *)address); | 188 | sprintf(buf, "<0x%p> /* kernel dynamic memory */", (void *)address); |
188 | 189 | ||
189 | done: | 190 | done: |
190 | write_unlock_irqrestore(&tasklist_lock, flags); | 191 | write_unlock_irqrestore(&tasklist_lock, flags); |
191 | #else | 192 | #else |
192 | sprintf(buf, " "); | 193 | sprintf(buf, " "); |
193 | #endif | 194 | #endif |
194 | } | 195 | } |
195 | 196 | ||
196 | asmlinkage void double_fault_c(struct pt_regs *fp) | 197 | asmlinkage void double_fault_c(struct pt_regs *fp) |
197 | { | 198 | { |
198 | console_verbose(); | 199 | console_verbose(); |
199 | oops_in_progress = 1; | 200 | oops_in_progress = 1; |
200 | #ifdef CONFIG_DEBUG_VERBOSE | 201 | #ifdef CONFIG_DEBUG_VERBOSE |
201 | printk(KERN_EMERG "\n" KERN_EMERG "Double Fault\n"); | 202 | printk(KERN_EMERG "\n" KERN_EMERG "Double Fault\n"); |
202 | #ifdef CONFIG_DEBUG_DOUBLEFAULT_PRINT | 203 | #ifdef CONFIG_DEBUG_DOUBLEFAULT_PRINT |
203 | if (((long)fp->seqstat & SEQSTAT_EXCAUSE) == VEC_UNCOV) { | 204 | if (((long)fp->seqstat & SEQSTAT_EXCAUSE) == VEC_UNCOV) { |
204 | unsigned int cpu = smp_processor_id(); | 205 | unsigned int cpu = smp_processor_id(); |
205 | char buf[150]; | 206 | char buf[150]; |
206 | decode_address(buf, cpu_pda[cpu].retx); | 207 | decode_address(buf, cpu_pda[cpu].retx); |
207 | printk(KERN_EMERG "While handling exception (EXCAUSE = 0x%x) at %s:\n", | 208 | printk(KERN_EMERG "While handling exception (EXCAUSE = 0x%x) at %s:\n", |
208 | (unsigned int)cpu_pda[cpu].seqstat & SEQSTAT_EXCAUSE, buf); | 209 | (unsigned int)cpu_pda[cpu].seqstat & SEQSTAT_EXCAUSE, buf); |
209 | decode_address(buf, cpu_pda[cpu].dcplb_fault_addr); | 210 | decode_address(buf, cpu_pda[cpu].dcplb_fault_addr); |
210 | printk(KERN_NOTICE " DCPLB_FAULT_ADDR: %s\n", buf); | 211 | printk(KERN_NOTICE " DCPLB_FAULT_ADDR: %s\n", buf); |
211 | decode_address(buf, cpu_pda[cpu].icplb_fault_addr); | 212 | decode_address(buf, cpu_pda[cpu].icplb_fault_addr); |
212 | printk(KERN_NOTICE " ICPLB_FAULT_ADDR: %s\n", buf); | 213 | printk(KERN_NOTICE " ICPLB_FAULT_ADDR: %s\n", buf); |
213 | 214 | ||
214 | decode_address(buf, fp->retx); | 215 | decode_address(buf, fp->retx); |
215 | printk(KERN_NOTICE "The instruction at %s caused a double exception\n", buf); | 216 | printk(KERN_NOTICE "The instruction at %s caused a double exception\n", buf); |
216 | } else | 217 | } else |
217 | #endif | 218 | #endif |
218 | { | 219 | { |
219 | dump_bfin_process(fp); | 220 | dump_bfin_process(fp); |
220 | dump_bfin_mem(fp); | 221 | dump_bfin_mem(fp); |
221 | show_regs(fp); | 222 | show_regs(fp); |
222 | } | 223 | } |
223 | #endif | 224 | #endif |
224 | panic("Double Fault - unrecoverable event\n"); | 225 | panic("Double Fault - unrecoverable event\n"); |
225 | 226 | ||
226 | } | 227 | } |
227 | 228 | ||
228 | asmlinkage void trap_c(struct pt_regs *fp) | 229 | asmlinkage void trap_c(struct pt_regs *fp) |
229 | { | 230 | { |
230 | #ifdef CONFIG_DEBUG_BFIN_HWTRACE_ON | 231 | #ifdef CONFIG_DEBUG_BFIN_HWTRACE_ON |
231 | int j; | 232 | int j; |
232 | #endif | 233 | #endif |
233 | #ifdef CONFIG_DEBUG_HUNT_FOR_ZERO | 234 | #ifdef CONFIG_DEBUG_HUNT_FOR_ZERO |
234 | unsigned int cpu = smp_processor_id(); | 235 | unsigned int cpu = smp_processor_id(); |
235 | #endif | 236 | #endif |
236 | int sig = 0; | 237 | int sig = 0; |
237 | siginfo_t info; | 238 | siginfo_t info; |
238 | unsigned long trapnr = fp->seqstat & SEQSTAT_EXCAUSE; | 239 | unsigned long trapnr = fp->seqstat & SEQSTAT_EXCAUSE; |
239 | 240 | ||
240 | trace_buffer_save(j); | 241 | trace_buffer_save(j); |
241 | 242 | ||
242 | /* Important - be very careful dereferncing pointers - will lead to | 243 | /* Important - be very careful dereferncing pointers - will lead to |
243 | * double faults if the stack has become corrupt | 244 | * double faults if the stack has become corrupt |
244 | */ | 245 | */ |
245 | 246 | ||
246 | /* If the fault was caused by a kernel thread, or interrupt handler | 247 | /* If the fault was caused by a kernel thread, or interrupt handler |
247 | * we will kernel panic, so the system reboots. | 248 | * we will kernel panic, so the system reboots. |
248 | * If KGDB is enabled, don't set this for kernel breakpoints | 249 | * If KGDB is enabled, don't set this for kernel breakpoints |
249 | */ | 250 | */ |
250 | 251 | ||
251 | /* TODO: check to see if we are in some sort of deferred HWERR | 252 | /* TODO: check to see if we are in some sort of deferred HWERR |
252 | * that we should be able to recover from, not kernel panic | 253 | * that we should be able to recover from, not kernel panic |
253 | */ | 254 | */ |
254 | if ((bfin_read_IPEND() & 0xFFC0) && (trapnr != VEC_STEP) | 255 | if ((bfin_read_IPEND() & 0xFFC0) && (trapnr != VEC_STEP) |
255 | #ifdef CONFIG_KGDB | 256 | #ifdef CONFIG_KGDB |
256 | && (trapnr != VEC_EXCPT02) | 257 | && (trapnr != VEC_EXCPT02) |
257 | #endif | 258 | #endif |
258 | ){ | 259 | ){ |
259 | console_verbose(); | 260 | console_verbose(); |
260 | oops_in_progress = 1; | 261 | oops_in_progress = 1; |
261 | } else if (current) { | 262 | } else if (current) { |
262 | if (current->mm == NULL) { | 263 | if (current->mm == NULL) { |
263 | console_verbose(); | 264 | console_verbose(); |
264 | oops_in_progress = 1; | 265 | oops_in_progress = 1; |
265 | } | 266 | } |
266 | } | 267 | } |
267 | 268 | ||
268 | /* trap_c() will be called for exceptions. During exceptions | 269 | /* trap_c() will be called for exceptions. During exceptions |
269 | * processing, the pc value should be set with retx value. | 270 | * processing, the pc value should be set with retx value. |
270 | * With this change we can cleanup some code in signal.c- TODO | 271 | * With this change we can cleanup some code in signal.c- TODO |
271 | */ | 272 | */ |
272 | fp->orig_pc = fp->retx; | 273 | fp->orig_pc = fp->retx; |
273 | /* printk("exception: 0x%x, ipend=%x, reti=%x, retx=%x\n", | 274 | /* printk("exception: 0x%x, ipend=%x, reti=%x, retx=%x\n", |
274 | trapnr, fp->ipend, fp->pc, fp->retx); */ | 275 | trapnr, fp->ipend, fp->pc, fp->retx); */ |
275 | 276 | ||
276 | /* send the appropriate signal to the user program */ | 277 | /* send the appropriate signal to the user program */ |
277 | switch (trapnr) { | 278 | switch (trapnr) { |
278 | 279 | ||
279 | /* This table works in conjuction with the one in ./mach-common/entry.S | 280 | /* This table works in conjuction with the one in ./mach-common/entry.S |
280 | * Some exceptions are handled there (in assembly, in exception space) | 281 | * Some exceptions are handled there (in assembly, in exception space) |
281 | * Some are handled here, (in C, in interrupt space) | 282 | * Some are handled here, (in C, in interrupt space) |
282 | * Some, like CPLB, are handled in both, where the normal path is | 283 | * Some, like CPLB, are handled in both, where the normal path is |
283 | * handled in assembly/exception space, and the error path is handled | 284 | * handled in assembly/exception space, and the error path is handled |
284 | * here | 285 | * here |
285 | */ | 286 | */ |
286 | 287 | ||
287 | /* 0x00 - Linux Syscall, getting here is an error */ | 288 | /* 0x00 - Linux Syscall, getting here is an error */ |
288 | /* 0x01 - userspace gdb breakpoint, handled here */ | 289 | /* 0x01 - userspace gdb breakpoint, handled here */ |
289 | case VEC_EXCPT01: | 290 | case VEC_EXCPT01: |
290 | info.si_code = TRAP_ILLTRAP; | 291 | info.si_code = TRAP_ILLTRAP; |
291 | sig = SIGTRAP; | 292 | sig = SIGTRAP; |
292 | CHK_DEBUGGER_TRAP_MAYBE(); | 293 | CHK_DEBUGGER_TRAP_MAYBE(); |
293 | /* Check if this is a breakpoint in kernel space */ | 294 | /* Check if this is a breakpoint in kernel space */ |
294 | if (fp->ipend & 0xffc0) | 295 | if (fp->ipend & 0xffc0) |
295 | return; | 296 | return; |
296 | else | 297 | else |
297 | break; | 298 | break; |
298 | /* 0x03 - User Defined, userspace stack overflow */ | 299 | /* 0x03 - User Defined, userspace stack overflow */ |
299 | case VEC_EXCPT03: | 300 | case VEC_EXCPT03: |
300 | info.si_code = SEGV_STACKFLOW; | 301 | info.si_code = SEGV_STACKFLOW; |
301 | sig = SIGSEGV; | 302 | sig = SIGSEGV; |
302 | verbose_printk(KERN_NOTICE EXC_0x03(KERN_NOTICE)); | 303 | verbose_printk(KERN_NOTICE EXC_0x03(KERN_NOTICE)); |
303 | CHK_DEBUGGER_TRAP_MAYBE(); | 304 | CHK_DEBUGGER_TRAP_MAYBE(); |
304 | break; | 305 | break; |
305 | /* 0x02 - KGDB initial connection and break signal trap */ | 306 | /* 0x02 - KGDB initial connection and break signal trap */ |
306 | case VEC_EXCPT02: | 307 | case VEC_EXCPT02: |
307 | #ifdef CONFIG_KGDB | 308 | #ifdef CONFIG_KGDB |
308 | info.si_code = TRAP_ILLTRAP; | 309 | info.si_code = TRAP_ILLTRAP; |
309 | sig = SIGTRAP; | 310 | sig = SIGTRAP; |
310 | CHK_DEBUGGER_TRAP(); | 311 | CHK_DEBUGGER_TRAP(); |
311 | return; | 312 | return; |
312 | #endif | 313 | #endif |
313 | /* 0x04 - User Defined */ | 314 | /* 0x04 - User Defined */ |
314 | /* 0x05 - User Defined */ | 315 | /* 0x05 - User Defined */ |
315 | /* 0x06 - User Defined */ | 316 | /* 0x06 - User Defined */ |
316 | /* 0x07 - User Defined */ | 317 | /* 0x07 - User Defined */ |
317 | /* 0x08 - User Defined */ | 318 | /* 0x08 - User Defined */ |
318 | /* 0x09 - User Defined */ | 319 | /* 0x09 - User Defined */ |
319 | /* 0x0A - User Defined */ | 320 | /* 0x0A - User Defined */ |
320 | /* 0x0B - User Defined */ | 321 | /* 0x0B - User Defined */ |
321 | /* 0x0C - User Defined */ | 322 | /* 0x0C - User Defined */ |
322 | /* 0x0D - User Defined */ | 323 | /* 0x0D - User Defined */ |
323 | /* 0x0E - User Defined */ | 324 | /* 0x0E - User Defined */ |
324 | /* 0x0F - User Defined */ | 325 | /* 0x0F - User Defined */ |
325 | /* If we got here, it is most likely that someone was trying to use a | 326 | /* If we got here, it is most likely that someone was trying to use a |
326 | * custom exception handler, and it is not actually installed properly | 327 | * custom exception handler, and it is not actually installed properly |
327 | */ | 328 | */ |
328 | case VEC_EXCPT04 ... VEC_EXCPT15: | 329 | case VEC_EXCPT04 ... VEC_EXCPT15: |
329 | info.si_code = ILL_ILLPARAOP; | 330 | info.si_code = ILL_ILLPARAOP; |
330 | sig = SIGILL; | 331 | sig = SIGILL; |
331 | verbose_printk(KERN_NOTICE EXC_0x04(KERN_NOTICE)); | 332 | verbose_printk(KERN_NOTICE EXC_0x04(KERN_NOTICE)); |
332 | CHK_DEBUGGER_TRAP_MAYBE(); | 333 | CHK_DEBUGGER_TRAP_MAYBE(); |
333 | break; | 334 | break; |
334 | /* 0x10 HW Single step, handled here */ | 335 | /* 0x10 HW Single step, handled here */ |
335 | case VEC_STEP: | 336 | case VEC_STEP: |
336 | info.si_code = TRAP_STEP; | 337 | info.si_code = TRAP_STEP; |
337 | sig = SIGTRAP; | 338 | sig = SIGTRAP; |
338 | CHK_DEBUGGER_TRAP_MAYBE(); | 339 | CHK_DEBUGGER_TRAP_MAYBE(); |
339 | /* Check if this is a single step in kernel space */ | 340 | /* Check if this is a single step in kernel space */ |
340 | if (fp->ipend & 0xffc0) | 341 | if (fp->ipend & 0xffc0) |
341 | return; | 342 | return; |
342 | else | 343 | else |
343 | break; | 344 | break; |
344 | /* 0x11 - Trace Buffer Full, handled here */ | 345 | /* 0x11 - Trace Buffer Full, handled here */ |
345 | case VEC_OVFLOW: | 346 | case VEC_OVFLOW: |
346 | info.si_code = TRAP_TRACEFLOW; | 347 | info.si_code = TRAP_TRACEFLOW; |
347 | sig = SIGTRAP; | 348 | sig = SIGTRAP; |
348 | verbose_printk(KERN_NOTICE EXC_0x11(KERN_NOTICE)); | 349 | verbose_printk(KERN_NOTICE EXC_0x11(KERN_NOTICE)); |
349 | CHK_DEBUGGER_TRAP_MAYBE(); | 350 | CHK_DEBUGGER_TRAP_MAYBE(); |
350 | break; | 351 | break; |
351 | /* 0x12 - Reserved, Caught by default */ | 352 | /* 0x12 - Reserved, Caught by default */ |
352 | /* 0x13 - Reserved, Caught by default */ | 353 | /* 0x13 - Reserved, Caught by default */ |
353 | /* 0x14 - Reserved, Caught by default */ | 354 | /* 0x14 - Reserved, Caught by default */ |
354 | /* 0x15 - Reserved, Caught by default */ | 355 | /* 0x15 - Reserved, Caught by default */ |
355 | /* 0x16 - Reserved, Caught by default */ | 356 | /* 0x16 - Reserved, Caught by default */ |
356 | /* 0x17 - Reserved, Caught by default */ | 357 | /* 0x17 - Reserved, Caught by default */ |
357 | /* 0x18 - Reserved, Caught by default */ | 358 | /* 0x18 - Reserved, Caught by default */ |
358 | /* 0x19 - Reserved, Caught by default */ | 359 | /* 0x19 - Reserved, Caught by default */ |
359 | /* 0x1A - Reserved, Caught by default */ | 360 | /* 0x1A - Reserved, Caught by default */ |
360 | /* 0x1B - Reserved, Caught by default */ | 361 | /* 0x1B - Reserved, Caught by default */ |
361 | /* 0x1C - Reserved, Caught by default */ | 362 | /* 0x1C - Reserved, Caught by default */ |
362 | /* 0x1D - Reserved, Caught by default */ | 363 | /* 0x1D - Reserved, Caught by default */ |
363 | /* 0x1E - Reserved, Caught by default */ | 364 | /* 0x1E - Reserved, Caught by default */ |
364 | /* 0x1F - Reserved, Caught by default */ | 365 | /* 0x1F - Reserved, Caught by default */ |
365 | /* 0x20 - Reserved, Caught by default */ | 366 | /* 0x20 - Reserved, Caught by default */ |
366 | /* 0x21 - Undefined Instruction, handled here */ | 367 | /* 0x21 - Undefined Instruction, handled here */ |
367 | case VEC_UNDEF_I: | 368 | case VEC_UNDEF_I: |
368 | info.si_code = ILL_ILLOPC; | 369 | info.si_code = ILL_ILLOPC; |
369 | sig = SIGILL; | 370 | sig = SIGILL; |
370 | verbose_printk(KERN_NOTICE EXC_0x21(KERN_NOTICE)); | 371 | verbose_printk(KERN_NOTICE EXC_0x21(KERN_NOTICE)); |
371 | CHK_DEBUGGER_TRAP_MAYBE(); | 372 | CHK_DEBUGGER_TRAP_MAYBE(); |
372 | break; | 373 | break; |
373 | /* 0x22 - Illegal Instruction Combination, handled here */ | 374 | /* 0x22 - Illegal Instruction Combination, handled here */ |
374 | case VEC_ILGAL_I: | 375 | case VEC_ILGAL_I: |
375 | info.si_code = ILL_ILLPARAOP; | 376 | info.si_code = ILL_ILLPARAOP; |
376 | sig = SIGILL; | 377 | sig = SIGILL; |
377 | verbose_printk(KERN_NOTICE EXC_0x22(KERN_NOTICE)); | 378 | verbose_printk(KERN_NOTICE EXC_0x22(KERN_NOTICE)); |
378 | CHK_DEBUGGER_TRAP_MAYBE(); | 379 | CHK_DEBUGGER_TRAP_MAYBE(); |
379 | break; | 380 | break; |
380 | /* 0x23 - Data CPLB protection violation, handled here */ | 381 | /* 0x23 - Data CPLB protection violation, handled here */ |
381 | case VEC_CPLB_VL: | 382 | case VEC_CPLB_VL: |
382 | info.si_code = ILL_CPLB_VI; | 383 | info.si_code = ILL_CPLB_VI; |
383 | sig = SIGBUS; | 384 | sig = SIGBUS; |
384 | verbose_printk(KERN_NOTICE EXC_0x23(KERN_NOTICE)); | 385 | verbose_printk(KERN_NOTICE EXC_0x23(KERN_NOTICE)); |
385 | CHK_DEBUGGER_TRAP_MAYBE(); | 386 | CHK_DEBUGGER_TRAP_MAYBE(); |
386 | break; | 387 | break; |
387 | /* 0x24 - Data access misaligned, handled here */ | 388 | /* 0x24 - Data access misaligned, handled here */ |
388 | case VEC_MISALI_D: | 389 | case VEC_MISALI_D: |
389 | info.si_code = BUS_ADRALN; | 390 | info.si_code = BUS_ADRALN; |
390 | sig = SIGBUS; | 391 | sig = SIGBUS; |
391 | verbose_printk(KERN_NOTICE EXC_0x24(KERN_NOTICE)); | 392 | verbose_printk(KERN_NOTICE EXC_0x24(KERN_NOTICE)); |
392 | CHK_DEBUGGER_TRAP_MAYBE(); | 393 | CHK_DEBUGGER_TRAP_MAYBE(); |
393 | break; | 394 | break; |
394 | /* 0x25 - Unrecoverable Event, handled here */ | 395 | /* 0x25 - Unrecoverable Event, handled here */ |
395 | case VEC_UNCOV: | 396 | case VEC_UNCOV: |
396 | info.si_code = ILL_ILLEXCPT; | 397 | info.si_code = ILL_ILLEXCPT; |
397 | sig = SIGILL; | 398 | sig = SIGILL; |
398 | verbose_printk(KERN_NOTICE EXC_0x25(KERN_NOTICE)); | 399 | verbose_printk(KERN_NOTICE EXC_0x25(KERN_NOTICE)); |
399 | CHK_DEBUGGER_TRAP_MAYBE(); | 400 | CHK_DEBUGGER_TRAP_MAYBE(); |
400 | break; | 401 | break; |
401 | /* 0x26 - Data CPLB Miss, normal case is handled in _cplb_hdr, | 402 | /* 0x26 - Data CPLB Miss, normal case is handled in _cplb_hdr, |
402 | error case is handled here */ | 403 | error case is handled here */ |
403 | case VEC_CPLB_M: | 404 | case VEC_CPLB_M: |
404 | info.si_code = BUS_ADRALN; | 405 | info.si_code = BUS_ADRALN; |
405 | sig = SIGBUS; | 406 | sig = SIGBUS; |
406 | verbose_printk(KERN_NOTICE EXC_0x26(KERN_NOTICE)); | 407 | verbose_printk(KERN_NOTICE EXC_0x26(KERN_NOTICE)); |
407 | break; | 408 | break; |
408 | /* 0x27 - Data CPLB Multiple Hits - Linux Trap Zero, handled here */ | 409 | /* 0x27 - Data CPLB Multiple Hits - Linux Trap Zero, handled here */ |
409 | case VEC_CPLB_MHIT: | 410 | case VEC_CPLB_MHIT: |
410 | info.si_code = ILL_CPLB_MULHIT; | 411 | info.si_code = ILL_CPLB_MULHIT; |
411 | sig = SIGSEGV; | 412 | sig = SIGSEGV; |
412 | #ifdef CONFIG_DEBUG_HUNT_FOR_ZERO | 413 | #ifdef CONFIG_DEBUG_HUNT_FOR_ZERO |
413 | if (cpu_pda[cpu].dcplb_fault_addr < FIXED_CODE_START) | 414 | if (cpu_pda[cpu].dcplb_fault_addr < FIXED_CODE_START) |
414 | verbose_printk(KERN_NOTICE "NULL pointer access\n"); | 415 | verbose_printk(KERN_NOTICE "NULL pointer access\n"); |
415 | else | 416 | else |
416 | #endif | 417 | #endif |
417 | verbose_printk(KERN_NOTICE EXC_0x27(KERN_NOTICE)); | 418 | verbose_printk(KERN_NOTICE EXC_0x27(KERN_NOTICE)); |
418 | CHK_DEBUGGER_TRAP_MAYBE(); | 419 | CHK_DEBUGGER_TRAP_MAYBE(); |
419 | break; | 420 | break; |
420 | /* 0x28 - Emulation Watchpoint, handled here */ | 421 | /* 0x28 - Emulation Watchpoint, handled here */ |
421 | case VEC_WATCH: | 422 | case VEC_WATCH: |
422 | info.si_code = TRAP_WATCHPT; | 423 | info.si_code = TRAP_WATCHPT; |
423 | sig = SIGTRAP; | 424 | sig = SIGTRAP; |
424 | pr_debug(EXC_0x28(KERN_DEBUG)); | 425 | pr_debug(EXC_0x28(KERN_DEBUG)); |
425 | CHK_DEBUGGER_TRAP_MAYBE(); | 426 | CHK_DEBUGGER_TRAP_MAYBE(); |
426 | /* Check if this is a watchpoint in kernel space */ | 427 | /* Check if this is a watchpoint in kernel space */ |
427 | if (fp->ipend & 0xffc0) | 428 | if (fp->ipend & 0xffc0) |
428 | return; | 429 | return; |
429 | else | 430 | else |
430 | break; | 431 | break; |
431 | #ifdef CONFIG_BF535 | 432 | #ifdef CONFIG_BF535 |
432 | /* 0x29 - Instruction fetch access error (535 only) */ | 433 | /* 0x29 - Instruction fetch access error (535 only) */ |
433 | case VEC_ISTRU_VL: /* ADSP-BF535 only (MH) */ | 434 | case VEC_ISTRU_VL: /* ADSP-BF535 only (MH) */ |
434 | info.si_code = BUS_OPFETCH; | 435 | info.si_code = BUS_OPFETCH; |
435 | sig = SIGBUS; | 436 | sig = SIGBUS; |
436 | verbose_printk(KERN_NOTICE "BF535: VEC_ISTRU_VL\n"); | 437 | verbose_printk(KERN_NOTICE "BF535: VEC_ISTRU_VL\n"); |
437 | CHK_DEBUGGER_TRAP_MAYBE(); | 438 | CHK_DEBUGGER_TRAP_MAYBE(); |
438 | break; | 439 | break; |
439 | #else | 440 | #else |
440 | /* 0x29 - Reserved, Caught by default */ | 441 | /* 0x29 - Reserved, Caught by default */ |
441 | #endif | 442 | #endif |
442 | /* 0x2A - Instruction fetch misaligned, handled here */ | 443 | /* 0x2A - Instruction fetch misaligned, handled here */ |
443 | case VEC_MISALI_I: | 444 | case VEC_MISALI_I: |
444 | info.si_code = BUS_ADRALN; | 445 | info.si_code = BUS_ADRALN; |
445 | sig = SIGBUS; | 446 | sig = SIGBUS; |
446 | verbose_printk(KERN_NOTICE EXC_0x2A(KERN_NOTICE)); | 447 | verbose_printk(KERN_NOTICE EXC_0x2A(KERN_NOTICE)); |
447 | CHK_DEBUGGER_TRAP_MAYBE(); | 448 | CHK_DEBUGGER_TRAP_MAYBE(); |
448 | break; | 449 | break; |
449 | /* 0x2B - Instruction CPLB protection violation, handled here */ | 450 | /* 0x2B - Instruction CPLB protection violation, handled here */ |
450 | case VEC_CPLB_I_VL: | 451 | case VEC_CPLB_I_VL: |
451 | info.si_code = ILL_CPLB_VI; | 452 | info.si_code = ILL_CPLB_VI; |
452 | sig = SIGBUS; | 453 | sig = SIGBUS; |
453 | verbose_printk(KERN_NOTICE EXC_0x2B(KERN_NOTICE)); | 454 | verbose_printk(KERN_NOTICE EXC_0x2B(KERN_NOTICE)); |
454 | CHK_DEBUGGER_TRAP_MAYBE(); | 455 | CHK_DEBUGGER_TRAP_MAYBE(); |
455 | break; | 456 | break; |
456 | /* 0x2C - Instruction CPLB miss, handled in _cplb_hdr */ | 457 | /* 0x2C - Instruction CPLB miss, handled in _cplb_hdr */ |
457 | case VEC_CPLB_I_M: | 458 | case VEC_CPLB_I_M: |
458 | info.si_code = ILL_CPLB_MISS; | 459 | info.si_code = ILL_CPLB_MISS; |
459 | sig = SIGBUS; | 460 | sig = SIGBUS; |
460 | verbose_printk(KERN_NOTICE EXC_0x2C(KERN_NOTICE)); | 461 | verbose_printk(KERN_NOTICE EXC_0x2C(KERN_NOTICE)); |
461 | break; | 462 | break; |
462 | /* 0x2D - Instruction CPLB Multiple Hits, handled here */ | 463 | /* 0x2D - Instruction CPLB Multiple Hits, handled here */ |
463 | case VEC_CPLB_I_MHIT: | 464 | case VEC_CPLB_I_MHIT: |
464 | info.si_code = ILL_CPLB_MULHIT; | 465 | info.si_code = ILL_CPLB_MULHIT; |
465 | sig = SIGSEGV; | 466 | sig = SIGSEGV; |
466 | #ifdef CONFIG_DEBUG_HUNT_FOR_ZERO | 467 | #ifdef CONFIG_DEBUG_HUNT_FOR_ZERO |
467 | if (cpu_pda[cpu].icplb_fault_addr < FIXED_CODE_START) | 468 | if (cpu_pda[cpu].icplb_fault_addr < FIXED_CODE_START) |
468 | verbose_printk(KERN_NOTICE "Jump to NULL address\n"); | 469 | verbose_printk(KERN_NOTICE "Jump to NULL address\n"); |
469 | else | 470 | else |
470 | #endif | 471 | #endif |
471 | verbose_printk(KERN_NOTICE EXC_0x2D(KERN_NOTICE)); | 472 | verbose_printk(KERN_NOTICE EXC_0x2D(KERN_NOTICE)); |
472 | CHK_DEBUGGER_TRAP_MAYBE(); | 473 | CHK_DEBUGGER_TRAP_MAYBE(); |
473 | break; | 474 | break; |
474 | /* 0x2E - Illegal use of Supervisor Resource, handled here */ | 475 | /* 0x2E - Illegal use of Supervisor Resource, handled here */ |
475 | case VEC_ILL_RES: | 476 | case VEC_ILL_RES: |
476 | info.si_code = ILL_PRVOPC; | 477 | info.si_code = ILL_PRVOPC; |
477 | sig = SIGILL; | 478 | sig = SIGILL; |
478 | verbose_printk(KERN_NOTICE EXC_0x2E(KERN_NOTICE)); | 479 | verbose_printk(KERN_NOTICE EXC_0x2E(KERN_NOTICE)); |
479 | CHK_DEBUGGER_TRAP_MAYBE(); | 480 | CHK_DEBUGGER_TRAP_MAYBE(); |
480 | break; | 481 | break; |
481 | /* 0x2F - Reserved, Caught by default */ | 482 | /* 0x2F - Reserved, Caught by default */ |
482 | /* 0x30 - Reserved, Caught by default */ | 483 | /* 0x30 - Reserved, Caught by default */ |
483 | /* 0x31 - Reserved, Caught by default */ | 484 | /* 0x31 - Reserved, Caught by default */ |
484 | /* 0x32 - Reserved, Caught by default */ | 485 | /* 0x32 - Reserved, Caught by default */ |
485 | /* 0x33 - Reserved, Caught by default */ | 486 | /* 0x33 - Reserved, Caught by default */ |
486 | /* 0x34 - Reserved, Caught by default */ | 487 | /* 0x34 - Reserved, Caught by default */ |
487 | /* 0x35 - Reserved, Caught by default */ | 488 | /* 0x35 - Reserved, Caught by default */ |
488 | /* 0x36 - Reserved, Caught by default */ | 489 | /* 0x36 - Reserved, Caught by default */ |
489 | /* 0x37 - Reserved, Caught by default */ | 490 | /* 0x37 - Reserved, Caught by default */ |
490 | /* 0x38 - Reserved, Caught by default */ | 491 | /* 0x38 - Reserved, Caught by default */ |
491 | /* 0x39 - Reserved, Caught by default */ | 492 | /* 0x39 - Reserved, Caught by default */ |
492 | /* 0x3A - Reserved, Caught by default */ | 493 | /* 0x3A - Reserved, Caught by default */ |
493 | /* 0x3B - Reserved, Caught by default */ | 494 | /* 0x3B - Reserved, Caught by default */ |
494 | /* 0x3C - Reserved, Caught by default */ | 495 | /* 0x3C - Reserved, Caught by default */ |
495 | /* 0x3D - Reserved, Caught by default */ | 496 | /* 0x3D - Reserved, Caught by default */ |
496 | /* 0x3E - Reserved, Caught by default */ | 497 | /* 0x3E - Reserved, Caught by default */ |
497 | /* 0x3F - Reserved, Caught by default */ | 498 | /* 0x3F - Reserved, Caught by default */ |
498 | case VEC_HWERR: | 499 | case VEC_HWERR: |
499 | info.si_code = BUS_ADRALN; | 500 | info.si_code = BUS_ADRALN; |
500 | sig = SIGBUS; | 501 | sig = SIGBUS; |
501 | switch (fp->seqstat & SEQSTAT_HWERRCAUSE) { | 502 | switch (fp->seqstat & SEQSTAT_HWERRCAUSE) { |
502 | /* System MMR Error */ | 503 | /* System MMR Error */ |
503 | case (SEQSTAT_HWERRCAUSE_SYSTEM_MMR): | 504 | case (SEQSTAT_HWERRCAUSE_SYSTEM_MMR): |
504 | info.si_code = BUS_ADRALN; | 505 | info.si_code = BUS_ADRALN; |
505 | sig = SIGBUS; | 506 | sig = SIGBUS; |
506 | verbose_printk(KERN_NOTICE HWC_x2(KERN_NOTICE)); | 507 | verbose_printk(KERN_NOTICE HWC_x2(KERN_NOTICE)); |
507 | break; | 508 | break; |
508 | /* External Memory Addressing Error */ | 509 | /* External Memory Addressing Error */ |
509 | case (SEQSTAT_HWERRCAUSE_EXTERN_ADDR): | 510 | case (SEQSTAT_HWERRCAUSE_EXTERN_ADDR): |
510 | info.si_code = BUS_ADRERR; | 511 | info.si_code = BUS_ADRERR; |
511 | sig = SIGBUS; | 512 | sig = SIGBUS; |
512 | verbose_printk(KERN_NOTICE HWC_x3(KERN_NOTICE)); | 513 | verbose_printk(KERN_NOTICE HWC_x3(KERN_NOTICE)); |
513 | break; | 514 | break; |
514 | /* Performance Monitor Overflow */ | 515 | /* Performance Monitor Overflow */ |
515 | case (SEQSTAT_HWERRCAUSE_PERF_FLOW): | 516 | case (SEQSTAT_HWERRCAUSE_PERF_FLOW): |
516 | verbose_printk(KERN_NOTICE HWC_x12(KERN_NOTICE)); | 517 | verbose_printk(KERN_NOTICE HWC_x12(KERN_NOTICE)); |
517 | break; | 518 | break; |
518 | /* RAISE 5 instruction */ | 519 | /* RAISE 5 instruction */ |
519 | case (SEQSTAT_HWERRCAUSE_RAISE_5): | 520 | case (SEQSTAT_HWERRCAUSE_RAISE_5): |
520 | printk(KERN_NOTICE HWC_x18(KERN_NOTICE)); | 521 | printk(KERN_NOTICE HWC_x18(KERN_NOTICE)); |
521 | break; | 522 | break; |
522 | default: /* Reserved */ | 523 | default: /* Reserved */ |
523 | printk(KERN_NOTICE HWC_default(KERN_NOTICE)); | 524 | printk(KERN_NOTICE HWC_default(KERN_NOTICE)); |
524 | break; | 525 | break; |
525 | } | 526 | } |
526 | CHK_DEBUGGER_TRAP_MAYBE(); | 527 | CHK_DEBUGGER_TRAP_MAYBE(); |
527 | break; | 528 | break; |
528 | /* | 529 | /* |
529 | * We should be handling all known exception types above, | 530 | * We should be handling all known exception types above, |
530 | * if we get here we hit a reserved one, so panic | 531 | * if we get here we hit a reserved one, so panic |
531 | */ | 532 | */ |
532 | default: | 533 | default: |
533 | oops_in_progress = 1; | 534 | oops_in_progress = 1; |
534 | info.si_code = ILL_ILLPARAOP; | 535 | info.si_code = ILL_ILLPARAOP; |
535 | sig = SIGILL; | 536 | sig = SIGILL; |
536 | verbose_printk(KERN_EMERG "Caught Unhandled Exception, code = %08lx\n", | 537 | verbose_printk(KERN_EMERG "Caught Unhandled Exception, code = %08lx\n", |
537 | (fp->seqstat & SEQSTAT_EXCAUSE)); | 538 | (fp->seqstat & SEQSTAT_EXCAUSE)); |
538 | CHK_DEBUGGER_TRAP_MAYBE(); | 539 | CHK_DEBUGGER_TRAP_MAYBE(); |
539 | break; | 540 | break; |
540 | } | 541 | } |
541 | 542 | ||
542 | BUG_ON(sig == 0); | 543 | BUG_ON(sig == 0); |
543 | 544 | ||
544 | if (sig != SIGTRAP) { | 545 | if (sig != SIGTRAP) { |
545 | dump_bfin_process(fp); | 546 | dump_bfin_process(fp); |
546 | dump_bfin_mem(fp); | 547 | dump_bfin_mem(fp); |
547 | show_regs(fp); | 548 | show_regs(fp); |
548 | 549 | ||
549 | /* Print out the trace buffer if it makes sense */ | 550 | /* Print out the trace buffer if it makes sense */ |
550 | #ifndef CONFIG_DEBUG_BFIN_NO_KERN_HWTRACE | 551 | #ifndef CONFIG_DEBUG_BFIN_NO_KERN_HWTRACE |
551 | if (trapnr == VEC_CPLB_I_M || trapnr == VEC_CPLB_M) | 552 | if (trapnr == VEC_CPLB_I_M || trapnr == VEC_CPLB_M) |
552 | verbose_printk(KERN_NOTICE "No trace since you do not have " | 553 | verbose_printk(KERN_NOTICE "No trace since you do not have " |
553 | "CONFIG_DEBUG_BFIN_NO_KERN_HWTRACE enabled\n" | 554 | "CONFIG_DEBUG_BFIN_NO_KERN_HWTRACE enabled\n" |
554 | KERN_NOTICE "\n"); | 555 | KERN_NOTICE "\n"); |
555 | else | 556 | else |
556 | #endif | 557 | #endif |
557 | dump_bfin_trace_buffer(); | 558 | dump_bfin_trace_buffer(); |
558 | 559 | ||
559 | if (oops_in_progress) { | 560 | if (oops_in_progress) { |
560 | /* Dump the current kernel stack */ | 561 | /* Dump the current kernel stack */ |
561 | verbose_printk(KERN_NOTICE "\n" KERN_NOTICE "Kernel Stack\n"); | 562 | verbose_printk(KERN_NOTICE "\n" KERN_NOTICE "Kernel Stack\n"); |
562 | show_stack(current, NULL); | 563 | show_stack(current, NULL); |
563 | print_modules(); | 564 | print_modules(); |
564 | #ifndef CONFIG_ACCESS_CHECK | 565 | #ifndef CONFIG_ACCESS_CHECK |
565 | verbose_printk(KERN_EMERG "Please turn on " | 566 | verbose_printk(KERN_EMERG "Please turn on " |
566 | "CONFIG_ACCESS_CHECK\n"); | 567 | "CONFIG_ACCESS_CHECK\n"); |
567 | #endif | 568 | #endif |
568 | panic("Kernel exception"); | 569 | panic("Kernel exception"); |
569 | } else { | 570 | } else { |
570 | #ifdef CONFIG_DEBUG_VERBOSE | 571 | #ifdef CONFIG_DEBUG_VERBOSE |
571 | unsigned long *stack; | 572 | unsigned long *stack; |
572 | /* Dump the user space stack */ | 573 | /* Dump the user space stack */ |
573 | stack = (unsigned long *)rdusp(); | 574 | stack = (unsigned long *)rdusp(); |
574 | verbose_printk(KERN_NOTICE "Userspace Stack\n"); | 575 | verbose_printk(KERN_NOTICE "Userspace Stack\n"); |
575 | show_stack(NULL, stack); | 576 | show_stack(NULL, stack); |
576 | #endif | 577 | #endif |
577 | } | 578 | } |
578 | } | 579 | } |
579 | 580 | ||
580 | #ifdef CONFIG_IPIPE | 581 | #ifdef CONFIG_IPIPE |
581 | if (!ipipe_trap_notify(fp->seqstat & 0x3f, fp)) | 582 | if (!ipipe_trap_notify(fp->seqstat & 0x3f, fp)) |
582 | #endif | 583 | #endif |
583 | { | 584 | { |
584 | info.si_signo = sig; | 585 | info.si_signo = sig; |
585 | info.si_errno = 0; | 586 | info.si_errno = 0; |
586 | info.si_addr = (void __user *)fp->pc; | 587 | info.si_addr = (void __user *)fp->pc; |
587 | force_sig_info(sig, &info, current); | 588 | force_sig_info(sig, &info, current); |
588 | } | 589 | } |
589 | 590 | ||
590 | trace_buffer_restore(j); | 591 | trace_buffer_restore(j); |
591 | return; | 592 | return; |
592 | } | 593 | } |
593 | 594 | ||
594 | /* Typical exception handling routines */ | 595 | /* Typical exception handling routines */ |
595 | 596 | ||
596 | #define EXPAND_LEN ((1 << CONFIG_DEBUG_BFIN_HWTRACE_EXPAND_LEN) * 256 - 1) | 597 | #define EXPAND_LEN ((1 << CONFIG_DEBUG_BFIN_HWTRACE_EXPAND_LEN) * 256 - 1) |
597 | 598 | ||
598 | /* | 599 | /* |
599 | * Similar to get_user, do some address checking, then dereference | 600 | * Similar to get_user, do some address checking, then dereference |
600 | * Return true on sucess, false on bad address | 601 | * Return true on sucess, false on bad address |
601 | */ | 602 | */ |
602 | static bool get_instruction(unsigned short *val, unsigned short *address) | 603 | static bool get_instruction(unsigned short *val, unsigned short *address) |
603 | { | 604 | { |
604 | 605 | ||
605 | unsigned long addr; | 606 | unsigned long addr; |
606 | 607 | ||
607 | addr = (unsigned long)address; | 608 | addr = (unsigned long)address; |
608 | 609 | ||
609 | /* Check for odd addresses */ | 610 | /* Check for odd addresses */ |
610 | if (addr & 0x1) | 611 | if (addr & 0x1) |
611 | return false; | 612 | return false; |
612 | 613 | ||
613 | /* Check that things do not wrap around */ | 614 | /* Check that things do not wrap around */ |
614 | if (addr > (addr + 2)) | 615 | if (addr > (addr + 2)) |
615 | return false; | 616 | return false; |
616 | 617 | ||
617 | /* | 618 | /* |
618 | * Since we are in exception context, we need to do a little address checking | 619 | * Since we are in exception context, we need to do a little address checking |
619 | * We need to make sure we are only accessing valid memory, and | 620 | * We need to make sure we are only accessing valid memory, and |
620 | * we don't read something in the async space that can hang forever | 621 | * we don't read something in the async space that can hang forever |
621 | */ | 622 | */ |
622 | if ((addr >= FIXED_CODE_START && (addr + 2) <= physical_mem_end) || | 623 | if ((addr >= FIXED_CODE_START && (addr + 2) <= physical_mem_end) || |
623 | #if L2_LENGTH != 0 | 624 | #if L2_LENGTH != 0 |
624 | (addr >= L2_START && (addr + 2) <= (L2_START + L2_LENGTH)) || | 625 | (addr >= L2_START && (addr + 2) <= (L2_START + L2_LENGTH)) || |
625 | #endif | 626 | #endif |
626 | (addr >= BOOT_ROM_START && (addr + 2) <= (BOOT_ROM_START + BOOT_ROM_LENGTH)) || | 627 | (addr >= BOOT_ROM_START && (addr + 2) <= (BOOT_ROM_START + BOOT_ROM_LENGTH)) || |
627 | #if L1_DATA_A_LENGTH != 0 | 628 | #if L1_DATA_A_LENGTH != 0 |
628 | (addr >= L1_DATA_A_START && (addr + 2) <= (L1_DATA_A_START + L1_DATA_A_LENGTH)) || | 629 | (addr >= L1_DATA_A_START && (addr + 2) <= (L1_DATA_A_START + L1_DATA_A_LENGTH)) || |
629 | #endif | 630 | #endif |
630 | #if L1_DATA_B_LENGTH != 0 | 631 | #if L1_DATA_B_LENGTH != 0 |
631 | (addr >= L1_DATA_B_START && (addr + 2) <= (L1_DATA_B_START + L1_DATA_B_LENGTH)) || | 632 | (addr >= L1_DATA_B_START && (addr + 2) <= (L1_DATA_B_START + L1_DATA_B_LENGTH)) || |
632 | #endif | 633 | #endif |
633 | (addr >= L1_SCRATCH_START && (addr + 2) <= (L1_SCRATCH_START + L1_SCRATCH_LENGTH)) || | 634 | (addr >= L1_SCRATCH_START && (addr + 2) <= (L1_SCRATCH_START + L1_SCRATCH_LENGTH)) || |
634 | (!(bfin_read_EBIU_AMBCTL0() & B0RDYEN) && | 635 | (!(bfin_read_EBIU_AMBCTL0() & B0RDYEN) && |
635 | addr >= ASYNC_BANK0_BASE && (addr + 2) <= (ASYNC_BANK0_BASE + ASYNC_BANK0_SIZE)) || | 636 | addr >= ASYNC_BANK0_BASE && (addr + 2) <= (ASYNC_BANK0_BASE + ASYNC_BANK0_SIZE)) || |
636 | (!(bfin_read_EBIU_AMBCTL0() & B1RDYEN) && | 637 | (!(bfin_read_EBIU_AMBCTL0() & B1RDYEN) && |
637 | addr >= ASYNC_BANK1_BASE && (addr + 2) <= (ASYNC_BANK1_BASE + ASYNC_BANK1_SIZE)) || | 638 | addr >= ASYNC_BANK1_BASE && (addr + 2) <= (ASYNC_BANK1_BASE + ASYNC_BANK1_SIZE)) || |
638 | (!(bfin_read_EBIU_AMBCTL1() & B2RDYEN) && | 639 | (!(bfin_read_EBIU_AMBCTL1() & B2RDYEN) && |
639 | addr >= ASYNC_BANK2_BASE && (addr + 2) <= (ASYNC_BANK2_BASE + ASYNC_BANK1_SIZE)) || | 640 | addr >= ASYNC_BANK2_BASE && (addr + 2) <= (ASYNC_BANK2_BASE + ASYNC_BANK1_SIZE)) || |
640 | (!(bfin_read_EBIU_AMBCTL1() & B3RDYEN) && | 641 | (!(bfin_read_EBIU_AMBCTL1() & B3RDYEN) && |
641 | addr >= ASYNC_BANK3_BASE && (addr + 2) <= (ASYNC_BANK3_BASE + ASYNC_BANK1_SIZE))) { | 642 | addr >= ASYNC_BANK3_BASE && (addr + 2) <= (ASYNC_BANK3_BASE + ASYNC_BANK1_SIZE))) { |
642 | *val = *address; | 643 | *val = *address; |
643 | return true; | 644 | return true; |
644 | } | 645 | } |
645 | 646 | ||
646 | #if L1_CODE_LENGTH != 0 | 647 | #if L1_CODE_LENGTH != 0 |
647 | if (addr >= L1_CODE_START && (addr + 2) <= (L1_CODE_START + L1_CODE_LENGTH)) { | 648 | if (addr >= L1_CODE_START && (addr + 2) <= (L1_CODE_START + L1_CODE_LENGTH)) { |
648 | isram_memcpy(val, address, 2); | 649 | isram_memcpy(val, address, 2); |
649 | return true; | 650 | return true; |
650 | } | 651 | } |
651 | #endif | 652 | #endif |
652 | 653 | ||
653 | 654 | ||
654 | return false; | 655 | return false; |
655 | } | 656 | } |
656 | 657 | ||
657 | /* | 658 | /* |
658 | * decode the instruction if we are printing out the trace, as it | 659 | * decode the instruction if we are printing out the trace, as it |
659 | * makes things easier to follow, without running it through objdump | 660 | * makes things easier to follow, without running it through objdump |
660 | * These are the normal instructions which cause change of flow, which | 661 | * These are the normal instructions which cause change of flow, which |
661 | * would be at the source of the trace buffer | 662 | * would be at the source of the trace buffer |
662 | */ | 663 | */ |
663 | #if defined(CONFIG_DEBUG_VERBOSE) && defined(CONFIG_DEBUG_BFIN_HWTRACE_ON) | 664 | #if defined(CONFIG_DEBUG_VERBOSE) && defined(CONFIG_DEBUG_BFIN_HWTRACE_ON) |
664 | static void decode_instruction(unsigned short *address) | 665 | static void decode_instruction(unsigned short *address) |
665 | { | 666 | { |
666 | unsigned short opcode; | 667 | unsigned short opcode; |
667 | 668 | ||
668 | if (get_instruction(&opcode, address)) { | 669 | if (get_instruction(&opcode, address)) { |
669 | if (opcode == 0x0010) | 670 | if (opcode == 0x0010) |
670 | verbose_printk("RTS"); | 671 | verbose_printk("RTS"); |
671 | else if (opcode == 0x0011) | 672 | else if (opcode == 0x0011) |
672 | verbose_printk("RTI"); | 673 | verbose_printk("RTI"); |
673 | else if (opcode == 0x0012) | 674 | else if (opcode == 0x0012) |
674 | verbose_printk("RTX"); | 675 | verbose_printk("RTX"); |
675 | else if (opcode >= 0x0050 && opcode <= 0x0057) | 676 | else if (opcode >= 0x0050 && opcode <= 0x0057) |
676 | verbose_printk("JUMP (P%i)", opcode & 7); | 677 | verbose_printk("JUMP (P%i)", opcode & 7); |
677 | else if (opcode >= 0x0060 && opcode <= 0x0067) | 678 | else if (opcode >= 0x0060 && opcode <= 0x0067) |
678 | verbose_printk("CALL (P%i)", opcode & 7); | 679 | verbose_printk("CALL (P%i)", opcode & 7); |
679 | else if (opcode >= 0x0070 && opcode <= 0x0077) | 680 | else if (opcode >= 0x0070 && opcode <= 0x0077) |
680 | verbose_printk("CALL (PC+P%i)", opcode & 7); | 681 | verbose_printk("CALL (PC+P%i)", opcode & 7); |
681 | else if (opcode >= 0x0080 && opcode <= 0x0087) | 682 | else if (opcode >= 0x0080 && opcode <= 0x0087) |
682 | verbose_printk("JUMP (PC+P%i)", opcode & 7); | 683 | verbose_printk("JUMP (PC+P%i)", opcode & 7); |
683 | else if ((opcode >= 0x1000 && opcode <= 0x13FF) || (opcode >= 0x1800 && opcode <= 0x1BFF)) | 684 | else if ((opcode >= 0x1000 && opcode <= 0x13FF) || (opcode >= 0x1800 && opcode <= 0x1BFF)) |
684 | verbose_printk("IF !CC JUMP"); | 685 | verbose_printk("IF !CC JUMP"); |
685 | else if ((opcode >= 0x1400 && opcode <= 0x17ff) || (opcode >= 0x1c00 && opcode <= 0x1fff)) | 686 | else if ((opcode >= 0x1400 && opcode <= 0x17ff) || (opcode >= 0x1c00 && opcode <= 0x1fff)) |
686 | verbose_printk("IF CC JUMP"); | 687 | verbose_printk("IF CC JUMP"); |
687 | else if (opcode >= 0x2000 && opcode <= 0x2fff) | 688 | else if (opcode >= 0x2000 && opcode <= 0x2fff) |
688 | verbose_printk("JUMP.S"); | 689 | verbose_printk("JUMP.S"); |
689 | else if (opcode >= 0xe080 && opcode <= 0xe0ff) | 690 | else if (opcode >= 0xe080 && opcode <= 0xe0ff) |
690 | verbose_printk("LSETUP"); | 691 | verbose_printk("LSETUP"); |
691 | else if (opcode >= 0xe200 && opcode <= 0xe2ff) | 692 | else if (opcode >= 0xe200 && opcode <= 0xe2ff) |
692 | verbose_printk("JUMP.L"); | 693 | verbose_printk("JUMP.L"); |
693 | else if (opcode >= 0xe300 && opcode <= 0xe3ff) | 694 | else if (opcode >= 0xe300 && opcode <= 0xe3ff) |
694 | verbose_printk("CALL pcrel"); | 695 | verbose_printk("CALL pcrel"); |
695 | else | 696 | else |
696 | verbose_printk("0x%04x", opcode); | 697 | verbose_printk("0x%04x", opcode); |
697 | } | 698 | } |
698 | 699 | ||
699 | } | 700 | } |
700 | #endif | 701 | #endif |
701 | 702 | ||
702 | void dump_bfin_trace_buffer(void) | 703 | void dump_bfin_trace_buffer(void) |
703 | { | 704 | { |
704 | #ifdef CONFIG_DEBUG_VERBOSE | 705 | #ifdef CONFIG_DEBUG_VERBOSE |
705 | #ifdef CONFIG_DEBUG_BFIN_HWTRACE_ON | 706 | #ifdef CONFIG_DEBUG_BFIN_HWTRACE_ON |
706 | int tflags, i = 0; | 707 | int tflags, i = 0; |
707 | char buf[150]; | 708 | char buf[150]; |
708 | unsigned short *addr; | 709 | unsigned short *addr; |
709 | #ifdef CONFIG_DEBUG_BFIN_HWTRACE_EXPAND | 710 | #ifdef CONFIG_DEBUG_BFIN_HWTRACE_EXPAND |
710 | int j, index; | 711 | int j, index; |
711 | #endif | 712 | #endif |
712 | 713 | ||
713 | trace_buffer_save(tflags); | 714 | trace_buffer_save(tflags); |
714 | 715 | ||
715 | printk(KERN_NOTICE "Hardware Trace:\n"); | 716 | printk(KERN_NOTICE "Hardware Trace:\n"); |
716 | 717 | ||
717 | #ifdef CONFIG_DEBUG_BFIN_HWTRACE_EXPAND | 718 | #ifdef CONFIG_DEBUG_BFIN_HWTRACE_EXPAND |
718 | printk(KERN_NOTICE "WARNING: Expanded trace turned on - can not trace exceptions\n"); | 719 | printk(KERN_NOTICE "WARNING: Expanded trace turned on - can not trace exceptions\n"); |
719 | #endif | 720 | #endif |
720 | 721 | ||
721 | if (likely(bfin_read_TBUFSTAT() & TBUFCNT)) { | 722 | if (likely(bfin_read_TBUFSTAT() & TBUFCNT)) { |
722 | for (; bfin_read_TBUFSTAT() & TBUFCNT; i++) { | 723 | for (; bfin_read_TBUFSTAT() & TBUFCNT; i++) { |
723 | decode_address(buf, (unsigned long)bfin_read_TBUF()); | 724 | decode_address(buf, (unsigned long)bfin_read_TBUF()); |
724 | printk(KERN_NOTICE "%4i Target : %s\n", i, buf); | 725 | printk(KERN_NOTICE "%4i Target : %s\n", i, buf); |
725 | addr = (unsigned short *)bfin_read_TBUF(); | 726 | addr = (unsigned short *)bfin_read_TBUF(); |
726 | decode_address(buf, (unsigned long)addr); | 727 | decode_address(buf, (unsigned long)addr); |
727 | printk(KERN_NOTICE " Source : %s ", buf); | 728 | printk(KERN_NOTICE " Source : %s ", buf); |
728 | decode_instruction(addr); | 729 | decode_instruction(addr); |
729 | printk("\n"); | 730 | printk("\n"); |
730 | } | 731 | } |
731 | } | 732 | } |
732 | 733 | ||
733 | #ifdef CONFIG_DEBUG_BFIN_HWTRACE_EXPAND | 734 | #ifdef CONFIG_DEBUG_BFIN_HWTRACE_EXPAND |
734 | if (trace_buff_offset) | 735 | if (trace_buff_offset) |
735 | index = trace_buff_offset / 4; | 736 | index = trace_buff_offset / 4; |
736 | else | 737 | else |
737 | index = EXPAND_LEN; | 738 | index = EXPAND_LEN; |
738 | 739 | ||
739 | j = (1 << CONFIG_DEBUG_BFIN_HWTRACE_EXPAND_LEN) * 128; | 740 | j = (1 << CONFIG_DEBUG_BFIN_HWTRACE_EXPAND_LEN) * 128; |
740 | while (j) { | 741 | while (j) { |
741 | decode_address(buf, software_trace_buff[index]); | 742 | decode_address(buf, software_trace_buff[index]); |
742 | printk(KERN_NOTICE "%4i Target : %s\n", i, buf); | 743 | printk(KERN_NOTICE "%4i Target : %s\n", i, buf); |
743 | index -= 1; | 744 | index -= 1; |
744 | if (index < 0 ) | 745 | if (index < 0 ) |
745 | index = EXPAND_LEN; | 746 | index = EXPAND_LEN; |
746 | decode_address(buf, software_trace_buff[index]); | 747 | decode_address(buf, software_trace_buff[index]); |
747 | printk(KERN_NOTICE " Source : %s ", buf); | 748 | printk(KERN_NOTICE " Source : %s ", buf); |
748 | decode_instruction((unsigned short *)software_trace_buff[index]); | 749 | decode_instruction((unsigned short *)software_trace_buff[index]); |
749 | printk("\n"); | 750 | printk("\n"); |
750 | index -= 1; | 751 | index -= 1; |
751 | if (index < 0) | 752 | if (index < 0) |
752 | index = EXPAND_LEN; | 753 | index = EXPAND_LEN; |
753 | j--; | 754 | j--; |
754 | i++; | 755 | i++; |
755 | } | 756 | } |
756 | #endif | 757 | #endif |
757 | 758 | ||
758 | trace_buffer_restore(tflags); | 759 | trace_buffer_restore(tflags); |
759 | #endif | 760 | #endif |
760 | #endif | 761 | #endif |
761 | } | 762 | } |
762 | EXPORT_SYMBOL(dump_bfin_trace_buffer); | 763 | EXPORT_SYMBOL(dump_bfin_trace_buffer); |
763 | 764 | ||
764 | /* | 765 | /* |
765 | * Checks to see if the address pointed to is either a | 766 | * Checks to see if the address pointed to is either a |
766 | * 16-bit CALL instruction, or a 32-bit CALL instruction | 767 | * 16-bit CALL instruction, or a 32-bit CALL instruction |
767 | */ | 768 | */ |
768 | static bool is_bfin_call(unsigned short *addr) | 769 | static bool is_bfin_call(unsigned short *addr) |
769 | { | 770 | { |
770 | unsigned short opcode = 0, *ins_addr; | 771 | unsigned short opcode = 0, *ins_addr; |
771 | ins_addr = (unsigned short *)addr; | 772 | ins_addr = (unsigned short *)addr; |
772 | 773 | ||
773 | if (!get_instruction(&opcode, ins_addr)) | 774 | if (!get_instruction(&opcode, ins_addr)) |
774 | return false; | 775 | return false; |
775 | 776 | ||
776 | if ((opcode >= 0x0060 && opcode <= 0x0067) || | 777 | if ((opcode >= 0x0060 && opcode <= 0x0067) || |
777 | (opcode >= 0x0070 && opcode <= 0x0077)) | 778 | (opcode >= 0x0070 && opcode <= 0x0077)) |
778 | return true; | 779 | return true; |
779 | 780 | ||
780 | ins_addr--; | 781 | ins_addr--; |
781 | if (!get_instruction(&opcode, ins_addr)) | 782 | if (!get_instruction(&opcode, ins_addr)) |
782 | return false; | 783 | return false; |
783 | 784 | ||
784 | if (opcode >= 0xE300 && opcode <= 0xE3FF) | 785 | if (opcode >= 0xE300 && opcode <= 0xE3FF) |
785 | return true; | 786 | return true; |
786 | 787 | ||
787 | return false; | 788 | return false; |
788 | 789 | ||
789 | } | 790 | } |
790 | 791 | ||
791 | void show_stack(struct task_struct *task, unsigned long *stack) | 792 | void show_stack(struct task_struct *task, unsigned long *stack) |
792 | { | 793 | { |
793 | #ifdef CONFIG_PRINTK | 794 | #ifdef CONFIG_PRINTK |
794 | unsigned int *addr, *endstack, *fp = 0, *frame; | 795 | unsigned int *addr, *endstack, *fp = 0, *frame; |
795 | unsigned short *ins_addr; | 796 | unsigned short *ins_addr; |
796 | char buf[150]; | 797 | char buf[150]; |
797 | unsigned int i, j, ret_addr, frame_no = 0; | 798 | unsigned int i, j, ret_addr, frame_no = 0; |
798 | 799 | ||
799 | /* | 800 | /* |
800 | * If we have been passed a specific stack, use that one otherwise | 801 | * If we have been passed a specific stack, use that one otherwise |
801 | * if we have been passed a task structure, use that, otherwise | 802 | * if we have been passed a task structure, use that, otherwise |
802 | * use the stack of where the variable "stack" exists | 803 | * use the stack of where the variable "stack" exists |
803 | */ | 804 | */ |
804 | 805 | ||
805 | if (stack == NULL) { | 806 | if (stack == NULL) { |
806 | if (task) { | 807 | if (task) { |
807 | /* We know this is a kernel stack, so this is the start/end */ | 808 | /* We know this is a kernel stack, so this is the start/end */ |
808 | stack = (unsigned long *)task->thread.ksp; | 809 | stack = (unsigned long *)task->thread.ksp; |
809 | endstack = (unsigned int *)(((unsigned int)(stack) & ~(THREAD_SIZE - 1)) + THREAD_SIZE); | 810 | endstack = (unsigned int *)(((unsigned int)(stack) & ~(THREAD_SIZE - 1)) + THREAD_SIZE); |
810 | } else { | 811 | } else { |
811 | /* print out the existing stack info */ | 812 | /* print out the existing stack info */ |
812 | stack = (unsigned long *)&stack; | 813 | stack = (unsigned long *)&stack; |
813 | endstack = (unsigned int *)PAGE_ALIGN((unsigned int)stack); | 814 | endstack = (unsigned int *)PAGE_ALIGN((unsigned int)stack); |
814 | } | 815 | } |
815 | } else | 816 | } else |
816 | endstack = (unsigned int *)PAGE_ALIGN((unsigned int)stack); | 817 | endstack = (unsigned int *)PAGE_ALIGN((unsigned int)stack); |
817 | 818 | ||
818 | printk(KERN_NOTICE "Stack info:\n"); | 819 | printk(KERN_NOTICE "Stack info:\n"); |
819 | decode_address(buf, (unsigned int)stack); | 820 | decode_address(buf, (unsigned int)stack); |
820 | printk(KERN_NOTICE " SP: [0x%p] %s\n", stack, buf); | 821 | printk(KERN_NOTICE " SP: [0x%p] %s\n", stack, buf); |
821 | 822 | ||
822 | addr = (unsigned int *)((unsigned int)stack & ~0x3F); | 823 | addr = (unsigned int *)((unsigned int)stack & ~0x3F); |
823 | 824 | ||
824 | /* First thing is to look for a frame pointer */ | 825 | /* First thing is to look for a frame pointer */ |
825 | for (addr = (unsigned int *)((unsigned int)stack & ~0xF), i = 0; | 826 | for (addr = (unsigned int *)((unsigned int)stack & ~0xF), i = 0; |
826 | addr < endstack; addr++, i++) { | 827 | addr < endstack; addr++, i++) { |
827 | if (*addr & 0x1) | 828 | if (*addr & 0x1) |
828 | continue; | 829 | continue; |
829 | ins_addr = (unsigned short *)*addr; | 830 | ins_addr = (unsigned short *)*addr; |
830 | ins_addr--; | 831 | ins_addr--; |
831 | if (is_bfin_call(ins_addr)) | 832 | if (is_bfin_call(ins_addr)) |
832 | fp = addr - 1; | 833 | fp = addr - 1; |
833 | 834 | ||
834 | if (fp) { | 835 | if (fp) { |
835 | /* Let's check to see if it is a frame pointer */ | 836 | /* Let's check to see if it is a frame pointer */ |
836 | while (fp >= (addr - 1) && fp < endstack && fp) | 837 | while (fp >= (addr - 1) && fp < endstack && fp) |
837 | fp = (unsigned int *)*fp; | 838 | fp = (unsigned int *)*fp; |
838 | if (fp == 0 || fp == endstack) { | 839 | if (fp == 0 || fp == endstack) { |
839 | fp = addr - 1; | 840 | fp = addr - 1; |
840 | break; | 841 | break; |
841 | } | 842 | } |
842 | fp = 0; | 843 | fp = 0; |
843 | } | 844 | } |
844 | } | 845 | } |
845 | if (fp) { | 846 | if (fp) { |
846 | frame = fp; | 847 | frame = fp; |
847 | printk(KERN_NOTICE " FP: (0x%p)\n", fp); | 848 | printk(KERN_NOTICE " FP: (0x%p)\n", fp); |
848 | } else | 849 | } else |
849 | frame = 0; | 850 | frame = 0; |
850 | 851 | ||
851 | /* | 852 | /* |
852 | * Now that we think we know where things are, we | 853 | * Now that we think we know where things are, we |
853 | * walk the stack again, this time printing things out | 854 | * walk the stack again, this time printing things out |
854 | * incase there is no frame pointer, we still look for | 855 | * incase there is no frame pointer, we still look for |
855 | * valid return addresses | 856 | * valid return addresses |
856 | */ | 857 | */ |
857 | 858 | ||
858 | /* First time print out data, next time, print out symbols */ | 859 | /* First time print out data, next time, print out symbols */ |
859 | for (j = 0; j <= 1; j++) { | 860 | for (j = 0; j <= 1; j++) { |
860 | if (j) | 861 | if (j) |
861 | printk(KERN_NOTICE "Return addresses in stack:\n"); | 862 | printk(KERN_NOTICE "Return addresses in stack:\n"); |
862 | else | 863 | else |
863 | printk(KERN_NOTICE " Memory from 0x%08lx to %p", ((long unsigned int)stack & ~0xF), endstack); | 864 | printk(KERN_NOTICE " Memory from 0x%08lx to %p", ((long unsigned int)stack & ~0xF), endstack); |
864 | 865 | ||
865 | fp = frame; | 866 | fp = frame; |
866 | frame_no = 0; | 867 | frame_no = 0; |
867 | 868 | ||
868 | for (addr = (unsigned int *)((unsigned int)stack & ~0xF), i = 0; | 869 | for (addr = (unsigned int *)((unsigned int)stack & ~0xF), i = 0; |
869 | addr <= endstack; addr++, i++) { | 870 | addr <= endstack; addr++, i++) { |
870 | 871 | ||
871 | ret_addr = 0; | 872 | ret_addr = 0; |
872 | if (!j && i % 8 == 0) | 873 | if (!j && i % 8 == 0) |
873 | printk("\n" KERN_NOTICE "%p:",addr); | 874 | printk("\n" KERN_NOTICE "%p:",addr); |
874 | 875 | ||
875 | /* if it is an odd address, or zero, just skip it */ | 876 | /* if it is an odd address, or zero, just skip it */ |
876 | if (*addr & 0x1 || !*addr) | 877 | if (*addr & 0x1 || !*addr) |
877 | goto print; | 878 | goto print; |
878 | 879 | ||
879 | ins_addr = (unsigned short *)*addr; | 880 | ins_addr = (unsigned short *)*addr; |
880 | 881 | ||
881 | /* Go back one instruction, and see if it is a CALL */ | 882 | /* Go back one instruction, and see if it is a CALL */ |
882 | ins_addr--; | 883 | ins_addr--; |
883 | ret_addr = is_bfin_call(ins_addr); | 884 | ret_addr = is_bfin_call(ins_addr); |
884 | print: | 885 | print: |
885 | if (!j && stack == (unsigned long *)addr) | 886 | if (!j && stack == (unsigned long *)addr) |
886 | printk("[%08x]", *addr); | 887 | printk("[%08x]", *addr); |
887 | else if (ret_addr) | 888 | else if (ret_addr) |
888 | if (j) { | 889 | if (j) { |
889 | decode_address(buf, (unsigned int)*addr); | 890 | decode_address(buf, (unsigned int)*addr); |
890 | if (frame == addr) { | 891 | if (frame == addr) { |
891 | printk(KERN_NOTICE " frame %2i : %s\n", frame_no, buf); | 892 | printk(KERN_NOTICE " frame %2i : %s\n", frame_no, buf); |
892 | continue; | 893 | continue; |
893 | } | 894 | } |
894 | printk(KERN_NOTICE " address : %s\n", buf); | 895 | printk(KERN_NOTICE " address : %s\n", buf); |
895 | } else | 896 | } else |
896 | printk("<%08x>", *addr); | 897 | printk("<%08x>", *addr); |
897 | else if (fp == addr) { | 898 | else if (fp == addr) { |
898 | if (j) | 899 | if (j) |
899 | frame = addr+1; | 900 | frame = addr+1; |
900 | else | 901 | else |
901 | printk("(%08x)", *addr); | 902 | printk("(%08x)", *addr); |
902 | 903 | ||
903 | fp = (unsigned int *)*addr; | 904 | fp = (unsigned int *)*addr; |
904 | frame_no++; | 905 | frame_no++; |
905 | 906 | ||
906 | } else if (!j) | 907 | } else if (!j) |
907 | printk(" %08x ", *addr); | 908 | printk(" %08x ", *addr); |
908 | } | 909 | } |
909 | if (!j) | 910 | if (!j) |
910 | printk("\n"); | 911 | printk("\n"); |
911 | } | 912 | } |
912 | #endif | 913 | #endif |
913 | } | 914 | } |
914 | 915 | ||
915 | void dump_stack(void) | 916 | void dump_stack(void) |
916 | { | 917 | { |
917 | unsigned long stack; | 918 | unsigned long stack; |
918 | #ifdef CONFIG_DEBUG_BFIN_HWTRACE_ON | 919 | #ifdef CONFIG_DEBUG_BFIN_HWTRACE_ON |
919 | int tflags; | 920 | int tflags; |
920 | #endif | 921 | #endif |
921 | trace_buffer_save(tflags); | 922 | trace_buffer_save(tflags); |
922 | dump_bfin_trace_buffer(); | 923 | dump_bfin_trace_buffer(); |
923 | show_stack(current, &stack); | 924 | show_stack(current, &stack); |
924 | trace_buffer_restore(tflags); | 925 | trace_buffer_restore(tflags); |
925 | } | 926 | } |
926 | EXPORT_SYMBOL(dump_stack); | 927 | EXPORT_SYMBOL(dump_stack); |
927 | 928 | ||
928 | void dump_bfin_process(struct pt_regs *fp) | 929 | void dump_bfin_process(struct pt_regs *fp) |
929 | { | 930 | { |
930 | #ifdef CONFIG_DEBUG_VERBOSE | 931 | #ifdef CONFIG_DEBUG_VERBOSE |
931 | /* We should be able to look at fp->ipend, but we don't push it on the | 932 | /* We should be able to look at fp->ipend, but we don't push it on the |
932 | * stack all the time, so do this until we fix that */ | 933 | * stack all the time, so do this until we fix that */ |
933 | unsigned int context = bfin_read_IPEND(); | 934 | unsigned int context = bfin_read_IPEND(); |
934 | 935 | ||
935 | if (oops_in_progress) | 936 | if (oops_in_progress) |
936 | verbose_printk(KERN_EMERG "Kernel OOPS in progress\n"); | 937 | verbose_printk(KERN_EMERG "Kernel OOPS in progress\n"); |
937 | 938 | ||
938 | if (context & 0x0020 && (fp->seqstat & SEQSTAT_EXCAUSE) == VEC_HWERR) | 939 | if (context & 0x0020 && (fp->seqstat & SEQSTAT_EXCAUSE) == VEC_HWERR) |
939 | verbose_printk(KERN_NOTICE "HW Error context\n"); | 940 | verbose_printk(KERN_NOTICE "HW Error context\n"); |
940 | else if (context & 0x0020) | 941 | else if (context & 0x0020) |
941 | verbose_printk(KERN_NOTICE "Deferred Exception context\n"); | 942 | verbose_printk(KERN_NOTICE "Deferred Exception context\n"); |
942 | else if (context & 0x3FC0) | 943 | else if (context & 0x3FC0) |
943 | verbose_printk(KERN_NOTICE "Interrupt context\n"); | 944 | verbose_printk(KERN_NOTICE "Interrupt context\n"); |
944 | else if (context & 0x4000) | 945 | else if (context & 0x4000) |
945 | verbose_printk(KERN_NOTICE "Deferred Interrupt context\n"); | 946 | verbose_printk(KERN_NOTICE "Deferred Interrupt context\n"); |
946 | else if (context & 0x8000) | 947 | else if (context & 0x8000) |
947 | verbose_printk(KERN_NOTICE "Kernel process context\n"); | 948 | verbose_printk(KERN_NOTICE "Kernel process context\n"); |
948 | 949 | ||
949 | /* Because we are crashing, and pointers could be bad, we check things | 950 | /* Because we are crashing, and pointers could be bad, we check things |
950 | * pretty closely before we use them | 951 | * pretty closely before we use them |
951 | */ | 952 | */ |
952 | if ((unsigned long)current >= FIXED_CODE_START && | 953 | if ((unsigned long)current >= FIXED_CODE_START && |
953 | !((unsigned long)current & 0x3) && current->pid) { | 954 | !((unsigned long)current & 0x3) && current->pid) { |
954 | verbose_printk(KERN_NOTICE "CURRENT PROCESS:\n"); | 955 | verbose_printk(KERN_NOTICE "CURRENT PROCESS:\n"); |
955 | if (current->comm >= (char *)FIXED_CODE_START) | 956 | if (current->comm >= (char *)FIXED_CODE_START) |
956 | verbose_printk(KERN_NOTICE "COMM=%s PID=%d\n", | 957 | verbose_printk(KERN_NOTICE "COMM=%s PID=%d\n", |
957 | current->comm, current->pid); | 958 | current->comm, current->pid); |
958 | else | 959 | else |
959 | verbose_printk(KERN_NOTICE "COMM= invalid\n"); | 960 | verbose_printk(KERN_NOTICE "COMM= invalid\n"); |
960 | 961 | ||
961 | printk(KERN_NOTICE "CPU = %d\n", current_thread_info()->cpu); | 962 | printk(KERN_NOTICE "CPU = %d\n", current_thread_info()->cpu); |
962 | if (!((unsigned long)current->mm & 0x3) && (unsigned long)current->mm >= FIXED_CODE_START) | 963 | if (!((unsigned long)current->mm & 0x3) && (unsigned long)current->mm >= FIXED_CODE_START) |
963 | verbose_printk(KERN_NOTICE "TEXT = 0x%p-0x%p DATA = 0x%p-0x%p\n" | 964 | verbose_printk(KERN_NOTICE "TEXT = 0x%p-0x%p DATA = 0x%p-0x%p\n" |
964 | KERN_NOTICE " BSS = 0x%p-0x%p USER-STACK = 0x%p\n" | 965 | KERN_NOTICE " BSS = 0x%p-0x%p USER-STACK = 0x%p\n" |
965 | KERN_NOTICE "\n", | 966 | KERN_NOTICE "\n", |
966 | (void *)current->mm->start_code, | 967 | (void *)current->mm->start_code, |
967 | (void *)current->mm->end_code, | 968 | (void *)current->mm->end_code, |
968 | (void *)current->mm->start_data, | 969 | (void *)current->mm->start_data, |
969 | (void *)current->mm->end_data, | 970 | (void *)current->mm->end_data, |
970 | (void *)current->mm->end_data, | 971 | (void *)current->mm->end_data, |
971 | (void *)current->mm->brk, | 972 | (void *)current->mm->brk, |
972 | (void *)current->mm->start_stack); | 973 | (void *)current->mm->start_stack); |
973 | else | 974 | else |
974 | verbose_printk(KERN_NOTICE "invalid mm\n"); | 975 | verbose_printk(KERN_NOTICE "invalid mm\n"); |
975 | } else | 976 | } else |
976 | verbose_printk(KERN_NOTICE "\n" KERN_NOTICE | 977 | verbose_printk(KERN_NOTICE "\n" KERN_NOTICE |
977 | "No Valid process in current context\n"); | 978 | "No Valid process in current context\n"); |
978 | #endif | 979 | #endif |
979 | } | 980 | } |
980 | 981 | ||
981 | void dump_bfin_mem(struct pt_regs *fp) | 982 | void dump_bfin_mem(struct pt_regs *fp) |
982 | { | 983 | { |
983 | #ifdef CONFIG_DEBUG_VERBOSE | 984 | #ifdef CONFIG_DEBUG_VERBOSE |
984 | unsigned short *addr, *erraddr, val = 0, err = 0; | 985 | unsigned short *addr, *erraddr, val = 0, err = 0; |
985 | char sti = 0, buf[6]; | 986 | char sti = 0, buf[6]; |
986 | 987 | ||
987 | erraddr = (void *)fp->pc; | 988 | erraddr = (void *)fp->pc; |
988 | 989 | ||
989 | verbose_printk(KERN_NOTICE "return address: [0x%p]; contents of:", erraddr); | 990 | verbose_printk(KERN_NOTICE "return address: [0x%p]; contents of:", erraddr); |
990 | 991 | ||
991 | for (addr = (unsigned short *)((unsigned long)erraddr & ~0xF) - 0x10; | 992 | for (addr = (unsigned short *)((unsigned long)erraddr & ~0xF) - 0x10; |
992 | addr < (unsigned short *)((unsigned long)erraddr & ~0xF) + 0x10; | 993 | addr < (unsigned short *)((unsigned long)erraddr & ~0xF) + 0x10; |
993 | addr++) { | 994 | addr++) { |
994 | if (!((unsigned long)addr & 0xF)) | 995 | if (!((unsigned long)addr & 0xF)) |
995 | verbose_printk("\n" KERN_NOTICE "0x%p: ", addr); | 996 | verbose_printk("\n" KERN_NOTICE "0x%p: ", addr); |
996 | 997 | ||
997 | if (!get_instruction(&val, addr)) { | 998 | if (!get_instruction(&val, addr)) { |
998 | val = 0; | 999 | val = 0; |
999 | sprintf(buf, "????"); | 1000 | sprintf(buf, "????"); |
1000 | } else | 1001 | } else |
1001 | sprintf(buf, "%04x", val); | 1002 | sprintf(buf, "%04x", val); |
1002 | 1003 | ||
1003 | if (addr == erraddr) { | 1004 | if (addr == erraddr) { |
1004 | verbose_printk("[%s]", buf); | 1005 | verbose_printk("[%s]", buf); |
1005 | err = val; | 1006 | err = val; |
1006 | } else | 1007 | } else |
1007 | verbose_printk(" %s ", buf); | 1008 | verbose_printk(" %s ", buf); |
1008 | 1009 | ||
1009 | /* Do any previous instructions turn on interrupts? */ | 1010 | /* Do any previous instructions turn on interrupts? */ |
1010 | if (addr <= erraddr && /* in the past */ | 1011 | if (addr <= erraddr && /* in the past */ |
1011 | ((val >= 0x0040 && val <= 0x0047) || /* STI instruction */ | 1012 | ((val >= 0x0040 && val <= 0x0047) || /* STI instruction */ |
1012 | val == 0x017b)) /* [SP++] = RETI */ | 1013 | val == 0x017b)) /* [SP++] = RETI */ |
1013 | sti = 1; | 1014 | sti = 1; |
1014 | } | 1015 | } |
1015 | 1016 | ||
1016 | verbose_printk("\n"); | 1017 | verbose_printk("\n"); |
1017 | 1018 | ||
1018 | /* Hardware error interrupts can be deferred */ | 1019 | /* Hardware error interrupts can be deferred */ |
1019 | if (unlikely(sti && (fp->seqstat & SEQSTAT_EXCAUSE) == VEC_HWERR && | 1020 | if (unlikely(sti && (fp->seqstat & SEQSTAT_EXCAUSE) == VEC_HWERR && |
1020 | oops_in_progress)){ | 1021 | oops_in_progress)){ |
1021 | verbose_printk(KERN_NOTICE "Looks like this was a deferred error - sorry\n"); | 1022 | verbose_printk(KERN_NOTICE "Looks like this was a deferred error - sorry\n"); |
1022 | #ifndef CONFIG_DEBUG_HWERR | 1023 | #ifndef CONFIG_DEBUG_HWERR |
1023 | verbose_printk(KERN_NOTICE "The remaining message may be meaningless\n" | 1024 | verbose_printk(KERN_NOTICE "The remaining message may be meaningless\n" |
1024 | KERN_NOTICE "You should enable CONFIG_DEBUG_HWERR to get a" | 1025 | KERN_NOTICE "You should enable CONFIG_DEBUG_HWERR to get a" |
1025 | " better idea where it came from\n"); | 1026 | " better idea where it came from\n"); |
1026 | #else | 1027 | #else |
1027 | /* If we are handling only one peripheral interrupt | 1028 | /* If we are handling only one peripheral interrupt |
1028 | * and current mm and pid are valid, and the last error | 1029 | * and current mm and pid are valid, and the last error |
1029 | * was in that user space process's text area | 1030 | * was in that user space process's text area |
1030 | * print it out - because that is where the problem exists | 1031 | * print it out - because that is where the problem exists |
1031 | */ | 1032 | */ |
1032 | if ((!(((fp)->ipend & ~0x30) & (((fp)->ipend & ~0x30) - 1))) && | 1033 | if ((!(((fp)->ipend & ~0x30) & (((fp)->ipend & ~0x30) - 1))) && |
1033 | (current->pid && current->mm)) { | 1034 | (current->pid && current->mm)) { |
1034 | /* And the last RETI points to the current userspace context */ | 1035 | /* And the last RETI points to the current userspace context */ |
1035 | if ((fp + 1)->pc >= current->mm->start_code && | 1036 | if ((fp + 1)->pc >= current->mm->start_code && |
1036 | (fp + 1)->pc <= current->mm->end_code) { | 1037 | (fp + 1)->pc <= current->mm->end_code) { |
1037 | verbose_printk(KERN_NOTICE "It might be better to look around here : \n"); | 1038 | verbose_printk(KERN_NOTICE "It might be better to look around here : \n"); |
1038 | verbose_printk(KERN_NOTICE "-------------------------------------------\n"); | 1039 | verbose_printk(KERN_NOTICE "-------------------------------------------\n"); |
1039 | show_regs(fp + 1); | 1040 | show_regs(fp + 1); |
1040 | verbose_printk(KERN_NOTICE "-------------------------------------------\n"); | 1041 | verbose_printk(KERN_NOTICE "-------------------------------------------\n"); |
1041 | } | 1042 | } |
1042 | } | 1043 | } |
1043 | #endif | 1044 | #endif |
1044 | } | 1045 | } |
1045 | #endif | 1046 | #endif |
1046 | } | 1047 | } |
1047 | 1048 | ||
1048 | void show_regs(struct pt_regs *fp) | 1049 | void show_regs(struct pt_regs *fp) |
1049 | { | 1050 | { |
1050 | #ifdef CONFIG_DEBUG_VERBOSE | 1051 | #ifdef CONFIG_DEBUG_VERBOSE |
1051 | char buf [150]; | 1052 | char buf [150]; |
1052 | struct irqaction *action; | 1053 | struct irqaction *action; |
1053 | unsigned int i; | 1054 | unsigned int i; |
1054 | unsigned long flags; | 1055 | unsigned long flags; |
1055 | unsigned int cpu = smp_processor_id(); | 1056 | unsigned int cpu = smp_processor_id(); |
1056 | 1057 | ||
1057 | verbose_printk(KERN_NOTICE "\n" KERN_NOTICE "SEQUENCER STATUS:\t\t%s\n", print_tainted()); | 1058 | verbose_printk(KERN_NOTICE "\n" KERN_NOTICE "SEQUENCER STATUS:\t\t%s\n", print_tainted()); |
1058 | verbose_printk(KERN_NOTICE " SEQSTAT: %08lx IPEND: %04lx SYSCFG: %04lx\n", | 1059 | verbose_printk(KERN_NOTICE " SEQSTAT: %08lx IPEND: %04lx SYSCFG: %04lx\n", |
1059 | (long)fp->seqstat, fp->ipend, fp->syscfg); | 1060 | (long)fp->seqstat, fp->ipend, fp->syscfg); |
1060 | if ((fp->seqstat & SEQSTAT_EXCAUSE) == VEC_HWERR) { | 1061 | if ((fp->seqstat & SEQSTAT_EXCAUSE) == VEC_HWERR) { |
1061 | verbose_printk(KERN_NOTICE " HWERRCAUSE: 0x%lx\n", | 1062 | verbose_printk(KERN_NOTICE " HWERRCAUSE: 0x%lx\n", |
1062 | (fp->seqstat & SEQSTAT_HWERRCAUSE) >> 14); | 1063 | (fp->seqstat & SEQSTAT_HWERRCAUSE) >> 14); |
1063 | #ifdef EBIU_ERRMST | 1064 | #ifdef EBIU_ERRMST |
1064 | /* If the error was from the EBIU, print it out */ | 1065 | /* If the error was from the EBIU, print it out */ |
1065 | if (bfin_read_EBIU_ERRMST() & CORE_ERROR) { | 1066 | if (bfin_read_EBIU_ERRMST() & CORE_ERROR) { |
1066 | verbose_printk(KERN_NOTICE " EBIU Error Reason : 0x%04x\n", | 1067 | verbose_printk(KERN_NOTICE " EBIU Error Reason : 0x%04x\n", |
1067 | bfin_read_EBIU_ERRMST()); | 1068 | bfin_read_EBIU_ERRMST()); |
1068 | verbose_printk(KERN_NOTICE " EBIU Error Address : 0x%08x\n", | 1069 | verbose_printk(KERN_NOTICE " EBIU Error Address : 0x%08x\n", |
1069 | bfin_read_EBIU_ERRADD()); | 1070 | bfin_read_EBIU_ERRADD()); |
1070 | } | 1071 | } |
1071 | #endif | 1072 | #endif |
1072 | } | 1073 | } |
1073 | verbose_printk(KERN_NOTICE " EXCAUSE : 0x%lx\n", | 1074 | verbose_printk(KERN_NOTICE " EXCAUSE : 0x%lx\n", |
1074 | fp->seqstat & SEQSTAT_EXCAUSE); | 1075 | fp->seqstat & SEQSTAT_EXCAUSE); |
1075 | for (i = 6; i <= 15 ; i++) { | 1076 | for (i = 6; i <= 15 ; i++) { |
1076 | if (fp->ipend & (1 << i)) { | 1077 | if (fp->ipend & (1 << i)) { |
1077 | decode_address(buf, bfin_read32(EVT0 + 4*i)); | 1078 | decode_address(buf, bfin_read32(EVT0 + 4*i)); |
1078 | verbose_printk(KERN_NOTICE " physical IVG%i asserted : %s\n", i, buf); | 1079 | verbose_printk(KERN_NOTICE " physical IVG%i asserted : %s\n", i, buf); |
1079 | } | 1080 | } |
1080 | } | 1081 | } |
1081 | 1082 | ||
1082 | /* if no interrupts are going off, don't print this out */ | 1083 | /* if no interrupts are going off, don't print this out */ |
1083 | if (fp->ipend & ~0x3F) { | 1084 | if (fp->ipend & ~0x3F) { |
1084 | for (i = 0; i < (NR_IRQS - 1); i++) { | 1085 | for (i = 0; i < (NR_IRQS - 1); i++) { |
1085 | spin_lock_irqsave(&irq_desc[i].lock, flags); | 1086 | spin_lock_irqsave(&irq_desc[i].lock, flags); |
1086 | action = irq_desc[i].action; | 1087 | action = irq_desc[i].action; |
1087 | if (!action) | 1088 | if (!action) |
1088 | goto unlock; | 1089 | goto unlock; |
1089 | 1090 | ||
1090 | decode_address(buf, (unsigned int)action->handler); | 1091 | decode_address(buf, (unsigned int)action->handler); |
1091 | verbose_printk(KERN_NOTICE " logical irq %3d mapped : %s", i, buf); | 1092 | verbose_printk(KERN_NOTICE " logical irq %3d mapped : %s", i, buf); |
1092 | for (action = action->next; action; action = action->next) { | 1093 | for (action = action->next; action; action = action->next) { |
1093 | decode_address(buf, (unsigned int)action->handler); | 1094 | decode_address(buf, (unsigned int)action->handler); |
1094 | verbose_printk(", %s", buf); | 1095 | verbose_printk(", %s", buf); |
1095 | } | 1096 | } |
1096 | verbose_printk("\n"); | 1097 | verbose_printk("\n"); |
1097 | unlock: | 1098 | unlock: |
1098 | spin_unlock_irqrestore(&irq_desc[i].lock, flags); | 1099 | spin_unlock_irqrestore(&irq_desc[i].lock, flags); |
1099 | } | 1100 | } |
1100 | } | 1101 | } |
1101 | 1102 | ||
1102 | decode_address(buf, fp->rete); | 1103 | decode_address(buf, fp->rete); |
1103 | verbose_printk(KERN_NOTICE " RETE: %s\n", buf); | 1104 | verbose_printk(KERN_NOTICE " RETE: %s\n", buf); |
1104 | decode_address(buf, fp->retn); | 1105 | decode_address(buf, fp->retn); |
1105 | verbose_printk(KERN_NOTICE " RETN: %s\n", buf); | 1106 | verbose_printk(KERN_NOTICE " RETN: %s\n", buf); |
1106 | decode_address(buf, fp->retx); | 1107 | decode_address(buf, fp->retx); |
1107 | verbose_printk(KERN_NOTICE " RETX: %s\n", buf); | 1108 | verbose_printk(KERN_NOTICE " RETX: %s\n", buf); |
1108 | decode_address(buf, fp->rets); | 1109 | decode_address(buf, fp->rets); |
1109 | verbose_printk(KERN_NOTICE " RETS: %s\n", buf); | 1110 | verbose_printk(KERN_NOTICE " RETS: %s\n", buf); |
1110 | decode_address(buf, fp->pc); | 1111 | decode_address(buf, fp->pc); |
1111 | verbose_printk(KERN_NOTICE " PC : %s\n", buf); | 1112 | verbose_printk(KERN_NOTICE " PC : %s\n", buf); |
1112 | 1113 | ||
1113 | if (((long)fp->seqstat & SEQSTAT_EXCAUSE) && | 1114 | if (((long)fp->seqstat & SEQSTAT_EXCAUSE) && |
1114 | (((long)fp->seqstat & SEQSTAT_EXCAUSE) != VEC_HWERR)) { | 1115 | (((long)fp->seqstat & SEQSTAT_EXCAUSE) != VEC_HWERR)) { |
1115 | decode_address(buf, cpu_pda[cpu].dcplb_fault_addr); | 1116 | decode_address(buf, cpu_pda[cpu].dcplb_fault_addr); |
1116 | verbose_printk(KERN_NOTICE "DCPLB_FAULT_ADDR: %s\n", buf); | 1117 | verbose_printk(KERN_NOTICE "DCPLB_FAULT_ADDR: %s\n", buf); |
1117 | decode_address(buf, cpu_pda[cpu].icplb_fault_addr); | 1118 | decode_address(buf, cpu_pda[cpu].icplb_fault_addr); |
1118 | verbose_printk(KERN_NOTICE "ICPLB_FAULT_ADDR: %s\n", buf); | 1119 | verbose_printk(KERN_NOTICE "ICPLB_FAULT_ADDR: %s\n", buf); |
1119 | } | 1120 | } |
1120 | 1121 | ||
1121 | verbose_printk(KERN_NOTICE "\n" KERN_NOTICE "PROCESSOR STATE:\n"); | 1122 | verbose_printk(KERN_NOTICE "\n" KERN_NOTICE "PROCESSOR STATE:\n"); |
1122 | verbose_printk(KERN_NOTICE " R0 : %08lx R1 : %08lx R2 : %08lx R3 : %08lx\n", | 1123 | verbose_printk(KERN_NOTICE " R0 : %08lx R1 : %08lx R2 : %08lx R3 : %08lx\n", |
1123 | fp->r0, fp->r1, fp->r2, fp->r3); | 1124 | fp->r0, fp->r1, fp->r2, fp->r3); |
1124 | verbose_printk(KERN_NOTICE " R4 : %08lx R5 : %08lx R6 : %08lx R7 : %08lx\n", | 1125 | verbose_printk(KERN_NOTICE " R4 : %08lx R5 : %08lx R6 : %08lx R7 : %08lx\n", |
1125 | fp->r4, fp->r5, fp->r6, fp->r7); | 1126 | fp->r4, fp->r5, fp->r6, fp->r7); |
1126 | verbose_printk(KERN_NOTICE " P0 : %08lx P1 : %08lx P2 : %08lx P3 : %08lx\n", | 1127 | verbose_printk(KERN_NOTICE " P0 : %08lx P1 : %08lx P2 : %08lx P3 : %08lx\n", |
1127 | fp->p0, fp->p1, fp->p2, fp->p3); | 1128 | fp->p0, fp->p1, fp->p2, fp->p3); |
1128 | verbose_printk(KERN_NOTICE " P4 : %08lx P5 : %08lx FP : %08lx SP : %08lx\n", | 1129 | verbose_printk(KERN_NOTICE " P4 : %08lx P5 : %08lx FP : %08lx SP : %08lx\n", |
1129 | fp->p4, fp->p5, fp->fp, (long)fp); | 1130 | fp->p4, fp->p5, fp->fp, (long)fp); |
1130 | verbose_printk(KERN_NOTICE " LB0: %08lx LT0: %08lx LC0: %08lx\n", | 1131 | verbose_printk(KERN_NOTICE " LB0: %08lx LT0: %08lx LC0: %08lx\n", |
1131 | fp->lb0, fp->lt0, fp->lc0); | 1132 | fp->lb0, fp->lt0, fp->lc0); |
1132 | verbose_printk(KERN_NOTICE " LB1: %08lx LT1: %08lx LC1: %08lx\n", | 1133 | verbose_printk(KERN_NOTICE " LB1: %08lx LT1: %08lx LC1: %08lx\n", |
1133 | fp->lb1, fp->lt1, fp->lc1); | 1134 | fp->lb1, fp->lt1, fp->lc1); |
1134 | verbose_printk(KERN_NOTICE " B0 : %08lx L0 : %08lx M0 : %08lx I0 : %08lx\n", | 1135 | verbose_printk(KERN_NOTICE " B0 : %08lx L0 : %08lx M0 : %08lx I0 : %08lx\n", |
1135 | fp->b0, fp->l0, fp->m0, fp->i0); | 1136 | fp->b0, fp->l0, fp->m0, fp->i0); |
1136 | verbose_printk(KERN_NOTICE " B1 : %08lx L1 : %08lx M1 : %08lx I1 : %08lx\n", | 1137 | verbose_printk(KERN_NOTICE " B1 : %08lx L1 : %08lx M1 : %08lx I1 : %08lx\n", |
1137 | fp->b1, fp->l1, fp->m1, fp->i1); | 1138 | fp->b1, fp->l1, fp->m1, fp->i1); |
1138 | verbose_printk(KERN_NOTICE " B2 : %08lx L2 : %08lx M2 : %08lx I2 : %08lx\n", | 1139 | verbose_printk(KERN_NOTICE " B2 : %08lx L2 : %08lx M2 : %08lx I2 : %08lx\n", |
1139 | fp->b2, fp->l2, fp->m2, fp->i2); | 1140 | fp->b2, fp->l2, fp->m2, fp->i2); |
1140 | verbose_printk(KERN_NOTICE " B3 : %08lx L3 : %08lx M3 : %08lx I3 : %08lx\n", | 1141 | verbose_printk(KERN_NOTICE " B3 : %08lx L3 : %08lx M3 : %08lx I3 : %08lx\n", |
1141 | fp->b3, fp->l3, fp->m3, fp->i3); | 1142 | fp->b3, fp->l3, fp->m3, fp->i3); |
1142 | verbose_printk(KERN_NOTICE "A0.w: %08lx A0.x: %08lx A1.w: %08lx A1.x: %08lx\n", | 1143 | verbose_printk(KERN_NOTICE "A0.w: %08lx A0.x: %08lx A1.w: %08lx A1.x: %08lx\n", |
1143 | fp->a0w, fp->a0x, fp->a1w, fp->a1x); | 1144 | fp->a0w, fp->a0x, fp->a1w, fp->a1x); |
1144 | 1145 | ||
1145 | verbose_printk(KERN_NOTICE "USP : %08lx ASTAT: %08lx\n", | 1146 | verbose_printk(KERN_NOTICE "USP : %08lx ASTAT: %08lx\n", |
1146 | rdusp(), fp->astat); | 1147 | rdusp(), fp->astat); |
1147 | 1148 | ||
1148 | verbose_printk(KERN_NOTICE "\n"); | 1149 | verbose_printk(KERN_NOTICE "\n"); |
1149 | #endif | 1150 | #endif |
1150 | } | 1151 | } |
1151 | 1152 | ||
1152 | #ifdef CONFIG_SYS_BFIN_SPINLOCK_L1 | 1153 | #ifdef CONFIG_SYS_BFIN_SPINLOCK_L1 |
1153 | asmlinkage int sys_bfin_spinlock(int *spinlock)__attribute__((l1_text)); | 1154 | asmlinkage int sys_bfin_spinlock(int *spinlock)__attribute__((l1_text)); |
1154 | #endif | 1155 | #endif |
1155 | 1156 | ||
1156 | static DEFINE_SPINLOCK(bfin_spinlock_lock); | 1157 | static DEFINE_SPINLOCK(bfin_spinlock_lock); |
1157 | 1158 | ||
1158 | asmlinkage int sys_bfin_spinlock(int *p) | 1159 | asmlinkage int sys_bfin_spinlock(int *p) |
1159 | { | 1160 | { |
1160 | int ret, tmp = 0; | 1161 | int ret, tmp = 0; |
1161 | 1162 | ||
1162 | spin_lock(&bfin_spinlock_lock); /* This would also hold kernel preemption. */ | 1163 | spin_lock(&bfin_spinlock_lock); /* This would also hold kernel preemption. */ |
1163 | ret = get_user(tmp, p); | 1164 | ret = get_user(tmp, p); |
1164 | if (likely(ret == 0)) { | 1165 | if (likely(ret == 0)) { |
1165 | if (unlikely(tmp)) | 1166 | if (unlikely(tmp)) |
1166 | ret = 1; | 1167 | ret = 1; |
1167 | else | 1168 | else |
1168 | put_user(1, p); | 1169 | put_user(1, p); |
1169 | } | 1170 | } |
1170 | spin_unlock(&bfin_spinlock_lock); | 1171 | spin_unlock(&bfin_spinlock_lock); |
1171 | return ret; | 1172 | return ret; |
1172 | } | 1173 | } |
1173 | 1174 | ||
1174 | int bfin_request_exception(unsigned int exception, void (*handler)(void)) | 1175 | int bfin_request_exception(unsigned int exception, void (*handler)(void)) |
1175 | { | 1176 | { |
1176 | void (*curr_handler)(void); | 1177 | void (*curr_handler)(void); |
1177 | 1178 | ||
1178 | if (exception > 0x3F) | 1179 | if (exception > 0x3F) |
1179 | return -EINVAL; | 1180 | return -EINVAL; |
1180 | 1181 | ||
1181 | curr_handler = ex_table[exception]; | 1182 | curr_handler = ex_table[exception]; |
1182 | 1183 | ||
1183 | if (curr_handler != ex_replaceable) | 1184 | if (curr_handler != ex_replaceable) |
1184 | return -EBUSY; | 1185 | return -EBUSY; |
1185 | 1186 | ||
1186 | ex_table[exception] = handler; | 1187 | ex_table[exception] = handler; |
1187 | 1188 | ||
1188 | return 0; | 1189 | return 0; |
1189 | } | 1190 | } |
1190 | EXPORT_SYMBOL(bfin_request_exception); | 1191 | EXPORT_SYMBOL(bfin_request_exception); |
1191 | 1192 | ||
1192 | int bfin_free_exception(unsigned int exception, void (*handler)(void)) | 1193 | int bfin_free_exception(unsigned int exception, void (*handler)(void)) |
1193 | { | 1194 | { |
1194 | void (*curr_handler)(void); | 1195 | void (*curr_handler)(void); |
1195 | 1196 | ||
1196 | if (exception > 0x3F) | 1197 | if (exception > 0x3F) |
1197 | return -EINVAL; | 1198 | return -EINVAL; |
1198 | 1199 | ||
1199 | curr_handler = ex_table[exception]; | 1200 | curr_handler = ex_table[exception]; |
1200 | 1201 | ||
1201 | if (curr_handler != handler) | 1202 | if (curr_handler != handler) |
1202 | return -EBUSY; | 1203 | return -EBUSY; |
1203 | 1204 | ||
1204 | ex_table[exception] = ex_replaceable; | 1205 | ex_table[exception] = ex_replaceable; |
1205 | 1206 | ||
1206 | return 0; | 1207 | return 0; |
1207 | } | 1208 | } |
1208 | EXPORT_SYMBOL(bfin_free_exception); | 1209 | EXPORT_SYMBOL(bfin_free_exception); |
1209 | 1210 | ||
1210 | void panic_cplb_error(int cplb_panic, struct pt_regs *fp) | 1211 | void panic_cplb_error(int cplb_panic, struct pt_regs *fp) |
1211 | { | 1212 | { |
1212 | switch (cplb_panic) { | 1213 | switch (cplb_panic) { |
1213 | case CPLB_NO_UNLOCKED: | 1214 | case CPLB_NO_UNLOCKED: |
1214 | printk(KERN_EMERG "All CPLBs are locked\n"); | 1215 | printk(KERN_EMERG "All CPLBs are locked\n"); |
1215 | break; | 1216 | break; |
1216 | case CPLB_PROT_VIOL: | 1217 | case CPLB_PROT_VIOL: |
1217 | return; | 1218 | return; |
1218 | case CPLB_NO_ADDR_MATCH: | 1219 | case CPLB_NO_ADDR_MATCH: |
1219 | return; | 1220 | return; |
1220 | case CPLB_UNKNOWN_ERR: | 1221 | case CPLB_UNKNOWN_ERR: |
1221 | printk(KERN_EMERG "Unknown CPLB Exception\n"); | 1222 | printk(KERN_EMERG "Unknown CPLB Exception\n"); |
1222 | break; | 1223 | break; |
1223 | } | 1224 | } |
1224 | 1225 | ||
1225 | oops_in_progress = 1; | 1226 | oops_in_progress = 1; |
1226 | 1227 | ||
1227 | dump_bfin_process(fp); | 1228 | dump_bfin_process(fp); |
1228 | dump_bfin_mem(fp); | 1229 | dump_bfin_mem(fp); |
1229 | show_regs(fp); | 1230 | show_regs(fp); |
1230 | dump_stack(); | 1231 | dump_stack(); |
arch/frv/kernel/ptrace.c
1 | /* ptrace.c: FRV specific parts of process tracing | 1 | /* ptrace.c: FRV specific parts of process tracing |
2 | * | 2 | * |
3 | * Copyright (C) 2003-5 Red Hat, Inc. All Rights Reserved. | 3 | * Copyright (C) 2003-5 Red Hat, Inc. All Rights Reserved. |
4 | * Written by David Howells (dhowells@redhat.com) | 4 | * Written by David Howells (dhowells@redhat.com) |
5 | * - Derived from arch/m68k/kernel/ptrace.c | 5 | * - Derived from arch/m68k/kernel/ptrace.c |
6 | * | 6 | * |
7 | * This program is free software; you can redistribute it and/or | 7 | * This program is free software; you can redistribute it and/or |
8 | * modify it under the terms of the GNU General Public License | 8 | * modify it under the terms of the GNU General Public License |
9 | * as published by the Free Software Foundation; either version | 9 | * as published by the Free Software Foundation; either version |
10 | * 2 of the License, or (at your option) any later version. | 10 | * 2 of the License, or (at your option) any later version. |
11 | */ | 11 | */ |
12 | 12 | ||
13 | #include <linux/kernel.h> | 13 | #include <linux/kernel.h> |
14 | #include <linux/sched.h> | 14 | #include <linux/sched.h> |
15 | #include <linux/mm.h> | 15 | #include <linux/mm.h> |
16 | #include <linux/smp.h> | 16 | #include <linux/smp.h> |
17 | #include <linux/errno.h> | 17 | #include <linux/errno.h> |
18 | #include <linux/ptrace.h> | 18 | #include <linux/ptrace.h> |
19 | #include <linux/user.h> | 19 | #include <linux/user.h> |
20 | #include <linux/security.h> | 20 | #include <linux/security.h> |
21 | #include <linux/signal.h> | 21 | #include <linux/signal.h> |
22 | 22 | ||
23 | #include <asm/uaccess.h> | 23 | #include <asm/uaccess.h> |
24 | #include <asm/page.h> | 24 | #include <asm/page.h> |
25 | #include <asm/pgtable.h> | 25 | #include <asm/pgtable.h> |
26 | #include <asm/system.h> | 26 | #include <asm/system.h> |
27 | #include <asm/processor.h> | 27 | #include <asm/processor.h> |
28 | #include <asm/unistd.h> | 28 | #include <asm/unistd.h> |
29 | 29 | ||
30 | /* | 30 | /* |
31 | * does not yet catch signals sent when the child dies. | 31 | * does not yet catch signals sent when the child dies. |
32 | * in exit.c or in signal.c. | 32 | * in exit.c or in signal.c. |
33 | */ | 33 | */ |
34 | 34 | ||
35 | /* | 35 | /* |
36 | * Get contents of register REGNO in task TASK. | 36 | * Get contents of register REGNO in task TASK. |
37 | */ | 37 | */ |
38 | static inline long get_reg(struct task_struct *task, int regno) | 38 | static inline long get_reg(struct task_struct *task, int regno) |
39 | { | 39 | { |
40 | struct user_context *user = task->thread.user; | 40 | struct user_context *user = task->thread.user; |
41 | 41 | ||
42 | if (regno < 0 || regno >= PT__END) | 42 | if (regno < 0 || regno >= PT__END) |
43 | return 0; | 43 | return 0; |
44 | 44 | ||
45 | return ((unsigned long *) user)[regno]; | 45 | return ((unsigned long *) user)[regno]; |
46 | } | 46 | } |
47 | 47 | ||
48 | /* | 48 | /* |
49 | * Write contents of register REGNO in task TASK. | 49 | * Write contents of register REGNO in task TASK. |
50 | */ | 50 | */ |
51 | static inline int put_reg(struct task_struct *task, int regno, | 51 | static inline int put_reg(struct task_struct *task, int regno, |
52 | unsigned long data) | 52 | unsigned long data) |
53 | { | 53 | { |
54 | struct user_context *user = task->thread.user; | 54 | struct user_context *user = task->thread.user; |
55 | 55 | ||
56 | if (regno < 0 || regno >= PT__END) | 56 | if (regno < 0 || regno >= PT__END) |
57 | return -EIO; | 57 | return -EIO; |
58 | 58 | ||
59 | switch (regno) { | 59 | switch (regno) { |
60 | case PT_GR(0): | 60 | case PT_GR(0): |
61 | return 0; | 61 | return 0; |
62 | case PT_PSR: | 62 | case PT_PSR: |
63 | case PT__STATUS: | 63 | case PT__STATUS: |
64 | return -EIO; | 64 | return -EIO; |
65 | default: | 65 | default: |
66 | ((unsigned long *) user)[regno] = data; | 66 | ((unsigned long *) user)[regno] = data; |
67 | return 0; | 67 | return 0; |
68 | } | 68 | } |
69 | } | 69 | } |
70 | 70 | ||
71 | /* | 71 | /* |
72 | * check that an address falls within the bounds of the target process's memory mappings | 72 | * check that an address falls within the bounds of the target process's memory |
73 | * mappings | ||
73 | */ | 74 | */ |
74 | static inline int is_user_addr_valid(struct task_struct *child, | 75 | static inline int is_user_addr_valid(struct task_struct *child, |
75 | unsigned long start, unsigned long len) | 76 | unsigned long start, unsigned long len) |
76 | { | 77 | { |
77 | #ifdef CONFIG_MMU | 78 | #ifdef CONFIG_MMU |
78 | if (start >= PAGE_OFFSET || len > PAGE_OFFSET - start) | 79 | if (start >= PAGE_OFFSET || len > PAGE_OFFSET - start) |
79 | return -EIO; | 80 | return -EIO; |
80 | return 0; | 81 | return 0; |
81 | #else | 82 | #else |
82 | struct vm_list_struct *vml; | 83 | struct vm_area_struct *vma; |
83 | 84 | ||
84 | for (vml = child->mm->context.vmlist; vml; vml = vml->next) | 85 | vma = find_vma(child->mm, start); |
85 | if (start >= vml->vma->vm_start && start + len <= vml->vma->vm_end) | 86 | if (vma && start >= vma->vm_start && start + len <= vma->vm_end) |
86 | return 0; | 87 | return 0; |
87 | 88 | ||
88 | return -EIO; | 89 | return -EIO; |
89 | #endif | 90 | #endif |
90 | } | 91 | } |
91 | 92 | ||
92 | /* | 93 | /* |
93 | * Called by kernel/ptrace.c when detaching.. | 94 | * Called by kernel/ptrace.c when detaching.. |
94 | * | 95 | * |
95 | * Control h/w single stepping | 96 | * Control h/w single stepping |
96 | */ | 97 | */ |
97 | void ptrace_disable(struct task_struct *child) | 98 | void ptrace_disable(struct task_struct *child) |
98 | { | 99 | { |
99 | child->thread.frame0->__status &= ~REG__STATUS_STEP; | 100 | child->thread.frame0->__status &= ~REG__STATUS_STEP; |
100 | } | 101 | } |
101 | 102 | ||
102 | void ptrace_enable(struct task_struct *child) | 103 | void ptrace_enable(struct task_struct *child) |
103 | { | 104 | { |
104 | child->thread.frame0->__status |= REG__STATUS_STEP; | 105 | child->thread.frame0->__status |= REG__STATUS_STEP; |
105 | } | 106 | } |
106 | 107 | ||
107 | long arch_ptrace(struct task_struct *child, long request, long addr, long data) | 108 | long arch_ptrace(struct task_struct *child, long request, long addr, long data) |
108 | { | 109 | { |
109 | unsigned long tmp; | 110 | unsigned long tmp; |
110 | int ret; | 111 | int ret; |
111 | 112 | ||
112 | switch (request) { | 113 | switch (request) { |
113 | /* when I and D space are separate, these will need to be fixed. */ | 114 | /* when I and D space are separate, these will need to be fixed. */ |
114 | case PTRACE_PEEKTEXT: /* read word at location addr. */ | 115 | case PTRACE_PEEKTEXT: /* read word at location addr. */ |
115 | case PTRACE_PEEKDATA: | 116 | case PTRACE_PEEKDATA: |
116 | ret = -EIO; | 117 | ret = -EIO; |
117 | if (is_user_addr_valid(child, addr, sizeof(tmp)) < 0) | 118 | if (is_user_addr_valid(child, addr, sizeof(tmp)) < 0) |
118 | break; | 119 | break; |
119 | ret = generic_ptrace_peekdata(child, addr, data); | 120 | ret = generic_ptrace_peekdata(child, addr, data); |
120 | break; | 121 | break; |
121 | 122 | ||
122 | /* read the word at location addr in the USER area. */ | 123 | /* read the word at location addr in the USER area. */ |
123 | case PTRACE_PEEKUSR: { | 124 | case PTRACE_PEEKUSR: { |
124 | tmp = 0; | 125 | tmp = 0; |
125 | ret = -EIO; | 126 | ret = -EIO; |
126 | if ((addr & 3) || addr < 0) | 127 | if ((addr & 3) || addr < 0) |
127 | break; | 128 | break; |
128 | 129 | ||
129 | ret = 0; | 130 | ret = 0; |
130 | switch (addr >> 2) { | 131 | switch (addr >> 2) { |
131 | case 0 ... PT__END - 1: | 132 | case 0 ... PT__END - 1: |
132 | tmp = get_reg(child, addr >> 2); | 133 | tmp = get_reg(child, addr >> 2); |
133 | break; | 134 | break; |
134 | 135 | ||
135 | case PT__END + 0: | 136 | case PT__END + 0: |
136 | tmp = child->mm->end_code - child->mm->start_code; | 137 | tmp = child->mm->end_code - child->mm->start_code; |
137 | break; | 138 | break; |
138 | 139 | ||
139 | case PT__END + 1: | 140 | case PT__END + 1: |
140 | tmp = child->mm->end_data - child->mm->start_data; | 141 | tmp = child->mm->end_data - child->mm->start_data; |
141 | break; | 142 | break; |
142 | 143 | ||
143 | case PT__END + 2: | 144 | case PT__END + 2: |
144 | tmp = child->mm->start_stack - child->mm->start_brk; | 145 | tmp = child->mm->start_stack - child->mm->start_brk; |
145 | break; | 146 | break; |
146 | 147 | ||
147 | case PT__END + 3: | 148 | case PT__END + 3: |
148 | tmp = child->mm->start_code; | 149 | tmp = child->mm->start_code; |
149 | break; | 150 | break; |
150 | 151 | ||
151 | case PT__END + 4: | 152 | case PT__END + 4: |
152 | tmp = child->mm->start_stack; | 153 | tmp = child->mm->start_stack; |
153 | break; | 154 | break; |
154 | 155 | ||
155 | default: | 156 | default: |
156 | ret = -EIO; | 157 | ret = -EIO; |
157 | break; | 158 | break; |
158 | } | 159 | } |
159 | 160 | ||
160 | if (ret == 0) | 161 | if (ret == 0) |
161 | ret = put_user(tmp, (unsigned long *) data); | 162 | ret = put_user(tmp, (unsigned long *) data); |
162 | break; | 163 | break; |
163 | } | 164 | } |
164 | 165 | ||
165 | /* when I and D space are separate, this will have to be fixed. */ | 166 | /* when I and D space are separate, this will have to be fixed. */ |
166 | case PTRACE_POKETEXT: /* write the word at location addr. */ | 167 | case PTRACE_POKETEXT: /* write the word at location addr. */ |
167 | case PTRACE_POKEDATA: | 168 | case PTRACE_POKEDATA: |
168 | ret = -EIO; | 169 | ret = -EIO; |
169 | if (is_user_addr_valid(child, addr, sizeof(tmp)) < 0) | 170 | if (is_user_addr_valid(child, addr, sizeof(tmp)) < 0) |
170 | break; | 171 | break; |
171 | ret = generic_ptrace_pokedata(child, addr, data); | 172 | ret = generic_ptrace_pokedata(child, addr, data); |
172 | break; | 173 | break; |
173 | 174 | ||
174 | case PTRACE_POKEUSR: /* write the word at location addr in the USER area */ | 175 | case PTRACE_POKEUSR: /* write the word at location addr in the USER area */ |
175 | ret = -EIO; | 176 | ret = -EIO; |
176 | if ((addr & 3) || addr < 0) | 177 | if ((addr & 3) || addr < 0) |
177 | break; | 178 | break; |
178 | 179 | ||
179 | ret = 0; | 180 | ret = 0; |
180 | switch (addr >> 2) { | 181 | switch (addr >> 2) { |
181 | case 0 ... PT__END-1: | 182 | case 0 ... PT__END-1: |
182 | ret = put_reg(child, addr >> 2, data); | 183 | ret = put_reg(child, addr >> 2, data); |
183 | break; | 184 | break; |
184 | 185 | ||
185 | default: | 186 | default: |
186 | ret = -EIO; | 187 | ret = -EIO; |
187 | break; | 188 | break; |
188 | } | 189 | } |
189 | break; | 190 | break; |
190 | 191 | ||
191 | case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */ | 192 | case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */ |
192 | case PTRACE_CONT: /* restart after signal. */ | 193 | case PTRACE_CONT: /* restart after signal. */ |
193 | ret = -EIO; | 194 | ret = -EIO; |
194 | if (!valid_signal(data)) | 195 | if (!valid_signal(data)) |
195 | break; | 196 | break; |
196 | if (request == PTRACE_SYSCALL) | 197 | if (request == PTRACE_SYSCALL) |
197 | set_tsk_thread_flag(child, TIF_SYSCALL_TRACE); | 198 | set_tsk_thread_flag(child, TIF_SYSCALL_TRACE); |
198 | else | 199 | else |
199 | clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); | 200 | clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); |
200 | child->exit_code = data; | 201 | child->exit_code = data; |
201 | ptrace_disable(child); | 202 | ptrace_disable(child); |
202 | wake_up_process(child); | 203 | wake_up_process(child); |
203 | ret = 0; | 204 | ret = 0; |
204 | break; | 205 | break; |
205 | 206 | ||
206 | /* make the child exit. Best I can do is send it a sigkill. | 207 | /* make the child exit. Best I can do is send it a sigkill. |
207 | * perhaps it should be put in the status that it wants to | 208 | * perhaps it should be put in the status that it wants to |
208 | * exit. | 209 | * exit. |
209 | */ | 210 | */ |
210 | case PTRACE_KILL: | 211 | case PTRACE_KILL: |
211 | ret = 0; | 212 | ret = 0; |
212 | if (child->exit_state == EXIT_ZOMBIE) /* already dead */ | 213 | if (child->exit_state == EXIT_ZOMBIE) /* already dead */ |
213 | break; | 214 | break; |
214 | child->exit_code = SIGKILL; | 215 | child->exit_code = SIGKILL; |
215 | clear_tsk_thread_flag(child, TIF_SINGLESTEP); | 216 | clear_tsk_thread_flag(child, TIF_SINGLESTEP); |
216 | ptrace_disable(child); | 217 | ptrace_disable(child); |
217 | wake_up_process(child); | 218 | wake_up_process(child); |
218 | break; | 219 | break; |
219 | 220 | ||
220 | case PTRACE_SINGLESTEP: /* set the trap flag. */ | 221 | case PTRACE_SINGLESTEP: /* set the trap flag. */ |
221 | ret = -EIO; | 222 | ret = -EIO; |
222 | if (!valid_signal(data)) | 223 | if (!valid_signal(data)) |
223 | break; | 224 | break; |
224 | clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); | 225 | clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); |
225 | ptrace_enable(child); | 226 | ptrace_enable(child); |
226 | child->exit_code = data; | 227 | child->exit_code = data; |
227 | wake_up_process(child); | 228 | wake_up_process(child); |
228 | ret = 0; | 229 | ret = 0; |
229 | break; | 230 | break; |
230 | 231 | ||
231 | case PTRACE_DETACH: /* detach a process that was attached. */ | 232 | case PTRACE_DETACH: /* detach a process that was attached. */ |
232 | ret = ptrace_detach(child, data); | 233 | ret = ptrace_detach(child, data); |
233 | break; | 234 | break; |
234 | 235 | ||
235 | case PTRACE_GETREGS: { /* Get all integer regs from the child. */ | 236 | case PTRACE_GETREGS: { /* Get all integer regs from the child. */ |
236 | int i; | 237 | int i; |
237 | for (i = 0; i < PT__GPEND; i++) { | 238 | for (i = 0; i < PT__GPEND; i++) { |
238 | tmp = get_reg(child, i); | 239 | tmp = get_reg(child, i); |
239 | if (put_user(tmp, (unsigned long *) data)) { | 240 | if (put_user(tmp, (unsigned long *) data)) { |
240 | ret = -EFAULT; | 241 | ret = -EFAULT; |
241 | break; | 242 | break; |
242 | } | 243 | } |
243 | data += sizeof(long); | 244 | data += sizeof(long); |
244 | } | 245 | } |
245 | ret = 0; | 246 | ret = 0; |
246 | break; | 247 | break; |
247 | } | 248 | } |
248 | 249 | ||
249 | case PTRACE_SETREGS: { /* Set all integer regs in the child. */ | 250 | case PTRACE_SETREGS: { /* Set all integer regs in the child. */ |
250 | int i; | 251 | int i; |
251 | for (i = 0; i < PT__GPEND; i++) { | 252 | for (i = 0; i < PT__GPEND; i++) { |
252 | if (get_user(tmp, (unsigned long *) data)) { | 253 | if (get_user(tmp, (unsigned long *) data)) { |
253 | ret = -EFAULT; | 254 | ret = -EFAULT; |
254 | break; | 255 | break; |
255 | } | 256 | } |
256 | put_reg(child, i, tmp); | 257 | put_reg(child, i, tmp); |
257 | data += sizeof(long); | 258 | data += sizeof(long); |
258 | } | 259 | } |
259 | ret = 0; | 260 | ret = 0; |
260 | break; | 261 | break; |
261 | } | 262 | } |
262 | 263 | ||
263 | case PTRACE_GETFPREGS: { /* Get the child FP/Media state. */ | 264 | case PTRACE_GETFPREGS: { /* Get the child FP/Media state. */ |
264 | ret = 0; | 265 | ret = 0; |
265 | if (copy_to_user((void *) data, | 266 | if (copy_to_user((void *) data, |
266 | &child->thread.user->f, | 267 | &child->thread.user->f, |
267 | sizeof(child->thread.user->f))) | 268 | sizeof(child->thread.user->f))) |
268 | ret = -EFAULT; | 269 | ret = -EFAULT; |
269 | break; | 270 | break; |
270 | } | 271 | } |
271 | 272 | ||
272 | case PTRACE_SETFPREGS: { /* Set the child FP/Media state. */ | 273 | case PTRACE_SETFPREGS: { /* Set the child FP/Media state. */ |
273 | ret = 0; | 274 | ret = 0; |
274 | if (copy_from_user(&child->thread.user->f, | 275 | if (copy_from_user(&child->thread.user->f, |
275 | (void *) data, | 276 | (void *) data, |
276 | sizeof(child->thread.user->f))) | 277 | sizeof(child->thread.user->f))) |
277 | ret = -EFAULT; | 278 | ret = -EFAULT; |
278 | break; | 279 | break; |
279 | } | 280 | } |
280 | 281 | ||
281 | case PTRACE_GETFDPIC: | 282 | case PTRACE_GETFDPIC: |
282 | tmp = 0; | 283 | tmp = 0; |
283 | switch (addr) { | 284 | switch (addr) { |
284 | case PTRACE_GETFDPIC_EXEC: | 285 | case PTRACE_GETFDPIC_EXEC: |
285 | tmp = child->mm->context.exec_fdpic_loadmap; | 286 | tmp = child->mm->context.exec_fdpic_loadmap; |
286 | break; | 287 | break; |
287 | case PTRACE_GETFDPIC_INTERP: | 288 | case PTRACE_GETFDPIC_INTERP: |
288 | tmp = child->mm->context.interp_fdpic_loadmap; | 289 | tmp = child->mm->context.interp_fdpic_loadmap; |
289 | break; | 290 | break; |
290 | default: | 291 | default: |
291 | break; | 292 | break; |
292 | } | 293 | } |
293 | 294 | ||
294 | ret = 0; | 295 | ret = 0; |
295 | if (put_user(tmp, (unsigned long *) data)) { | 296 | if (put_user(tmp, (unsigned long *) data)) { |
296 | ret = -EFAULT; | 297 | ret = -EFAULT; |
297 | break; | 298 | break; |
298 | } | 299 | } |
299 | break; | 300 | break; |
300 | 301 | ||
301 | default: | 302 | default: |
302 | ret = -EIO; | 303 | ret = -EIO; |
303 | break; | 304 | break; |
304 | } | 305 | } |
305 | return ret; | 306 | return ret; |
306 | } | 307 | } |
307 | 308 | ||
/* non-zero enables kernel-side syscall tracing in do_syscall_trace();
 * __nongprelbss presumably keeps it out of the GP-relative small-data
 * section — TODO confirm against the frv linker script */
int __nongprelbss kstrace;
309 | 310 | ||
310 | static const struct { | 311 | static const struct { |
311 | const char *name; | 312 | const char *name; |
312 | unsigned argmask; | 313 | unsigned argmask; |
313 | } __syscall_name_table[NR_syscalls] = { | 314 | } __syscall_name_table[NR_syscalls] = { |
314 | [0] = { "restart_syscall" }, | 315 | [0] = { "restart_syscall" }, |
315 | [1] = { "exit", 0x000001 }, | 316 | [1] = { "exit", 0x000001 }, |
316 | [2] = { "fork", 0xffffff }, | 317 | [2] = { "fork", 0xffffff }, |
317 | [3] = { "read", 0x000141 }, | 318 | [3] = { "read", 0x000141 }, |
318 | [4] = { "write", 0x000141 }, | 319 | [4] = { "write", 0x000141 }, |
319 | [5] = { "open", 0x000235 }, | 320 | [5] = { "open", 0x000235 }, |
320 | [6] = { "close", 0x000001 }, | 321 | [6] = { "close", 0x000001 }, |
321 | [7] = { "waitpid", 0x000141 }, | 322 | [7] = { "waitpid", 0x000141 }, |
322 | [8] = { "creat", 0x000025 }, | 323 | [8] = { "creat", 0x000025 }, |
323 | [9] = { "link", 0x000055 }, | 324 | [9] = { "link", 0x000055 }, |
324 | [10] = { "unlink", 0x000005 }, | 325 | [10] = { "unlink", 0x000005 }, |
325 | [11] = { "execve", 0x000445 }, | 326 | [11] = { "execve", 0x000445 }, |
326 | [12] = { "chdir", 0x000005 }, | 327 | [12] = { "chdir", 0x000005 }, |
327 | [13] = { "time", 0x000004 }, | 328 | [13] = { "time", 0x000004 }, |
328 | [14] = { "mknod", 0x000325 }, | 329 | [14] = { "mknod", 0x000325 }, |
329 | [15] = { "chmod", 0x000025 }, | 330 | [15] = { "chmod", 0x000025 }, |
330 | [16] = { "lchown", 0x000025 }, | 331 | [16] = { "lchown", 0x000025 }, |
331 | [17] = { "break" }, | 332 | [17] = { "break" }, |
332 | [18] = { "oldstat", 0x000045 }, | 333 | [18] = { "oldstat", 0x000045 }, |
333 | [19] = { "lseek", 0x000131 }, | 334 | [19] = { "lseek", 0x000131 }, |
334 | [20] = { "getpid", 0xffffff }, | 335 | [20] = { "getpid", 0xffffff }, |
335 | [21] = { "mount", 0x043555 }, | 336 | [21] = { "mount", 0x043555 }, |
336 | [22] = { "umount", 0x000005 }, | 337 | [22] = { "umount", 0x000005 }, |
337 | [23] = { "setuid", 0x000001 }, | 338 | [23] = { "setuid", 0x000001 }, |
338 | [24] = { "getuid", 0xffffff }, | 339 | [24] = { "getuid", 0xffffff }, |
339 | [25] = { "stime", 0x000004 }, | 340 | [25] = { "stime", 0x000004 }, |
340 | [26] = { "ptrace", 0x004413 }, | 341 | [26] = { "ptrace", 0x004413 }, |
341 | [27] = { "alarm", 0x000001 }, | 342 | [27] = { "alarm", 0x000001 }, |
342 | [28] = { "oldfstat", 0x000041 }, | 343 | [28] = { "oldfstat", 0x000041 }, |
343 | [29] = { "pause", 0xffffff }, | 344 | [29] = { "pause", 0xffffff }, |
344 | [30] = { "utime", 0x000045 }, | 345 | [30] = { "utime", 0x000045 }, |
345 | [31] = { "stty" }, | 346 | [31] = { "stty" }, |
346 | [32] = { "gtty" }, | 347 | [32] = { "gtty" }, |
347 | [33] = { "access", 0x000025 }, | 348 | [33] = { "access", 0x000025 }, |
348 | [34] = { "nice", 0x000001 }, | 349 | [34] = { "nice", 0x000001 }, |
349 | [35] = { "ftime" }, | 350 | [35] = { "ftime" }, |
350 | [36] = { "sync", 0xffffff }, | 351 | [36] = { "sync", 0xffffff }, |
351 | [37] = { "kill", 0x000011 }, | 352 | [37] = { "kill", 0x000011 }, |
352 | [38] = { "rename", 0x000055 }, | 353 | [38] = { "rename", 0x000055 }, |
353 | [39] = { "mkdir", 0x000025 }, | 354 | [39] = { "mkdir", 0x000025 }, |
354 | [40] = { "rmdir", 0x000005 }, | 355 | [40] = { "rmdir", 0x000005 }, |
355 | [41] = { "dup", 0x000001 }, | 356 | [41] = { "dup", 0x000001 }, |
356 | [42] = { "pipe", 0x000004 }, | 357 | [42] = { "pipe", 0x000004 }, |
357 | [43] = { "times", 0x000004 }, | 358 | [43] = { "times", 0x000004 }, |
358 | [44] = { "prof" }, | 359 | [44] = { "prof" }, |
359 | [45] = { "brk", 0x000004 }, | 360 | [45] = { "brk", 0x000004 }, |
360 | [46] = { "setgid", 0x000001 }, | 361 | [46] = { "setgid", 0x000001 }, |
361 | [47] = { "getgid", 0xffffff }, | 362 | [47] = { "getgid", 0xffffff }, |
362 | [48] = { "signal", 0x000041 }, | 363 | [48] = { "signal", 0x000041 }, |
363 | [49] = { "geteuid", 0xffffff }, | 364 | [49] = { "geteuid", 0xffffff }, |
364 | [50] = { "getegid", 0xffffff }, | 365 | [50] = { "getegid", 0xffffff }, |
365 | [51] = { "acct", 0x000005 }, | 366 | [51] = { "acct", 0x000005 }, |
366 | [52] = { "umount2", 0x000035 }, | 367 | [52] = { "umount2", 0x000035 }, |
367 | [53] = { "lock" }, | 368 | [53] = { "lock" }, |
368 | [54] = { "ioctl", 0x000331 }, | 369 | [54] = { "ioctl", 0x000331 }, |
369 | [55] = { "fcntl", 0x000331 }, | 370 | [55] = { "fcntl", 0x000331 }, |
370 | [56] = { "mpx" }, | 371 | [56] = { "mpx" }, |
371 | [57] = { "setpgid", 0x000011 }, | 372 | [57] = { "setpgid", 0x000011 }, |
372 | [58] = { "ulimit" }, | 373 | [58] = { "ulimit" }, |
373 | [60] = { "umask", 0x000002 }, | 374 | [60] = { "umask", 0x000002 }, |
374 | [61] = { "chroot", 0x000005 }, | 375 | [61] = { "chroot", 0x000005 }, |
375 | [62] = { "ustat", 0x000043 }, | 376 | [62] = { "ustat", 0x000043 }, |
376 | [63] = { "dup2", 0x000011 }, | 377 | [63] = { "dup2", 0x000011 }, |
377 | [64] = { "getppid", 0xffffff }, | 378 | [64] = { "getppid", 0xffffff }, |
378 | [65] = { "getpgrp", 0xffffff }, | 379 | [65] = { "getpgrp", 0xffffff }, |
379 | [66] = { "setsid", 0xffffff }, | 380 | [66] = { "setsid", 0xffffff }, |
380 | [67] = { "sigaction" }, | 381 | [67] = { "sigaction" }, |
381 | [68] = { "sgetmask" }, | 382 | [68] = { "sgetmask" }, |
382 | [69] = { "ssetmask" }, | 383 | [69] = { "ssetmask" }, |
383 | [70] = { "setreuid" }, | 384 | [70] = { "setreuid" }, |
384 | [71] = { "setregid" }, | 385 | [71] = { "setregid" }, |
385 | [72] = { "sigsuspend" }, | 386 | [72] = { "sigsuspend" }, |
386 | [73] = { "sigpending" }, | 387 | [73] = { "sigpending" }, |
387 | [74] = { "sethostname" }, | 388 | [74] = { "sethostname" }, |
388 | [75] = { "setrlimit" }, | 389 | [75] = { "setrlimit" }, |
389 | [76] = { "getrlimit" }, | 390 | [76] = { "getrlimit" }, |
390 | [77] = { "getrusage" }, | 391 | [77] = { "getrusage" }, |
391 | [78] = { "gettimeofday" }, | 392 | [78] = { "gettimeofday" }, |
392 | [79] = { "settimeofday" }, | 393 | [79] = { "settimeofday" }, |
393 | [80] = { "getgroups" }, | 394 | [80] = { "getgroups" }, |
394 | [81] = { "setgroups" }, | 395 | [81] = { "setgroups" }, |
395 | [82] = { "select" }, | 396 | [82] = { "select" }, |
396 | [83] = { "symlink" }, | 397 | [83] = { "symlink" }, |
397 | [84] = { "oldlstat" }, | 398 | [84] = { "oldlstat" }, |
398 | [85] = { "readlink" }, | 399 | [85] = { "readlink" }, |
399 | [86] = { "uselib" }, | 400 | [86] = { "uselib" }, |
400 | [87] = { "swapon" }, | 401 | [87] = { "swapon" }, |
401 | [88] = { "reboot" }, | 402 | [88] = { "reboot" }, |
402 | [89] = { "readdir" }, | 403 | [89] = { "readdir" }, |
403 | [91] = { "munmap", 0x000034 }, | 404 | [91] = { "munmap", 0x000034 }, |
404 | [92] = { "truncate" }, | 405 | [92] = { "truncate" }, |
405 | [93] = { "ftruncate" }, | 406 | [93] = { "ftruncate" }, |
406 | [94] = { "fchmod" }, | 407 | [94] = { "fchmod" }, |
407 | [95] = { "fchown" }, | 408 | [95] = { "fchown" }, |
408 | [96] = { "getpriority" }, | 409 | [96] = { "getpriority" }, |
409 | [97] = { "setpriority" }, | 410 | [97] = { "setpriority" }, |
410 | [99] = { "statfs" }, | 411 | [99] = { "statfs" }, |
411 | [100] = { "fstatfs" }, | 412 | [100] = { "fstatfs" }, |
412 | [102] = { "socketcall" }, | 413 | [102] = { "socketcall" }, |
413 | [103] = { "syslog" }, | 414 | [103] = { "syslog" }, |
414 | [104] = { "setitimer" }, | 415 | [104] = { "setitimer" }, |
415 | [105] = { "getitimer" }, | 416 | [105] = { "getitimer" }, |
416 | [106] = { "stat" }, | 417 | [106] = { "stat" }, |
417 | [107] = { "lstat" }, | 418 | [107] = { "lstat" }, |
418 | [108] = { "fstat" }, | 419 | [108] = { "fstat" }, |
419 | [111] = { "vhangup" }, | 420 | [111] = { "vhangup" }, |
420 | [114] = { "wait4" }, | 421 | [114] = { "wait4" }, |
421 | [115] = { "swapoff" }, | 422 | [115] = { "swapoff" }, |
422 | [116] = { "sysinfo" }, | 423 | [116] = { "sysinfo" }, |
423 | [117] = { "ipc" }, | 424 | [117] = { "ipc" }, |
424 | [118] = { "fsync" }, | 425 | [118] = { "fsync" }, |
425 | [119] = { "sigreturn" }, | 426 | [119] = { "sigreturn" }, |
426 | [120] = { "clone" }, | 427 | [120] = { "clone" }, |
427 | [121] = { "setdomainname" }, | 428 | [121] = { "setdomainname" }, |
428 | [122] = { "uname" }, | 429 | [122] = { "uname" }, |
429 | [123] = { "modify_ldt" }, | 430 | [123] = { "modify_ldt" }, |
430 | [123] = { "cacheflush" }, | 431 | [123] = { "cacheflush" }, |
431 | [124] = { "adjtimex" }, | 432 | [124] = { "adjtimex" }, |
432 | [125] = { "mprotect" }, | 433 | [125] = { "mprotect" }, |
433 | [126] = { "sigprocmask" }, | 434 | [126] = { "sigprocmask" }, |
434 | [127] = { "create_module" }, | 435 | [127] = { "create_module" }, |
435 | [128] = { "init_module" }, | 436 | [128] = { "init_module" }, |
436 | [129] = { "delete_module" }, | 437 | [129] = { "delete_module" }, |
437 | [130] = { "get_kernel_syms" }, | 438 | [130] = { "get_kernel_syms" }, |
438 | [131] = { "quotactl" }, | 439 | [131] = { "quotactl" }, |
439 | [132] = { "getpgid" }, | 440 | [132] = { "getpgid" }, |
440 | [133] = { "fchdir" }, | 441 | [133] = { "fchdir" }, |
441 | [134] = { "bdflush" }, | 442 | [134] = { "bdflush" }, |
442 | [135] = { "sysfs" }, | 443 | [135] = { "sysfs" }, |
443 | [136] = { "personality" }, | 444 | [136] = { "personality" }, |
444 | [137] = { "afs_syscall" }, | 445 | [137] = { "afs_syscall" }, |
445 | [138] = { "setfsuid" }, | 446 | [138] = { "setfsuid" }, |
446 | [139] = { "setfsgid" }, | 447 | [139] = { "setfsgid" }, |
447 | [140] = { "_llseek", 0x014331 }, | 448 | [140] = { "_llseek", 0x014331 }, |
448 | [141] = { "getdents" }, | 449 | [141] = { "getdents" }, |
449 | [142] = { "_newselect", 0x000141 }, | 450 | [142] = { "_newselect", 0x000141 }, |
450 | [143] = { "flock" }, | 451 | [143] = { "flock" }, |
451 | [144] = { "msync" }, | 452 | [144] = { "msync" }, |
452 | [145] = { "readv" }, | 453 | [145] = { "readv" }, |
453 | [146] = { "writev" }, | 454 | [146] = { "writev" }, |
454 | [147] = { "getsid", 0x000001 }, | 455 | [147] = { "getsid", 0x000001 }, |
455 | [148] = { "fdatasync", 0x000001 }, | 456 | [148] = { "fdatasync", 0x000001 }, |
456 | [149] = { "_sysctl", 0x000004 }, | 457 | [149] = { "_sysctl", 0x000004 }, |
457 | [150] = { "mlock" }, | 458 | [150] = { "mlock" }, |
458 | [151] = { "munlock" }, | 459 | [151] = { "munlock" }, |
459 | [152] = { "mlockall" }, | 460 | [152] = { "mlockall" }, |
460 | [153] = { "munlockall" }, | 461 | [153] = { "munlockall" }, |
461 | [154] = { "sched_setparam" }, | 462 | [154] = { "sched_setparam" }, |
462 | [155] = { "sched_getparam" }, | 463 | [155] = { "sched_getparam" }, |
463 | [156] = { "sched_setscheduler" }, | 464 | [156] = { "sched_setscheduler" }, |
464 | [157] = { "sched_getscheduler" }, | 465 | [157] = { "sched_getscheduler" }, |
465 | [158] = { "sched_yield" }, | 466 | [158] = { "sched_yield" }, |
466 | [159] = { "sched_get_priority_max" }, | 467 | [159] = { "sched_get_priority_max" }, |
467 | [160] = { "sched_get_priority_min" }, | 468 | [160] = { "sched_get_priority_min" }, |
468 | [161] = { "sched_rr_get_interval" }, | 469 | [161] = { "sched_rr_get_interval" }, |
469 | [162] = { "nanosleep", 0x000044 }, | 470 | [162] = { "nanosleep", 0x000044 }, |
470 | [163] = { "mremap" }, | 471 | [163] = { "mremap" }, |
471 | [164] = { "setresuid" }, | 472 | [164] = { "setresuid" }, |
472 | [165] = { "getresuid" }, | 473 | [165] = { "getresuid" }, |
473 | [166] = { "vm86" }, | 474 | [166] = { "vm86" }, |
474 | [167] = { "query_module" }, | 475 | [167] = { "query_module" }, |
475 | [168] = { "poll" }, | 476 | [168] = { "poll" }, |
476 | [169] = { "nfsservctl" }, | 477 | [169] = { "nfsservctl" }, |
477 | [170] = { "setresgid" }, | 478 | [170] = { "setresgid" }, |
478 | [171] = { "getresgid" }, | 479 | [171] = { "getresgid" }, |
479 | [172] = { "prctl", 0x333331 }, | 480 | [172] = { "prctl", 0x333331 }, |
480 | [173] = { "rt_sigreturn", 0xffffff }, | 481 | [173] = { "rt_sigreturn", 0xffffff }, |
481 | [174] = { "rt_sigaction", 0x001441 }, | 482 | [174] = { "rt_sigaction", 0x001441 }, |
482 | [175] = { "rt_sigprocmask", 0x001441 }, | 483 | [175] = { "rt_sigprocmask", 0x001441 }, |
483 | [176] = { "rt_sigpending", 0x000014 }, | 484 | [176] = { "rt_sigpending", 0x000014 }, |
484 | [177] = { "rt_sigtimedwait", 0x001444 }, | 485 | [177] = { "rt_sigtimedwait", 0x001444 }, |
485 | [178] = { "rt_sigqueueinfo", 0x000411 }, | 486 | [178] = { "rt_sigqueueinfo", 0x000411 }, |
486 | [179] = { "rt_sigsuspend", 0x000014 }, | 487 | [179] = { "rt_sigsuspend", 0x000014 }, |
487 | [180] = { "pread", 0x003341 }, | 488 | [180] = { "pread", 0x003341 }, |
488 | [181] = { "pwrite", 0x003341 }, | 489 | [181] = { "pwrite", 0x003341 }, |
489 | [182] = { "chown", 0x000115 }, | 490 | [182] = { "chown", 0x000115 }, |
490 | [183] = { "getcwd" }, | 491 | [183] = { "getcwd" }, |
491 | [184] = { "capget" }, | 492 | [184] = { "capget" }, |
492 | [185] = { "capset" }, | 493 | [185] = { "capset" }, |
493 | [186] = { "sigaltstack" }, | 494 | [186] = { "sigaltstack" }, |
494 | [187] = { "sendfile" }, | 495 | [187] = { "sendfile" }, |
495 | [188] = { "getpmsg" }, | 496 | [188] = { "getpmsg" }, |
496 | [189] = { "putpmsg" }, | 497 | [189] = { "putpmsg" }, |
497 | [190] = { "vfork", 0xffffff }, | 498 | [190] = { "vfork", 0xffffff }, |
498 | [191] = { "ugetrlimit" }, | 499 | [191] = { "ugetrlimit" }, |
499 | [192] = { "mmap2", 0x313314 }, | 500 | [192] = { "mmap2", 0x313314 }, |
500 | [193] = { "truncate64" }, | 501 | [193] = { "truncate64" }, |
501 | [194] = { "ftruncate64" }, | 502 | [194] = { "ftruncate64" }, |
502 | [195] = { "stat64", 0x000045 }, | 503 | [195] = { "stat64", 0x000045 }, |
503 | [196] = { "lstat64", 0x000045 }, | 504 | [196] = { "lstat64", 0x000045 }, |
504 | [197] = { "fstat64", 0x000041 }, | 505 | [197] = { "fstat64", 0x000041 }, |
505 | [198] = { "lchown32" }, | 506 | [198] = { "lchown32" }, |
506 | [199] = { "getuid32", 0xffffff }, | 507 | [199] = { "getuid32", 0xffffff }, |
507 | [200] = { "getgid32", 0xffffff }, | 508 | [200] = { "getgid32", 0xffffff }, |
508 | [201] = { "geteuid32", 0xffffff }, | 509 | [201] = { "geteuid32", 0xffffff }, |
509 | [202] = { "getegid32", 0xffffff }, | 510 | [202] = { "getegid32", 0xffffff }, |
510 | [203] = { "setreuid32" }, | 511 | [203] = { "setreuid32" }, |
511 | [204] = { "setregid32" }, | 512 | [204] = { "setregid32" }, |
512 | [205] = { "getgroups32" }, | 513 | [205] = { "getgroups32" }, |
513 | [206] = { "setgroups32" }, | 514 | [206] = { "setgroups32" }, |
514 | [207] = { "fchown32" }, | 515 | [207] = { "fchown32" }, |
515 | [208] = { "setresuid32" }, | 516 | [208] = { "setresuid32" }, |
516 | [209] = { "getresuid32" }, | 517 | [209] = { "getresuid32" }, |
517 | [210] = { "setresgid32" }, | 518 | [210] = { "setresgid32" }, |
518 | [211] = { "getresgid32" }, | 519 | [211] = { "getresgid32" }, |
519 | [212] = { "chown32" }, | 520 | [212] = { "chown32" }, |
520 | [213] = { "setuid32" }, | 521 | [213] = { "setuid32" }, |
521 | [214] = { "setgid32" }, | 522 | [214] = { "setgid32" }, |
522 | [215] = { "setfsuid32" }, | 523 | [215] = { "setfsuid32" }, |
523 | [216] = { "setfsgid32" }, | 524 | [216] = { "setfsgid32" }, |
524 | [217] = { "pivot_root" }, | 525 | [217] = { "pivot_root" }, |
525 | [218] = { "mincore" }, | 526 | [218] = { "mincore" }, |
526 | [219] = { "madvise" }, | 527 | [219] = { "madvise" }, |
527 | [220] = { "getdents64" }, | 528 | [220] = { "getdents64" }, |
528 | [221] = { "fcntl64" }, | 529 | [221] = { "fcntl64" }, |
529 | [223] = { "security" }, | 530 | [223] = { "security" }, |
530 | [224] = { "gettid" }, | 531 | [224] = { "gettid" }, |
531 | [225] = { "readahead" }, | 532 | [225] = { "readahead" }, |
532 | [226] = { "setxattr" }, | 533 | [226] = { "setxattr" }, |
533 | [227] = { "lsetxattr" }, | 534 | [227] = { "lsetxattr" }, |
534 | [228] = { "fsetxattr" }, | 535 | [228] = { "fsetxattr" }, |
535 | [229] = { "getxattr" }, | 536 | [229] = { "getxattr" }, |
536 | [230] = { "lgetxattr" }, | 537 | [230] = { "lgetxattr" }, |
537 | [231] = { "fgetxattr" }, | 538 | [231] = { "fgetxattr" }, |
538 | [232] = { "listxattr" }, | 539 | [232] = { "listxattr" }, |
539 | [233] = { "llistxattr" }, | 540 | [233] = { "llistxattr" }, |
540 | [234] = { "flistxattr" }, | 541 | [234] = { "flistxattr" }, |
541 | [235] = { "removexattr" }, | 542 | [235] = { "removexattr" }, |
542 | [236] = { "lremovexattr" }, | 543 | [236] = { "lremovexattr" }, |
543 | [237] = { "fremovexattr" }, | 544 | [237] = { "fremovexattr" }, |
544 | [238] = { "tkill" }, | 545 | [238] = { "tkill" }, |
545 | [239] = { "sendfile64" }, | 546 | [239] = { "sendfile64" }, |
546 | [240] = { "futex" }, | 547 | [240] = { "futex" }, |
547 | [241] = { "sched_setaffinity" }, | 548 | [241] = { "sched_setaffinity" }, |
548 | [242] = { "sched_getaffinity" }, | 549 | [242] = { "sched_getaffinity" }, |
549 | [243] = { "set_thread_area" }, | 550 | [243] = { "set_thread_area" }, |
550 | [244] = { "get_thread_area" }, | 551 | [244] = { "get_thread_area" }, |
551 | [245] = { "io_setup" }, | 552 | [245] = { "io_setup" }, |
552 | [246] = { "io_destroy" }, | 553 | [246] = { "io_destroy" }, |
553 | [247] = { "io_getevents" }, | 554 | [247] = { "io_getevents" }, |
554 | [248] = { "io_submit" }, | 555 | [248] = { "io_submit" }, |
555 | [249] = { "io_cancel" }, | 556 | [249] = { "io_cancel" }, |
556 | [250] = { "fadvise64" }, | 557 | [250] = { "fadvise64" }, |
557 | [252] = { "exit_group", 0x000001 }, | 558 | [252] = { "exit_group", 0x000001 }, |
558 | [253] = { "lookup_dcookie" }, | 559 | [253] = { "lookup_dcookie" }, |
559 | [254] = { "epoll_create" }, | 560 | [254] = { "epoll_create" }, |
560 | [255] = { "epoll_ctl" }, | 561 | [255] = { "epoll_ctl" }, |
561 | [256] = { "epoll_wait" }, | 562 | [256] = { "epoll_wait" }, |
562 | [257] = { "remap_file_pages" }, | 563 | [257] = { "remap_file_pages" }, |
563 | [258] = { "set_tid_address" }, | 564 | [258] = { "set_tid_address" }, |
564 | [259] = { "timer_create" }, | 565 | [259] = { "timer_create" }, |
565 | [260] = { "timer_settime" }, | 566 | [260] = { "timer_settime" }, |
566 | [261] = { "timer_gettime" }, | 567 | [261] = { "timer_gettime" }, |
567 | [262] = { "timer_getoverrun" }, | 568 | [262] = { "timer_getoverrun" }, |
568 | [263] = { "timer_delete" }, | 569 | [263] = { "timer_delete" }, |
569 | [264] = { "clock_settime" }, | 570 | [264] = { "clock_settime" }, |
570 | [265] = { "clock_gettime" }, | 571 | [265] = { "clock_gettime" }, |
571 | [266] = { "clock_getres" }, | 572 | [266] = { "clock_getres" }, |
572 | [267] = { "clock_nanosleep" }, | 573 | [267] = { "clock_nanosleep" }, |
573 | [268] = { "statfs64" }, | 574 | [268] = { "statfs64" }, |
574 | [269] = { "fstatfs64" }, | 575 | [269] = { "fstatfs64" }, |
575 | [270] = { "tgkill" }, | 576 | [270] = { "tgkill" }, |
576 | [271] = { "utimes" }, | 577 | [271] = { "utimes" }, |
577 | [272] = { "fadvise64_64" }, | 578 | [272] = { "fadvise64_64" }, |
578 | [273] = { "vserver" }, | 579 | [273] = { "vserver" }, |
579 | [274] = { "mbind" }, | 580 | [274] = { "mbind" }, |
580 | [275] = { "get_mempolicy" }, | 581 | [275] = { "get_mempolicy" }, |
581 | [276] = { "set_mempolicy" }, | 582 | [276] = { "set_mempolicy" }, |
582 | [277] = { "mq_open" }, | 583 | [277] = { "mq_open" }, |
583 | [278] = { "mq_unlink" }, | 584 | [278] = { "mq_unlink" }, |
584 | [279] = { "mq_timedsend" }, | 585 | [279] = { "mq_timedsend" }, |
585 | [280] = { "mq_timedreceive" }, | 586 | [280] = { "mq_timedreceive" }, |
586 | [281] = { "mq_notify" }, | 587 | [281] = { "mq_notify" }, |
587 | [282] = { "mq_getsetattr" }, | 588 | [282] = { "mq_getsetattr" }, |
588 | [283] = { "sys_kexec_load" }, | 589 | [283] = { "sys_kexec_load" }, |
589 | }; | 590 | }; |
590 | 591 | ||
591 | asmlinkage void do_syscall_trace(int leaving) | 592 | asmlinkage void do_syscall_trace(int leaving) |
592 | { | 593 | { |
593 | #if 0 | 594 | #if 0 |
594 | unsigned long *argp; | 595 | unsigned long *argp; |
595 | const char *name; | 596 | const char *name; |
596 | unsigned argmask; | 597 | unsigned argmask; |
597 | char buffer[16]; | 598 | char buffer[16]; |
598 | 599 | ||
599 | if (!kstrace) | 600 | if (!kstrace) |
600 | return; | 601 | return; |
601 | 602 | ||
602 | if (!current->mm) | 603 | if (!current->mm) |
603 | return; | 604 | return; |
604 | 605 | ||
605 | if (__frame->gr7 == __NR_close) | 606 | if (__frame->gr7 == __NR_close) |
606 | return; | 607 | return; |
607 | 608 | ||
608 | #if 0 | 609 | #if 0 |
609 | if (__frame->gr7 != __NR_mmap2 && | 610 | if (__frame->gr7 != __NR_mmap2 && |
610 | __frame->gr7 != __NR_vfork && | 611 | __frame->gr7 != __NR_vfork && |
611 | __frame->gr7 != __NR_execve && | 612 | __frame->gr7 != __NR_execve && |
612 | __frame->gr7 != __NR_exit) | 613 | __frame->gr7 != __NR_exit) |
613 | return; | 614 | return; |
614 | #endif | 615 | #endif |
615 | 616 | ||
616 | argmask = 0; | 617 | argmask = 0; |
617 | name = NULL; | 618 | name = NULL; |
618 | if (__frame->gr7 < NR_syscalls) { | 619 | if (__frame->gr7 < NR_syscalls) { |
619 | name = __syscall_name_table[__frame->gr7].name; | 620 | name = __syscall_name_table[__frame->gr7].name; |
620 | argmask = __syscall_name_table[__frame->gr7].argmask; | 621 | argmask = __syscall_name_table[__frame->gr7].argmask; |
621 | } | 622 | } |
622 | if (!name) { | 623 | if (!name) { |
623 | sprintf(buffer, "sys_%lx", __frame->gr7); | 624 | sprintf(buffer, "sys_%lx", __frame->gr7); |
624 | name = buffer; | 625 | name = buffer; |
625 | } | 626 | } |
626 | 627 | ||
627 | if (!leaving) { | 628 | if (!leaving) { |
628 | if (!argmask) { | 629 | if (!argmask) { |
629 | printk(KERN_CRIT "[%d] %s(%lx,%lx,%lx,%lx,%lx,%lx)\n", | 630 | printk(KERN_CRIT "[%d] %s(%lx,%lx,%lx,%lx,%lx,%lx)\n", |
630 | current->pid, | 631 | current->pid, |
631 | name, | 632 | name, |
632 | __frame->gr8, | 633 | __frame->gr8, |
633 | __frame->gr9, | 634 | __frame->gr9, |
634 | __frame->gr10, | 635 | __frame->gr10, |
635 | __frame->gr11, | 636 | __frame->gr11, |
636 | __frame->gr12, | 637 | __frame->gr12, |
637 | __frame->gr13); | 638 | __frame->gr13); |
638 | } | 639 | } |
639 | else if (argmask == 0xffffff) { | 640 | else if (argmask == 0xffffff) { |
640 | printk(KERN_CRIT "[%d] %s()\n", | 641 | printk(KERN_CRIT "[%d] %s()\n", |
641 | current->pid, | 642 | current->pid, |
642 | name); | 643 | name); |
643 | } | 644 | } |
644 | else { | 645 | else { |
645 | printk(KERN_CRIT "[%d] %s(", | 646 | printk(KERN_CRIT "[%d] %s(", |
646 | current->pid, | 647 | current->pid, |
647 | name); | 648 | name); |
648 | 649 | ||
649 | argp = &__frame->gr8; | 650 | argp = &__frame->gr8; |
650 | 651 | ||
651 | do { | 652 | do { |
652 | switch (argmask & 0xf) { | 653 | switch (argmask & 0xf) { |
653 | case 1: | 654 | case 1: |
654 | printk("%ld", (long) *argp); | 655 | printk("%ld", (long) *argp); |
655 | break; | 656 | break; |
656 | case 2: | 657 | case 2: |
657 | printk("%lo", *argp); | 658 | printk("%lo", *argp); |
658 | break; | 659 | break; |
659 | case 3: | 660 | case 3: |
660 | printk("%lx", *argp); | 661 | printk("%lx", *argp); |
661 | break; | 662 | break; |
662 | case 4: | 663 | case 4: |
663 | printk("%p", (void *) *argp); | 664 | printk("%p", (void *) *argp); |
664 | break; | 665 | break; |
665 | case 5: | 666 | case 5: |
666 | printk("\"%s\"", (char *) *argp); | 667 | printk("\"%s\"", (char *) *argp); |
667 | break; | 668 | break; |
668 | } | 669 | } |
669 | 670 | ||
670 | argp++; | 671 | argp++; |
671 | argmask >>= 4; | 672 | argmask >>= 4; |
672 | if (argmask) | 673 | if (argmask) |
673 | printk(","); | 674 | printk(","); |
674 | 675 | ||
675 | } while (argmask); | 676 | } while (argmask); |
676 | 677 | ||
677 | printk(")\n"); | 678 | printk(")\n"); |
678 | } | 679 | } |
679 | } | 680 | } |
680 | else { | 681 | else { |
681 | if ((int)__frame->gr8 > -4096 && (int)__frame->gr8 < 4096) | 682 | if ((int)__frame->gr8 > -4096 && (int)__frame->gr8 < 4096) |
682 | printk(KERN_CRIT "[%d] %s() = %ld\n", current->pid, name, __frame->gr8); | 683 | printk(KERN_CRIT "[%d] %s() = %ld\n", current->pid, name, __frame->gr8); |
683 | else | 684 | else |
684 | printk(KERN_CRIT "[%d] %s() = %lx\n", current->pid, name, __frame->gr8); | 685 | printk(KERN_CRIT "[%d] %s() = %lx\n", current->pid, name, __frame->gr8); |
685 | } | 686 | } |
686 | return; | 687 | return; |
687 | #endif | 688 | #endif |
688 | 689 | ||
689 | if (!test_thread_flag(TIF_SYSCALL_TRACE)) | 690 | if (!test_thread_flag(TIF_SYSCALL_TRACE)) |
690 | return; | 691 | return; |
691 | 692 | ||
692 | if (!(current->ptrace & PT_PTRACED)) | 693 | if (!(current->ptrace & PT_PTRACED)) |
693 | return; | 694 | return; |
694 | 695 | ||
695 | /* we need to indicate entry or exit to strace */ | 696 | /* we need to indicate entry or exit to strace */ |
696 | if (leaving) | 697 | if (leaving) |
697 | __frame->__status |= REG__STATUS_SYSC_EXIT; | 698 | __frame->__status |= REG__STATUS_SYSC_EXIT; |
698 | else | 699 | else |
699 | __frame->__status |= REG__STATUS_SYSC_ENTRY; | 700 | __frame->__status |= REG__STATUS_SYSC_ENTRY; |
700 | 701 | ||
701 | ptrace_notify(SIGTRAP); | 702 | ptrace_notify(SIGTRAP); |
702 | 703 | ||
703 | /* | 704 | /* |
704 | * this isn't the same as continuing with a signal, but it will do | 705 | * this isn't the same as continuing with a signal, but it will do |
705 | * for normal use. strace only continues with a signal if the | 706 | * for normal use. strace only continues with a signal if the |
706 | * stopping signal is not SIGTRAP. -brl | 707 | * stopping signal is not SIGTRAP. -brl |
707 | */ | 708 | */ |
708 | if (current->exit_code) { | 709 | if (current->exit_code) { |
709 | send_sig(current->exit_code, current, 1); | 710 | send_sig(current->exit_code, current, 1); |
710 | current->exit_code = 0; | 711 | current->exit_code = 0; |
711 | } | 712 | } |
712 | } | 713 | } |
713 | 714 |
arch/h8300/include/asm/mmu.h
1 | #ifndef __MMU_H | 1 | #ifndef __MMU_H |
2 | #define __MMU_H | 2 | #define __MMU_H |
3 | 3 | ||
4 | /* Copyright (C) 2002, David McCullough <davidm@snapgear.com> */ | 4 | /* Copyright (C) 2002, David McCullough <davidm@snapgear.com> */ |
5 | 5 | ||
6 | typedef struct { | 6 | typedef struct { |
7 | struct vm_list_struct *vmlist; | ||
8 | unsigned long end_brk; | 7 | unsigned long end_brk; |
9 | } mm_context_t; | 8 | } mm_context_t; |
10 | 9 | ||
11 | #endif | 10 | #endif |
12 | 11 |
arch/m68knommu/include/asm/mmu.h
1 | #ifndef __M68KNOMMU_MMU_H | 1 | #ifndef __M68KNOMMU_MMU_H |
2 | #define __M68KNOMMU_MMU_H | 2 | #define __M68KNOMMU_MMU_H |
3 | 3 | ||
4 | /* Copyright (C) 2002, David McCullough <davidm@snapgear.com> */ | 4 | /* Copyright (C) 2002, David McCullough <davidm@snapgear.com> */ |
5 | 5 | ||
6 | typedef struct { | 6 | typedef struct { |
7 | struct vm_list_struct *vmlist; | ||
8 | unsigned long end_brk; | 7 | unsigned long end_brk; |
9 | } mm_context_t; | 8 | } mm_context_t; |
10 | 9 | ||
11 | #endif /* __M68KNOMMU_MMU_H */ | 10 | #endif /* __M68KNOMMU_MMU_H */ |
12 | 11 |
arch/sh/include/asm/mmu.h
1 | #ifndef __MMU_H | 1 | #ifndef __MMU_H |
2 | #define __MMU_H | 2 | #define __MMU_H |
3 | 3 | ||
4 | /* Default "unsigned long" context */ | 4 | /* Default "unsigned long" context */ |
5 | typedef unsigned long mm_context_id_t[NR_CPUS]; | 5 | typedef unsigned long mm_context_id_t[NR_CPUS]; |
6 | 6 | ||
7 | typedef struct { | 7 | typedef struct { |
8 | #ifdef CONFIG_MMU | 8 | #ifdef CONFIG_MMU |
9 | mm_context_id_t id; | 9 | mm_context_id_t id; |
10 | void *vdso; | 10 | void *vdso; |
11 | #else | 11 | #else |
12 | struct vm_list_struct *vmlist; | ||
13 | unsigned long end_brk; | 12 | unsigned long end_brk; |
14 | #endif | 13 | #endif |
15 | #ifdef CONFIG_BINFMT_ELF_FDPIC | 14 | #ifdef CONFIG_BINFMT_ELF_FDPIC |
16 | unsigned long exec_fdpic_loadmap; | 15 | unsigned long exec_fdpic_loadmap; |
17 | unsigned long interp_fdpic_loadmap; | 16 | unsigned long interp_fdpic_loadmap; |
18 | #endif | 17 | #endif |
19 | } mm_context_t; | 18 | } mm_context_t; |
20 | 19 | ||
21 | /* | 20 | /* |
22 | * Privileged Space Mapping Buffer (PMB) definitions | 21 | * Privileged Space Mapping Buffer (PMB) definitions |
23 | */ | 22 | */ |
24 | #define PMB_PASCR 0xff000070 | 23 | #define PMB_PASCR 0xff000070 |
25 | #define PMB_IRMCR 0xff000078 | 24 | #define PMB_IRMCR 0xff000078 |
26 | 25 | ||
27 | #define PMB_ADDR 0xf6100000 | 26 | #define PMB_ADDR 0xf6100000 |
28 | #define PMB_DATA 0xf7100000 | 27 | #define PMB_DATA 0xf7100000 |
29 | #define PMB_ENTRY_MAX 16 | 28 | #define PMB_ENTRY_MAX 16 |
30 | #define PMB_E_MASK 0x0000000f | 29 | #define PMB_E_MASK 0x0000000f |
31 | #define PMB_E_SHIFT 8 | 30 | #define PMB_E_SHIFT 8 |
32 | 31 | ||
33 | #define PMB_SZ_16M 0x00000000 | 32 | #define PMB_SZ_16M 0x00000000 |
34 | #define PMB_SZ_64M 0x00000010 | 33 | #define PMB_SZ_64M 0x00000010 |
35 | #define PMB_SZ_128M 0x00000080 | 34 | #define PMB_SZ_128M 0x00000080 |
36 | #define PMB_SZ_512M 0x00000090 | 35 | #define PMB_SZ_512M 0x00000090 |
37 | #define PMB_SZ_MASK PMB_SZ_512M | 36 | #define PMB_SZ_MASK PMB_SZ_512M |
38 | #define PMB_C 0x00000008 | 37 | #define PMB_C 0x00000008 |
39 | #define PMB_WT 0x00000001 | 38 | #define PMB_WT 0x00000001 |
40 | #define PMB_UB 0x00000200 | 39 | #define PMB_UB 0x00000200 |
41 | #define PMB_V 0x00000100 | 40 | #define PMB_V 0x00000100 |
42 | 41 | ||
43 | #define PMB_NO_ENTRY (-1) | 42 | #define PMB_NO_ENTRY (-1) |
44 | 43 | ||
45 | struct pmb_entry; | 44 | struct pmb_entry; |
46 | 45 | ||
47 | struct pmb_entry { | 46 | struct pmb_entry { |
48 | unsigned long vpn; | 47 | unsigned long vpn; |
49 | unsigned long ppn; | 48 | unsigned long ppn; |
50 | unsigned long flags; | 49 | unsigned long flags; |
51 | 50 | ||
52 | /* | 51 | /* |
53 | * 0 .. NR_PMB_ENTRIES for specific entry selection, or | 52 | * 0 .. NR_PMB_ENTRIES for specific entry selection, or |
54 | * PMB_NO_ENTRY to search for a free one | 53 | * PMB_NO_ENTRY to search for a free one |
55 | */ | 54 | */ |
56 | int entry; | 55 | int entry; |
57 | 56 | ||
58 | struct pmb_entry *next; | 57 | struct pmb_entry *next; |
59 | /* Adjacent entry link for contiguous multi-entry mappings */ | 58 | /* Adjacent entry link for contiguous multi-entry mappings */ |
60 | struct pmb_entry *link; | 59 | struct pmb_entry *link; |
61 | }; | 60 | }; |
62 | 61 | ||
63 | /* arch/sh/mm/pmb.c */ | 62 | /* arch/sh/mm/pmb.c */ |
64 | int __set_pmb_entry(unsigned long vpn, unsigned long ppn, | 63 | int __set_pmb_entry(unsigned long vpn, unsigned long ppn, |
65 | unsigned long flags, int *entry); | 64 | unsigned long flags, int *entry); |
66 | int set_pmb_entry(struct pmb_entry *pmbe); | 65 | int set_pmb_entry(struct pmb_entry *pmbe); |
67 | void clear_pmb_entry(struct pmb_entry *pmbe); | 66 | void clear_pmb_entry(struct pmb_entry *pmbe); |
68 | struct pmb_entry *pmb_alloc(unsigned long vpn, unsigned long ppn, | 67 | struct pmb_entry *pmb_alloc(unsigned long vpn, unsigned long ppn, |
69 | unsigned long flags); | 68 | unsigned long flags); |
70 | void pmb_free(struct pmb_entry *pmbe); | 69 | void pmb_free(struct pmb_entry *pmbe); |
71 | long pmb_remap(unsigned long virt, unsigned long phys, | 70 | long pmb_remap(unsigned long virt, unsigned long phys, |
72 | unsigned long size, unsigned long flags); | 71 | unsigned long size, unsigned long flags); |
73 | void pmb_unmap(unsigned long addr); | 72 | void pmb_unmap(unsigned long addr); |
74 | 73 | ||
75 | #endif /* __MMU_H */ | 74 | #endif /* __MMU_H */ |
76 | 75 | ||
77 | 76 |
fs/binfmt_elf_fdpic.c
1 | /* binfmt_elf_fdpic.c: FDPIC ELF binary format | 1 | /* binfmt_elf_fdpic.c: FDPIC ELF binary format |
2 | * | 2 | * |
3 | * Copyright (C) 2003, 2004, 2006 Red Hat, Inc. All Rights Reserved. | 3 | * Copyright (C) 2003, 2004, 2006 Red Hat, Inc. All Rights Reserved. |
4 | * Written by David Howells (dhowells@redhat.com) | 4 | * Written by David Howells (dhowells@redhat.com) |
5 | * Derived from binfmt_elf.c | 5 | * Derived from binfmt_elf.c |
6 | * | 6 | * |
7 | * This program is free software; you can redistribute it and/or | 7 | * This program is free software; you can redistribute it and/or |
8 | * modify it under the terms of the GNU General Public License | 8 | * modify it under the terms of the GNU General Public License |
9 | * as published by the Free Software Foundation; either version | 9 | * as published by the Free Software Foundation; either version |
10 | * 2 of the License, or (at your option) any later version. | 10 | * 2 of the License, or (at your option) any later version. |
11 | */ | 11 | */ |
12 | 12 | ||
13 | #include <linux/module.h> | 13 | #include <linux/module.h> |
14 | 14 | ||
15 | #include <linux/fs.h> | 15 | #include <linux/fs.h> |
16 | #include <linux/stat.h> | 16 | #include <linux/stat.h> |
17 | #include <linux/sched.h> | 17 | #include <linux/sched.h> |
18 | #include <linux/mm.h> | 18 | #include <linux/mm.h> |
19 | #include <linux/mman.h> | 19 | #include <linux/mman.h> |
20 | #include <linux/errno.h> | 20 | #include <linux/errno.h> |
21 | #include <linux/signal.h> | 21 | #include <linux/signal.h> |
22 | #include <linux/binfmts.h> | 22 | #include <linux/binfmts.h> |
23 | #include <linux/string.h> | 23 | #include <linux/string.h> |
24 | #include <linux/file.h> | 24 | #include <linux/file.h> |
25 | #include <linux/fcntl.h> | 25 | #include <linux/fcntl.h> |
26 | #include <linux/slab.h> | 26 | #include <linux/slab.h> |
27 | #include <linux/pagemap.h> | 27 | #include <linux/pagemap.h> |
28 | #include <linux/security.h> | 28 | #include <linux/security.h> |
29 | #include <linux/highmem.h> | 29 | #include <linux/highmem.h> |
30 | #include <linux/highuid.h> | 30 | #include <linux/highuid.h> |
31 | #include <linux/personality.h> | 31 | #include <linux/personality.h> |
32 | #include <linux/ptrace.h> | 32 | #include <linux/ptrace.h> |
33 | #include <linux/init.h> | 33 | #include <linux/init.h> |
34 | #include <linux/elf.h> | 34 | #include <linux/elf.h> |
35 | #include <linux/elf-fdpic.h> | 35 | #include <linux/elf-fdpic.h> |
36 | #include <linux/elfcore.h> | 36 | #include <linux/elfcore.h> |
37 | 37 | ||
38 | #include <asm/uaccess.h> | 38 | #include <asm/uaccess.h> |
39 | #include <asm/param.h> | 39 | #include <asm/param.h> |
40 | #include <asm/pgalloc.h> | 40 | #include <asm/pgalloc.h> |
41 | 41 | ||
42 | typedef char *elf_caddr_t; | 42 | typedef char *elf_caddr_t; |
43 | 43 | ||
44 | #if 0 | 44 | #if 0 |
45 | #define kdebug(fmt, ...) printk("FDPIC "fmt"\n" ,##__VA_ARGS__ ) | 45 | #define kdebug(fmt, ...) printk("FDPIC "fmt"\n" ,##__VA_ARGS__ ) |
46 | #else | 46 | #else |
47 | #define kdebug(fmt, ...) do {} while(0) | 47 | #define kdebug(fmt, ...) do {} while(0) |
48 | #endif | 48 | #endif |
49 | 49 | ||
50 | #if 0 | 50 | #if 0 |
51 | #define kdcore(fmt, ...) printk("FDPIC "fmt"\n" ,##__VA_ARGS__ ) | 51 | #define kdcore(fmt, ...) printk("FDPIC "fmt"\n" ,##__VA_ARGS__ ) |
52 | #else | 52 | #else |
53 | #define kdcore(fmt, ...) do {} while(0) | 53 | #define kdcore(fmt, ...) do {} while(0) |
54 | #endif | 54 | #endif |
55 | 55 | ||
56 | MODULE_LICENSE("GPL"); | 56 | MODULE_LICENSE("GPL"); |
57 | 57 | ||
58 | static int load_elf_fdpic_binary(struct linux_binprm *, struct pt_regs *); | 58 | static int load_elf_fdpic_binary(struct linux_binprm *, struct pt_regs *); |
59 | static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *, struct file *); | 59 | static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *, struct file *); |
60 | static int elf_fdpic_map_file(struct elf_fdpic_params *, struct file *, | 60 | static int elf_fdpic_map_file(struct elf_fdpic_params *, struct file *, |
61 | struct mm_struct *, const char *); | 61 | struct mm_struct *, const char *); |
62 | 62 | ||
63 | static int create_elf_fdpic_tables(struct linux_binprm *, struct mm_struct *, | 63 | static int create_elf_fdpic_tables(struct linux_binprm *, struct mm_struct *, |
64 | struct elf_fdpic_params *, | 64 | struct elf_fdpic_params *, |
65 | struct elf_fdpic_params *); | 65 | struct elf_fdpic_params *); |
66 | 66 | ||
67 | #ifndef CONFIG_MMU | 67 | #ifndef CONFIG_MMU |
68 | static int elf_fdpic_transfer_args_to_stack(struct linux_binprm *, | 68 | static int elf_fdpic_transfer_args_to_stack(struct linux_binprm *, |
69 | unsigned long *); | 69 | unsigned long *); |
70 | static int elf_fdpic_map_file_constdisp_on_uclinux(struct elf_fdpic_params *, | 70 | static int elf_fdpic_map_file_constdisp_on_uclinux(struct elf_fdpic_params *, |
71 | struct file *, | 71 | struct file *, |
72 | struct mm_struct *); | 72 | struct mm_struct *); |
73 | #endif | 73 | #endif |
74 | 74 | ||
75 | static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *, | 75 | static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *, |
76 | struct file *, struct mm_struct *); | 76 | struct file *, struct mm_struct *); |
77 | 77 | ||
78 | #if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE) | 78 | #if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE) |
79 | static int elf_fdpic_core_dump(long, struct pt_regs *, struct file *, unsigned long limit); | 79 | static int elf_fdpic_core_dump(long, struct pt_regs *, struct file *, unsigned long limit); |
80 | #endif | 80 | #endif |
81 | 81 | ||
82 | static struct linux_binfmt elf_fdpic_format = { | 82 | static struct linux_binfmt elf_fdpic_format = { |
83 | .module = THIS_MODULE, | 83 | .module = THIS_MODULE, |
84 | .load_binary = load_elf_fdpic_binary, | 84 | .load_binary = load_elf_fdpic_binary, |
85 | #if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE) | 85 | #if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE) |
86 | .core_dump = elf_fdpic_core_dump, | 86 | .core_dump = elf_fdpic_core_dump, |
87 | #endif | 87 | #endif |
88 | .min_coredump = ELF_EXEC_PAGESIZE, | 88 | .min_coredump = ELF_EXEC_PAGESIZE, |
89 | }; | 89 | }; |
90 | 90 | ||
91 | static int __init init_elf_fdpic_binfmt(void) | 91 | static int __init init_elf_fdpic_binfmt(void) |
92 | { | 92 | { |
93 | return register_binfmt(&elf_fdpic_format); | 93 | return register_binfmt(&elf_fdpic_format); |
94 | } | 94 | } |
95 | 95 | ||
96 | static void __exit exit_elf_fdpic_binfmt(void) | 96 | static void __exit exit_elf_fdpic_binfmt(void) |
97 | { | 97 | { |
98 | unregister_binfmt(&elf_fdpic_format); | 98 | unregister_binfmt(&elf_fdpic_format); |
99 | } | 99 | } |
100 | 100 | ||
101 | core_initcall(init_elf_fdpic_binfmt); | 101 | core_initcall(init_elf_fdpic_binfmt); |
102 | module_exit(exit_elf_fdpic_binfmt); | 102 | module_exit(exit_elf_fdpic_binfmt); |
103 | 103 | ||
104 | static int is_elf_fdpic(struct elfhdr *hdr, struct file *file) | 104 | static int is_elf_fdpic(struct elfhdr *hdr, struct file *file) |
105 | { | 105 | { |
106 | if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0) | 106 | if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0) |
107 | return 0; | 107 | return 0; |
108 | if (hdr->e_type != ET_EXEC && hdr->e_type != ET_DYN) | 108 | if (hdr->e_type != ET_EXEC && hdr->e_type != ET_DYN) |
109 | return 0; | 109 | return 0; |
110 | if (!elf_check_arch(hdr) || !elf_check_fdpic(hdr)) | 110 | if (!elf_check_arch(hdr) || !elf_check_fdpic(hdr)) |
111 | return 0; | 111 | return 0; |
112 | if (!file->f_op || !file->f_op->mmap) | 112 | if (!file->f_op || !file->f_op->mmap) |
113 | return 0; | 113 | return 0; |
114 | return 1; | 114 | return 1; |
115 | } | 115 | } |
116 | 116 | ||
117 | /*****************************************************************************/ | 117 | /*****************************************************************************/ |
118 | /* | 118 | /* |
119 | * read the program headers table into memory | 119 | * read the program headers table into memory |
120 | */ | 120 | */ |
121 | static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *params, | 121 | static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *params, |
122 | struct file *file) | 122 | struct file *file) |
123 | { | 123 | { |
124 | struct elf32_phdr *phdr; | 124 | struct elf32_phdr *phdr; |
125 | unsigned long size; | 125 | unsigned long size; |
126 | int retval, loop; | 126 | int retval, loop; |
127 | 127 | ||
128 | if (params->hdr.e_phentsize != sizeof(struct elf_phdr)) | 128 | if (params->hdr.e_phentsize != sizeof(struct elf_phdr)) |
129 | return -ENOMEM; | 129 | return -ENOMEM; |
130 | if (params->hdr.e_phnum > 65536U / sizeof(struct elf_phdr)) | 130 | if (params->hdr.e_phnum > 65536U / sizeof(struct elf_phdr)) |
131 | return -ENOMEM; | 131 | return -ENOMEM; |
132 | 132 | ||
133 | size = params->hdr.e_phnum * sizeof(struct elf_phdr); | 133 | size = params->hdr.e_phnum * sizeof(struct elf_phdr); |
134 | params->phdrs = kmalloc(size, GFP_KERNEL); | 134 | params->phdrs = kmalloc(size, GFP_KERNEL); |
135 | if (!params->phdrs) | 135 | if (!params->phdrs) |
136 | return -ENOMEM; | 136 | return -ENOMEM; |
137 | 137 | ||
138 | retval = kernel_read(file, params->hdr.e_phoff, | 138 | retval = kernel_read(file, params->hdr.e_phoff, |
139 | (char *) params->phdrs, size); | 139 | (char *) params->phdrs, size); |
140 | if (unlikely(retval != size)) | 140 | if (unlikely(retval != size)) |
141 | return retval < 0 ? retval : -ENOEXEC; | 141 | return retval < 0 ? retval : -ENOEXEC; |
142 | 142 | ||
143 | /* determine stack size for this binary */ | 143 | /* determine stack size for this binary */ |
144 | phdr = params->phdrs; | 144 | phdr = params->phdrs; |
145 | for (loop = 0; loop < params->hdr.e_phnum; loop++, phdr++) { | 145 | for (loop = 0; loop < params->hdr.e_phnum; loop++, phdr++) { |
146 | if (phdr->p_type != PT_GNU_STACK) | 146 | if (phdr->p_type != PT_GNU_STACK) |
147 | continue; | 147 | continue; |
148 | 148 | ||
149 | if (phdr->p_flags & PF_X) | 149 | if (phdr->p_flags & PF_X) |
150 | params->flags |= ELF_FDPIC_FLAG_EXEC_STACK; | 150 | params->flags |= ELF_FDPIC_FLAG_EXEC_STACK; |
151 | else | 151 | else |
152 | params->flags |= ELF_FDPIC_FLAG_NOEXEC_STACK; | 152 | params->flags |= ELF_FDPIC_FLAG_NOEXEC_STACK; |
153 | 153 | ||
154 | params->stack_size = phdr->p_memsz; | 154 | params->stack_size = phdr->p_memsz; |
155 | break; | 155 | break; |
156 | } | 156 | } |
157 | 157 | ||
158 | return 0; | 158 | return 0; |
159 | } | 159 | } |
160 | 160 | ||
161 | /*****************************************************************************/ | 161 | /*****************************************************************************/ |
162 | /* | 162 | /* |
163 | * load an fdpic binary into various bits of memory | 163 | * load an fdpic binary into various bits of memory |
164 | */ | 164 | */ |
165 | static int load_elf_fdpic_binary(struct linux_binprm *bprm, | 165 | static int load_elf_fdpic_binary(struct linux_binprm *bprm, |
166 | struct pt_regs *regs) | 166 | struct pt_regs *regs) |
167 | { | 167 | { |
168 | struct elf_fdpic_params exec_params, interp_params; | 168 | struct elf_fdpic_params exec_params, interp_params; |
169 | struct elf_phdr *phdr; | 169 | struct elf_phdr *phdr; |
170 | unsigned long stack_size, entryaddr; | 170 | unsigned long stack_size, entryaddr; |
171 | #ifndef CONFIG_MMU | ||
172 | unsigned long fullsize; | ||
173 | #endif | ||
174 | #ifdef ELF_FDPIC_PLAT_INIT | 171 | #ifdef ELF_FDPIC_PLAT_INIT |
175 | unsigned long dynaddr; | 172 | unsigned long dynaddr; |
176 | #endif | 173 | #endif |
177 | struct file *interpreter = NULL; /* to shut gcc up */ | 174 | struct file *interpreter = NULL; /* to shut gcc up */ |
178 | char *interpreter_name = NULL; | 175 | char *interpreter_name = NULL; |
179 | int executable_stack; | 176 | int executable_stack; |
180 | int retval, i; | 177 | int retval, i; |
181 | 178 | ||
182 | kdebug("____ LOAD %d ____", current->pid); | 179 | kdebug("____ LOAD %d ____", current->pid); |
183 | 180 | ||
184 | memset(&exec_params, 0, sizeof(exec_params)); | 181 | memset(&exec_params, 0, sizeof(exec_params)); |
185 | memset(&interp_params, 0, sizeof(interp_params)); | 182 | memset(&interp_params, 0, sizeof(interp_params)); |
186 | 183 | ||
187 | exec_params.hdr = *(struct elfhdr *) bprm->buf; | 184 | exec_params.hdr = *(struct elfhdr *) bprm->buf; |
188 | exec_params.flags = ELF_FDPIC_FLAG_PRESENT | ELF_FDPIC_FLAG_EXECUTABLE; | 185 | exec_params.flags = ELF_FDPIC_FLAG_PRESENT | ELF_FDPIC_FLAG_EXECUTABLE; |
189 | 186 | ||
190 | /* check that this is a binary we know how to deal with */ | 187 | /* check that this is a binary we know how to deal with */ |
191 | retval = -ENOEXEC; | 188 | retval = -ENOEXEC; |
192 | if (!is_elf_fdpic(&exec_params.hdr, bprm->file)) | 189 | if (!is_elf_fdpic(&exec_params.hdr, bprm->file)) |
193 | goto error; | 190 | goto error; |
194 | 191 | ||
195 | /* read the program header table */ | 192 | /* read the program header table */ |
196 | retval = elf_fdpic_fetch_phdrs(&exec_params, bprm->file); | 193 | retval = elf_fdpic_fetch_phdrs(&exec_params, bprm->file); |
197 | if (retval < 0) | 194 | if (retval < 0) |
198 | goto error; | 195 | goto error; |
199 | 196 | ||
200 | /* scan for a program header that specifies an interpreter */ | 197 | /* scan for a program header that specifies an interpreter */ |
201 | phdr = exec_params.phdrs; | 198 | phdr = exec_params.phdrs; |
202 | 199 | ||
203 | for (i = 0; i < exec_params.hdr.e_phnum; i++, phdr++) { | 200 | for (i = 0; i < exec_params.hdr.e_phnum; i++, phdr++) { |
204 | switch (phdr->p_type) { | 201 | switch (phdr->p_type) { |
205 | case PT_INTERP: | 202 | case PT_INTERP: |
206 | retval = -ENOMEM; | 203 | retval = -ENOMEM; |
207 | if (phdr->p_filesz > PATH_MAX) | 204 | if (phdr->p_filesz > PATH_MAX) |
208 | goto error; | 205 | goto error; |
209 | retval = -ENOENT; | 206 | retval = -ENOENT; |
210 | if (phdr->p_filesz < 2) | 207 | if (phdr->p_filesz < 2) |
211 | goto error; | 208 | goto error; |
212 | 209 | ||
213 | /* read the name of the interpreter into memory */ | 210 | /* read the name of the interpreter into memory */ |
214 | interpreter_name = kmalloc(phdr->p_filesz, GFP_KERNEL); | 211 | interpreter_name = kmalloc(phdr->p_filesz, GFP_KERNEL); |
215 | if (!interpreter_name) | 212 | if (!interpreter_name) |
216 | goto error; | 213 | goto error; |
217 | 214 | ||
218 | retval = kernel_read(bprm->file, | 215 | retval = kernel_read(bprm->file, |
219 | phdr->p_offset, | 216 | phdr->p_offset, |
220 | interpreter_name, | 217 | interpreter_name, |
221 | phdr->p_filesz); | 218 | phdr->p_filesz); |
222 | if (unlikely(retval != phdr->p_filesz)) { | 219 | if (unlikely(retval != phdr->p_filesz)) { |
223 | if (retval >= 0) | 220 | if (retval >= 0) |
224 | retval = -ENOEXEC; | 221 | retval = -ENOEXEC; |
225 | goto error; | 222 | goto error; |
226 | } | 223 | } |
227 | 224 | ||
228 | retval = -ENOENT; | 225 | retval = -ENOENT; |
229 | if (interpreter_name[phdr->p_filesz - 1] != '\0') | 226 | if (interpreter_name[phdr->p_filesz - 1] != '\0') |
230 | goto error; | 227 | goto error; |
231 | 228 | ||
232 | kdebug("Using ELF interpreter %s", interpreter_name); | 229 | kdebug("Using ELF interpreter %s", interpreter_name); |
233 | 230 | ||
234 | /* replace the program with the interpreter */ | 231 | /* replace the program with the interpreter */ |
235 | interpreter = open_exec(interpreter_name); | 232 | interpreter = open_exec(interpreter_name); |
236 | retval = PTR_ERR(interpreter); | 233 | retval = PTR_ERR(interpreter); |
237 | if (IS_ERR(interpreter)) { | 234 | if (IS_ERR(interpreter)) { |
238 | interpreter = NULL; | 235 | interpreter = NULL; |
239 | goto error; | 236 | goto error; |
240 | } | 237 | } |
241 | 238 | ||
242 | /* | 239 | /* |
243 | * If the binary is not readable then enforce | 240 | * If the binary is not readable then enforce |
244 | * mm->dumpable = 0 regardless of the interpreter's | 241 | * mm->dumpable = 0 regardless of the interpreter's |
245 | * permissions. | 242 | * permissions. |
246 | */ | 243 | */ |
247 | if (file_permission(interpreter, MAY_READ) < 0) | 244 | if (file_permission(interpreter, MAY_READ) < 0) |
248 | bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP; | 245 | bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP; |
249 | 246 | ||
250 | retval = kernel_read(interpreter, 0, bprm->buf, | 247 | retval = kernel_read(interpreter, 0, bprm->buf, |
251 | BINPRM_BUF_SIZE); | 248 | BINPRM_BUF_SIZE); |
252 | if (unlikely(retval != BINPRM_BUF_SIZE)) { | 249 | if (unlikely(retval != BINPRM_BUF_SIZE)) { |
253 | if (retval >= 0) | 250 | if (retval >= 0) |
254 | retval = -ENOEXEC; | 251 | retval = -ENOEXEC; |
255 | goto error; | 252 | goto error; |
256 | } | 253 | } |
257 | 254 | ||
258 | interp_params.hdr = *((struct elfhdr *) bprm->buf); | 255 | interp_params.hdr = *((struct elfhdr *) bprm->buf); |
259 | break; | 256 | break; |
260 | 257 | ||
261 | case PT_LOAD: | 258 | case PT_LOAD: |
262 | #ifdef CONFIG_MMU | 259 | #ifdef CONFIG_MMU |
263 | if (exec_params.load_addr == 0) | 260 | if (exec_params.load_addr == 0) |
264 | exec_params.load_addr = phdr->p_vaddr; | 261 | exec_params.load_addr = phdr->p_vaddr; |
265 | #endif | 262 | #endif |
266 | break; | 263 | break; |
267 | } | 264 | } |
268 | 265 | ||
269 | } | 266 | } |
270 | 267 | ||
271 | if (elf_check_const_displacement(&exec_params.hdr)) | 268 | if (elf_check_const_displacement(&exec_params.hdr)) |
272 | exec_params.flags |= ELF_FDPIC_FLAG_CONSTDISP; | 269 | exec_params.flags |= ELF_FDPIC_FLAG_CONSTDISP; |
273 | 270 | ||
274 | /* perform insanity checks on the interpreter */ | 271 | /* perform insanity checks on the interpreter */ |
275 | if (interpreter_name) { | 272 | if (interpreter_name) { |
276 | retval = -ELIBBAD; | 273 | retval = -ELIBBAD; |
277 | if (!is_elf_fdpic(&interp_params.hdr, interpreter)) | 274 | if (!is_elf_fdpic(&interp_params.hdr, interpreter)) |
278 | goto error; | 275 | goto error; |
279 | 276 | ||
280 | interp_params.flags = ELF_FDPIC_FLAG_PRESENT; | 277 | interp_params.flags = ELF_FDPIC_FLAG_PRESENT; |
281 | 278 | ||
282 | /* read the interpreter's program header table */ | 279 | /* read the interpreter's program header table */ |
283 | retval = elf_fdpic_fetch_phdrs(&interp_params, interpreter); | 280 | retval = elf_fdpic_fetch_phdrs(&interp_params, interpreter); |
284 | if (retval < 0) | 281 | if (retval < 0) |
285 | goto error; | 282 | goto error; |
286 | } | 283 | } |
287 | 284 | ||
288 | stack_size = exec_params.stack_size; | 285 | stack_size = exec_params.stack_size; |
289 | if (stack_size < interp_params.stack_size) | 286 | if (stack_size < interp_params.stack_size) |
290 | stack_size = interp_params.stack_size; | 287 | stack_size = interp_params.stack_size; |
291 | 288 | ||
292 | if (exec_params.flags & ELF_FDPIC_FLAG_EXEC_STACK) | 289 | if (exec_params.flags & ELF_FDPIC_FLAG_EXEC_STACK) |
293 | executable_stack = EXSTACK_ENABLE_X; | 290 | executable_stack = EXSTACK_ENABLE_X; |
294 | else if (exec_params.flags & ELF_FDPIC_FLAG_NOEXEC_STACK) | 291 | else if (exec_params.flags & ELF_FDPIC_FLAG_NOEXEC_STACK) |
295 | executable_stack = EXSTACK_DISABLE_X; | 292 | executable_stack = EXSTACK_DISABLE_X; |
296 | else if (interp_params.flags & ELF_FDPIC_FLAG_EXEC_STACK) | 293 | else if (interp_params.flags & ELF_FDPIC_FLAG_EXEC_STACK) |
297 | executable_stack = EXSTACK_ENABLE_X; | 294 | executable_stack = EXSTACK_ENABLE_X; |
298 | else if (interp_params.flags & ELF_FDPIC_FLAG_NOEXEC_STACK) | 295 | else if (interp_params.flags & ELF_FDPIC_FLAG_NOEXEC_STACK) |
299 | executable_stack = EXSTACK_DISABLE_X; | 296 | executable_stack = EXSTACK_DISABLE_X; |
300 | else | 297 | else |
301 | executable_stack = EXSTACK_DEFAULT; | 298 | executable_stack = EXSTACK_DEFAULT; |
302 | 299 | ||
303 | retval = -ENOEXEC; | 300 | retval = -ENOEXEC; |
304 | if (stack_size == 0) | 301 | if (stack_size == 0) |
305 | goto error; | 302 | goto error; |
306 | 303 | ||
307 | if (elf_check_const_displacement(&interp_params.hdr)) | 304 | if (elf_check_const_displacement(&interp_params.hdr)) |
308 | interp_params.flags |= ELF_FDPIC_FLAG_CONSTDISP; | 305 | interp_params.flags |= ELF_FDPIC_FLAG_CONSTDISP; |
309 | 306 | ||
310 | /* flush all traces of the currently running executable */ | 307 | /* flush all traces of the currently running executable */ |
311 | retval = flush_old_exec(bprm); | 308 | retval = flush_old_exec(bprm); |
312 | if (retval) | 309 | if (retval) |
313 | goto error; | 310 | goto error; |
314 | 311 | ||
315 | /* there's now no turning back... the old userspace image is dead, | 312 | /* there's now no turning back... the old userspace image is dead, |
316 | * defunct, deceased, etc. after this point we have to exit via | 313 | * defunct, deceased, etc. after this point we have to exit via |
317 | * error_kill */ | 314 | * error_kill */ |
318 | set_personality(PER_LINUX_FDPIC); | 315 | set_personality(PER_LINUX_FDPIC); |
319 | set_binfmt(&elf_fdpic_format); | 316 | set_binfmt(&elf_fdpic_format); |
320 | 317 | ||
321 | current->mm->start_code = 0; | 318 | current->mm->start_code = 0; |
322 | current->mm->end_code = 0; | 319 | current->mm->end_code = 0; |
323 | current->mm->start_stack = 0; | 320 | current->mm->start_stack = 0; |
324 | current->mm->start_data = 0; | 321 | current->mm->start_data = 0; |
325 | current->mm->end_data = 0; | 322 | current->mm->end_data = 0; |
326 | current->mm->context.exec_fdpic_loadmap = 0; | 323 | current->mm->context.exec_fdpic_loadmap = 0; |
327 | current->mm->context.interp_fdpic_loadmap = 0; | 324 | current->mm->context.interp_fdpic_loadmap = 0; |
328 | 325 | ||
329 | current->flags &= ~PF_FORKNOEXEC; | 326 | current->flags &= ~PF_FORKNOEXEC; |
330 | 327 | ||
331 | #ifdef CONFIG_MMU | 328 | #ifdef CONFIG_MMU |
332 | elf_fdpic_arch_lay_out_mm(&exec_params, | 329 | elf_fdpic_arch_lay_out_mm(&exec_params, |
333 | &interp_params, | 330 | &interp_params, |
334 | ¤t->mm->start_stack, | 331 | ¤t->mm->start_stack, |
335 | ¤t->mm->start_brk); | 332 | ¤t->mm->start_brk); |
336 | 333 | ||
337 | retval = setup_arg_pages(bprm, current->mm->start_stack, | 334 | retval = setup_arg_pages(bprm, current->mm->start_stack, |
338 | executable_stack); | 335 | executable_stack); |
339 | if (retval < 0) { | 336 | if (retval < 0) { |
340 | send_sig(SIGKILL, current, 0); | 337 | send_sig(SIGKILL, current, 0); |
341 | goto error_kill; | 338 | goto error_kill; |
342 | } | 339 | } |
343 | #endif | 340 | #endif |
344 | 341 | ||
345 | /* load the executable and interpreter into memory */ | 342 | /* load the executable and interpreter into memory */ |
346 | retval = elf_fdpic_map_file(&exec_params, bprm->file, current->mm, | 343 | retval = elf_fdpic_map_file(&exec_params, bprm->file, current->mm, |
347 | "executable"); | 344 | "executable"); |
348 | if (retval < 0) | 345 | if (retval < 0) |
349 | goto error_kill; | 346 | goto error_kill; |
350 | 347 | ||
351 | if (interpreter_name) { | 348 | if (interpreter_name) { |
352 | retval = elf_fdpic_map_file(&interp_params, interpreter, | 349 | retval = elf_fdpic_map_file(&interp_params, interpreter, |
353 | current->mm, "interpreter"); | 350 | current->mm, "interpreter"); |
354 | if (retval < 0) { | 351 | if (retval < 0) { |
355 | printk(KERN_ERR "Unable to load interpreter\n"); | 352 | printk(KERN_ERR "Unable to load interpreter\n"); |
356 | goto error_kill; | 353 | goto error_kill; |
357 | } | 354 | } |
358 | 355 | ||
359 | allow_write_access(interpreter); | 356 | allow_write_access(interpreter); |
360 | fput(interpreter); | 357 | fput(interpreter); |
361 | interpreter = NULL; | 358 | interpreter = NULL; |
362 | } | 359 | } |
363 | 360 | ||
364 | #ifdef CONFIG_MMU | 361 | #ifdef CONFIG_MMU |
365 | if (!current->mm->start_brk) | 362 | if (!current->mm->start_brk) |
366 | current->mm->start_brk = current->mm->end_data; | 363 | current->mm->start_brk = current->mm->end_data; |
367 | 364 | ||
368 | current->mm->brk = current->mm->start_brk = | 365 | current->mm->brk = current->mm->start_brk = |
369 | PAGE_ALIGN(current->mm->start_brk); | 366 | PAGE_ALIGN(current->mm->start_brk); |
370 | 367 | ||
371 | #else | 368 | #else |
372 | /* create a stack and brk area big enough for everyone | 369 | /* create a stack and brk area big enough for everyone |
373 | * - the brk heap starts at the bottom and works up | 370 | * - the brk heap starts at the bottom and works up |
374 | * - the stack starts at the top and works down | 371 | * - the stack starts at the top and works down |
375 | */ | 372 | */ |
376 | stack_size = (stack_size + PAGE_SIZE - 1) & PAGE_MASK; | 373 | stack_size = (stack_size + PAGE_SIZE - 1) & PAGE_MASK; |
377 | if (stack_size < PAGE_SIZE * 2) | 374 | if (stack_size < PAGE_SIZE * 2) |
378 | stack_size = PAGE_SIZE * 2; | 375 | stack_size = PAGE_SIZE * 2; |
379 | 376 | ||
380 | down_write(¤t->mm->mmap_sem); | 377 | down_write(¤t->mm->mmap_sem); |
381 | current->mm->start_brk = do_mmap(NULL, 0, stack_size, | 378 | current->mm->start_brk = do_mmap(NULL, 0, stack_size, |
382 | PROT_READ | PROT_WRITE | PROT_EXEC, | 379 | PROT_READ | PROT_WRITE | PROT_EXEC, |
383 | MAP_PRIVATE | MAP_ANONYMOUS | MAP_GROWSDOWN, | 380 | MAP_PRIVATE | MAP_ANONYMOUS | MAP_GROWSDOWN, |
384 | 0); | 381 | 0); |
385 | 382 | ||
386 | if (IS_ERR_VALUE(current->mm->start_brk)) { | 383 | if (IS_ERR_VALUE(current->mm->start_brk)) { |
387 | up_write(¤t->mm->mmap_sem); | 384 | up_write(¤t->mm->mmap_sem); |
388 | retval = current->mm->start_brk; | 385 | retval = current->mm->start_brk; |
389 | current->mm->start_brk = 0; | 386 | current->mm->start_brk = 0; |
390 | goto error_kill; | 387 | goto error_kill; |
391 | } | 388 | } |
392 | 389 | ||
393 | /* expand the stack mapping to use up the entire allocation granule */ | ||
394 | fullsize = kobjsize((char *) current->mm->start_brk); | ||
395 | if (!IS_ERR_VALUE(do_mremap(current->mm->start_brk, stack_size, | ||
396 | fullsize, 0, 0))) | ||
397 | stack_size = fullsize; | ||
398 | up_write(¤t->mm->mmap_sem); | 390 | up_write(¤t->mm->mmap_sem); |
399 | 391 | ||
400 | current->mm->brk = current->mm->start_brk; | 392 | current->mm->brk = current->mm->start_brk; |
401 | current->mm->context.end_brk = current->mm->start_brk; | 393 | current->mm->context.end_brk = current->mm->start_brk; |
402 | current->mm->context.end_brk += | 394 | current->mm->context.end_brk += |
403 | (stack_size > PAGE_SIZE) ? (stack_size - PAGE_SIZE) : 0; | 395 | (stack_size > PAGE_SIZE) ? (stack_size - PAGE_SIZE) : 0; |
404 | current->mm->start_stack = current->mm->start_brk + stack_size; | 396 | current->mm->start_stack = current->mm->start_brk + stack_size; |
405 | #endif | 397 | #endif |
406 | 398 | ||
407 | install_exec_creds(bprm); | 399 | install_exec_creds(bprm); |
408 | current->flags &= ~PF_FORKNOEXEC; | 400 | current->flags &= ~PF_FORKNOEXEC; |
409 | if (create_elf_fdpic_tables(bprm, current->mm, | 401 | if (create_elf_fdpic_tables(bprm, current->mm, |
410 | &exec_params, &interp_params) < 0) | 402 | &exec_params, &interp_params) < 0) |
411 | goto error_kill; | 403 | goto error_kill; |
412 | 404 | ||
413 | kdebug("- start_code %lx", current->mm->start_code); | 405 | kdebug("- start_code %lx", current->mm->start_code); |
414 | kdebug("- end_code %lx", current->mm->end_code); | 406 | kdebug("- end_code %lx", current->mm->end_code); |
415 | kdebug("- start_data %lx", current->mm->start_data); | 407 | kdebug("- start_data %lx", current->mm->start_data); |
416 | kdebug("- end_data %lx", current->mm->end_data); | 408 | kdebug("- end_data %lx", current->mm->end_data); |
417 | kdebug("- start_brk %lx", current->mm->start_brk); | 409 | kdebug("- start_brk %lx", current->mm->start_brk); |
418 | kdebug("- brk %lx", current->mm->brk); | 410 | kdebug("- brk %lx", current->mm->brk); |
419 | kdebug("- start_stack %lx", current->mm->start_stack); | 411 | kdebug("- start_stack %lx", current->mm->start_stack); |
420 | 412 | ||
421 | #ifdef ELF_FDPIC_PLAT_INIT | 413 | #ifdef ELF_FDPIC_PLAT_INIT |
422 | /* | 414 | /* |
423 | * The ABI may specify that certain registers be set up in special | 415 | * The ABI may specify that certain registers be set up in special |
424 | * ways (on i386 %edx is the address of a DT_FINI function, for | 416 | * ways (on i386 %edx is the address of a DT_FINI function, for |
425 | * example. This macro performs whatever initialization to | 417 | * example. This macro performs whatever initialization to |
426 | * the regs structure is required. | 418 | * the regs structure is required. |
427 | */ | 419 | */ |
428 | dynaddr = interp_params.dynamic_addr ?: exec_params.dynamic_addr; | 420 | dynaddr = interp_params.dynamic_addr ?: exec_params.dynamic_addr; |
429 | ELF_FDPIC_PLAT_INIT(regs, exec_params.map_addr, interp_params.map_addr, | 421 | ELF_FDPIC_PLAT_INIT(regs, exec_params.map_addr, interp_params.map_addr, |
430 | dynaddr); | 422 | dynaddr); |
431 | #endif | 423 | #endif |
432 | 424 | ||
433 | /* everything is now ready... get the userspace context ready to roll */ | 425 | /* everything is now ready... get the userspace context ready to roll */ |
434 | entryaddr = interp_params.entry_addr ?: exec_params.entry_addr; | 426 | entryaddr = interp_params.entry_addr ?: exec_params.entry_addr; |
435 | start_thread(regs, entryaddr, current->mm->start_stack); | 427 | start_thread(regs, entryaddr, current->mm->start_stack); |
436 | 428 | ||
437 | retval = 0; | 429 | retval = 0; |
438 | 430 | ||
439 | error: | 431 | error: |
440 | if (interpreter) { | 432 | if (interpreter) { |
441 | allow_write_access(interpreter); | 433 | allow_write_access(interpreter); |
442 | fput(interpreter); | 434 | fput(interpreter); |
443 | } | 435 | } |
444 | kfree(interpreter_name); | 436 | kfree(interpreter_name); |
445 | kfree(exec_params.phdrs); | 437 | kfree(exec_params.phdrs); |
446 | kfree(exec_params.loadmap); | 438 | kfree(exec_params.loadmap); |
447 | kfree(interp_params.phdrs); | 439 | kfree(interp_params.phdrs); |
448 | kfree(interp_params.loadmap); | 440 | kfree(interp_params.loadmap); |
449 | return retval; | 441 | return retval; |
450 | 442 | ||
451 | /* unrecoverable error - kill the process */ | 443 | /* unrecoverable error - kill the process */ |
452 | error_kill: | 444 | error_kill: |
453 | send_sig(SIGSEGV, current, 0); | 445 | send_sig(SIGSEGV, current, 0); |
454 | goto error; | 446 | goto error; |
455 | 447 | ||
456 | } | 448 | } |
457 | 449 | ||
458 | /*****************************************************************************/ | 450 | /*****************************************************************************/ |
459 | 451 | ||
460 | #ifndef ELF_BASE_PLATFORM | 452 | #ifndef ELF_BASE_PLATFORM |
461 | /* | 453 | /* |
462 | * AT_BASE_PLATFORM indicates the "real" hardware/microarchitecture. | 454 | * AT_BASE_PLATFORM indicates the "real" hardware/microarchitecture. |
463 | * If the arch defines ELF_BASE_PLATFORM (in asm/elf.h), the value | 455 | * If the arch defines ELF_BASE_PLATFORM (in asm/elf.h), the value |
464 | * will be copied to the user stack in the same manner as AT_PLATFORM. | 456 | * will be copied to the user stack in the same manner as AT_PLATFORM. |
465 | */ | 457 | */ |
466 | #define ELF_BASE_PLATFORM NULL | 458 | #define ELF_BASE_PLATFORM NULL |
467 | #endif | 459 | #endif |
468 | 460 | ||
469 | /* | 461 | /* |
470 | * present useful information to the program by shovelling it onto the new | 462 | * present useful information to the program by shovelling it onto the new |
471 | * process's stack | 463 | * process's stack |
472 | */ | 464 | */ |
473 | static int create_elf_fdpic_tables(struct linux_binprm *bprm, | 465 | static int create_elf_fdpic_tables(struct linux_binprm *bprm, |
474 | struct mm_struct *mm, | 466 | struct mm_struct *mm, |
475 | struct elf_fdpic_params *exec_params, | 467 | struct elf_fdpic_params *exec_params, |
476 | struct elf_fdpic_params *interp_params) | 468 | struct elf_fdpic_params *interp_params) |
477 | { | 469 | { |
478 | const struct cred *cred = current_cred(); | 470 | const struct cred *cred = current_cred(); |
479 | unsigned long sp, csp, nitems; | 471 | unsigned long sp, csp, nitems; |
480 | elf_caddr_t __user *argv, *envp; | 472 | elf_caddr_t __user *argv, *envp; |
481 | size_t platform_len = 0, len; | 473 | size_t platform_len = 0, len; |
482 | char *k_platform, *k_base_platform; | 474 | char *k_platform, *k_base_platform; |
483 | char __user *u_platform, *u_base_platform, *p; | 475 | char __user *u_platform, *u_base_platform, *p; |
484 | long hwcap; | 476 | long hwcap; |
485 | int loop; | 477 | int loop; |
486 | int nr; /* reset for each csp adjustment */ | 478 | int nr; /* reset for each csp adjustment */ |
487 | 479 | ||
488 | #ifdef CONFIG_MMU | 480 | #ifdef CONFIG_MMU |
489 | /* In some cases (e.g. Hyper-Threading), we want to avoid L1 evictions | 481 | /* In some cases (e.g. Hyper-Threading), we want to avoid L1 evictions |
490 | * by the processes running on the same package. One thing we can do is | 482 | * by the processes running on the same package. One thing we can do is |
491 | * to shuffle the initial stack for them, so we give the architecture | 483 | * to shuffle the initial stack for them, so we give the architecture |
492 | * an opportunity to do so here. | 484 | * an opportunity to do so here. |
493 | */ | 485 | */ |
494 | sp = arch_align_stack(bprm->p); | 486 | sp = arch_align_stack(bprm->p); |
495 | #else | 487 | #else |
496 | sp = mm->start_stack; | 488 | sp = mm->start_stack; |
497 | 489 | ||
498 | /* stack the program arguments and environment */ | 490 | /* stack the program arguments and environment */ |
499 | if (elf_fdpic_transfer_args_to_stack(bprm, &sp) < 0) | 491 | if (elf_fdpic_transfer_args_to_stack(bprm, &sp) < 0) |
500 | return -EFAULT; | 492 | return -EFAULT; |
501 | #endif | 493 | #endif |
502 | 494 | ||
503 | hwcap = ELF_HWCAP; | 495 | hwcap = ELF_HWCAP; |
504 | 496 | ||
505 | /* | 497 | /* |
506 | * If this architecture has a platform capability string, copy it | 498 | * If this architecture has a platform capability string, copy it |
507 | * to userspace. In some cases (Sparc), this info is impossible | 499 | * to userspace. In some cases (Sparc), this info is impossible |
508 | * for userspace to get any other way, in others (i386) it is | 500 | * for userspace to get any other way, in others (i386) it is |
509 | * merely difficult. | 501 | * merely difficult. |
510 | */ | 502 | */ |
511 | k_platform = ELF_PLATFORM; | 503 | k_platform = ELF_PLATFORM; |
512 | u_platform = NULL; | 504 | u_platform = NULL; |
513 | 505 | ||
514 | if (k_platform) { | 506 | if (k_platform) { |
515 | platform_len = strlen(k_platform) + 1; | 507 | platform_len = strlen(k_platform) + 1; |
516 | sp -= platform_len; | 508 | sp -= platform_len; |
517 | u_platform = (char __user *) sp; | 509 | u_platform = (char __user *) sp; |
518 | if (__copy_to_user(u_platform, k_platform, platform_len) != 0) | 510 | if (__copy_to_user(u_platform, k_platform, platform_len) != 0) |
519 | return -EFAULT; | 511 | return -EFAULT; |
520 | } | 512 | } |
521 | 513 | ||
522 | /* | 514 | /* |
523 | * If this architecture has a "base" platform capability | 515 | * If this architecture has a "base" platform capability |
524 | * string, copy it to userspace. | 516 | * string, copy it to userspace. |
525 | */ | 517 | */ |
526 | k_base_platform = ELF_BASE_PLATFORM; | 518 | k_base_platform = ELF_BASE_PLATFORM; |
527 | u_base_platform = NULL; | 519 | u_base_platform = NULL; |
528 | 520 | ||
529 | if (k_base_platform) { | 521 | if (k_base_platform) { |
530 | platform_len = strlen(k_base_platform) + 1; | 522 | platform_len = strlen(k_base_platform) + 1; |
531 | sp -= platform_len; | 523 | sp -= platform_len; |
532 | u_base_platform = (char __user *) sp; | 524 | u_base_platform = (char __user *) sp; |
533 | if (__copy_to_user(u_base_platform, k_base_platform, platform_len) != 0) | 525 | if (__copy_to_user(u_base_platform, k_base_platform, platform_len) != 0) |
534 | return -EFAULT; | 526 | return -EFAULT; |
535 | } | 527 | } |
536 | 528 | ||
537 | sp &= ~7UL; | 529 | sp &= ~7UL; |
538 | 530 | ||
539 | /* stack the load map(s) */ | 531 | /* stack the load map(s) */ |
540 | len = sizeof(struct elf32_fdpic_loadmap); | 532 | len = sizeof(struct elf32_fdpic_loadmap); |
541 | len += sizeof(struct elf32_fdpic_loadseg) * exec_params->loadmap->nsegs; | 533 | len += sizeof(struct elf32_fdpic_loadseg) * exec_params->loadmap->nsegs; |
542 | sp = (sp - len) & ~7UL; | 534 | sp = (sp - len) & ~7UL; |
543 | exec_params->map_addr = sp; | 535 | exec_params->map_addr = sp; |
544 | 536 | ||
545 | if (copy_to_user((void __user *) sp, exec_params->loadmap, len) != 0) | 537 | if (copy_to_user((void __user *) sp, exec_params->loadmap, len) != 0) |
546 | return -EFAULT; | 538 | return -EFAULT; |
547 | 539 | ||
548 | current->mm->context.exec_fdpic_loadmap = (unsigned long) sp; | 540 | current->mm->context.exec_fdpic_loadmap = (unsigned long) sp; |
549 | 541 | ||
550 | if (interp_params->loadmap) { | 542 | if (interp_params->loadmap) { |
551 | len = sizeof(struct elf32_fdpic_loadmap); | 543 | len = sizeof(struct elf32_fdpic_loadmap); |
552 | len += sizeof(struct elf32_fdpic_loadseg) * | 544 | len += sizeof(struct elf32_fdpic_loadseg) * |
553 | interp_params->loadmap->nsegs; | 545 | interp_params->loadmap->nsegs; |
554 | sp = (sp - len) & ~7UL; | 546 | sp = (sp - len) & ~7UL; |
555 | interp_params->map_addr = sp; | 547 | interp_params->map_addr = sp; |
556 | 548 | ||
557 | if (copy_to_user((void __user *) sp, interp_params->loadmap, | 549 | if (copy_to_user((void __user *) sp, interp_params->loadmap, |
558 | len) != 0) | 550 | len) != 0) |
559 | return -EFAULT; | 551 | return -EFAULT; |
560 | 552 | ||
561 | current->mm->context.interp_fdpic_loadmap = (unsigned long) sp; | 553 | current->mm->context.interp_fdpic_loadmap = (unsigned long) sp; |
562 | } | 554 | } |
563 | 555 | ||
564 | /* force 16 byte _final_ alignment here for generality */ | 556 | /* force 16 byte _final_ alignment here for generality */ |
565 | #define DLINFO_ITEMS 15 | 557 | #define DLINFO_ITEMS 15 |
566 | 558 | ||
567 | nitems = 1 + DLINFO_ITEMS + (k_platform ? 1 : 0) + | 559 | nitems = 1 + DLINFO_ITEMS + (k_platform ? 1 : 0) + |
568 | (k_base_platform ? 1 : 0) + AT_VECTOR_SIZE_ARCH; | 560 | (k_base_platform ? 1 : 0) + AT_VECTOR_SIZE_ARCH; |
569 | 561 | ||
570 | if (bprm->interp_flags & BINPRM_FLAGS_EXECFD) | 562 | if (bprm->interp_flags & BINPRM_FLAGS_EXECFD) |
571 | nitems++; | 563 | nitems++; |
572 | 564 | ||
573 | csp = sp; | 565 | csp = sp; |
574 | sp -= nitems * 2 * sizeof(unsigned long); | 566 | sp -= nitems * 2 * sizeof(unsigned long); |
575 | sp -= (bprm->envc + 1) * sizeof(char *); /* envv[] */ | 567 | sp -= (bprm->envc + 1) * sizeof(char *); /* envv[] */ |
576 | sp -= (bprm->argc + 1) * sizeof(char *); /* argv[] */ | 568 | sp -= (bprm->argc + 1) * sizeof(char *); /* argv[] */ |
577 | sp -= 1 * sizeof(unsigned long); /* argc */ | 569 | sp -= 1 * sizeof(unsigned long); /* argc */ |
578 | 570 | ||
579 | csp -= sp & 15UL; | 571 | csp -= sp & 15UL; |
580 | sp -= sp & 15UL; | 572 | sp -= sp & 15UL; |
581 | 573 | ||
582 | /* put the ELF interpreter info on the stack */ | 574 | /* put the ELF interpreter info on the stack */ |
583 | #define NEW_AUX_ENT(id, val) \ | 575 | #define NEW_AUX_ENT(id, val) \ |
584 | do { \ | 576 | do { \ |
585 | struct { unsigned long _id, _val; } __user *ent; \ | 577 | struct { unsigned long _id, _val; } __user *ent; \ |
586 | \ | 578 | \ |
587 | ent = (void __user *) csp; \ | 579 | ent = (void __user *) csp; \ |
588 | __put_user((id), &ent[nr]._id); \ | 580 | __put_user((id), &ent[nr]._id); \ |
589 | __put_user((val), &ent[nr]._val); \ | 581 | __put_user((val), &ent[nr]._val); \ |
590 | nr++; \ | 582 | nr++; \ |
591 | } while (0) | 583 | } while (0) |
592 | 584 | ||
593 | nr = 0; | 585 | nr = 0; |
594 | csp -= 2 * sizeof(unsigned long); | 586 | csp -= 2 * sizeof(unsigned long); |
595 | NEW_AUX_ENT(AT_NULL, 0); | 587 | NEW_AUX_ENT(AT_NULL, 0); |
596 | if (k_platform) { | 588 | if (k_platform) { |
597 | nr = 0; | 589 | nr = 0; |
598 | csp -= 2 * sizeof(unsigned long); | 590 | csp -= 2 * sizeof(unsigned long); |
599 | NEW_AUX_ENT(AT_PLATFORM, | 591 | NEW_AUX_ENT(AT_PLATFORM, |
600 | (elf_addr_t) (unsigned long) u_platform); | 592 | (elf_addr_t) (unsigned long) u_platform); |
601 | } | 593 | } |
602 | 594 | ||
603 | if (k_base_platform) { | 595 | if (k_base_platform) { |
604 | nr = 0; | 596 | nr = 0; |
605 | csp -= 2 * sizeof(unsigned long); | 597 | csp -= 2 * sizeof(unsigned long); |
606 | NEW_AUX_ENT(AT_BASE_PLATFORM, | 598 | NEW_AUX_ENT(AT_BASE_PLATFORM, |
607 | (elf_addr_t) (unsigned long) u_base_platform); | 599 | (elf_addr_t) (unsigned long) u_base_platform); |
608 | } | 600 | } |
609 | 601 | ||
610 | if (bprm->interp_flags & BINPRM_FLAGS_EXECFD) { | 602 | if (bprm->interp_flags & BINPRM_FLAGS_EXECFD) { |
611 | nr = 0; | 603 | nr = 0; |
612 | csp -= 2 * sizeof(unsigned long); | 604 | csp -= 2 * sizeof(unsigned long); |
613 | NEW_AUX_ENT(AT_EXECFD, bprm->interp_data); | 605 | NEW_AUX_ENT(AT_EXECFD, bprm->interp_data); |
614 | } | 606 | } |
615 | 607 | ||
616 | nr = 0; | 608 | nr = 0; |
617 | csp -= DLINFO_ITEMS * 2 * sizeof(unsigned long); | 609 | csp -= DLINFO_ITEMS * 2 * sizeof(unsigned long); |
618 | NEW_AUX_ENT(AT_HWCAP, hwcap); | 610 | NEW_AUX_ENT(AT_HWCAP, hwcap); |
619 | NEW_AUX_ENT(AT_PAGESZ, PAGE_SIZE); | 611 | NEW_AUX_ENT(AT_PAGESZ, PAGE_SIZE); |
620 | NEW_AUX_ENT(AT_CLKTCK, CLOCKS_PER_SEC); | 612 | NEW_AUX_ENT(AT_CLKTCK, CLOCKS_PER_SEC); |
621 | NEW_AUX_ENT(AT_PHDR, exec_params->ph_addr); | 613 | NEW_AUX_ENT(AT_PHDR, exec_params->ph_addr); |
622 | NEW_AUX_ENT(AT_PHENT, sizeof(struct elf_phdr)); | 614 | NEW_AUX_ENT(AT_PHENT, sizeof(struct elf_phdr)); |
623 | NEW_AUX_ENT(AT_PHNUM, exec_params->hdr.e_phnum); | 615 | NEW_AUX_ENT(AT_PHNUM, exec_params->hdr.e_phnum); |
624 | NEW_AUX_ENT(AT_BASE, interp_params->elfhdr_addr); | 616 | NEW_AUX_ENT(AT_BASE, interp_params->elfhdr_addr); |
625 | NEW_AUX_ENT(AT_FLAGS, 0); | 617 | NEW_AUX_ENT(AT_FLAGS, 0); |
626 | NEW_AUX_ENT(AT_ENTRY, exec_params->entry_addr); | 618 | NEW_AUX_ENT(AT_ENTRY, exec_params->entry_addr); |
627 | NEW_AUX_ENT(AT_UID, (elf_addr_t) cred->uid); | 619 | NEW_AUX_ENT(AT_UID, (elf_addr_t) cred->uid); |
628 | NEW_AUX_ENT(AT_EUID, (elf_addr_t) cred->euid); | 620 | NEW_AUX_ENT(AT_EUID, (elf_addr_t) cred->euid); |
629 | NEW_AUX_ENT(AT_GID, (elf_addr_t) cred->gid); | 621 | NEW_AUX_ENT(AT_GID, (elf_addr_t) cred->gid); |
630 | NEW_AUX_ENT(AT_EGID, (elf_addr_t) cred->egid); | 622 | NEW_AUX_ENT(AT_EGID, (elf_addr_t) cred->egid); |
631 | NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm)); | 623 | NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm)); |
632 | NEW_AUX_ENT(AT_EXECFN, bprm->exec); | 624 | NEW_AUX_ENT(AT_EXECFN, bprm->exec); |
633 | 625 | ||
634 | #ifdef ARCH_DLINFO | 626 | #ifdef ARCH_DLINFO |
635 | nr = 0; | 627 | nr = 0; |
636 | csp -= AT_VECTOR_SIZE_ARCH * 2 * sizeof(unsigned long); | 628 | csp -= AT_VECTOR_SIZE_ARCH * 2 * sizeof(unsigned long); |
637 | 629 | ||
638 | /* ARCH_DLINFO must come last so platform specific code can enforce | 630 | /* ARCH_DLINFO must come last so platform specific code can enforce |
639 | * special alignment requirements on the AUXV if necessary (eg. PPC). | 631 | * special alignment requirements on the AUXV if necessary (eg. PPC). |
640 | */ | 632 | */ |
641 | ARCH_DLINFO; | 633 | ARCH_DLINFO; |
642 | #endif | 634 | #endif |
643 | #undef NEW_AUX_ENT | 635 | #undef NEW_AUX_ENT |
644 | 636 | ||
645 | /* allocate room for argv[] and envv[] */ | 637 | /* allocate room for argv[] and envv[] */ |
646 | csp -= (bprm->envc + 1) * sizeof(elf_caddr_t); | 638 | csp -= (bprm->envc + 1) * sizeof(elf_caddr_t); |
647 | envp = (elf_caddr_t __user *) csp; | 639 | envp = (elf_caddr_t __user *) csp; |
648 | csp -= (bprm->argc + 1) * sizeof(elf_caddr_t); | 640 | csp -= (bprm->argc + 1) * sizeof(elf_caddr_t); |
649 | argv = (elf_caddr_t __user *) csp; | 641 | argv = (elf_caddr_t __user *) csp; |
650 | 642 | ||
651 | /* stack argc */ | 643 | /* stack argc */ |
652 | csp -= sizeof(unsigned long); | 644 | csp -= sizeof(unsigned long); |
653 | __put_user(bprm->argc, (unsigned long __user *) csp); | 645 | __put_user(bprm->argc, (unsigned long __user *) csp); |
654 | 646 | ||
655 | BUG_ON(csp != sp); | 647 | BUG_ON(csp != sp); |
656 | 648 | ||
657 | /* fill in the argv[] array */ | 649 | /* fill in the argv[] array */ |
658 | #ifdef CONFIG_MMU | 650 | #ifdef CONFIG_MMU |
659 | current->mm->arg_start = bprm->p; | 651 | current->mm->arg_start = bprm->p; |
660 | #else | 652 | #else |
661 | current->mm->arg_start = current->mm->start_stack - | 653 | current->mm->arg_start = current->mm->start_stack - |
662 | (MAX_ARG_PAGES * PAGE_SIZE - bprm->p); | 654 | (MAX_ARG_PAGES * PAGE_SIZE - bprm->p); |
663 | #endif | 655 | #endif |
664 | 656 | ||
665 | p = (char __user *) current->mm->arg_start; | 657 | p = (char __user *) current->mm->arg_start; |
666 | for (loop = bprm->argc; loop > 0; loop--) { | 658 | for (loop = bprm->argc; loop > 0; loop--) { |
667 | __put_user((elf_caddr_t) p, argv++); | 659 | __put_user((elf_caddr_t) p, argv++); |
668 | len = strnlen_user(p, MAX_ARG_STRLEN); | 660 | len = strnlen_user(p, MAX_ARG_STRLEN); |
669 | if (!len || len > MAX_ARG_STRLEN) | 661 | if (!len || len > MAX_ARG_STRLEN) |
670 | return -EINVAL; | 662 | return -EINVAL; |
671 | p += len; | 663 | p += len; |
672 | } | 664 | } |
673 | __put_user(NULL, argv); | 665 | __put_user(NULL, argv); |
674 | current->mm->arg_end = (unsigned long) p; | 666 | current->mm->arg_end = (unsigned long) p; |
675 | 667 | ||
676 | /* fill in the envv[] array */ | 668 | /* fill in the envv[] array */ |
677 | current->mm->env_start = (unsigned long) p; | 669 | current->mm->env_start = (unsigned long) p; |
678 | for (loop = bprm->envc; loop > 0; loop--) { | 670 | for (loop = bprm->envc; loop > 0; loop--) { |
679 | __put_user((elf_caddr_t)(unsigned long) p, envp++); | 671 | __put_user((elf_caddr_t)(unsigned long) p, envp++); |
680 | len = strnlen_user(p, MAX_ARG_STRLEN); | 672 | len = strnlen_user(p, MAX_ARG_STRLEN); |
681 | if (!len || len > MAX_ARG_STRLEN) | 673 | if (!len || len > MAX_ARG_STRLEN) |
682 | return -EINVAL; | 674 | return -EINVAL; |
683 | p += len; | 675 | p += len; |
684 | } | 676 | } |
685 | __put_user(NULL, envp); | 677 | __put_user(NULL, envp); |
686 | current->mm->env_end = (unsigned long) p; | 678 | current->mm->env_end = (unsigned long) p; |
687 | 679 | ||
688 | mm->start_stack = (unsigned long) sp; | 680 | mm->start_stack = (unsigned long) sp; |
689 | return 0; | 681 | return 0; |
690 | } | 682 | } |
691 | 683 | ||
692 | /*****************************************************************************/ | 684 | /*****************************************************************************/ |
693 | /* | 685 | /* |
694 | * transfer the program arguments and environment from the holding pages onto | 686 | * transfer the program arguments and environment from the holding pages onto |
695 | * the stack | 687 | * the stack |
696 | */ | 688 | */ |
697 | #ifndef CONFIG_MMU | 689 | #ifndef CONFIG_MMU |
698 | static int elf_fdpic_transfer_args_to_stack(struct linux_binprm *bprm, | 690 | static int elf_fdpic_transfer_args_to_stack(struct linux_binprm *bprm, |
699 | unsigned long *_sp) | 691 | unsigned long *_sp) |
700 | { | 692 | { |
701 | unsigned long index, stop, sp; | 693 | unsigned long index, stop, sp; |
702 | char *src; | 694 | char *src; |
703 | int ret = 0; | 695 | int ret = 0; |
704 | 696 | ||
705 | stop = bprm->p >> PAGE_SHIFT; | 697 | stop = bprm->p >> PAGE_SHIFT; |
706 | sp = *_sp; | 698 | sp = *_sp; |
707 | 699 | ||
708 | for (index = MAX_ARG_PAGES - 1; index >= stop; index--) { | 700 | for (index = MAX_ARG_PAGES - 1; index >= stop; index--) { |
709 | src = kmap(bprm->page[index]); | 701 | src = kmap(bprm->page[index]); |
710 | sp -= PAGE_SIZE; | 702 | sp -= PAGE_SIZE; |
711 | if (copy_to_user((void *) sp, src, PAGE_SIZE) != 0) | 703 | if (copy_to_user((void *) sp, src, PAGE_SIZE) != 0) |
712 | ret = -EFAULT; | 704 | ret = -EFAULT; |
713 | kunmap(bprm->page[index]); | 705 | kunmap(bprm->page[index]); |
714 | if (ret < 0) | 706 | if (ret < 0) |
715 | goto out; | 707 | goto out; |
716 | } | 708 | } |
717 | 709 | ||
718 | *_sp = (*_sp - (MAX_ARG_PAGES * PAGE_SIZE - bprm->p)) & ~15; | 710 | *_sp = (*_sp - (MAX_ARG_PAGES * PAGE_SIZE - bprm->p)) & ~15; |
719 | 711 | ||
720 | out: | 712 | out: |
721 | return ret; | 713 | return ret; |
722 | } | 714 | } |
723 | #endif | 715 | #endif |
724 | 716 | ||
725 | /*****************************************************************************/ | 717 | /*****************************************************************************/ |
726 | /* | 718 | /* |
727 | * load the appropriate binary image (executable or interpreter) into memory | 719 | * load the appropriate binary image (executable or interpreter) into memory |
728 | * - we assume no MMU is available | 720 | * - we assume no MMU is available |
729 | * - if no other PIC bits are set in params->hdr->e_flags | 721 | * - if no other PIC bits are set in params->hdr->e_flags |
730 | * - we assume that the LOADable segments in the binary are independently relocatable | 722 | * - we assume that the LOADable segments in the binary are independently relocatable |
731 | * - we assume R/O executable segments are shareable | 723 | * - we assume R/O executable segments are shareable |
732 | * - else | 724 | * - else |
733 | * - we assume the loadable parts of the image to require fixed displacement | 725 | * - we assume the loadable parts of the image to require fixed displacement |
734 | * - the image is not shareable | 726 | * - the image is not shareable |
735 | */ | 727 | */ |
736 | static int elf_fdpic_map_file(struct elf_fdpic_params *params, | 728 | static int elf_fdpic_map_file(struct elf_fdpic_params *params, |
737 | struct file *file, | 729 | struct file *file, |
738 | struct mm_struct *mm, | 730 | struct mm_struct *mm, |
739 | const char *what) | 731 | const char *what) |
740 | { | 732 | { |
741 | struct elf32_fdpic_loadmap *loadmap; | 733 | struct elf32_fdpic_loadmap *loadmap; |
742 | #ifdef CONFIG_MMU | 734 | #ifdef CONFIG_MMU |
743 | struct elf32_fdpic_loadseg *mseg; | 735 | struct elf32_fdpic_loadseg *mseg; |
744 | #endif | 736 | #endif |
745 | struct elf32_fdpic_loadseg *seg; | 737 | struct elf32_fdpic_loadseg *seg; |
746 | struct elf32_phdr *phdr; | 738 | struct elf32_phdr *phdr; |
747 | unsigned long load_addr, stop; | 739 | unsigned long load_addr, stop; |
748 | unsigned nloads, tmp; | 740 | unsigned nloads, tmp; |
749 | size_t size; | 741 | size_t size; |
750 | int loop, ret; | 742 | int loop, ret; |
751 | 743 | ||
752 | /* allocate a load map table */ | 744 | /* allocate a load map table */ |
753 | nloads = 0; | 745 | nloads = 0; |
754 | for (loop = 0; loop < params->hdr.e_phnum; loop++) | 746 | for (loop = 0; loop < params->hdr.e_phnum; loop++) |
755 | if (params->phdrs[loop].p_type == PT_LOAD) | 747 | if (params->phdrs[loop].p_type == PT_LOAD) |
756 | nloads++; | 748 | nloads++; |
757 | 749 | ||
758 | if (nloads == 0) | 750 | if (nloads == 0) |
759 | return -ELIBBAD; | 751 | return -ELIBBAD; |
760 | 752 | ||
761 | size = sizeof(*loadmap) + nloads * sizeof(*seg); | 753 | size = sizeof(*loadmap) + nloads * sizeof(*seg); |
762 | loadmap = kzalloc(size, GFP_KERNEL); | 754 | loadmap = kzalloc(size, GFP_KERNEL); |
763 | if (!loadmap) | 755 | if (!loadmap) |
764 | return -ENOMEM; | 756 | return -ENOMEM; |
765 | 757 | ||
766 | params->loadmap = loadmap; | 758 | params->loadmap = loadmap; |
767 | 759 | ||
768 | loadmap->version = ELF32_FDPIC_LOADMAP_VERSION; | 760 | loadmap->version = ELF32_FDPIC_LOADMAP_VERSION; |
769 | loadmap->nsegs = nloads; | 761 | loadmap->nsegs = nloads; |
770 | 762 | ||
771 | load_addr = params->load_addr; | 763 | load_addr = params->load_addr; |
772 | seg = loadmap->segs; | 764 | seg = loadmap->segs; |
773 | 765 | ||
774 | /* map the requested LOADs into the memory space */ | 766 | /* map the requested LOADs into the memory space */ |
775 | switch (params->flags & ELF_FDPIC_FLAG_ARRANGEMENT) { | 767 | switch (params->flags & ELF_FDPIC_FLAG_ARRANGEMENT) { |
776 | case ELF_FDPIC_FLAG_CONSTDISP: | 768 | case ELF_FDPIC_FLAG_CONSTDISP: |
777 | case ELF_FDPIC_FLAG_CONTIGUOUS: | 769 | case ELF_FDPIC_FLAG_CONTIGUOUS: |
778 | #ifndef CONFIG_MMU | 770 | #ifndef CONFIG_MMU |
779 | ret = elf_fdpic_map_file_constdisp_on_uclinux(params, file, mm); | 771 | ret = elf_fdpic_map_file_constdisp_on_uclinux(params, file, mm); |
780 | if (ret < 0) | 772 | if (ret < 0) |
781 | return ret; | 773 | return ret; |
782 | break; | 774 | break; |
783 | #endif | 775 | #endif |
784 | default: | 776 | default: |
785 | ret = elf_fdpic_map_file_by_direct_mmap(params, file, mm); | 777 | ret = elf_fdpic_map_file_by_direct_mmap(params, file, mm); |
786 | if (ret < 0) | 778 | if (ret < 0) |
787 | return ret; | 779 | return ret; |
788 | break; | 780 | break; |
789 | } | 781 | } |
790 | 782 | ||
791 | /* map the entry point */ | 783 | /* map the entry point */ |
792 | if (params->hdr.e_entry) { | 784 | if (params->hdr.e_entry) { |
793 | seg = loadmap->segs; | 785 | seg = loadmap->segs; |
794 | for (loop = loadmap->nsegs; loop > 0; loop--, seg++) { | 786 | for (loop = loadmap->nsegs; loop > 0; loop--, seg++) { |
795 | if (params->hdr.e_entry >= seg->p_vaddr && | 787 | if (params->hdr.e_entry >= seg->p_vaddr && |
796 | params->hdr.e_entry < seg->p_vaddr + seg->p_memsz) { | 788 | params->hdr.e_entry < seg->p_vaddr + seg->p_memsz) { |
797 | params->entry_addr = | 789 | params->entry_addr = |
798 | (params->hdr.e_entry - seg->p_vaddr) + | 790 | (params->hdr.e_entry - seg->p_vaddr) + |
799 | seg->addr; | 791 | seg->addr; |
800 | break; | 792 | break; |
801 | } | 793 | } |
802 | } | 794 | } |
803 | } | 795 | } |
804 | 796 | ||
805 | /* determine where the program header table has wound up if mapped */ | 797 | /* determine where the program header table has wound up if mapped */ |
806 | stop = params->hdr.e_phoff; | 798 | stop = params->hdr.e_phoff; |
807 | stop += params->hdr.e_phnum * sizeof (struct elf_phdr); | 799 | stop += params->hdr.e_phnum * sizeof (struct elf_phdr); |
808 | phdr = params->phdrs; | 800 | phdr = params->phdrs; |
809 | 801 | ||
810 | for (loop = 0; loop < params->hdr.e_phnum; loop++, phdr++) { | 802 | for (loop = 0; loop < params->hdr.e_phnum; loop++, phdr++) { |
811 | if (phdr->p_type != PT_LOAD) | 803 | if (phdr->p_type != PT_LOAD) |
812 | continue; | 804 | continue; |
813 | 805 | ||
814 | if (phdr->p_offset > params->hdr.e_phoff || | 806 | if (phdr->p_offset > params->hdr.e_phoff || |
815 | phdr->p_offset + phdr->p_filesz < stop) | 807 | phdr->p_offset + phdr->p_filesz < stop) |
816 | continue; | 808 | continue; |
817 | 809 | ||
818 | seg = loadmap->segs; | 810 | seg = loadmap->segs; |
819 | for (loop = loadmap->nsegs; loop > 0; loop--, seg++) { | 811 | for (loop = loadmap->nsegs; loop > 0; loop--, seg++) { |
820 | if (phdr->p_vaddr >= seg->p_vaddr && | 812 | if (phdr->p_vaddr >= seg->p_vaddr && |
821 | phdr->p_vaddr + phdr->p_filesz <= | 813 | phdr->p_vaddr + phdr->p_filesz <= |
822 | seg->p_vaddr + seg->p_memsz) { | 814 | seg->p_vaddr + seg->p_memsz) { |
823 | params->ph_addr = | 815 | params->ph_addr = |
824 | (phdr->p_vaddr - seg->p_vaddr) + | 816 | (phdr->p_vaddr - seg->p_vaddr) + |
825 | seg->addr + | 817 | seg->addr + |
826 | params->hdr.e_phoff - phdr->p_offset; | 818 | params->hdr.e_phoff - phdr->p_offset; |
827 | break; | 819 | break; |
828 | } | 820 | } |
829 | } | 821 | } |
830 | break; | 822 | break; |
831 | } | 823 | } |
832 | 824 | ||
833 | /* determine where the dynamic section has wound up if there is one */ | 825 | /* determine where the dynamic section has wound up if there is one */ |
834 | phdr = params->phdrs; | 826 | phdr = params->phdrs; |
835 | for (loop = 0; loop < params->hdr.e_phnum; loop++, phdr++) { | 827 | for (loop = 0; loop < params->hdr.e_phnum; loop++, phdr++) { |
836 | if (phdr->p_type != PT_DYNAMIC) | 828 | if (phdr->p_type != PT_DYNAMIC) |
837 | continue; | 829 | continue; |
838 | 830 | ||
839 | seg = loadmap->segs; | 831 | seg = loadmap->segs; |
840 | for (loop = loadmap->nsegs; loop > 0; loop--, seg++) { | 832 | for (loop = loadmap->nsegs; loop > 0; loop--, seg++) { |
841 | if (phdr->p_vaddr >= seg->p_vaddr && | 833 | if (phdr->p_vaddr >= seg->p_vaddr && |
842 | phdr->p_vaddr + phdr->p_memsz <= | 834 | phdr->p_vaddr + phdr->p_memsz <= |
843 | seg->p_vaddr + seg->p_memsz) { | 835 | seg->p_vaddr + seg->p_memsz) { |
844 | params->dynamic_addr = | 836 | params->dynamic_addr = |
845 | (phdr->p_vaddr - seg->p_vaddr) + | 837 | (phdr->p_vaddr - seg->p_vaddr) + |
846 | seg->addr; | 838 | seg->addr; |
847 | 839 | ||
848 | /* check the dynamic section contains at least | 840 | /* check the dynamic section contains at least |
849 | * one item, and that the last item is a NULL | 841 | * one item, and that the last item is a NULL |
850 | * entry */ | 842 | * entry */ |
851 | if (phdr->p_memsz == 0 || | 843 | if (phdr->p_memsz == 0 || |
852 | phdr->p_memsz % sizeof(Elf32_Dyn) != 0) | 844 | phdr->p_memsz % sizeof(Elf32_Dyn) != 0) |
853 | goto dynamic_error; | 845 | goto dynamic_error; |
854 | 846 | ||
855 | tmp = phdr->p_memsz / sizeof(Elf32_Dyn); | 847 | tmp = phdr->p_memsz / sizeof(Elf32_Dyn); |
856 | if (((Elf32_Dyn *) | 848 | if (((Elf32_Dyn *) |
857 | params->dynamic_addr)[tmp - 1].d_tag != 0) | 849 | params->dynamic_addr)[tmp - 1].d_tag != 0) |
858 | goto dynamic_error; | 850 | goto dynamic_error; |
859 | break; | 851 | break; |
860 | } | 852 | } |
861 | } | 853 | } |
862 | break; | 854 | break; |
863 | } | 855 | } |
864 | 856 | ||
865 | /* now elide adjacent segments in the load map on MMU linux | 857 | /* now elide adjacent segments in the load map on MMU linux |
866 | * - on uClinux the holes between may actually be filled with system | 858 | * - on uClinux the holes between may actually be filled with system |
867 | * stuff or stuff from other processes | 859 | * stuff or stuff from other processes |
868 | */ | 860 | */ |
869 | #ifdef CONFIG_MMU | 861 | #ifdef CONFIG_MMU |
870 | nloads = loadmap->nsegs; | 862 | nloads = loadmap->nsegs; |
871 | mseg = loadmap->segs; | 863 | mseg = loadmap->segs; |
872 | seg = mseg + 1; | 864 | seg = mseg + 1; |
873 | for (loop = 1; loop < nloads; loop++) { | 865 | for (loop = 1; loop < nloads; loop++) { |
874 | /* see if we have a candidate for merging */ | 866 | /* see if we have a candidate for merging */ |
875 | if (seg->p_vaddr - mseg->p_vaddr == seg->addr - mseg->addr) { | 867 | if (seg->p_vaddr - mseg->p_vaddr == seg->addr - mseg->addr) { |
876 | load_addr = PAGE_ALIGN(mseg->addr + mseg->p_memsz); | 868 | load_addr = PAGE_ALIGN(mseg->addr + mseg->p_memsz); |
877 | if (load_addr == (seg->addr & PAGE_MASK)) { | 869 | if (load_addr == (seg->addr & PAGE_MASK)) { |
878 | mseg->p_memsz += | 870 | mseg->p_memsz += |
879 | load_addr - | 871 | load_addr - |
880 | (mseg->addr + mseg->p_memsz); | 872 | (mseg->addr + mseg->p_memsz); |
881 | mseg->p_memsz += seg->addr & ~PAGE_MASK; | 873 | mseg->p_memsz += seg->addr & ~PAGE_MASK; |
882 | mseg->p_memsz += seg->p_memsz; | 874 | mseg->p_memsz += seg->p_memsz; |
883 | loadmap->nsegs--; | 875 | loadmap->nsegs--; |
884 | continue; | 876 | continue; |
885 | } | 877 | } |
886 | } | 878 | } |
887 | 879 | ||
888 | mseg++; | 880 | mseg++; |
889 | if (mseg != seg) | 881 | if (mseg != seg) |
890 | *mseg = *seg; | 882 | *mseg = *seg; |
891 | } | 883 | } |
892 | #endif | 884 | #endif |
893 | 885 | ||
894 | kdebug("Mapped Object [%s]:", what); | 886 | kdebug("Mapped Object [%s]:", what); |
895 | kdebug("- elfhdr : %lx", params->elfhdr_addr); | 887 | kdebug("- elfhdr : %lx", params->elfhdr_addr); |
896 | kdebug("- entry : %lx", params->entry_addr); | 888 | kdebug("- entry : %lx", params->entry_addr); |
897 | kdebug("- PHDR[] : %lx", params->ph_addr); | 889 | kdebug("- PHDR[] : %lx", params->ph_addr); |
898 | kdebug("- DYNAMIC[]: %lx", params->dynamic_addr); | 890 | kdebug("- DYNAMIC[]: %lx", params->dynamic_addr); |
899 | seg = loadmap->segs; | 891 | seg = loadmap->segs; |
900 | for (loop = 0; loop < loadmap->nsegs; loop++, seg++) | 892 | for (loop = 0; loop < loadmap->nsegs; loop++, seg++) |
901 | kdebug("- LOAD[%d] : %08x-%08x [va=%x ms=%x]", | 893 | kdebug("- LOAD[%d] : %08x-%08x [va=%x ms=%x]", |
902 | loop, | 894 | loop, |
903 | seg->addr, seg->addr + seg->p_memsz - 1, | 895 | seg->addr, seg->addr + seg->p_memsz - 1, |
904 | seg->p_vaddr, seg->p_memsz); | 896 | seg->p_vaddr, seg->p_memsz); |
905 | 897 | ||
906 | return 0; | 898 | return 0; |
907 | 899 | ||
908 | dynamic_error: | 900 | dynamic_error: |
909 | printk("ELF FDPIC %s with invalid DYNAMIC section (inode=%lu)\n", | 901 | printk("ELF FDPIC %s with invalid DYNAMIC section (inode=%lu)\n", |
910 | what, file->f_path.dentry->d_inode->i_ino); | 902 | what, file->f_path.dentry->d_inode->i_ino); |
911 | return -ELIBBAD; | 903 | return -ELIBBAD; |
912 | } | 904 | } |
913 | 905 | ||
914 | /*****************************************************************************/ | 906 | /*****************************************************************************/ |
915 | /* | 907 | /* |
916 | * map a file with constant displacement under uClinux | 908 | * map a file with constant displacement under uClinux |
917 | */ | 909 | */ |
918 | #ifndef CONFIG_MMU | 910 | #ifndef CONFIG_MMU |
919 | static int elf_fdpic_map_file_constdisp_on_uclinux( | 911 | static int elf_fdpic_map_file_constdisp_on_uclinux( |
920 | struct elf_fdpic_params *params, | 912 | struct elf_fdpic_params *params, |
921 | struct file *file, | 913 | struct file *file, |
922 | struct mm_struct *mm) | 914 | struct mm_struct *mm) |
923 | { | 915 | { |
924 | struct elf32_fdpic_loadseg *seg; | 916 | struct elf32_fdpic_loadseg *seg; |
925 | struct elf32_phdr *phdr; | 917 | struct elf32_phdr *phdr; |
926 | unsigned long load_addr, base = ULONG_MAX, top = 0, maddr = 0, mflags; | 918 | unsigned long load_addr, base = ULONG_MAX, top = 0, maddr = 0, mflags; |
927 | loff_t fpos; | 919 | loff_t fpos; |
928 | int loop, ret; | 920 | int loop, ret; |
929 | 921 | ||
930 | load_addr = params->load_addr; | 922 | load_addr = params->load_addr; |
931 | seg = params->loadmap->segs; | 923 | seg = params->loadmap->segs; |
932 | 924 | ||
933 | /* determine the bounds of the contiguous overall allocation we must | 925 | /* determine the bounds of the contiguous overall allocation we must |
934 | * make */ | 926 | * make */ |
935 | phdr = params->phdrs; | 927 | phdr = params->phdrs; |
936 | for (loop = 0; loop < params->hdr.e_phnum; loop++, phdr++) { | 928 | for (loop = 0; loop < params->hdr.e_phnum; loop++, phdr++) { |
937 | if (params->phdrs[loop].p_type != PT_LOAD) | 929 | if (params->phdrs[loop].p_type != PT_LOAD) |
938 | continue; | 930 | continue; |
939 | 931 | ||
940 | if (base > phdr->p_vaddr) | 932 | if (base > phdr->p_vaddr) |
941 | base = phdr->p_vaddr; | 933 | base = phdr->p_vaddr; |
942 | if (top < phdr->p_vaddr + phdr->p_memsz) | 934 | if (top < phdr->p_vaddr + phdr->p_memsz) |
943 | top = phdr->p_vaddr + phdr->p_memsz; | 935 | top = phdr->p_vaddr + phdr->p_memsz; |
944 | } | 936 | } |
945 | 937 | ||
946 | /* allocate one big anon block for everything */ | 938 | /* allocate one big anon block for everything */ |
947 | mflags = MAP_PRIVATE; | 939 | mflags = MAP_PRIVATE; |
948 | if (params->flags & ELF_FDPIC_FLAG_EXECUTABLE) | 940 | if (params->flags & ELF_FDPIC_FLAG_EXECUTABLE) |
949 | mflags |= MAP_EXECUTABLE; | 941 | mflags |= MAP_EXECUTABLE; |
950 | 942 | ||
951 | down_write(&mm->mmap_sem); | 943 | down_write(&mm->mmap_sem); |
952 | maddr = do_mmap(NULL, load_addr, top - base, | 944 | maddr = do_mmap(NULL, load_addr, top - base, |
953 | PROT_READ | PROT_WRITE | PROT_EXEC, mflags, 0); | 945 | PROT_READ | PROT_WRITE | PROT_EXEC, mflags, 0); |
954 | up_write(&mm->mmap_sem); | 946 | up_write(&mm->mmap_sem); |
955 | if (IS_ERR_VALUE(maddr)) | 947 | if (IS_ERR_VALUE(maddr)) |
956 | return (int) maddr; | 948 | return (int) maddr; |
957 | 949 | ||
958 | if (load_addr != 0) | 950 | if (load_addr != 0) |
959 | load_addr += PAGE_ALIGN(top - base); | 951 | load_addr += PAGE_ALIGN(top - base); |
960 | 952 | ||
961 | /* and then load the file segments into it */ | 953 | /* and then load the file segments into it */ |
962 | phdr = params->phdrs; | 954 | phdr = params->phdrs; |
963 | for (loop = 0; loop < params->hdr.e_phnum; loop++, phdr++) { | 955 | for (loop = 0; loop < params->hdr.e_phnum; loop++, phdr++) { |
964 | if (params->phdrs[loop].p_type != PT_LOAD) | 956 | if (params->phdrs[loop].p_type != PT_LOAD) |
965 | continue; | 957 | continue; |
966 | 958 | ||
967 | fpos = phdr->p_offset; | 959 | fpos = phdr->p_offset; |
968 | 960 | ||
969 | seg->addr = maddr + (phdr->p_vaddr - base); | 961 | seg->addr = maddr + (phdr->p_vaddr - base); |
970 | seg->p_vaddr = phdr->p_vaddr; | 962 | seg->p_vaddr = phdr->p_vaddr; |
971 | seg->p_memsz = phdr->p_memsz; | 963 | seg->p_memsz = phdr->p_memsz; |
972 | 964 | ||
973 | ret = file->f_op->read(file, (void *) seg->addr, | 965 | ret = file->f_op->read(file, (void *) seg->addr, |
974 | phdr->p_filesz, &fpos); | 966 | phdr->p_filesz, &fpos); |
975 | if (ret < 0) | 967 | if (ret < 0) |
976 | return ret; | 968 | return ret; |
977 | 969 | ||
978 | /* map the ELF header address if in this segment */ | 970 | /* map the ELF header address if in this segment */ |
979 | if (phdr->p_offset == 0) | 971 | if (phdr->p_offset == 0) |
980 | params->elfhdr_addr = seg->addr; | 972 | params->elfhdr_addr = seg->addr; |
981 | 973 | ||
982 | /* clear any space allocated but not loaded */ | 974 | /* clear any space allocated but not loaded */ |
983 | if (phdr->p_filesz < phdr->p_memsz) | 975 | if (phdr->p_filesz < phdr->p_memsz) |
984 | clear_user((void *) (seg->addr + phdr->p_filesz), | 976 | clear_user((void *) (seg->addr + phdr->p_filesz), |
985 | phdr->p_memsz - phdr->p_filesz); | 977 | phdr->p_memsz - phdr->p_filesz); |
986 | 978 | ||
987 | if (mm) { | 979 | if (mm) { |
988 | if (phdr->p_flags & PF_X) { | 980 | if (phdr->p_flags & PF_X) { |
989 | if (!mm->start_code) { | 981 | if (!mm->start_code) { |
990 | mm->start_code = seg->addr; | 982 | mm->start_code = seg->addr; |
991 | mm->end_code = seg->addr + | 983 | mm->end_code = seg->addr + |
992 | phdr->p_memsz; | 984 | phdr->p_memsz; |
993 | } | 985 | } |
994 | } else if (!mm->start_data) { | 986 | } else if (!mm->start_data) { |
995 | mm->start_data = seg->addr; | 987 | mm->start_data = seg->addr; |
996 | #ifndef CONFIG_MMU | 988 | #ifndef CONFIG_MMU |
997 | mm->end_data = seg->addr + phdr->p_memsz; | 989 | mm->end_data = seg->addr + phdr->p_memsz; |
998 | #endif | 990 | #endif |
999 | } | 991 | } |
1000 | 992 | ||
1001 | #ifdef CONFIG_MMU | 993 | #ifdef CONFIG_MMU |
1002 | if (seg->addr + phdr->p_memsz > mm->end_data) | 994 | if (seg->addr + phdr->p_memsz > mm->end_data) |
1003 | mm->end_data = seg->addr + phdr->p_memsz; | 995 | mm->end_data = seg->addr + phdr->p_memsz; |
1004 | #endif | 996 | #endif |
1005 | } | 997 | } |
1006 | 998 | ||
1007 | seg++; | 999 | seg++; |
1008 | } | 1000 | } |
1009 | 1001 | ||
1010 | return 0; | 1002 | return 0; |
1011 | } | 1003 | } |
1012 | #endif | 1004 | #endif |
1013 | 1005 | ||
1014 | /*****************************************************************************/ | 1006 | /*****************************************************************************/ |
1015 | /* | 1007 | /* |
1016 | * map a binary by direct mmap() of the individual PT_LOAD segments | 1008 | * map a binary by direct mmap() of the individual PT_LOAD segments |
1017 | */ | 1009 | */ |
1018 | static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, | 1010 | static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, |
1019 | struct file *file, | 1011 | struct file *file, |
1020 | struct mm_struct *mm) | 1012 | struct mm_struct *mm) |
1021 | { | 1013 | { |
1022 | struct elf32_fdpic_loadseg *seg; | 1014 | struct elf32_fdpic_loadseg *seg; |
1023 | struct elf32_phdr *phdr; | 1015 | struct elf32_phdr *phdr; |
1024 | unsigned long load_addr, delta_vaddr; | 1016 | unsigned long load_addr, delta_vaddr; |
1025 | int loop, dvset; | 1017 | int loop, dvset; |
1026 | 1018 | ||
1027 | load_addr = params->load_addr; | 1019 | load_addr = params->load_addr; |
1028 | delta_vaddr = 0; | 1020 | delta_vaddr = 0; |
1029 | dvset = 0; | 1021 | dvset = 0; |
1030 | 1022 | ||
1031 | seg = params->loadmap->segs; | 1023 | seg = params->loadmap->segs; |
1032 | 1024 | ||
1033 | /* deal with each load segment separately */ | 1025 | /* deal with each load segment separately */ |
1034 | phdr = params->phdrs; | 1026 | phdr = params->phdrs; |
1035 | for (loop = 0; loop < params->hdr.e_phnum; loop++, phdr++) { | 1027 | for (loop = 0; loop < params->hdr.e_phnum; loop++, phdr++) { |
1036 | unsigned long maddr, disp, excess, excess1; | 1028 | unsigned long maddr, disp, excess, excess1; |
1037 | int prot = 0, flags; | 1029 | int prot = 0, flags; |
1038 | 1030 | ||
1039 | if (phdr->p_type != PT_LOAD) | 1031 | if (phdr->p_type != PT_LOAD) |
1040 | continue; | 1032 | continue; |
1041 | 1033 | ||
1042 | kdebug("[LOAD] va=%lx of=%lx fs=%lx ms=%lx", | 1034 | kdebug("[LOAD] va=%lx of=%lx fs=%lx ms=%lx", |
1043 | (unsigned long) phdr->p_vaddr, | 1035 | (unsigned long) phdr->p_vaddr, |
1044 | (unsigned long) phdr->p_offset, | 1036 | (unsigned long) phdr->p_offset, |
1045 | (unsigned long) phdr->p_filesz, | 1037 | (unsigned long) phdr->p_filesz, |
1046 | (unsigned long) phdr->p_memsz); | 1038 | (unsigned long) phdr->p_memsz); |
1047 | 1039 | ||
1048 | /* determine the mapping parameters */ | 1040 | /* determine the mapping parameters */ |
1049 | if (phdr->p_flags & PF_R) prot |= PROT_READ; | 1041 | if (phdr->p_flags & PF_R) prot |= PROT_READ; |
1050 | if (phdr->p_flags & PF_W) prot |= PROT_WRITE; | 1042 | if (phdr->p_flags & PF_W) prot |= PROT_WRITE; |
1051 | if (phdr->p_flags & PF_X) prot |= PROT_EXEC; | 1043 | if (phdr->p_flags & PF_X) prot |= PROT_EXEC; |
1052 | 1044 | ||
1053 | flags = MAP_PRIVATE | MAP_DENYWRITE; | 1045 | flags = MAP_PRIVATE | MAP_DENYWRITE; |
1054 | if (params->flags & ELF_FDPIC_FLAG_EXECUTABLE) | 1046 | if (params->flags & ELF_FDPIC_FLAG_EXECUTABLE) |
1055 | flags |= MAP_EXECUTABLE; | 1047 | flags |= MAP_EXECUTABLE; |
1056 | 1048 | ||
1057 | maddr = 0; | 1049 | maddr = 0; |
1058 | 1050 | ||
1059 | switch (params->flags & ELF_FDPIC_FLAG_ARRANGEMENT) { | 1051 | switch (params->flags & ELF_FDPIC_FLAG_ARRANGEMENT) { |
1060 | case ELF_FDPIC_FLAG_INDEPENDENT: | 1052 | case ELF_FDPIC_FLAG_INDEPENDENT: |
1061 | /* PT_LOADs are independently locatable */ | 1053 | /* PT_LOADs are independently locatable */ |
1062 | break; | 1054 | break; |
1063 | 1055 | ||
1064 | case ELF_FDPIC_FLAG_HONOURVADDR: | 1056 | case ELF_FDPIC_FLAG_HONOURVADDR: |
1065 | /* the specified virtual address must be honoured */ | 1057 | /* the specified virtual address must be honoured */ |
1066 | maddr = phdr->p_vaddr; | 1058 | maddr = phdr->p_vaddr; |
1067 | flags |= MAP_FIXED; | 1059 | flags |= MAP_FIXED; |
1068 | break; | 1060 | break; |
1069 | 1061 | ||
1070 | case ELF_FDPIC_FLAG_CONSTDISP: | 1062 | case ELF_FDPIC_FLAG_CONSTDISP: |
1071 | /* constant displacement | 1063 | /* constant displacement |
1072 | * - can be mapped anywhere, but must be mapped as a | 1064 | * - can be mapped anywhere, but must be mapped as a |
1073 | * unit | 1065 | * unit |
1074 | */ | 1066 | */ |
1075 | if (!dvset) { | 1067 | if (!dvset) { |
1076 | maddr = load_addr; | 1068 | maddr = load_addr; |
1077 | delta_vaddr = phdr->p_vaddr; | 1069 | delta_vaddr = phdr->p_vaddr; |
1078 | dvset = 1; | 1070 | dvset = 1; |
1079 | } else { | 1071 | } else { |
1080 | maddr = load_addr + phdr->p_vaddr - delta_vaddr; | 1072 | maddr = load_addr + phdr->p_vaddr - delta_vaddr; |
1081 | flags |= MAP_FIXED; | 1073 | flags |= MAP_FIXED; |
1082 | } | 1074 | } |
1083 | break; | 1075 | break; |
1084 | 1076 | ||
1085 | case ELF_FDPIC_FLAG_CONTIGUOUS: | 1077 | case ELF_FDPIC_FLAG_CONTIGUOUS: |
1086 | /* contiguity handled later */ | 1078 | /* contiguity handled later */ |
1087 | break; | 1079 | break; |
1088 | 1080 | ||
1089 | default: | 1081 | default: |
1090 | BUG(); | 1082 | BUG(); |
1091 | } | 1083 | } |
1092 | 1084 | ||
1093 | maddr &= PAGE_MASK; | 1085 | maddr &= PAGE_MASK; |
1094 | 1086 | ||
1095 | /* create the mapping */ | 1087 | /* create the mapping */ |
1096 | disp = phdr->p_vaddr & ~PAGE_MASK; | 1088 | disp = phdr->p_vaddr & ~PAGE_MASK; |
1097 | down_write(&mm->mmap_sem); | 1089 | down_write(&mm->mmap_sem); |
1098 | maddr = do_mmap(file, maddr, phdr->p_memsz + disp, prot, flags, | 1090 | maddr = do_mmap(file, maddr, phdr->p_memsz + disp, prot, flags, |
1099 | phdr->p_offset - disp); | 1091 | phdr->p_offset - disp); |
1100 | up_write(&mm->mmap_sem); | 1092 | up_write(&mm->mmap_sem); |
1101 | 1093 | ||
1102 | kdebug("mmap[%d] <file> sz=%lx pr=%x fl=%x of=%lx --> %08lx", | 1094 | kdebug("mmap[%d] <file> sz=%lx pr=%x fl=%x of=%lx --> %08lx", |
1103 | loop, phdr->p_memsz + disp, prot, flags, | 1095 | loop, phdr->p_memsz + disp, prot, flags, |
1104 | phdr->p_offset - disp, maddr); | 1096 | phdr->p_offset - disp, maddr); |
1105 | 1097 | ||
1106 | if (IS_ERR_VALUE(maddr)) | 1098 | if (IS_ERR_VALUE(maddr)) |
1107 | return (int) maddr; | 1099 | return (int) maddr; |
1108 | 1100 | ||
1109 | if ((params->flags & ELF_FDPIC_FLAG_ARRANGEMENT) == | 1101 | if ((params->flags & ELF_FDPIC_FLAG_ARRANGEMENT) == |
1110 | ELF_FDPIC_FLAG_CONTIGUOUS) | 1102 | ELF_FDPIC_FLAG_CONTIGUOUS) |
1111 | load_addr += PAGE_ALIGN(phdr->p_memsz + disp); | 1103 | load_addr += PAGE_ALIGN(phdr->p_memsz + disp); |
1112 | 1104 | ||
1113 | seg->addr = maddr + disp; | 1105 | seg->addr = maddr + disp; |
1114 | seg->p_vaddr = phdr->p_vaddr; | 1106 | seg->p_vaddr = phdr->p_vaddr; |
1115 | seg->p_memsz = phdr->p_memsz; | 1107 | seg->p_memsz = phdr->p_memsz; |
1116 | 1108 | ||
1117 | /* map the ELF header address if in this segment */ | 1109 | /* map the ELF header address if in this segment */ |
1118 | if (phdr->p_offset == 0) | 1110 | if (phdr->p_offset == 0) |
1119 | params->elfhdr_addr = seg->addr; | 1111 | params->elfhdr_addr = seg->addr; |
1120 | 1112 | ||
1121 | /* clear the bit between beginning of mapping and beginning of | 1113 | /* clear the bit between beginning of mapping and beginning of |
1122 | * PT_LOAD */ | 1114 | * PT_LOAD */ |
1123 | if (prot & PROT_WRITE && disp > 0) { | 1115 | if (prot & PROT_WRITE && disp > 0) { |
1124 | kdebug("clear[%d] ad=%lx sz=%lx", loop, maddr, disp); | 1116 | kdebug("clear[%d] ad=%lx sz=%lx", loop, maddr, disp); |
1125 | clear_user((void __user *) maddr, disp); | 1117 | clear_user((void __user *) maddr, disp); |
1126 | maddr += disp; | 1118 | maddr += disp; |
1127 | } | 1119 | } |
1128 | 1120 | ||
1129 | /* clear any space allocated but not loaded | 1121 | /* clear any space allocated but not loaded |
1130 | * - on uClinux we can just clear the lot | 1122 | * - on uClinux we can just clear the lot |
1131 | * - on MMU linux we'll get a SIGBUS beyond the last page | 1123 | * - on MMU linux we'll get a SIGBUS beyond the last page |
1132 | * extant in the file | 1124 | * extant in the file |
1133 | */ | 1125 | */ |
1134 | excess = phdr->p_memsz - phdr->p_filesz; | 1126 | excess = phdr->p_memsz - phdr->p_filesz; |
1135 | excess1 = PAGE_SIZE - ((maddr + phdr->p_filesz) & ~PAGE_MASK); | 1127 | excess1 = PAGE_SIZE - ((maddr + phdr->p_filesz) & ~PAGE_MASK); |
1136 | 1128 | ||
1137 | #ifdef CONFIG_MMU | 1129 | #ifdef CONFIG_MMU |
1138 | if (excess > excess1) { | 1130 | if (excess > excess1) { |
1139 | unsigned long xaddr = maddr + phdr->p_filesz + excess1; | 1131 | unsigned long xaddr = maddr + phdr->p_filesz + excess1; |
1140 | unsigned long xmaddr; | 1132 | unsigned long xmaddr; |
1141 | 1133 | ||
1142 | flags |= MAP_FIXED | MAP_ANONYMOUS; | 1134 | flags |= MAP_FIXED | MAP_ANONYMOUS; |
1143 | down_write(&mm->mmap_sem); | 1135 | down_write(&mm->mmap_sem); |
1144 | xmaddr = do_mmap(NULL, xaddr, excess - excess1, | 1136 | xmaddr = do_mmap(NULL, xaddr, excess - excess1, |
1145 | prot, flags, 0); | 1137 | prot, flags, 0); |
1146 | up_write(&mm->mmap_sem); | 1138 | up_write(&mm->mmap_sem); |
1147 | 1139 | ||
1148 | kdebug("mmap[%d] <anon>" | 1140 | kdebug("mmap[%d] <anon>" |
1149 | " ad=%lx sz=%lx pr=%x fl=%x of=0 --> %08lx", | 1141 | " ad=%lx sz=%lx pr=%x fl=%x of=0 --> %08lx", |
1150 | loop, xaddr, excess - excess1, prot, flags, | 1142 | loop, xaddr, excess - excess1, prot, flags, |
1151 | xmaddr); | 1143 | xmaddr); |
1152 | 1144 | ||
1153 | if (xmaddr != xaddr) | 1145 | if (xmaddr != xaddr) |
1154 | return -ENOMEM; | 1146 | return -ENOMEM; |
1155 | } | 1147 | } |
1156 | 1148 | ||
1157 | if (prot & PROT_WRITE && excess1 > 0) { | 1149 | if (prot & PROT_WRITE && excess1 > 0) { |
1158 | kdebug("clear[%d] ad=%lx sz=%lx", | 1150 | kdebug("clear[%d] ad=%lx sz=%lx", |
1159 | loop, maddr + phdr->p_filesz, excess1); | 1151 | loop, maddr + phdr->p_filesz, excess1); |
1160 | clear_user((void __user *) maddr + phdr->p_filesz, | 1152 | clear_user((void __user *) maddr + phdr->p_filesz, |
1161 | excess1); | 1153 | excess1); |
1162 | } | 1154 | } |
1163 | 1155 | ||
1164 | #else | 1156 | #else |
1165 | if (excess > 0) { | 1157 | if (excess > 0) { |
1166 | kdebug("clear[%d] ad=%lx sz=%lx", | 1158 | kdebug("clear[%d] ad=%lx sz=%lx", |
1167 | loop, maddr + phdr->p_filesz, excess); | 1159 | loop, maddr + phdr->p_filesz, excess); |
1168 | clear_user((void *) maddr + phdr->p_filesz, excess); | 1160 | clear_user((void *) maddr + phdr->p_filesz, excess); |
1169 | } | 1161 | } |
1170 | #endif | 1162 | #endif |
1171 | 1163 | ||
1172 | if (mm) { | 1164 | if (mm) { |
1173 | if (phdr->p_flags & PF_X) { | 1165 | if (phdr->p_flags & PF_X) { |
1174 | if (!mm->start_code) { | 1166 | if (!mm->start_code) { |
1175 | mm->start_code = maddr; | 1167 | mm->start_code = maddr; |
1176 | mm->end_code = maddr + phdr->p_memsz; | 1168 | mm->end_code = maddr + phdr->p_memsz; |
1177 | } | 1169 | } |
1178 | } else if (!mm->start_data) { | 1170 | } else if (!mm->start_data) { |
1179 | mm->start_data = maddr; | 1171 | mm->start_data = maddr; |
1180 | mm->end_data = maddr + phdr->p_memsz; | 1172 | mm->end_data = maddr + phdr->p_memsz; |
1181 | } | 1173 | } |
1182 | } | 1174 | } |
1183 | 1175 | ||
1184 | seg++; | 1176 | seg++; |
1185 | } | 1177 | } |
1186 | 1178 | ||
1187 | return 0; | 1179 | return 0; |
1188 | } | 1180 | } |
1189 | 1181 | ||
/*****************************************************************************/
/*
 * ELF-FDPIC core dumper
 *
 * Modelled on fs/exec.c:aout_core_dump()
 * Jeremy Fitzhardinge <jeremy@sw.oz.au>
 *
 * Modelled on fs/binfmt_elf.c core dumper
 */
#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE)

/*
 * These are the only things you should do on a core-file: use only these
 * functions to write out all the necessary info.
 */
1205 | static int dump_write(struct file *file, const void *addr, int nr) | 1197 | static int dump_write(struct file *file, const void *addr, int nr) |
1206 | { | 1198 | { |
1207 | return file->f_op->write(file, addr, nr, &file->f_pos) == nr; | 1199 | return file->f_op->write(file, addr, nr, &file->f_pos) == nr; |
1208 | } | 1200 | } |
1209 | 1201 | ||
1210 | static int dump_seek(struct file *file, loff_t off) | 1202 | static int dump_seek(struct file *file, loff_t off) |
1211 | { | 1203 | { |
1212 | if (file->f_op->llseek) { | 1204 | if (file->f_op->llseek) { |
1213 | if (file->f_op->llseek(file, off, SEEK_SET) != off) | 1205 | if (file->f_op->llseek(file, off, SEEK_SET) != off) |
1214 | return 0; | 1206 | return 0; |
1215 | } else { | 1207 | } else { |
1216 | file->f_pos = off; | 1208 | file->f_pos = off; |
1217 | } | 1209 | } |
1218 | return 1; | 1210 | return 1; |
1219 | } | 1211 | } |
1220 | 1212 | ||
1221 | /* | 1213 | /* |
1222 | * Decide whether a segment is worth dumping; default is yes to be | 1214 | * Decide whether a segment is worth dumping; default is yes to be |
1223 | * sure (missing info is worse than too much; etc). | 1215 | * sure (missing info is worse than too much; etc). |
1224 | * Personally I'd include everything, and use the coredump limit... | 1216 | * Personally I'd include everything, and use the coredump limit... |
1225 | * | 1217 | * |
1226 | * I think we should skip something. But I am not sure how. H.J. | 1218 | * I think we should skip something. But I am not sure how. H.J. |
1227 | */ | 1219 | */ |
1228 | static int maydump(struct vm_area_struct *vma, unsigned long mm_flags) | 1220 | static int maydump(struct vm_area_struct *vma, unsigned long mm_flags) |
1229 | { | 1221 | { |
1230 | int dump_ok; | 1222 | int dump_ok; |
1231 | 1223 | ||
1232 | /* Do not dump I/O mapped devices or special mappings */ | 1224 | /* Do not dump I/O mapped devices or special mappings */ |
1233 | if (vma->vm_flags & (VM_IO | VM_RESERVED)) { | 1225 | if (vma->vm_flags & (VM_IO | VM_RESERVED)) { |
1234 | kdcore("%08lx: %08lx: no (IO)", vma->vm_start, vma->vm_flags); | 1226 | kdcore("%08lx: %08lx: no (IO)", vma->vm_start, vma->vm_flags); |
1235 | return 0; | 1227 | return 0; |
1236 | } | 1228 | } |
1237 | 1229 | ||
1238 | /* If we may not read the contents, don't allow us to dump | 1230 | /* If we may not read the contents, don't allow us to dump |
1239 | * them either. "dump_write()" can't handle it anyway. | 1231 | * them either. "dump_write()" can't handle it anyway. |
1240 | */ | 1232 | */ |
1241 | if (!(vma->vm_flags & VM_READ)) { | 1233 | if (!(vma->vm_flags & VM_READ)) { |
1242 | kdcore("%08lx: %08lx: no (!read)", vma->vm_start, vma->vm_flags); | 1234 | kdcore("%08lx: %08lx: no (!read)", vma->vm_start, vma->vm_flags); |
1243 | return 0; | 1235 | return 0; |
1244 | } | 1236 | } |
1245 | 1237 | ||
1246 | /* By default, dump shared memory if mapped from an anonymous file. */ | 1238 | /* By default, dump shared memory if mapped from an anonymous file. */ |
1247 | if (vma->vm_flags & VM_SHARED) { | 1239 | if (vma->vm_flags & VM_SHARED) { |
1248 | if (vma->vm_file->f_path.dentry->d_inode->i_nlink == 0) { | 1240 | if (vma->vm_file->f_path.dentry->d_inode->i_nlink == 0) { |
1249 | dump_ok = test_bit(MMF_DUMP_ANON_SHARED, &mm_flags); | 1241 | dump_ok = test_bit(MMF_DUMP_ANON_SHARED, &mm_flags); |
1250 | kdcore("%08lx: %08lx: %s (share)", vma->vm_start, | 1242 | kdcore("%08lx: %08lx: %s (share)", vma->vm_start, |
1251 | vma->vm_flags, dump_ok ? "yes" : "no"); | 1243 | vma->vm_flags, dump_ok ? "yes" : "no"); |
1252 | return dump_ok; | 1244 | return dump_ok; |
1253 | } | 1245 | } |
1254 | 1246 | ||
1255 | dump_ok = test_bit(MMF_DUMP_MAPPED_SHARED, &mm_flags); | 1247 | dump_ok = test_bit(MMF_DUMP_MAPPED_SHARED, &mm_flags); |
1256 | kdcore("%08lx: %08lx: %s (share)", vma->vm_start, | 1248 | kdcore("%08lx: %08lx: %s (share)", vma->vm_start, |
1257 | vma->vm_flags, dump_ok ? "yes" : "no"); | 1249 | vma->vm_flags, dump_ok ? "yes" : "no"); |
1258 | return dump_ok; | 1250 | return dump_ok; |
1259 | } | 1251 | } |
1260 | 1252 | ||
1261 | #ifdef CONFIG_MMU | 1253 | #ifdef CONFIG_MMU |
1262 | /* By default, if it hasn't been written to, don't write it out */ | 1254 | /* By default, if it hasn't been written to, don't write it out */ |
1263 | if (!vma->anon_vma) { | 1255 | if (!vma->anon_vma) { |
1264 | dump_ok = test_bit(MMF_DUMP_MAPPED_PRIVATE, &mm_flags); | 1256 | dump_ok = test_bit(MMF_DUMP_MAPPED_PRIVATE, &mm_flags); |
1265 | kdcore("%08lx: %08lx: %s (!anon)", vma->vm_start, | 1257 | kdcore("%08lx: %08lx: %s (!anon)", vma->vm_start, |
1266 | vma->vm_flags, dump_ok ? "yes" : "no"); | 1258 | vma->vm_flags, dump_ok ? "yes" : "no"); |
1267 | return dump_ok; | 1259 | return dump_ok; |
1268 | } | 1260 | } |
1269 | #endif | 1261 | #endif |
1270 | 1262 | ||
1271 | dump_ok = test_bit(MMF_DUMP_ANON_PRIVATE, &mm_flags); | 1263 | dump_ok = test_bit(MMF_DUMP_ANON_PRIVATE, &mm_flags); |
1272 | kdcore("%08lx: %08lx: %s", vma->vm_start, vma->vm_flags, | 1264 | kdcore("%08lx: %08lx: %s", vma->vm_start, vma->vm_flags, |
1273 | dump_ok ? "yes" : "no"); | 1265 | dump_ok ? "yes" : "no"); |
1274 | return dump_ok; | 1266 | return dump_ok; |
1275 | } | 1267 | } |
1276 | 1268 | ||
/* An ELF note in memory */
struct memelfnote
{
	const char *name;	/* note owner name (e.g. "CORE") */
	int type;		/* note type (NT_* value) */
	unsigned int datasz;	/* size of the descriptor data in bytes */
	void *data;		/* pointer to the descriptor data */
};
1285 | 1277 | ||
1286 | static int notesize(struct memelfnote *en) | 1278 | static int notesize(struct memelfnote *en) |
1287 | { | 1279 | { |
1288 | int sz; | 1280 | int sz; |
1289 | 1281 | ||
1290 | sz = sizeof(struct elf_note); | 1282 | sz = sizeof(struct elf_note); |
1291 | sz += roundup(strlen(en->name) + 1, 4); | 1283 | sz += roundup(strlen(en->name) + 1, 4); |
1292 | sz += roundup(en->datasz, 4); | 1284 | sz += roundup(en->datasz, 4); |
1293 | 1285 | ||
1294 | return sz; | 1286 | return sz; |
1295 | } | 1287 | } |
1296 | 1288 | ||
/* #define DEBUG */

/* helpers for writenote(): bail out of the enclosing function returning 0
 * if a write or seek on the core file fails */
#define DUMP_WRITE(addr, nr) \
	do { if (!dump_write(file, (addr), (nr))) return 0; } while(0)
#define DUMP_SEEK(off) \
	do { if (!dump_seek(file, (off))) return 0; } while(0)
1303 | 1295 | ||
/*
 * Emit one ELF note record (header, name, descriptor) to the core file,
 * 4-byte aligning the name and descriptor as the ELF format requires.
 * Returns 1 on success, 0 on write/seek failure (via DUMP_WRITE/DUMP_SEEK).
 */
static int writenote(struct memelfnote *men, struct file *file)
{
	struct elf_note en;

	en.n_namesz = strlen(men->name) + 1;	/* include terminating NUL */
	en.n_descsz = men->datasz;
	en.n_type = men->type;

	DUMP_WRITE(&en, sizeof(en));
	DUMP_WRITE(men->name, en.n_namesz);
	/* XXX - cast from long long to long to avoid need for libgcc.a */
	DUMP_SEEK(roundup((unsigned long)file->f_pos, 4));	/* XXX */
	DUMP_WRITE(men->data, men->datasz);
	DUMP_SEEK(roundup((unsigned long)file->f_pos, 4));	/* XXX */

	return 1;
}
#undef DUMP_WRITE
#undef DUMP_SEEK

/*
 * Core-dump variants of the helpers: charge each write against the
 * running total ('size') and the core-file limit ('limit'), and jump to
 * the enclosing function's end_coredump label on overrun or I/O failure.
 */
#define DUMP_WRITE(addr, nr)	\
	if ((size += (nr)) > limit || !dump_write(file, (addr), (nr))) \
		goto end_coredump;
#define DUMP_SEEK(off)	\
	if (!dump_seek(file, (off))) \
		goto end_coredump;
1330 | 1322 | ||
1331 | static inline void fill_elf_fdpic_header(struct elfhdr *elf, int segs) | 1323 | static inline void fill_elf_fdpic_header(struct elfhdr *elf, int segs) |
1332 | { | 1324 | { |
1333 | memcpy(elf->e_ident, ELFMAG, SELFMAG); | 1325 | memcpy(elf->e_ident, ELFMAG, SELFMAG); |
1334 | elf->e_ident[EI_CLASS] = ELF_CLASS; | 1326 | elf->e_ident[EI_CLASS] = ELF_CLASS; |
1335 | elf->e_ident[EI_DATA] = ELF_DATA; | 1327 | elf->e_ident[EI_DATA] = ELF_DATA; |
1336 | elf->e_ident[EI_VERSION] = EV_CURRENT; | 1328 | elf->e_ident[EI_VERSION] = EV_CURRENT; |
1337 | elf->e_ident[EI_OSABI] = ELF_OSABI; | 1329 | elf->e_ident[EI_OSABI] = ELF_OSABI; |
1338 | memset(elf->e_ident+EI_PAD, 0, EI_NIDENT-EI_PAD); | 1330 | memset(elf->e_ident+EI_PAD, 0, EI_NIDENT-EI_PAD); |
1339 | 1331 | ||
1340 | elf->e_type = ET_CORE; | 1332 | elf->e_type = ET_CORE; |
1341 | elf->e_machine = ELF_ARCH; | 1333 | elf->e_machine = ELF_ARCH; |
1342 | elf->e_version = EV_CURRENT; | 1334 | elf->e_version = EV_CURRENT; |
1343 | elf->e_entry = 0; | 1335 | elf->e_entry = 0; |
1344 | elf->e_phoff = sizeof(struct elfhdr); | 1336 | elf->e_phoff = sizeof(struct elfhdr); |
1345 | elf->e_shoff = 0; | 1337 | elf->e_shoff = 0; |
1346 | elf->e_flags = ELF_FDPIC_CORE_EFLAGS; | 1338 | elf->e_flags = ELF_FDPIC_CORE_EFLAGS; |
1347 | elf->e_ehsize = sizeof(struct elfhdr); | 1339 | elf->e_ehsize = sizeof(struct elfhdr); |
1348 | elf->e_phentsize = sizeof(struct elf_phdr); | 1340 | elf->e_phentsize = sizeof(struct elf_phdr); |
1349 | elf->e_phnum = segs; | 1341 | elf->e_phnum = segs; |
1350 | elf->e_shentsize = 0; | 1342 | elf->e_shentsize = 0; |
1351 | elf->e_shnum = 0; | 1343 | elf->e_shnum = 0; |
1352 | elf->e_shstrndx = 0; | 1344 | elf->e_shstrndx = 0; |
1353 | return; | 1345 | return; |
1354 | } | 1346 | } |
1355 | 1347 | ||
1356 | static inline void fill_elf_note_phdr(struct elf_phdr *phdr, int sz, loff_t offset) | 1348 | static inline void fill_elf_note_phdr(struct elf_phdr *phdr, int sz, loff_t offset) |
1357 | { | 1349 | { |
1358 | phdr->p_type = PT_NOTE; | 1350 | phdr->p_type = PT_NOTE; |
1359 | phdr->p_offset = offset; | 1351 | phdr->p_offset = offset; |
1360 | phdr->p_vaddr = 0; | 1352 | phdr->p_vaddr = 0; |
1361 | phdr->p_paddr = 0; | 1353 | phdr->p_paddr = 0; |
1362 | phdr->p_filesz = sz; | 1354 | phdr->p_filesz = sz; |
1363 | phdr->p_memsz = 0; | 1355 | phdr->p_memsz = 0; |
1364 | phdr->p_flags = 0; | 1356 | phdr->p_flags = 0; |
1365 | phdr->p_align = 0; | 1357 | phdr->p_align = 0; |
1366 | return; | 1358 | return; |
1367 | } | 1359 | } |
1368 | 1360 | ||
1369 | static inline void fill_note(struct memelfnote *note, const char *name, int type, | 1361 | static inline void fill_note(struct memelfnote *note, const char *name, int type, |
1370 | unsigned int sz, void *data) | 1362 | unsigned int sz, void *data) |
1371 | { | 1363 | { |
1372 | note->name = name; | 1364 | note->name = name; |
1373 | note->type = type; | 1365 | note->type = type; |
1374 | note->datasz = sz; | 1366 | note->datasz = sz; |
1375 | note->data = data; | 1367 | note->data = data; |
1376 | return; | 1368 | return; |
1377 | } | 1369 | } |
1378 | 1370 | ||
/*
 * fill up all the fields in prstatus from the given task struct, except
 * registers which need to be filled up separately.
 */
static void fill_prstatus(struct elf_prstatus *prstatus,
			  struct task_struct *p, long signr)
{
	prstatus->pr_info.si_signo = prstatus->pr_cursig = signr;
	prstatus->pr_sigpend = p->pending.signal.sig[0];
	prstatus->pr_sighold = p->blocked.sig[0];
	/* IDs as seen from the task's own PID namespace */
	prstatus->pr_pid = task_pid_vnr(p);
	prstatus->pr_ppid = task_pid_vnr(p->parent);
	prstatus->pr_pgrp = task_pgrp_vnr(p);
	prstatus->pr_sid = task_session_vnr(p);
	if (thread_group_leader(p)) {
		struct task_cputime cputime;

		/*
		 * This is the record for the group leader.  It shows the
		 * group-wide total, not its individual thread total.
		 */
		thread_group_cputime(p, &cputime);
		cputime_to_timeval(cputime.utime, &prstatus->pr_utime);
		cputime_to_timeval(cputime.stime, &prstatus->pr_stime);
	} else {
		cputime_to_timeval(p->utime, &prstatus->pr_utime);
		cputime_to_timeval(p->stime, &prstatus->pr_stime);
	}
	cputime_to_timeval(p->signal->cutime, &prstatus->pr_cutime);
	cputime_to_timeval(p->signal->cstime, &prstatus->pr_cstime);

	/* FDPIC-specific: lets a debugger find the executable's and the
	 * interpreter's load maps in the core file */
	prstatus->pr_exec_fdpic_loadmap = p->mm->context.exec_fdpic_loadmap;
	prstatus->pr_interp_fdpic_loadmap = p->mm->context.interp_fdpic_loadmap;
}
1413 | 1405 | ||
/*
 * Fill in the process-wide NT_PRPSINFO note: command line (copied from
 * userspace), IDs, scheduling state and credentials.
 * Returns 0 on success or -EFAULT if the argument strings cannot be
 * copied from user memory.
 */
static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
		       struct mm_struct *mm)
{
	const struct cred *cred;
	unsigned int i, len;

	/* first copy the parameters from user space */
	memset(psinfo, 0, sizeof(struct elf_prpsinfo));

	len = mm->arg_end - mm->arg_start;
	if (len >= ELF_PRARGSZ)
		len = ELF_PRARGSZ - 1;	/* leave room for terminating NUL */
	if (copy_from_user(&psinfo->pr_psargs,
		           (const char __user *) mm->arg_start, len))
		return -EFAULT;
	/* arguments are NUL-separated in memory; flatten to one line */
	for (i = 0; i < len; i++)
		if (psinfo->pr_psargs[i] == 0)
			psinfo->pr_psargs[i] = ' ';
	psinfo->pr_psargs[len] = 0;

	psinfo->pr_pid = task_pid_vnr(p);
	psinfo->pr_ppid = task_pid_vnr(p->parent);
	psinfo->pr_pgrp = task_pgrp_vnr(p);
	psinfo->pr_sid = task_session_vnr(p);

	/* encode the task state as an index into "RSDTZW"; '.' if unknown */
	i = p->state ? ffz(~p->state) + 1 : 0;
	psinfo->pr_state = i;
	psinfo->pr_sname = (i > 5) ? '.' : "RSDTZW"[i];
	psinfo->pr_zomb = psinfo->pr_sname == 'Z';
	psinfo->pr_nice = task_nice(p);
	psinfo->pr_flag = p->flags;
	/* credentials must be accessed under RCU */
	rcu_read_lock();
	cred = __task_cred(p);
	SET_UID(psinfo->pr_uid, cred->uid);
	SET_GID(psinfo->pr_gid, cred->gid);
	rcu_read_unlock();
	strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname));

	return 0;
}
1454 | 1446 | ||
/* Here is the structure in which status of each thread is captured. */
struct elf_thread_status
{
	struct list_head list;		/* link in the dumper's thread list */
	struct elf_prstatus prstatus;	/* NT_PRSTATUS */
	elf_fpregset_t fpu;		/* NT_PRFPREG */
	struct task_struct *thread;	/* the thread being described */
#ifdef ELF_CORE_COPY_XFPREGS
	elf_fpxregset_t xfpu;		/* ELF_CORE_XFPREG_TYPE */
#endif
	struct memelfnote notes[3];	/* prstatus + optional fpu + xfpu */
	int num_notes;			/* number of valid entries in notes[] */
};
1468 | 1460 | ||
/*
 * In order to add the specific thread information for the elf file format,
 * we need to keep a linked list of every thread's pr_status and then create
 * a single section for them in the final core file.
 */
static int elf_dump_thread_status(long signr, struct elf_thread_status *t)
{
	struct task_struct *p = t->thread;
	int sz = 0;

	t->num_notes = 0;

	/* note 0: general-purpose registers and status (always present) */
	fill_prstatus(&t->prstatus, p, signr);
	elf_core_copy_task_regs(p, &t->prstatus.pr_reg);

	fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus),
		  &t->prstatus);
	t->num_notes++;
	sz += notesize(&t->notes[0]);

	/* note 1: FPU state, only if the thread has used the FPU */
	t->prstatus.pr_fpvalid = elf_core_copy_task_fpregs(p, NULL, &t->fpu);
	if (t->prstatus.pr_fpvalid) {
		fill_note(&t->notes[1], "CORE", NT_PRFPREG, sizeof(t->fpu),
			  &t->fpu);
		t->num_notes++;
		sz += notesize(&t->notes[1]);
	}

#ifdef ELF_CORE_COPY_XFPREGS
	/* note 2: extended FPU state, where the arch provides it */
	if (elf_core_copy_task_xfpregs(p, &t->xfpu)) {
		fill_note(&t->notes[2], "LINUX", ELF_CORE_XFPREG_TYPE,
			  sizeof(t->xfpu), &t->xfpu);
		t->num_notes++;
		sz += notesize(&t->notes[2]);
	}
#endif
	/* total on-file size of this thread's notes */
	return sz;
}
1507 | 1499 | ||
/*
 * dump the segments for an MMU process
 *
 * Walks every VMA, faulting its pages in one at a time and writing them
 * to the core file.  Pages that cannot be faulted, and the shared zero
 * page, are represented as holes (seeks) instead of data.
 * Returns 0 on success, -EIO on write failure, -EFBIG (via DUMP_SEEK's
 * end_coredump goto) when the core limit is exceeded while seeking.
 */
#ifdef CONFIG_MMU
static int elf_fdpic_dump_segments(struct file *file, size_t *size,
			   unsigned long *limit, unsigned long mm_flags)
{
	struct vm_area_struct *vma;

	for (vma = current->mm->mmap; vma; vma = vma->vm_next) {
		unsigned long addr;

		if (!maydump(vma, mm_flags))
			continue;

		for (addr = vma->vm_start;
		     addr < vma->vm_end;
		     addr += PAGE_SIZE
		     ) {
			/* renamed from 'vma' to avoid shadowing the outer
			 * loop variable (-Wshadow) */
			struct vm_area_struct *page_vma;
			struct page *page;

			if (get_user_pages(current, current->mm, addr, 1, 0, 1,
					   &page, &page_vma) <= 0) {
				DUMP_SEEK(file->f_pos + PAGE_SIZE);
			}
			else if (page == ZERO_PAGE(0)) {
				page_cache_release(page);
				DUMP_SEEK(file->f_pos + PAGE_SIZE);
			}
			else {
				void *kaddr;

				flush_cache_page(page_vma, addr, page_to_pfn(page));
				kaddr = kmap(page);
				if ((*size += PAGE_SIZE) > *limit ||
				    !dump_write(file, kaddr, PAGE_SIZE)
				    ) {
					kunmap(page);
					page_cache_release(page);
					return -EIO;
				}
				kunmap(page);
				page_cache_release(page);
			}
		}
	}

	return 0;

end_coredump:
	return -EFBIG;
}
#endif
1562 | 1554 | ||
/*
 * dump the segments for a NOMMU process
 */
#ifndef CONFIG_MMU
static int elf_fdpic_dump_segments(struct file *file, size_t *size,
			   unsigned long *limit, unsigned long mm_flags)
{
	struct vm_area_struct *vma;

	for (vma = current->mm->mmap; vma; vma = vma->vm_next) {
		if (!maydump(vma, mm_flags))
			continue;

		/* NOTE(review): only PAGE_SIZE is charged against the core
		 * limit per VMA even though the entire VMA is written below —
		 * confirm this undercounting is intentional */
		if ((*size += PAGE_SIZE) > *limit)
			return -EFBIG;

		/* no MMU: the VMA's contents are directly addressable, so
		 * the whole region can be written in one go */
		if (!dump_write(file, (void *) vma->vm_start,
				vma->vm_end - vma->vm_start))
			return -EIO;
	}

	return 0;
}
#endif
1589 | 1579 | ||
1590 | /* | 1580 | /* |
1591 | * Actual dumper | 1581 | * Actual dumper |
1592 | * | 1582 | * |
1593 | * This is a two-pass process; first we find the offsets of the bits, | 1583 | * This is a two-pass process; first we find the offsets of the bits, |
1594 | * and then they are actually written out. If we run out of core limit | 1584 | * and then they are actually written out. If we run out of core limit |
1595 | * we just truncate. | 1585 | * we just truncate. |
1596 | */ | 1586 | */ |
1597 | static int elf_fdpic_core_dump(long signr, struct pt_regs *regs, | 1587 | static int elf_fdpic_core_dump(long signr, struct pt_regs *regs, |
1598 | struct file *file, unsigned long limit) | 1588 | struct file *file, unsigned long limit) |
1599 | { | 1589 | { |
1600 | #define NUM_NOTES 6 | 1590 | #define NUM_NOTES 6 |
1601 | int has_dumped = 0; | 1591 | int has_dumped = 0; |
1602 | mm_segment_t fs; | 1592 | mm_segment_t fs; |
1603 | int segs; | 1593 | int segs; |
1604 | size_t size = 0; | 1594 | size_t size = 0; |
1605 | int i; | 1595 | int i; |
1606 | struct vm_area_struct *vma; | 1596 | struct vm_area_struct *vma; |
1607 | struct elfhdr *elf = NULL; | 1597 | struct elfhdr *elf = NULL; |
1608 | loff_t offset = 0, dataoff; | 1598 | loff_t offset = 0, dataoff; |
1609 | int numnote; | 1599 | int numnote; |
1610 | struct memelfnote *notes = NULL; | 1600 | struct memelfnote *notes = NULL; |
1611 | struct elf_prstatus *prstatus = NULL; /* NT_PRSTATUS */ | 1601 | struct elf_prstatus *prstatus = NULL; /* NT_PRSTATUS */ |
1612 | struct elf_prpsinfo *psinfo = NULL; /* NT_PRPSINFO */ | 1602 | struct elf_prpsinfo *psinfo = NULL; /* NT_PRPSINFO */ |
1613 | LIST_HEAD(thread_list); | 1603 | LIST_HEAD(thread_list); |
1614 | struct list_head *t; | 1604 | struct list_head *t; |
1615 | elf_fpregset_t *fpu = NULL; | 1605 | elf_fpregset_t *fpu = NULL; |
1616 | #ifdef ELF_CORE_COPY_XFPREGS | 1606 | #ifdef ELF_CORE_COPY_XFPREGS |
1617 | elf_fpxregset_t *xfpu = NULL; | 1607 | elf_fpxregset_t *xfpu = NULL; |
1618 | #endif | 1608 | #endif |
1619 | int thread_status_size = 0; | 1609 | int thread_status_size = 0; |
1620 | #ifndef CONFIG_MMU | ||
1621 | struct vm_list_struct *vml; | ||
1622 | #endif | ||
1623 | elf_addr_t *auxv; | 1610 | elf_addr_t *auxv; |
1624 | unsigned long mm_flags; | 1611 | unsigned long mm_flags; |
1625 | 1612 | ||
1626 | /* | 1613 | /* |
1627 | * We no longer stop all VM operations. | 1614 | * We no longer stop all VM operations. |
1628 | * | 1615 | * |
1629 | * This is because those proceses that could possibly change map_count | 1616 | * This is because those proceses that could possibly change map_count |
1630 | * or the mmap / vma pages are now blocked in do_exit on current | 1617 | * or the mmap / vma pages are now blocked in do_exit on current |
1631 | * finishing this core dump. | 1618 | * finishing this core dump. |
1632 | * | 1619 | * |
1633 | * Only ptrace can touch these memory addresses, but it doesn't change | 1620 | * Only ptrace can touch these memory addresses, but it doesn't change |
1634 | * the map_count or the pages allocated. So no possibility of crashing | 1621 | * the map_count or the pages allocated. So no possibility of crashing |
1635 | * exists while dumping the mm->vm_next areas to the core file. | 1622 | * exists while dumping the mm->vm_next areas to the core file. |
1636 | */ | 1623 | */ |
1637 | 1624 | ||
1638 | /* alloc memory for large data structures: too large to be on stack */ | 1625 | /* alloc memory for large data structures: too large to be on stack */ |
1639 | elf = kmalloc(sizeof(*elf), GFP_KERNEL); | 1626 | elf = kmalloc(sizeof(*elf), GFP_KERNEL); |
1640 | if (!elf) | 1627 | if (!elf) |
1641 | goto cleanup; | 1628 | goto cleanup; |
1642 | prstatus = kzalloc(sizeof(*prstatus), GFP_KERNEL); | 1629 | prstatus = kzalloc(sizeof(*prstatus), GFP_KERNEL); |
1643 | if (!prstatus) | 1630 | if (!prstatus) |
1644 | goto cleanup; | 1631 | goto cleanup; |
1645 | psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL); | 1632 | psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL); |
1646 | if (!psinfo) | 1633 | if (!psinfo) |
1647 | goto cleanup; | 1634 | goto cleanup; |
1648 | notes = kmalloc(NUM_NOTES * sizeof(struct memelfnote), GFP_KERNEL); | 1635 | notes = kmalloc(NUM_NOTES * sizeof(struct memelfnote), GFP_KERNEL); |
1649 | if (!notes) | 1636 | if (!notes) |
1650 | goto cleanup; | 1637 | goto cleanup; |
1651 | fpu = kmalloc(sizeof(*fpu), GFP_KERNEL); | 1638 | fpu = kmalloc(sizeof(*fpu), GFP_KERNEL); |
1652 | if (!fpu) | 1639 | if (!fpu) |
1653 | goto cleanup; | 1640 | goto cleanup; |
1654 | #ifdef ELF_CORE_COPY_XFPREGS | 1641 | #ifdef ELF_CORE_COPY_XFPREGS |
1655 | xfpu = kmalloc(sizeof(*xfpu), GFP_KERNEL); | 1642 | xfpu = kmalloc(sizeof(*xfpu), GFP_KERNEL); |
1656 | if (!xfpu) | 1643 | if (!xfpu) |
1657 | goto cleanup; | 1644 | goto cleanup; |
1658 | #endif | 1645 | #endif |
1659 | 1646 | ||
1660 | if (signr) { | 1647 | if (signr) { |
1661 | struct core_thread *ct; | 1648 | struct core_thread *ct; |
1662 | struct elf_thread_status *tmp; | 1649 | struct elf_thread_status *tmp; |
1663 | 1650 | ||
1664 | for (ct = current->mm->core_state->dumper.next; | 1651 | for (ct = current->mm->core_state->dumper.next; |
1665 | ct; ct = ct->next) { | 1652 | ct; ct = ct->next) { |
1666 | tmp = kzalloc(sizeof(*tmp), GFP_KERNEL); | 1653 | tmp = kzalloc(sizeof(*tmp), GFP_KERNEL); |
1667 | if (!tmp) | 1654 | if (!tmp) |
1668 | goto cleanup; | 1655 | goto cleanup; |
1669 | 1656 | ||
1670 | tmp->thread = ct->task; | 1657 | tmp->thread = ct->task; |
1671 | list_add(&tmp->list, &thread_list); | 1658 | list_add(&tmp->list, &thread_list); |
1672 | } | 1659 | } |
1673 | 1660 | ||
1674 | list_for_each(t, &thread_list) { | 1661 | list_for_each(t, &thread_list) { |
1675 | struct elf_thread_status *tmp; | 1662 | struct elf_thread_status *tmp; |
1676 | int sz; | 1663 | int sz; |
1677 | 1664 | ||
1678 | tmp = list_entry(t, struct elf_thread_status, list); | 1665 | tmp = list_entry(t, struct elf_thread_status, list); |
1679 | sz = elf_dump_thread_status(signr, tmp); | 1666 | sz = elf_dump_thread_status(signr, tmp); |
1680 | thread_status_size += sz; | 1667 | thread_status_size += sz; |
1681 | } | 1668 | } |
1682 | } | 1669 | } |
1683 | 1670 | ||
1684 | /* now collect the dump for the current */ | 1671 | /* now collect the dump for the current */ |
1685 | fill_prstatus(prstatus, current, signr); | 1672 | fill_prstatus(prstatus, current, signr); |
1686 | elf_core_copy_regs(&prstatus->pr_reg, regs); | 1673 | elf_core_copy_regs(&prstatus->pr_reg, regs); |
1687 | 1674 | ||
1688 | #ifdef CONFIG_MMU | ||
1689 | segs = current->mm->map_count; | 1675 | segs = current->mm->map_count; |
1690 | #else | ||
1691 | segs = 0; | ||
1692 | for (vml = current->mm->context.vmlist; vml; vml = vml->next) | ||
1693 | segs++; | ||
1694 | #endif | ||
1695 | #ifdef ELF_CORE_EXTRA_PHDRS | 1676 | #ifdef ELF_CORE_EXTRA_PHDRS |
1696 | segs += ELF_CORE_EXTRA_PHDRS; | 1677 | segs += ELF_CORE_EXTRA_PHDRS; |
1697 | #endif | 1678 | #endif |
1698 | 1679 | ||
1699 | /* Set up header */ | 1680 | /* Set up header */ |
1700 | fill_elf_fdpic_header(elf, segs + 1); /* including notes section */ | 1681 | fill_elf_fdpic_header(elf, segs + 1); /* including notes section */ |
1701 | 1682 | ||
1702 | has_dumped = 1; | 1683 | has_dumped = 1; |
1703 | current->flags |= PF_DUMPCORE; | 1684 | current->flags |= PF_DUMPCORE; |
1704 | 1685 | ||
1705 | /* | 1686 | /* |
1706 | * Set up the notes in similar form to SVR4 core dumps made | 1687 | * Set up the notes in similar form to SVR4 core dumps made |
1707 | * with info from their /proc. | 1688 | * with info from their /proc. |
1708 | */ | 1689 | */ |
1709 | 1690 | ||
1710 | fill_note(notes + 0, "CORE", NT_PRSTATUS, sizeof(*prstatus), prstatus); | 1691 | fill_note(notes + 0, "CORE", NT_PRSTATUS, sizeof(*prstatus), prstatus); |
1711 | fill_psinfo(psinfo, current->group_leader, current->mm); | 1692 | fill_psinfo(psinfo, current->group_leader, current->mm); |
1712 | fill_note(notes + 1, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo); | 1693 | fill_note(notes + 1, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo); |
1713 | 1694 | ||
1714 | numnote = 2; | 1695 | numnote = 2; |
1715 | 1696 | ||
1716 | auxv = (elf_addr_t *) current->mm->saved_auxv; | 1697 | auxv = (elf_addr_t *) current->mm->saved_auxv; |
1717 | 1698 | ||
1718 | i = 0; | 1699 | i = 0; |
1719 | do | 1700 | do |
1720 | i += 2; | 1701 | i += 2; |
1721 | while (auxv[i - 2] != AT_NULL); | 1702 | while (auxv[i - 2] != AT_NULL); |
1722 | fill_note(¬es[numnote++], "CORE", NT_AUXV, | 1703 | fill_note(¬es[numnote++], "CORE", NT_AUXV, |
1723 | i * sizeof(elf_addr_t), auxv); | 1704 | i * sizeof(elf_addr_t), auxv); |
1724 | 1705 | ||
1725 | /* Try to dump the FPU. */ | 1706 | /* Try to dump the FPU. */ |
1726 | if ((prstatus->pr_fpvalid = | 1707 | if ((prstatus->pr_fpvalid = |
1727 | elf_core_copy_task_fpregs(current, regs, fpu))) | 1708 | elf_core_copy_task_fpregs(current, regs, fpu))) |
1728 | fill_note(notes + numnote++, | 1709 | fill_note(notes + numnote++, |
1729 | "CORE", NT_PRFPREG, sizeof(*fpu), fpu); | 1710 | "CORE", NT_PRFPREG, sizeof(*fpu), fpu); |
1730 | #ifdef ELF_CORE_COPY_XFPREGS | 1711 | #ifdef ELF_CORE_COPY_XFPREGS |
1731 | if (elf_core_copy_task_xfpregs(current, xfpu)) | 1712 | if (elf_core_copy_task_xfpregs(current, xfpu)) |
1732 | fill_note(notes + numnote++, | 1713 | fill_note(notes + numnote++, |
1733 | "LINUX", ELF_CORE_XFPREG_TYPE, sizeof(*xfpu), xfpu); | 1714 | "LINUX", ELF_CORE_XFPREG_TYPE, sizeof(*xfpu), xfpu); |
1734 | #endif | 1715 | #endif |
1735 | 1716 | ||
1736 | fs = get_fs(); | 1717 | fs = get_fs(); |
1737 | set_fs(KERNEL_DS); | 1718 | set_fs(KERNEL_DS); |
1738 | 1719 | ||
1739 | DUMP_WRITE(elf, sizeof(*elf)); | 1720 | DUMP_WRITE(elf, sizeof(*elf)); |
1740 | offset += sizeof(*elf); /* Elf header */ | 1721 | offset += sizeof(*elf); /* Elf header */ |
1741 | offset += (segs+1) * sizeof(struct elf_phdr); /* Program headers */ | 1722 | offset += (segs+1) * sizeof(struct elf_phdr); /* Program headers */ |
1742 | 1723 | ||
1743 | /* Write notes phdr entry */ | 1724 | /* Write notes phdr entry */ |
1744 | { | 1725 | { |
1745 | struct elf_phdr phdr; | 1726 | struct elf_phdr phdr; |
1746 | int sz = 0; | 1727 | int sz = 0; |
1747 | 1728 | ||
1748 | for (i = 0; i < numnote; i++) | 1729 | for (i = 0; i < numnote; i++) |
1749 | sz += notesize(notes + i); | 1730 | sz += notesize(notes + i); |
1750 | 1731 | ||
1751 | sz += thread_status_size; | 1732 | sz += thread_status_size; |
1752 | 1733 | ||
1753 | fill_elf_note_phdr(&phdr, sz, offset); | 1734 | fill_elf_note_phdr(&phdr, sz, offset); |
1754 | offset += sz; | 1735 | offset += sz; |
1755 | DUMP_WRITE(&phdr, sizeof(phdr)); | 1736 | DUMP_WRITE(&phdr, sizeof(phdr)); |
1756 | } | 1737 | } |
1757 | 1738 | ||
1758 | /* Page-align dumped data */ | 1739 | /* Page-align dumped data */ |
1759 | dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); | 1740 | dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); |
1760 | 1741 | ||
1761 | /* | 1742 | /* |
1762 | * We must use the same mm->flags while dumping core to avoid | 1743 | * We must use the same mm->flags while dumping core to avoid |
1763 | * inconsistency between the program headers and bodies, otherwise an | 1744 | * inconsistency between the program headers and bodies, otherwise an |
1764 | * unusable core file can be generated. | 1745 | * unusable core file can be generated. |
1765 | */ | 1746 | */ |
1766 | mm_flags = current->mm->flags; | 1747 | mm_flags = current->mm->flags; |
1767 | 1748 | ||
1768 | /* write program headers for segments dump */ | 1749 | /* write program headers for segments dump */ |
1769 | for ( | 1750 | for (vma = current->mm->mmap; vma; vma = vma->vm_next) { |
1770 | #ifdef CONFIG_MMU | ||
1771 | vma = current->mm->mmap; vma; vma = vma->vm_next | ||
1772 | #else | ||
1773 | vml = current->mm->context.vmlist; vml; vml = vml->next | ||
1774 | #endif | ||
1775 | ) { | ||
1776 | struct elf_phdr phdr; | 1751 | struct elf_phdr phdr; |
1777 | size_t sz; | 1752 | size_t sz; |
1778 | |||
1779 | #ifndef CONFIG_MMU | ||
1780 | vma = vml->vma; | ||
1781 | #endif | ||
1782 | 1753 | ||
1783 | sz = vma->vm_end - vma->vm_start; | 1754 | sz = vma->vm_end - vma->vm_start; |
1784 | 1755 | ||
1785 | phdr.p_type = PT_LOAD; | 1756 | phdr.p_type = PT_LOAD; |
1786 | phdr.p_offset = offset; | 1757 | phdr.p_offset = offset; |
1787 | phdr.p_vaddr = vma->vm_start; | 1758 | phdr.p_vaddr = vma->vm_start; |
1788 | phdr.p_paddr = 0; | 1759 | phdr.p_paddr = 0; |
1789 | phdr.p_filesz = maydump(vma, mm_flags) ? sz : 0; | 1760 | phdr.p_filesz = maydump(vma, mm_flags) ? sz : 0; |
1790 | phdr.p_memsz = sz; | 1761 | phdr.p_memsz = sz; |
1791 | offset += phdr.p_filesz; | 1762 | offset += phdr.p_filesz; |
1792 | phdr.p_flags = vma->vm_flags & VM_READ ? PF_R : 0; | 1763 | phdr.p_flags = vma->vm_flags & VM_READ ? PF_R : 0; |
1793 | if (vma->vm_flags & VM_WRITE) | 1764 | if (vma->vm_flags & VM_WRITE) |
1794 | phdr.p_flags |= PF_W; | 1765 | phdr.p_flags |= PF_W; |
1795 | if (vma->vm_flags & VM_EXEC) | 1766 | if (vma->vm_flags & VM_EXEC) |
1796 | phdr.p_flags |= PF_X; | 1767 | phdr.p_flags |= PF_X; |
1797 | phdr.p_align = ELF_EXEC_PAGESIZE; | 1768 | phdr.p_align = ELF_EXEC_PAGESIZE; |
1798 | 1769 | ||
1799 | DUMP_WRITE(&phdr, sizeof(phdr)); | 1770 | DUMP_WRITE(&phdr, sizeof(phdr)); |
1800 | } | 1771 | } |
1801 | 1772 | ||
1802 | #ifdef ELF_CORE_WRITE_EXTRA_PHDRS | 1773 | #ifdef ELF_CORE_WRITE_EXTRA_PHDRS |
1803 | ELF_CORE_WRITE_EXTRA_PHDRS; | 1774 | ELF_CORE_WRITE_EXTRA_PHDRS; |
1804 | #endif | 1775 | #endif |
1805 | 1776 | ||
1806 | /* write out the notes section */ | 1777 | /* write out the notes section */ |
1807 | for (i = 0; i < numnote; i++) | 1778 | for (i = 0; i < numnote; i++) |
1808 | if (!writenote(notes + i, file)) | 1779 | if (!writenote(notes + i, file)) |
1809 | goto end_coredump; | 1780 | goto end_coredump; |
1810 | 1781 | ||
1811 | /* write out the thread status notes section */ | 1782 | /* write out the thread status notes section */ |
1812 | list_for_each(t, &thread_list) { | 1783 | list_for_each(t, &thread_list) { |
1813 | struct elf_thread_status *tmp = | 1784 | struct elf_thread_status *tmp = |
1814 | list_entry(t, struct elf_thread_status, list); | 1785 | list_entry(t, struct elf_thread_status, list); |
1815 | 1786 | ||
1816 | for (i = 0; i < tmp->num_notes; i++) | 1787 | for (i = 0; i < tmp->num_notes; i++) |
1817 | if (!writenote(&tmp->notes[i], file)) | 1788 | if (!writenote(&tmp->notes[i], file)) |
1818 | goto end_coredump; | 1789 | goto end_coredump; |
1819 | } | 1790 | } |
1820 | 1791 | ||
1821 | DUMP_SEEK(dataoff); | 1792 | DUMP_SEEK(dataoff); |
1822 | 1793 | ||
1823 | if (elf_fdpic_dump_segments(file, &size, &limit, mm_flags) < 0) | 1794 | if (elf_fdpic_dump_segments(file, &size, &limit, mm_flags) < 0) |
1824 | goto end_coredump; | 1795 | goto end_coredump; |
1825 | 1796 | ||
1826 | #ifdef ELF_CORE_WRITE_EXTRA_DATA | 1797 | #ifdef ELF_CORE_WRITE_EXTRA_DATA |
1827 | ELF_CORE_WRITE_EXTRA_DATA; | 1798 | ELF_CORE_WRITE_EXTRA_DATA; |
1828 | #endif | 1799 | #endif |
1829 | 1800 | ||
1830 | if (file->f_pos != offset) { | 1801 | if (file->f_pos != offset) { |
1831 | /* Sanity check */ | 1802 | /* Sanity check */ |
1832 | printk(KERN_WARNING | 1803 | printk(KERN_WARNING |
1833 | "elf_core_dump: file->f_pos (%lld) != offset (%lld)\n", | 1804 | "elf_core_dump: file->f_pos (%lld) != offset (%lld)\n", |
1834 | file->f_pos, offset); | 1805 | file->f_pos, offset); |
1835 | } | 1806 | } |
1836 | 1807 | ||
1837 | end_coredump: | 1808 | end_coredump: |
1838 | set_fs(fs); | 1809 | set_fs(fs); |
1839 | 1810 | ||
1840 | cleanup: | 1811 | cleanup: |
1841 | while (!list_empty(&thread_list)) { | 1812 | while (!list_empty(&thread_list)) { |
1842 | struct list_head *tmp = thread_list.next; | 1813 | struct list_head *tmp = thread_list.next; |
1843 | list_del(tmp); | 1814 | list_del(tmp); |
1844 | kfree(list_entry(tmp, struct elf_thread_status, list)); | 1815 | kfree(list_entry(tmp, struct elf_thread_status, list)); |
1845 | } | 1816 | } |
1846 | 1817 | ||
1847 | kfree(elf); | 1818 | kfree(elf); |
1848 | kfree(prstatus); | 1819 | kfree(prstatus); |
1849 | kfree(psinfo); | 1820 | kfree(psinfo); |
1850 | kfree(notes); | 1821 | kfree(notes); |
1851 | kfree(fpu); | 1822 | kfree(fpu); |
1852 | #ifdef ELF_CORE_COPY_XFPREGS | 1823 | #ifdef ELF_CORE_COPY_XFPREGS |
1853 | kfree(xfpu); | 1824 | kfree(xfpu); |
1854 | #endif | 1825 | #endif |
1855 | return has_dumped; | 1826 | return has_dumped; |
1856 | #undef NUM_NOTES | 1827 | #undef NUM_NOTES |
1857 | } | 1828 | } |
1858 | 1829 | ||
1859 | #endif /* USE_ELF_CORE_DUMP */ | 1830 | #endif /* USE_ELF_CORE_DUMP */ |
1860 | 1831 |
fs/binfmt_flat.c
1 | /****************************************************************************/ | 1 | /****************************************************************************/ |
2 | /* | 2 | /* |
3 | * linux/fs/binfmt_flat.c | 3 | * linux/fs/binfmt_flat.c |
4 | * | 4 | * |
5 | * Copyright (C) 2000-2003 David McCullough <davidm@snapgear.com> | 5 | * Copyright (C) 2000-2003 David McCullough <davidm@snapgear.com> |
6 | * Copyright (C) 2002 Greg Ungerer <gerg@snapgear.com> | 6 | * Copyright (C) 2002 Greg Ungerer <gerg@snapgear.com> |
7 | * Copyright (C) 2002 SnapGear, by Paul Dale <pauli@snapgear.com> | 7 | * Copyright (C) 2002 SnapGear, by Paul Dale <pauli@snapgear.com> |
8 | * Copyright (C) 2000, 2001 Lineo, by David McCullough <davidm@lineo.com> | 8 | * Copyright (C) 2000, 2001 Lineo, by David McCullough <davidm@lineo.com> |
9 | * based heavily on: | 9 | * based heavily on: |
10 | * | 10 | * |
11 | * linux/fs/binfmt_aout.c: | 11 | * linux/fs/binfmt_aout.c: |
12 | * Copyright (C) 1991, 1992, 1996 Linus Torvalds | 12 | * Copyright (C) 1991, 1992, 1996 Linus Torvalds |
13 | * linux/fs/binfmt_flat.c for 2.0 kernel | 13 | * linux/fs/binfmt_flat.c for 2.0 kernel |
14 | * Copyright (C) 1998 Kenneth Albanowski <kjahds@kjahds.com> | 14 | * Copyright (C) 1998 Kenneth Albanowski <kjahds@kjahds.com> |
15 | * JAN/99 -- coded full program relocation (gerg@snapgear.com) | 15 | * JAN/99 -- coded full program relocation (gerg@snapgear.com) |
16 | */ | 16 | */ |
17 | 17 | ||
18 | #include <linux/module.h> | 18 | #include <linux/module.h> |
19 | #include <linux/kernel.h> | 19 | #include <linux/kernel.h> |
20 | #include <linux/sched.h> | 20 | #include <linux/sched.h> |
21 | #include <linux/mm.h> | 21 | #include <linux/mm.h> |
22 | #include <linux/mman.h> | 22 | #include <linux/mman.h> |
23 | #include <linux/errno.h> | 23 | #include <linux/errno.h> |
24 | #include <linux/signal.h> | 24 | #include <linux/signal.h> |
25 | #include <linux/string.h> | 25 | #include <linux/string.h> |
26 | #include <linux/fs.h> | 26 | #include <linux/fs.h> |
27 | #include <linux/file.h> | 27 | #include <linux/file.h> |
28 | #include <linux/stat.h> | 28 | #include <linux/stat.h> |
29 | #include <linux/fcntl.h> | 29 | #include <linux/fcntl.h> |
30 | #include <linux/ptrace.h> | 30 | #include <linux/ptrace.h> |
31 | #include <linux/user.h> | 31 | #include <linux/user.h> |
32 | #include <linux/slab.h> | 32 | #include <linux/slab.h> |
33 | #include <linux/binfmts.h> | 33 | #include <linux/binfmts.h> |
34 | #include <linux/personality.h> | 34 | #include <linux/personality.h> |
35 | #include <linux/init.h> | 35 | #include <linux/init.h> |
36 | #include <linux/flat.h> | 36 | #include <linux/flat.h> |
37 | #include <linux/syscalls.h> | 37 | #include <linux/syscalls.h> |
38 | 38 | ||
39 | #include <asm/byteorder.h> | 39 | #include <asm/byteorder.h> |
40 | #include <asm/system.h> | 40 | #include <asm/system.h> |
41 | #include <asm/uaccess.h> | 41 | #include <asm/uaccess.h> |
42 | #include <asm/unaligned.h> | 42 | #include <asm/unaligned.h> |
43 | #include <asm/cacheflush.h> | 43 | #include <asm/cacheflush.h> |
44 | 44 | ||
45 | /****************************************************************************/ | 45 | /****************************************************************************/ |
46 | 46 | ||
47 | #if 0 | 47 | #if 0 |
48 | #define DEBUG 1 | 48 | #define DEBUG 1 |
49 | #endif | 49 | #endif |
50 | 50 | ||
51 | #ifdef DEBUG | 51 | #ifdef DEBUG |
52 | #define DBG_FLT(a...) printk(a) | 52 | #define DBG_FLT(a...) printk(a) |
53 | #else | 53 | #else |
54 | #define DBG_FLT(a...) | 54 | #define DBG_FLT(a...) |
55 | #endif | 55 | #endif |
56 | 56 | ||
57 | #define RELOC_FAILED 0xff00ff01 /* Relocation incorrect somewhere */ | 57 | #define RELOC_FAILED 0xff00ff01 /* Relocation incorrect somewhere */ |
58 | #define UNLOADED_LIB 0x7ff000ff /* Placeholder for unused library */ | 58 | #define UNLOADED_LIB 0x7ff000ff /* Placeholder for unused library */ |
59 | 59 | ||
60 | struct lib_info { | 60 | struct lib_info { |
61 | struct { | 61 | struct { |
62 | unsigned long start_code; /* Start of text segment */ | 62 | unsigned long start_code; /* Start of text segment */ |
63 | unsigned long start_data; /* Start of data segment */ | 63 | unsigned long start_data; /* Start of data segment */ |
64 | unsigned long start_brk; /* End of data segment */ | 64 | unsigned long start_brk; /* End of data segment */ |
65 | unsigned long text_len; /* Length of text segment */ | 65 | unsigned long text_len; /* Length of text segment */ |
66 | unsigned long entry; /* Start address for this module */ | 66 | unsigned long entry; /* Start address for this module */ |
67 | unsigned long build_date; /* When this one was compiled */ | 67 | unsigned long build_date; /* When this one was compiled */ |
68 | short loaded; /* Has this library been loaded? */ | 68 | short loaded; /* Has this library been loaded? */ |
69 | } lib_list[MAX_SHARED_LIBS]; | 69 | } lib_list[MAX_SHARED_LIBS]; |
70 | }; | 70 | }; |
71 | 71 | ||
72 | #ifdef CONFIG_BINFMT_SHARED_FLAT | 72 | #ifdef CONFIG_BINFMT_SHARED_FLAT |
73 | static int load_flat_shared_library(int id, struct lib_info *p); | 73 | static int load_flat_shared_library(int id, struct lib_info *p); |
74 | #endif | 74 | #endif |
75 | 75 | ||
76 | static int load_flat_binary(struct linux_binprm *, struct pt_regs * regs); | 76 | static int load_flat_binary(struct linux_binprm *, struct pt_regs * regs); |
77 | static int flat_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit); | 77 | static int flat_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit); |
78 | 78 | ||
79 | static struct linux_binfmt flat_format = { | 79 | static struct linux_binfmt flat_format = { |
80 | .module = THIS_MODULE, | 80 | .module = THIS_MODULE, |
81 | .load_binary = load_flat_binary, | 81 | .load_binary = load_flat_binary, |
82 | .core_dump = flat_core_dump, | 82 | .core_dump = flat_core_dump, |
83 | .min_coredump = PAGE_SIZE | 83 | .min_coredump = PAGE_SIZE |
84 | }; | 84 | }; |
85 | 85 | ||
86 | /****************************************************************************/ | 86 | /****************************************************************************/ |
87 | /* | 87 | /* |
88 | * Routine writes a core dump image in the current directory. | 88 | * Routine writes a core dump image in the current directory. |
89 | * Currently only a stub-function. | 89 | * Currently only a stub-function. |
90 | */ | 90 | */ |
91 | 91 | ||
92 | static int flat_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit) | 92 | static int flat_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit) |
93 | { | 93 | { |
94 | printk("Process %s:%d received signr %d and should have core dumped\n", | 94 | printk("Process %s:%d received signr %d and should have core dumped\n", |
95 | current->comm, current->pid, (int) signr); | 95 | current->comm, current->pid, (int) signr); |
96 | return(1); | 96 | return(1); |
97 | } | 97 | } |
98 | 98 | ||
99 | /****************************************************************************/ | 99 | /****************************************************************************/ |
100 | /* | 100 | /* |
101 | * create_flat_tables() parses the env- and arg-strings in new user | 101 | * create_flat_tables() parses the env- and arg-strings in new user |
102 | * memory and creates the pointer tables from them, and puts their | 102 | * memory and creates the pointer tables from them, and puts their |
103 | * addresses on the "stack", returning the new stack pointer value. | 103 | * addresses on the "stack", returning the new stack pointer value. |
104 | */ | 104 | */ |
105 | 105 | ||
106 | static unsigned long create_flat_tables( | 106 | static unsigned long create_flat_tables( |
107 | unsigned long pp, | 107 | unsigned long pp, |
108 | struct linux_binprm * bprm) | 108 | struct linux_binprm * bprm) |
109 | { | 109 | { |
110 | unsigned long *argv,*envp; | 110 | unsigned long *argv,*envp; |
111 | unsigned long * sp; | 111 | unsigned long * sp; |
112 | char * p = (char*)pp; | 112 | char * p = (char*)pp; |
113 | int argc = bprm->argc; | 113 | int argc = bprm->argc; |
114 | int envc = bprm->envc; | 114 | int envc = bprm->envc; |
115 | char uninitialized_var(dummy); | 115 | char uninitialized_var(dummy); |
116 | 116 | ||
117 | sp = (unsigned long *) ((-(unsigned long)sizeof(char *))&(unsigned long) p); | 117 | sp = (unsigned long *) ((-(unsigned long)sizeof(char *))&(unsigned long) p); |
118 | 118 | ||
119 | sp -= envc+1; | 119 | sp -= envc+1; |
120 | envp = sp; | 120 | envp = sp; |
121 | sp -= argc+1; | 121 | sp -= argc+1; |
122 | argv = sp; | 122 | argv = sp; |
123 | 123 | ||
124 | flat_stack_align(sp); | 124 | flat_stack_align(sp); |
125 | if (flat_argvp_envp_on_stack()) { | 125 | if (flat_argvp_envp_on_stack()) { |
126 | --sp; put_user((unsigned long) envp, sp); | 126 | --sp; put_user((unsigned long) envp, sp); |
127 | --sp; put_user((unsigned long) argv, sp); | 127 | --sp; put_user((unsigned long) argv, sp); |
128 | } | 128 | } |
129 | 129 | ||
130 | put_user(argc,--sp); | 130 | put_user(argc,--sp); |
131 | current->mm->arg_start = (unsigned long) p; | 131 | current->mm->arg_start = (unsigned long) p; |
132 | while (argc-->0) { | 132 | while (argc-->0) { |
133 | put_user((unsigned long) p, argv++); | 133 | put_user((unsigned long) p, argv++); |
134 | do { | 134 | do { |
135 | get_user(dummy, p); p++; | 135 | get_user(dummy, p); p++; |
136 | } while (dummy); | 136 | } while (dummy); |
137 | } | 137 | } |
138 | put_user((unsigned long) NULL, argv); | 138 | put_user((unsigned long) NULL, argv); |
139 | current->mm->arg_end = current->mm->env_start = (unsigned long) p; | 139 | current->mm->arg_end = current->mm->env_start = (unsigned long) p; |
140 | while (envc-->0) { | 140 | while (envc-->0) { |
141 | put_user((unsigned long)p, envp); envp++; | 141 | put_user((unsigned long)p, envp); envp++; |
142 | do { | 142 | do { |
143 | get_user(dummy, p); p++; | 143 | get_user(dummy, p); p++; |
144 | } while (dummy); | 144 | } while (dummy); |
145 | } | 145 | } |
146 | put_user((unsigned long) NULL, envp); | 146 | put_user((unsigned long) NULL, envp); |
147 | current->mm->env_end = (unsigned long) p; | 147 | current->mm->env_end = (unsigned long) p; |
148 | return (unsigned long)sp; | 148 | return (unsigned long)sp; |
149 | } | 149 | } |
150 | 150 | ||
151 | /****************************************************************************/ | 151 | /****************************************************************************/ |
152 | 152 | ||
153 | #ifdef CONFIG_BINFMT_ZFLAT | 153 | #ifdef CONFIG_BINFMT_ZFLAT |
154 | 154 | ||
155 | #include <linux/zlib.h> | 155 | #include <linux/zlib.h> |
156 | 156 | ||
157 | #define LBUFSIZE 4000 | 157 | #define LBUFSIZE 4000 |
158 | 158 | ||
159 | /* gzip flag byte */ | 159 | /* gzip flag byte */ |
160 | #define ASCII_FLAG 0x01 /* bit 0 set: file probably ASCII text */ | 160 | #define ASCII_FLAG 0x01 /* bit 0 set: file probably ASCII text */ |
161 | #define CONTINUATION 0x02 /* bit 1 set: continuation of multi-part gzip file */ | 161 | #define CONTINUATION 0x02 /* bit 1 set: continuation of multi-part gzip file */ |
162 | #define EXTRA_FIELD 0x04 /* bit 2 set: extra field present */ | 162 | #define EXTRA_FIELD 0x04 /* bit 2 set: extra field present */ |
163 | #define ORIG_NAME 0x08 /* bit 3 set: original file name present */ | 163 | #define ORIG_NAME 0x08 /* bit 3 set: original file name present */ |
164 | #define COMMENT 0x10 /* bit 4 set: file comment present */ | 164 | #define COMMENT 0x10 /* bit 4 set: file comment present */ |
165 | #define ENCRYPTED 0x20 /* bit 5 set: file is encrypted */ | 165 | #define ENCRYPTED 0x20 /* bit 5 set: file is encrypted */ |
166 | #define RESERVED 0xC0 /* bit 6,7: reserved */ | 166 | #define RESERVED 0xC0 /* bit 6,7: reserved */ |
167 | 167 | ||
168 | static int decompress_exec( | 168 | static int decompress_exec( |
169 | struct linux_binprm *bprm, | 169 | struct linux_binprm *bprm, |
170 | unsigned long offset, | 170 | unsigned long offset, |
171 | char *dst, | 171 | char *dst, |
172 | long len, | 172 | long len, |
173 | int fd) | 173 | int fd) |
174 | { | 174 | { |
175 | unsigned char *buf; | 175 | unsigned char *buf; |
176 | z_stream strm; | 176 | z_stream strm; |
177 | loff_t fpos; | 177 | loff_t fpos; |
178 | int ret, retval; | 178 | int ret, retval; |
179 | 179 | ||
180 | DBG_FLT("decompress_exec(offset=%x,buf=%x,len=%x)\n",(int)offset, (int)dst, (int)len); | 180 | DBG_FLT("decompress_exec(offset=%x,buf=%x,len=%x)\n",(int)offset, (int)dst, (int)len); |
181 | 181 | ||
182 | memset(&strm, 0, sizeof(strm)); | 182 | memset(&strm, 0, sizeof(strm)); |
183 | strm.workspace = kmalloc(zlib_inflate_workspacesize(), GFP_KERNEL); | 183 | strm.workspace = kmalloc(zlib_inflate_workspacesize(), GFP_KERNEL); |
184 | if (strm.workspace == NULL) { | 184 | if (strm.workspace == NULL) { |
185 | DBG_FLT("binfmt_flat: no memory for decompress workspace\n"); | 185 | DBG_FLT("binfmt_flat: no memory for decompress workspace\n"); |
186 | return -ENOMEM; | 186 | return -ENOMEM; |
187 | } | 187 | } |
188 | buf = kmalloc(LBUFSIZE, GFP_KERNEL); | 188 | buf = kmalloc(LBUFSIZE, GFP_KERNEL); |
189 | if (buf == NULL) { | 189 | if (buf == NULL) { |
190 | DBG_FLT("binfmt_flat: no memory for read buffer\n"); | 190 | DBG_FLT("binfmt_flat: no memory for read buffer\n"); |
191 | retval = -ENOMEM; | 191 | retval = -ENOMEM; |
192 | goto out_free; | 192 | goto out_free; |
193 | } | 193 | } |
194 | 194 | ||
195 | /* Read in first chunk of data and parse gzip header. */ | 195 | /* Read in first chunk of data and parse gzip header. */ |
196 | fpos = offset; | 196 | fpos = offset; |
197 | ret = bprm->file->f_op->read(bprm->file, buf, LBUFSIZE, &fpos); | 197 | ret = bprm->file->f_op->read(bprm->file, buf, LBUFSIZE, &fpos); |
198 | 198 | ||
199 | strm.next_in = buf; | 199 | strm.next_in = buf; |
200 | strm.avail_in = ret; | 200 | strm.avail_in = ret; |
201 | strm.total_in = 0; | 201 | strm.total_in = 0; |
202 | 202 | ||
203 | retval = -ENOEXEC; | 203 | retval = -ENOEXEC; |
204 | 204 | ||
205 | /* Check minimum size -- gzip header */ | 205 | /* Check minimum size -- gzip header */ |
206 | if (ret < 10) { | 206 | if (ret < 10) { |
207 | DBG_FLT("binfmt_flat: file too small?\n"); | 207 | DBG_FLT("binfmt_flat: file too small?\n"); |
208 | goto out_free_buf; | 208 | goto out_free_buf; |
209 | } | 209 | } |
210 | 210 | ||
211 | /* Check gzip magic number */ | 211 | /* Check gzip magic number */ |
212 | if ((buf[0] != 037) || ((buf[1] != 0213) && (buf[1] != 0236))) { | 212 | if ((buf[0] != 037) || ((buf[1] != 0213) && (buf[1] != 0236))) { |
213 | DBG_FLT("binfmt_flat: unknown compression magic?\n"); | 213 | DBG_FLT("binfmt_flat: unknown compression magic?\n"); |
214 | goto out_free_buf; | 214 | goto out_free_buf; |
215 | } | 215 | } |
216 | 216 | ||
217 | /* Check gzip method */ | 217 | /* Check gzip method */ |
218 | if (buf[2] != 8) { | 218 | if (buf[2] != 8) { |
219 | DBG_FLT("binfmt_flat: unknown compression method?\n"); | 219 | DBG_FLT("binfmt_flat: unknown compression method?\n"); |
220 | goto out_free_buf; | 220 | goto out_free_buf; |
221 | } | 221 | } |
222 | /* Check gzip flags */ | 222 | /* Check gzip flags */ |
223 | if ((buf[3] & ENCRYPTED) || (buf[3] & CONTINUATION) || | 223 | if ((buf[3] & ENCRYPTED) || (buf[3] & CONTINUATION) || |
224 | (buf[3] & RESERVED)) { | 224 | (buf[3] & RESERVED)) { |
225 | DBG_FLT("binfmt_flat: unknown flags?\n"); | 225 | DBG_FLT("binfmt_flat: unknown flags?\n"); |
226 | goto out_free_buf; | 226 | goto out_free_buf; |
227 | } | 227 | } |
228 | 228 | ||
229 | ret = 10; | 229 | ret = 10; |
230 | if (buf[3] & EXTRA_FIELD) { | 230 | if (buf[3] & EXTRA_FIELD) { |
231 | ret += 2 + buf[10] + (buf[11] << 8); | 231 | ret += 2 + buf[10] + (buf[11] << 8); |
232 | if (unlikely(LBUFSIZE <= ret)) { | 232 | if (unlikely(LBUFSIZE <= ret)) { |
233 | DBG_FLT("binfmt_flat: buffer overflow (EXTRA)?\n"); | 233 | DBG_FLT("binfmt_flat: buffer overflow (EXTRA)?\n"); |
234 | goto out_free_buf; | 234 | goto out_free_buf; |
235 | } | 235 | } |
236 | } | 236 | } |
237 | if (buf[3] & ORIG_NAME) { | 237 | if (buf[3] & ORIG_NAME) { |
238 | while (ret < LBUFSIZE && buf[ret++] != 0) | 238 | while (ret < LBUFSIZE && buf[ret++] != 0) |
239 | ; | 239 | ; |
240 | if (unlikely(LBUFSIZE == ret)) { | 240 | if (unlikely(LBUFSIZE == ret)) { |
241 | DBG_FLT("binfmt_flat: buffer overflow (ORIG_NAME)?\n"); | 241 | DBG_FLT("binfmt_flat: buffer overflow (ORIG_NAME)?\n"); |
242 | goto out_free_buf; | 242 | goto out_free_buf; |
243 | } | 243 | } |
244 | } | 244 | } |
245 | if (buf[3] & COMMENT) { | 245 | if (buf[3] & COMMENT) { |
246 | while (ret < LBUFSIZE && buf[ret++] != 0) | 246 | while (ret < LBUFSIZE && buf[ret++] != 0) |
247 | ; | 247 | ; |
248 | if (unlikely(LBUFSIZE == ret)) { | 248 | if (unlikely(LBUFSIZE == ret)) { |
249 | DBG_FLT("binfmt_flat: buffer overflow (COMMENT)?\n"); | 249 | DBG_FLT("binfmt_flat: buffer overflow (COMMENT)?\n"); |
250 | goto out_free_buf; | 250 | goto out_free_buf; |
251 | } | 251 | } |
252 | } | 252 | } |
253 | 253 | ||
254 | strm.next_in += ret; | 254 | strm.next_in += ret; |
255 | strm.avail_in -= ret; | 255 | strm.avail_in -= ret; |
256 | 256 | ||
257 | strm.next_out = dst; | 257 | strm.next_out = dst; |
258 | strm.avail_out = len; | 258 | strm.avail_out = len; |
259 | strm.total_out = 0; | 259 | strm.total_out = 0; |
260 | 260 | ||
261 | if (zlib_inflateInit2(&strm, -MAX_WBITS) != Z_OK) { | 261 | if (zlib_inflateInit2(&strm, -MAX_WBITS) != Z_OK) { |
262 | DBG_FLT("binfmt_flat: zlib init failed?\n"); | 262 | DBG_FLT("binfmt_flat: zlib init failed?\n"); |
263 | goto out_free_buf; | 263 | goto out_free_buf; |
264 | } | 264 | } |
265 | 265 | ||
266 | while ((ret = zlib_inflate(&strm, Z_NO_FLUSH)) == Z_OK) { | 266 | while ((ret = zlib_inflate(&strm, Z_NO_FLUSH)) == Z_OK) { |
267 | ret = bprm->file->f_op->read(bprm->file, buf, LBUFSIZE, &fpos); | 267 | ret = bprm->file->f_op->read(bprm->file, buf, LBUFSIZE, &fpos); |
268 | if (ret <= 0) | 268 | if (ret <= 0) |
269 | break; | 269 | break; |
270 | if (ret >= (unsigned long) -4096) | 270 | if (ret >= (unsigned long) -4096) |
271 | break; | 271 | break; |
272 | len -= ret; | 272 | len -= ret; |
273 | 273 | ||
274 | strm.next_in = buf; | 274 | strm.next_in = buf; |
275 | strm.avail_in = ret; | 275 | strm.avail_in = ret; |
276 | strm.total_in = 0; | 276 | strm.total_in = 0; |
277 | } | 277 | } |
278 | 278 | ||
279 | if (ret < 0) { | 279 | if (ret < 0) { |
280 | DBG_FLT("binfmt_flat: decompression failed (%d), %s\n", | 280 | DBG_FLT("binfmt_flat: decompression failed (%d), %s\n", |
281 | ret, strm.msg); | 281 | ret, strm.msg); |
282 | goto out_zlib; | 282 | goto out_zlib; |
283 | } | 283 | } |
284 | 284 | ||
285 | retval = 0; | 285 | retval = 0; |
286 | out_zlib: | 286 | out_zlib: |
287 | zlib_inflateEnd(&strm); | 287 | zlib_inflateEnd(&strm); |
288 | out_free_buf: | 288 | out_free_buf: |
289 | kfree(buf); | 289 | kfree(buf); |
290 | out_free: | 290 | out_free: |
291 | kfree(strm.workspace); | 291 | kfree(strm.workspace); |
292 | return retval; | 292 | return retval; |
293 | } | 293 | } |
294 | 294 | ||
295 | #endif /* CONFIG_BINFMT_ZFLAT */ | 295 | #endif /* CONFIG_BINFMT_ZFLAT */ |
296 | 296 | ||
297 | /****************************************************************************/ | 297 | /****************************************************************************/ |
298 | 298 | ||
299 | static unsigned long | 299 | static unsigned long |
300 | calc_reloc(unsigned long r, struct lib_info *p, int curid, int internalp) | 300 | calc_reloc(unsigned long r, struct lib_info *p, int curid, int internalp) |
301 | { | 301 | { |
302 | unsigned long addr; | 302 | unsigned long addr; |
303 | int id; | 303 | int id; |
304 | unsigned long start_brk; | 304 | unsigned long start_brk; |
305 | unsigned long start_data; | 305 | unsigned long start_data; |
306 | unsigned long text_len; | 306 | unsigned long text_len; |
307 | unsigned long start_code; | 307 | unsigned long start_code; |
308 | 308 | ||
309 | #ifdef CONFIG_BINFMT_SHARED_FLAT | 309 | #ifdef CONFIG_BINFMT_SHARED_FLAT |
310 | if (r == 0) | 310 | if (r == 0) |
311 | id = curid; /* Relocs of 0 are always self referring */ | 311 | id = curid; /* Relocs of 0 are always self referring */ |
312 | else { | 312 | else { |
313 | id = (r >> 24) & 0xff; /* Find ID for this reloc */ | 313 | id = (r >> 24) & 0xff; /* Find ID for this reloc */ |
314 | r &= 0x00ffffff; /* Trim ID off here */ | 314 | r &= 0x00ffffff; /* Trim ID off here */ |
315 | } | 315 | } |
316 | if (id >= MAX_SHARED_LIBS) { | 316 | if (id >= MAX_SHARED_LIBS) { |
317 | printk("BINFMT_FLAT: reference 0x%x to shared library %d", | 317 | printk("BINFMT_FLAT: reference 0x%x to shared library %d", |
318 | (unsigned) r, id); | 318 | (unsigned) r, id); |
319 | goto failed; | 319 | goto failed; |
320 | } | 320 | } |
321 | if (curid != id) { | 321 | if (curid != id) { |
322 | if (internalp) { | 322 | if (internalp) { |
323 | printk("BINFMT_FLAT: reloc address 0x%x not in same module " | 323 | printk("BINFMT_FLAT: reloc address 0x%x not in same module " |
324 | "(%d != %d)", (unsigned) r, curid, id); | 324 | "(%d != %d)", (unsigned) r, curid, id); |
325 | goto failed; | 325 | goto failed; |
326 | } else if ( ! p->lib_list[id].loaded && | 326 | } else if ( ! p->lib_list[id].loaded && |
327 | load_flat_shared_library(id, p) > (unsigned long) -4096) { | 327 | load_flat_shared_library(id, p) > (unsigned long) -4096) { |
328 | printk("BINFMT_FLAT: failed to load library %d", id); | 328 | printk("BINFMT_FLAT: failed to load library %d", id); |
329 | goto failed; | 329 | goto failed; |
330 | } | 330 | } |
331 | /* Check versioning information (i.e. time stamps) */ | 331 | /* Check versioning information (i.e. time stamps) */ |
332 | if (p->lib_list[id].build_date && p->lib_list[curid].build_date && | 332 | if (p->lib_list[id].build_date && p->lib_list[curid].build_date && |
333 | p->lib_list[curid].build_date < p->lib_list[id].build_date) { | 333 | p->lib_list[curid].build_date < p->lib_list[id].build_date) { |
334 | printk("BINFMT_FLAT: library %d is younger than %d", id, curid); | 334 | printk("BINFMT_FLAT: library %d is younger than %d", id, curid); |
335 | goto failed; | 335 | goto failed; |
336 | } | 336 | } |
337 | } | 337 | } |
338 | #else | 338 | #else |
339 | id = 0; | 339 | id = 0; |
340 | #endif | 340 | #endif |
341 | 341 | ||
342 | start_brk = p->lib_list[id].start_brk; | 342 | start_brk = p->lib_list[id].start_brk; |
343 | start_data = p->lib_list[id].start_data; | 343 | start_data = p->lib_list[id].start_data; |
344 | start_code = p->lib_list[id].start_code; | 344 | start_code = p->lib_list[id].start_code; |
345 | text_len = p->lib_list[id].text_len; | 345 | text_len = p->lib_list[id].text_len; |
346 | 346 | ||
347 | if (!flat_reloc_valid(r, start_brk - start_data + text_len)) { | 347 | if (!flat_reloc_valid(r, start_brk - start_data + text_len)) { |
348 | printk("BINFMT_FLAT: reloc outside program 0x%x (0 - 0x%x/0x%x)", | 348 | printk("BINFMT_FLAT: reloc outside program 0x%x (0 - 0x%x/0x%x)", |
349 | (int) r,(int)(start_brk-start_code),(int)text_len); | 349 | (int) r,(int)(start_brk-start_code),(int)text_len); |
350 | goto failed; | 350 | goto failed; |
351 | } | 351 | } |
352 | 352 | ||
353 | if (r < text_len) /* In text segment */ | 353 | if (r < text_len) /* In text segment */ |
354 | addr = r + start_code; | 354 | addr = r + start_code; |
355 | else /* In data segment */ | 355 | else /* In data segment */ |
356 | addr = r - text_len + start_data; | 356 | addr = r - text_len + start_data; |
357 | 357 | ||
358 | /* Range checked already above so doing the range tests is redundant...*/ | 358 | /* Range checked already above so doing the range tests is redundant...*/ |
359 | return(addr); | 359 | return(addr); |
360 | 360 | ||
361 | failed: | 361 | failed: |
362 | printk(", killing %s!\n", current->comm); | 362 | printk(", killing %s!\n", current->comm); |
363 | send_sig(SIGSEGV, current, 0); | 363 | send_sig(SIGSEGV, current, 0); |
364 | 364 | ||
365 | return RELOC_FAILED; | 365 | return RELOC_FAILED; |
366 | } | 366 | } |
367 | 367 | ||
368 | /****************************************************************************/ | 368 | /****************************************************************************/ |
369 | 369 | ||
370 | void old_reloc(unsigned long rl) | 370 | void old_reloc(unsigned long rl) |
371 | { | 371 | { |
372 | #ifdef DEBUG | 372 | #ifdef DEBUG |
373 | char *segment[] = { "TEXT", "DATA", "BSS", "*UNKNOWN*" }; | 373 | char *segment[] = { "TEXT", "DATA", "BSS", "*UNKNOWN*" }; |
374 | #endif | 374 | #endif |
375 | flat_v2_reloc_t r; | 375 | flat_v2_reloc_t r; |
376 | unsigned long *ptr; | 376 | unsigned long *ptr; |
377 | 377 | ||
378 | r.value = rl; | 378 | r.value = rl; |
379 | #if defined(CONFIG_COLDFIRE) | 379 | #if defined(CONFIG_COLDFIRE) |
380 | ptr = (unsigned long *) (current->mm->start_code + r.reloc.offset); | 380 | ptr = (unsigned long *) (current->mm->start_code + r.reloc.offset); |
381 | #else | 381 | #else |
382 | ptr = (unsigned long *) (current->mm->start_data + r.reloc.offset); | 382 | ptr = (unsigned long *) (current->mm->start_data + r.reloc.offset); |
383 | #endif | 383 | #endif |
384 | 384 | ||
385 | #ifdef DEBUG | 385 | #ifdef DEBUG |
386 | printk("Relocation of variable at DATASEG+%x " | 386 | printk("Relocation of variable at DATASEG+%x " |
387 | "(address %p, currently %x) into segment %s\n", | 387 | "(address %p, currently %x) into segment %s\n", |
388 | r.reloc.offset, ptr, (int)*ptr, segment[r.reloc.type]); | 388 | r.reloc.offset, ptr, (int)*ptr, segment[r.reloc.type]); |
389 | #endif | 389 | #endif |
390 | 390 | ||
391 | switch (r.reloc.type) { | 391 | switch (r.reloc.type) { |
392 | case OLD_FLAT_RELOC_TYPE_TEXT: | 392 | case OLD_FLAT_RELOC_TYPE_TEXT: |
393 | *ptr += current->mm->start_code; | 393 | *ptr += current->mm->start_code; |
394 | break; | 394 | break; |
395 | case OLD_FLAT_RELOC_TYPE_DATA: | 395 | case OLD_FLAT_RELOC_TYPE_DATA: |
396 | *ptr += current->mm->start_data; | 396 | *ptr += current->mm->start_data; |
397 | break; | 397 | break; |
398 | case OLD_FLAT_RELOC_TYPE_BSS: | 398 | case OLD_FLAT_RELOC_TYPE_BSS: |
399 | *ptr += current->mm->end_data; | 399 | *ptr += current->mm->end_data; |
400 | break; | 400 | break; |
401 | default: | 401 | default: |
402 | printk("BINFMT_FLAT: Unknown relocation type=%x\n", r.reloc.type); | 402 | printk("BINFMT_FLAT: Unknown relocation type=%x\n", r.reloc.type); |
403 | break; | 403 | break; |
404 | } | 404 | } |
405 | 405 | ||
406 | #ifdef DEBUG | 406 | #ifdef DEBUG |
407 | printk("Relocation became %x\n", (int)*ptr); | 407 | printk("Relocation became %x\n", (int)*ptr); |
408 | #endif | 408 | #endif |
409 | } | 409 | } |
410 | 410 | ||
411 | /****************************************************************************/ | 411 | /****************************************************************************/ |
412 | 412 | ||
413 | static int load_flat_file(struct linux_binprm * bprm, | 413 | static int load_flat_file(struct linux_binprm * bprm, |
414 | struct lib_info *libinfo, int id, unsigned long *extra_stack) | 414 | struct lib_info *libinfo, int id, unsigned long *extra_stack) |
415 | { | 415 | { |
416 | struct flat_hdr * hdr; | 416 | struct flat_hdr * hdr; |
417 | unsigned long textpos = 0, datapos = 0, result; | 417 | unsigned long textpos = 0, datapos = 0, result; |
418 | unsigned long realdatastart = 0; | 418 | unsigned long realdatastart = 0; |
419 | unsigned long text_len, data_len, bss_len, stack_len, flags; | 419 | unsigned long text_len, data_len, bss_len, stack_len, flags; |
420 | unsigned long len, reallen, memp = 0; | 420 | unsigned long len, memp = 0; |
421 | unsigned long extra, rlim; | 421 | unsigned long memp_size, extra, rlim; |
422 | unsigned long *reloc = 0, *rp; | 422 | unsigned long *reloc = 0, *rp; |
423 | struct inode *inode; | 423 | struct inode *inode; |
424 | int i, rev, relocs = 0; | 424 | int i, rev, relocs = 0; |
425 | loff_t fpos; | 425 | loff_t fpos; |
426 | unsigned long start_code, end_code; | 426 | unsigned long start_code, end_code; |
427 | int ret; | 427 | int ret; |
428 | 428 | ||
429 | hdr = ((struct flat_hdr *) bprm->buf); /* exec-header */ | 429 | hdr = ((struct flat_hdr *) bprm->buf); /* exec-header */ |
430 | inode = bprm->file->f_path.dentry->d_inode; | 430 | inode = bprm->file->f_path.dentry->d_inode; |
431 | 431 | ||
432 | text_len = ntohl(hdr->data_start); | 432 | text_len = ntohl(hdr->data_start); |
433 | data_len = ntohl(hdr->data_end) - ntohl(hdr->data_start); | 433 | data_len = ntohl(hdr->data_end) - ntohl(hdr->data_start); |
434 | bss_len = ntohl(hdr->bss_end) - ntohl(hdr->data_end); | 434 | bss_len = ntohl(hdr->bss_end) - ntohl(hdr->data_end); |
435 | stack_len = ntohl(hdr->stack_size); | 435 | stack_len = ntohl(hdr->stack_size); |
436 | if (extra_stack) { | 436 | if (extra_stack) { |
437 | stack_len += *extra_stack; | 437 | stack_len += *extra_stack; |
438 | *extra_stack = stack_len; | 438 | *extra_stack = stack_len; |
439 | } | 439 | } |
440 | relocs = ntohl(hdr->reloc_count); | 440 | relocs = ntohl(hdr->reloc_count); |
441 | flags = ntohl(hdr->flags); | 441 | flags = ntohl(hdr->flags); |
442 | rev = ntohl(hdr->rev); | 442 | rev = ntohl(hdr->rev); |
443 | 443 | ||
444 | if (strncmp(hdr->magic, "bFLT", 4)) { | 444 | if (strncmp(hdr->magic, "bFLT", 4)) { |
445 | /* | 445 | /* |
446 | * Previously, here was a printk to tell people | 446 | * Previously, here was a printk to tell people |
447 | * "BINFMT_FLAT: bad header magic". | 447 | * "BINFMT_FLAT: bad header magic". |
448 | * But for the kernel which also use ELF FD-PIC format, this | 448 | * But for the kernel which also use ELF FD-PIC format, this |
449 | * error message is confusing. | 449 | * error message is confusing. |
450 | * because a lot of people do not manage to produce good | 450 | * because a lot of people do not manage to produce good |
451 | */ | 451 | */ |
452 | ret = -ENOEXEC; | 452 | ret = -ENOEXEC; |
453 | goto err; | 453 | goto err; |
454 | } | 454 | } |
455 | 455 | ||
456 | if (flags & FLAT_FLAG_KTRACE) | 456 | if (flags & FLAT_FLAG_KTRACE) |
457 | printk("BINFMT_FLAT: Loading file: %s\n", bprm->filename); | 457 | printk("BINFMT_FLAT: Loading file: %s\n", bprm->filename); |
458 | 458 | ||
459 | if (rev != FLAT_VERSION && rev != OLD_FLAT_VERSION) { | 459 | if (rev != FLAT_VERSION && rev != OLD_FLAT_VERSION) { |
460 | printk("BINFMT_FLAT: bad flat file version 0x%x (supported " | 460 | printk("BINFMT_FLAT: bad flat file version 0x%x (supported " |
461 | "0x%lx and 0x%lx)\n", | 461 | "0x%lx and 0x%lx)\n", |
462 | rev, FLAT_VERSION, OLD_FLAT_VERSION); | 462 | rev, FLAT_VERSION, OLD_FLAT_VERSION); |
463 | ret = -ENOEXEC; | 463 | ret = -ENOEXEC; |
464 | goto err; | 464 | goto err; |
465 | } | 465 | } |
466 | 466 | ||
467 | /* Don't allow old format executables to use shared libraries */ | 467 | /* Don't allow old format executables to use shared libraries */ |
468 | if (rev == OLD_FLAT_VERSION && id != 0) { | 468 | if (rev == OLD_FLAT_VERSION && id != 0) { |
469 | printk("BINFMT_FLAT: shared libraries are not available before rev 0x%x\n", | 469 | printk("BINFMT_FLAT: shared libraries are not available before rev 0x%x\n", |
470 | (int) FLAT_VERSION); | 470 | (int) FLAT_VERSION); |
471 | ret = -ENOEXEC; | 471 | ret = -ENOEXEC; |
472 | goto err; | 472 | goto err; |
473 | } | 473 | } |
474 | 474 | ||
475 | /* | 475 | /* |
476 | * fix up the flags for the older format, there were all kinds | 476 | * fix up the flags for the older format, there were all kinds |
477 | * of endian hacks, this only works for the simple cases | 477 | * of endian hacks, this only works for the simple cases |
478 | */ | 478 | */ |
479 | if (rev == OLD_FLAT_VERSION && flat_old_ram_flag(flags)) | 479 | if (rev == OLD_FLAT_VERSION && flat_old_ram_flag(flags)) |
480 | flags = FLAT_FLAG_RAM; | 480 | flags = FLAT_FLAG_RAM; |
481 | 481 | ||
482 | #ifndef CONFIG_BINFMT_ZFLAT | 482 | #ifndef CONFIG_BINFMT_ZFLAT |
483 | if (flags & (FLAT_FLAG_GZIP|FLAT_FLAG_GZDATA)) { | 483 | if (flags & (FLAT_FLAG_GZIP|FLAT_FLAG_GZDATA)) { |
484 | printk("Support for ZFLAT executables is not enabled.\n"); | 484 | printk("Support for ZFLAT executables is not enabled.\n"); |
485 | ret = -ENOEXEC; | 485 | ret = -ENOEXEC; |
486 | goto err; | 486 | goto err; |
487 | } | 487 | } |
488 | #endif | 488 | #endif |
489 | 489 | ||
490 | /* | 490 | /* |
491 | * Check initial limits. This avoids letting people circumvent | 491 | * Check initial limits. This avoids letting people circumvent |
492 | * size limits imposed on them by creating programs with large | 492 | * size limits imposed on them by creating programs with large |
493 | * arrays in the data or bss. | 493 | * arrays in the data or bss. |
494 | */ | 494 | */ |
495 | rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; | 495 | rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; |
496 | if (rlim >= RLIM_INFINITY) | 496 | if (rlim >= RLIM_INFINITY) |
497 | rlim = ~0; | 497 | rlim = ~0; |
498 | if (data_len + bss_len > rlim) { | 498 | if (data_len + bss_len > rlim) { |
499 | ret = -ENOMEM; | 499 | ret = -ENOMEM; |
500 | goto err; | 500 | goto err; |
501 | } | 501 | } |
502 | 502 | ||
503 | /* Flush all traces of the currently running executable */ | 503 | /* Flush all traces of the currently running executable */ |
504 | if (id == 0) { | 504 | if (id == 0) { |
505 | result = flush_old_exec(bprm); | 505 | result = flush_old_exec(bprm); |
506 | if (result) { | 506 | if (result) { |
507 | ret = result; | 507 | ret = result; |
508 | goto err; | 508 | goto err; |
509 | } | 509 | } |
510 | 510 | ||
511 | /* OK, This is the point of no return */ | 511 | /* OK, This is the point of no return */ |
512 | set_personality(PER_LINUX_32BIT); | 512 | set_personality(PER_LINUX_32BIT); |
513 | } | 513 | } |
514 | 514 | ||
515 | /* | 515 | /* |
516 | * calculate the extra space we need to map in | 516 | * calculate the extra space we need to map in |
517 | */ | 517 | */ |
518 | extra = max_t(unsigned long, bss_len + stack_len, | 518 | extra = max_t(unsigned long, bss_len + stack_len, |
519 | relocs * sizeof(unsigned long)); | 519 | relocs * sizeof(unsigned long)); |
520 | 520 | ||
521 | /* | 521 | /* |
522 | * there are a couple of cases here, the separate code/data | 522 | * there are a couple of cases here, the separate code/data |
523 | * case, and then the fully copied to RAM case which lumps | 523 | * case, and then the fully copied to RAM case which lumps |
524 | * it all together. | 524 | * it all together. |
525 | */ | 525 | */ |
526 | if ((flags & (FLAT_FLAG_RAM|FLAT_FLAG_GZIP)) == 0) { | 526 | if ((flags & (FLAT_FLAG_RAM|FLAT_FLAG_GZIP)) == 0) { |
527 | /* | 527 | /* |
528 | * this should give us a ROM ptr, but if it doesn't we don't | 528 | * this should give us a ROM ptr, but if it doesn't we don't |
529 | * really care | 529 | * really care |
530 | */ | 530 | */ |
531 | DBG_FLT("BINFMT_FLAT: ROM mapping of file (we hope)\n"); | 531 | DBG_FLT("BINFMT_FLAT: ROM mapping of file (we hope)\n"); |
532 | 532 | ||
533 | down_write(¤t->mm->mmap_sem); | 533 | down_write(¤t->mm->mmap_sem); |
534 | textpos = do_mmap(bprm->file, 0, text_len, PROT_READ|PROT_EXEC, | 534 | textpos = do_mmap(bprm->file, 0, text_len, PROT_READ|PROT_EXEC, |
535 | MAP_PRIVATE|MAP_EXECUTABLE, 0); | 535 | MAP_PRIVATE|MAP_EXECUTABLE, 0); |
536 | up_write(¤t->mm->mmap_sem); | 536 | up_write(¤t->mm->mmap_sem); |
537 | if (!textpos || textpos >= (unsigned long) -4096) { | 537 | if (!textpos || textpos >= (unsigned long) -4096) { |
538 | if (!textpos) | 538 | if (!textpos) |
539 | textpos = (unsigned long) -ENOMEM; | 539 | textpos = (unsigned long) -ENOMEM; |
540 | printk("Unable to mmap process text, errno %d\n", (int)-textpos); | 540 | printk("Unable to mmap process text, errno %d\n", (int)-textpos); |
541 | ret = textpos; | 541 | ret = textpos; |
542 | goto err; | 542 | goto err; |
543 | } | 543 | } |
544 | 544 | ||
545 | len = data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long); | 545 | len = data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long); |
546 | len = PAGE_ALIGN(len); | ||
546 | down_write(¤t->mm->mmap_sem); | 547 | down_write(¤t->mm->mmap_sem); |
547 | realdatastart = do_mmap(0, 0, len, | 548 | realdatastart = do_mmap(0, 0, len, |
548 | PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0); | 549 | PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0); |
549 | /* Remap to use all availabe slack region space */ | ||
550 | if (realdatastart && (realdatastart < (unsigned long)-4096)) { | ||
551 | reallen = kobjsize((void *)realdatastart); | ||
552 | if (reallen > len) { | ||
553 | realdatastart = do_mremap(realdatastart, len, | ||
554 | reallen, MREMAP_FIXED, realdatastart); | ||
555 | } | ||
556 | } | ||
557 | up_write(¤t->mm->mmap_sem); | 550 | up_write(¤t->mm->mmap_sem); |
558 | 551 | ||
559 | if (realdatastart == 0 || realdatastart >= (unsigned long)-4096) { | 552 | if (realdatastart == 0 || realdatastart >= (unsigned long)-4096) { |
560 | if (!realdatastart) | 553 | if (!realdatastart) |
561 | realdatastart = (unsigned long) -ENOMEM; | 554 | realdatastart = (unsigned long) -ENOMEM; |
562 | printk("Unable to allocate RAM for process data, errno %d\n", | 555 | printk("Unable to allocate RAM for process data, errno %d\n", |
563 | (int)-realdatastart); | 556 | (int)-realdatastart); |
564 | do_munmap(current->mm, textpos, text_len); | 557 | do_munmap(current->mm, textpos, text_len); |
565 | ret = realdatastart; | 558 | ret = realdatastart; |
566 | goto err; | 559 | goto err; |
567 | } | 560 | } |
568 | datapos = realdatastart + MAX_SHARED_LIBS * sizeof(unsigned long); | 561 | datapos = realdatastart + MAX_SHARED_LIBS * sizeof(unsigned long); |
569 | 562 | ||
570 | DBG_FLT("BINFMT_FLAT: Allocated data+bss+stack (%d bytes): %x\n", | 563 | DBG_FLT("BINFMT_FLAT: Allocated data+bss+stack (%d bytes): %x\n", |
571 | (int)(data_len + bss_len + stack_len), (int)datapos); | 564 | (int)(data_len + bss_len + stack_len), (int)datapos); |
572 | 565 | ||
573 | fpos = ntohl(hdr->data_start); | 566 | fpos = ntohl(hdr->data_start); |
574 | #ifdef CONFIG_BINFMT_ZFLAT | 567 | #ifdef CONFIG_BINFMT_ZFLAT |
575 | if (flags & FLAT_FLAG_GZDATA) { | 568 | if (flags & FLAT_FLAG_GZDATA) { |
576 | result = decompress_exec(bprm, fpos, (char *) datapos, | 569 | result = decompress_exec(bprm, fpos, (char *) datapos, |
577 | data_len + (relocs * sizeof(unsigned long)), 0); | 570 | data_len + (relocs * sizeof(unsigned long)), 0); |
578 | } else | 571 | } else |
579 | #endif | 572 | #endif |
580 | { | 573 | { |
581 | result = bprm->file->f_op->read(bprm->file, (char *) datapos, | 574 | result = bprm->file->f_op->read(bprm->file, (char *) datapos, |
582 | data_len + (relocs * sizeof(unsigned long)), &fpos); | 575 | data_len + (relocs * sizeof(unsigned long)), &fpos); |
583 | } | 576 | } |
584 | if (result >= (unsigned long)-4096) { | 577 | if (result >= (unsigned long)-4096) { |
585 | printk("Unable to read data+bss, errno %d\n", (int)-result); | 578 | printk("Unable to read data+bss, errno %d\n", (int)-result); |
586 | do_munmap(current->mm, textpos, text_len); | 579 | do_munmap(current->mm, textpos, text_len); |
587 | do_munmap(current->mm, realdatastart, data_len + extra); | 580 | do_munmap(current->mm, realdatastart, data_len + extra); |
588 | ret = result; | 581 | ret = result; |
589 | goto err; | 582 | goto err; |
590 | } | 583 | } |
591 | 584 | ||
592 | reloc = (unsigned long *) (datapos+(ntohl(hdr->reloc_start)-text_len)); | 585 | reloc = (unsigned long *) (datapos+(ntohl(hdr->reloc_start)-text_len)); |
593 | memp = realdatastart; | 586 | memp = realdatastart; |
594 | 587 | memp_size = len; | |
595 | } else { | 588 | } else { |
596 | 589 | ||
597 | len = text_len + data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long); | 590 | len = text_len + data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long); |
591 | len = PAGE_ALIGN(len); | ||
598 | down_write(¤t->mm->mmap_sem); | 592 | down_write(¤t->mm->mmap_sem); |
599 | textpos = do_mmap(0, 0, len, | 593 | textpos = do_mmap(0, 0, len, |
600 | PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0); | 594 | PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0); |
601 | /* Remap to use all availabe slack region space */ | ||
602 | if (textpos && (textpos < (unsigned long) -4096)) { | ||
603 | reallen = kobjsize((void *)textpos); | ||
604 | if (reallen > len) { | ||
605 | textpos = do_mremap(textpos, len, reallen, | ||
606 | MREMAP_FIXED, textpos); | ||
607 | } | ||
608 | } | ||
609 | up_write(¤t->mm->mmap_sem); | 595 | up_write(¤t->mm->mmap_sem); |
610 | 596 | ||
611 | if (!textpos || textpos >= (unsigned long) -4096) { | 597 | if (!textpos || textpos >= (unsigned long) -4096) { |
612 | if (!textpos) | 598 | if (!textpos) |
613 | textpos = (unsigned long) -ENOMEM; | 599 | textpos = (unsigned long) -ENOMEM; |
614 | printk("Unable to allocate RAM for process text/data, errno %d\n", | 600 | printk("Unable to allocate RAM for process text/data, errno %d\n", |
615 | (int)-textpos); | 601 | (int)-textpos); |
616 | ret = textpos; | 602 | ret = textpos; |
617 | goto err; | 603 | goto err; |
618 | } | 604 | } |
619 | 605 | ||
620 | realdatastart = textpos + ntohl(hdr->data_start); | 606 | realdatastart = textpos + ntohl(hdr->data_start); |
621 | datapos = realdatastart + MAX_SHARED_LIBS * sizeof(unsigned long); | 607 | datapos = realdatastart + MAX_SHARED_LIBS * sizeof(unsigned long); |
622 | reloc = (unsigned long *) (textpos + ntohl(hdr->reloc_start) + | 608 | reloc = (unsigned long *) (textpos + ntohl(hdr->reloc_start) + |
623 | MAX_SHARED_LIBS * sizeof(unsigned long)); | 609 | MAX_SHARED_LIBS * sizeof(unsigned long)); |
624 | memp = textpos; | 610 | memp = textpos; |
625 | 611 | memp_size = len; | |
626 | #ifdef CONFIG_BINFMT_ZFLAT | 612 | #ifdef CONFIG_BINFMT_ZFLAT |
627 | /* | 613 | /* |
628 | * load it all in and treat it like a RAM load from now on | 614 | * load it all in and treat it like a RAM load from now on |
629 | */ | 615 | */ |
630 | if (flags & FLAT_FLAG_GZIP) { | 616 | if (flags & FLAT_FLAG_GZIP) { |
631 | result = decompress_exec(bprm, sizeof (struct flat_hdr), | 617 | result = decompress_exec(bprm, sizeof (struct flat_hdr), |
632 | (((char *) textpos) + sizeof (struct flat_hdr)), | 618 | (((char *) textpos) + sizeof (struct flat_hdr)), |
633 | (text_len + data_len + (relocs * sizeof(unsigned long)) | 619 | (text_len + data_len + (relocs * sizeof(unsigned long)) |
634 | - sizeof (struct flat_hdr)), | 620 | - sizeof (struct flat_hdr)), |
635 | 0); | 621 | 0); |
636 | memmove((void *) datapos, (void *) realdatastart, | 622 | memmove((void *) datapos, (void *) realdatastart, |
637 | data_len + (relocs * sizeof(unsigned long))); | 623 | data_len + (relocs * sizeof(unsigned long))); |
638 | } else if (flags & FLAT_FLAG_GZDATA) { | 624 | } else if (flags & FLAT_FLAG_GZDATA) { |
639 | fpos = 0; | 625 | fpos = 0; |
640 | result = bprm->file->f_op->read(bprm->file, | 626 | result = bprm->file->f_op->read(bprm->file, |
641 | (char *) textpos, text_len, &fpos); | 627 | (char *) textpos, text_len, &fpos); |
642 | if (result < (unsigned long) -4096) | 628 | if (result < (unsigned long) -4096) |
643 | result = decompress_exec(bprm, text_len, (char *) datapos, | 629 | result = decompress_exec(bprm, text_len, (char *) datapos, |
644 | data_len + (relocs * sizeof(unsigned long)), 0); | 630 | data_len + (relocs * sizeof(unsigned long)), 0); |
645 | } | 631 | } |
646 | else | 632 | else |
647 | #endif | 633 | #endif |
648 | { | 634 | { |
649 | fpos = 0; | 635 | fpos = 0; |
650 | result = bprm->file->f_op->read(bprm->file, | 636 | result = bprm->file->f_op->read(bprm->file, |
651 | (char *) textpos, text_len, &fpos); | 637 | (char *) textpos, text_len, &fpos); |
652 | if (result < (unsigned long) -4096) { | 638 | if (result < (unsigned long) -4096) { |
653 | fpos = ntohl(hdr->data_start); | 639 | fpos = ntohl(hdr->data_start); |
654 | result = bprm->file->f_op->read(bprm->file, (char *) datapos, | 640 | result = bprm->file->f_op->read(bprm->file, (char *) datapos, |
655 | data_len + (relocs * sizeof(unsigned long)), &fpos); | 641 | data_len + (relocs * sizeof(unsigned long)), &fpos); |
656 | } | 642 | } |
657 | } | 643 | } |
658 | if (result >= (unsigned long)-4096) { | 644 | if (result >= (unsigned long)-4096) { |
659 | printk("Unable to read code+data+bss, errno %d\n",(int)-result); | 645 | printk("Unable to read code+data+bss, errno %d\n",(int)-result); |
660 | do_munmap(current->mm, textpos, text_len + data_len + extra + | 646 | do_munmap(current->mm, textpos, text_len + data_len + extra + |
661 | MAX_SHARED_LIBS * sizeof(unsigned long)); | 647 | MAX_SHARED_LIBS * sizeof(unsigned long)); |
662 | ret = result; | 648 | ret = result; |
663 | goto err; | 649 | goto err; |
664 | } | 650 | } |
665 | } | 651 | } |
666 | 652 | ||
667 | if (flags & FLAT_FLAG_KTRACE) | 653 | if (flags & FLAT_FLAG_KTRACE) |
668 | printk("Mapping is %x, Entry point is %x, data_start is %x\n", | 654 | printk("Mapping is %x, Entry point is %x, data_start is %x\n", |
669 | (int)textpos, 0x00ffffff&ntohl(hdr->entry), ntohl(hdr->data_start)); | 655 | (int)textpos, 0x00ffffff&ntohl(hdr->entry), ntohl(hdr->data_start)); |
670 | 656 | ||
671 | /* The main program needs a little extra setup in the task structure */ | 657 | /* The main program needs a little extra setup in the task structure */ |
672 | start_code = textpos + sizeof (struct flat_hdr); | 658 | start_code = textpos + sizeof (struct flat_hdr); |
673 | end_code = textpos + text_len; | 659 | end_code = textpos + text_len; |
674 | if (id == 0) { | 660 | if (id == 0) { |
675 | current->mm->start_code = start_code; | 661 | current->mm->start_code = start_code; |
676 | current->mm->end_code = end_code; | 662 | current->mm->end_code = end_code; |
677 | current->mm->start_data = datapos; | 663 | current->mm->start_data = datapos; |
678 | current->mm->end_data = datapos + data_len; | 664 | current->mm->end_data = datapos + data_len; |
679 | /* | 665 | /* |
680 | * set up the brk stuff, uses any slack left in data/bss/stack | 666 | * set up the brk stuff, uses any slack left in data/bss/stack |
681 | * allocation. We put the brk after the bss (between the bss | 667 | * allocation. We put the brk after the bss (between the bss |
682 | * and stack) like other platforms. | 668 | * and stack) like other platforms. |
669 | * Userspace code relies on the stack pointer starting out at | ||
670 | * an address right at the end of a page. | ||
683 | */ | 671 | */ |
684 | current->mm->start_brk = datapos + data_len + bss_len; | 672 | current->mm->start_brk = datapos + data_len + bss_len; |
685 | current->mm->brk = (current->mm->start_brk + 3) & ~3; | 673 | current->mm->brk = (current->mm->start_brk + 3) & ~3; |
686 | current->mm->context.end_brk = memp + kobjsize((void *) memp) - stack_len; | 674 | current->mm->context.end_brk = memp + memp_size - stack_len; |
687 | } | 675 | } |
688 | 676 | ||
689 | if (flags & FLAT_FLAG_KTRACE) | 677 | if (flags & FLAT_FLAG_KTRACE) |
690 | printk("%s %s: TEXT=%x-%x DATA=%x-%x BSS=%x-%x\n", | 678 | printk("%s %s: TEXT=%x-%x DATA=%x-%x BSS=%x-%x\n", |
691 | id ? "Lib" : "Load", bprm->filename, | 679 | id ? "Lib" : "Load", bprm->filename, |
692 | (int) start_code, (int) end_code, | 680 | (int) start_code, (int) end_code, |
693 | (int) datapos, | 681 | (int) datapos, |
694 | (int) (datapos + data_len), | 682 | (int) (datapos + data_len), |
695 | (int) (datapos + data_len), | 683 | (int) (datapos + data_len), |
696 | (int) (((datapos + data_len + bss_len) + 3) & ~3)); | 684 | (int) (((datapos + data_len + bss_len) + 3) & ~3)); |
697 | 685 | ||
698 | text_len -= sizeof(struct flat_hdr); /* the real code len */ | 686 | text_len -= sizeof(struct flat_hdr); /* the real code len */ |
699 | 687 | ||
700 | /* Store the current module values into the global library structure */ | 688 | /* Store the current module values into the global library structure */ |
701 | libinfo->lib_list[id].start_code = start_code; | 689 | libinfo->lib_list[id].start_code = start_code; |
702 | libinfo->lib_list[id].start_data = datapos; | 690 | libinfo->lib_list[id].start_data = datapos; |
703 | libinfo->lib_list[id].start_brk = datapos + data_len + bss_len; | 691 | libinfo->lib_list[id].start_brk = datapos + data_len + bss_len; |
704 | libinfo->lib_list[id].text_len = text_len; | 692 | libinfo->lib_list[id].text_len = text_len; |
705 | libinfo->lib_list[id].loaded = 1; | 693 | libinfo->lib_list[id].loaded = 1; |
706 | libinfo->lib_list[id].entry = (0x00ffffff & ntohl(hdr->entry)) + textpos; | 694 | libinfo->lib_list[id].entry = (0x00ffffff & ntohl(hdr->entry)) + textpos; |
707 | libinfo->lib_list[id].build_date = ntohl(hdr->build_date); | 695 | libinfo->lib_list[id].build_date = ntohl(hdr->build_date); |
708 | 696 | ||
709 | /* | 697 | /* |
710 | * We just load the allocations into some temporary memory to | 698 | * We just load the allocations into some temporary memory to |
711 | * help simplify all this mumbo jumbo | 699 | * help simplify all this mumbo jumbo |
712 | * | 700 | * |
713 | * We've got two different sections of relocation entries. | 701 | * We've got two different sections of relocation entries. |
714 | * The first is the GOT which resides at the begining of the data segment | 702 | * The first is the GOT which resides at the begining of the data segment |
715 | * and is terminated with a -1. This one can be relocated in place. | 703 | * and is terminated with a -1. This one can be relocated in place. |
716 | * The second is the extra relocation entries tacked after the image's | 704 | * The second is the extra relocation entries tacked after the image's |
717 | * data segment. These require a little more processing as the entry is | 705 | * data segment. These require a little more processing as the entry is |
718 | * really an offset into the image which contains an offset into the | 706 | * really an offset into the image which contains an offset into the |
719 | * image. | 707 | * image. |
720 | */ | 708 | */ |
721 | if (flags & FLAT_FLAG_GOTPIC) { | 709 | if (flags & FLAT_FLAG_GOTPIC) { |
722 | for (rp = (unsigned long *)datapos; *rp != 0xffffffff; rp++) { | 710 | for (rp = (unsigned long *)datapos; *rp != 0xffffffff; rp++) { |
723 | unsigned long addr; | 711 | unsigned long addr; |
724 | if (*rp) { | 712 | if (*rp) { |
725 | addr = calc_reloc(*rp, libinfo, id, 0); | 713 | addr = calc_reloc(*rp, libinfo, id, 0); |
726 | if (addr == RELOC_FAILED) { | 714 | if (addr == RELOC_FAILED) { |
727 | ret = -ENOEXEC; | 715 | ret = -ENOEXEC; |
728 | goto err; | 716 | goto err; |
729 | } | 717 | } |
730 | *rp = addr; | 718 | *rp = addr; |
731 | } | 719 | } |
732 | } | 720 | } |
733 | } | 721 | } |
734 | 722 | ||
735 | /* | 723 | /* |
736 | * Now run through the relocation entries. | 724 | * Now run through the relocation entries. |
737 | * We've got to be careful here as C++ produces relocatable zero | 725 | * We've got to be careful here as C++ produces relocatable zero |
738 | * entries in the constructor and destructor tables which are then | 726 | * entries in the constructor and destructor tables which are then |
739 | * tested for being not zero (which will always occur unless we're | 727 | * tested for being not zero (which will always occur unless we're |
740 | * based from address zero). This causes an endless loop as __start | 728 | * based from address zero). This causes an endless loop as __start |
741 | * is at zero. The solution used is to not relocate zero addresses. | 729 | * is at zero. The solution used is to not relocate zero addresses. |
742 | * This has the negative side effect of not allowing a global data | 730 | * This has the negative side effect of not allowing a global data |
743 | * reference to be statically initialised to _stext (I've moved | 731 | * reference to be statically initialised to _stext (I've moved |
744 | * __start to address 4 so that is okay). | 732 | * __start to address 4 so that is okay). |
745 | */ | 733 | */ |
746 | if (rev > OLD_FLAT_VERSION) { | 734 | if (rev > OLD_FLAT_VERSION) { |
747 | unsigned long persistent = 0; | 735 | unsigned long persistent = 0; |
748 | for (i=0; i < relocs; i++) { | 736 | for (i=0; i < relocs; i++) { |
749 | unsigned long addr, relval; | 737 | unsigned long addr, relval; |
750 | 738 | ||
751 | /* Get the address of the pointer to be | 739 | /* Get the address of the pointer to be |
752 | relocated (of course, the address has to be | 740 | relocated (of course, the address has to be |
753 | relocated first). */ | 741 | relocated first). */ |
754 | relval = ntohl(reloc[i]); | 742 | relval = ntohl(reloc[i]); |
755 | if (flat_set_persistent (relval, &persistent)) | 743 | if (flat_set_persistent (relval, &persistent)) |
756 | continue; | 744 | continue; |
757 | addr = flat_get_relocate_addr(relval); | 745 | addr = flat_get_relocate_addr(relval); |
758 | rp = (unsigned long *) calc_reloc(addr, libinfo, id, 1); | 746 | rp = (unsigned long *) calc_reloc(addr, libinfo, id, 1); |
759 | if (rp == (unsigned long *)RELOC_FAILED) { | 747 | if (rp == (unsigned long *)RELOC_FAILED) { |
760 | ret = -ENOEXEC; | 748 | ret = -ENOEXEC; |
761 | goto err; | 749 | goto err; |
762 | } | 750 | } |
763 | 751 | ||
764 | /* Get the pointer's value. */ | 752 | /* Get the pointer's value. */ |
765 | addr = flat_get_addr_from_rp(rp, relval, flags, | 753 | addr = flat_get_addr_from_rp(rp, relval, flags, |
766 | &persistent); | 754 | &persistent); |
767 | if (addr != 0) { | 755 | if (addr != 0) { |
768 | /* | 756 | /* |
769 | * Do the relocation. PIC relocs in the data section are | 757 | * Do the relocation. PIC relocs in the data section are |
770 | * already in target order | 758 | * already in target order |
771 | */ | 759 | */ |
772 | if ((flags & FLAT_FLAG_GOTPIC) == 0) | 760 | if ((flags & FLAT_FLAG_GOTPIC) == 0) |
773 | addr = ntohl(addr); | 761 | addr = ntohl(addr); |
774 | addr = calc_reloc(addr, libinfo, id, 0); | 762 | addr = calc_reloc(addr, libinfo, id, 0); |
775 | if (addr == RELOC_FAILED) { | 763 | if (addr == RELOC_FAILED) { |
776 | ret = -ENOEXEC; | 764 | ret = -ENOEXEC; |
777 | goto err; | 765 | goto err; |
778 | } | 766 | } |
779 | 767 | ||
780 | /* Write back the relocated pointer. */ | 768 | /* Write back the relocated pointer. */ |
781 | flat_put_addr_at_rp(rp, addr, relval); | 769 | flat_put_addr_at_rp(rp, addr, relval); |
782 | } | 770 | } |
783 | } | 771 | } |
784 | } else { | 772 | } else { |
785 | for (i=0; i < relocs; i++) | 773 | for (i=0; i < relocs; i++) |
786 | old_reloc(ntohl(reloc[i])); | 774 | old_reloc(ntohl(reloc[i])); |
787 | } | 775 | } |
788 | 776 | ||
789 | flush_icache_range(start_code, end_code); | 777 | flush_icache_range(start_code, end_code); |
790 | 778 | ||
791 | /* zero the BSS, BRK and stack areas */ | 779 | /* zero the BSS, BRK and stack areas */ |
792 | memset((void*)(datapos + data_len), 0, bss_len + | 780 | memset((void*)(datapos + data_len), 0, bss_len + |
793 | (memp + kobjsize((void *) memp) - stack_len - /* end brk */ | 781 | (memp + memp_size - stack_len - /* end brk */ |
794 | libinfo->lib_list[id].start_brk) + /* start brk */ | 782 | libinfo->lib_list[id].start_brk) + /* start brk */ |
795 | stack_len); | 783 | stack_len); |
796 | 784 | ||
797 | return 0; | 785 | return 0; |
798 | err: | 786 | err: |
799 | return ret; | 787 | return ret; |
800 | } | 788 | } |
801 | 789 | ||
802 | 790 | ||
803 | /****************************************************************************/ | 791 | /****************************************************************************/ |
804 | #ifdef CONFIG_BINFMT_SHARED_FLAT | 792 | #ifdef CONFIG_BINFMT_SHARED_FLAT |
805 | 793 | ||
806 | /* | 794 | /* |
807 | * Load a shared library into memory. The library gets its own data | 795 | * Load a shared library into memory. The library gets its own data |
808 | * segment (including bss) but not argv/argc/environ. | 796 | * segment (including bss) but not argv/argc/environ. |
809 | */ | 797 | */ |
810 | 798 | ||
811 | static int load_flat_shared_library(int id, struct lib_info *libs) | 799 | static int load_flat_shared_library(int id, struct lib_info *libs) |
812 | { | 800 | { |
813 | struct linux_binprm bprm; | 801 | struct linux_binprm bprm; |
814 | int res; | 802 | int res; |
815 | char buf[16]; | 803 | char buf[16]; |
816 | 804 | ||
817 | /* Create the file name */ | 805 | /* Create the file name */ |
818 | sprintf(buf, "/lib/lib%d.so", id); | 806 | sprintf(buf, "/lib/lib%d.so", id); |
819 | 807 | ||
820 | /* Open the file up */ | 808 | /* Open the file up */ |
821 | bprm.filename = buf; | 809 | bprm.filename = buf; |
822 | bprm.file = open_exec(bprm.filename); | 810 | bprm.file = open_exec(bprm.filename); |
823 | res = PTR_ERR(bprm.file); | 811 | res = PTR_ERR(bprm.file); |
824 | if (IS_ERR(bprm.file)) | 812 | if (IS_ERR(bprm.file)) |
825 | return res; | 813 | return res; |
826 | 814 | ||
827 | res = prepare_binprm(&bprm); | 815 | res = prepare_binprm(&bprm); |
828 | 816 | ||
829 | if (res <= (unsigned long)-4096) | 817 | if (res <= (unsigned long)-4096) |
830 | res = load_flat_file(&bprm, libs, id, NULL); | 818 | res = load_flat_file(&bprm, libs, id, NULL); |
831 | if (bprm.file) { | 819 | if (bprm.file) { |
832 | allow_write_access(bprm.file); | 820 | allow_write_access(bprm.file); |
833 | fput(bprm.file); | 821 | fput(bprm.file); |
834 | bprm.file = NULL; | 822 | bprm.file = NULL; |
835 | } | 823 | } |
836 | return(res); | 824 | return(res); |
837 | } | 825 | } |
838 | 826 | ||
839 | #endif /* CONFIG_BINFMT_SHARED_FLAT */ | 827 | #endif /* CONFIG_BINFMT_SHARED_FLAT */ |
840 | /****************************************************************************/ | 828 | /****************************************************************************/ |
841 | 829 | ||
842 | /* | 830 | /* |
843 | * These are the functions used to load flat style executables and shared | 831 | * These are the functions used to load flat style executables and shared |
844 | * libraries. There is no binary dependent code anywhere else. | 832 | * libraries. There is no binary dependent code anywhere else. |
845 | */ | 833 | */ |
846 | 834 | ||
847 | static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs) | 835 | static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs) |
848 | { | 836 | { |
849 | struct lib_info libinfo; | 837 | struct lib_info libinfo; |
850 | unsigned long p = bprm->p; | 838 | unsigned long p = bprm->p; |
851 | unsigned long stack_len; | 839 | unsigned long stack_len; |
852 | unsigned long start_addr; | 840 | unsigned long start_addr; |
853 | unsigned long *sp; | 841 | unsigned long *sp; |
854 | int res; | 842 | int res; |
855 | int i, j; | 843 | int i, j; |
856 | 844 | ||
857 | memset(&libinfo, 0, sizeof(libinfo)); | 845 | memset(&libinfo, 0, sizeof(libinfo)); |
858 | /* | 846 | /* |
859 | * We have to add the size of our arguments to our stack size | 847 | * We have to add the size of our arguments to our stack size |
860 | * otherwise it's too easy for users to create stack overflows | 848 | * otherwise it's too easy for users to create stack overflows |
861 | * by passing in a huge argument list. And yes, we have to be | 849 | * by passing in a huge argument list. And yes, we have to be |
862 | * pedantic and include space for the argv/envp array as it may have | 850 | * pedantic and include space for the argv/envp array as it may have |
863 | * a lot of entries. | 851 | * a lot of entries. |
864 | */ | 852 | */ |
865 | #define TOP_OF_ARGS (PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *)) | 853 | #define TOP_OF_ARGS (PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *)) |
866 | stack_len = TOP_OF_ARGS - bprm->p; /* the strings */ | 854 | stack_len = TOP_OF_ARGS - bprm->p; /* the strings */ |
867 | stack_len += (bprm->argc + 1) * sizeof(char *); /* the argv array */ | 855 | stack_len += (bprm->argc + 1) * sizeof(char *); /* the argv array */ |
868 | stack_len += (bprm->envc + 1) * sizeof(char *); /* the envp array */ | 856 | stack_len += (bprm->envc + 1) * sizeof(char *); /* the envp array */ |
869 | 857 | ||
870 | 858 | ||
871 | res = load_flat_file(bprm, &libinfo, 0, &stack_len); | 859 | res = load_flat_file(bprm, &libinfo, 0, &stack_len); |
872 | if (res > (unsigned long)-4096) | 860 | if (res > (unsigned long)-4096) |
873 | return res; | 861 | return res; |
874 | 862 | ||
875 | /* Update data segment pointers for all libraries */ | 863 | /* Update data segment pointers for all libraries */ |
876 | for (i=0; i<MAX_SHARED_LIBS; i++) | 864 | for (i=0; i<MAX_SHARED_LIBS; i++) |
877 | if (libinfo.lib_list[i].loaded) | 865 | if (libinfo.lib_list[i].loaded) |
878 | for (j=0; j<MAX_SHARED_LIBS; j++) | 866 | for (j=0; j<MAX_SHARED_LIBS; j++) |
879 | (-(j+1))[(unsigned long *)(libinfo.lib_list[i].start_data)] = | 867 | (-(j+1))[(unsigned long *)(libinfo.lib_list[i].start_data)] = |
880 | (libinfo.lib_list[j].loaded)? | 868 | (libinfo.lib_list[j].loaded)? |
881 | libinfo.lib_list[j].start_data:UNLOADED_LIB; | 869 | libinfo.lib_list[j].start_data:UNLOADED_LIB; |
882 | 870 | ||
883 | install_exec_creds(bprm); | 871 | install_exec_creds(bprm); |
884 | current->flags &= ~PF_FORKNOEXEC; | 872 | current->flags &= ~PF_FORKNOEXEC; |
885 | 873 | ||
886 | set_binfmt(&flat_format); | 874 | set_binfmt(&flat_format); |
887 | 875 | ||
888 | p = ((current->mm->context.end_brk + stack_len + 3) & ~3) - 4; | 876 | p = ((current->mm->context.end_brk + stack_len + 3) & ~3) - 4; |
889 | DBG_FLT("p=%x\n", (int)p); | 877 | DBG_FLT("p=%x\n", (int)p); |
890 | 878 | ||
891 | /* copy the arg pages onto the stack, this could be more efficient :-) */ | 879 | /* copy the arg pages onto the stack, this could be more efficient :-) */ |
892 | for (i = TOP_OF_ARGS - 1; i >= bprm->p; i--) | 880 | for (i = TOP_OF_ARGS - 1; i >= bprm->p; i--) |
893 | * (char *) --p = | 881 | * (char *) --p = |
894 | ((char *) page_address(bprm->page[i/PAGE_SIZE]))[i % PAGE_SIZE]; | 882 | ((char *) page_address(bprm->page[i/PAGE_SIZE]))[i % PAGE_SIZE]; |
895 | 883 | ||
896 | sp = (unsigned long *) create_flat_tables(p, bprm); | 884 | sp = (unsigned long *) create_flat_tables(p, bprm); |
897 | 885 | ||
898 | /* Fake some return addresses to ensure the call chain will | 886 | /* Fake some return addresses to ensure the call chain will |
899 | * initialise library in order for us. We are required to call | 887 | * initialise library in order for us. We are required to call |
900 | * lib 1 first, then 2, ... and finally the main program (id 0). | 888 | * lib 1 first, then 2, ... and finally the main program (id 0). |
901 | */ | 889 | */ |
902 | start_addr = libinfo.lib_list[0].entry; | 890 | start_addr = libinfo.lib_list[0].entry; |
903 | 891 | ||
904 | #ifdef CONFIG_BINFMT_SHARED_FLAT | 892 | #ifdef CONFIG_BINFMT_SHARED_FLAT |
905 | for (i = MAX_SHARED_LIBS-1; i>0; i--) { | 893 | for (i = MAX_SHARED_LIBS-1; i>0; i--) { |
906 | if (libinfo.lib_list[i].loaded) { | 894 | if (libinfo.lib_list[i].loaded) { |
907 | /* Push previous first to call address */ | 895 | /* Push previous first to call address */ |
908 | --sp; put_user(start_addr, sp); | 896 | --sp; put_user(start_addr, sp); |
909 | start_addr = libinfo.lib_list[i].entry; | 897 | start_addr = libinfo.lib_list[i].entry; |
910 | } | 898 | } |
911 | } | 899 | } |
912 | #endif | 900 | #endif |
913 | 901 | ||
914 | /* Stash our initial stack pointer into the mm structure */ | 902 | /* Stash our initial stack pointer into the mm structure */ |
915 | current->mm->start_stack = (unsigned long )sp; | 903 | current->mm->start_stack = (unsigned long )sp; |
916 | 904 | ||
917 | #ifdef FLAT_PLAT_INIT | 905 | #ifdef FLAT_PLAT_INIT |
918 | FLAT_PLAT_INIT(regs); | 906 | FLAT_PLAT_INIT(regs); |
919 | #endif | 907 | #endif |
920 | DBG_FLT("start_thread(regs=0x%x, entry=0x%x, start_stack=0x%x)\n", | 908 | DBG_FLT("start_thread(regs=0x%x, entry=0x%x, start_stack=0x%x)\n", |
921 | (int)regs, (int)start_addr, (int)current->mm->start_stack); | 909 | (int)regs, (int)start_addr, (int)current->mm->start_stack); |
922 | 910 | ||
923 | start_thread(regs, start_addr, current->mm->start_stack); | 911 | start_thread(regs, start_addr, current->mm->start_stack); |
924 | 912 | ||
925 | return 0; | 913 | return 0; |
926 | } | 914 | } |
927 | 915 | ||
928 | /****************************************************************************/ | 916 | /****************************************************************************/ |
929 | 917 | ||
930 | static int __init init_flat_binfmt(void) | 918 | static int __init init_flat_binfmt(void) |
931 | { | 919 | { |
932 | return register_binfmt(&flat_format); | 920 | return register_binfmt(&flat_format); |
933 | } | 921 | } |
934 | 922 | ||
935 | /****************************************************************************/ | 923 | /****************************************************************************/ |
936 | 924 |
fs/proc/internal.h
1 | /* internal.h: internal procfs definitions | 1 | /* internal.h: internal procfs definitions |
2 | * | 2 | * |
3 | * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. | 3 | * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. |
4 | * Written by David Howells (dhowells@redhat.com) | 4 | * Written by David Howells (dhowells@redhat.com) |
5 | * | 5 | * |
6 | * This program is free software; you can redistribute it and/or | 6 | * This program is free software; you can redistribute it and/or |
7 | * modify it under the terms of the GNU General Public License | 7 | * modify it under the terms of the GNU General Public License |
8 | * as published by the Free Software Foundation; either version | 8 | * as published by the Free Software Foundation; either version |
9 | * 2 of the License, or (at your option) any later version. | 9 | * 2 of the License, or (at your option) any later version. |
10 | */ | 10 | */ |
11 | 11 | ||
12 | #include <linux/proc_fs.h> | 12 | #include <linux/proc_fs.h> |
13 | 13 | ||
14 | extern struct proc_dir_entry proc_root; | 14 | extern struct proc_dir_entry proc_root; |
15 | #ifdef CONFIG_PROC_SYSCTL | 15 | #ifdef CONFIG_PROC_SYSCTL |
16 | extern int proc_sys_init(void); | 16 | extern int proc_sys_init(void); |
17 | #else | 17 | #else |
18 | static inline void proc_sys_init(void) { } | 18 | static inline void proc_sys_init(void) { } |
19 | #endif | 19 | #endif |
20 | #ifdef CONFIG_NET | 20 | #ifdef CONFIG_NET |
21 | extern int proc_net_init(void); | 21 | extern int proc_net_init(void); |
22 | #else | 22 | #else |
23 | static inline int proc_net_init(void) { return 0; } | 23 | static inline int proc_net_init(void) { return 0; } |
24 | #endif | 24 | #endif |
25 | 25 | ||
26 | struct vmalloc_info { | 26 | struct vmalloc_info { |
27 | unsigned long used; | 27 | unsigned long used; |
28 | unsigned long largest_chunk; | 28 | unsigned long largest_chunk; |
29 | }; | 29 | }; |
30 | 30 | ||
31 | extern struct mm_struct *mm_for_maps(struct task_struct *); | 31 | extern struct mm_struct *mm_for_maps(struct task_struct *); |
32 | 32 | ||
33 | #ifdef CONFIG_MMU | 33 | #ifdef CONFIG_MMU |
34 | #define VMALLOC_TOTAL (VMALLOC_END - VMALLOC_START) | 34 | #define VMALLOC_TOTAL (VMALLOC_END - VMALLOC_START) |
35 | extern void get_vmalloc_info(struct vmalloc_info *vmi); | 35 | extern void get_vmalloc_info(struct vmalloc_info *vmi); |
36 | #else | 36 | #else |
37 | 37 | ||
38 | #define VMALLOC_TOTAL 0UL | 38 | #define VMALLOC_TOTAL 0UL |
39 | #define get_vmalloc_info(vmi) \ | 39 | #define get_vmalloc_info(vmi) \ |
40 | do { \ | 40 | do { \ |
41 | (vmi)->used = 0; \ | 41 | (vmi)->used = 0; \ |
42 | (vmi)->largest_chunk = 0; \ | 42 | (vmi)->largest_chunk = 0; \ |
43 | } while(0) | 43 | } while(0) |
44 | |||
45 | extern int nommu_vma_show(struct seq_file *, struct vm_area_struct *); | ||
46 | #endif | 44 | #endif |
47 | 45 | ||
48 | extern int proc_tid_stat(struct seq_file *m, struct pid_namespace *ns, | 46 | extern int proc_tid_stat(struct seq_file *m, struct pid_namespace *ns, |
49 | struct pid *pid, struct task_struct *task); | 47 | struct pid *pid, struct task_struct *task); |
50 | extern int proc_tgid_stat(struct seq_file *m, struct pid_namespace *ns, | 48 | extern int proc_tgid_stat(struct seq_file *m, struct pid_namespace *ns, |
51 | struct pid *pid, struct task_struct *task); | 49 | struct pid *pid, struct task_struct *task); |
52 | extern int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, | 50 | extern int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, |
53 | struct pid *pid, struct task_struct *task); | 51 | struct pid *pid, struct task_struct *task); |
54 | extern int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns, | 52 | extern int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns, |
55 | struct pid *pid, struct task_struct *task); | 53 | struct pid *pid, struct task_struct *task); |
56 | extern loff_t mem_lseek(struct file *file, loff_t offset, int orig); | 54 | extern loff_t mem_lseek(struct file *file, loff_t offset, int orig); |
57 | 55 | ||
58 | extern const struct file_operations proc_maps_operations; | 56 | extern const struct file_operations proc_maps_operations; |
59 | extern const struct file_operations proc_numa_maps_operations; | 57 | extern const struct file_operations proc_numa_maps_operations; |
60 | extern const struct file_operations proc_smaps_operations; | 58 | extern const struct file_operations proc_smaps_operations; |
61 | extern const struct file_operations proc_clear_refs_operations; | 59 | extern const struct file_operations proc_clear_refs_operations; |
62 | extern const struct file_operations proc_pagemap_operations; | 60 | extern const struct file_operations proc_pagemap_operations; |
63 | extern const struct file_operations proc_net_operations; | 61 | extern const struct file_operations proc_net_operations; |
64 | extern const struct inode_operations proc_net_inode_operations; | 62 | extern const struct inode_operations proc_net_inode_operations; |
65 | 63 | ||
66 | void free_proc_entry(struct proc_dir_entry *de); | 64 | void free_proc_entry(struct proc_dir_entry *de); |
67 | 65 | ||
68 | void proc_init_inodecache(void); | 66 | void proc_init_inodecache(void); |
69 | 67 | ||
70 | static inline struct pid *proc_pid(struct inode *inode) | 68 | static inline struct pid *proc_pid(struct inode *inode) |
71 | { | 69 | { |
72 | return PROC_I(inode)->pid; | 70 | return PROC_I(inode)->pid; |
73 | } | 71 | } |
74 | 72 | ||
75 | static inline struct task_struct *get_proc_task(struct inode *inode) | 73 | static inline struct task_struct *get_proc_task(struct inode *inode) |
76 | { | 74 | { |
77 | return get_pid_task(proc_pid(inode), PIDTYPE_PID); | 75 | return get_pid_task(proc_pid(inode), PIDTYPE_PID); |
78 | } | 76 | } |
79 | 77 | ||
80 | static inline int proc_fd(struct inode *inode) | 78 | static inline int proc_fd(struct inode *inode) |
81 | { | 79 | { |
82 | return PROC_I(inode)->fd; | 80 | return PROC_I(inode)->fd; |
83 | } | 81 | } |
84 | 82 | ||
85 | struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *ino, | 83 | struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *ino, |
86 | struct dentry *dentry); | 84 | struct dentry *dentry); |
87 | int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent, | 85 | int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent, |
88 | filldir_t filldir); | 86 | filldir_t filldir); |
89 | 87 | ||
90 | struct pde_opener { | 88 | struct pde_opener { |
91 | struct inode *inode; | 89 | struct inode *inode; |
92 | struct file *file; | 90 | struct file *file; |
93 | int (*release)(struct inode *, struct file *); | 91 | int (*release)(struct inode *, struct file *); |
94 | struct list_head lh; | 92 | struct list_head lh; |
95 | }; | 93 | }; |
96 | 94 |
fs/proc/meminfo.c
1 | #include <linux/fs.h> | 1 | #include <linux/fs.h> |
2 | #include <linux/hugetlb.h> | 2 | #include <linux/hugetlb.h> |
3 | #include <linux/init.h> | 3 | #include <linux/init.h> |
4 | #include <linux/kernel.h> | 4 | #include <linux/kernel.h> |
5 | #include <linux/mm.h> | 5 | #include <linux/mm.h> |
6 | #include <linux/mman.h> | 6 | #include <linux/mman.h> |
7 | #include <linux/mmzone.h> | 7 | #include <linux/mmzone.h> |
8 | #include <linux/proc_fs.h> | 8 | #include <linux/proc_fs.h> |
9 | #include <linux/quicklist.h> | 9 | #include <linux/quicklist.h> |
10 | #include <linux/seq_file.h> | 10 | #include <linux/seq_file.h> |
11 | #include <linux/swap.h> | 11 | #include <linux/swap.h> |
12 | #include <linux/vmstat.h> | 12 | #include <linux/vmstat.h> |
13 | #include <asm/atomic.h> | 13 | #include <asm/atomic.h> |
14 | #include <asm/page.h> | 14 | #include <asm/page.h> |
15 | #include <asm/pgtable.h> | 15 | #include <asm/pgtable.h> |
16 | #include "internal.h" | 16 | #include "internal.h" |
17 | 17 | ||
18 | void __attribute__((weak)) arch_report_meminfo(struct seq_file *m) | 18 | void __attribute__((weak)) arch_report_meminfo(struct seq_file *m) |
19 | { | 19 | { |
20 | } | 20 | } |
21 | 21 | ||
22 | static int meminfo_proc_show(struct seq_file *m, void *v) | 22 | static int meminfo_proc_show(struct seq_file *m, void *v) |
23 | { | 23 | { |
24 | struct sysinfo i; | 24 | struct sysinfo i; |
25 | unsigned long committed; | 25 | unsigned long committed; |
26 | unsigned long allowed; | 26 | unsigned long allowed; |
27 | struct vmalloc_info vmi; | 27 | struct vmalloc_info vmi; |
28 | long cached; | 28 | long cached; |
29 | unsigned long pages[NR_LRU_LISTS]; | 29 | unsigned long pages[NR_LRU_LISTS]; |
30 | int lru; | 30 | int lru; |
31 | 31 | ||
32 | /* | 32 | /* |
33 | * display in kilobytes. | 33 | * display in kilobytes. |
34 | */ | 34 | */ |
35 | #define K(x) ((x) << (PAGE_SHIFT - 10)) | 35 | #define K(x) ((x) << (PAGE_SHIFT - 10)) |
36 | si_meminfo(&i); | 36 | si_meminfo(&i); |
37 | si_swapinfo(&i); | 37 | si_swapinfo(&i); |
38 | committed = atomic_long_read(&vm_committed_space); | 38 | committed = atomic_long_read(&vm_committed_space); |
39 | allowed = ((totalram_pages - hugetlb_total_pages()) | 39 | allowed = ((totalram_pages - hugetlb_total_pages()) |
40 | * sysctl_overcommit_ratio / 100) + total_swap_pages; | 40 | * sysctl_overcommit_ratio / 100) + total_swap_pages; |
41 | 41 | ||
42 | cached = global_page_state(NR_FILE_PAGES) - | 42 | cached = global_page_state(NR_FILE_PAGES) - |
43 | total_swapcache_pages - i.bufferram; | 43 | total_swapcache_pages - i.bufferram; |
44 | if (cached < 0) | 44 | if (cached < 0) |
45 | cached = 0; | 45 | cached = 0; |
46 | 46 | ||
47 | get_vmalloc_info(&vmi); | 47 | get_vmalloc_info(&vmi); |
48 | 48 | ||
49 | for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) | 49 | for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) |
50 | pages[lru] = global_page_state(NR_LRU_BASE + lru); | 50 | pages[lru] = global_page_state(NR_LRU_BASE + lru); |
51 | 51 | ||
52 | /* | 52 | /* |
53 | * Tagged format, for easy grepping and expansion. | 53 | * Tagged format, for easy grepping and expansion. |
54 | */ | 54 | */ |
55 | seq_printf(m, | 55 | seq_printf(m, |
56 | "MemTotal: %8lu kB\n" | 56 | "MemTotal: %8lu kB\n" |
57 | "MemFree: %8lu kB\n" | 57 | "MemFree: %8lu kB\n" |
58 | "Buffers: %8lu kB\n" | 58 | "Buffers: %8lu kB\n" |
59 | "Cached: %8lu kB\n" | 59 | "Cached: %8lu kB\n" |
60 | "SwapCached: %8lu kB\n" | 60 | "SwapCached: %8lu kB\n" |
61 | "Active: %8lu kB\n" | 61 | "Active: %8lu kB\n" |
62 | "Inactive: %8lu kB\n" | 62 | "Inactive: %8lu kB\n" |
63 | "Active(anon): %8lu kB\n" | 63 | "Active(anon): %8lu kB\n" |
64 | "Inactive(anon): %8lu kB\n" | 64 | "Inactive(anon): %8lu kB\n" |
65 | "Active(file): %8lu kB\n" | 65 | "Active(file): %8lu kB\n" |
66 | "Inactive(file): %8lu kB\n" | 66 | "Inactive(file): %8lu kB\n" |
67 | #ifdef CONFIG_UNEVICTABLE_LRU | 67 | #ifdef CONFIG_UNEVICTABLE_LRU |
68 | "Unevictable: %8lu kB\n" | 68 | "Unevictable: %8lu kB\n" |
69 | "Mlocked: %8lu kB\n" | 69 | "Mlocked: %8lu kB\n" |
70 | #endif | 70 | #endif |
71 | #ifdef CONFIG_HIGHMEM | 71 | #ifdef CONFIG_HIGHMEM |
72 | "HighTotal: %8lu kB\n" | 72 | "HighTotal: %8lu kB\n" |
73 | "HighFree: %8lu kB\n" | 73 | "HighFree: %8lu kB\n" |
74 | "LowTotal: %8lu kB\n" | 74 | "LowTotal: %8lu kB\n" |
75 | "LowFree: %8lu kB\n" | 75 | "LowFree: %8lu kB\n" |
76 | #endif | 76 | #endif |
77 | #ifndef CONFIG_MMU | ||
78 | "MmapCopy: %8lu kB\n" | ||
79 | #endif | ||
77 | "SwapTotal: %8lu kB\n" | 80 | "SwapTotal: %8lu kB\n" |
78 | "SwapFree: %8lu kB\n" | 81 | "SwapFree: %8lu kB\n" |
79 | "Dirty: %8lu kB\n" | 82 | "Dirty: %8lu kB\n" |
80 | "Writeback: %8lu kB\n" | 83 | "Writeback: %8lu kB\n" |
81 | "AnonPages: %8lu kB\n" | 84 | "AnonPages: %8lu kB\n" |
82 | "Mapped: %8lu kB\n" | 85 | "Mapped: %8lu kB\n" |
83 | "Slab: %8lu kB\n" | 86 | "Slab: %8lu kB\n" |
84 | "SReclaimable: %8lu kB\n" | 87 | "SReclaimable: %8lu kB\n" |
85 | "SUnreclaim: %8lu kB\n" | 88 | "SUnreclaim: %8lu kB\n" |
86 | "PageTables: %8lu kB\n" | 89 | "PageTables: %8lu kB\n" |
87 | #ifdef CONFIG_QUICKLIST | 90 | #ifdef CONFIG_QUICKLIST |
88 | "Quicklists: %8lu kB\n" | 91 | "Quicklists: %8lu kB\n" |
89 | #endif | 92 | #endif |
90 | "NFS_Unstable: %8lu kB\n" | 93 | "NFS_Unstable: %8lu kB\n" |
91 | "Bounce: %8lu kB\n" | 94 | "Bounce: %8lu kB\n" |
92 | "WritebackTmp: %8lu kB\n" | 95 | "WritebackTmp: %8lu kB\n" |
93 | "CommitLimit: %8lu kB\n" | 96 | "CommitLimit: %8lu kB\n" |
94 | "Committed_AS: %8lu kB\n" | 97 | "Committed_AS: %8lu kB\n" |
95 | "VmallocTotal: %8lu kB\n" | 98 | "VmallocTotal: %8lu kB\n" |
96 | "VmallocUsed: %8lu kB\n" | 99 | "VmallocUsed: %8lu kB\n" |
97 | "VmallocChunk: %8lu kB\n", | 100 | "VmallocChunk: %8lu kB\n", |
98 | K(i.totalram), | 101 | K(i.totalram), |
99 | K(i.freeram), | 102 | K(i.freeram), |
100 | K(i.bufferram), | 103 | K(i.bufferram), |
101 | K(cached), | 104 | K(cached), |
102 | K(total_swapcache_pages), | 105 | K(total_swapcache_pages), |
103 | K(pages[LRU_ACTIVE_ANON] + pages[LRU_ACTIVE_FILE]), | 106 | K(pages[LRU_ACTIVE_ANON] + pages[LRU_ACTIVE_FILE]), |
104 | K(pages[LRU_INACTIVE_ANON] + pages[LRU_INACTIVE_FILE]), | 107 | K(pages[LRU_INACTIVE_ANON] + pages[LRU_INACTIVE_FILE]), |
105 | K(pages[LRU_ACTIVE_ANON]), | 108 | K(pages[LRU_ACTIVE_ANON]), |
106 | K(pages[LRU_INACTIVE_ANON]), | 109 | K(pages[LRU_INACTIVE_ANON]), |
107 | K(pages[LRU_ACTIVE_FILE]), | 110 | K(pages[LRU_ACTIVE_FILE]), |
108 | K(pages[LRU_INACTIVE_FILE]), | 111 | K(pages[LRU_INACTIVE_FILE]), |
109 | #ifdef CONFIG_UNEVICTABLE_LRU | 112 | #ifdef CONFIG_UNEVICTABLE_LRU |
110 | K(pages[LRU_UNEVICTABLE]), | 113 | K(pages[LRU_UNEVICTABLE]), |
111 | K(global_page_state(NR_MLOCK)), | 114 | K(global_page_state(NR_MLOCK)), |
112 | #endif | 115 | #endif |
113 | #ifdef CONFIG_HIGHMEM | 116 | #ifdef CONFIG_HIGHMEM |
114 | K(i.totalhigh), | 117 | K(i.totalhigh), |
115 | K(i.freehigh), | 118 | K(i.freehigh), |
116 | K(i.totalram-i.totalhigh), | 119 | K(i.totalram-i.totalhigh), |
117 | K(i.freeram-i.freehigh), | 120 | K(i.freeram-i.freehigh), |
121 | #endif | ||
122 | #ifndef CONFIG_MMU | ||
123 | K((unsigned long) atomic_read(&mmap_pages_allocated)), | ||
118 | #endif | 124 | #endif |
119 | K(i.totalswap), | 125 | K(i.totalswap), |
120 | K(i.freeswap), | 126 | K(i.freeswap), |
121 | K(global_page_state(NR_FILE_DIRTY)), | 127 | K(global_page_state(NR_FILE_DIRTY)), |
122 | K(global_page_state(NR_WRITEBACK)), | 128 | K(global_page_state(NR_WRITEBACK)), |
123 | K(global_page_state(NR_ANON_PAGES)), | 129 | K(global_page_state(NR_ANON_PAGES)), |
124 | K(global_page_state(NR_FILE_MAPPED)), | 130 | K(global_page_state(NR_FILE_MAPPED)), |
125 | K(global_page_state(NR_SLAB_RECLAIMABLE) + | 131 | K(global_page_state(NR_SLAB_RECLAIMABLE) + |
126 | global_page_state(NR_SLAB_UNRECLAIMABLE)), | 132 | global_page_state(NR_SLAB_UNRECLAIMABLE)), |
127 | K(global_page_state(NR_SLAB_RECLAIMABLE)), | 133 | K(global_page_state(NR_SLAB_RECLAIMABLE)), |
128 | K(global_page_state(NR_SLAB_UNRECLAIMABLE)), | 134 | K(global_page_state(NR_SLAB_UNRECLAIMABLE)), |
129 | K(global_page_state(NR_PAGETABLE)), | 135 | K(global_page_state(NR_PAGETABLE)), |
130 | #ifdef CONFIG_QUICKLIST | 136 | #ifdef CONFIG_QUICKLIST |
131 | K(quicklist_total_size()), | 137 | K(quicklist_total_size()), |
132 | #endif | 138 | #endif |
133 | K(global_page_state(NR_UNSTABLE_NFS)), | 139 | K(global_page_state(NR_UNSTABLE_NFS)), |
134 | K(global_page_state(NR_BOUNCE)), | 140 | K(global_page_state(NR_BOUNCE)), |
135 | K(global_page_state(NR_WRITEBACK_TEMP)), | 141 | K(global_page_state(NR_WRITEBACK_TEMP)), |
136 | K(allowed), | 142 | K(allowed), |
137 | K(committed), | 143 | K(committed), |
138 | (unsigned long)VMALLOC_TOTAL >> 10, | 144 | (unsigned long)VMALLOC_TOTAL >> 10, |
139 | vmi.used >> 10, | 145 | vmi.used >> 10, |
140 | vmi.largest_chunk >> 10 | 146 | vmi.largest_chunk >> 10 |
141 | ); | 147 | ); |
142 | 148 | ||
143 | hugetlb_report_meminfo(m); | 149 | hugetlb_report_meminfo(m); |
144 | 150 | ||
145 | arch_report_meminfo(m); | 151 | arch_report_meminfo(m); |
146 | 152 | ||
147 | return 0; | 153 | return 0; |
148 | #undef K | 154 | #undef K |
149 | } | 155 | } |
150 | 156 | ||
151 | static int meminfo_proc_open(struct inode *inode, struct file *file) | 157 | static int meminfo_proc_open(struct inode *inode, struct file *file) |
152 | { | 158 | { |
153 | return single_open(file, meminfo_proc_show, NULL); | 159 | return single_open(file, meminfo_proc_show, NULL); |
154 | } | 160 | } |
155 | 161 | ||
156 | static const struct file_operations meminfo_proc_fops = { | 162 | static const struct file_operations meminfo_proc_fops = { |
157 | .open = meminfo_proc_open, | 163 | .open = meminfo_proc_open, |
158 | .read = seq_read, | 164 | .read = seq_read, |
159 | .llseek = seq_lseek, | 165 | .llseek = seq_lseek, |
160 | .release = single_release, | 166 | .release = single_release, |
161 | }; | 167 | }; |
162 | 168 | ||
163 | static int __init proc_meminfo_init(void) | 169 | static int __init proc_meminfo_init(void) |
164 | { | 170 | { |
165 | proc_create("meminfo", 0, NULL, &meminfo_proc_fops); | 171 | proc_create("meminfo", 0, NULL, &meminfo_proc_fops); |
166 | return 0; | 172 | return 0; |
167 | } | 173 | } |
168 | module_init(proc_meminfo_init); | 174 | module_init(proc_meminfo_init); |
169 | 175 |
fs/proc/nommu.c
1 | /* nommu.c: mmu-less memory info files | 1 | /* nommu.c: mmu-less memory info files |
2 | * | 2 | * |
3 | * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. | 3 | * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. |
4 | * Written by David Howells (dhowells@redhat.com) | 4 | * Written by David Howells (dhowells@redhat.com) |
5 | * | 5 | * |
6 | * This program is free software; you can redistribute it and/or | 6 | * This program is free software; you can redistribute it and/or |
7 | * modify it under the terms of the GNU General Public License | 7 | * modify it under the terms of the GNU General Public License |
8 | * as published by the Free Software Foundation; either version | 8 | * as published by the Free Software Foundation; either version |
9 | * 2 of the License, or (at your option) any later version. | 9 | * 2 of the License, or (at your option) any later version. |
10 | */ | 10 | */ |
11 | 11 | ||
12 | #include <linux/init.h> | 12 | #include <linux/init.h> |
13 | #include <linux/module.h> | 13 | #include <linux/module.h> |
14 | #include <linux/errno.h> | 14 | #include <linux/errno.h> |
15 | #include <linux/time.h> | 15 | #include <linux/time.h> |
16 | #include <linux/kernel.h> | 16 | #include <linux/kernel.h> |
17 | #include <linux/string.h> | 17 | #include <linux/string.h> |
18 | #include <linux/mman.h> | 18 | #include <linux/mman.h> |
19 | #include <linux/proc_fs.h> | 19 | #include <linux/proc_fs.h> |
20 | #include <linux/mm.h> | 20 | #include <linux/mm.h> |
21 | #include <linux/mmzone.h> | 21 | #include <linux/mmzone.h> |
22 | #include <linux/pagemap.h> | 22 | #include <linux/pagemap.h> |
23 | #include <linux/swap.h> | 23 | #include <linux/swap.h> |
24 | #include <linux/slab.h> | 24 | #include <linux/slab.h> |
25 | #include <linux/smp.h> | 25 | #include <linux/smp.h> |
26 | #include <linux/seq_file.h> | 26 | #include <linux/seq_file.h> |
27 | #include <linux/hugetlb.h> | 27 | #include <linux/hugetlb.h> |
28 | #include <linux/vmalloc.h> | 28 | #include <linux/vmalloc.h> |
29 | #include <asm/uaccess.h> | 29 | #include <asm/uaccess.h> |
30 | #include <asm/pgtable.h> | 30 | #include <asm/pgtable.h> |
31 | #include <asm/tlb.h> | 31 | #include <asm/tlb.h> |
32 | #include <asm/div64.h> | 32 | #include <asm/div64.h> |
33 | #include "internal.h" | 33 | #include "internal.h" |
34 | 34 | ||
35 | /* | 35 | /* |
36 | * display a single VMA to a sequenced file | 36 | * display a single region to a sequenced file |
37 | */ | 37 | */ |
38 | int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) | 38 | static int nommu_region_show(struct seq_file *m, struct vm_region *region) |
39 | { | 39 | { |
40 | unsigned long ino = 0; | 40 | unsigned long ino = 0; |
41 | struct file *file; | 41 | struct file *file; |
42 | dev_t dev = 0; | 42 | dev_t dev = 0; |
43 | int flags, len; | 43 | int flags, len; |
44 | 44 | ||
45 | flags = vma->vm_flags; | 45 | flags = region->vm_flags; |
46 | file = vma->vm_file; | 46 | file = region->vm_file; |
47 | 47 | ||
48 | if (file) { | 48 | if (file) { |
49 | struct inode *inode = vma->vm_file->f_path.dentry->d_inode; | 49 | struct inode *inode = region->vm_file->f_path.dentry->d_inode; |
50 | dev = inode->i_sb->s_dev; | 50 | dev = inode->i_sb->s_dev; |
51 | ino = inode->i_ino; | 51 | ino = inode->i_ino; |
52 | } | 52 | } |
53 | 53 | ||
54 | seq_printf(m, | 54 | seq_printf(m, |
55 | "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", | 55 | "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", |
56 | vma->vm_start, | 56 | region->vm_start, |
57 | vma->vm_end, | 57 | region->vm_end, |
58 | flags & VM_READ ? 'r' : '-', | 58 | flags & VM_READ ? 'r' : '-', |
59 | flags & VM_WRITE ? 'w' : '-', | 59 | flags & VM_WRITE ? 'w' : '-', |
60 | flags & VM_EXEC ? 'x' : '-', | 60 | flags & VM_EXEC ? 'x' : '-', |
61 | flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p', | 61 | flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p', |
62 | ((loff_t)vma->vm_pgoff) << PAGE_SHIFT, | 62 | ((loff_t)region->vm_pgoff) << PAGE_SHIFT, |
63 | MAJOR(dev), MINOR(dev), ino, &len); | 63 | MAJOR(dev), MINOR(dev), ino, &len); |
64 | 64 | ||
65 | if (file) { | 65 | if (file) { |
66 | len = 25 + sizeof(void *) * 6 - len; | 66 | len = 25 + sizeof(void *) * 6 - len; |
67 | if (len < 1) | 67 | if (len < 1) |
68 | len = 1; | 68 | len = 1; |
69 | seq_printf(m, "%*c", len, ' '); | 69 | seq_printf(m, "%*c", len, ' '); |
70 | seq_path(m, &file->f_path, ""); | 70 | seq_path(m, &file->f_path, ""); |
71 | } | 71 | } |
72 | 72 | ||
73 | seq_putc(m, '\n'); | 73 | seq_putc(m, '\n'); |
74 | return 0; | 74 | return 0; |
75 | } | 75 | } |
76 | 76 | ||
77 | /* | 77 | /* |
78 | * display a list of all the VMAs the kernel knows about | 78 | * display a list of all the REGIONs the kernel knows about |
79 | * - nommu kernels have a single flat list | 79 | * - nommu kernels have a single flat list |
80 | */ | 80 | */ |
81 | static int nommu_vma_list_show(struct seq_file *m, void *v) | 81 | static int nommu_region_list_show(struct seq_file *m, void *_p) |
82 | { | 82 | { |
83 | struct vm_area_struct *vma; | 83 | struct rb_node *p = _p; |
84 | 84 | ||
85 | vma = rb_entry((struct rb_node *) v, struct vm_area_struct, vm_rb); | 85 | return nommu_region_show(m, rb_entry(p, struct vm_region, vm_rb)); |
86 | return nommu_vma_show(m, vma); | ||
87 | } | 86 | } |
88 | 87 | ||
89 | static void *nommu_vma_list_start(struct seq_file *m, loff_t *_pos) | 88 | static void *nommu_region_list_start(struct seq_file *m, loff_t *_pos) |
90 | { | 89 | { |
91 | struct rb_node *_rb; | 90 | struct rb_node *p; |
92 | loff_t pos = *_pos; | 91 | loff_t pos = *_pos; |
93 | void *next = NULL; | ||
94 | 92 | ||
95 | down_read(&nommu_vma_sem); | 93 | down_read(&nommu_region_sem); |
96 | 94 | ||
97 | for (_rb = rb_first(&nommu_vma_tree); _rb; _rb = rb_next(_rb)) { | 95 | for (p = rb_first(&nommu_region_tree); p; p = rb_next(p)) |
98 | if (pos == 0) { | 96 | if (pos-- == 0) |
99 | next = _rb; | 97 | return p; |
100 | break; | 98 | return NULL; |
101 | } | ||
102 | pos--; | ||
103 | } | ||
104 | |||
105 | return next; | ||
106 | } | 99 | } |
107 | 100 | ||
108 | static void nommu_vma_list_stop(struct seq_file *m, void *v) | 101 | static void nommu_region_list_stop(struct seq_file *m, void *v) |
109 | { | 102 | { |
110 | up_read(&nommu_vma_sem); | 103 | up_read(&nommu_region_sem); |
111 | } | 104 | } |
112 | 105 | ||
113 | static void *nommu_vma_list_next(struct seq_file *m, void *v, loff_t *pos) | 106 | static void *nommu_region_list_next(struct seq_file *m, void *v, loff_t *pos) |
114 | { | 107 | { |
115 | (*pos)++; | 108 | (*pos)++; |
116 | return rb_next((struct rb_node *) v); | 109 | return rb_next((struct rb_node *) v); |
117 | } | 110 | } |
118 | 111 | ||
119 | static const struct seq_operations proc_nommu_vma_list_seqop = { | 112 | static struct seq_operations proc_nommu_region_list_seqop = { |
120 | .start = nommu_vma_list_start, | 113 | .start = nommu_region_list_start, |
121 | .next = nommu_vma_list_next, | 114 | .next = nommu_region_list_next, |
122 | .stop = nommu_vma_list_stop, | 115 | .stop = nommu_region_list_stop, |
123 | .show = nommu_vma_list_show | 116 | .show = nommu_region_list_show |
124 | }; | 117 | }; |
125 | 118 | ||
126 | static int proc_nommu_vma_list_open(struct inode *inode, struct file *file) | 119 | static int proc_nommu_region_list_open(struct inode *inode, struct file *file) |
127 | { | 120 | { |
128 | return seq_open(file, &proc_nommu_vma_list_seqop); | 121 | return seq_open(file, &proc_nommu_region_list_seqop); |
129 | } | 122 | } |
130 | 123 | ||
131 | static const struct file_operations proc_nommu_vma_list_operations = { | 124 | static const struct file_operations proc_nommu_region_list_operations = { |
132 | .open = proc_nommu_vma_list_open, | 125 | .open = proc_nommu_region_list_open, |
133 | .read = seq_read, | 126 | .read = seq_read, |
134 | .llseek = seq_lseek, | 127 | .llseek = seq_lseek, |
135 | .release = seq_release, | 128 | .release = seq_release, |
136 | }; | 129 | }; |
137 | 130 | ||
138 | static int __init proc_nommu_init(void) | 131 | static int __init proc_nommu_init(void) |
139 | { | 132 | { |
140 | proc_create("maps", S_IRUGO, NULL, &proc_nommu_vma_list_operations); | 133 | proc_create("maps", S_IRUGO, NULL, &proc_nommu_region_list_operations); |
141 | return 0; | 134 | return 0; |
142 | } | 135 | } |
143 | 136 | ||
144 | module_init(proc_nommu_init); | 137 | module_init(proc_nommu_init); |
145 | 138 |
fs/proc/task_nommu.c
1 | 1 | ||
2 | #include <linux/mm.h> | 2 | #include <linux/mm.h> |
3 | #include <linux/file.h> | 3 | #include <linux/file.h> |
4 | #include <linux/fdtable.h> | 4 | #include <linux/fdtable.h> |
5 | #include <linux/mount.h> | 5 | #include <linux/mount.h> |
6 | #include <linux/ptrace.h> | 6 | #include <linux/ptrace.h> |
7 | #include <linux/seq_file.h> | 7 | #include <linux/seq_file.h> |
8 | #include "internal.h" | 8 | #include "internal.h" |
9 | 9 | ||
10 | /* | 10 | /* |
11 | * Logic: we've got two memory sums for each process, "shared", and | 11 | * Logic: we've got two memory sums for each process, "shared", and |
12 | * "non-shared". Shared memory may get counted more than once, for | 12 | * "non-shared". Shared memory may get counted more than once, for |
13 | * each process that owns it. Non-shared memory is counted | 13 | * each process that owns it. Non-shared memory is counted |
14 | * accurately. | 14 | * accurately. |
15 | */ | 15 | */ |
16 | void task_mem(struct seq_file *m, struct mm_struct *mm) | 16 | void task_mem(struct seq_file *m, struct mm_struct *mm) |
17 | { | 17 | { |
18 | struct vm_list_struct *vml; | 18 | struct vm_area_struct *vma; |
19 | unsigned long bytes = 0, sbytes = 0, slack = 0; | 19 | struct vm_region *region; |
20 | struct rb_node *p; | ||
21 | unsigned long bytes = 0, sbytes = 0, slack = 0, size; | ||
20 | 22 | ||
21 | down_read(&mm->mmap_sem); | 23 | down_read(&mm->mmap_sem); |
22 | for (vml = mm->context.vmlist; vml; vml = vml->next) { | 24 | for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) { |
23 | if (!vml->vma) | 25 | vma = rb_entry(p, struct vm_area_struct, vm_rb); |
24 | continue; | ||
25 | 26 | ||
26 | bytes += kobjsize(vml); | 27 | bytes += kobjsize(vma); |
28 | |||
29 | region = vma->vm_region; | ||
30 | if (region) { | ||
31 | size = kobjsize(region); | ||
32 | size += region->vm_end - region->vm_start; | ||
33 | } else { | ||
34 | size = vma->vm_end - vma->vm_start; | ||
35 | } | ||
36 | |||
27 | if (atomic_read(&mm->mm_count) > 1 || | 37 | if (atomic_read(&mm->mm_count) > 1 || |
28 | atomic_read(&vml->vma->vm_usage) > 1 | 38 | vma->vm_flags & VM_MAYSHARE) { |
29 | ) { | 39 | sbytes += size; |
30 | sbytes += kobjsize((void *) vml->vma->vm_start); | ||
31 | sbytes += kobjsize(vml->vma); | ||
32 | } else { | 40 | } else { |
33 | bytes += kobjsize((void *) vml->vma->vm_start); | 41 | bytes += size; |
34 | bytes += kobjsize(vml->vma); | 42 | if (region) |
35 | slack += kobjsize((void *) vml->vma->vm_start) - | 43 | slack = region->vm_end - vma->vm_end; |
36 | (vml->vma->vm_end - vml->vma->vm_start); | ||
37 | } | 44 | } |
38 | } | 45 | } |
39 | 46 | ||
40 | if (atomic_read(&mm->mm_count) > 1) | 47 | if (atomic_read(&mm->mm_count) > 1) |
41 | sbytes += kobjsize(mm); | 48 | sbytes += kobjsize(mm); |
42 | else | 49 | else |
43 | bytes += kobjsize(mm); | 50 | bytes += kobjsize(mm); |
44 | 51 | ||
45 | if (current->fs && atomic_read(¤t->fs->count) > 1) | 52 | if (current->fs && atomic_read(¤t->fs->count) > 1) |
46 | sbytes += kobjsize(current->fs); | 53 | sbytes += kobjsize(current->fs); |
47 | else | 54 | else |
48 | bytes += kobjsize(current->fs); | 55 | bytes += kobjsize(current->fs); |
49 | 56 | ||
50 | if (current->files && atomic_read(¤t->files->count) > 1) | 57 | if (current->files && atomic_read(¤t->files->count) > 1) |
51 | sbytes += kobjsize(current->files); | 58 | sbytes += kobjsize(current->files); |
52 | else | 59 | else |
53 | bytes += kobjsize(current->files); | 60 | bytes += kobjsize(current->files); |
54 | 61 | ||
55 | if (current->sighand && atomic_read(¤t->sighand->count) > 1) | 62 | if (current->sighand && atomic_read(¤t->sighand->count) > 1) |
56 | sbytes += kobjsize(current->sighand); | 63 | sbytes += kobjsize(current->sighand); |
57 | else | 64 | else |
58 | bytes += kobjsize(current->sighand); | 65 | bytes += kobjsize(current->sighand); |
59 | 66 | ||
60 | bytes += kobjsize(current); /* includes kernel stack */ | 67 | bytes += kobjsize(current); /* includes kernel stack */ |
61 | 68 | ||
62 | seq_printf(m, | 69 | seq_printf(m, |
63 | "Mem:\t%8lu bytes\n" | 70 | "Mem:\t%8lu bytes\n" |
64 | "Slack:\t%8lu bytes\n" | 71 | "Slack:\t%8lu bytes\n" |
65 | "Shared:\t%8lu bytes\n", | 72 | "Shared:\t%8lu bytes\n", |
66 | bytes, slack, sbytes); | 73 | bytes, slack, sbytes); |
67 | 74 | ||
68 | up_read(&mm->mmap_sem); | 75 | up_read(&mm->mmap_sem); |
69 | } | 76 | } |
70 | 77 | ||
71 | unsigned long task_vsize(struct mm_struct *mm) | 78 | unsigned long task_vsize(struct mm_struct *mm) |
72 | { | 79 | { |
73 | struct vm_list_struct *tbp; | 80 | struct vm_area_struct *vma; |
81 | struct rb_node *p; | ||
74 | unsigned long vsize = 0; | 82 | unsigned long vsize = 0; |
75 | 83 | ||
76 | down_read(&mm->mmap_sem); | 84 | down_read(&mm->mmap_sem); |
77 | for (tbp = mm->context.vmlist; tbp; tbp = tbp->next) { | 85 | for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) { |
78 | if (tbp->vma) | 86 | vma = rb_entry(p, struct vm_area_struct, vm_rb); |
79 | vsize += kobjsize((void *) tbp->vma->vm_start); | 87 | vsize += vma->vm_end - vma->vm_start; |
80 | } | 88 | } |
81 | up_read(&mm->mmap_sem); | 89 | up_read(&mm->mmap_sem); |
82 | return vsize; | 90 | return vsize; |
83 | } | 91 | } |
84 | 92 | ||
85 | int task_statm(struct mm_struct *mm, int *shared, int *text, | 93 | int task_statm(struct mm_struct *mm, int *shared, int *text, |
86 | int *data, int *resident) | 94 | int *data, int *resident) |
87 | { | 95 | { |
88 | struct vm_list_struct *tbp; | 96 | struct vm_area_struct *vma; |
97 | struct vm_region *region; | ||
98 | struct rb_node *p; | ||
89 | int size = kobjsize(mm); | 99 | int size = kobjsize(mm); |
90 | 100 | ||
91 | down_read(&mm->mmap_sem); | 101 | down_read(&mm->mmap_sem); |
92 | for (tbp = mm->context.vmlist; tbp; tbp = tbp->next) { | 102 | for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) { |
93 | size += kobjsize(tbp); | 103 | vma = rb_entry(p, struct vm_area_struct, vm_rb); |
94 | if (tbp->vma) { | 104 | size += kobjsize(vma); |
95 | size += kobjsize(tbp->vma); | 105 | region = vma->vm_region; |
96 | size += kobjsize((void *) tbp->vma->vm_start); | 106 | if (region) { |
107 | size += kobjsize(region); | ||
108 | size += region->vm_end - region->vm_start; | ||
97 | } | 109 | } |
98 | } | 110 | } |
99 | 111 | ||
100 | size += (*text = mm->end_code - mm->start_code); | 112 | size += (*text = mm->end_code - mm->start_code); |
101 | size += (*data = mm->start_stack - mm->start_data); | 113 | size += (*data = mm->start_stack - mm->start_data); |
102 | up_read(&mm->mmap_sem); | 114 | up_read(&mm->mmap_sem); |
103 | *resident = size; | 115 | *resident = size; |
104 | return size; | 116 | return size; |
105 | } | 117 | } |
106 | 118 | ||
107 | /* | 119 | /* |
120 | * display a single VMA to a sequenced file | ||
121 | */ | ||
122 | static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) | ||
123 | { | ||
124 | unsigned long ino = 0; | ||
125 | struct file *file; | ||
126 | dev_t dev = 0; | ||
127 | int flags, len; | ||
128 | |||
129 | flags = vma->vm_flags; | ||
130 | file = vma->vm_file; | ||
131 | |||
132 | if (file) { | ||
133 | struct inode *inode = vma->vm_file->f_path.dentry->d_inode; | ||
134 | dev = inode->i_sb->s_dev; | ||
135 | ino = inode->i_ino; | ||
136 | } | ||
137 | |||
138 | seq_printf(m, | ||
139 | "%08lx-%08lx %c%c%c%c %08lx %02x:%02x %lu %n", | ||
140 | vma->vm_start, | ||
141 | vma->vm_end, | ||
142 | flags & VM_READ ? 'r' : '-', | ||
143 | flags & VM_WRITE ? 'w' : '-', | ||
144 | flags & VM_EXEC ? 'x' : '-', | ||
145 | flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p', | ||
146 | vma->vm_pgoff << PAGE_SHIFT, | ||
147 | MAJOR(dev), MINOR(dev), ino, &len); | ||
148 | |||
149 | if (file) { | ||
150 | len = 25 + sizeof(void *) * 6 - len; | ||
151 | if (len < 1) | ||
152 | len = 1; | ||
153 | seq_printf(m, "%*c", len, ' '); | ||
154 | seq_path(m, &file->f_path, ""); | ||
155 | } | ||
156 | |||
157 | seq_putc(m, '\n'); | ||
158 | return 0; | ||
159 | } | ||
160 | |||
161 | /* | ||
108 | * display mapping lines for a particular process's /proc/pid/maps | 162 | * display mapping lines for a particular process's /proc/pid/maps |
109 | */ | 163 | */ |
110 | static int show_map(struct seq_file *m, void *_vml) | 164 | static int show_map(struct seq_file *m, void *_p) |
111 | { | 165 | { |
112 | struct vm_list_struct *vml = _vml; | 166 | struct rb_node *p = _p; |
113 | 167 | ||
114 | return nommu_vma_show(m, vml->vma); | 168 | return nommu_vma_show(m, rb_entry(p, struct vm_area_struct, vm_rb)); |
115 | } | 169 | } |
116 | 170 | ||
117 | static void *m_start(struct seq_file *m, loff_t *pos) | 171 | static void *m_start(struct seq_file *m, loff_t *pos) |
118 | { | 172 | { |
119 | struct proc_maps_private *priv = m->private; | 173 | struct proc_maps_private *priv = m->private; |
120 | struct vm_list_struct *vml; | ||
121 | struct mm_struct *mm; | 174 | struct mm_struct *mm; |
175 | struct rb_node *p; | ||
122 | loff_t n = *pos; | 176 | loff_t n = *pos; |
123 | 177 | ||
124 | /* pin the task and mm whilst we play with them */ | 178 | /* pin the task and mm whilst we play with them */ |
125 | priv->task = get_pid_task(priv->pid, PIDTYPE_PID); | 179 | priv->task = get_pid_task(priv->pid, PIDTYPE_PID); |
126 | if (!priv->task) | 180 | if (!priv->task) |
127 | return NULL; | 181 | return NULL; |
128 | 182 | ||
129 | mm = mm_for_maps(priv->task); | 183 | mm = mm_for_maps(priv->task); |
130 | if (!mm) { | 184 | if (!mm) { |
131 | put_task_struct(priv->task); | 185 | put_task_struct(priv->task); |
132 | priv->task = NULL; | 186 | priv->task = NULL; |
133 | return NULL; | 187 | return NULL; |
134 | } | 188 | } |
135 | 189 | ||
136 | /* start from the Nth VMA */ | 190 | /* start from the Nth VMA */ |
137 | for (vml = mm->context.vmlist; vml; vml = vml->next) | 191 | for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) |
138 | if (n-- == 0) | 192 | if (n-- == 0) |
139 | return vml; | 193 | return p; |
140 | return NULL; | 194 | return NULL; |
141 | } | 195 | } |
142 | 196 | ||
143 | static void m_stop(struct seq_file *m, void *_vml) | 197 | static void m_stop(struct seq_file *m, void *_vml) |
144 | { | 198 | { |
145 | struct proc_maps_private *priv = m->private; | 199 | struct proc_maps_private *priv = m->private; |
146 | 200 | ||
147 | if (priv->task) { | 201 | if (priv->task) { |
148 | struct mm_struct *mm = priv->task->mm; | 202 | struct mm_struct *mm = priv->task->mm; |
149 | up_read(&mm->mmap_sem); | 203 | up_read(&mm->mmap_sem); |
150 | mmput(mm); | 204 | mmput(mm); |
151 | put_task_struct(priv->task); | 205 | put_task_struct(priv->task); |
152 | } | 206 | } |
153 | } | 207 | } |
154 | 208 | ||
155 | static void *m_next(struct seq_file *m, void *_vml, loff_t *pos) | 209 | static void *m_next(struct seq_file *m, void *_p, loff_t *pos) |
156 | { | 210 | { |
157 | struct vm_list_struct *vml = _vml; | 211 | struct rb_node *p = _p; |
158 | 212 | ||
159 | (*pos)++; | 213 | (*pos)++; |
160 | return vml ? vml->next : NULL; | 214 | return p ? rb_next(p) : NULL; |
161 | } | 215 | } |
162 | 216 | ||
163 | static const struct seq_operations proc_pid_maps_ops = { | 217 | static const struct seq_operations proc_pid_maps_ops = { |
164 | .start = m_start, | 218 | .start = m_start, |
165 | .next = m_next, | 219 | .next = m_next, |
166 | .stop = m_stop, | 220 | .stop = m_stop, |
167 | .show = show_map | 221 | .show = show_map |
168 | }; | 222 | }; |
169 | 223 | ||
170 | static int maps_open(struct inode *inode, struct file *file) | 224 | static int maps_open(struct inode *inode, struct file *file) |
171 | { | 225 | { |
172 | struct proc_maps_private *priv; | 226 | struct proc_maps_private *priv; |
173 | int ret = -ENOMEM; | 227 | int ret = -ENOMEM; |
174 | 228 | ||
175 | priv = kzalloc(sizeof(*priv), GFP_KERNEL); | 229 | priv = kzalloc(sizeof(*priv), GFP_KERNEL); |
176 | if (priv) { | 230 | if (priv) { |
177 | priv->pid = proc_pid(inode); | 231 | priv->pid = proc_pid(inode); |
178 | ret = seq_open(file, &proc_pid_maps_ops); | 232 | ret = seq_open(file, &proc_pid_maps_ops); |
179 | if (!ret) { | 233 | if (!ret) { |
180 | struct seq_file *m = file->private_data; | 234 | struct seq_file *m = file->private_data; |
181 | m->private = priv; | 235 | m->private = priv; |
182 | } else { | 236 | } else { |
183 | kfree(priv); | 237 | kfree(priv); |
184 | } | 238 | } |
185 | } | 239 | } |
186 | return ret; | 240 | return ret; |
187 | } | 241 | } |
188 | 242 | ||
189 | const struct file_operations proc_maps_operations = { | 243 | const struct file_operations proc_maps_operations = { |
190 | .open = maps_open, | 244 | .open = maps_open, |
191 | .read = seq_read, | 245 | .read = seq_read, |
fs/ramfs/file-nommu.c
1 | /* file-nommu.c: no-MMU version of ramfs | 1 | /* file-nommu.c: no-MMU version of ramfs |
2 | * | 2 | * |
3 | * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved. | 3 | * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved. |
4 | * Written by David Howells (dhowells@redhat.com) | 4 | * Written by David Howells (dhowells@redhat.com) |
5 | * | 5 | * |
6 | * This program is free software; you can redistribute it and/or | 6 | * This program is free software; you can redistribute it and/or |
7 | * modify it under the terms of the GNU General Public License | 7 | * modify it under the terms of the GNU General Public License |
8 | * as published by the Free Software Foundation; either version | 8 | * as published by the Free Software Foundation; either version |
9 | * 2 of the License, or (at your option) any later version. | 9 | * 2 of the License, or (at your option) any later version. |
10 | */ | 10 | */ |
11 | 11 | ||
12 | #include <linux/module.h> | 12 | #include <linux/module.h> |
13 | #include <linux/fs.h> | 13 | #include <linux/fs.h> |
14 | #include <linux/mm.h> | 14 | #include <linux/mm.h> |
15 | #include <linux/pagemap.h> | 15 | #include <linux/pagemap.h> |
16 | #include <linux/highmem.h> | 16 | #include <linux/highmem.h> |
17 | #include <linux/init.h> | 17 | #include <linux/init.h> |
18 | #include <linux/string.h> | 18 | #include <linux/string.h> |
19 | #include <linux/backing-dev.h> | 19 | #include <linux/backing-dev.h> |
20 | #include <linux/ramfs.h> | 20 | #include <linux/ramfs.h> |
21 | #include <linux/quotaops.h> | 21 | #include <linux/quotaops.h> |
22 | #include <linux/pagevec.h> | 22 | #include <linux/pagevec.h> |
23 | #include <linux/mman.h> | 23 | #include <linux/mman.h> |
24 | 24 | ||
25 | #include <asm/uaccess.h> | 25 | #include <asm/uaccess.h> |
26 | #include "internal.h" | 26 | #include "internal.h" |
27 | 27 | ||
28 | static int ramfs_nommu_setattr(struct dentry *, struct iattr *); | 28 | static int ramfs_nommu_setattr(struct dentry *, struct iattr *); |
29 | 29 | ||
30 | const struct address_space_operations ramfs_aops = { | 30 | const struct address_space_operations ramfs_aops = { |
31 | .readpage = simple_readpage, | 31 | .readpage = simple_readpage, |
32 | .write_begin = simple_write_begin, | 32 | .write_begin = simple_write_begin, |
33 | .write_end = simple_write_end, | 33 | .write_end = simple_write_end, |
34 | .set_page_dirty = __set_page_dirty_no_writeback, | 34 | .set_page_dirty = __set_page_dirty_no_writeback, |
35 | }; | 35 | }; |
36 | 36 | ||
37 | const struct file_operations ramfs_file_operations = { | 37 | const struct file_operations ramfs_file_operations = { |
38 | .mmap = ramfs_nommu_mmap, | 38 | .mmap = ramfs_nommu_mmap, |
39 | .get_unmapped_area = ramfs_nommu_get_unmapped_area, | 39 | .get_unmapped_area = ramfs_nommu_get_unmapped_area, |
40 | .read = do_sync_read, | 40 | .read = do_sync_read, |
41 | .aio_read = generic_file_aio_read, | 41 | .aio_read = generic_file_aio_read, |
42 | .write = do_sync_write, | 42 | .write = do_sync_write, |
43 | .aio_write = generic_file_aio_write, | 43 | .aio_write = generic_file_aio_write, |
44 | .fsync = simple_sync_file, | 44 | .fsync = simple_sync_file, |
45 | .splice_read = generic_file_splice_read, | 45 | .splice_read = generic_file_splice_read, |
46 | .splice_write = generic_file_splice_write, | 46 | .splice_write = generic_file_splice_write, |
47 | .llseek = generic_file_llseek, | 47 | .llseek = generic_file_llseek, |
48 | }; | 48 | }; |
49 | 49 | ||
50 | const struct inode_operations ramfs_file_inode_operations = { | 50 | const struct inode_operations ramfs_file_inode_operations = { |
51 | .setattr = ramfs_nommu_setattr, | 51 | .setattr = ramfs_nommu_setattr, |
52 | .getattr = simple_getattr, | 52 | .getattr = simple_getattr, |
53 | }; | 53 | }; |
54 | 54 | ||
55 | /*****************************************************************************/ | 55 | /*****************************************************************************/ |
56 | /* | 56 | /* |
57 | * add a contiguous set of pages into a ramfs inode when it's truncated from | 57 | * add a contiguous set of pages into a ramfs inode when it's truncated from |
58 | * size 0 on the assumption that it's going to be used for an mmap of shared | 58 | * size 0 on the assumption that it's going to be used for an mmap of shared |
59 | * memory | 59 | * memory |
60 | */ | 60 | */ |
61 | int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) | 61 | int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) |
62 | { | 62 | { |
63 | struct pagevec lru_pvec; | 63 | struct pagevec lru_pvec; |
64 | unsigned long npages, xpages, loop, limit; | 64 | unsigned long npages, xpages, loop, limit; |
65 | struct page *pages; | 65 | struct page *pages; |
66 | unsigned order; | 66 | unsigned order; |
67 | void *data; | 67 | void *data; |
68 | int ret; | 68 | int ret; |
69 | 69 | ||
70 | /* make various checks */ | 70 | /* make various checks */ |
71 | order = get_order(newsize); | 71 | order = get_order(newsize); |
72 | if (unlikely(order >= MAX_ORDER)) | 72 | if (unlikely(order >= MAX_ORDER)) |
73 | goto too_big; | 73 | goto too_big; |
74 | 74 | ||
75 | limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; | 75 | limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; |
76 | if (limit != RLIM_INFINITY && newsize > limit) | 76 | if (limit != RLIM_INFINITY && newsize > limit) |
77 | goto fsize_exceeded; | 77 | goto fsize_exceeded; |
78 | 78 | ||
79 | if (newsize > inode->i_sb->s_maxbytes) | 79 | if (newsize > inode->i_sb->s_maxbytes) |
80 | goto too_big; | 80 | goto too_big; |
81 | 81 | ||
82 | i_size_write(inode, newsize); | 82 | i_size_write(inode, newsize); |
83 | 83 | ||
84 | /* allocate enough contiguous pages to be able to satisfy the | 84 | /* allocate enough contiguous pages to be able to satisfy the |
85 | * request */ | 85 | * request */ |
86 | pages = alloc_pages(mapping_gfp_mask(inode->i_mapping), order); | 86 | pages = alloc_pages(mapping_gfp_mask(inode->i_mapping), order); |
87 | if (!pages) | 87 | if (!pages) |
88 | return -ENOMEM; | 88 | return -ENOMEM; |
89 | 89 | ||
90 | /* split the high-order page into an array of single pages */ | 90 | /* split the high-order page into an array of single pages */ |
91 | xpages = 1UL << order; | 91 | xpages = 1UL << order; |
92 | npages = (newsize + PAGE_SIZE - 1) >> PAGE_SHIFT; | 92 | npages = (newsize + PAGE_SIZE - 1) >> PAGE_SHIFT; |
93 | 93 | ||
94 | split_page(pages, order); | 94 | split_page(pages, order); |
95 | 95 | ||
96 | /* trim off any pages we don't actually require */ | 96 | /* trim off any pages we don't actually require */ |
97 | for (loop = npages; loop < xpages; loop++) | 97 | for (loop = npages; loop < xpages; loop++) |
98 | __free_page(pages + loop); | 98 | __free_page(pages + loop); |
99 | 99 | ||
100 | /* clear the memory we allocated */ | 100 | /* clear the memory we allocated */ |
101 | newsize = PAGE_SIZE * npages; | 101 | newsize = PAGE_SIZE * npages; |
102 | data = page_address(pages); | 102 | data = page_address(pages); |
103 | memset(data, 0, newsize); | 103 | memset(data, 0, newsize); |
104 | 104 | ||
105 | /* attach all the pages to the inode's address space */ | 105 | /* attach all the pages to the inode's address space */ |
106 | pagevec_init(&lru_pvec, 0); | 106 | pagevec_init(&lru_pvec, 0); |
107 | for (loop = 0; loop < npages; loop++) { | 107 | for (loop = 0; loop < npages; loop++) { |
108 | struct page *page = pages + loop; | 108 | struct page *page = pages + loop; |
109 | 109 | ||
110 | ret = add_to_page_cache(page, inode->i_mapping, loop, GFP_KERNEL); | 110 | ret = add_to_page_cache(page, inode->i_mapping, loop, GFP_KERNEL); |
111 | if (ret < 0) | 111 | if (ret < 0) |
112 | goto add_error; | 112 | goto add_error; |
113 | 113 | ||
114 | if (!pagevec_add(&lru_pvec, page)) | 114 | if (!pagevec_add(&lru_pvec, page)) |
115 | __pagevec_lru_add_file(&lru_pvec); | 115 | __pagevec_lru_add_file(&lru_pvec); |
116 | 116 | ||
117 | unlock_page(page); | 117 | unlock_page(page); |
118 | } | 118 | } |
119 | 119 | ||
120 | pagevec_lru_add_file(&lru_pvec); | 120 | pagevec_lru_add_file(&lru_pvec); |
121 | return 0; | 121 | return 0; |
122 | 122 | ||
123 | fsize_exceeded: | 123 | fsize_exceeded: |
124 | send_sig(SIGXFSZ, current, 0); | 124 | send_sig(SIGXFSZ, current, 0); |
125 | too_big: | 125 | too_big: |
126 | return -EFBIG; | 126 | return -EFBIG; |
127 | 127 | ||
128 | add_error: | 128 | add_error: |
129 | page_cache_release(pages + loop); | 129 | page_cache_release(pages + loop); |
130 | for (loop++; loop < npages; loop++) | 130 | for (loop++; loop < npages; loop++) |
131 | __free_page(pages + loop); | 131 | __free_page(pages + loop); |
132 | return ret; | 132 | return ret; |
133 | } | 133 | } |
134 | 134 | ||
135 | /*****************************************************************************/ | 135 | /*****************************************************************************/ |
136 | /* | 136 | /* |
137 | * check that file shrinkage doesn't leave any VMAs dangling in midair | 137 | * check that file shrinkage doesn't leave any VMAs dangling in midair |
138 | */ | 138 | */ |
139 | static int ramfs_nommu_check_mappings(struct inode *inode, | 139 | static int ramfs_nommu_check_mappings(struct inode *inode, |
140 | size_t newsize, size_t size) | 140 | size_t newsize, size_t size) |
141 | { | 141 | { |
142 | struct vm_area_struct *vma; | 142 | struct vm_area_struct *vma; |
143 | struct prio_tree_iter iter; | 143 | struct prio_tree_iter iter; |
144 | 144 | ||
145 | /* search for VMAs that fall within the dead zone */ | 145 | /* search for VMAs that fall within the dead zone */ |
146 | vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, | 146 | vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, |
147 | newsize >> PAGE_SHIFT, | 147 | newsize >> PAGE_SHIFT, |
148 | (size + PAGE_SIZE - 1) >> PAGE_SHIFT | 148 | (size + PAGE_SIZE - 1) >> PAGE_SHIFT |
149 | ) { | 149 | ) { |
150 | /* found one - only interested if it's shared out of the page | 150 | /* found one - only interested if it's shared out of the page |
151 | * cache */ | 151 | * cache */ |
152 | if (vma->vm_flags & VM_SHARED) | 152 | if (vma->vm_flags & VM_SHARED) |
153 | return -ETXTBSY; /* not quite true, but near enough */ | 153 | return -ETXTBSY; /* not quite true, but near enough */ |
154 | } | 154 | } |
155 | 155 | ||
156 | return 0; | 156 | return 0; |
157 | } | 157 | } |
158 | 158 | ||
159 | /*****************************************************************************/ | 159 | /*****************************************************************************/ |
160 | /* | 160 | /* |
161 | * | 161 | * |
162 | */ | 162 | */ |
163 | static int ramfs_nommu_resize(struct inode *inode, loff_t newsize, loff_t size) | 163 | static int ramfs_nommu_resize(struct inode *inode, loff_t newsize, loff_t size) |
164 | { | 164 | { |
165 | int ret; | 165 | int ret; |
166 | 166 | ||
167 | /* assume a truncate from zero size is going to be for the purposes of | 167 | /* assume a truncate from zero size is going to be for the purposes of |
168 | * shared mmap */ | 168 | * shared mmap */ |
169 | if (size == 0) { | 169 | if (size == 0) { |
170 | if (unlikely(newsize >> 32)) | 170 | if (unlikely(newsize >> 32)) |
171 | return -EFBIG; | 171 | return -EFBIG; |
172 | 172 | ||
173 | return ramfs_nommu_expand_for_mapping(inode, newsize); | 173 | return ramfs_nommu_expand_for_mapping(inode, newsize); |
174 | } | 174 | } |
175 | 175 | ||
176 | /* check that a decrease in size doesn't cut off any shared mappings */ | 176 | /* check that a decrease in size doesn't cut off any shared mappings */ |
177 | if (newsize < size) { | 177 | if (newsize < size) { |
178 | ret = ramfs_nommu_check_mappings(inode, newsize, size); | 178 | ret = ramfs_nommu_check_mappings(inode, newsize, size); |
179 | if (ret < 0) | 179 | if (ret < 0) |
180 | return ret; | 180 | return ret; |
181 | } | 181 | } |
182 | 182 | ||
183 | ret = vmtruncate(inode, newsize); | 183 | ret = vmtruncate(inode, newsize); |
184 | 184 | ||
185 | return ret; | 185 | return ret; |
186 | } | 186 | } |
187 | 187 | ||
188 | /*****************************************************************************/ | 188 | /*****************************************************************************/ |
189 | /* | 189 | /* |
190 | * handle a change of attributes | 190 | * handle a change of attributes |
191 | * - we're specifically interested in a change of size | 191 | * - we're specifically interested in a change of size |
192 | */ | 192 | */ |
193 | static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia) | 193 | static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia) |
194 | { | 194 | { |
195 | struct inode *inode = dentry->d_inode; | 195 | struct inode *inode = dentry->d_inode; |
196 | unsigned int old_ia_valid = ia->ia_valid; | 196 | unsigned int old_ia_valid = ia->ia_valid; |
197 | int ret = 0; | 197 | int ret = 0; |
198 | 198 | ||
199 | /* POSIX UID/GID verification for setting inode attributes */ | 199 | /* POSIX UID/GID verification for setting inode attributes */ |
200 | ret = inode_change_ok(inode, ia); | 200 | ret = inode_change_ok(inode, ia); |
201 | if (ret) | 201 | if (ret) |
202 | return ret; | 202 | return ret; |
203 | 203 | ||
204 | /* by providing our own setattr() method, we skip this quotaism */ | 204 | /* by providing our own setattr() method, we skip this quotaism */ |
205 | if ((old_ia_valid & ATTR_UID && ia->ia_uid != inode->i_uid) || | 205 | if ((old_ia_valid & ATTR_UID && ia->ia_uid != inode->i_uid) || |
206 | (old_ia_valid & ATTR_GID && ia->ia_gid != inode->i_gid)) | 206 | (old_ia_valid & ATTR_GID && ia->ia_gid != inode->i_gid)) |
207 | ret = DQUOT_TRANSFER(inode, ia) ? -EDQUOT : 0; | 207 | ret = DQUOT_TRANSFER(inode, ia) ? -EDQUOT : 0; |
208 | 208 | ||
209 | /* pick out size-changing events */ | 209 | /* pick out size-changing events */ |
210 | if (ia->ia_valid & ATTR_SIZE) { | 210 | if (ia->ia_valid & ATTR_SIZE) { |
211 | loff_t size = i_size_read(inode); | 211 | loff_t size = i_size_read(inode); |
212 | if (ia->ia_size != size) { | 212 | if (ia->ia_size != size) { |
213 | ret = ramfs_nommu_resize(inode, ia->ia_size, size); | 213 | ret = ramfs_nommu_resize(inode, ia->ia_size, size); |
214 | if (ret < 0 || ia->ia_valid == ATTR_SIZE) | 214 | if (ret < 0 || ia->ia_valid == ATTR_SIZE) |
215 | goto out; | 215 | goto out; |
216 | } else { | 216 | } else { |
217 | /* we skipped the truncate but must still update | 217 | /* we skipped the truncate but must still update |
218 | * timestamps | 218 | * timestamps |
219 | */ | 219 | */ |
220 | ia->ia_valid |= ATTR_MTIME|ATTR_CTIME; | 220 | ia->ia_valid |= ATTR_MTIME|ATTR_CTIME; |
221 | } | 221 | } |
222 | } | 222 | } |
223 | 223 | ||
224 | ret = inode_setattr(inode, ia); | 224 | ret = inode_setattr(inode, ia); |
225 | out: | 225 | out: |
226 | ia->ia_valid = old_ia_valid; | 226 | ia->ia_valid = old_ia_valid; |
227 | return ret; | 227 | return ret; |
228 | } | 228 | } |
229 | 229 | ||
230 | /*****************************************************************************/ | 230 | /*****************************************************************************/ |
231 | /* | 231 | /* |
232 | * try to determine where a shared mapping can be made | 232 | * try to determine where a shared mapping can be made |
233 | * - we require that: | 233 | * - we require that: |
234 | * - the pages to be mapped must exist | 234 | * - the pages to be mapped must exist |
235 | * - the pages be physically contiguous in sequence | 235 | * - the pages be physically contiguous in sequence |
236 | */ | 236 | */ |
237 | unsigned long ramfs_nommu_get_unmapped_area(struct file *file, | 237 | unsigned long ramfs_nommu_get_unmapped_area(struct file *file, |
238 | unsigned long addr, unsigned long len, | 238 | unsigned long addr, unsigned long len, |
239 | unsigned long pgoff, unsigned long flags) | 239 | unsigned long pgoff, unsigned long flags) |
240 | { | 240 | { |
241 | unsigned long maxpages, lpages, nr, loop, ret; | 241 | unsigned long maxpages, lpages, nr, loop, ret; |
242 | struct inode *inode = file->f_path.dentry->d_inode; | 242 | struct inode *inode = file->f_path.dentry->d_inode; |
243 | struct page **pages = NULL, **ptr, *page; | 243 | struct page **pages = NULL, **ptr, *page; |
244 | loff_t isize; | 244 | loff_t isize; |
245 | 245 | ||
246 | if (!(flags & MAP_SHARED)) | 246 | if (!(flags & MAP_SHARED)) |
247 | return addr; | 247 | return addr; |
248 | 248 | ||
249 | /* the mapping mustn't extend beyond the EOF */ | 249 | /* the mapping mustn't extend beyond the EOF */ |
250 | lpages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; | 250 | lpages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; |
251 | isize = i_size_read(inode); | 251 | isize = i_size_read(inode); |
252 | 252 | ||
253 | ret = -EINVAL; | 253 | ret = -EINVAL; |
254 | maxpages = (isize + PAGE_SIZE - 1) >> PAGE_SHIFT; | 254 | maxpages = (isize + PAGE_SIZE - 1) >> PAGE_SHIFT; |
255 | if (pgoff >= maxpages) | 255 | if (pgoff >= maxpages) |
256 | goto out; | 256 | goto out; |
257 | 257 | ||
258 | if (maxpages - pgoff < lpages) | 258 | if (maxpages - pgoff < lpages) |
259 | goto out; | 259 | goto out; |
260 | 260 | ||
261 | /* gang-find the pages */ | 261 | /* gang-find the pages */ |
262 | ret = -ENOMEM; | 262 | ret = -ENOMEM; |
263 | pages = kzalloc(lpages * sizeof(struct page *), GFP_KERNEL); | 263 | pages = kzalloc(lpages * sizeof(struct page *), GFP_KERNEL); |
264 | if (!pages) | 264 | if (!pages) |
265 | goto out; | 265 | goto out_free; |
266 | 266 | ||
267 | nr = find_get_pages(inode->i_mapping, pgoff, lpages, pages); | 267 | nr = find_get_pages(inode->i_mapping, pgoff, lpages, pages); |
268 | if (nr != lpages) | 268 | if (nr != lpages) |
269 | goto out; /* leave if some pages were missing */ | 269 | goto out_free_pages; /* leave if some pages were missing */ |
270 | 270 | ||
271 | /* check the pages for physical adjacency */ | 271 | /* check the pages for physical adjacency */ |
272 | ptr = pages; | 272 | ptr = pages; |
273 | page = *ptr++; | 273 | page = *ptr++; |
274 | page++; | 274 | page++; |
275 | for (loop = lpages; loop > 1; loop--) | 275 | for (loop = lpages; loop > 1; loop--) |
276 | if (*ptr++ != page++) | 276 | if (*ptr++ != page++) |
277 | goto out; | 277 | goto out_free_pages; |
278 | 278 | ||
279 | /* okay - all conditions fulfilled */ | 279 | /* okay - all conditions fulfilled */ |
280 | ret = (unsigned long) page_address(pages[0]); | 280 | ret = (unsigned long) page_address(pages[0]); |
281 | 281 | ||
282 | out: | 282 | out_free_pages: |
283 | if (pages) { | 283 | ptr = pages; |
284 | ptr = pages; | 284 | for (loop = nr; loop > 0; loop--) |
285 | for (loop = lpages; loop > 0; loop--) | 285 | put_page(*ptr++); |
286 | put_page(*ptr++); | 286 | out_free: |
287 | kfree(pages); | 287 | kfree(pages); |
288 | } | 288 | out: |
289 | |||
290 | return ret; | 289 | return ret; |
291 | } | 290 | } |
292 | 291 | ||
293 | /*****************************************************************************/ | 292 | /*****************************************************************************/ |
294 | /* | 293 | /* |
295 | * set up a mapping for shared memory segments | 294 | * set up a mapping for shared memory segments |
296 | */ | 295 | */ |
297 | int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma) | 296 | int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma) |
298 | { | 297 | { |
299 | if (!(vma->vm_flags & VM_SHARED)) | 298 | if (!(vma->vm_flags & VM_SHARED)) |
300 | return -ENOSYS; | 299 | return -ENOSYS; |
301 | 300 | ||
302 | file_accessed(file); | 301 | file_accessed(file); |
303 | vma->vm_ops = &generic_file_vm_ops; | 302 | vma->vm_ops = &generic_file_vm_ops; |
304 | return 0; | 303 | return 0; |
305 | } | 304 | } |
306 | 305 |
include/asm-frv/mmu.h
1 | /* mmu.h: memory management context for FR-V with or without MMU support | 1 | /* mmu.h: memory management context for FR-V with or without MMU support |
2 | * | 2 | * |
3 | * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. | 3 | * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. |
4 | * Written by David Howells (dhowells@redhat.com) | 4 | * Written by David Howells (dhowells@redhat.com) |
5 | * | 5 | * |
6 | * This program is free software; you can redistribute it and/or | 6 | * This program is free software; you can redistribute it and/or |
7 | * modify it under the terms of the GNU General Public License | 7 | * modify it under the terms of the GNU General Public License |
8 | * as published by the Free Software Foundation; either version | 8 | * as published by the Free Software Foundation; either version |
9 | * 2 of the License, or (at your option) any later version. | 9 | * 2 of the License, or (at your option) any later version. |
10 | */ | 10 | */ |
11 | #ifndef _ASM_MMU_H | 11 | #ifndef _ASM_MMU_H |
12 | #define _ASM_MMU_H | 12 | #define _ASM_MMU_H |
13 | 13 | ||
14 | typedef struct { | 14 | typedef struct { |
15 | #ifdef CONFIG_MMU | 15 | #ifdef CONFIG_MMU |
16 | struct list_head id_link; /* link in list of context ID owners */ | 16 | struct list_head id_link; /* link in list of context ID owners */ |
17 | unsigned short id; /* MMU context ID */ | 17 | unsigned short id; /* MMU context ID */ |
18 | unsigned short id_busy; /* true if ID is in CXNR */ | 18 | unsigned short id_busy; /* true if ID is in CXNR */ |
19 | unsigned long itlb_cached_pge; /* [SCR0] PGE cached for insn TLB handler */ | 19 | unsigned long itlb_cached_pge; /* [SCR0] PGE cached for insn TLB handler */ |
20 | unsigned long itlb_ptd_mapping; /* [DAMR4] PTD mapping for itlb cached PGE */ | 20 | unsigned long itlb_ptd_mapping; /* [DAMR4] PTD mapping for itlb cached PGE */ |
21 | unsigned long dtlb_cached_pge; /* [SCR1] PGE cached for data TLB handler */ | 21 | unsigned long dtlb_cached_pge; /* [SCR1] PGE cached for data TLB handler */ |
22 | unsigned long dtlb_ptd_mapping; /* [DAMR5] PTD mapping for dtlb cached PGE */ | 22 | unsigned long dtlb_ptd_mapping; /* [DAMR5] PTD mapping for dtlb cached PGE */ |
23 | 23 | ||
24 | #else | 24 | #else |
25 | struct vm_list_struct *vmlist; | ||
26 | unsigned long end_brk; | 25 | unsigned long end_brk; |
27 | 26 | ||
28 | #endif | 27 | #endif |
29 | 28 | ||
30 | #ifdef CONFIG_BINFMT_ELF_FDPIC | 29 | #ifdef CONFIG_BINFMT_ELF_FDPIC |
31 | unsigned long exec_fdpic_loadmap; | 30 | unsigned long exec_fdpic_loadmap; |
32 | unsigned long interp_fdpic_loadmap; | 31 | unsigned long interp_fdpic_loadmap; |
33 | #endif | 32 | #endif |
34 | 33 | ||
35 | } mm_context_t; | 34 | } mm_context_t; |
36 | 35 | ||
37 | #ifdef CONFIG_MMU | 36 | #ifdef CONFIG_MMU |
38 | extern int __nongpreldata cxn_pinned; | 37 | extern int __nongpreldata cxn_pinned; |
39 | extern int cxn_pin_by_pid(pid_t pid); | 38 | extern int cxn_pin_by_pid(pid_t pid); |
40 | #endif | 39 | #endif |
41 | 40 | ||
42 | #endif /* _ASM_MMU_H */ | 41 | #endif /* _ASM_MMU_H */ |
43 | 42 |
include/asm-m32r/mmu.h
1 | #ifndef _ASM_M32R_MMU_H | 1 | #ifndef _ASM_M32R_MMU_H |
2 | #define _ASM_M32R_MMU_H | 2 | #define _ASM_M32R_MMU_H |
3 | 3 | ||
4 | #if !defined(CONFIG_MMU) | 4 | #if !defined(CONFIG_MMU) |
5 | 5 | ||
6 | typedef struct { | 6 | typedef struct { |
7 | struct vm_list_struct *vmlist; | ||
8 | unsigned long end_brk; | 7 | unsigned long end_brk; |
9 | } mm_context_t; | 8 | } mm_context_t; |
10 | 9 | ||
11 | #else /* CONFIG_MMU */ | 10 | #else /* CONFIG_MMU */ |
12 | 11 | ||
13 | /* Default "unsigned long" context */ | 12 | /* Default "unsigned long" context */ |
14 | #ifndef CONFIG_SMP | 13 | #ifndef CONFIG_SMP |
15 | typedef unsigned long mm_context_t; | 14 | typedef unsigned long mm_context_t; |
16 | #else | 15 | #else |
17 | typedef unsigned long mm_context_t[NR_CPUS]; | 16 | typedef unsigned long mm_context_t[NR_CPUS]; |
18 | #endif | 17 | #endif |
19 | 18 | ||
20 | #endif /* CONFIG_MMU */ | 19 | #endif /* CONFIG_MMU */ |
21 | 20 | ||
22 | #endif /* _ASM_M32R_MMU_H */ | 21 | #endif /* _ASM_M32R_MMU_H */ |
23 | 22 |
include/linux/mm.h
1 | #ifndef _LINUX_MM_H | 1 | #ifndef _LINUX_MM_H |
2 | #define _LINUX_MM_H | 2 | #define _LINUX_MM_H |
3 | 3 | ||
4 | #include <linux/errno.h> | 4 | #include <linux/errno.h> |
5 | 5 | ||
6 | #ifdef __KERNEL__ | 6 | #ifdef __KERNEL__ |
7 | 7 | ||
8 | #include <linux/gfp.h> | 8 | #include <linux/gfp.h> |
9 | #include <linux/list.h> | 9 | #include <linux/list.h> |
10 | #include <linux/mmdebug.h> | 10 | #include <linux/mmdebug.h> |
11 | #include <linux/mmzone.h> | 11 | #include <linux/mmzone.h> |
12 | #include <linux/rbtree.h> | 12 | #include <linux/rbtree.h> |
13 | #include <linux/prio_tree.h> | 13 | #include <linux/prio_tree.h> |
14 | #include <linux/debug_locks.h> | 14 | #include <linux/debug_locks.h> |
15 | #include <linux/mm_types.h> | 15 | #include <linux/mm_types.h> |
16 | 16 | ||
17 | struct mempolicy; | 17 | struct mempolicy; |
18 | struct anon_vma; | 18 | struct anon_vma; |
19 | struct file_ra_state; | 19 | struct file_ra_state; |
20 | struct user_struct; | 20 | struct user_struct; |
21 | struct writeback_control; | 21 | struct writeback_control; |
22 | 22 | ||
23 | #ifndef CONFIG_DISCONTIGMEM /* Don't use mapnrs, do it properly */ | 23 | #ifndef CONFIG_DISCONTIGMEM /* Don't use mapnrs, do it properly */ |
24 | extern unsigned long max_mapnr; | 24 | extern unsigned long max_mapnr; |
25 | #endif | 25 | #endif |
26 | 26 | ||
27 | extern unsigned long num_physpages; | 27 | extern unsigned long num_physpages; |
28 | extern void * high_memory; | 28 | extern void * high_memory; |
29 | extern int page_cluster; | 29 | extern int page_cluster; |
30 | 30 | ||
31 | #ifdef CONFIG_SYSCTL | 31 | #ifdef CONFIG_SYSCTL |
32 | extern int sysctl_legacy_va_layout; | 32 | extern int sysctl_legacy_va_layout; |
33 | #else | 33 | #else |
34 | #define sysctl_legacy_va_layout 0 | 34 | #define sysctl_legacy_va_layout 0 |
35 | #endif | 35 | #endif |
36 | 36 | ||
37 | extern unsigned long mmap_min_addr; | 37 | extern unsigned long mmap_min_addr; |
38 | 38 | ||
39 | #include <asm/page.h> | 39 | #include <asm/page.h> |
40 | #include <asm/pgtable.h> | 40 | #include <asm/pgtable.h> |
41 | #include <asm/processor.h> | 41 | #include <asm/processor.h> |
42 | 42 | ||
43 | #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) | 43 | #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) |
44 | 44 | ||
45 | /* to align the pointer to the (next) page boundary */ | 45 | /* to align the pointer to the (next) page boundary */ |
46 | #define PAGE_ALIGN(addr) ALIGN(addr, PAGE_SIZE) | 46 | #define PAGE_ALIGN(addr) ALIGN(addr, PAGE_SIZE) |
47 | 47 | ||
48 | /* | 48 | /* |
49 | * Linux kernel virtual memory manager primitives. | 49 | * Linux kernel virtual memory manager primitives. |
50 | * The idea being to have a "virtual" mm in the same way | 50 | * The idea being to have a "virtual" mm in the same way |
51 | * we have a virtual fs - giving a cleaner interface to the | 51 | * we have a virtual fs - giving a cleaner interface to the |
52 | * mm details, and allowing different kinds of memory mappings | 52 | * mm details, and allowing different kinds of memory mappings |
53 | * (from shared memory to executable loading to arbitrary | 53 | * (from shared memory to executable loading to arbitrary |
54 | * mmap() functions). | 54 | * mmap() functions). |
55 | */ | 55 | */ |
56 | 56 | ||
57 | extern struct kmem_cache *vm_area_cachep; | 57 | extern struct kmem_cache *vm_area_cachep; |
58 | 58 | ||
59 | /* | ||
60 | * This struct defines the per-mm list of VMAs for uClinux. If CONFIG_MMU is | ||
61 | * disabled, then there's a single shared list of VMAs maintained by the | ||
62 | * system, and mm's subscribe to these individually | ||
63 | */ | ||
64 | struct vm_list_struct { | ||
65 | struct vm_list_struct *next; | ||
66 | struct vm_area_struct *vma; | ||
67 | }; | ||
68 | |||
69 | #ifndef CONFIG_MMU | 59 | #ifndef CONFIG_MMU |
70 | extern struct rb_root nommu_vma_tree; | 60 | extern struct rb_root nommu_region_tree; |
71 | extern struct rw_semaphore nommu_vma_sem; | 61 | extern struct rw_semaphore nommu_region_sem; |
72 | 62 | ||
73 | extern unsigned int kobjsize(const void *objp); | 63 | extern unsigned int kobjsize(const void *objp); |
74 | #endif | 64 | #endif |
75 | 65 | ||
76 | /* | 66 | /* |
77 | * vm_flags in vm_area_struct, see mm_types.h. | 67 | * vm_flags in vm_area_struct, see mm_types.h. |
78 | */ | 68 | */ |
79 | #define VM_READ 0x00000001 /* currently active flags */ | 69 | #define VM_READ 0x00000001 /* currently active flags */ |
80 | #define VM_WRITE 0x00000002 | 70 | #define VM_WRITE 0x00000002 |
81 | #define VM_EXEC 0x00000004 | 71 | #define VM_EXEC 0x00000004 |
82 | #define VM_SHARED 0x00000008 | 72 | #define VM_SHARED 0x00000008 |
83 | 73 | ||
84 | /* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */ | 74 | /* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */ |
85 | #define VM_MAYREAD 0x00000010 /* limits for mprotect() etc */ | 75 | #define VM_MAYREAD 0x00000010 /* limits for mprotect() etc */ |
86 | #define VM_MAYWRITE 0x00000020 | 76 | #define VM_MAYWRITE 0x00000020 |
87 | #define VM_MAYEXEC 0x00000040 | 77 | #define VM_MAYEXEC 0x00000040 |
88 | #define VM_MAYSHARE 0x00000080 | 78 | #define VM_MAYSHARE 0x00000080 |
89 | 79 | ||
90 | #define VM_GROWSDOWN 0x00000100 /* general info on the segment */ | 80 | #define VM_GROWSDOWN 0x00000100 /* general info on the segment */ |
91 | #define VM_GROWSUP 0x00000200 | 81 | #define VM_GROWSUP 0x00000200 |
92 | #define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ | 82 | #define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ |
93 | #define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */ | 83 | #define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */ |
94 | 84 | ||
95 | #define VM_EXECUTABLE 0x00001000 | 85 | #define VM_EXECUTABLE 0x00001000 |
96 | #define VM_LOCKED 0x00002000 | 86 | #define VM_LOCKED 0x00002000 |
97 | #define VM_IO 0x00004000 /* Memory mapped I/O or similar */ | 87 | #define VM_IO 0x00004000 /* Memory mapped I/O or similar */ |
98 | 88 | ||
99 | /* Used by sys_madvise() */ | 89 | /* Used by sys_madvise() */ |
100 | #define VM_SEQ_READ 0x00008000 /* App will access data sequentially */ | 90 | #define VM_SEQ_READ 0x00008000 /* App will access data sequentially */ |
101 | #define VM_RAND_READ 0x00010000 /* App will not benefit from clustered reads */ | 91 | #define VM_RAND_READ 0x00010000 /* App will not benefit from clustered reads */ |
102 | 92 | ||
103 | #define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */ | 93 | #define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */ |
104 | #define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */ | 94 | #define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */ |
105 | #define VM_RESERVED 0x00080000 /* Count as reserved_vm like IO */ | 95 | #define VM_RESERVED 0x00080000 /* Count as reserved_vm like IO */ |
106 | #define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ | 96 | #define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ |
107 | #define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */ | 97 | #define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */ |
108 | #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ | 98 | #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ |
109 | #define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */ | 99 | #define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */ |
110 | #define VM_MAPPED_COPY 0x01000000 /* T if mapped copy of data (nommu mmap) */ | 100 | #define VM_MAPPED_COPY 0x01000000 /* T if mapped copy of data (nommu mmap) */ |
111 | #define VM_INSERTPAGE 0x02000000 /* The vma has had "vm_insert_page()" done on it */ | 101 | #define VM_INSERTPAGE 0x02000000 /* The vma has had "vm_insert_page()" done on it */ |
112 | #define VM_ALWAYSDUMP 0x04000000 /* Always include in core dumps */ | 102 | #define VM_ALWAYSDUMP 0x04000000 /* Always include in core dumps */ |
113 | 103 | ||
114 | #define VM_CAN_NONLINEAR 0x08000000 /* Has ->fault & does nonlinear pages */ | 104 | #define VM_CAN_NONLINEAR 0x08000000 /* Has ->fault & does nonlinear pages */ |
115 | #define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */ | 105 | #define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */ |
116 | #define VM_SAO 0x20000000 /* Strong Access Ordering (powerpc) */ | 106 | #define VM_SAO 0x20000000 /* Strong Access Ordering (powerpc) */ |
117 | 107 | ||
118 | #ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ | 108 | #ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ |
119 | #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS | 109 | #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS |
120 | #endif | 110 | #endif |
121 | 111 | ||
122 | #ifdef CONFIG_STACK_GROWSUP | 112 | #ifdef CONFIG_STACK_GROWSUP |
123 | #define VM_STACK_FLAGS (VM_GROWSUP | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) | 113 | #define VM_STACK_FLAGS (VM_GROWSUP | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) |
124 | #else | 114 | #else |
125 | #define VM_STACK_FLAGS (VM_GROWSDOWN | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) | 115 | #define VM_STACK_FLAGS (VM_GROWSDOWN | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) |
126 | #endif | 116 | #endif |
127 | 117 | ||
128 | #define VM_READHINTMASK (VM_SEQ_READ | VM_RAND_READ) | 118 | #define VM_READHINTMASK (VM_SEQ_READ | VM_RAND_READ) |
129 | #define VM_ClearReadHint(v) (v)->vm_flags &= ~VM_READHINTMASK | 119 | #define VM_ClearReadHint(v) (v)->vm_flags &= ~VM_READHINTMASK |
130 | #define VM_NormalReadHint(v) (!((v)->vm_flags & VM_READHINTMASK)) | 120 | #define VM_NormalReadHint(v) (!((v)->vm_flags & VM_READHINTMASK)) |
131 | #define VM_SequentialReadHint(v) ((v)->vm_flags & VM_SEQ_READ) | 121 | #define VM_SequentialReadHint(v) ((v)->vm_flags & VM_SEQ_READ) |
132 | #define VM_RandomReadHint(v) ((v)->vm_flags & VM_RAND_READ) | 122 | #define VM_RandomReadHint(v) ((v)->vm_flags & VM_RAND_READ) |
133 | 123 | ||
134 | /* | 124 | /* |
135 | * special vmas that are non-mergable, non-mlock()able | 125 | * special vmas that are non-mergable, non-mlock()able |
136 | */ | 126 | */ |
137 | #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP) | 127 | #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP) |
138 | 128 | ||
139 | /* | 129 | /* |
140 | * mapping from the currently active vm_flags protection bits (the | 130 | * mapping from the currently active vm_flags protection bits (the |
141 | * low four bits) to a page protection mask.. | 131 | * low four bits) to a page protection mask.. |
142 | */ | 132 | */ |
143 | extern pgprot_t protection_map[16]; | 133 | extern pgprot_t protection_map[16]; |
144 | 134 | ||
145 | #define FAULT_FLAG_WRITE 0x01 /* Fault was a write access */ | 135 | #define FAULT_FLAG_WRITE 0x01 /* Fault was a write access */ |
146 | #define FAULT_FLAG_NONLINEAR 0x02 /* Fault was via a nonlinear mapping */ | 136 | #define FAULT_FLAG_NONLINEAR 0x02 /* Fault was via a nonlinear mapping */ |
147 | 137 | ||
148 | /* | 138 | /* |
149 | * This interface is used by x86 PAT code to identify a pfn mapping that is | 139 | * This interface is used by x86 PAT code to identify a pfn mapping that is |
150 | * linear over entire vma. This is to optimize PAT code that deals with | 140 | * linear over entire vma. This is to optimize PAT code that deals with |
151 | * marking the physical region with a particular prot. This is not for generic | 141 | * marking the physical region with a particular prot. This is not for generic |
152 | * mm use. Note also that this check will not work if the pfn mapping is | 142 | * mm use. Note also that this check will not work if the pfn mapping is |
153 | * linear for a vma starting at physical address 0. In which case PAT code | 143 | * linear for a vma starting at physical address 0. In which case PAT code |
154 | * falls back to slow path of reserving physical range page by page. | 144 | * falls back to slow path of reserving physical range page by page. |
155 | */ | 145 | */ |
156 | static inline int is_linear_pfn_mapping(struct vm_area_struct *vma) | 146 | static inline int is_linear_pfn_mapping(struct vm_area_struct *vma) |
157 | { | 147 | { |
158 | return ((vma->vm_flags & VM_PFNMAP) && vma->vm_pgoff); | 148 | return ((vma->vm_flags & VM_PFNMAP) && vma->vm_pgoff); |
159 | } | 149 | } |
160 | 150 | ||
161 | static inline int is_pfn_mapping(struct vm_area_struct *vma) | 151 | static inline int is_pfn_mapping(struct vm_area_struct *vma) |
162 | { | 152 | { |
163 | return (vma->vm_flags & VM_PFNMAP); | 153 | return (vma->vm_flags & VM_PFNMAP); |
164 | } | 154 | } |
165 | 155 | ||
166 | /* | 156 | /* |
167 | * vm_fault is filled by the the pagefault handler and passed to the vma's | 157 | * vm_fault is filled by the the pagefault handler and passed to the vma's |
168 | * ->fault function. The vma's ->fault is responsible for returning a bitmask | 158 | * ->fault function. The vma's ->fault is responsible for returning a bitmask |
169 | * of VM_FAULT_xxx flags that give details about how the fault was handled. | 159 | * of VM_FAULT_xxx flags that give details about how the fault was handled. |
170 | * | 160 | * |
171 | * pgoff should be used in favour of virtual_address, if possible. If pgoff | 161 | * pgoff should be used in favour of virtual_address, if possible. If pgoff |
172 | * is used, one may set VM_CAN_NONLINEAR in the vma->vm_flags to get nonlinear | 162 | * is used, one may set VM_CAN_NONLINEAR in the vma->vm_flags to get nonlinear |
173 | * mapping support. | 163 | * mapping support. |
174 | */ | 164 | */ |
175 | struct vm_fault { | 165 | struct vm_fault { |
176 | unsigned int flags; /* FAULT_FLAG_xxx flags */ | 166 | unsigned int flags; /* FAULT_FLAG_xxx flags */ |
177 | pgoff_t pgoff; /* Logical page offset based on vma */ | 167 | pgoff_t pgoff; /* Logical page offset based on vma */ |
178 | void __user *virtual_address; /* Faulting virtual address */ | 168 | void __user *virtual_address; /* Faulting virtual address */ |
179 | 169 | ||
180 | struct page *page; /* ->fault handlers should return a | 170 | struct page *page; /* ->fault handlers should return a |
181 | * page here, unless VM_FAULT_NOPAGE | 171 | * page here, unless VM_FAULT_NOPAGE |
182 | * is set (which is also implied by | 172 | * is set (which is also implied by |
183 | * VM_FAULT_ERROR). | 173 | * VM_FAULT_ERROR). |
184 | */ | 174 | */ |
185 | }; | 175 | }; |
186 | 176 | ||
187 | /* | 177 | /* |
188 | * These are the virtual MM functions - opening of an area, closing and | 178 | * These are the virtual MM functions - opening of an area, closing and |
189 | * unmapping it (needed to keep files on disk up-to-date etc), pointer | 179 | * unmapping it (needed to keep files on disk up-to-date etc), pointer |
190 | * to the functions called when a no-page or a wp-page exception occurs. | 180 | * to the functions called when a no-page or a wp-page exception occurs. |
191 | */ | 181 | */ |
192 | struct vm_operations_struct { | 182 | struct vm_operations_struct { |
193 | void (*open)(struct vm_area_struct * area); | 183 | void (*open)(struct vm_area_struct * area); |
194 | void (*close)(struct vm_area_struct * area); | 184 | void (*close)(struct vm_area_struct * area); |
195 | int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf); | 185 | int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf); |
196 | 186 | ||
197 | /* notification that a previously read-only page is about to become | 187 | /* notification that a previously read-only page is about to become |
198 | * writable, if an error is returned it will cause a SIGBUS */ | 188 | * writable, if an error is returned it will cause a SIGBUS */ |
199 | int (*page_mkwrite)(struct vm_area_struct *vma, struct page *page); | 189 | int (*page_mkwrite)(struct vm_area_struct *vma, struct page *page); |
200 | 190 | ||
201 | /* called by access_process_vm when get_user_pages() fails, typically | 191 | /* called by access_process_vm when get_user_pages() fails, typically |
202 | * for use by special VMAs that can switch between memory and hardware | 192 | * for use by special VMAs that can switch between memory and hardware |
203 | */ | 193 | */ |
204 | int (*access)(struct vm_area_struct *vma, unsigned long addr, | 194 | int (*access)(struct vm_area_struct *vma, unsigned long addr, |
205 | void *buf, int len, int write); | 195 | void *buf, int len, int write); |
206 | #ifdef CONFIG_NUMA | 196 | #ifdef CONFIG_NUMA |
207 | /* | 197 | /* |
208 | * set_policy() op must add a reference to any non-NULL @new mempolicy | 198 | * set_policy() op must add a reference to any non-NULL @new mempolicy |
209 | * to hold the policy upon return. Caller should pass NULL @new to | 199 | * to hold the policy upon return. Caller should pass NULL @new to |
210 | * remove a policy and fall back to surrounding context--i.e. do not | 200 | * remove a policy and fall back to surrounding context--i.e. do not |
211 | * install a MPOL_DEFAULT policy, nor the task or system default | 201 | * install a MPOL_DEFAULT policy, nor the task or system default |
212 | * mempolicy. | 202 | * mempolicy. |
213 | */ | 203 | */ |
214 | int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new); | 204 | int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new); |
215 | 205 | ||
216 | /* | 206 | /* |
217 | * get_policy() op must add reference [mpol_get()] to any policy at | 207 | * get_policy() op must add reference [mpol_get()] to any policy at |
218 | * (vma,addr) marked as MPOL_SHARED. The shared policy infrastructure | 208 | * (vma,addr) marked as MPOL_SHARED. The shared policy infrastructure |
219 | * in mm/mempolicy.c will do this automatically. | 209 | * in mm/mempolicy.c will do this automatically. |
220 | * get_policy() must NOT add a ref if the policy at (vma,addr) is not | 210 | * get_policy() must NOT add a ref if the policy at (vma,addr) is not |
221 | * marked as MPOL_SHARED. vma policies are protected by the mmap_sem. | 211 | * marked as MPOL_SHARED. vma policies are protected by the mmap_sem. |
222 | * If no [shared/vma] mempolicy exists at the addr, get_policy() op | 212 | * If no [shared/vma] mempolicy exists at the addr, get_policy() op |
223 | * must return NULL--i.e., do not "fallback" to task or system default | 213 | * must return NULL--i.e., do not "fallback" to task or system default |
224 | * policy. | 214 | * policy. |
225 | */ | 215 | */ |
226 | struct mempolicy *(*get_policy)(struct vm_area_struct *vma, | 216 | struct mempolicy *(*get_policy)(struct vm_area_struct *vma, |
227 | unsigned long addr); | 217 | unsigned long addr); |
228 | int (*migrate)(struct vm_area_struct *vma, const nodemask_t *from, | 218 | int (*migrate)(struct vm_area_struct *vma, const nodemask_t *from, |
229 | const nodemask_t *to, unsigned long flags); | 219 | const nodemask_t *to, unsigned long flags); |
230 | #endif | 220 | #endif |
231 | }; | 221 | }; |
232 | 222 | ||
233 | struct mmu_gather; | 223 | struct mmu_gather; |
234 | struct inode; | 224 | struct inode; |
235 | 225 | ||
236 | #define page_private(page) ((page)->private) | 226 | #define page_private(page) ((page)->private) |
237 | #define set_page_private(page, v) ((page)->private = (v)) | 227 | #define set_page_private(page, v) ((page)->private = (v)) |
238 | 228 | ||
239 | /* | 229 | /* |
240 | * FIXME: take this include out, include page-flags.h in | 230 | * FIXME: take this include out, include page-flags.h in |
241 | * files which need it (119 of them) | 231 | * files which need it (119 of them) |
242 | */ | 232 | */ |
243 | #include <linux/page-flags.h> | 233 | #include <linux/page-flags.h> |
244 | 234 | ||
245 | /* | 235 | /* |
246 | * Methods to modify the page usage count. | 236 | * Methods to modify the page usage count. |
247 | * | 237 | * |
248 | * What counts for a page usage: | 238 | * What counts for a page usage: |
249 | * - cache mapping (page->mapping) | 239 | * - cache mapping (page->mapping) |
250 | * - private data (page->private) | 240 | * - private data (page->private) |
251 | * - page mapped in a task's page tables, each mapping | 241 | * - page mapped in a task's page tables, each mapping |
252 | * is counted separately | 242 | * is counted separately |
253 | * | 243 | * |
254 | * Also, many kernel routines increase the page count before a critical | 244 | * Also, many kernel routines increase the page count before a critical |
255 | * routine so they can be sure the page doesn't go away from under them. | 245 | * routine so they can be sure the page doesn't go away from under them. |
256 | */ | 246 | */ |
257 | 247 | ||
258 | /* | 248 | /* |
259 | * Drop a ref, return true if the refcount fell to zero (the page has no users) | 249 | * Drop a ref, return true if the refcount fell to zero (the page has no users) |
260 | */ | 250 | */ |
261 | static inline int put_page_testzero(struct page *page) | 251 | static inline int put_page_testzero(struct page *page) |
262 | { | 252 | { |
263 | VM_BUG_ON(atomic_read(&page->_count) == 0); | 253 | VM_BUG_ON(atomic_read(&page->_count) == 0); |
264 | return atomic_dec_and_test(&page->_count); | 254 | return atomic_dec_and_test(&page->_count); |
265 | } | 255 | } |
266 | 256 | ||
267 | /* | 257 | /* |
268 | * Try to grab a ref unless the page has a refcount of zero, return false if | 258 | * Try to grab a ref unless the page has a refcount of zero, return false if |
269 | * that is the case. | 259 | * that is the case. |
270 | */ | 260 | */ |
271 | static inline int get_page_unless_zero(struct page *page) | 261 | static inline int get_page_unless_zero(struct page *page) |
272 | { | 262 | { |
273 | VM_BUG_ON(PageTail(page)); | 263 | VM_BUG_ON(PageTail(page)); |
274 | return atomic_inc_not_zero(&page->_count); | 264 | return atomic_inc_not_zero(&page->_count); |
275 | } | 265 | } |
276 | 266 | ||
277 | /* Support for virtually mapped pages */ | 267 | /* Support for virtually mapped pages */ |
278 | struct page *vmalloc_to_page(const void *addr); | 268 | struct page *vmalloc_to_page(const void *addr); |
279 | unsigned long vmalloc_to_pfn(const void *addr); | 269 | unsigned long vmalloc_to_pfn(const void *addr); |
280 | 270 | ||
281 | /* | 271 | /* |
282 | * Determine if an address is within the vmalloc range | 272 | * Determine if an address is within the vmalloc range |
283 | * | 273 | * |
284 | * On nommu, vmalloc/vfree wrap through kmalloc/kfree directly, so there | 274 | * On nommu, vmalloc/vfree wrap through kmalloc/kfree directly, so there |
285 | * is no special casing required. | 275 | * is no special casing required. |
286 | */ | 276 | */ |
287 | static inline int is_vmalloc_addr(const void *x) | 277 | static inline int is_vmalloc_addr(const void *x) |
288 | { | 278 | { |
289 | #ifdef CONFIG_MMU | 279 | #ifdef CONFIG_MMU |
290 | unsigned long addr = (unsigned long)x; | 280 | unsigned long addr = (unsigned long)x; |
291 | 281 | ||
292 | return addr >= VMALLOC_START && addr < VMALLOC_END; | 282 | return addr >= VMALLOC_START && addr < VMALLOC_END; |
293 | #else | 283 | #else |
294 | return 0; | 284 | return 0; |
295 | #endif | 285 | #endif |
296 | } | 286 | } |
297 | 287 | ||
298 | static inline struct page *compound_head(struct page *page) | 288 | static inline struct page *compound_head(struct page *page) |
299 | { | 289 | { |
300 | if (unlikely(PageTail(page))) | 290 | if (unlikely(PageTail(page))) |
301 | return page->first_page; | 291 | return page->first_page; |
302 | return page; | 292 | return page; |
303 | } | 293 | } |
304 | 294 | ||
305 | static inline int page_count(struct page *page) | 295 | static inline int page_count(struct page *page) |
306 | { | 296 | { |
307 | return atomic_read(&compound_head(page)->_count); | 297 | return atomic_read(&compound_head(page)->_count); |
308 | } | 298 | } |
309 | 299 | ||
310 | static inline void get_page(struct page *page) | 300 | static inline void get_page(struct page *page) |
311 | { | 301 | { |
312 | page = compound_head(page); | 302 | page = compound_head(page); |
313 | VM_BUG_ON(atomic_read(&page->_count) == 0); | 303 | VM_BUG_ON(atomic_read(&page->_count) == 0); |
314 | atomic_inc(&page->_count); | 304 | atomic_inc(&page->_count); |
315 | } | 305 | } |
316 | 306 | ||
317 | static inline struct page *virt_to_head_page(const void *x) | 307 | static inline struct page *virt_to_head_page(const void *x) |
318 | { | 308 | { |
319 | struct page *page = virt_to_page(x); | 309 | struct page *page = virt_to_page(x); |
320 | return compound_head(page); | 310 | return compound_head(page); |
321 | } | 311 | } |
322 | 312 | ||
323 | /* | 313 | /* |
324 | * Setup the page count before being freed into the page allocator for | 314 | * Setup the page count before being freed into the page allocator for |
325 | * the first time (boot or memory hotplug) | 315 | * the first time (boot or memory hotplug) |
326 | */ | 316 | */ |
327 | static inline void init_page_count(struct page *page) | 317 | static inline void init_page_count(struct page *page) |
328 | { | 318 | { |
329 | atomic_set(&page->_count, 1); | 319 | atomic_set(&page->_count, 1); |
330 | } | 320 | } |
331 | 321 | ||
332 | void put_page(struct page *page); | 322 | void put_page(struct page *page); |
333 | void put_pages_list(struct list_head *pages); | 323 | void put_pages_list(struct list_head *pages); |
334 | 324 | ||
335 | void split_page(struct page *page, unsigned int order); | 325 | void split_page(struct page *page, unsigned int order); |
336 | 326 | ||
337 | /* | 327 | /* |
338 | * Compound pages have a destructor function. Provide a | 328 | * Compound pages have a destructor function. Provide a |
339 | * prototype for that function and accessor functions. | 329 | * prototype for that function and accessor functions. |
340 | * These are _only_ valid on the head of a PG_compound page. | 330 | * These are _only_ valid on the head of a PG_compound page. |
341 | */ | 331 | */ |
342 | typedef void compound_page_dtor(struct page *); | 332 | typedef void compound_page_dtor(struct page *); |
343 | 333 | ||
344 | static inline void set_compound_page_dtor(struct page *page, | 334 | static inline void set_compound_page_dtor(struct page *page, |
345 | compound_page_dtor *dtor) | 335 | compound_page_dtor *dtor) |
346 | { | 336 | { |
347 | page[1].lru.next = (void *)dtor; | 337 | page[1].lru.next = (void *)dtor; |
348 | } | 338 | } |
349 | 339 | ||
350 | static inline compound_page_dtor *get_compound_page_dtor(struct page *page) | 340 | static inline compound_page_dtor *get_compound_page_dtor(struct page *page) |
351 | { | 341 | { |
352 | return (compound_page_dtor *)page[1].lru.next; | 342 | return (compound_page_dtor *)page[1].lru.next; |
353 | } | 343 | } |
354 | 344 | ||
355 | static inline int compound_order(struct page *page) | 345 | static inline int compound_order(struct page *page) |
356 | { | 346 | { |
357 | if (!PageHead(page)) | 347 | if (!PageHead(page)) |
358 | return 0; | 348 | return 0; |
359 | return (unsigned long)page[1].lru.prev; | 349 | return (unsigned long)page[1].lru.prev; |
360 | } | 350 | } |
361 | 351 | ||
362 | static inline void set_compound_order(struct page *page, unsigned long order) | 352 | static inline void set_compound_order(struct page *page, unsigned long order) |
363 | { | 353 | { |
364 | page[1].lru.prev = (void *)order; | 354 | page[1].lru.prev = (void *)order; |
365 | } | 355 | } |
366 | 356 | ||
367 | /* | 357 | /* |
368 | * Multiple processes may "see" the same page. E.g. for untouched | 358 | * Multiple processes may "see" the same page. E.g. for untouched |
369 | * mappings of /dev/null, all processes see the same page full of | 359 | * mappings of /dev/null, all processes see the same page full of |
370 | * zeroes, and text pages of executables and shared libraries have | 360 | * zeroes, and text pages of executables and shared libraries have |
371 | * only one copy in memory, at most, normally. | 361 | * only one copy in memory, at most, normally. |
372 | * | 362 | * |
373 | * For the non-reserved pages, page_count(page) denotes a reference count. | 363 | * For the non-reserved pages, page_count(page) denotes a reference count. |
374 | * page_count() == 0 means the page is free. page->lru is then used for | 364 | * page_count() == 0 means the page is free. page->lru is then used for |
375 | * freelist management in the buddy allocator. | 365 | * freelist management in the buddy allocator. |
376 | * page_count() > 0 means the page has been allocated. | 366 | * page_count() > 0 means the page has been allocated. |
377 | * | 367 | * |
378 | * Pages are allocated by the slab allocator in order to provide memory | 368 | * Pages are allocated by the slab allocator in order to provide memory |
379 | * to kmalloc and kmem_cache_alloc. In this case, the management of the | 369 | * to kmalloc and kmem_cache_alloc. In this case, the management of the |
380 | * page, and the fields in 'struct page' are the responsibility of mm/slab.c | 370 | * page, and the fields in 'struct page' are the responsibility of mm/slab.c |
381 | * unless a particular usage is carefully commented. (the responsibility of | 371 | * unless a particular usage is carefully commented. (the responsibility of |
382 | * freeing the kmalloc memory is the caller's, of course). | 372 | * freeing the kmalloc memory is the caller's, of course). |
383 | * | 373 | * |
384 | * A page may be used by anyone else who does a __get_free_page(). | 374 | * A page may be used by anyone else who does a __get_free_page(). |
385 | * In this case, page_count still tracks the references, and should only | 375 | * In this case, page_count still tracks the references, and should only |
386 | * be used through the normal accessor functions. The top bits of page->flags | 376 | * be used through the normal accessor functions. The top bits of page->flags |
387 | * and page->virtual store page management information, but all other fields | 377 | * and page->virtual store page management information, but all other fields |
388 | * are unused and could be used privately, carefully. The management of this | 378 | * are unused and could be used privately, carefully. The management of this |
389 | * page is the responsibility of the one who allocated it, and those who have | 379 | * page is the responsibility of the one who allocated it, and those who have |
390 | * subsequently been given references to it. | 380 | * subsequently been given references to it. |
391 | * | 381 | * |
392 | * The other pages (we may call them "pagecache pages") are completely | 382 | * The other pages (we may call them "pagecache pages") are completely |
393 | * managed by the Linux memory manager: I/O, buffers, swapping etc. | 383 | * managed by the Linux memory manager: I/O, buffers, swapping etc. |
394 | * The following discussion applies only to them. | 384 | * The following discussion applies only to them. |
395 | * | 385 | * |
396 | * A pagecache page contains an opaque `private' member, which belongs to the | 386 | * A pagecache page contains an opaque `private' member, which belongs to the |
397 | * page's address_space. Usually, this is the address of a circular list of | 387 | * page's address_space. Usually, this is the address of a circular list of |
398 | * the page's disk buffers. PG_private must be set to tell the VM to call | 388 | * the page's disk buffers. PG_private must be set to tell the VM to call |
399 | * into the filesystem to release these pages. | 389 | * into the filesystem to release these pages. |
400 | * | 390 | * |
401 | * A page may belong to an inode's memory mapping. In this case, page->mapping | 391 | * A page may belong to an inode's memory mapping. In this case, page->mapping |
402 | * is the pointer to the inode, and page->index is the file offset of the page, | 392 | * is the pointer to the inode, and page->index is the file offset of the page, |
403 | * in units of PAGE_CACHE_SIZE. | 393 | * in units of PAGE_CACHE_SIZE. |
404 | * | 394 | * |
405 | * If pagecache pages are not associated with an inode, they are said to be | 395 | * If pagecache pages are not associated with an inode, they are said to be |
406 | * anonymous pages. These may become associated with the swapcache, and in that | 396 | * anonymous pages. These may become associated with the swapcache, and in that |
407 | * case PG_swapcache is set, and page->private is an offset into the swapcache. | 397 | * case PG_swapcache is set, and page->private is an offset into the swapcache. |
408 | * | 398 | * |
409 | * In either case (swapcache or inode backed), the pagecache itself holds one | 399 | * In either case (swapcache or inode backed), the pagecache itself holds one |
410 | * reference to the page. Setting PG_private should also increment the | 400 | * reference to the page. Setting PG_private should also increment the |
 * refcount. Each user mapping also has a reference to the page.
412 | * | 402 | * |
413 | * The pagecache pages are stored in a per-mapping radix tree, which is | 403 | * The pagecache pages are stored in a per-mapping radix tree, which is |
414 | * rooted at mapping->page_tree, and indexed by offset. | 404 | * rooted at mapping->page_tree, and indexed by offset. |
415 | * Where 2.4 and early 2.6 kernels kept dirty/clean pages in per-address_space | 405 | * Where 2.4 and early 2.6 kernels kept dirty/clean pages in per-address_space |
416 | * lists, we instead now tag pages as dirty/writeback in the radix tree. | 406 | * lists, we instead now tag pages as dirty/writeback in the radix tree. |
417 | * | 407 | * |
418 | * All pagecache pages may be subject to I/O: | 408 | * All pagecache pages may be subject to I/O: |
419 | * - inode pages may need to be read from disk, | 409 | * - inode pages may need to be read from disk, |
420 | * - inode pages which have been modified and are MAP_SHARED may need | 410 | * - inode pages which have been modified and are MAP_SHARED may need |
421 | * to be written back to the inode on disk, | 411 | * to be written back to the inode on disk, |
422 | * - anonymous pages (including MAP_PRIVATE file mappings) which have been | 412 | * - anonymous pages (including MAP_PRIVATE file mappings) which have been |
423 | * modified may need to be swapped out to swap space and (later) to be read | 413 | * modified may need to be swapped out to swap space and (later) to be read |
424 | * back into memory. | 414 | * back into memory. |
425 | */ | 415 | */ |
426 | 416 | ||
427 | /* | 417 | /* |
428 | * The zone field is never updated after free_area_init_core() | 418 | * The zone field is never updated after free_area_init_core() |
429 | * sets it, so none of the operations on it need to be atomic. | 419 | * sets it, so none of the operations on it need to be atomic. |
430 | */ | 420 | */ |
431 | 421 | ||
432 | 422 | ||
433 | /* | 423 | /* |
434 | * page->flags layout: | 424 | * page->flags layout: |
435 | * | 425 | * |
436 | * There are three possibilities for how page->flags get | 426 | * There are three possibilities for how page->flags get |
437 | * laid out. The first is for the normal case, without | 427 | * laid out. The first is for the normal case, without |
438 | * sparsemem. The second is for sparsemem when there is | 428 | * sparsemem. The second is for sparsemem when there is |
439 | * plenty of space for node and section. The last is when | 429 | * plenty of space for node and section. The last is when |
440 | * we have run out of space and have to fall back to an | 430 | * we have run out of space and have to fall back to an |
441 | * alternate (slower) way of determining the node. | 431 | * alternate (slower) way of determining the node. |
442 | * | 432 | * |
443 | * No sparsemem or sparsemem vmemmap: | NODE | ZONE | ... | FLAGS | | 433 | * No sparsemem or sparsemem vmemmap: | NODE | ZONE | ... | FLAGS | |
444 | * classic sparse with space for node:| SECTION | NODE | ZONE | ... | FLAGS | | 434 | * classic sparse with space for node:| SECTION | NODE | ZONE | ... | FLAGS | |
445 | * classic sparse no space for node: | SECTION | ZONE | ... | FLAGS | | 435 | * classic sparse no space for node: | SECTION | ZONE | ... | FLAGS | |
446 | */ | 436 | */ |
447 | #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) | 437 | #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) |
448 | #define SECTIONS_WIDTH SECTIONS_SHIFT | 438 | #define SECTIONS_WIDTH SECTIONS_SHIFT |
449 | #else | 439 | #else |
450 | #define SECTIONS_WIDTH 0 | 440 | #define SECTIONS_WIDTH 0 |
451 | #endif | 441 | #endif |
452 | 442 | ||
453 | #define ZONES_WIDTH ZONES_SHIFT | 443 | #define ZONES_WIDTH ZONES_SHIFT |
454 | 444 | ||
455 | #if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS | 445 | #if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS |
456 | #define NODES_WIDTH NODES_SHIFT | 446 | #define NODES_WIDTH NODES_SHIFT |
457 | #else | 447 | #else |
458 | #ifdef CONFIG_SPARSEMEM_VMEMMAP | 448 | #ifdef CONFIG_SPARSEMEM_VMEMMAP |
459 | #error "Vmemmap: No space for nodes field in page flags" | 449 | #error "Vmemmap: No space for nodes field in page flags" |
460 | #endif | 450 | #endif |
461 | #define NODES_WIDTH 0 | 451 | #define NODES_WIDTH 0 |
462 | #endif | 452 | #endif |
463 | 453 | ||
464 | /* Page flags: | [SECTION] | [NODE] | ZONE | ... | FLAGS | */ | 454 | /* Page flags: | [SECTION] | [NODE] | ZONE | ... | FLAGS | */ |
465 | #define SECTIONS_PGOFF ((sizeof(unsigned long)*8) - SECTIONS_WIDTH) | 455 | #define SECTIONS_PGOFF ((sizeof(unsigned long)*8) - SECTIONS_WIDTH) |
466 | #define NODES_PGOFF (SECTIONS_PGOFF - NODES_WIDTH) | 456 | #define NODES_PGOFF (SECTIONS_PGOFF - NODES_WIDTH) |
467 | #define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH) | 457 | #define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH) |
468 | 458 | ||
469 | /* | 459 | /* |
470 | * We are going to use the flags for the page to node mapping if its in | 460 | * We are going to use the flags for the page to node mapping if its in |
471 | * there. This includes the case where there is no node, so it is implicit. | 461 | * there. This includes the case where there is no node, so it is implicit. |
472 | */ | 462 | */ |
473 | #if !(NODES_WIDTH > 0 || NODES_SHIFT == 0) | 463 | #if !(NODES_WIDTH > 0 || NODES_SHIFT == 0) |
474 | #define NODE_NOT_IN_PAGE_FLAGS | 464 | #define NODE_NOT_IN_PAGE_FLAGS |
475 | #endif | 465 | #endif |
476 | 466 | ||
477 | #ifndef PFN_SECTION_SHIFT | 467 | #ifndef PFN_SECTION_SHIFT |
478 | #define PFN_SECTION_SHIFT 0 | 468 | #define PFN_SECTION_SHIFT 0 |
479 | #endif | 469 | #endif |
480 | 470 | ||
481 | /* | 471 | /* |
 * Define the bit shifts to access each section. For non-existent
483 | * sections we define the shift as 0; that plus a 0 mask ensures | 473 | * sections we define the shift as 0; that plus a 0 mask ensures |
484 | * the compiler will optimise away reference to them. | 474 | * the compiler will optimise away reference to them. |
485 | */ | 475 | */ |
486 | #define SECTIONS_PGSHIFT (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0)) | 476 | #define SECTIONS_PGSHIFT (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0)) |
487 | #define NODES_PGSHIFT (NODES_PGOFF * (NODES_WIDTH != 0)) | 477 | #define NODES_PGSHIFT (NODES_PGOFF * (NODES_WIDTH != 0)) |
488 | #define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_WIDTH != 0)) | 478 | #define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_WIDTH != 0)) |
489 | 479 | ||
/* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */
/*
 * BUGFIX: the conditional previously tested NODE_NOT_IN_PAGEFLAGS, which is
 * never defined (the macro defined above is NODE_NOT_IN_PAGE_FLAGS), so the
 * SECTIONS-based branch was dead and ZONEID always used NODES_SHIFT even
 * when the node id is not stored in page->flags.
 */
#ifdef NODE_NOT_IN_PAGE_FLAGS
#define ZONEID_SHIFT		(SECTIONS_SHIFT + ZONES_SHIFT)
#define ZONEID_PGOFF		((SECTIONS_PGOFF < ZONES_PGOFF) ? \
				 SECTIONS_PGOFF : ZONES_PGOFF)
#else
#define ZONEID_SHIFT		(NODES_SHIFT + ZONES_SHIFT)
#define ZONEID_PGOFF		((NODES_PGOFF < ZONES_PGOFF) ? \
				 NODES_PGOFF : ZONES_PGOFF)
#endif
500 | 490 | ||
501 | #define ZONEID_PGSHIFT (ZONEID_PGOFF * (ZONEID_SHIFT != 0)) | 491 | #define ZONEID_PGSHIFT (ZONEID_PGOFF * (ZONEID_SHIFT != 0)) |
502 | 492 | ||
503 | #if SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS | 493 | #if SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS |
504 | #error SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS | 494 | #error SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS |
505 | #endif | 495 | #endif |
506 | 496 | ||
507 | #define ZONES_MASK ((1UL << ZONES_WIDTH) - 1) | 497 | #define ZONES_MASK ((1UL << ZONES_WIDTH) - 1) |
508 | #define NODES_MASK ((1UL << NODES_WIDTH) - 1) | 498 | #define NODES_MASK ((1UL << NODES_WIDTH) - 1) |
509 | #define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1) | 499 | #define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1) |
510 | #define ZONEID_MASK ((1UL << ZONEID_SHIFT) - 1) | 500 | #define ZONEID_MASK ((1UL << ZONEID_SHIFT) - 1) |
511 | 501 | ||
512 | static inline enum zone_type page_zonenum(struct page *page) | 502 | static inline enum zone_type page_zonenum(struct page *page) |
513 | { | 503 | { |
514 | return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK; | 504 | return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK; |
515 | } | 505 | } |
516 | 506 | ||
517 | /* | 507 | /* |
518 | * The identification function is only used by the buddy allocator for | 508 | * The identification function is only used by the buddy allocator for |
519 | * determining if two pages could be buddies. We are not really | 509 | * determining if two pages could be buddies. We are not really |
520 | * identifying a zone since we could be using a the section number | 510 | * identifying a zone since we could be using a the section number |
521 | * id if we have not node id available in page flags. | 511 | * id if we have not node id available in page flags. |
522 | * We guarantee only that it will return the same value for two | 512 | * We guarantee only that it will return the same value for two |
523 | * combinable pages in a zone. | 513 | * combinable pages in a zone. |
524 | */ | 514 | */ |
525 | static inline int page_zone_id(struct page *page) | 515 | static inline int page_zone_id(struct page *page) |
526 | { | 516 | { |
527 | return (page->flags >> ZONEID_PGSHIFT) & ZONEID_MASK; | 517 | return (page->flags >> ZONEID_PGSHIFT) & ZONEID_MASK; |
528 | } | 518 | } |
529 | 519 | ||
/*
 * zone_to_nid - NUMA node id the given zone belongs to.
 *
 * Without CONFIG_NUMA there is exactly one node, so the id is always 0.
 */
static inline int zone_to_nid(struct zone *zone)
{
#ifdef CONFIG_NUMA
	return zone->node;
#else
	return 0;
#endif
}
538 | 528 | ||
539 | #ifdef NODE_NOT_IN_PAGE_FLAGS | 529 | #ifdef NODE_NOT_IN_PAGE_FLAGS |
540 | extern int page_to_nid(struct page *page); | 530 | extern int page_to_nid(struct page *page); |
541 | #else | 531 | #else |
542 | static inline int page_to_nid(struct page *page) | 532 | static inline int page_to_nid(struct page *page) |
543 | { | 533 | { |
544 | return (page->flags >> NODES_PGSHIFT) & NODES_MASK; | 534 | return (page->flags >> NODES_PGSHIFT) & NODES_MASK; |
545 | } | 535 | } |
546 | #endif | 536 | #endif |
547 | 537 | ||
548 | static inline struct zone *page_zone(struct page *page) | 538 | static inline struct zone *page_zone(struct page *page) |
549 | { | 539 | { |
550 | return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)]; | 540 | return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)]; |
551 | } | 541 | } |
552 | 542 | ||
553 | #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) | 543 | #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) |
554 | static inline unsigned long page_to_section(struct page *page) | 544 | static inline unsigned long page_to_section(struct page *page) |
555 | { | 545 | { |
556 | return (page->flags >> SECTIONS_PGSHIFT) & SECTIONS_MASK; | 546 | return (page->flags >> SECTIONS_PGSHIFT) & SECTIONS_MASK; |
557 | } | 547 | } |
558 | #endif | 548 | #endif |
559 | 549 | ||
560 | static inline void set_page_zone(struct page *page, enum zone_type zone) | 550 | static inline void set_page_zone(struct page *page, enum zone_type zone) |
561 | { | 551 | { |
562 | page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT); | 552 | page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT); |
563 | page->flags |= (zone & ZONES_MASK) << ZONES_PGSHIFT; | 553 | page->flags |= (zone & ZONES_MASK) << ZONES_PGSHIFT; |
564 | } | 554 | } |
565 | 555 | ||
566 | static inline void set_page_node(struct page *page, unsigned long node) | 556 | static inline void set_page_node(struct page *page, unsigned long node) |
567 | { | 557 | { |
568 | page->flags &= ~(NODES_MASK << NODES_PGSHIFT); | 558 | page->flags &= ~(NODES_MASK << NODES_PGSHIFT); |
569 | page->flags |= (node & NODES_MASK) << NODES_PGSHIFT; | 559 | page->flags |= (node & NODES_MASK) << NODES_PGSHIFT; |
570 | } | 560 | } |
571 | 561 | ||
572 | static inline void set_page_section(struct page *page, unsigned long section) | 562 | static inline void set_page_section(struct page *page, unsigned long section) |
573 | { | 563 | { |
574 | page->flags &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT); | 564 | page->flags &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT); |
575 | page->flags |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT; | 565 | page->flags |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT; |
576 | } | 566 | } |
577 | 567 | ||
578 | static inline void set_page_links(struct page *page, enum zone_type zone, | 568 | static inline void set_page_links(struct page *page, enum zone_type zone, |
579 | unsigned long node, unsigned long pfn) | 569 | unsigned long node, unsigned long pfn) |
580 | { | 570 | { |
581 | set_page_zone(page, zone); | 571 | set_page_zone(page, zone); |
582 | set_page_node(page, node); | 572 | set_page_node(page, node); |
583 | set_page_section(page, pfn_to_section_nr(pfn)); | 573 | set_page_section(page, pfn_to_section_nr(pfn)); |
584 | } | 574 | } |
585 | 575 | ||
586 | /* | 576 | /* |
587 | * If a hint addr is less than mmap_min_addr change hint to be as | 577 | * If a hint addr is less than mmap_min_addr change hint to be as |
588 | * low as possible but still greater than mmap_min_addr | 578 | * low as possible but still greater than mmap_min_addr |
589 | */ | 579 | */ |
/*
 * round_hint_to_min - push a too-low mmap hint above mmap_min_addr.
 *
 * Returns the hint unchanged unless CONFIG_SECURITY is enabled and the
 * (page-aligned) hint is non-NULL but below mmap_min_addr, in which case
 * the lowest permissible page-aligned address is returned instead.
 */
static inline unsigned long round_hint_to_min(unsigned long hint)
{
#ifdef CONFIG_SECURITY
	hint &= PAGE_MASK;
	/* A zero hint means "no preference" and must not be rounded up. */
	if (hint != 0 && hint < mmap_min_addr)
		return PAGE_ALIGN(mmap_min_addr);
#endif
	return hint;
}
600 | 590 | ||
601 | /* | 591 | /* |
602 | * Some inline functions in vmstat.h depend on page_zone() | 592 | * Some inline functions in vmstat.h depend on page_zone() |
603 | */ | 593 | */ |
604 | #include <linux/vmstat.h> | 594 | #include <linux/vmstat.h> |
605 | 595 | ||
606 | static __always_inline void *lowmem_page_address(struct page *page) | 596 | static __always_inline void *lowmem_page_address(struct page *page) |
607 | { | 597 | { |
608 | return __va(page_to_pfn(page) << PAGE_SHIFT); | 598 | return __va(page_to_pfn(page) << PAGE_SHIFT); |
609 | } | 599 | } |
610 | 600 | ||
611 | #if defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) | 601 | #if defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) |
612 | #define HASHED_PAGE_VIRTUAL | 602 | #define HASHED_PAGE_VIRTUAL |
613 | #endif | 603 | #endif |
614 | 604 | ||
615 | #if defined(WANT_PAGE_VIRTUAL) | 605 | #if defined(WANT_PAGE_VIRTUAL) |
616 | #define page_address(page) ((page)->virtual) | 606 | #define page_address(page) ((page)->virtual) |
617 | #define set_page_address(page, address) \ | 607 | #define set_page_address(page, address) \ |
618 | do { \ | 608 | do { \ |
619 | (page)->virtual = (address); \ | 609 | (page)->virtual = (address); \ |
620 | } while(0) | 610 | } while(0) |
621 | #define page_address_init() do { } while(0) | 611 | #define page_address_init() do { } while(0) |
622 | #endif | 612 | #endif |
623 | 613 | ||
624 | #if defined(HASHED_PAGE_VIRTUAL) | 614 | #if defined(HASHED_PAGE_VIRTUAL) |
625 | void *page_address(struct page *page); | 615 | void *page_address(struct page *page); |
626 | void set_page_address(struct page *page, void *virtual); | 616 | void set_page_address(struct page *page, void *virtual); |
627 | void page_address_init(void); | 617 | void page_address_init(void); |
628 | #endif | 618 | #endif |
629 | 619 | ||
630 | #if !defined(HASHED_PAGE_VIRTUAL) && !defined(WANT_PAGE_VIRTUAL) | 620 | #if !defined(HASHED_PAGE_VIRTUAL) && !defined(WANT_PAGE_VIRTUAL) |
631 | #define page_address(page) lowmem_page_address(page) | 621 | #define page_address(page) lowmem_page_address(page) |
632 | #define set_page_address(page, address) do { } while(0) | 622 | #define set_page_address(page, address) do { } while(0) |
633 | #define page_address_init() do { } while(0) | 623 | #define page_address_init() do { } while(0) |
634 | #endif | 624 | #endif |
635 | 625 | ||
636 | /* | 626 | /* |
637 | * On an anonymous page mapped into a user virtual memory area, | 627 | * On an anonymous page mapped into a user virtual memory area, |
638 | * page->mapping points to its anon_vma, not to a struct address_space; | 628 | * page->mapping points to its anon_vma, not to a struct address_space; |
639 | * with the PAGE_MAPPING_ANON bit set to distinguish it. | 629 | * with the PAGE_MAPPING_ANON bit set to distinguish it. |
640 | * | 630 | * |
641 | * Please note that, confusingly, "page_mapping" refers to the inode | 631 | * Please note that, confusingly, "page_mapping" refers to the inode |
642 | * address_space which maps the page from disk; whereas "page_mapped" | 632 | * address_space which maps the page from disk; whereas "page_mapped" |
643 | * refers to user virtual address space into which the page is mapped. | 633 | * refers to user virtual address space into which the page is mapped. |
644 | */ | 634 | */ |
645 | #define PAGE_MAPPING_ANON 1 | 635 | #define PAGE_MAPPING_ANON 1 |
646 | 636 | ||
647 | extern struct address_space swapper_space; | 637 | extern struct address_space swapper_space; |
648 | static inline struct address_space *page_mapping(struct page *page) | 638 | static inline struct address_space *page_mapping(struct page *page) |
649 | { | 639 | { |
650 | struct address_space *mapping = page->mapping; | 640 | struct address_space *mapping = page->mapping; |
651 | 641 | ||
652 | VM_BUG_ON(PageSlab(page)); | 642 | VM_BUG_ON(PageSlab(page)); |
653 | #ifdef CONFIG_SWAP | 643 | #ifdef CONFIG_SWAP |
654 | if (unlikely(PageSwapCache(page))) | 644 | if (unlikely(PageSwapCache(page))) |
655 | mapping = &swapper_space; | 645 | mapping = &swapper_space; |
656 | else | 646 | else |
657 | #endif | 647 | #endif |
658 | if (unlikely((unsigned long)mapping & PAGE_MAPPING_ANON)) | 648 | if (unlikely((unsigned long)mapping & PAGE_MAPPING_ANON)) |
659 | mapping = NULL; | 649 | mapping = NULL; |
660 | return mapping; | 650 | return mapping; |
661 | } | 651 | } |
662 | 652 | ||
663 | static inline int PageAnon(struct page *page) | 653 | static inline int PageAnon(struct page *page) |
664 | { | 654 | { |
665 | return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0; | 655 | return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0; |
666 | } | 656 | } |
667 | 657 | ||
668 | /* | 658 | /* |
669 | * Return the pagecache index of the passed page. Regular pagecache pages | 659 | * Return the pagecache index of the passed page. Regular pagecache pages |
670 | * use ->index whereas swapcache pages use ->private | 660 | * use ->index whereas swapcache pages use ->private |
671 | */ | 661 | */ |
672 | static inline pgoff_t page_index(struct page *page) | 662 | static inline pgoff_t page_index(struct page *page) |
673 | { | 663 | { |
674 | if (unlikely(PageSwapCache(page))) | 664 | if (unlikely(PageSwapCache(page))) |
675 | return page_private(page); | 665 | return page_private(page); |
676 | return page->index; | 666 | return page->index; |
677 | } | 667 | } |
678 | 668 | ||
679 | /* | 669 | /* |
680 | * The atomic page->_mapcount, like _count, starts from -1: | 670 | * The atomic page->_mapcount, like _count, starts from -1: |
681 | * so that transitions both from it and to it can be tracked, | 671 | * so that transitions both from it and to it can be tracked, |
682 | * using atomic_inc_and_test and atomic_add_negative(-1). | 672 | * using atomic_inc_and_test and atomic_add_negative(-1). |
683 | */ | 673 | */ |
684 | static inline void reset_page_mapcount(struct page *page) | 674 | static inline void reset_page_mapcount(struct page *page) |
685 | { | 675 | { |
686 | atomic_set(&(page)->_mapcount, -1); | 676 | atomic_set(&(page)->_mapcount, -1); |
687 | } | 677 | } |
688 | 678 | ||
689 | static inline int page_mapcount(struct page *page) | 679 | static inline int page_mapcount(struct page *page) |
690 | { | 680 | { |
691 | return atomic_read(&(page)->_mapcount) + 1; | 681 | return atomic_read(&(page)->_mapcount) + 1; |
692 | } | 682 | } |
693 | 683 | ||
694 | /* | 684 | /* |
695 | * Return true if this page is mapped into pagetables. | 685 | * Return true if this page is mapped into pagetables. |
696 | */ | 686 | */ |
697 | static inline int page_mapped(struct page *page) | 687 | static inline int page_mapped(struct page *page) |
698 | { | 688 | { |
699 | return atomic_read(&(page)->_mapcount) >= 0; | 689 | return atomic_read(&(page)->_mapcount) >= 0; |
700 | } | 690 | } |
701 | 691 | ||
702 | /* | 692 | /* |
703 | * Different kinds of faults, as returned by handle_mm_fault(). | 693 | * Different kinds of faults, as returned by handle_mm_fault(). |
704 | * Used to decide whether a process gets delivered SIGBUS or | 694 | * Used to decide whether a process gets delivered SIGBUS or |
705 | * just gets major/minor fault counters bumped up. | 695 | * just gets major/minor fault counters bumped up. |
706 | */ | 696 | */ |
707 | 697 | ||
708 | #define VM_FAULT_MINOR 0 /* For backwards compat. Remove me quickly. */ | 698 | #define VM_FAULT_MINOR 0 /* For backwards compat. Remove me quickly. */ |
709 | 699 | ||
710 | #define VM_FAULT_OOM 0x0001 | 700 | #define VM_FAULT_OOM 0x0001 |
711 | #define VM_FAULT_SIGBUS 0x0002 | 701 | #define VM_FAULT_SIGBUS 0x0002 |
712 | #define VM_FAULT_MAJOR 0x0004 | 702 | #define VM_FAULT_MAJOR 0x0004 |
713 | #define VM_FAULT_WRITE 0x0008 /* Special case for get_user_pages */ | 703 | #define VM_FAULT_WRITE 0x0008 /* Special case for get_user_pages */ |
714 | 704 | ||
715 | #define VM_FAULT_NOPAGE 0x0100 /* ->fault installed the pte, not return page */ | 705 | #define VM_FAULT_NOPAGE 0x0100 /* ->fault installed the pte, not return page */ |
716 | #define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */ | 706 | #define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */ |
717 | 707 | ||
718 | #define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS) | 708 | #define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS) |
719 | 709 | ||
720 | /* | 710 | /* |
721 | * Can be called by the pagefault handler when it gets a VM_FAULT_OOM. | 711 | * Can be called by the pagefault handler when it gets a VM_FAULT_OOM. |
722 | */ | 712 | */ |
723 | extern void pagefault_out_of_memory(void); | 713 | extern void pagefault_out_of_memory(void); |
724 | 714 | ||
725 | #define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK) | 715 | #define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK) |
726 | 716 | ||
727 | extern void show_free_areas(void); | 717 | extern void show_free_areas(void); |
728 | 718 | ||
729 | #ifdef CONFIG_SHMEM | 719 | #ifdef CONFIG_SHMEM |
730 | extern int shmem_lock(struct file *file, int lock, struct user_struct *user); | 720 | extern int shmem_lock(struct file *file, int lock, struct user_struct *user); |
731 | #else | 721 | #else |
732 | static inline int shmem_lock(struct file *file, int lock, | 722 | static inline int shmem_lock(struct file *file, int lock, |
733 | struct user_struct *user) | 723 | struct user_struct *user) |
734 | { | 724 | { |
735 | return 0; | 725 | return 0; |
736 | } | 726 | } |
737 | #endif | 727 | #endif |
738 | struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags); | 728 | struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags); |
739 | 729 | ||
740 | int shmem_zero_setup(struct vm_area_struct *); | 730 | int shmem_zero_setup(struct vm_area_struct *); |
741 | 731 | ||
742 | #ifndef CONFIG_MMU | 732 | #ifndef CONFIG_MMU |
743 | extern unsigned long shmem_get_unmapped_area(struct file *file, | 733 | extern unsigned long shmem_get_unmapped_area(struct file *file, |
744 | unsigned long addr, | 734 | unsigned long addr, |
745 | unsigned long len, | 735 | unsigned long len, |
746 | unsigned long pgoff, | 736 | unsigned long pgoff, |
747 | unsigned long flags); | 737 | unsigned long flags); |
748 | #endif | 738 | #endif |
749 | 739 | ||
750 | extern int can_do_mlock(void); | 740 | extern int can_do_mlock(void); |
751 | extern int user_shm_lock(size_t, struct user_struct *); | 741 | extern int user_shm_lock(size_t, struct user_struct *); |
752 | extern void user_shm_unlock(size_t, struct user_struct *); | 742 | extern void user_shm_unlock(size_t, struct user_struct *); |
753 | 743 | ||
754 | /* | 744 | /* |
755 | * Parameter block passed down to zap_pte_range in exceptional cases. | 745 | * Parameter block passed down to zap_pte_range in exceptional cases. |
756 | */ | 746 | */ |
757 | struct zap_details { | 747 | struct zap_details { |
758 | struct vm_area_struct *nonlinear_vma; /* Check page->index if set */ | 748 | struct vm_area_struct *nonlinear_vma; /* Check page->index if set */ |
759 | struct address_space *check_mapping; /* Check page->mapping if set */ | 749 | struct address_space *check_mapping; /* Check page->mapping if set */ |
760 | pgoff_t first_index; /* Lowest page->index to unmap */ | 750 | pgoff_t first_index; /* Lowest page->index to unmap */ |
761 | pgoff_t last_index; /* Highest page->index to unmap */ | 751 | pgoff_t last_index; /* Highest page->index to unmap */ |
762 | spinlock_t *i_mmap_lock; /* For unmap_mapping_range: */ | 752 | spinlock_t *i_mmap_lock; /* For unmap_mapping_range: */ |
763 | unsigned long truncate_count; /* Compare vm_truncate_count */ | 753 | unsigned long truncate_count; /* Compare vm_truncate_count */ |
764 | }; | 754 | }; |
765 | 755 | ||
766 | struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, | 756 | struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, |
767 | pte_t pte); | 757 | pte_t pte); |
768 | 758 | ||
769 | int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, | 759 | int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, |
770 | unsigned long size); | 760 | unsigned long size); |
771 | unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, | 761 | unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, |
772 | unsigned long size, struct zap_details *); | 762 | unsigned long size, struct zap_details *); |
773 | unsigned long unmap_vmas(struct mmu_gather **tlb, | 763 | unsigned long unmap_vmas(struct mmu_gather **tlb, |
774 | struct vm_area_struct *start_vma, unsigned long start_addr, | 764 | struct vm_area_struct *start_vma, unsigned long start_addr, |
775 | unsigned long end_addr, unsigned long *nr_accounted, | 765 | unsigned long end_addr, unsigned long *nr_accounted, |
776 | struct zap_details *); | 766 | struct zap_details *); |
777 | 767 | ||
778 | /** | 768 | /** |
779 | * mm_walk - callbacks for walk_page_range | 769 | * mm_walk - callbacks for walk_page_range |
780 | * @pgd_entry: if set, called for each non-empty PGD (top-level) entry | 770 | * @pgd_entry: if set, called for each non-empty PGD (top-level) entry |
781 | * @pud_entry: if set, called for each non-empty PUD (2nd-level) entry | 771 | * @pud_entry: if set, called for each non-empty PUD (2nd-level) entry |
782 | * @pmd_entry: if set, called for each non-empty PMD (3rd-level) entry | 772 | * @pmd_entry: if set, called for each non-empty PMD (3rd-level) entry |
783 | * @pte_entry: if set, called for each non-empty PTE (4th-level) entry | 773 | * @pte_entry: if set, called for each non-empty PTE (4th-level) entry |
784 | * @pte_hole: if set, called for each hole at all levels | 774 | * @pte_hole: if set, called for each hole at all levels |
785 | * | 775 | * |
786 | * (see walk_page_range for more details) | 776 | * (see walk_page_range for more details) |
787 | */ | 777 | */ |
788 | struct mm_walk { | 778 | struct mm_walk { |
789 | int (*pgd_entry)(pgd_t *, unsigned long, unsigned long, struct mm_walk *); | 779 | int (*pgd_entry)(pgd_t *, unsigned long, unsigned long, struct mm_walk *); |
790 | int (*pud_entry)(pud_t *, unsigned long, unsigned long, struct mm_walk *); | 780 | int (*pud_entry)(pud_t *, unsigned long, unsigned long, struct mm_walk *); |
791 | int (*pmd_entry)(pmd_t *, unsigned long, unsigned long, struct mm_walk *); | 781 | int (*pmd_entry)(pmd_t *, unsigned long, unsigned long, struct mm_walk *); |
792 | int (*pte_entry)(pte_t *, unsigned long, unsigned long, struct mm_walk *); | 782 | int (*pte_entry)(pte_t *, unsigned long, unsigned long, struct mm_walk *); |
793 | int (*pte_hole)(unsigned long, unsigned long, struct mm_walk *); | 783 | int (*pte_hole)(unsigned long, unsigned long, struct mm_walk *); |
794 | struct mm_struct *mm; | 784 | struct mm_struct *mm; |
795 | void *private; | 785 | void *private; |
796 | }; | 786 | }; |
797 | 787 | ||
798 | int walk_page_range(unsigned long addr, unsigned long end, | 788 | int walk_page_range(unsigned long addr, unsigned long end, |
799 | struct mm_walk *walk); | 789 | struct mm_walk *walk); |
800 | void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, | 790 | void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, |
801 | unsigned long end, unsigned long floor, unsigned long ceiling); | 791 | unsigned long end, unsigned long floor, unsigned long ceiling); |
802 | int copy_page_range(struct mm_struct *dst, struct mm_struct *src, | 792 | int copy_page_range(struct mm_struct *dst, struct mm_struct *src, |
803 | struct vm_area_struct *vma); | 793 | struct vm_area_struct *vma); |
804 | void unmap_mapping_range(struct address_space *mapping, | 794 | void unmap_mapping_range(struct address_space *mapping, |
805 | loff_t const holebegin, loff_t const holelen, int even_cows); | 795 | loff_t const holebegin, loff_t const holelen, int even_cows); |
806 | int follow_phys(struct vm_area_struct *vma, unsigned long address, | 796 | int follow_phys(struct vm_area_struct *vma, unsigned long address, |
807 | unsigned int flags, unsigned long *prot, resource_size_t *phys); | 797 | unsigned int flags, unsigned long *prot, resource_size_t *phys); |
808 | int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, | 798 | int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, |
809 | void *buf, int len, int write); | 799 | void *buf, int len, int write); |
810 | 800 | ||
811 | static inline void unmap_shared_mapping_range(struct address_space *mapping, | 801 | static inline void unmap_shared_mapping_range(struct address_space *mapping, |
812 | loff_t const holebegin, loff_t const holelen) | 802 | loff_t const holebegin, loff_t const holelen) |
813 | { | 803 | { |
814 | unmap_mapping_range(mapping, holebegin, holelen, 0); | 804 | unmap_mapping_range(mapping, holebegin, holelen, 0); |
815 | } | 805 | } |
816 | 806 | ||
817 | extern int vmtruncate(struct inode * inode, loff_t offset); | 807 | extern int vmtruncate(struct inode * inode, loff_t offset); |
818 | extern int vmtruncate_range(struct inode * inode, loff_t offset, loff_t end); | 808 | extern int vmtruncate_range(struct inode * inode, loff_t offset, loff_t end); |
819 | 809 | ||
820 | #ifdef CONFIG_MMU | 810 | #ifdef CONFIG_MMU |
821 | extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, | 811 | extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, |
822 | unsigned long address, int write_access); | 812 | unsigned long address, int write_access); |
823 | #else | 813 | #else |
824 | static inline int handle_mm_fault(struct mm_struct *mm, | 814 | static inline int handle_mm_fault(struct mm_struct *mm, |
825 | struct vm_area_struct *vma, unsigned long address, | 815 | struct vm_area_struct *vma, unsigned long address, |
826 | int write_access) | 816 | int write_access) |
827 | { | 817 | { |
828 | /* should never happen if there's no MMU */ | 818 | /* should never happen if there's no MMU */ |
829 | BUG(); | 819 | BUG(); |
830 | return VM_FAULT_SIGBUS; | 820 | return VM_FAULT_SIGBUS; |
831 | } | 821 | } |
832 | #endif | 822 | #endif |
833 | 823 | ||
834 | extern int make_pages_present(unsigned long addr, unsigned long end); | 824 | extern int make_pages_present(unsigned long addr, unsigned long end); |
835 | extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write); | 825 | extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write); |
836 | 826 | ||
837 | int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, | 827 | int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, |
838 | int len, int write, int force, struct page **pages, struct vm_area_struct **vmas); | 828 | int len, int write, int force, struct page **pages, struct vm_area_struct **vmas); |
839 | 829 | ||
840 | extern int try_to_release_page(struct page * page, gfp_t gfp_mask); | 830 | extern int try_to_release_page(struct page * page, gfp_t gfp_mask); |
841 | extern void do_invalidatepage(struct page *page, unsigned long offset); | 831 | extern void do_invalidatepage(struct page *page, unsigned long offset); |
842 | 832 | ||
843 | int __set_page_dirty_nobuffers(struct page *page); | 833 | int __set_page_dirty_nobuffers(struct page *page); |
844 | int __set_page_dirty_no_writeback(struct page *page); | 834 | int __set_page_dirty_no_writeback(struct page *page); |
845 | int redirty_page_for_writepage(struct writeback_control *wbc, | 835 | int redirty_page_for_writepage(struct writeback_control *wbc, |
846 | struct page *page); | 836 | struct page *page); |
847 | int set_page_dirty(struct page *page); | 837 | int set_page_dirty(struct page *page); |
848 | int set_page_dirty_lock(struct page *page); | 838 | int set_page_dirty_lock(struct page *page); |
849 | int clear_page_dirty_for_io(struct page *page); | 839 | int clear_page_dirty_for_io(struct page *page); |
850 | 840 | ||
851 | extern unsigned long move_page_tables(struct vm_area_struct *vma, | 841 | extern unsigned long move_page_tables(struct vm_area_struct *vma, |
852 | unsigned long old_addr, struct vm_area_struct *new_vma, | 842 | unsigned long old_addr, struct vm_area_struct *new_vma, |
853 | unsigned long new_addr, unsigned long len); | 843 | unsigned long new_addr, unsigned long len); |
854 | extern unsigned long do_mremap(unsigned long addr, | 844 | extern unsigned long do_mremap(unsigned long addr, |
855 | unsigned long old_len, unsigned long new_len, | 845 | unsigned long old_len, unsigned long new_len, |
856 | unsigned long flags, unsigned long new_addr); | 846 | unsigned long flags, unsigned long new_addr); |
857 | extern int mprotect_fixup(struct vm_area_struct *vma, | 847 | extern int mprotect_fixup(struct vm_area_struct *vma, |
858 | struct vm_area_struct **pprev, unsigned long start, | 848 | struct vm_area_struct **pprev, unsigned long start, |
859 | unsigned long end, unsigned long newflags); | 849 | unsigned long end, unsigned long newflags); |
860 | 850 | ||
861 | /* | 851 | /* |
862 | * get_user_pages_fast provides equivalent functionality to get_user_pages, | 852 | * get_user_pages_fast provides equivalent functionality to get_user_pages, |
863 | * operating on current and current->mm (force=0 and doesn't return any vmas). | 853 | * operating on current and current->mm (force=0 and doesn't return any vmas). |
864 | * | 854 | * |
865 | * get_user_pages_fast may take mmap_sem and page tables, so no assumptions | 855 | * get_user_pages_fast may take mmap_sem and page tables, so no assumptions |
866 | * can be made about locking. get_user_pages_fast is to be implemented in a | 856 | * can be made about locking. get_user_pages_fast is to be implemented in a |
867 | * way that is advantageous (vs get_user_pages()) when the user memory area is | 857 | * way that is advantageous (vs get_user_pages()) when the user memory area is |
868 | * already faulted in and present in ptes. However if the pages have to be | 858 | * already faulted in and present in ptes. However if the pages have to be |
869 | * faulted in, it may turn out to be slightly slower). | 859 | * faulted in, it may turn out to be slightly slower). |
870 | */ | 860 | */ |
871 | int get_user_pages_fast(unsigned long start, int nr_pages, int write, | 861 | int get_user_pages_fast(unsigned long start, int nr_pages, int write, |
872 | struct page **pages); | 862 | struct page **pages); |
873 | 863 | ||
874 | /* | 864 | /* |
875 | * A callback you can register to apply pressure to ageable caches. | 865 | * A callback you can register to apply pressure to ageable caches. |
876 | * | 866 | * |
877 | * 'shrink' is passed a count 'nr_to_scan' and a 'gfpmask'. It should | 867 | * 'shrink' is passed a count 'nr_to_scan' and a 'gfpmask'. It should |
878 | * look through the least-recently-used 'nr_to_scan' entries and | 868 | * look through the least-recently-used 'nr_to_scan' entries and |
879 | * attempt to free them up. It should return the number of objects | 869 | * attempt to free them up. It should return the number of objects |
880 | * which remain in the cache. If it returns -1, it means it cannot do | 870 | * which remain in the cache. If it returns -1, it means it cannot do |
881 | * any scanning at this time (eg. there is a risk of deadlock). | 871 | * any scanning at this time (eg. there is a risk of deadlock). |
882 | * | 872 | * |
883 | * The 'gfpmask' refers to the allocation we are currently trying to | 873 | * The 'gfpmask' refers to the allocation we are currently trying to |
884 | * fulfil. | 874 | * fulfil. |
885 | * | 875 | * |
886 | * Note that 'shrink' will be passed nr_to_scan == 0 when the VM is | 876 | * Note that 'shrink' will be passed nr_to_scan == 0 when the VM is |
887 | * querying the cache size, so a fastpath for that case is appropriate. | 877 | * querying the cache size, so a fastpath for that case is appropriate. |
888 | */ | 878 | */ |
889 | struct shrinker { | 879 | struct shrinker { |
890 | int (*shrink)(int nr_to_scan, gfp_t gfp_mask); | 880 | int (*shrink)(int nr_to_scan, gfp_t gfp_mask); |
891 | int seeks; /* seeks to recreate an obj */ | 881 | int seeks; /* seeks to recreate an obj */ |
892 | 882 | ||
893 | /* These are for internal use */ | 883 | /* These are for internal use */ |
894 | struct list_head list; | 884 | struct list_head list; |
895 | long nr; /* objs pending delete */ | 885 | long nr; /* objs pending delete */ |
896 | }; | 886 | }; |
897 | #define DEFAULT_SEEKS 2 /* A good number if you don't know better. */ | 887 | #define DEFAULT_SEEKS 2 /* A good number if you don't know better. */ |
898 | extern void register_shrinker(struct shrinker *); | 888 | extern void register_shrinker(struct shrinker *); |
899 | extern void unregister_shrinker(struct shrinker *); | 889 | extern void unregister_shrinker(struct shrinker *); |
900 | 890 | ||
901 | int vma_wants_writenotify(struct vm_area_struct *vma); | 891 | int vma_wants_writenotify(struct vm_area_struct *vma); |
902 | 892 | ||
903 | extern pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl); | 893 | extern pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl); |
904 | 894 | ||
905 | #ifdef __PAGETABLE_PUD_FOLDED | 895 | #ifdef __PAGETABLE_PUD_FOLDED |
906 | static inline int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, | 896 | static inline int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, |
907 | unsigned long address) | 897 | unsigned long address) |
908 | { | 898 | { |
909 | return 0; | 899 | return 0; |
910 | } | 900 | } |
911 | #else | 901 | #else |
912 | int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address); | 902 | int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address); |
913 | #endif | 903 | #endif |
914 | 904 | ||
915 | #ifdef __PAGETABLE_PMD_FOLDED | 905 | #ifdef __PAGETABLE_PMD_FOLDED |
916 | static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud, | 906 | static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud, |
917 | unsigned long address) | 907 | unsigned long address) |
918 | { | 908 | { |
919 | return 0; | 909 | return 0; |
920 | } | 910 | } |
921 | #else | 911 | #else |
922 | int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address); | 912 | int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address); |
923 | #endif | 913 | #endif |
924 | 914 | ||
925 | int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address); | 915 | int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address); |
926 | int __pte_alloc_kernel(pmd_t *pmd, unsigned long address); | 916 | int __pte_alloc_kernel(pmd_t *pmd, unsigned long address); |
927 | 917 | ||
928 | /* | 918 | /* |
929 | * The following ifdef needed to get the 4level-fixup.h header to work. | 919 | * The following ifdef needed to get the 4level-fixup.h header to work. |
930 | * Remove it when 4level-fixup.h has been removed. | 920 | * Remove it when 4level-fixup.h has been removed. |
931 | */ | 921 | */ |
932 | #if defined(CONFIG_MMU) && !defined(__ARCH_HAS_4LEVEL_HACK) | 922 | #if defined(CONFIG_MMU) && !defined(__ARCH_HAS_4LEVEL_HACK) |
933 | static inline pud_t *pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) | 923 | static inline pud_t *pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) |
934 | { | 924 | { |
935 | return (unlikely(pgd_none(*pgd)) && __pud_alloc(mm, pgd, address))? | 925 | return (unlikely(pgd_none(*pgd)) && __pud_alloc(mm, pgd, address))? |
936 | NULL: pud_offset(pgd, address); | 926 | NULL: pud_offset(pgd, address); |
937 | } | 927 | } |
938 | 928 | ||
939 | static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) | 929 | static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) |
940 | { | 930 | { |
941 | return (unlikely(pud_none(*pud)) && __pmd_alloc(mm, pud, address))? | 931 | return (unlikely(pud_none(*pud)) && __pmd_alloc(mm, pud, address))? |
942 | NULL: pmd_offset(pud, address); | 932 | NULL: pmd_offset(pud, address); |
943 | } | 933 | } |
944 | #endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */ | 934 | #endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */ |
945 | 935 | ||
946 | #if USE_SPLIT_PTLOCKS | 936 | #if USE_SPLIT_PTLOCKS |
947 | /* | 937 | /* |
948 | * We tuck a spinlock to guard each pagetable page into its struct page, | 938 | * We tuck a spinlock to guard each pagetable page into its struct page, |
949 | * at page->private, with BUILD_BUG_ON to make sure that this will not | 939 | * at page->private, with BUILD_BUG_ON to make sure that this will not |
950 | * overflow into the next struct page (as it might with DEBUG_SPINLOCK). | 940 | * overflow into the next struct page (as it might with DEBUG_SPINLOCK). |
951 | * When freeing, reset page->mapping so free_pages_check won't complain. | 941 | * When freeing, reset page->mapping so free_pages_check won't complain. |
952 | */ | 942 | */ |
953 | #define __pte_lockptr(page) &((page)->ptl) | 943 | #define __pte_lockptr(page) &((page)->ptl) |
954 | #define pte_lock_init(_page) do { \ | 944 | #define pte_lock_init(_page) do { \ |
955 | spin_lock_init(__pte_lockptr(_page)); \ | 945 | spin_lock_init(__pte_lockptr(_page)); \ |
956 | } while (0) | 946 | } while (0) |
957 | #define pte_lock_deinit(page) ((page)->mapping = NULL) | 947 | #define pte_lock_deinit(page) ((page)->mapping = NULL) |
958 | #define pte_lockptr(mm, pmd) ({(void)(mm); __pte_lockptr(pmd_page(*(pmd)));}) | 948 | #define pte_lockptr(mm, pmd) ({(void)(mm); __pte_lockptr(pmd_page(*(pmd)));}) |
959 | #else /* !USE_SPLIT_PTLOCKS */ | 949 | #else /* !USE_SPLIT_PTLOCKS */ |
960 | /* | 950 | /* |
961 | * We use mm->page_table_lock to guard all pagetable pages of the mm. | 951 | * We use mm->page_table_lock to guard all pagetable pages of the mm. |
962 | */ | 952 | */ |
963 | #define pte_lock_init(page) do {} while (0) | 953 | #define pte_lock_init(page) do {} while (0) |
964 | #define pte_lock_deinit(page) do {} while (0) | 954 | #define pte_lock_deinit(page) do {} while (0) |
965 | #define pte_lockptr(mm, pmd) ({(void)(pmd); &(mm)->page_table_lock;}) | 955 | #define pte_lockptr(mm, pmd) ({(void)(pmd); &(mm)->page_table_lock;}) |
966 | #endif /* USE_SPLIT_PTLOCKS */ | 956 | #endif /* USE_SPLIT_PTLOCKS */ |
967 | 957 | ||
968 | static inline void pgtable_page_ctor(struct page *page) | 958 | static inline void pgtable_page_ctor(struct page *page) |
969 | { | 959 | { |
970 | pte_lock_init(page); | 960 | pte_lock_init(page); |
971 | inc_zone_page_state(page, NR_PAGETABLE); | 961 | inc_zone_page_state(page, NR_PAGETABLE); |
972 | } | 962 | } |
973 | 963 | ||
974 | static inline void pgtable_page_dtor(struct page *page) | 964 | static inline void pgtable_page_dtor(struct page *page) |
975 | { | 965 | { |
976 | pte_lock_deinit(page); | 966 | pte_lock_deinit(page); |
977 | dec_zone_page_state(page, NR_PAGETABLE); | 967 | dec_zone_page_state(page, NR_PAGETABLE); |
978 | } | 968 | } |
979 | 969 | ||
980 | #define pte_offset_map_lock(mm, pmd, address, ptlp) \ | 970 | #define pte_offset_map_lock(mm, pmd, address, ptlp) \ |
981 | ({ \ | 971 | ({ \ |
982 | spinlock_t *__ptl = pte_lockptr(mm, pmd); \ | 972 | spinlock_t *__ptl = pte_lockptr(mm, pmd); \ |
983 | pte_t *__pte = pte_offset_map(pmd, address); \ | 973 | pte_t *__pte = pte_offset_map(pmd, address); \ |
984 | *(ptlp) = __ptl; \ | 974 | *(ptlp) = __ptl; \ |
985 | spin_lock(__ptl); \ | 975 | spin_lock(__ptl); \ |
986 | __pte; \ | 976 | __pte; \ |
987 | }) | 977 | }) |
988 | 978 | ||
989 | #define pte_unmap_unlock(pte, ptl) do { \ | 979 | #define pte_unmap_unlock(pte, ptl) do { \ |
990 | spin_unlock(ptl); \ | 980 | spin_unlock(ptl); \ |
991 | pte_unmap(pte); \ | 981 | pte_unmap(pte); \ |
992 | } while (0) | 982 | } while (0) |
993 | 983 | ||
994 | #define pte_alloc_map(mm, pmd, address) \ | 984 | #define pte_alloc_map(mm, pmd, address) \ |
995 | ((unlikely(!pmd_present(*(pmd))) && __pte_alloc(mm, pmd, address))? \ | 985 | ((unlikely(!pmd_present(*(pmd))) && __pte_alloc(mm, pmd, address))? \ |
996 | NULL: pte_offset_map(pmd, address)) | 986 | NULL: pte_offset_map(pmd, address)) |
997 | 987 | ||
998 | #define pte_alloc_map_lock(mm, pmd, address, ptlp) \ | 988 | #define pte_alloc_map_lock(mm, pmd, address, ptlp) \ |
999 | ((unlikely(!pmd_present(*(pmd))) && __pte_alloc(mm, pmd, address))? \ | 989 | ((unlikely(!pmd_present(*(pmd))) && __pte_alloc(mm, pmd, address))? \ |
1000 | NULL: pte_offset_map_lock(mm, pmd, address, ptlp)) | 990 | NULL: pte_offset_map_lock(mm, pmd, address, ptlp)) |
1001 | 991 | ||
1002 | #define pte_alloc_kernel(pmd, address) \ | 992 | #define pte_alloc_kernel(pmd, address) \ |
1003 | ((unlikely(!pmd_present(*(pmd))) && __pte_alloc_kernel(pmd, address))? \ | 993 | ((unlikely(!pmd_present(*(pmd))) && __pte_alloc_kernel(pmd, address))? \ |
1004 | NULL: pte_offset_kernel(pmd, address)) | 994 | NULL: pte_offset_kernel(pmd, address)) |
1005 | 995 | ||
1006 | extern void free_area_init(unsigned long * zones_size); | 996 | extern void free_area_init(unsigned long * zones_size); |
1007 | extern void free_area_init_node(int nid, unsigned long * zones_size, | 997 | extern void free_area_init_node(int nid, unsigned long * zones_size, |
1008 | unsigned long zone_start_pfn, unsigned long *zholes_size); | 998 | unsigned long zone_start_pfn, unsigned long *zholes_size); |
1009 | #ifdef CONFIG_ARCH_POPULATES_NODE_MAP | 999 | #ifdef CONFIG_ARCH_POPULATES_NODE_MAP |
1010 | /* | 1000 | /* |
1011 | * With CONFIG_ARCH_POPULATES_NODE_MAP set, an architecture may initialise its | 1001 | * With CONFIG_ARCH_POPULATES_NODE_MAP set, an architecture may initialise its |
1012 | * zones, allocate the backing mem_map and account for memory holes in a more | 1002 | * zones, allocate the backing mem_map and account for memory holes in a more |
1013 | * architecture independent manner. This is a substitute for creating the | 1003 | * architecture independent manner. This is a substitute for creating the |
1014 | * zone_sizes[] and zholes_size[] arrays and passing them to | 1004 | * zone_sizes[] and zholes_size[] arrays and passing them to |
1015 | * free_area_init_node() | 1005 | * free_area_init_node() |
1016 | * | 1006 | * |
1017 | * An architecture is expected to register range of page frames backed by | 1007 | * An architecture is expected to register range of page frames backed by |
1018 | * physical memory with add_active_range() before calling | 1008 | * physical memory with add_active_range() before calling |
1019 | * free_area_init_nodes() passing in the PFN each zone ends at. At a basic | 1009 | * free_area_init_nodes() passing in the PFN each zone ends at. At a basic |
1020 | * usage, an architecture is expected to do something like | 1010 | * usage, an architecture is expected to do something like |
1021 | * | 1011 | * |
1022 | * unsigned long max_zone_pfns[MAX_NR_ZONES] = {max_dma, max_normal_pfn, | 1012 | * unsigned long max_zone_pfns[MAX_NR_ZONES] = {max_dma, max_normal_pfn, |
1023 | * max_highmem_pfn}; | 1013 | * max_highmem_pfn}; |
1024 | * for_each_valid_physical_page_range() | 1014 | * for_each_valid_physical_page_range() |
1025 | * add_active_range(node_id, start_pfn, end_pfn) | 1015 | * add_active_range(node_id, start_pfn, end_pfn) |
1026 | * free_area_init_nodes(max_zone_pfns); | 1016 | * free_area_init_nodes(max_zone_pfns); |
1027 | * | 1017 | * |
1028 | * If the architecture guarantees that there are no holes in the ranges | 1018 | * If the architecture guarantees that there are no holes in the ranges |
1029 | * registered with add_active_range(), free_bootmem_active_regions() | 1019 | * registered with add_active_range(), free_bootmem_active_regions() |
1030 | * will call free_bootmem_node() for each registered physical page range. | 1020 | * will call free_bootmem_node() for each registered physical page range. |
1031 | * Similarly sparse_memory_present_with_active_regions() calls | 1021 | * Similarly sparse_memory_present_with_active_regions() calls |
1032 | * memory_present() for each range when SPARSEMEM is enabled. | 1022 | * memory_present() for each range when SPARSEMEM is enabled. |
1033 | * | 1023 | * |
1034 | * See mm/page_alloc.c for more information on each function exposed by | 1024 | * See mm/page_alloc.c for more information on each function exposed by |
1035 | * CONFIG_ARCH_POPULATES_NODE_MAP | 1025 | * CONFIG_ARCH_POPULATES_NODE_MAP |
1036 | */ | 1026 | */ |
1037 | extern void free_area_init_nodes(unsigned long *max_zone_pfn); | 1027 | extern void free_area_init_nodes(unsigned long *max_zone_pfn); |
1038 | extern void add_active_range(unsigned int nid, unsigned long start_pfn, | 1028 | extern void add_active_range(unsigned int nid, unsigned long start_pfn, |
1039 | unsigned long end_pfn); | 1029 | unsigned long end_pfn); |
1040 | extern void remove_active_range(unsigned int nid, unsigned long start_pfn, | 1030 | extern void remove_active_range(unsigned int nid, unsigned long start_pfn, |
1041 | unsigned long end_pfn); | 1031 | unsigned long end_pfn); |
1042 | extern void push_node_boundaries(unsigned int nid, unsigned long start_pfn, | 1032 | extern void push_node_boundaries(unsigned int nid, unsigned long start_pfn, |
1043 | unsigned long end_pfn); | 1033 | unsigned long end_pfn); |
1044 | extern void remove_all_active_ranges(void); | 1034 | extern void remove_all_active_ranges(void); |
1045 | extern unsigned long absent_pages_in_range(unsigned long start_pfn, | 1035 | extern unsigned long absent_pages_in_range(unsigned long start_pfn, |
1046 | unsigned long end_pfn); | 1036 | unsigned long end_pfn); |
1047 | extern void get_pfn_range_for_nid(unsigned int nid, | 1037 | extern void get_pfn_range_for_nid(unsigned int nid, |
1048 | unsigned long *start_pfn, unsigned long *end_pfn); | 1038 | unsigned long *start_pfn, unsigned long *end_pfn); |
1049 | extern unsigned long find_min_pfn_with_active_regions(void); | 1039 | extern unsigned long find_min_pfn_with_active_regions(void); |
1050 | extern void free_bootmem_with_active_regions(int nid, | 1040 | extern void free_bootmem_with_active_regions(int nid, |
1051 | unsigned long max_low_pfn); | 1041 | unsigned long max_low_pfn); |
1052 | typedef int (*work_fn_t)(unsigned long, unsigned long, void *); | 1042 | typedef int (*work_fn_t)(unsigned long, unsigned long, void *); |
1053 | extern void work_with_active_regions(int nid, work_fn_t work_fn, void *data); | 1043 | extern void work_with_active_regions(int nid, work_fn_t work_fn, void *data); |
1054 | extern void sparse_memory_present_with_active_regions(int nid); | 1044 | extern void sparse_memory_present_with_active_regions(int nid); |
1055 | #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID | 1045 | #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID |
1056 | extern int early_pfn_to_nid(unsigned long pfn); | 1046 | extern int early_pfn_to_nid(unsigned long pfn); |
1057 | #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ | 1047 | #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ |
1058 | #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ | 1048 | #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ |
1059 | extern void set_dma_reserve(unsigned long new_dma_reserve); | 1049 | extern void set_dma_reserve(unsigned long new_dma_reserve); |
1060 | extern void memmap_init_zone(unsigned long, int, unsigned long, | 1050 | extern void memmap_init_zone(unsigned long, int, unsigned long, |
1061 | unsigned long, enum memmap_context); | 1051 | unsigned long, enum memmap_context); |
1062 | extern void setup_per_zone_pages_min(void); | 1052 | extern void setup_per_zone_pages_min(void); |
1063 | extern void mem_init(void); | 1053 | extern void mem_init(void); |
1054 | extern void __init mmap_init(void); | ||
1064 | extern void show_mem(void); | 1055 | extern void show_mem(void); |
1065 | extern void si_meminfo(struct sysinfo * val); | 1056 | extern void si_meminfo(struct sysinfo * val); |
1066 | extern void si_meminfo_node(struct sysinfo *val, int nid); | 1057 | extern void si_meminfo_node(struct sysinfo *val, int nid); |
1067 | extern int after_bootmem; | 1058 | extern int after_bootmem; |
1068 | 1059 | ||
1069 | #ifdef CONFIG_NUMA | 1060 | #ifdef CONFIG_NUMA |
1070 | extern void setup_per_cpu_pageset(void); | 1061 | extern void setup_per_cpu_pageset(void); |
1071 | #else | 1062 | #else |
1072 | static inline void setup_per_cpu_pageset(void) {} | 1063 | static inline void setup_per_cpu_pageset(void) {} |
1073 | #endif | 1064 | #endif |
1065 | |||
1066 | /* nommu.c */ | ||
1067 | extern atomic_t mmap_pages_allocated; | ||
1074 | 1068 | ||
1075 | /* prio_tree.c */ | 1069 | /* prio_tree.c */ |
1076 | void vma_prio_tree_add(struct vm_area_struct *, struct vm_area_struct *old); | 1070 | void vma_prio_tree_add(struct vm_area_struct *, struct vm_area_struct *old); |
1077 | void vma_prio_tree_insert(struct vm_area_struct *, struct prio_tree_root *); | 1071 | void vma_prio_tree_insert(struct vm_area_struct *, struct prio_tree_root *); |
1078 | void vma_prio_tree_remove(struct vm_area_struct *, struct prio_tree_root *); | 1072 | void vma_prio_tree_remove(struct vm_area_struct *, struct prio_tree_root *); |
1079 | struct vm_area_struct *vma_prio_tree_next(struct vm_area_struct *vma, | 1073 | struct vm_area_struct *vma_prio_tree_next(struct vm_area_struct *vma, |
1080 | struct prio_tree_iter *iter); | 1074 | struct prio_tree_iter *iter); |
1081 | 1075 | ||
1082 | #define vma_prio_tree_foreach(vma, iter, root, begin, end) \ | 1076 | #define vma_prio_tree_foreach(vma, iter, root, begin, end) \ |
1083 | for (prio_tree_iter_init(iter, root, begin, end), vma = NULL; \ | 1077 | for (prio_tree_iter_init(iter, root, begin, end), vma = NULL; \ |
1084 | (vma = vma_prio_tree_next(vma, iter)); ) | 1078 | (vma = vma_prio_tree_next(vma, iter)); ) |
1085 | 1079 | ||
1086 | static inline void vma_nonlinear_insert(struct vm_area_struct *vma, | 1080 | static inline void vma_nonlinear_insert(struct vm_area_struct *vma, |
1087 | struct list_head *list) | 1081 | struct list_head *list) |
1088 | { | 1082 | { |
1089 | vma->shared.vm_set.parent = NULL; | 1083 | vma->shared.vm_set.parent = NULL; |
1090 | list_add_tail(&vma->shared.vm_set.list, list); | 1084 | list_add_tail(&vma->shared.vm_set.list, list); |
1091 | } | 1085 | } |
1092 | 1086 | ||
1093 | /* mmap.c */ | 1087 | /* mmap.c */ |
1094 | extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin); | 1088 | extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin); |
1095 | extern void vma_adjust(struct vm_area_struct *vma, unsigned long start, | 1089 | extern void vma_adjust(struct vm_area_struct *vma, unsigned long start, |
1096 | unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert); | 1090 | unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert); |
1097 | extern struct vm_area_struct *vma_merge(struct mm_struct *, | 1091 | extern struct vm_area_struct *vma_merge(struct mm_struct *, |
1098 | struct vm_area_struct *prev, unsigned long addr, unsigned long end, | 1092 | struct vm_area_struct *prev, unsigned long addr, unsigned long end, |
1099 | unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t, | 1093 | unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t, |
1100 | struct mempolicy *); | 1094 | struct mempolicy *); |
1101 | extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *); | 1095 | extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *); |
1102 | extern int split_vma(struct mm_struct *, | 1096 | extern int split_vma(struct mm_struct *, |
1103 | struct vm_area_struct *, unsigned long addr, int new_below); | 1097 | struct vm_area_struct *, unsigned long addr, int new_below); |
1104 | extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); | 1098 | extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); |
1105 | extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *, | 1099 | extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *, |
1106 | struct rb_node **, struct rb_node *); | 1100 | struct rb_node **, struct rb_node *); |
1107 | extern void unlink_file_vma(struct vm_area_struct *); | 1101 | extern void unlink_file_vma(struct vm_area_struct *); |
1108 | extern struct vm_area_struct *copy_vma(struct vm_area_struct **, | 1102 | extern struct vm_area_struct *copy_vma(struct vm_area_struct **, |
1109 | unsigned long addr, unsigned long len, pgoff_t pgoff); | 1103 | unsigned long addr, unsigned long len, pgoff_t pgoff); |
1110 | extern void exit_mmap(struct mm_struct *); | 1104 | extern void exit_mmap(struct mm_struct *); |
1111 | 1105 | ||
1112 | extern int mm_take_all_locks(struct mm_struct *mm); | 1106 | extern int mm_take_all_locks(struct mm_struct *mm); |
1113 | extern void mm_drop_all_locks(struct mm_struct *mm); | 1107 | extern void mm_drop_all_locks(struct mm_struct *mm); |
1114 | 1108 | ||
1115 | #ifdef CONFIG_PROC_FS | 1109 | #ifdef CONFIG_PROC_FS |
1116 | /* From fs/proc/base.c. callers must _not_ hold the mm's exe_file_lock */ | 1110 | /* From fs/proc/base.c. callers must _not_ hold the mm's exe_file_lock */ |
1117 | extern void added_exe_file_vma(struct mm_struct *mm); | 1111 | extern void added_exe_file_vma(struct mm_struct *mm); |
1118 | extern void removed_exe_file_vma(struct mm_struct *mm); | 1112 | extern void removed_exe_file_vma(struct mm_struct *mm); |
1119 | #else | 1113 | #else |
1120 | static inline void added_exe_file_vma(struct mm_struct *mm) | 1114 | static inline void added_exe_file_vma(struct mm_struct *mm) |
1121 | {} | 1115 | {} |
1122 | 1116 | ||
1123 | static inline void removed_exe_file_vma(struct mm_struct *mm) | 1117 | static inline void removed_exe_file_vma(struct mm_struct *mm) |
1124 | {} | 1118 | {} |
1125 | #endif /* CONFIG_PROC_FS */ | 1119 | #endif /* CONFIG_PROC_FS */ |
1126 | 1120 | ||
1127 | extern int may_expand_vm(struct mm_struct *mm, unsigned long npages); | 1121 | extern int may_expand_vm(struct mm_struct *mm, unsigned long npages); |
1128 | extern int install_special_mapping(struct mm_struct *mm, | 1122 | extern int install_special_mapping(struct mm_struct *mm, |
1129 | unsigned long addr, unsigned long len, | 1123 | unsigned long addr, unsigned long len, |
1130 | unsigned long flags, struct page **pages); | 1124 | unsigned long flags, struct page **pages); |
1131 | 1125 | ||
1132 | extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); | 1126 | extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); |
1133 | 1127 | ||
1134 | extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, | 1128 | extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, |
1135 | unsigned long len, unsigned long prot, | 1129 | unsigned long len, unsigned long prot, |
1136 | unsigned long flag, unsigned long pgoff); | 1130 | unsigned long flag, unsigned long pgoff); |
1137 | extern unsigned long mmap_region(struct file *file, unsigned long addr, | 1131 | extern unsigned long mmap_region(struct file *file, unsigned long addr, |
1138 | unsigned long len, unsigned long flags, | 1132 | unsigned long len, unsigned long flags, |
1139 | unsigned int vm_flags, unsigned long pgoff, | 1133 | unsigned int vm_flags, unsigned long pgoff, |
1140 | int accountable); | 1134 | int accountable); |
1141 | 1135 | ||
1142 | static inline unsigned long do_mmap(struct file *file, unsigned long addr, | 1136 | static inline unsigned long do_mmap(struct file *file, unsigned long addr, |
1143 | unsigned long len, unsigned long prot, | 1137 | unsigned long len, unsigned long prot, |
1144 | unsigned long flag, unsigned long offset) | 1138 | unsigned long flag, unsigned long offset) |
1145 | { | 1139 | { |
1146 | unsigned long ret = -EINVAL; | 1140 | unsigned long ret = -EINVAL; |
1147 | if ((offset + PAGE_ALIGN(len)) < offset) | 1141 | if ((offset + PAGE_ALIGN(len)) < offset) |
1148 | goto out; | 1142 | goto out; |
1149 | if (!(offset & ~PAGE_MASK)) | 1143 | if (!(offset & ~PAGE_MASK)) |
1150 | ret = do_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT); | 1144 | ret = do_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT); |
1151 | out: | 1145 | out: |
1152 | return ret; | 1146 | return ret; |
1153 | } | 1147 | } |
1154 | 1148 | ||
1155 | extern int do_munmap(struct mm_struct *, unsigned long, size_t); | 1149 | extern int do_munmap(struct mm_struct *, unsigned long, size_t); |
1156 | 1150 | ||
1157 | extern unsigned long do_brk(unsigned long, unsigned long); | 1151 | extern unsigned long do_brk(unsigned long, unsigned long); |
1158 | 1152 | ||
1159 | /* filemap.c */ | 1153 | /* filemap.c */ |
1160 | extern unsigned long page_unuse(struct page *); | 1154 | extern unsigned long page_unuse(struct page *); |
1161 | extern void truncate_inode_pages(struct address_space *, loff_t); | 1155 | extern void truncate_inode_pages(struct address_space *, loff_t); |
1162 | extern void truncate_inode_pages_range(struct address_space *, | 1156 | extern void truncate_inode_pages_range(struct address_space *, |
1163 | loff_t lstart, loff_t lend); | 1157 | loff_t lstart, loff_t lend); |
1164 | 1158 | ||
1165 | /* generic vm_area_ops exported for stackable file systems */ | 1159 | /* generic vm_area_ops exported for stackable file systems */ |
1166 | extern int filemap_fault(struct vm_area_struct *, struct vm_fault *); | 1160 | extern int filemap_fault(struct vm_area_struct *, struct vm_fault *); |
1167 | 1161 | ||
1168 | /* mm/page-writeback.c */ | 1162 | /* mm/page-writeback.c */ |
1169 | int write_one_page(struct page *page, int wait); | 1163 | int write_one_page(struct page *page, int wait); |
1170 | 1164 | ||
1171 | /* readahead.c */ | 1165 | /* readahead.c */ |
1172 | #define VM_MAX_READAHEAD 128 /* kbytes */ | 1166 | #define VM_MAX_READAHEAD 128 /* kbytes */ |
1173 | #define VM_MIN_READAHEAD 16 /* kbytes (includes current page) */ | 1167 | #define VM_MIN_READAHEAD 16 /* kbytes (includes current page) */ |
1174 | 1168 | ||
1175 | int do_page_cache_readahead(struct address_space *mapping, struct file *filp, | 1169 | int do_page_cache_readahead(struct address_space *mapping, struct file *filp, |
1176 | pgoff_t offset, unsigned long nr_to_read); | 1170 | pgoff_t offset, unsigned long nr_to_read); |
1177 | int force_page_cache_readahead(struct address_space *mapping, struct file *filp, | 1171 | int force_page_cache_readahead(struct address_space *mapping, struct file *filp, |
1178 | pgoff_t offset, unsigned long nr_to_read); | 1172 | pgoff_t offset, unsigned long nr_to_read); |
1179 | 1173 | ||
1180 | void page_cache_sync_readahead(struct address_space *mapping, | 1174 | void page_cache_sync_readahead(struct address_space *mapping, |
1181 | struct file_ra_state *ra, | 1175 | struct file_ra_state *ra, |
1182 | struct file *filp, | 1176 | struct file *filp, |
1183 | pgoff_t offset, | 1177 | pgoff_t offset, |
1184 | unsigned long size); | 1178 | unsigned long size); |
1185 | 1179 | ||
1186 | void page_cache_async_readahead(struct address_space *mapping, | 1180 | void page_cache_async_readahead(struct address_space *mapping, |
1187 | struct file_ra_state *ra, | 1181 | struct file_ra_state *ra, |
1188 | struct file *filp, | 1182 | struct file *filp, |
1189 | struct page *pg, | 1183 | struct page *pg, |
1190 | pgoff_t offset, | 1184 | pgoff_t offset, |
1191 | unsigned long size); | 1185 | unsigned long size); |
1192 | 1186 | ||
1193 | unsigned long max_sane_readahead(unsigned long nr); | 1187 | unsigned long max_sane_readahead(unsigned long nr); |
1194 | 1188 | ||
1195 | /* Do stack extension */ | 1189 | /* Do stack extension */ |
1196 | extern int expand_stack(struct vm_area_struct *vma, unsigned long address); | 1190 | extern int expand_stack(struct vm_area_struct *vma, unsigned long address); |
1197 | #ifdef CONFIG_IA64 | 1191 | #ifdef CONFIG_IA64 |
1198 | extern int expand_upwards(struct vm_area_struct *vma, unsigned long address); | 1192 | extern int expand_upwards(struct vm_area_struct *vma, unsigned long address); |
1199 | #endif | 1193 | #endif |
1200 | extern int expand_stack_downwards(struct vm_area_struct *vma, | 1194 | extern int expand_stack_downwards(struct vm_area_struct *vma, |
1201 | unsigned long address); | 1195 | unsigned long address); |
1202 | 1196 | ||
1203 | /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ | 1197 | /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ |
1204 | extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr); | 1198 | extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr); |
1205 | extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr, | 1199 | extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr, |
1206 | struct vm_area_struct **pprev); | 1200 | struct vm_area_struct **pprev); |
1207 | 1201 | ||
1208 | /* Look up the first VMA which intersects the interval start_addr..end_addr-1, | 1202 | /* Look up the first VMA which intersects the interval start_addr..end_addr-1, |
1209 | NULL if none. Assume start_addr < end_addr. */ | 1203 | NULL if none. Assume start_addr < end_addr. */ |
1210 | static inline struct vm_area_struct * find_vma_intersection(struct mm_struct * mm, unsigned long start_addr, unsigned long end_addr) | 1204 | static inline struct vm_area_struct * find_vma_intersection(struct mm_struct * mm, unsigned long start_addr, unsigned long end_addr) |
1211 | { | 1205 | { |
1212 | struct vm_area_struct * vma = find_vma(mm,start_addr); | 1206 | struct vm_area_struct * vma = find_vma(mm,start_addr); |
1213 | 1207 | ||
1214 | if (vma && end_addr <= vma->vm_start) | 1208 | if (vma && end_addr <= vma->vm_start) |
1215 | vma = NULL; | 1209 | vma = NULL; |
1216 | return vma; | 1210 | return vma; |
1217 | } | 1211 | } |
1218 | 1212 | ||
1219 | static inline unsigned long vma_pages(struct vm_area_struct *vma) | 1213 | static inline unsigned long vma_pages(struct vm_area_struct *vma) |
1220 | { | 1214 | { |
1221 | return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; | 1215 | return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; |
1222 | } | 1216 | } |
1223 | 1217 | ||
1224 | pgprot_t vm_get_page_prot(unsigned long vm_flags); | 1218 | pgprot_t vm_get_page_prot(unsigned long vm_flags); |
1225 | struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr); | 1219 | struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr); |
1226 | int remap_pfn_range(struct vm_area_struct *, unsigned long addr, | 1220 | int remap_pfn_range(struct vm_area_struct *, unsigned long addr, |
1227 | unsigned long pfn, unsigned long size, pgprot_t); | 1221 | unsigned long pfn, unsigned long size, pgprot_t); |
1228 | int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *); | 1222 | int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *); |
1229 | int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, | 1223 | int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, |
1230 | unsigned long pfn); | 1224 | unsigned long pfn); |
1231 | int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, | 1225 | int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, |
1232 | unsigned long pfn); | 1226 | unsigned long pfn); |
1233 | 1227 | ||
1234 | struct page *follow_page(struct vm_area_struct *, unsigned long address, | 1228 | struct page *follow_page(struct vm_area_struct *, unsigned long address, |
1235 | unsigned int foll_flags); | 1229 | unsigned int foll_flags); |
1236 | #define FOLL_WRITE 0x01 /* check pte is writable */ | 1230 | #define FOLL_WRITE 0x01 /* check pte is writable */ |
1237 | #define FOLL_TOUCH 0x02 /* mark page accessed */ | 1231 | #define FOLL_TOUCH 0x02 /* mark page accessed */ |
1238 | #define FOLL_GET 0x04 /* do get_page on page */ | 1232 | #define FOLL_GET 0x04 /* do get_page on page */ |
1239 | #define FOLL_ANON 0x08 /* give ZERO_PAGE if no pgtable */ | 1233 | #define FOLL_ANON 0x08 /* give ZERO_PAGE if no pgtable */ |
1240 | 1234 | ||
1241 | typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, | 1235 | typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, |
1242 | void *data); | 1236 | void *data); |
1243 | extern int apply_to_page_range(struct mm_struct *mm, unsigned long address, | 1237 | extern int apply_to_page_range(struct mm_struct *mm, unsigned long address, |
1244 | unsigned long size, pte_fn_t fn, void *data); | 1238 | unsigned long size, pte_fn_t fn, void *data); |
1245 | 1239 | ||
1246 | #ifdef CONFIG_PROC_FS | 1240 | #ifdef CONFIG_PROC_FS |
1247 | void vm_stat_account(struct mm_struct *, unsigned long, struct file *, long); | 1241 | void vm_stat_account(struct mm_struct *, unsigned long, struct file *, long); |
1248 | #else | 1242 | #else |
1249 | static inline void vm_stat_account(struct mm_struct *mm, | 1243 | static inline void vm_stat_account(struct mm_struct *mm, |
1250 | unsigned long flags, struct file *file, long pages) | 1244 | unsigned long flags, struct file *file, long pages) |
1251 | { | 1245 | { |
1252 | } | 1246 | } |
1253 | #endif /* CONFIG_PROC_FS */ | 1247 | #endif /* CONFIG_PROC_FS */ |
1254 | 1248 | ||
1255 | #ifdef CONFIG_DEBUG_PAGEALLOC | 1249 | #ifdef CONFIG_DEBUG_PAGEALLOC |
1256 | extern int debug_pagealloc_enabled; | 1250 | extern int debug_pagealloc_enabled; |
1257 | 1251 | ||
1258 | extern void kernel_map_pages(struct page *page, int numpages, int enable); | 1252 | extern void kernel_map_pages(struct page *page, int numpages, int enable); |
1259 | 1253 | ||
1260 | static inline void enable_debug_pagealloc(void) | 1254 | static inline void enable_debug_pagealloc(void) |
1261 | { | 1255 | { |
1262 | debug_pagealloc_enabled = 1; | 1256 | debug_pagealloc_enabled = 1; |
1263 | } | 1257 | } |
1264 | #ifdef CONFIG_HIBERNATION | 1258 | #ifdef CONFIG_HIBERNATION |
1265 | extern bool kernel_page_present(struct page *page); | 1259 | extern bool kernel_page_present(struct page *page); |
1266 | #endif /* CONFIG_HIBERNATION */ | 1260 | #endif /* CONFIG_HIBERNATION */ |
1267 | #else | 1261 | #else |
1268 | static inline void | 1262 | static inline void |
1269 | kernel_map_pages(struct page *page, int numpages, int enable) {} | 1263 | kernel_map_pages(struct page *page, int numpages, int enable) {} |
1270 | static inline void enable_debug_pagealloc(void) | 1264 | static inline void enable_debug_pagealloc(void) |
1271 | { | 1265 | { |
1272 | } | 1266 | } |
1273 | #ifdef CONFIG_HIBERNATION | 1267 | #ifdef CONFIG_HIBERNATION |
1274 | static inline bool kernel_page_present(struct page *page) { return true; } | 1268 | static inline bool kernel_page_present(struct page *page) { return true; } |
1275 | #endif /* CONFIG_HIBERNATION */ | 1269 | #endif /* CONFIG_HIBERNATION */ |
1276 | #endif | 1270 | #endif |
1277 | 1271 | ||
1278 | extern struct vm_area_struct *get_gate_vma(struct task_struct *tsk); | 1272 | extern struct vm_area_struct *get_gate_vma(struct task_struct *tsk); |
1279 | #ifdef __HAVE_ARCH_GATE_AREA | 1273 | #ifdef __HAVE_ARCH_GATE_AREA |
1280 | int in_gate_area_no_task(unsigned long addr); | 1274 | int in_gate_area_no_task(unsigned long addr); |
1281 | int in_gate_area(struct task_struct *task, unsigned long addr); | 1275 | int in_gate_area(struct task_struct *task, unsigned long addr); |
1282 | #else | 1276 | #else |
1283 | int in_gate_area_no_task(unsigned long addr); | 1277 | int in_gate_area_no_task(unsigned long addr); |
1284 | #define in_gate_area(task, addr) ({(void)task; in_gate_area_no_task(addr);}) | 1278 | #define in_gate_area(task, addr) ({(void)task; in_gate_area_no_task(addr);}) |
1285 | #endif /* __HAVE_ARCH_GATE_AREA */ | 1279 | #endif /* __HAVE_ARCH_GATE_AREA */ |
1286 | 1280 | ||
1287 | int drop_caches_sysctl_handler(struct ctl_table *, int, struct file *, | 1281 | int drop_caches_sysctl_handler(struct ctl_table *, int, struct file *, |
1288 | void __user *, size_t *, loff_t *); | 1282 | void __user *, size_t *, loff_t *); |
1289 | unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, | 1283 | unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, |
1290 | unsigned long lru_pages); | 1284 | unsigned long lru_pages); |
1291 | 1285 | ||
1292 | #ifndef CONFIG_MMU | 1286 | #ifndef CONFIG_MMU |
1293 | #define randomize_va_space 0 | 1287 | #define randomize_va_space 0 |
1294 | #else | 1288 | #else |
1295 | extern int randomize_va_space; | 1289 | extern int randomize_va_space; |
1296 | #endif | 1290 | #endif |
1297 | 1291 | ||
1298 | const char * arch_vma_name(struct vm_area_struct *vma); | 1292 | const char * arch_vma_name(struct vm_area_struct *vma); |
1299 | void print_vma_addr(char *prefix, unsigned long rip); | 1293 | void print_vma_addr(char *prefix, unsigned long rip); |
1300 | 1294 | ||
1301 | struct page *sparse_mem_map_populate(unsigned long pnum, int nid); | 1295 | struct page *sparse_mem_map_populate(unsigned long pnum, int nid); |
1302 | pgd_t *vmemmap_pgd_populate(unsigned long addr, int node); | 1296 | pgd_t *vmemmap_pgd_populate(unsigned long addr, int node); |
1303 | pud_t *vmemmap_pud_populate(pgd_t *pgd, unsigned long addr, int node); | 1297 | pud_t *vmemmap_pud_populate(pgd_t *pgd, unsigned long addr, int node); |
1304 | pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node); | 1298 | pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node); |
1305 | pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node); | 1299 | pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node); |
1306 | void *vmemmap_alloc_block(unsigned long size, int node); | 1300 | void *vmemmap_alloc_block(unsigned long size, int node); |
1307 | void vmemmap_verify(pte_t *, int, unsigned long, unsigned long); | 1301 | void vmemmap_verify(pte_t *, int, unsigned long, unsigned long); |
1308 | int vmemmap_populate_basepages(struct page *start_page, | 1302 | int vmemmap_populate_basepages(struct page *start_page, |
1309 | unsigned long pages, int node); | 1303 | unsigned long pages, int node); |
1310 | int vmemmap_populate(struct page *start_page, unsigned long pages, int node); | 1304 | int vmemmap_populate(struct page *start_page, unsigned long pages, int node); |
1311 | void vmemmap_populate_print_last(void); | 1305 | void vmemmap_populate_print_last(void); |
1312 | 1306 | ||
1313 | extern void *alloc_locked_buffer(size_t size); | 1307 | extern void *alloc_locked_buffer(size_t size); |
include/linux/mm_types.h
1 | #ifndef _LINUX_MM_TYPES_H | 1 | #ifndef _LINUX_MM_TYPES_H |
2 | #define _LINUX_MM_TYPES_H | 2 | #define _LINUX_MM_TYPES_H |
3 | 3 | ||
4 | #include <linux/auxvec.h> | 4 | #include <linux/auxvec.h> |
5 | #include <linux/types.h> | 5 | #include <linux/types.h> |
6 | #include <linux/threads.h> | 6 | #include <linux/threads.h> |
7 | #include <linux/list.h> | 7 | #include <linux/list.h> |
8 | #include <linux/spinlock.h> | 8 | #include <linux/spinlock.h> |
9 | #include <linux/prio_tree.h> | 9 | #include <linux/prio_tree.h> |
10 | #include <linux/rbtree.h> | 10 | #include <linux/rbtree.h> |
11 | #include <linux/rwsem.h> | 11 | #include <linux/rwsem.h> |
12 | #include <linux/completion.h> | 12 | #include <linux/completion.h> |
13 | #include <linux/cpumask.h> | 13 | #include <linux/cpumask.h> |
14 | #include <asm/page.h> | 14 | #include <asm/page.h> |
15 | #include <asm/mmu.h> | 15 | #include <asm/mmu.h> |
16 | 16 | ||
17 | #ifndef AT_VECTOR_SIZE_ARCH | 17 | #ifndef AT_VECTOR_SIZE_ARCH |
18 | #define AT_VECTOR_SIZE_ARCH 0 | 18 | #define AT_VECTOR_SIZE_ARCH 0 |
19 | #endif | 19 | #endif |
20 | #define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1)) | 20 | #define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1)) |
21 | 21 | ||
22 | struct address_space; | 22 | struct address_space; |
23 | 23 | ||
24 | #define USE_SPLIT_PTLOCKS (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS) | 24 | #define USE_SPLIT_PTLOCKS (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS) |
25 | 25 | ||
26 | #if USE_SPLIT_PTLOCKS | 26 | #if USE_SPLIT_PTLOCKS |
27 | typedef atomic_long_t mm_counter_t; | 27 | typedef atomic_long_t mm_counter_t; |
28 | #else /* !USE_SPLIT_PTLOCKS */ | 28 | #else /* !USE_SPLIT_PTLOCKS */ |
29 | typedef unsigned long mm_counter_t; | 29 | typedef unsigned long mm_counter_t; |
30 | #endif /* !USE_SPLIT_PTLOCKS */ | 30 | #endif /* !USE_SPLIT_PTLOCKS */ |
31 | 31 | ||
32 | /* | 32 | /* |
33 | * Each physical page in the system has a struct page associated with | 33 | * Each physical page in the system has a struct page associated with |
34 | * it to keep track of whatever it is we are using the page for at the | 34 | * it to keep track of whatever it is we are using the page for at the |
35 | * moment. Note that we have no way to track which tasks are using | 35 | * moment. Note that we have no way to track which tasks are using |
36 | * a page, though if it is a pagecache page, rmap structures can tell us | 36 | * a page, though if it is a pagecache page, rmap structures can tell us |
37 | * who is mapping it. | 37 | * who is mapping it. |
38 | */ | 38 | */ |
39 | struct page { | 39 | struct page { |
40 | unsigned long flags; /* Atomic flags, some possibly | 40 | unsigned long flags; /* Atomic flags, some possibly |
41 | * updated asynchronously */ | 41 | * updated asynchronously */ |
42 | atomic_t _count; /* Usage count, see below. */ | 42 | atomic_t _count; /* Usage count, see below. */ |
43 | union { | 43 | union { |
44 | atomic_t _mapcount; /* Count of ptes mapped in mms, | 44 | atomic_t _mapcount; /* Count of ptes mapped in mms, |
45 | * to show when page is mapped | 45 | * to show when page is mapped |
46 | * & limit reverse map searches. | 46 | * & limit reverse map searches. |
47 | */ | 47 | */ |
48 | struct { /* SLUB */ | 48 | struct { /* SLUB */ |
49 | u16 inuse; | 49 | u16 inuse; |
50 | u16 objects; | 50 | u16 objects; |
51 | }; | 51 | }; |
52 | }; | 52 | }; |
53 | union { | 53 | union { |
54 | struct { | 54 | struct { |
55 | unsigned long private; /* Mapping-private opaque data: | 55 | unsigned long private; /* Mapping-private opaque data: |
56 | * usually used for buffer_heads | 56 | * usually used for buffer_heads |
57 | * if PagePrivate set; used for | 57 | * if PagePrivate set; used for |
58 | * swp_entry_t if PageSwapCache; | 58 | * swp_entry_t if PageSwapCache; |
59 | * indicates order in the buddy | 59 | * indicates order in the buddy |
60 | * system if PG_buddy is set. | 60 | * system if PG_buddy is set. |
61 | */ | 61 | */ |
62 | struct address_space *mapping; /* If low bit clear, points to | 62 | struct address_space *mapping; /* If low bit clear, points to |
63 | * inode address_space, or NULL. | 63 | * inode address_space, or NULL. |
64 | * If page mapped as anonymous | 64 | * If page mapped as anonymous |
65 | * memory, low bit is set, and | 65 | * memory, low bit is set, and |
66 | * it points to anon_vma object: | 66 | * it points to anon_vma object: |
67 | * see PAGE_MAPPING_ANON below. | 67 | * see PAGE_MAPPING_ANON below. |
68 | */ | 68 | */ |
69 | }; | 69 | }; |
70 | #if USE_SPLIT_PTLOCKS | 70 | #if USE_SPLIT_PTLOCKS |
71 | spinlock_t ptl; | 71 | spinlock_t ptl; |
72 | #endif | 72 | #endif |
73 | struct kmem_cache *slab; /* SLUB: Pointer to slab */ | 73 | struct kmem_cache *slab; /* SLUB: Pointer to slab */ |
74 | struct page *first_page; /* Compound tail pages */ | 74 | struct page *first_page; /* Compound tail pages */ |
75 | }; | 75 | }; |
76 | union { | 76 | union { |
77 | pgoff_t index; /* Our offset within mapping. */ | 77 | pgoff_t index; /* Our offset within mapping. */ |
78 | void *freelist; /* SLUB: freelist req. slab lock */ | 78 | void *freelist; /* SLUB: freelist req. slab lock */ |
79 | }; | 79 | }; |
80 | struct list_head lru; /* Pageout list, eg. active_list | 80 | struct list_head lru; /* Pageout list, eg. active_list |
81 | * protected by zone->lru_lock ! | 81 | * protected by zone->lru_lock ! |
82 | */ | 82 | */ |
83 | /* | 83 | /* |
84 | * On machines where all RAM is mapped into kernel address space, | 84 | * On machines where all RAM is mapped into kernel address space, |
85 | * we can simply calculate the virtual address. On machines with | 85 | * we can simply calculate the virtual address. On machines with |
86 | * highmem some memory is mapped into kernel virtual memory | 86 | * highmem some memory is mapped into kernel virtual memory |
87 | * dynamically, so we need a place to store that address. | 87 | * dynamically, so we need a place to store that address. |
88 | * Note that this field could be 16 bits on x86 ... ;) | 88 | * Note that this field could be 16 bits on x86 ... ;) |
89 | * | 89 | * |
90 | * Architectures with slow multiplication can define | 90 | * Architectures with slow multiplication can define |
91 | * WANT_PAGE_VIRTUAL in asm/page.h | 91 | * WANT_PAGE_VIRTUAL in asm/page.h |
92 | */ | 92 | */ |
93 | #if defined(WANT_PAGE_VIRTUAL) | 93 | #if defined(WANT_PAGE_VIRTUAL) |
94 | void *virtual; /* Kernel virtual address (NULL if | 94 | void *virtual; /* Kernel virtual address (NULL if |
95 | not kmapped, ie. highmem) */ | 95 | not kmapped, ie. highmem) */ |
96 | #endif /* WANT_PAGE_VIRTUAL */ | 96 | #endif /* WANT_PAGE_VIRTUAL */ |
97 | }; | 97 | }; |
98 | 98 | ||
99 | /* | 99 | /* |
100 | * A region containing a mapping of a non-memory backed file under NOMMU | ||
101 | * conditions. These are held in a global tree and are pinned by the VMAs that | ||
102 | * map parts of them. | ||
103 | */ | ||
104 | struct vm_region { | ||
105 | struct rb_node vm_rb; /* link in global region tree */ | ||
106 | unsigned long vm_flags; /* VMA vm_flags */ | ||
107 | unsigned long vm_start; /* start address of region */ | ||
108 | unsigned long vm_end; /* region initialised to here */ | ||
109 | unsigned long vm_top; /* region allocated to here */ | ||
110 | unsigned long vm_pgoff; /* the offset in vm_file corresponding to vm_start */ | ||
111 | struct file *vm_file; /* the backing file or NULL */ | ||
112 | |||
113 | atomic_t vm_usage; /* region usage count */ | ||
114 | }; | ||
115 | |||
116 | /* | ||
100 | * This struct defines a memory VMM memory area. There is one of these | 117 | * This struct defines a memory VMM memory area. There is one of these |
101 | * per VM-area/task. A VM area is any part of the process virtual memory | 118 | * per VM-area/task. A VM area is any part of the process virtual memory |
102 | * space that has a special rule for the page-fault handlers (ie a shared | 119 | * space that has a special rule for the page-fault handlers (ie a shared |
103 | * library, the executable area etc). | 120 | * library, the executable area etc). |
104 | */ | 121 | */ |
105 | struct vm_area_struct { | 122 | struct vm_area_struct { |
106 | struct mm_struct * vm_mm; /* The address space we belong to. */ | 123 | struct mm_struct * vm_mm; /* The address space we belong to. */ |
107 | unsigned long vm_start; /* Our start address within vm_mm. */ | 124 | unsigned long vm_start; /* Our start address within vm_mm. */ |
108 | unsigned long vm_end; /* The first byte after our end address | 125 | unsigned long vm_end; /* The first byte after our end address |
109 | within vm_mm. */ | 126 | within vm_mm. */ |
110 | 127 | ||
111 | /* linked list of VM areas per task, sorted by address */ | 128 | /* linked list of VM areas per task, sorted by address */ |
112 | struct vm_area_struct *vm_next; | 129 | struct vm_area_struct *vm_next; |
113 | 130 | ||
114 | pgprot_t vm_page_prot; /* Access permissions of this VMA. */ | 131 | pgprot_t vm_page_prot; /* Access permissions of this VMA. */ |
115 | unsigned long vm_flags; /* Flags, see mm.h. */ | 132 | unsigned long vm_flags; /* Flags, see mm.h. */ |
116 | 133 | ||
117 | struct rb_node vm_rb; | 134 | struct rb_node vm_rb; |
118 | 135 | ||
119 | /* | 136 | /* |
120 | * For areas with an address space and backing store, | 137 | * For areas with an address space and backing store, |
121 | * linkage into the address_space->i_mmap prio tree, or | 138 | * linkage into the address_space->i_mmap prio tree, or |
122 | * linkage to the list of like vmas hanging off its node, or | 139 | * linkage to the list of like vmas hanging off its node, or |
123 | * linkage of vma in the address_space->i_mmap_nonlinear list. | 140 | * linkage of vma in the address_space->i_mmap_nonlinear list. |
124 | */ | 141 | */ |
125 | union { | 142 | union { |
126 | struct { | 143 | struct { |
127 | struct list_head list; | 144 | struct list_head list; |
128 | void *parent; /* aligns with prio_tree_node parent */ | 145 | void *parent; /* aligns with prio_tree_node parent */ |
129 | struct vm_area_struct *head; | 146 | struct vm_area_struct *head; |
130 | } vm_set; | 147 | } vm_set; |
131 | 148 | ||
132 | struct raw_prio_tree_node prio_tree_node; | 149 | struct raw_prio_tree_node prio_tree_node; |
133 | } shared; | 150 | } shared; |
134 | 151 | ||
135 | /* | 152 | /* |
136 | * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma | 153 | * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma |
137 | * list, after a COW of one of the file pages. A MAP_SHARED vma | 154 | * list, after a COW of one of the file pages. A MAP_SHARED vma |
138 | * can only be in the i_mmap tree. An anonymous MAP_PRIVATE, stack | 155 | * can only be in the i_mmap tree. An anonymous MAP_PRIVATE, stack |
139 | * or brk vma (with NULL file) can only be in an anon_vma list. | 156 | * or brk vma (with NULL file) can only be in an anon_vma list. |
140 | */ | 157 | */ |
141 | struct list_head anon_vma_node; /* Serialized by anon_vma->lock */ | 158 | struct list_head anon_vma_node; /* Serialized by anon_vma->lock */ |
142 | struct anon_vma *anon_vma; /* Serialized by page_table_lock */ | 159 | struct anon_vma *anon_vma; /* Serialized by page_table_lock */ |
143 | 160 | ||
144 | /* Function pointers to deal with this struct. */ | 161 | /* Function pointers to deal with this struct. */ |
145 | struct vm_operations_struct * vm_ops; | 162 | struct vm_operations_struct * vm_ops; |
146 | 163 | ||
147 | /* Information about our backing store: */ | 164 | /* Information about our backing store: */ |
148 | unsigned long vm_pgoff; /* Offset (within vm_file) in PAGE_SIZE | 165 | unsigned long vm_pgoff; /* Offset (within vm_file) in PAGE_SIZE |
149 | units, *not* PAGE_CACHE_SIZE */ | 166 | units, *not* PAGE_CACHE_SIZE */ |
150 | struct file * vm_file; /* File we map to (can be NULL). */ | 167 | struct file * vm_file; /* File we map to (can be NULL). */ |
151 | void * vm_private_data; /* was vm_pte (shared mem) */ | 168 | void * vm_private_data; /* was vm_pte (shared mem) */ |
152 | unsigned long vm_truncate_count;/* truncate_count or restart_addr */ | 169 | unsigned long vm_truncate_count;/* truncate_count or restart_addr */ |
153 | 170 | ||
154 | #ifndef CONFIG_MMU | 171 | #ifndef CONFIG_MMU |
155 | atomic_t vm_usage; /* refcount (VMAs shared if !MMU) */ | 172 | struct vm_region *vm_region; /* NOMMU mapping region */ |
156 | #endif | 173 | #endif |
157 | #ifdef CONFIG_NUMA | 174 | #ifdef CONFIG_NUMA |
158 | struct mempolicy *vm_policy; /* NUMA policy for the VMA */ | 175 | struct mempolicy *vm_policy; /* NUMA policy for the VMA */ |
159 | #endif | 176 | #endif |
160 | }; | 177 | }; |
161 | 178 | ||
162 | struct core_thread { | 179 | struct core_thread { |
163 | struct task_struct *task; | 180 | struct task_struct *task; |
164 | struct core_thread *next; | 181 | struct core_thread *next; |
165 | }; | 182 | }; |
166 | 183 | ||
167 | struct core_state { | 184 | struct core_state { |
168 | atomic_t nr_threads; | 185 | atomic_t nr_threads; |
169 | struct core_thread dumper; | 186 | struct core_thread dumper; |
170 | struct completion startup; | 187 | struct completion startup; |
171 | }; | 188 | }; |
172 | 189 | ||
173 | struct mm_struct { | 190 | struct mm_struct { |
174 | struct vm_area_struct * mmap; /* list of VMAs */ | 191 | struct vm_area_struct * mmap; /* list of VMAs */ |
175 | struct rb_root mm_rb; | 192 | struct rb_root mm_rb; |
176 | struct vm_area_struct * mmap_cache; /* last find_vma result */ | 193 | struct vm_area_struct * mmap_cache; /* last find_vma result */ |
177 | unsigned long (*get_unmapped_area) (struct file *filp, | 194 | unsigned long (*get_unmapped_area) (struct file *filp, |
178 | unsigned long addr, unsigned long len, | 195 | unsigned long addr, unsigned long len, |
179 | unsigned long pgoff, unsigned long flags); | 196 | unsigned long pgoff, unsigned long flags); |
180 | void (*unmap_area) (struct mm_struct *mm, unsigned long addr); | 197 | void (*unmap_area) (struct mm_struct *mm, unsigned long addr); |
181 | unsigned long mmap_base; /* base of mmap area */ | 198 | unsigned long mmap_base; /* base of mmap area */ |
182 | unsigned long task_size; /* size of task vm space */ | 199 | unsigned long task_size; /* size of task vm space */ |
183 | unsigned long cached_hole_size; /* if non-zero, the largest hole below free_area_cache */ | 200 | unsigned long cached_hole_size; /* if non-zero, the largest hole below free_area_cache */ |
184 | unsigned long free_area_cache; /* first hole of size cached_hole_size or larger */ | 201 | unsigned long free_area_cache; /* first hole of size cached_hole_size or larger */ |
185 | pgd_t * pgd; | 202 | pgd_t * pgd; |
186 | atomic_t mm_users; /* How many users with user space? */ | 203 | atomic_t mm_users; /* How many users with user space? */ |
187 | atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */ | 204 | atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */ |
188 | int map_count; /* number of VMAs */ | 205 | int map_count; /* number of VMAs */ |
189 | struct rw_semaphore mmap_sem; | 206 | struct rw_semaphore mmap_sem; |
190 | spinlock_t page_table_lock; /* Protects page tables and some counters */ | 207 | spinlock_t page_table_lock; /* Protects page tables and some counters */ |
191 | 208 | ||
192 | struct list_head mmlist; /* List of maybe swapped mm's. These are globally strung | 209 | struct list_head mmlist; /* List of maybe swapped mm's. These are globally strung |
193 | * together off init_mm.mmlist, and are protected | 210 | * together off init_mm.mmlist, and are protected |
194 | * by mmlist_lock | 211 | * by mmlist_lock |
195 | */ | 212 | */ |
196 | 213 | ||
197 | /* Special counters, in some configurations protected by the | 214 | /* Special counters, in some configurations protected by the |
198 | * page_table_lock, in other configurations by being atomic. | 215 | * page_table_lock, in other configurations by being atomic. |
199 | */ | 216 | */ |
200 | mm_counter_t _file_rss; | 217 | mm_counter_t _file_rss; |
201 | mm_counter_t _anon_rss; | 218 | mm_counter_t _anon_rss; |
202 | 219 | ||
203 | unsigned long hiwater_rss; /* High-watermark of RSS usage */ | 220 | unsigned long hiwater_rss; /* High-watermark of RSS usage */ |
204 | unsigned long hiwater_vm; /* High-water virtual memory usage */ | 221 | unsigned long hiwater_vm; /* High-water virtual memory usage */ |
205 | 222 | ||
206 | unsigned long total_vm, locked_vm, shared_vm, exec_vm; | 223 | unsigned long total_vm, locked_vm, shared_vm, exec_vm; |
207 | unsigned long stack_vm, reserved_vm, def_flags, nr_ptes; | 224 | unsigned long stack_vm, reserved_vm, def_flags, nr_ptes; |
208 | unsigned long start_code, end_code, start_data, end_data; | 225 | unsigned long start_code, end_code, start_data, end_data; |
209 | unsigned long start_brk, brk, start_stack; | 226 | unsigned long start_brk, brk, start_stack; |
210 | unsigned long arg_start, arg_end, env_start, env_end; | 227 | unsigned long arg_start, arg_end, env_start, env_end; |
211 | 228 | ||
212 | unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ | 229 | unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ |
213 | 230 | ||
214 | cpumask_t cpu_vm_mask; | 231 | cpumask_t cpu_vm_mask; |
215 | 232 | ||
216 | /* Architecture-specific MM context */ | 233 | /* Architecture-specific MM context */ |
217 | mm_context_t context; | 234 | mm_context_t context; |
218 | 235 | ||
219 | /* Swap token stuff */ | 236 | /* Swap token stuff */ |
220 | /* | 237 | /* |
221 | * Last value of global fault stamp as seen by this process. | 238 | * Last value of global fault stamp as seen by this process. |
222 | * In other words, this value gives an indication of how long | 239 | * In other words, this value gives an indication of how long |
223 | * it has been since this task got the token. | 240 | * it has been since this task got the token. |
224 | * Look at mm/thrash.c | 241 | * Look at mm/thrash.c |
225 | */ | 242 | */ |
226 | unsigned int faultstamp; | 243 | unsigned int faultstamp; |
227 | unsigned int token_priority; | 244 | unsigned int token_priority; |
228 | unsigned int last_interval; | 245 | unsigned int last_interval; |
229 | 246 | ||
230 | unsigned long flags; /* Must use atomic bitops to access the bits */ | 247 | unsigned long flags; /* Must use atomic bitops to access the bits */ |
231 | 248 | ||
232 | struct core_state *core_state; /* coredumping support */ | 249 | struct core_state *core_state; /* coredumping support */ |
233 | 250 | ||
234 | /* aio bits */ | 251 | /* aio bits */ |
235 | spinlock_t ioctx_lock; | 252 | spinlock_t ioctx_lock; |
236 | struct hlist_head ioctx_list; | 253 | struct hlist_head ioctx_list; |
237 | 254 | ||
238 | #ifdef CONFIG_MM_OWNER | 255 | #ifdef CONFIG_MM_OWNER |
239 | /* | 256 | /* |
240 | * "owner" points to a task that is regarded as the canonical | 257 | * "owner" points to a task that is regarded as the canonical |
241 | * user/owner of this mm. All of the following must be true in | 258 | * user/owner of this mm. All of the following must be true in |
242 | * order for it to be changed: | 259 | * order for it to be changed: |
243 | * | 260 | * |
244 | * current == mm->owner | 261 | * current == mm->owner |
245 | * current->mm != mm | 262 | * current->mm != mm |
246 | * new_owner->mm == mm | 263 | * new_owner->mm == mm |
247 | * new_owner->alloc_lock is held | 264 | * new_owner->alloc_lock is held |
248 | */ | 265 | */ |
249 | struct task_struct *owner; | 266 | struct task_struct *owner; |
250 | #endif | 267 | #endif |
251 | 268 | ||
252 | #ifdef CONFIG_PROC_FS | 269 | #ifdef CONFIG_PROC_FS |
253 | /* store ref to file /proc/<pid>/exe symlink points to */ | 270 | /* store ref to file /proc/<pid>/exe symlink points to */ |
254 | struct file *exe_file; | 271 | struct file *exe_file; |
255 | unsigned long num_exe_file_vmas; | 272 | unsigned long num_exe_file_vmas; |
256 | #endif | 273 | #endif |
257 | #ifdef CONFIG_MMU_NOTIFIER | 274 | #ifdef CONFIG_MMU_NOTIFIER |
258 | struct mmu_notifier_mm *mmu_notifier_mm; | 275 | struct mmu_notifier_mm *mmu_notifier_mm; |
259 | #endif | 276 | #endif |
260 | }; | 277 | }; |
261 | 278 | ||
262 | #endif /* _LINUX_MM_TYPES_H */ | 279 | #endif /* _LINUX_MM_TYPES_H */ |
263 | 280 |
init/initramfs.c
1 | #include <linux/init.h> | 1 | #include <linux/init.h> |
2 | #include <linux/fs.h> | 2 | #include <linux/fs.h> |
3 | #include <linux/slab.h> | 3 | #include <linux/slab.h> |
4 | #include <linux/types.h> | 4 | #include <linux/types.h> |
5 | #include <linux/fcntl.h> | 5 | #include <linux/fcntl.h> |
6 | #include <linux/delay.h> | 6 | #include <linux/delay.h> |
7 | #include <linux/string.h> | 7 | #include <linux/string.h> |
8 | #include <linux/syscalls.h> | 8 | #include <linux/syscalls.h> |
9 | #include <linux/utime.h> | 9 | #include <linux/utime.h> |
10 | 10 | ||
11 | static __initdata char *message; | 11 | static __initdata char *message; |
12 | static void __init error(char *x) | 12 | static void __init error(char *x) |
13 | { | 13 | { |
14 | if (!message) | 14 | if (!message) |
15 | message = x; | 15 | message = x; |
16 | } | 16 | } |
17 | 17 | ||
18 | /* link hash */ | 18 | /* link hash */ |
19 | 19 | ||
20 | #define N_ALIGN(len) ((((len) + 1) & ~3) + 2) | 20 | #define N_ALIGN(len) ((((len) + 1) & ~3) + 2) |
21 | 21 | ||
22 | static __initdata struct hash { | 22 | static __initdata struct hash { |
23 | int ino, minor, major; | 23 | int ino, minor, major; |
24 | mode_t mode; | 24 | mode_t mode; |
25 | struct hash *next; | 25 | struct hash *next; |
26 | char name[N_ALIGN(PATH_MAX)]; | 26 | char name[N_ALIGN(PATH_MAX)]; |
27 | } *head[32]; | 27 | } *head[32]; |
28 | 28 | ||
29 | static inline int hash(int major, int minor, int ino) | 29 | static inline int hash(int major, int minor, int ino) |
30 | { | 30 | { |
31 | unsigned long tmp = ino + minor + (major << 3); | 31 | unsigned long tmp = ino + minor + (major << 3); |
32 | tmp += tmp >> 5; | 32 | tmp += tmp >> 5; |
33 | return tmp & 31; | 33 | return tmp & 31; |
34 | } | 34 | } |
35 | 35 | ||
36 | static char __init *find_link(int major, int minor, int ino, | 36 | static char __init *find_link(int major, int minor, int ino, |
37 | mode_t mode, char *name) | 37 | mode_t mode, char *name) |
38 | { | 38 | { |
39 | struct hash **p, *q; | 39 | struct hash **p, *q; |
40 | for (p = head + hash(major, minor, ino); *p; p = &(*p)->next) { | 40 | for (p = head + hash(major, minor, ino); *p; p = &(*p)->next) { |
41 | if ((*p)->ino != ino) | 41 | if ((*p)->ino != ino) |
42 | continue; | 42 | continue; |
43 | if ((*p)->minor != minor) | 43 | if ((*p)->minor != minor) |
44 | continue; | 44 | continue; |
45 | if ((*p)->major != major) | 45 | if ((*p)->major != major) |
46 | continue; | 46 | continue; |
47 | if (((*p)->mode ^ mode) & S_IFMT) | 47 | if (((*p)->mode ^ mode) & S_IFMT) |
48 | continue; | 48 | continue; |
49 | return (*p)->name; | 49 | return (*p)->name; |
50 | } | 50 | } |
51 | q = kmalloc(sizeof(struct hash), GFP_KERNEL); | 51 | q = kmalloc(sizeof(struct hash), GFP_KERNEL); |
52 | if (!q) | 52 | if (!q) |
53 | panic("can't allocate link hash entry"); | 53 | panic("can't allocate link hash entry"); |
54 | q->major = major; | 54 | q->major = major; |
55 | q->minor = minor; | 55 | q->minor = minor; |
56 | q->ino = ino; | 56 | q->ino = ino; |
57 | q->mode = mode; | 57 | q->mode = mode; |
58 | strcpy(q->name, name); | 58 | strcpy(q->name, name); |
59 | q->next = NULL; | 59 | q->next = NULL; |
60 | *p = q; | 60 | *p = q; |
61 | return NULL; | 61 | return NULL; |
62 | } | 62 | } |
63 | 63 | ||
64 | static void __init free_hash(void) | 64 | static void __init free_hash(void) |
65 | { | 65 | { |
66 | struct hash **p, *q; | 66 | struct hash **p, *q; |
67 | for (p = head; p < head + 32; p++) { | 67 | for (p = head; p < head + 32; p++) { |
68 | while (*p) { | 68 | while (*p) { |
69 | q = *p; | 69 | q = *p; |
70 | *p = q->next; | 70 | *p = q->next; |
71 | kfree(q); | 71 | kfree(q); |
72 | } | 72 | } |
73 | } | 73 | } |
74 | } | 74 | } |
75 | 75 | ||
76 | static long __init do_utime(char __user *filename, time_t mtime) | 76 | static long __init do_utime(char __user *filename, time_t mtime) |
77 | { | 77 | { |
78 | struct timespec t[2]; | 78 | struct timespec t[2]; |
79 | 79 | ||
80 | t[0].tv_sec = mtime; | 80 | t[0].tv_sec = mtime; |
81 | t[0].tv_nsec = 0; | 81 | t[0].tv_nsec = 0; |
82 | t[1].tv_sec = mtime; | 82 | t[1].tv_sec = mtime; |
83 | t[1].tv_nsec = 0; | 83 | t[1].tv_nsec = 0; |
84 | 84 | ||
85 | return do_utimes(AT_FDCWD, filename, t, AT_SYMLINK_NOFOLLOW); | 85 | return do_utimes(AT_FDCWD, filename, t, AT_SYMLINK_NOFOLLOW); |
86 | } | 86 | } |
87 | 87 | ||
88 | static __initdata LIST_HEAD(dir_list); | 88 | static __initdata LIST_HEAD(dir_list); |
89 | struct dir_entry { | 89 | struct dir_entry { |
90 | struct list_head list; | 90 | struct list_head list; |
91 | char *name; | 91 | char *name; |
92 | time_t mtime; | 92 | time_t mtime; |
93 | }; | 93 | }; |
94 | 94 | ||
95 | static void __init dir_add(const char *name, time_t mtime) | 95 | static void __init dir_add(const char *name, time_t mtime) |
96 | { | 96 | { |
97 | struct dir_entry *de = kmalloc(sizeof(struct dir_entry), GFP_KERNEL); | 97 | struct dir_entry *de = kmalloc(sizeof(struct dir_entry), GFP_KERNEL); |
98 | if (!de) | 98 | if (!de) |
99 | panic("can't allocate dir_entry buffer"); | 99 | panic("can't allocate dir_entry buffer"); |
100 | INIT_LIST_HEAD(&de->list); | 100 | INIT_LIST_HEAD(&de->list); |
101 | de->name = kstrdup(name, GFP_KERNEL); | 101 | de->name = kstrdup(name, GFP_KERNEL); |
102 | de->mtime = mtime; | 102 | de->mtime = mtime; |
103 | list_add(&de->list, &dir_list); | 103 | list_add(&de->list, &dir_list); |
104 | } | 104 | } |
105 | 105 | ||
106 | static void __init dir_utime(void) | 106 | static void __init dir_utime(void) |
107 | { | 107 | { |
108 | struct dir_entry *de, *tmp; | 108 | struct dir_entry *de, *tmp; |
109 | list_for_each_entry_safe(de, tmp, &dir_list, list) { | 109 | list_for_each_entry_safe(de, tmp, &dir_list, list) { |
110 | list_del(&de->list); | 110 | list_del(&de->list); |
111 | do_utime(de->name, de->mtime); | 111 | do_utime(de->name, de->mtime); |
112 | kfree(de->name); | 112 | kfree(de->name); |
113 | kfree(de); | 113 | kfree(de); |
114 | } | 114 | } |
115 | } | 115 | } |
116 | 116 | ||
117 | static __initdata time_t mtime; | 117 | static __initdata time_t mtime; |
118 | 118 | ||
119 | /* cpio header parsing */ | 119 | /* cpio header parsing */ |
120 | 120 | ||
121 | static __initdata unsigned long ino, major, minor, nlink; | 121 | static __initdata unsigned long ino, major, minor, nlink; |
122 | static __initdata mode_t mode; | 122 | static __initdata mode_t mode; |
123 | static __initdata unsigned long body_len, name_len; | 123 | static __initdata unsigned long body_len, name_len; |
124 | static __initdata uid_t uid; | 124 | static __initdata uid_t uid; |
125 | static __initdata gid_t gid; | 125 | static __initdata gid_t gid; |
126 | static __initdata unsigned rdev; | 126 | static __initdata unsigned rdev; |
127 | 127 | ||
128 | static void __init parse_header(char *s) | 128 | static void __init parse_header(char *s) |
129 | { | 129 | { |
130 | unsigned long parsed[12]; | 130 | unsigned long parsed[12]; |
131 | char buf[9]; | 131 | char buf[9]; |
132 | int i; | 132 | int i; |
133 | 133 | ||
134 | buf[8] = '\0'; | 134 | buf[8] = '\0'; |
135 | for (i = 0, s += 6; i < 12; i++, s += 8) { | 135 | for (i = 0, s += 6; i < 12; i++, s += 8) { |
136 | memcpy(buf, s, 8); | 136 | memcpy(buf, s, 8); |
137 | parsed[i] = simple_strtoul(buf, NULL, 16); | 137 | parsed[i] = simple_strtoul(buf, NULL, 16); |
138 | } | 138 | } |
139 | ino = parsed[0]; | 139 | ino = parsed[0]; |
140 | mode = parsed[1]; | 140 | mode = parsed[1]; |
141 | uid = parsed[2]; | 141 | uid = parsed[2]; |
142 | gid = parsed[3]; | 142 | gid = parsed[3]; |
143 | nlink = parsed[4]; | 143 | nlink = parsed[4]; |
144 | mtime = parsed[5]; | 144 | mtime = parsed[5]; |
145 | body_len = parsed[6]; | 145 | body_len = parsed[6]; |
146 | major = parsed[7]; | 146 | major = parsed[7]; |
147 | minor = parsed[8]; | 147 | minor = parsed[8]; |
148 | rdev = new_encode_dev(MKDEV(parsed[9], parsed[10])); | 148 | rdev = new_encode_dev(MKDEV(parsed[9], parsed[10])); |
149 | name_len = parsed[11]; | 149 | name_len = parsed[11]; |
150 | } | 150 | } |
151 | 151 | ||
152 | /* FSM */ | 152 | /* FSM */ |
153 | 153 | ||
154 | static __initdata enum state { | 154 | static __initdata enum state { |
155 | Start, | 155 | Start, |
156 | Collect, | 156 | Collect, |
157 | GotHeader, | 157 | GotHeader, |
158 | SkipIt, | 158 | SkipIt, |
159 | GotName, | 159 | GotName, |
160 | CopyFile, | 160 | CopyFile, |
161 | GotSymlink, | 161 | GotSymlink, |
162 | Reset | 162 | Reset |
163 | } state, next_state; | 163 | } state, next_state; |
164 | 164 | ||
165 | static __initdata char *victim; | 165 | static __initdata char *victim; |
166 | static __initdata unsigned count; | 166 | static __initdata unsigned count; |
167 | static __initdata loff_t this_header, next_header; | 167 | static __initdata loff_t this_header, next_header; |
168 | 168 | ||
169 | static __initdata int dry_run; | 169 | static __initdata int dry_run; |
170 | 170 | ||
171 | static inline void __init eat(unsigned n) | 171 | static inline void __init eat(unsigned n) |
172 | { | 172 | { |
173 | victim += n; | 173 | victim += n; |
174 | this_header += n; | 174 | this_header += n; |
175 | count -= n; | 175 | count -= n; |
176 | } | 176 | } |
177 | 177 | ||
178 | static __initdata char *vcollected; | 178 | static __initdata char *vcollected; |
179 | static __initdata char *collected; | 179 | static __initdata char *collected; |
180 | static __initdata int remains; | 180 | static __initdata int remains; |
181 | static __initdata char *collect; | 181 | static __initdata char *collect; |
182 | 182 | ||
183 | static void __init read_into(char *buf, unsigned size, enum state next) | 183 | static void __init read_into(char *buf, unsigned size, enum state next) |
184 | { | 184 | { |
185 | if (count >= size) { | 185 | if (count >= size) { |
186 | collected = victim; | 186 | collected = victim; |
187 | eat(size); | 187 | eat(size); |
188 | state = next; | 188 | state = next; |
189 | } else { | 189 | } else { |
190 | collect = collected = buf; | 190 | collect = collected = buf; |
191 | remains = size; | 191 | remains = size; |
192 | next_state = next; | 192 | next_state = next; |
193 | state = Collect; | 193 | state = Collect; |
194 | } | 194 | } |
195 | } | 195 | } |
196 | 196 | ||
197 | static __initdata char *header_buf, *symlink_buf, *name_buf; | 197 | static __initdata char *header_buf, *symlink_buf, *name_buf; |
198 | 198 | ||
199 | static int __init do_start(void) | 199 | static int __init do_start(void) |
200 | { | 200 | { |
201 | read_into(header_buf, 110, GotHeader); | 201 | read_into(header_buf, 110, GotHeader); |
202 | return 0; | 202 | return 0; |
203 | } | 203 | } |
204 | 204 | ||
205 | static int __init do_collect(void) | 205 | static int __init do_collect(void) |
206 | { | 206 | { |
207 | unsigned n = remains; | 207 | unsigned n = remains; |
208 | if (count < n) | 208 | if (count < n) |
209 | n = count; | 209 | n = count; |
210 | memcpy(collect, victim, n); | 210 | memcpy(collect, victim, n); |
211 | eat(n); | 211 | eat(n); |
212 | collect += n; | 212 | collect += n; |
213 | if ((remains -= n) != 0) | 213 | if ((remains -= n) != 0) |
214 | return 1; | 214 | return 1; |
215 | state = next_state; | 215 | state = next_state; |
216 | return 0; | 216 | return 0; |
217 | } | 217 | } |
218 | 218 | ||
219 | static int __init do_header(void) | 219 | static int __init do_header(void) |
220 | { | 220 | { |
221 | if (memcmp(collected, "070707", 6)==0) { | 221 | if (memcmp(collected, "070707", 6)==0) { |
222 | error("incorrect cpio method used: use -H newc option"); | 222 | error("incorrect cpio method used: use -H newc option"); |
223 | return 1; | 223 | return 1; |
224 | } | 224 | } |
225 | if (memcmp(collected, "070701", 6)) { | 225 | if (memcmp(collected, "070701", 6)) { |
226 | error("no cpio magic"); | 226 | error("no cpio magic"); |
227 | return 1; | 227 | return 1; |
228 | } | 228 | } |
229 | parse_header(collected); | 229 | parse_header(collected); |
230 | next_header = this_header + N_ALIGN(name_len) + body_len; | 230 | next_header = this_header + N_ALIGN(name_len) + body_len; |
231 | next_header = (next_header + 3) & ~3; | 231 | next_header = (next_header + 3) & ~3; |
232 | if (dry_run) { | 232 | if (dry_run) { |
233 | read_into(name_buf, N_ALIGN(name_len), GotName); | 233 | read_into(name_buf, N_ALIGN(name_len), GotName); |
234 | return 0; | 234 | return 0; |
235 | } | 235 | } |
236 | state = SkipIt; | 236 | state = SkipIt; |
237 | if (name_len <= 0 || name_len > PATH_MAX) | 237 | if (name_len <= 0 || name_len > PATH_MAX) |
238 | return 0; | 238 | return 0; |
239 | if (S_ISLNK(mode)) { | 239 | if (S_ISLNK(mode)) { |
240 | if (body_len > PATH_MAX) | 240 | if (body_len > PATH_MAX) |
241 | return 0; | 241 | return 0; |
242 | collect = collected = symlink_buf; | 242 | collect = collected = symlink_buf; |
243 | remains = N_ALIGN(name_len) + body_len; | 243 | remains = N_ALIGN(name_len) + body_len; |
244 | next_state = GotSymlink; | 244 | next_state = GotSymlink; |
245 | state = Collect; | 245 | state = Collect; |
246 | return 0; | 246 | return 0; |
247 | } | 247 | } |
248 | if (S_ISREG(mode) || !body_len) | 248 | if (S_ISREG(mode) || !body_len) |
249 | read_into(name_buf, N_ALIGN(name_len), GotName); | 249 | read_into(name_buf, N_ALIGN(name_len), GotName); |
250 | return 0; | 250 | return 0; |
251 | } | 251 | } |
252 | 252 | ||
253 | static int __init do_skip(void) | 253 | static int __init do_skip(void) |
254 | { | 254 | { |
255 | if (this_header + count < next_header) { | 255 | if (this_header + count < next_header) { |
256 | eat(count); | 256 | eat(count); |
257 | return 1; | 257 | return 1; |
258 | } else { | 258 | } else { |
259 | eat(next_header - this_header); | 259 | eat(next_header - this_header); |
260 | state = next_state; | 260 | state = next_state; |
261 | return 0; | 261 | return 0; |
262 | } | 262 | } |
263 | } | 263 | } |
264 | 264 | ||
265 | static int __init do_reset(void) | 265 | static int __init do_reset(void) |
266 | { | 266 | { |
267 | while(count && *victim == '\0') | 267 | while(count && *victim == '\0') |
268 | eat(1); | 268 | eat(1); |
269 | if (count && (this_header & 3)) | 269 | if (count && (this_header & 3)) |
270 | error("broken padding"); | 270 | error("broken padding"); |
271 | return 1; | 271 | return 1; |
272 | } | 272 | } |
273 | 273 | ||
274 | static int __init maybe_link(void) | 274 | static int __init maybe_link(void) |
275 | { | 275 | { |
276 | if (nlink >= 2) { | 276 | if (nlink >= 2) { |
277 | char *old = find_link(major, minor, ino, mode, collected); | 277 | char *old = find_link(major, minor, ino, mode, collected); |
278 | if (old) | 278 | if (old) |
279 | return (sys_link(old, collected) < 0) ? -1 : 1; | 279 | return (sys_link(old, collected) < 0) ? -1 : 1; |
280 | } | 280 | } |
281 | return 0; | 281 | return 0; |
282 | } | 282 | } |
283 | 283 | ||
284 | static void __init clean_path(char *path, mode_t mode) | 284 | static void __init clean_path(char *path, mode_t mode) |
285 | { | 285 | { |
286 | struct stat st; | 286 | struct stat st; |
287 | 287 | ||
288 | if (!sys_newlstat(path, &st) && (st.st_mode^mode) & S_IFMT) { | 288 | if (!sys_newlstat(path, &st) && (st.st_mode^mode) & S_IFMT) { |
289 | if (S_ISDIR(st.st_mode)) | 289 | if (S_ISDIR(st.st_mode)) |
290 | sys_rmdir(path); | 290 | sys_rmdir(path); |
291 | else | 291 | else |
292 | sys_unlink(path); | 292 | sys_unlink(path); |
293 | } | 293 | } |
294 | } | 294 | } |
295 | 295 | ||
296 | static __initdata int wfd; | 296 | static __initdata int wfd; |
297 | 297 | ||
298 | static int __init do_name(void) | 298 | static int __init do_name(void) |
299 | { | 299 | { |
300 | state = SkipIt; | 300 | state = SkipIt; |
301 | next_state = Reset; | 301 | next_state = Reset; |
302 | if (strcmp(collected, "TRAILER!!!") == 0) { | 302 | if (strcmp(collected, "TRAILER!!!") == 0) { |
303 | free_hash(); | 303 | free_hash(); |
304 | return 0; | 304 | return 0; |
305 | } | 305 | } |
306 | if (dry_run) | 306 | if (dry_run) |
307 | return 0; | 307 | return 0; |
308 | clean_path(collected, mode); | 308 | clean_path(collected, mode); |
309 | if (S_ISREG(mode)) { | 309 | if (S_ISREG(mode)) { |
310 | int ml = maybe_link(); | 310 | int ml = maybe_link(); |
311 | if (ml >= 0) { | 311 | if (ml >= 0) { |
312 | int openflags = O_WRONLY|O_CREAT; | 312 | int openflags = O_WRONLY|O_CREAT; |
313 | if (ml != 1) | 313 | if (ml != 1) |
314 | openflags |= O_TRUNC; | 314 | openflags |= O_TRUNC; |
315 | wfd = sys_open(collected, openflags, mode); | 315 | wfd = sys_open(collected, openflags, mode); |
316 | 316 | ||
317 | if (wfd >= 0) { | 317 | if (wfd >= 0) { |
318 | sys_fchown(wfd, uid, gid); | 318 | sys_fchown(wfd, uid, gid); |
319 | sys_fchmod(wfd, mode); | 319 | sys_fchmod(wfd, mode); |
320 | sys_ftruncate(wfd, body_len); | ||
320 | vcollected = kstrdup(collected, GFP_KERNEL); | 321 | vcollected = kstrdup(collected, GFP_KERNEL); |
321 | state = CopyFile; | 322 | state = CopyFile; |
322 | } | 323 | } |
323 | } | 324 | } |
324 | } else if (S_ISDIR(mode)) { | 325 | } else if (S_ISDIR(mode)) { |
325 | sys_mkdir(collected, mode); | 326 | sys_mkdir(collected, mode); |
326 | sys_chown(collected, uid, gid); | 327 | sys_chown(collected, uid, gid); |
327 | sys_chmod(collected, mode); | 328 | sys_chmod(collected, mode); |
328 | dir_add(collected, mtime); | 329 | dir_add(collected, mtime); |
329 | } else if (S_ISBLK(mode) || S_ISCHR(mode) || | 330 | } else if (S_ISBLK(mode) || S_ISCHR(mode) || |
330 | S_ISFIFO(mode) || S_ISSOCK(mode)) { | 331 | S_ISFIFO(mode) || S_ISSOCK(mode)) { |
331 | if (maybe_link() == 0) { | 332 | if (maybe_link() == 0) { |
332 | sys_mknod(collected, mode, rdev); | 333 | sys_mknod(collected, mode, rdev); |
333 | sys_chown(collected, uid, gid); | 334 | sys_chown(collected, uid, gid); |
334 | sys_chmod(collected, mode); | 335 | sys_chmod(collected, mode); |
335 | do_utime(collected, mtime); | 336 | do_utime(collected, mtime); |
336 | } | 337 | } |
337 | } | 338 | } |
338 | return 0; | 339 | return 0; |
339 | } | 340 | } |
340 | 341 | ||
341 | static int __init do_copy(void) | 342 | static int __init do_copy(void) |
342 | { | 343 | { |
343 | if (count >= body_len) { | 344 | if (count >= body_len) { |
344 | sys_write(wfd, victim, body_len); | 345 | sys_write(wfd, victim, body_len); |
345 | sys_close(wfd); | 346 | sys_close(wfd); |
346 | do_utime(vcollected, mtime); | 347 | do_utime(vcollected, mtime); |
347 | kfree(vcollected); | 348 | kfree(vcollected); |
348 | eat(body_len); | 349 | eat(body_len); |
349 | state = SkipIt; | 350 | state = SkipIt; |
350 | return 0; | 351 | return 0; |
351 | } else { | 352 | } else { |
352 | sys_write(wfd, victim, count); | 353 | sys_write(wfd, victim, count); |
353 | body_len -= count; | 354 | body_len -= count; |
354 | eat(count); | 355 | eat(count); |
355 | return 1; | 356 | return 1; |
356 | } | 357 | } |
357 | } | 358 | } |
358 | 359 | ||
359 | static int __init do_symlink(void) | 360 | static int __init do_symlink(void) |
360 | { | 361 | { |
361 | collected[N_ALIGN(name_len) + body_len] = '\0'; | 362 | collected[N_ALIGN(name_len) + body_len] = '\0'; |
362 | clean_path(collected, 0); | 363 | clean_path(collected, 0); |
363 | sys_symlink(collected + N_ALIGN(name_len), collected); | 364 | sys_symlink(collected + N_ALIGN(name_len), collected); |
364 | sys_lchown(collected, uid, gid); | 365 | sys_lchown(collected, uid, gid); |
365 | do_utime(collected, mtime); | 366 | do_utime(collected, mtime); |
366 | state = SkipIt; | 367 | state = SkipIt; |
367 | next_state = Reset; | 368 | next_state = Reset; |
368 | return 0; | 369 | return 0; |
369 | } | 370 | } |
370 | 371 | ||
371 | static __initdata int (*actions[])(void) = { | 372 | static __initdata int (*actions[])(void) = { |
372 | [Start] = do_start, | 373 | [Start] = do_start, |
373 | [Collect] = do_collect, | 374 | [Collect] = do_collect, |
374 | [GotHeader] = do_header, | 375 | [GotHeader] = do_header, |
375 | [SkipIt] = do_skip, | 376 | [SkipIt] = do_skip, |
376 | [GotName] = do_name, | 377 | [GotName] = do_name, |
377 | [CopyFile] = do_copy, | 378 | [CopyFile] = do_copy, |
378 | [GotSymlink] = do_symlink, | 379 | [GotSymlink] = do_symlink, |
379 | [Reset] = do_reset, | 380 | [Reset] = do_reset, |
380 | }; | 381 | }; |
381 | 382 | ||
382 | static int __init write_buffer(char *buf, unsigned len) | 383 | static int __init write_buffer(char *buf, unsigned len) |
383 | { | 384 | { |
384 | count = len; | 385 | count = len; |
385 | victim = buf; | 386 | victim = buf; |
386 | 387 | ||
387 | while (!actions[state]()) | 388 | while (!actions[state]()) |
388 | ; | 389 | ; |
389 | return len - count; | 390 | return len - count; |
390 | } | 391 | } |
391 | 392 | ||
392 | static void __init flush_buffer(char *buf, unsigned len) | 393 | static void __init flush_buffer(char *buf, unsigned len) |
393 | { | 394 | { |
394 | int written; | 395 | int written; |
395 | if (message) | 396 | if (message) |
396 | return; | 397 | return; |
397 | while ((written = write_buffer(buf, len)) < len && !message) { | 398 | while ((written = write_buffer(buf, len)) < len && !message) { |
398 | char c = buf[written]; | 399 | char c = buf[written]; |
399 | if (c == '0') { | 400 | if (c == '0') { |
400 | buf += written; | 401 | buf += written; |
401 | len -= written; | 402 | len -= written; |
402 | state = Start; | 403 | state = Start; |
403 | } else if (c == 0) { | 404 | } else if (c == 0) { |
404 | buf += written; | 405 | buf += written; |
405 | len -= written; | 406 | len -= written; |
406 | state = Reset; | 407 | state = Reset; |
407 | } else | 408 | } else |
408 | error("junk in compressed archive"); | 409 | error("junk in compressed archive"); |
409 | } | 410 | } |
410 | } | 411 | } |
411 | 412 | ||
412 | /* | 413 | /* |
413 | * gzip declarations | 414 | * gzip declarations |
414 | */ | 415 | */ |
415 | 416 | ||
416 | #define OF(args) args | 417 | #define OF(args) args |
417 | 418 | ||
418 | #ifndef memzero | 419 | #ifndef memzero |
419 | #define memzero(s, n) memset ((s), 0, (n)) | 420 | #define memzero(s, n) memset ((s), 0, (n)) |
420 | #endif | 421 | #endif |
421 | 422 | ||
422 | typedef unsigned char uch; | 423 | typedef unsigned char uch; |
423 | typedef unsigned short ush; | 424 | typedef unsigned short ush; |
424 | typedef unsigned long ulg; | 425 | typedef unsigned long ulg; |
425 | 426 | ||
426 | #define WSIZE 0x8000 /* window size--must be a power of two, and */ | 427 | #define WSIZE 0x8000 /* window size--must be a power of two, and */ |
427 | /* at least 32K for zip's deflate method */ | 428 | /* at least 32K for zip's deflate method */ |
428 | 429 | ||
429 | static uch *inbuf; | 430 | static uch *inbuf; |
430 | static uch *window; | 431 | static uch *window; |
431 | 432 | ||
432 | static unsigned insize; /* valid bytes in inbuf */ | 433 | static unsigned insize; /* valid bytes in inbuf */ |
433 | static unsigned inptr; /* index of next byte to be processed in inbuf */ | 434 | static unsigned inptr; /* index of next byte to be processed in inbuf */ |
434 | static unsigned outcnt; /* bytes in output buffer */ | 435 | static unsigned outcnt; /* bytes in output buffer */ |
435 | static long bytes_out; | 436 | static long bytes_out; |
436 | 437 | ||
437 | #define get_byte() (inptr < insize ? inbuf[inptr++] : -1) | 438 | #define get_byte() (inptr < insize ? inbuf[inptr++] : -1) |
438 | 439 | ||
439 | /* Diagnostic functions (stubbed out) */ | 440 | /* Diagnostic functions (stubbed out) */ |
440 | #define Assert(cond,msg) | 441 | #define Assert(cond,msg) |
441 | #define Trace(x) | 442 | #define Trace(x) |
442 | #define Tracev(x) | 443 | #define Tracev(x) |
443 | #define Tracevv(x) | 444 | #define Tracevv(x) |
444 | #define Tracec(c,x) | 445 | #define Tracec(c,x) |
445 | #define Tracecv(c,x) | 446 | #define Tracecv(c,x) |
446 | 447 | ||
447 | #define STATIC static | 448 | #define STATIC static |
448 | #define INIT __init | 449 | #define INIT __init |
449 | 450 | ||
450 | static void __init flush_window(void); | 451 | static void __init flush_window(void); |
451 | static void __init error(char *m); | 452 | static void __init error(char *m); |
452 | 453 | ||
453 | #define NO_INFLATE_MALLOC | 454 | #define NO_INFLATE_MALLOC |
454 | 455 | ||
455 | #include "../lib/inflate.c" | 456 | #include "../lib/inflate.c" |
456 | 457 | ||
457 | /* =========================================================================== | 458 | /* =========================================================================== |
458 | * Write the output window window[0..outcnt-1] and update crc and bytes_out. | 459 | * Write the output window window[0..outcnt-1] and update crc and bytes_out. |
459 | * (Used for the decompressed data only.) | 460 | * (Used for the decompressed data only.) |
460 | */ | 461 | */ |
461 | static void __init flush_window(void) | 462 | static void __init flush_window(void) |
462 | { | 463 | { |
463 | ulg c = crc; /* temporary variable */ | 464 | ulg c = crc; /* temporary variable */ |
464 | unsigned n; | 465 | unsigned n; |
465 | uch *in, ch; | 466 | uch *in, ch; |
466 | 467 | ||
467 | flush_buffer(window, outcnt); | 468 | flush_buffer(window, outcnt); |
468 | in = window; | 469 | in = window; |
469 | for (n = 0; n < outcnt; n++) { | 470 | for (n = 0; n < outcnt; n++) { |
470 | ch = *in++; | 471 | ch = *in++; |
471 | c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8); | 472 | c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8); |
472 | } | 473 | } |
473 | crc = c; | 474 | crc = c; |
474 | bytes_out += (ulg)outcnt; | 475 | bytes_out += (ulg)outcnt; |
475 | outcnt = 0; | 476 | outcnt = 0; |
476 | } | 477 | } |
477 | 478 | ||
/*
 * Unpack a (possibly multi-member, gzip-compressed) cpio archive at @buf of
 * @len bytes into the rootfs.  @check_only != 0 requests a dry run: the
 * archive is parsed but nothing is written (dry_run is consumed by the
 * write-out helpers).  Returns NULL on success or an error message string.
 */
static char * __init unpack_to_rootfs(char *buf, unsigned len, int check_only)
{
	int written;
	dry_run = check_only;
	/* 110 bytes — presumably the fixed cpio "newc" header size
	 * (6-byte magic + 13 eight-char hex fields); TODO confirm against
	 * the header parser. */
	header_buf = kmalloc(110, GFP_KERNEL);
	symlink_buf = kmalloc(PATH_MAX + N_ALIGN(PATH_MAX) + 1, GFP_KERNEL);
	name_buf = kmalloc(N_ALIGN(PATH_MAX), GFP_KERNEL);
	window = kmalloc(WSIZE, GFP_KERNEL);
	if (!window || !header_buf || !symlink_buf || !name_buf)
		panic("can't allocate buffers");
	state = Start;
	this_header = 0;
	message = NULL;
	/* Loop over concatenated archive members until input is exhausted
	 * or a parse error sets 'message'. */
	while (!message && len) {
		loff_t saved_offset = this_header;
		/* '0' at a 4-byte boundary: looks like an uncompressed cpio
		 * magic — feed the raw bytes straight to the state machine. */
		if (*buf == '0' && !(this_header & 3)) {
			state = Start;
			written = write_buffer(buf, len);
			buf += written;
			len -= written;
			continue;
		}
		/* Skip NUL padding between members. */
		if (!*buf) {
			buf++;
			len--;
			this_header++;
			continue;
		}
		/* Otherwise assume a gzip-compressed member: reset the
		 * decompressor state and inflate it in place. */
		this_header = 0;
		insize = len;
		inbuf = buf;
		inptr = 0;
		outcnt = 0;		/* bytes in output buffer */
		bytes_out = 0;
		crc = (ulg)0xffffffffL; /* shift register contents */
		makecrc();
		gunzip();
		/* A cleanly finished member leaves the parser in Reset. */
		if (state != Reset)
			error("junk in gzipped archive");
		/* Resume scanning right after the compressed member. */
		this_header = saved_offset + inptr;
		buf += inptr;
		len -= inptr;
	}
	dir_utime();
	kfree(window);
	kfree(name_buf);
	kfree(symlink_buf);
	kfree(header_buf);
	return message;
}
528 | 529 | ||
529 | static int __initdata do_retain_initrd; | 530 | static int __initdata do_retain_initrd; |
530 | 531 | ||
531 | static int __init retain_initrd_param(char *str) | 532 | static int __init retain_initrd_param(char *str) |
532 | { | 533 | { |
533 | if (*str) | 534 | if (*str) |
534 | return 0; | 535 | return 0; |
535 | do_retain_initrd = 1; | 536 | do_retain_initrd = 1; |
536 | return 1; | 537 | return 1; |
537 | } | 538 | } |
538 | __setup("retain_initrd", retain_initrd_param); | 539 | __setup("retain_initrd", retain_initrd_param); |
539 | 540 | ||
540 | extern char __initramfs_start[], __initramfs_end[]; | 541 | extern char __initramfs_start[], __initramfs_end[]; |
541 | #include <linux/initrd.h> | 542 | #include <linux/initrd.h> |
542 | #include <linux/kexec.h> | 543 | #include <linux/kexec.h> |
543 | 544 | ||
/*
 * Release the memory holding the external initrd image, unless the user
 * asked to keep it via "retain_initrd".  Under CONFIG_KEXEC, pages that
 * overlap the crashkernel reservation are zeroed but not freed.
 * Always clears initrd_start/initrd_end so the region is not reused.
 */
static void __init free_initrd(void)
{
#ifdef CONFIG_KEXEC
	unsigned long crashk_start = (unsigned long)__va(crashk_res.start);
	unsigned long crashk_end = (unsigned long)__va(crashk_res.end);
#endif
	if (do_retain_initrd)
		goto skip;

#ifdef CONFIG_KEXEC
	/*
	 * If the initrd region is overlapped with crashkernel reserved region,
	 * free only memory that is not part of crashkernel region.
	 */
	if (initrd_start < crashk_end && initrd_end > crashk_start) {
		/*
		 * Initialize initrd memory region since the kexec boot does
		 * not do.
		 */
		memset((void *)initrd_start, 0, initrd_end - initrd_start);
		/* Free the portions hanging off either end of the
		 * crashkernel window. */
		if (initrd_start < crashk_start)
			free_initrd_mem(initrd_start, crashk_start);
		if (initrd_end > crashk_end)
			free_initrd_mem(crashk_end, initrd_end);
	} else
#endif
		free_initrd_mem(initrd_start, initrd_end);
skip:
	/* Mark the initrd as gone either way. */
	initrd_start = 0;
	initrd_end = 0;
}
575 | 576 | ||
576 | static int __init populate_rootfs(void) | 577 | static int __init populate_rootfs(void) |
577 | { | 578 | { |
578 | char *err = unpack_to_rootfs(__initramfs_start, | 579 | char *err = unpack_to_rootfs(__initramfs_start, |
579 | __initramfs_end - __initramfs_start, 0); | 580 | __initramfs_end - __initramfs_start, 0); |
580 | if (err) | 581 | if (err) |
581 | panic(err); | 582 | panic(err); |
582 | if (initrd_start) { | 583 | if (initrd_start) { |
583 | #ifdef CONFIG_BLK_DEV_RAM | 584 | #ifdef CONFIG_BLK_DEV_RAM |
584 | int fd; | 585 | int fd; |
585 | printk(KERN_INFO "checking if image is initramfs..."); | 586 | printk(KERN_INFO "checking if image is initramfs..."); |
586 | err = unpack_to_rootfs((char *)initrd_start, | 587 | err = unpack_to_rootfs((char *)initrd_start, |
587 | initrd_end - initrd_start, 1); | 588 | initrd_end - initrd_start, 1); |
588 | if (!err) { | 589 | if (!err) { |
589 | printk(" it is\n"); | 590 | printk(" it is\n"); |
590 | unpack_to_rootfs((char *)initrd_start, | 591 | unpack_to_rootfs((char *)initrd_start, |
591 | initrd_end - initrd_start, 0); | 592 | initrd_end - initrd_start, 0); |
592 | free_initrd(); | 593 | free_initrd(); |
593 | return 0; | 594 | return 0; |
594 | } | 595 | } |
595 | printk("it isn't (%s); looks like an initrd\n", err); | 596 | printk("it isn't (%s); looks like an initrd\n", err); |
596 | fd = sys_open("/initrd.image", O_WRONLY|O_CREAT, 0700); | 597 | fd = sys_open("/initrd.image", O_WRONLY|O_CREAT, 0700); |
597 | if (fd >= 0) { | 598 | if (fd >= 0) { |
598 | sys_write(fd, (char *)initrd_start, | 599 | sys_write(fd, (char *)initrd_start, |
599 | initrd_end - initrd_start); | 600 | initrd_end - initrd_start); |
600 | sys_close(fd); | 601 | sys_close(fd); |
601 | free_initrd(); | 602 | free_initrd(); |
602 | } | 603 | } |
603 | #else | 604 | #else |
604 | printk(KERN_INFO "Unpacking initramfs..."); | 605 | printk(KERN_INFO "Unpacking initramfs..."); |
605 | err = unpack_to_rootfs((char *)initrd_start, | 606 | err = unpack_to_rootfs((char *)initrd_start, |
606 | initrd_end - initrd_start, 0); | 607 | initrd_end - initrd_start, 0); |
607 | if (err) | 608 | if (err) |
608 | panic(err); | 609 | panic(err); |
609 | printk(" done\n"); | 610 | printk(" done\n"); |
610 | free_initrd(); | 611 | free_initrd(); |
611 | #endif | 612 | #endif |
612 | } | 613 | } |
613 | return 0; | 614 | return 0; |
614 | } | 615 | } |
615 | rootfs_initcall(populate_rootfs); | 616 | rootfs_initcall(populate_rootfs); |
616 | 617 |
ipc/shm.c
1 | /* | 1 | /* |
2 | * linux/ipc/shm.c | 2 | * linux/ipc/shm.c |
3 | * Copyright (C) 1992, 1993 Krishna Balasubramanian | 3 | * Copyright (C) 1992, 1993 Krishna Balasubramanian |
4 | * Many improvements/fixes by Bruno Haible. | 4 | * Many improvements/fixes by Bruno Haible. |
5 | * Replaced `struct shm_desc' by `struct vm_area_struct', July 1994. | 5 | * Replaced `struct shm_desc' by `struct vm_area_struct', July 1994. |
6 | * Fixed the shm swap deallocation (shm_unuse()), August 1998 Andrea Arcangeli. | 6 | * Fixed the shm swap deallocation (shm_unuse()), August 1998 Andrea Arcangeli. |
7 | * | 7 | * |
8 | * /proc/sysvipc/shm support (c) 1999 Dragos Acostachioaie <dragos@iname.com> | 8 | * /proc/sysvipc/shm support (c) 1999 Dragos Acostachioaie <dragos@iname.com> |
9 | * BIGMEM support, Andrea Arcangeli <andrea@suse.de> | 9 | * BIGMEM support, Andrea Arcangeli <andrea@suse.de> |
10 | * SMP thread shm, Jean-Luc Boyard <jean-luc.boyard@siemens.fr> | 10 | * SMP thread shm, Jean-Luc Boyard <jean-luc.boyard@siemens.fr> |
11 | * HIGHMEM support, Ingo Molnar <mingo@redhat.com> | 11 | * HIGHMEM support, Ingo Molnar <mingo@redhat.com> |
12 | * Make shmmax, shmall, shmmni sysctl'able, Christoph Rohland <cr@sap.com> | 12 | * Make shmmax, shmall, shmmni sysctl'able, Christoph Rohland <cr@sap.com> |
13 | * Shared /dev/zero support, Kanoj Sarcar <kanoj@sgi.com> | 13 | * Shared /dev/zero support, Kanoj Sarcar <kanoj@sgi.com> |
14 | * Move the mm functionality over to mm/shmem.c, Christoph Rohland <cr@sap.com> | 14 | * Move the mm functionality over to mm/shmem.c, Christoph Rohland <cr@sap.com> |
15 | * | 15 | * |
16 | * support for audit of ipc object properties and permission changes | 16 | * support for audit of ipc object properties and permission changes |
17 | * Dustin Kirkland <dustin.kirkland@us.ibm.com> | 17 | * Dustin Kirkland <dustin.kirkland@us.ibm.com> |
18 | * | 18 | * |
19 | * namespaces support | 19 | * namespaces support |
20 | * OpenVZ, SWsoft Inc. | 20 | * OpenVZ, SWsoft Inc. |
21 | * Pavel Emelianov <xemul@openvz.org> | 21 | * Pavel Emelianov <xemul@openvz.org> |
22 | */ | 22 | */ |
23 | 23 | ||
24 | #include <linux/slab.h> | 24 | #include <linux/slab.h> |
25 | #include <linux/mm.h> | 25 | #include <linux/mm.h> |
26 | #include <linux/hugetlb.h> | 26 | #include <linux/hugetlb.h> |
27 | #include <linux/shm.h> | 27 | #include <linux/shm.h> |
28 | #include <linux/init.h> | 28 | #include <linux/init.h> |
29 | #include <linux/file.h> | 29 | #include <linux/file.h> |
30 | #include <linux/mman.h> | 30 | #include <linux/mman.h> |
31 | #include <linux/shmem_fs.h> | 31 | #include <linux/shmem_fs.h> |
32 | #include <linux/security.h> | 32 | #include <linux/security.h> |
33 | #include <linux/syscalls.h> | 33 | #include <linux/syscalls.h> |
34 | #include <linux/audit.h> | 34 | #include <linux/audit.h> |
35 | #include <linux/capability.h> | 35 | #include <linux/capability.h> |
36 | #include <linux/ptrace.h> | 36 | #include <linux/ptrace.h> |
37 | #include <linux/seq_file.h> | 37 | #include <linux/seq_file.h> |
38 | #include <linux/rwsem.h> | 38 | #include <linux/rwsem.h> |
39 | #include <linux/nsproxy.h> | 39 | #include <linux/nsproxy.h> |
40 | #include <linux/mount.h> | 40 | #include <linux/mount.h> |
41 | #include <linux/ipc_namespace.h> | 41 | #include <linux/ipc_namespace.h> |
42 | 42 | ||
43 | #include <asm/uaccess.h> | 43 | #include <asm/uaccess.h> |
44 | 44 | ||
45 | #include "util.h" | 45 | #include "util.h" |
46 | 46 | ||
/*
 * Per-attach state hung off the shm pseudo file's ->private_data.
 */
struct shm_file_data {
	int id;				/* ipc id of the segment */
	struct ipc_namespace *ns;	/* namespace the segment lives in */
	struct file *file;		/* the backing file */
	const struct vm_operations_struct *vm_ops; /* backing file's vm_ops */
};

/* Accessor (lvalue) for the shm_file_data stashed in ->private_data. */
#define shm_file_data(file) (*((struct shm_file_data **)&(file)->private_data))

static const struct file_operations shm_file_operations;
static struct vm_operations_struct shm_vm_ops;

/* The shm id table of an ipc namespace. */
#define shm_ids(ns) ((ns)->ids[IPC_SHM_IDS])

#define shm_unlock(shp) \
	ipc_unlock(&(shp)->shm_perm)

/* Forward declarations for routines used before their definitions. */
static int newseg(struct ipc_namespace *, struct ipc_params *);
static void shm_open(struct vm_area_struct *vma);
static void shm_close(struct vm_area_struct *vma);
static void shm_destroy (struct ipc_namespace *ns, struct shmid_kernel *shp);
#ifdef CONFIG_PROC_FS
static int sysvipc_shm_proc_show(struct seq_file *s, void *it);
#endif
71 | 71 | ||
72 | void shm_init_ns(struct ipc_namespace *ns) | 72 | void shm_init_ns(struct ipc_namespace *ns) |
73 | { | 73 | { |
74 | ns->shm_ctlmax = SHMMAX; | 74 | ns->shm_ctlmax = SHMMAX; |
75 | ns->shm_ctlall = SHMALL; | 75 | ns->shm_ctlall = SHMALL; |
76 | ns->shm_ctlmni = SHMMNI; | 76 | ns->shm_ctlmni = SHMMNI; |
77 | ns->shm_tot = 0; | 77 | ns->shm_tot = 0; |
78 | ipc_init_ids(&shm_ids(ns)); | 78 | ipc_init_ids(&shm_ids(ns)); |
79 | } | 79 | } |
80 | 80 | ||
81 | /* | 81 | /* |
82 | * Called with shm_ids.rw_mutex (writer) and the shp structure locked. | 82 | * Called with shm_ids.rw_mutex (writer) and the shp structure locked. |
83 | * Only shm_ids.rw_mutex remains locked on exit. | 83 | * Only shm_ids.rw_mutex remains locked on exit. |
84 | */ | 84 | */ |
85 | static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) | 85 | static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) |
86 | { | 86 | { |
87 | struct shmid_kernel *shp; | 87 | struct shmid_kernel *shp; |
88 | shp = container_of(ipcp, struct shmid_kernel, shm_perm); | 88 | shp = container_of(ipcp, struct shmid_kernel, shm_perm); |
89 | 89 | ||
90 | if (shp->shm_nattch){ | 90 | if (shp->shm_nattch){ |
91 | shp->shm_perm.mode |= SHM_DEST; | 91 | shp->shm_perm.mode |= SHM_DEST; |
92 | /* Do not find it any more */ | 92 | /* Do not find it any more */ |
93 | shp->shm_perm.key = IPC_PRIVATE; | 93 | shp->shm_perm.key = IPC_PRIVATE; |
94 | shm_unlock(shp); | 94 | shm_unlock(shp); |
95 | } else | 95 | } else |
96 | shm_destroy(ns, shp); | 96 | shm_destroy(ns, shp); |
97 | } | 97 | } |
98 | 98 | ||
#ifdef CONFIG_IPC_NS
/* Tear down every shm segment of a dying ipc namespace. */
void shm_exit_ns(struct ipc_namespace *ns)
{
	free_ipcs(ns, &shm_ids(ns), do_shm_rmid);
}
#endif
105 | 105 | ||
/*
 * One-time boot setup: initialise the initial namespace's shm state and
 * register the sysvipc/shm proc interface.
 */
void __init shm_init (void)
{
	shm_init_ns(&init_ipc_ns);
	ipc_init_proc_interface("sysvipc/shm",
				" key shmid perms size cpid lpid nattch uid gid cuid cgid atime dtime ctime\n",
				IPC_SHM_IDS, sysvipc_shm_proc_show);
}
113 | 113 | ||
114 | /* | 114 | /* |
115 | * shm_lock_(check_) routines are called in the paths where the rw_mutex | 115 | * shm_lock_(check_) routines are called in the paths where the rw_mutex |
116 | * is not necessarily held. | 116 | * is not necessarily held. |
117 | */ | 117 | */ |
118 | static inline struct shmid_kernel *shm_lock(struct ipc_namespace *ns, int id) | 118 | static inline struct shmid_kernel *shm_lock(struct ipc_namespace *ns, int id) |
119 | { | 119 | { |
120 | struct kern_ipc_perm *ipcp = ipc_lock(&shm_ids(ns), id); | 120 | struct kern_ipc_perm *ipcp = ipc_lock(&shm_ids(ns), id); |
121 | 121 | ||
122 | if (IS_ERR(ipcp)) | 122 | if (IS_ERR(ipcp)) |
123 | return (struct shmid_kernel *)ipcp; | 123 | return (struct shmid_kernel *)ipcp; |
124 | 124 | ||
125 | return container_of(ipcp, struct shmid_kernel, shm_perm); | 125 | return container_of(ipcp, struct shmid_kernel, shm_perm); |
126 | } | 126 | } |
127 | 127 | ||
128 | static inline struct shmid_kernel *shm_lock_check(struct ipc_namespace *ns, | 128 | static inline struct shmid_kernel *shm_lock_check(struct ipc_namespace *ns, |
129 | int id) | 129 | int id) |
130 | { | 130 | { |
131 | struct kern_ipc_perm *ipcp = ipc_lock_check(&shm_ids(ns), id); | 131 | struct kern_ipc_perm *ipcp = ipc_lock_check(&shm_ids(ns), id); |
132 | 132 | ||
133 | if (IS_ERR(ipcp)) | 133 | if (IS_ERR(ipcp)) |
134 | return (struct shmid_kernel *)ipcp; | 134 | return (struct shmid_kernel *)ipcp; |
135 | 135 | ||
136 | return container_of(ipcp, struct shmid_kernel, shm_perm); | 136 | return container_of(ipcp, struct shmid_kernel, shm_perm); |
137 | } | 137 | } |
138 | 138 | ||
/* Remove segment @s from the namespace's shm id table. */
static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s)
{
	ipc_rmid(&shm_ids(ns), &s->shm_perm);
}
143 | 143 | ||
144 | 144 | ||
/* This is called by fork, once for every shm attach. */
static void shm_open(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct shm_file_data *sfd = shm_file_data(file);
	struct shmid_kernel *shp;

	shp = shm_lock(sfd->ns, sfd->id);
	/* The segment must still exist while a mapping of it exists. */
	BUG_ON(IS_ERR(shp));
	shp->shm_atim = get_seconds();		/* update attach time */
	shp->shm_lprid = task_tgid_vnr(current); /* record caller's tgid */
	shp->shm_nattch++;			/* one more attach */
	shm_unlock(shp);
}
159 | 159 | ||
/*
 * shm_destroy - free the struct shmid_kernel
 *
 * @ns: namespace
 * @shp: struct to free
 *
 * It has to be called with shp and shm_ids.rw_mutex (writer) locked,
 * but returns with shp unlocked and freed.
 */
static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
{
	/* Return the segment's pages to the namespace-wide budget. */
	ns->shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT;
	shm_rmid(ns, shp);
	shm_unlock(shp);
	/* Release any SHM_LOCK accounting held against mlock_user. */
	if (!is_file_hugepages(shp->shm_file))
		shmem_lock(shp->shm_file, 0, shp->mlock_user);
	else
		user_shm_unlock(shp->shm_file->f_path.dentry->d_inode->i_size,
						shp->mlock_user);
	fput (shp->shm_file);
	security_shm_free(shp);
	ipc_rcu_putref(shp);	/* drops the last reference to shp */
}
183 | 183 | ||
/*
 * remove the attach descriptor vma.
 * free memory for segment if it is marked destroyed.
 * The descriptor has already been removed from the current->mm->mmap list
 * and will later be kfree()d.
 */
static void shm_close(struct vm_area_struct *vma)
{
	struct file * file = vma->vm_file;
	struct shm_file_data *sfd = shm_file_data(file);
	struct shmid_kernel *shp;
	struct ipc_namespace *ns = sfd->ns;

	/* Writer lock: shm_destroy() below requires it. */
	down_write(&shm_ids(ns).rw_mutex);
	/* remove from the list of attaches of the shm segment */
	shp = shm_lock(ns, sfd->id);
	BUG_ON(IS_ERR(shp));
	shp->shm_lprid = task_tgid_vnr(current);
	shp->shm_dtim = get_seconds();	/* update detach time */
	shp->shm_nattch--;
	/* Last detach of a segment flagged SHM_DEST frees it now. */
	if(shp->shm_nattch == 0 &&
	   shp->shm_perm.mode & SHM_DEST)
		shm_destroy(ns, shp);
	else
		shm_unlock(shp);
	up_write(&shm_ids(ns).rw_mutex);
}
211 | 211 | ||
212 | static int shm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | 212 | static int shm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) |
213 | { | 213 | { |
214 | struct file *file = vma->vm_file; | 214 | struct file *file = vma->vm_file; |
215 | struct shm_file_data *sfd = shm_file_data(file); | 215 | struct shm_file_data *sfd = shm_file_data(file); |
216 | 216 | ||
217 | return sfd->vm_ops->fault(vma, vmf); | 217 | return sfd->vm_ops->fault(vma, vmf); |
218 | } | 218 | } |
219 | 219 | ||
220 | #ifdef CONFIG_NUMA | 220 | #ifdef CONFIG_NUMA |
221 | static int shm_set_policy(struct vm_area_struct *vma, struct mempolicy *new) | 221 | static int shm_set_policy(struct vm_area_struct *vma, struct mempolicy *new) |
222 | { | 222 | { |
223 | struct file *file = vma->vm_file; | 223 | struct file *file = vma->vm_file; |
224 | struct shm_file_data *sfd = shm_file_data(file); | 224 | struct shm_file_data *sfd = shm_file_data(file); |
225 | int err = 0; | 225 | int err = 0; |
226 | if (sfd->vm_ops->set_policy) | 226 | if (sfd->vm_ops->set_policy) |
227 | err = sfd->vm_ops->set_policy(vma, new); | 227 | err = sfd->vm_ops->set_policy(vma, new); |
228 | return err; | 228 | return err; |
229 | } | 229 | } |
230 | 230 | ||
231 | static struct mempolicy *shm_get_policy(struct vm_area_struct *vma, | 231 | static struct mempolicy *shm_get_policy(struct vm_area_struct *vma, |
232 | unsigned long addr) | 232 | unsigned long addr) |
233 | { | 233 | { |
234 | struct file *file = vma->vm_file; | 234 | struct file *file = vma->vm_file; |
235 | struct shm_file_data *sfd = shm_file_data(file); | 235 | struct shm_file_data *sfd = shm_file_data(file); |
236 | struct mempolicy *pol = NULL; | 236 | struct mempolicy *pol = NULL; |
237 | 237 | ||
238 | if (sfd->vm_ops->get_policy) | 238 | if (sfd->vm_ops->get_policy) |
239 | pol = sfd->vm_ops->get_policy(vma, addr); | 239 | pol = sfd->vm_ops->get_policy(vma, addr); |
240 | else if (vma->vm_policy) | 240 | else if (vma->vm_policy) |
241 | pol = vma->vm_policy; | 241 | pol = vma->vm_policy; |
242 | 242 | ||
243 | return pol; | 243 | return pol; |
244 | } | 244 | } |
245 | #endif | 245 | #endif |
246 | 246 | ||
/*
 * mmap of the shm pseudo file: let the backing file set up the mapping,
 * then interpose shm_vm_ops so open/close can maintain attach statistics.
 */
static int shm_mmap(struct file * file, struct vm_area_struct * vma)
{
	struct shm_file_data *sfd = shm_file_data(file);
	int ret;

	ret = sfd->file->f_op->mmap(sfd->file, vma);
	if (ret != 0)
		return ret;
	/* Save the backing vm_ops before replacing them; shm_fault()
	 * forwards to this saved copy. */
	sfd->vm_ops = vma->vm_ops;
#ifdef CONFIG_MMU
	BUG_ON(!sfd->vm_ops->fault);
#endif
	vma->vm_ops = &shm_vm_ops;
	shm_open(vma);	/* count this mapping as an attach */

	return ret;
}
264 | 264 | ||
/*
 * ->release for the shm pseudo file: drop the namespace reference held
 * in the per-attach data and free the bookkeeping itself.
 */
static int shm_release(struct inode *ino, struct file *file)
{
	struct shm_file_data *sfd = shm_file_data(file);

	put_ipc_ns(sfd->ns);
	shm_file_data(file) = NULL;	/* avoid a dangling private_data */
	kfree(sfd);
	return 0;
}
274 | 274 | ||
275 | static int shm_fsync(struct file *file, struct dentry *dentry, int datasync) | 275 | static int shm_fsync(struct file *file, struct dentry *dentry, int datasync) |
276 | { | 276 | { |
277 | int (*fsync) (struct file *, struct dentry *, int datasync); | 277 | int (*fsync) (struct file *, struct dentry *, int datasync); |
278 | struct shm_file_data *sfd = shm_file_data(file); | 278 | struct shm_file_data *sfd = shm_file_data(file); |
279 | int ret = -EINVAL; | 279 | int ret = -EINVAL; |
280 | 280 | ||
281 | fsync = sfd->file->f_op->fsync; | 281 | fsync = sfd->file->f_op->fsync; |
282 | if (fsync) | 282 | if (fsync) |
283 | ret = fsync(sfd->file, sfd->file->f_path.dentry, datasync); | 283 | ret = fsync(sfd->file, sfd->file->f_path.dentry, datasync); |
284 | return ret; | 284 | return ret; |
285 | } | 285 | } |
286 | 286 | ||
287 | static unsigned long shm_get_unmapped_area(struct file *file, | 287 | static unsigned long shm_get_unmapped_area(struct file *file, |
288 | unsigned long addr, unsigned long len, unsigned long pgoff, | 288 | unsigned long addr, unsigned long len, unsigned long pgoff, |
289 | unsigned long flags) | 289 | unsigned long flags) |
290 | { | 290 | { |
291 | struct shm_file_data *sfd = shm_file_data(file); | 291 | struct shm_file_data *sfd = shm_file_data(file); |
292 | return get_unmapped_area(sfd->file, addr, len, pgoff, flags); | 292 | return get_unmapped_area(sfd->file, addr, len, pgoff, flags); |
293 | } | 293 | } |
294 | 294 | ||
295 | int is_file_shm_hugepages(struct file *file) | 295 | int is_file_shm_hugepages(struct file *file) |
296 | { | 296 | { |
297 | int ret = 0; | 297 | int ret = 0; |
298 | 298 | ||
299 | if (file->f_op == &shm_file_operations) { | 299 | if (file->f_op == &shm_file_operations) { |
300 | struct shm_file_data *sfd; | 300 | struct shm_file_data *sfd; |
301 | sfd = shm_file_data(file); | 301 | sfd = shm_file_data(file); |
302 | ret = is_file_hugepages(sfd->file); | 302 | ret = is_file_hugepages(sfd->file); |
303 | } | 303 | } |
304 | return ret; | 304 | return ret; |
305 | } | 305 | } |
306 | 306 | ||
/* File operations of the SysV shm pseudo file; each hook forwards to
 * the backing file where appropriate. */
static const struct file_operations shm_file_operations = {
	.mmap = shm_mmap,
	.fsync = shm_fsync,
	.release = shm_release,
	.get_unmapped_area = shm_get_unmapped_area,
};
313 | 313 | ||
/* vm_operations interposed on shm mappings to track attach/detach. */
static struct vm_operations_struct shm_vm_ops = {
	.open = shm_open, /* callback for a new vm-area open */
	.close = shm_close, /* callback for when the vm-area is released */
	.fault = shm_fault,
#if defined(CONFIG_NUMA)
	.set_policy = shm_set_policy,
	.get_policy = shm_get_policy,
#endif
};
323 | 323 | ||
/**
 * newseg - Create a new shared memory segment
 * @ns: namespace
 * @params: ptr to the structure that contains key, size and shmflg
 *
 * Called with shm_ids.rw_mutex held as a writer.
 *
 * Returns the new segment's ipc id, or a negative errno.
 */

static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
{
	key_t key = params->key;
	int shmflg = params->flg;
	size_t size = params->u.size;
	int error;
	struct shmid_kernel *shp;
	int numpages = (size + PAGE_SIZE -1) >> PAGE_SHIFT;
	struct file * file;
	char name[13];	/* "SYSV" + 8 hex digits + NUL */
	int id;

	/* Per-segment size limits. */
	if (size < SHMMIN || size > ns->shm_ctlmax)
		return -EINVAL;

	/* Namespace-wide page budget. */
	if (ns->shm_tot + numpages > ns->shm_ctlall)
		return -ENOSPC;

	shp = ipc_rcu_alloc(sizeof(*shp));
	if (!shp)
		return -ENOMEM;

	shp->shm_perm.key = key;
	shp->shm_perm.mode = (shmflg & S_IRWXUGO);
	shp->mlock_user = NULL;

	shp->shm_perm.security = NULL;
	error = security_shm_alloc(shp);
	if (error) {
		ipc_rcu_putref(shp);
		return error;
	}

	sprintf (name, "SYSV%08x", key);
	if (shmflg & SHM_HUGETLB) {
		/* hugetlb_file_setup takes care of mlock user accounting */
		file = hugetlb_file_setup(name, size);
		shp->mlock_user = current_user();
	} else {
		int acctflag = VM_ACCOUNT;
		/*
		 * Do not allow no accounting for OVERCOMMIT_NEVER, even
		 * if it's asked for.
		 */
		if ((shmflg & SHM_NORESERVE) &&
				sysctl_overcommit_memory != OVERCOMMIT_NEVER)
			acctflag = 0;
		file = shmem_file_setup(name, size, acctflag);
	}
	error = PTR_ERR(file);
	if (IS_ERR(file))
		goto no_file;

	/* Publish the segment in the id table; returns it locked. */
	id = ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni);
	if (id < 0) {
		error = id;
		goto no_id;
	}

	shp->shm_cprid = task_tgid_vnr(current);
	shp->shm_lprid = 0;
	shp->shm_atim = shp->shm_dtim = 0;
	shp->shm_ctim = get_seconds();
	shp->shm_segsz = size;
	shp->shm_nattch = 0;
	shp->shm_file = file;
	/*
	 * shmid gets reported as "inode#" in /proc/pid/maps.
	 * proc-ps tools use this. Changing this will break them.
	 */
	file->f_dentry->d_inode->i_ino = shp->shm_perm.id;

	ns->shm_tot += numpages;
	error = shp->shm_perm.id;
	shm_unlock(shp);
	return error;

	/* Error unwinding: release resources in reverse acquisition order. */
no_id:
	fput(file);
no_file:
	security_shm_free(shp);
	ipc_rcu_putref(shp);
	return error;
}
416 | 416 | ||
417 | /* | 417 | /* |
418 | * Called with shm_ids.rw_mutex and ipcp locked. | 418 | * Called with shm_ids.rw_mutex and ipcp locked. |
419 | */ | 419 | */ |
420 | static inline int shm_security(struct kern_ipc_perm *ipcp, int shmflg) | 420 | static inline int shm_security(struct kern_ipc_perm *ipcp, int shmflg) |
421 | { | 421 | { |
422 | struct shmid_kernel *shp; | 422 | struct shmid_kernel *shp; |
423 | 423 | ||
424 | shp = container_of(ipcp, struct shmid_kernel, shm_perm); | 424 | shp = container_of(ipcp, struct shmid_kernel, shm_perm); |
425 | return security_shm_associate(shp, shmflg); | 425 | return security_shm_associate(shp, shmflg); |
426 | } | 426 | } |
427 | 427 | ||
428 | /* | 428 | /* |
429 | * Called with shm_ids.rw_mutex and ipcp locked. | 429 | * Called with shm_ids.rw_mutex and ipcp locked. |
430 | */ | 430 | */ |
431 | static inline int shm_more_checks(struct kern_ipc_perm *ipcp, | 431 | static inline int shm_more_checks(struct kern_ipc_perm *ipcp, |
432 | struct ipc_params *params) | 432 | struct ipc_params *params) |
433 | { | 433 | { |
434 | struct shmid_kernel *shp; | 434 | struct shmid_kernel *shp; |
435 | 435 | ||
436 | shp = container_of(ipcp, struct shmid_kernel, shm_perm); | 436 | shp = container_of(ipcp, struct shmid_kernel, shm_perm); |
437 | if (shp->shm_segsz < params->u.size) | 437 | if (shp->shm_segsz < params->u.size) |
438 | return -EINVAL; | 438 | return -EINVAL; |
439 | 439 | ||
440 | return 0; | 440 | return 0; |
441 | } | 441 | } |
442 | 442 | ||
443 | asmlinkage long sys_shmget (key_t key, size_t size, int shmflg) | 443 | asmlinkage long sys_shmget (key_t key, size_t size, int shmflg) |
444 | { | 444 | { |
445 | struct ipc_namespace *ns; | 445 | struct ipc_namespace *ns; |
446 | struct ipc_ops shm_ops; | 446 | struct ipc_ops shm_ops; |
447 | struct ipc_params shm_params; | 447 | struct ipc_params shm_params; |
448 | 448 | ||
449 | ns = current->nsproxy->ipc_ns; | 449 | ns = current->nsproxy->ipc_ns; |
450 | 450 | ||
451 | shm_ops.getnew = newseg; | 451 | shm_ops.getnew = newseg; |
452 | shm_ops.associate = shm_security; | 452 | shm_ops.associate = shm_security; |
453 | shm_ops.more_checks = shm_more_checks; | 453 | shm_ops.more_checks = shm_more_checks; |
454 | 454 | ||
455 | shm_params.key = key; | 455 | shm_params.key = key; |
456 | shm_params.flg = shmflg; | 456 | shm_params.flg = shmflg; |
457 | shm_params.u.size = size; | 457 | shm_params.u.size = size; |
458 | 458 | ||
459 | return ipcget(ns, &shm_ids(ns), &shm_ops, &shm_params); | 459 | return ipcget(ns, &shm_ids(ns), &shm_ops, &shm_params); |
460 | } | 460 | } |
461 | 461 | ||
462 | static inline unsigned long copy_shmid_to_user(void __user *buf, struct shmid64_ds *in, int version) | 462 | static inline unsigned long copy_shmid_to_user(void __user *buf, struct shmid64_ds *in, int version) |
463 | { | 463 | { |
464 | switch(version) { | 464 | switch(version) { |
465 | case IPC_64: | 465 | case IPC_64: |
466 | return copy_to_user(buf, in, sizeof(*in)); | 466 | return copy_to_user(buf, in, sizeof(*in)); |
467 | case IPC_OLD: | 467 | case IPC_OLD: |
468 | { | 468 | { |
469 | struct shmid_ds out; | 469 | struct shmid_ds out; |
470 | 470 | ||
471 | ipc64_perm_to_ipc_perm(&in->shm_perm, &out.shm_perm); | 471 | ipc64_perm_to_ipc_perm(&in->shm_perm, &out.shm_perm); |
472 | out.shm_segsz = in->shm_segsz; | 472 | out.shm_segsz = in->shm_segsz; |
473 | out.shm_atime = in->shm_atime; | 473 | out.shm_atime = in->shm_atime; |
474 | out.shm_dtime = in->shm_dtime; | 474 | out.shm_dtime = in->shm_dtime; |
475 | out.shm_ctime = in->shm_ctime; | 475 | out.shm_ctime = in->shm_ctime; |
476 | out.shm_cpid = in->shm_cpid; | 476 | out.shm_cpid = in->shm_cpid; |
477 | out.shm_lpid = in->shm_lpid; | 477 | out.shm_lpid = in->shm_lpid; |
478 | out.shm_nattch = in->shm_nattch; | 478 | out.shm_nattch = in->shm_nattch; |
479 | 479 | ||
480 | return copy_to_user(buf, &out, sizeof(out)); | 480 | return copy_to_user(buf, &out, sizeof(out)); |
481 | } | 481 | } |
482 | default: | 482 | default: |
483 | return -EINVAL; | 483 | return -EINVAL; |
484 | } | 484 | } |
485 | } | 485 | } |
486 | 486 | ||
487 | static inline unsigned long | 487 | static inline unsigned long |
488 | copy_shmid_from_user(struct shmid64_ds *out, void __user *buf, int version) | 488 | copy_shmid_from_user(struct shmid64_ds *out, void __user *buf, int version) |
489 | { | 489 | { |
490 | switch(version) { | 490 | switch(version) { |
491 | case IPC_64: | 491 | case IPC_64: |
492 | if (copy_from_user(out, buf, sizeof(*out))) | 492 | if (copy_from_user(out, buf, sizeof(*out))) |
493 | return -EFAULT; | 493 | return -EFAULT; |
494 | return 0; | 494 | return 0; |
495 | case IPC_OLD: | 495 | case IPC_OLD: |
496 | { | 496 | { |
497 | struct shmid_ds tbuf_old; | 497 | struct shmid_ds tbuf_old; |
498 | 498 | ||
499 | if (copy_from_user(&tbuf_old, buf, sizeof(tbuf_old))) | 499 | if (copy_from_user(&tbuf_old, buf, sizeof(tbuf_old))) |
500 | return -EFAULT; | 500 | return -EFAULT; |
501 | 501 | ||
502 | out->shm_perm.uid = tbuf_old.shm_perm.uid; | 502 | out->shm_perm.uid = tbuf_old.shm_perm.uid; |
503 | out->shm_perm.gid = tbuf_old.shm_perm.gid; | 503 | out->shm_perm.gid = tbuf_old.shm_perm.gid; |
504 | out->shm_perm.mode = tbuf_old.shm_perm.mode; | 504 | out->shm_perm.mode = tbuf_old.shm_perm.mode; |
505 | 505 | ||
506 | return 0; | 506 | return 0; |
507 | } | 507 | } |
508 | default: | 508 | default: |
509 | return -EINVAL; | 509 | return -EINVAL; |
510 | } | 510 | } |
511 | } | 511 | } |
512 | 512 | ||
513 | static inline unsigned long copy_shminfo_to_user(void __user *buf, struct shminfo64 *in, int version) | 513 | static inline unsigned long copy_shminfo_to_user(void __user *buf, struct shminfo64 *in, int version) |
514 | { | 514 | { |
515 | switch(version) { | 515 | switch(version) { |
516 | case IPC_64: | 516 | case IPC_64: |
517 | return copy_to_user(buf, in, sizeof(*in)); | 517 | return copy_to_user(buf, in, sizeof(*in)); |
518 | case IPC_OLD: | 518 | case IPC_OLD: |
519 | { | 519 | { |
520 | struct shminfo out; | 520 | struct shminfo out; |
521 | 521 | ||
522 | if(in->shmmax > INT_MAX) | 522 | if(in->shmmax > INT_MAX) |
523 | out.shmmax = INT_MAX; | 523 | out.shmmax = INT_MAX; |
524 | else | 524 | else |
525 | out.shmmax = (int)in->shmmax; | 525 | out.shmmax = (int)in->shmmax; |
526 | 526 | ||
527 | out.shmmin = in->shmmin; | 527 | out.shmmin = in->shmmin; |
528 | out.shmmni = in->shmmni; | 528 | out.shmmni = in->shmmni; |
529 | out.shmseg = in->shmseg; | 529 | out.shmseg = in->shmseg; |
530 | out.shmall = in->shmall; | 530 | out.shmall = in->shmall; |
531 | 531 | ||
532 | return copy_to_user(buf, &out, sizeof(out)); | 532 | return copy_to_user(buf, &out, sizeof(out)); |
533 | } | 533 | } |
534 | default: | 534 | default: |
535 | return -EINVAL; | 535 | return -EINVAL; |
536 | } | 536 | } |
537 | } | 537 | } |
538 | 538 | ||
/*
 * Accumulate resident (*rss) and swapped (*swp) page counts over every
 * shm segment in the namespace, for SHM_INFO reporting.
 *
 * Called with shm_ids.rw_mutex held as a reader
 */
static void shm_get_stat(struct ipc_namespace *ns, unsigned long *rss,
		unsigned long *swp)
{
	int next_id;
	int total, in_use;

	*rss = 0;
	*swp = 0;

	in_use = shm_ids(ns).in_use;

	/* The IDR may be sparse: keep probing successive ids until we
	 * have seen all in_use segments. */
	for (total = 0, next_id = 0; total < in_use; next_id++) {
		struct shmid_kernel *shp;
		struct inode *inode;

		shp = idr_find(&shm_ids(ns).ipcs_idr, next_id);
		if (shp == NULL)
			continue;

		inode = shp->shm_file->f_path.dentry->d_inode;

		if (is_file_hugepages(shp->shm_file)) {
			/* Hugetlb-backed: nrpages counts huge pages, so
			 * scale to small-page units; nothing is swapped. */
			struct address_space *mapping = inode->i_mapping;
			struct hstate *h = hstate_file(shp->shm_file);
			*rss += pages_per_huge_page(h) * mapping->nrpages;
		} else {
			/* tmpfs-backed: info->lock guards the swapped count. */
			struct shmem_inode_info *info = SHMEM_I(inode);
			spin_lock(&info->lock);
			*rss += inode->i_mapping->nrpages;
			*swp += info->swapped;
			spin_unlock(&info->lock);
		}

		total++;
	}
}
578 | 578 | ||
/*
 * This function handles some shmctl commands which require the rw_mutex
 * to be held in write mode.
 * NOTE: no locks must be held, the rw_mutex is taken inside this function.
 */
static int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd,
		       struct shmid_ds __user *buf, int version)
{
	struct kern_ipc_perm *ipcp;
	struct shmid64_ds shmid64;
	struct shmid_kernel *shp;
	int err;

	if (cmd == IPC_SET) {
		/* Fetch the new settings before taking any locks. */
		if (copy_shmid_from_user(&shmid64, buf, version))
			return -EFAULT;
	}

	/* Takes shm_ids.rw_mutex for writing, performs the common
	 * permission/audit checks, and returns with the object locked. */
	ipcp = ipcctl_pre_down(&shm_ids(ns), shmid, cmd, &shmid64.shm_perm, 0);
	if (IS_ERR(ipcp))
		return PTR_ERR(ipcp);

	shp = container_of(ipcp, struct shmid_kernel, shm_perm);

	err = security_shm_shmctl(shp, cmd);
	if (err)
		goto out_unlock;
	switch (cmd) {
	case IPC_RMID:
		/* do_shm_rmid() drops the object lock itself, so skip
		 * the shm_unlock() at out_unlock. */
		do_shm_rmid(ns, ipcp);
		goto out_up;
	case IPC_SET:
		ipc_update_perm(&shmid64.shm_perm, ipcp);
		shp->shm_ctim = get_seconds();
		break;
	default:
		err = -EINVAL;
	}
out_unlock:
	shm_unlock(shp);
out_up:
	up_write(&shm_ids(ns).rw_mutex);
	return err;
}
623 | 623 | ||
/*
 * shmctl(2): segment control multiplexer. Read-only queries are handled
 * inline here; IPC_RMID/IPC_SET (which need rw_mutex held for writing)
 * are delegated to shmctl_down().
 */
asmlinkage long sys_shmctl(int shmid, int cmd, struct shmid_ds __user *buf)
{
	struct shmid_kernel *shp;
	int err, version;
	struct ipc_namespace *ns;

	if (cmd < 0 || shmid < 0) {
		err = -EINVAL;
		goto out;
	}

	/* Strips the IPC_64 flag from cmd and reports which ABI to use. */
	version = ipc_parse_version(&cmd);
	ns = current->nsproxy->ipc_ns;

	switch (cmd) { /* replace with proc interface ? */
	case IPC_INFO:
	{
		/* Report the namespace-wide limits. */
		struct shminfo64 shminfo;

		err = security_shm_shmctl(NULL, cmd);
		if (err)
			return err;

		memset(&shminfo, 0, sizeof(shminfo));
		shminfo.shmmni = shminfo.shmseg = ns->shm_ctlmni;
		shminfo.shmmax = ns->shm_ctlmax;
		shminfo.shmall = ns->shm_ctlall;

		shminfo.shmmin = SHMMIN;
		if(copy_shminfo_to_user (buf, &shminfo, version))
			return -EFAULT;

		/* On success return the highest in-use index; userspace
		 * (e.g. ipcs) uses it as an iteration bound for SHM_STAT. */
		down_read(&shm_ids(ns).rw_mutex);
		err = ipc_get_maxid(&shm_ids(ns));
		up_read(&shm_ids(ns).rw_mutex);

		if(err<0)
			err = 0;
		goto out;
	}
	case SHM_INFO:
	{
		/* Report namespace-wide usage statistics. */
		struct shm_info shm_info;

		err = security_shm_shmctl(NULL, cmd);
		if (err)
			return err;

		memset(&shm_info, 0, sizeof(shm_info));
		down_read(&shm_ids(ns).rw_mutex);
		shm_info.used_ids = shm_ids(ns).in_use;
		shm_get_stat (ns, &shm_info.shm_rss, &shm_info.shm_swp);
		shm_info.shm_tot = ns->shm_tot;
		/* Historical fields, always zero on Linux. */
		shm_info.swap_attempts = 0;
		shm_info.swap_successes = 0;
		err = ipc_get_maxid(&shm_ids(ns));
		up_read(&shm_ids(ns).rw_mutex);
		if (copy_to_user(buf, &shm_info, sizeof(shm_info))) {
			err = -EFAULT;
			goto out;
		}

		err = err < 0 ? 0 : err;
		goto out;
	}
	case SHM_STAT:
	case IPC_STAT:
	{
		struct shmid64_ds tbuf;
		int result;

		if (cmd == SHM_STAT) {
			/* SHM_STAT takes an index, not an id, and returns
			 * the segment's real id on success. */
			shp = shm_lock(ns, shmid);
			if (IS_ERR(shp)) {
				err = PTR_ERR(shp);
				goto out;
			}
			result = shp->shm_perm.id;
		} else {
			/* IPC_STAT additionally validates the id's
			 * sequence number and returns 0 on success. */
			shp = shm_lock_check(ns, shmid);
			if (IS_ERR(shp)) {
				err = PTR_ERR(shp);
				goto out;
			}
			result = 0;
		}
		err = -EACCES;
		if (ipcperms (&shp->shm_perm, S_IRUGO))
			goto out_unlock;
		err = security_shm_shmctl(shp, cmd);
		if (err)
			goto out_unlock;
		/* Snapshot under the object lock, copy out after unlock. */
		memset(&tbuf, 0, sizeof(tbuf));
		kernel_to_ipc64_perm(&shp->shm_perm, &tbuf.shm_perm);
		tbuf.shm_segsz	= shp->shm_segsz;
		tbuf.shm_atime	= shp->shm_atim;
		tbuf.shm_dtime	= shp->shm_dtim;
		tbuf.shm_ctime	= shp->shm_ctim;
		tbuf.shm_cpid	= shp->shm_cprid;
		tbuf.shm_lpid	= shp->shm_lprid;
		tbuf.shm_nattch	= shp->shm_nattch;
		shm_unlock(shp);
		if(copy_shmid_to_user (buf, &tbuf, version))
			err = -EFAULT;
		else
			err = result;
		goto out;
	}
	case SHM_LOCK:
	case SHM_UNLOCK:
	{
		struct file *uninitialized_var(shm_file);

		lru_add_drain_all();  /* drain pagevecs to lru lists */

		shp = shm_lock_check(ns, shmid);
		if (IS_ERR(shp)) {
			err = PTR_ERR(shp);
			goto out;
		}

		audit_ipc_obj(&(shp->shm_perm));

		if (!capable(CAP_IPC_LOCK)) {
			/* Without CAP_IPC_LOCK only the owner/creator may
			 * lock, and only with a non-zero RLIMIT_MEMLOCK. */
			uid_t euid = current_euid();
			err = -EPERM;
			if (euid != shp->shm_perm.uid &&
			    euid != shp->shm_perm.cuid)
				goto out_unlock;
			if (cmd == SHM_LOCK &&
			    !current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur)
				goto out_unlock;
		}

		err = security_shm_shmctl(shp, cmd);
		if (err)
			goto out_unlock;

		if(cmd==SHM_LOCK) {
			struct user_struct *user = current_user();
			/* Hugetlb segments are effectively always locked;
			 * only tmpfs-backed segments need shmem_lock(). */
			if (!is_file_hugepages(shp->shm_file)) {
				err = shmem_lock(shp->shm_file, 1, user);
				if (!err && !(shp->shm_perm.mode & SHM_LOCKED)){
					shp->shm_perm.mode |= SHM_LOCKED;
					shp->mlock_user = user;
				}
			}
		} else if (!is_file_hugepages(shp->shm_file)) {
			shmem_lock(shp->shm_file, 0, shp->mlock_user);
			shp->shm_perm.mode &= ~SHM_LOCKED;
			shp->mlock_user = NULL;
		}
		shm_unlock(shp);
		goto out;
	}
	case IPC_RMID:
	case IPC_SET:
		/* These need rw_mutex held for writing; shmctl_down()
		 * takes and releases it itself. */
		err = shmctl_down(ns, shmid, cmd, buf, version);
		return err;
	default:
		return -EINVAL;
	}

out_unlock:
	shm_unlock(shp);
out:
	return err;
}
792 | 792 | ||
/*
 * Fix shmaddr, allocate descriptor, map shm, add attach descriptor to lists.
 *
 * NOTE! Despite the name, this is NOT a direct system call entrypoint. The
 * "raddr" thing points to kernel space, and there has to be a wrapper around
 * this.
 */
long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr)
{
	struct shmid_kernel *shp;
	unsigned long addr;
	unsigned long size;
	struct file * file;
	int err;
	unsigned long flags;
	unsigned long prot;
	int acc_mode;
	unsigned long user_addr;
	struct ipc_namespace *ns;
	struct shm_file_data *sfd;
	struct path path;
	fmode_t f_mode;

	err = -EINVAL;
	if (shmid < 0)
		goto out;
	else if ((addr = (ulong)shmaddr)) {
		/* A non-NULL address must be SHMLBA-aligned, be rounded
		 * down with SHM_RND, or (on most arches) at least be
		 * page-aligned. */
		if (addr & (SHMLBA-1)) {
			if (shmflg & SHM_RND)
				addr &= ~(SHMLBA-1);	   /* round down */
			else
#ifndef __ARCH_FORCE_SHMLBA
				if (addr & ~PAGE_MASK)
#endif
					goto out;
		}
		flags = MAP_SHARED | MAP_FIXED;
	} else {
		/* SHM_REMAP makes no sense without an explicit address. */
		if ((shmflg & SHM_REMAP))
			goto out;

		flags = MAP_SHARED;
	}

	/* Derive mmap protection, IPC permission bits and file mode
	 * from the caller's flags. */
	if (shmflg & SHM_RDONLY) {
		prot = PROT_READ;
		acc_mode = S_IRUGO;
		f_mode = FMODE_READ;
	} else {
		prot = PROT_READ | PROT_WRITE;
		acc_mode = S_IRUGO | S_IWUGO;
		f_mode = FMODE_READ | FMODE_WRITE;
	}
	if (shmflg & SHM_EXEC) {
		prot |= PROT_EXEC;
		acc_mode |= S_IXUGO;
	}

	/*
	 * We cannot rely on the fs check since SYSV IPC does have an
	 * additional creator id...
	 */
	ns = current->nsproxy->ipc_ns;
	shp = shm_lock_check(ns, shmid);
	if (IS_ERR(shp)) {
		err = PTR_ERR(shp);
		goto out;
	}

	err = -EACCES;
	if (ipcperms(&shp->shm_perm, acc_mode))
		goto out_unlock;

	err = security_shm_shmat(shp, shmaddr, shmflg);
	if (err)
		goto out_unlock;

	/* Pin the backing dentry and bump the attach count before
	 * dropping the object lock; shm_nattch > 0 keeps the segment
	 * from being destroyed while we set up the mapping. */
	path.dentry = dget(shp->shm_file->f_path.dentry);
	path.mnt    = shp->shm_file->f_path.mnt;
	shp->shm_nattch++;
	size = i_size_read(path.dentry->d_inode);
	shm_unlock(shp);

	err = -ENOMEM;
	sfd = kzalloc(sizeof(*sfd), GFP_KERNEL);
	if (!sfd)
		goto out_put_dentry;

	/* Per-attach file wrapping the segment's shm_file; its private
	 * data lets shm_close() find the segment again on detach. */
	file = alloc_file(path.mnt, path.dentry, f_mode, &shm_file_operations);
	if (!file)
		goto out_free;

	file->private_data = sfd;
	file->f_mapping = shp->shm_file->f_mapping;
	sfd->id = shp->shm_perm.id;
	sfd->ns = get_ipc_ns(ns);
	sfd->file = shp->shm_file;
	sfd->vm_ops = NULL;

	down_write(&current->mm->mmap_sem);
	if (addr && !(shmflg & SHM_REMAP)) {
		/* Without SHM_REMAP, refuse to map over existing VMAs. */
		err = -EINVAL;
		if (find_vma_intersection(current->mm, addr, addr + size))
			goto invalid;
		/*
		 * If shm segment goes below stack, make sure there is some
		 * space left for the stack to grow (at least 4 pages).
		 */
		if (addr < current->mm->start_stack &&
		    addr > current->mm->start_stack - size - PAGE_SIZE * 5)
			goto invalid;
	}

	user_addr = do_mmap (file, addr, size, prot, flags, 0);
	*raddr = user_addr;
	err = 0;
	if (IS_ERR_VALUE(user_addr))
		err = (long)user_addr;
invalid:
	up_write(&current->mm->mmap_sem);

	fput(file);

out_nattch:
	/* Drop the attach count taken above.  This path is reached on
	 * both success and failure; a successful mmap holds its own
	 * reference via the wrapper file, so nattch stays correct. */
	down_write(&shm_ids(ns).rw_mutex);
	shp = shm_lock(ns, shmid);
	BUG_ON(IS_ERR(shp));
	shp->shm_nattch--;
	if(shp->shm_nattch == 0 &&
	   shp->shm_perm.mode & SHM_DEST)
		shm_destroy(ns, shp);
	else
		shm_unlock(shp);
	up_write(&shm_ids(ns).rw_mutex);

out:
	return err;

out_unlock:
	shm_unlock(shp);
	goto out;

out_free:
	kfree(sfd);
out_put_dentry:
	dput(path.dentry);
	goto out_nattch;
}
941 | 941 | ||
942 | asmlinkage long sys_shmat(int shmid, char __user *shmaddr, int shmflg) | 942 | asmlinkage long sys_shmat(int shmid, char __user *shmaddr, int shmflg) |
943 | { | 943 | { |
944 | unsigned long ret; | 944 | unsigned long ret; |
945 | long err; | 945 | long err; |
946 | 946 | ||
947 | err = do_shmat(shmid, shmaddr, shmflg, &ret); | 947 | err = do_shmat(shmid, shmaddr, shmflg, &ret); |
948 | if (err) | 948 | if (err) |
949 | return err; | 949 | return err; |
950 | force_successful_syscall_return(); | 950 | force_successful_syscall_return(); |
951 | return (long)ret; | 951 | return (long)ret; |
952 | } | 952 | } |
953 | 953 | ||
954 | /* | 954 | /* |
955 | * detach and kill segment if marked destroyed. | 955 | * detach and kill segment if marked destroyed. |
956 | * The work is done in shm_close. | 956 | * The work is done in shm_close. |
957 | */ | 957 | */ |
958 | asmlinkage long sys_shmdt(char __user *shmaddr) | 958 | asmlinkage long sys_shmdt(char __user *shmaddr) |
959 | { | 959 | { |
960 | struct mm_struct *mm = current->mm; | 960 | struct mm_struct *mm = current->mm; |
961 | struct vm_area_struct *vma, *next; | 961 | struct vm_area_struct *vma, *next; |
962 | unsigned long addr = (unsigned long)shmaddr; | 962 | unsigned long addr = (unsigned long)shmaddr; |
963 | loff_t size = 0; | 963 | loff_t size = 0; |
964 | int retval = -EINVAL; | 964 | int retval = -EINVAL; |
965 | 965 | ||
966 | if (addr & ~PAGE_MASK) | 966 | if (addr & ~PAGE_MASK) |
967 | return retval; | 967 | return retval; |
968 | 968 | ||
969 | down_write(&mm->mmap_sem); | 969 | down_write(&mm->mmap_sem); |
970 | 970 | ||
971 | /* | 971 | /* |
972 | * This function tries to be smart and unmap shm segments that | 972 | * This function tries to be smart and unmap shm segments that |
973 | * were modified by partial mlock or munmap calls: | 973 | * were modified by partial mlock or munmap calls: |
974 | * - It first determines the size of the shm segment that should be | 974 | * - It first determines the size of the shm segment that should be |
975 | * unmapped: It searches for a vma that is backed by shm and that | 975 | * unmapped: It searches for a vma that is backed by shm and that |
976 | * started at address shmaddr. It records it's size and then unmaps | 976 | * started at address shmaddr. It records it's size and then unmaps |
977 | * it. | 977 | * it. |
978 | * - Then it unmaps all shm vmas that started at shmaddr and that | 978 | * - Then it unmaps all shm vmas that started at shmaddr and that |
979 | * are within the initially determined size. | 979 | * are within the initially determined size. |
980 | * Errors from do_munmap are ignored: the function only fails if | 980 | * Errors from do_munmap are ignored: the function only fails if |
981 | * it's called with invalid parameters or if it's called to unmap | 981 | * it's called with invalid parameters or if it's called to unmap |
982 | * a part of a vma. Both calls in this function are for full vmas, | 982 | * a part of a vma. Both calls in this function are for full vmas, |
983 | * the parameters are directly copied from the vma itself and always | 983 | * the parameters are directly copied from the vma itself and always |
984 | * valid - therefore do_munmap cannot fail. (famous last words?) | 984 | * valid - therefore do_munmap cannot fail. (famous last words?) |
985 | */ | 985 | */ |
986 | /* | 986 | /* |
987 | * If it had been mremap()'d, the starting address would not | 987 | * If it had been mremap()'d, the starting address would not |
988 | * match the usual checks anyway. So assume all vma's are | 988 | * match the usual checks anyway. So assume all vma's are |
989 | * above the starting address given. | 989 | * above the starting address given. |
990 | */ | 990 | */ |
991 | vma = find_vma(mm, addr); | 991 | vma = find_vma(mm, addr); |
992 | 992 | ||
993 | #ifdef CONFIG_MMU | ||
993 | while (vma) { | 994 | while (vma) { |
994 | next = vma->vm_next; | 995 | next = vma->vm_next; |
995 | 996 | ||
996 | /* | 997 | /* |
997 | * Check if the starting address would match, i.e. it's | 998 | * Check if the starting address would match, i.e. it's |
998 | * a fragment created by mprotect() and/or munmap(), or it | 999 | * a fragment created by mprotect() and/or munmap(), or it |
999 | * otherwise it starts at this address with no hassles. | 1000 | * otherwise it starts at this address with no hassles. |
1000 | */ | 1001 | */ |
1001 | if ((vma->vm_ops == &shm_vm_ops) && | 1002 | if ((vma->vm_ops == &shm_vm_ops) && |
1002 | (vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) { | 1003 | (vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) { |
1003 | 1004 | ||
1004 | 1005 | ||
1005 | size = vma->vm_file->f_path.dentry->d_inode->i_size; | 1006 | size = vma->vm_file->f_path.dentry->d_inode->i_size; |
1006 | do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start); | 1007 | do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start); |
1007 | /* | 1008 | /* |
1008 | * We discovered the size of the shm segment, so | 1009 | * We discovered the size of the shm segment, so |
1009 | * break out of here and fall through to the next | 1010 | * break out of here and fall through to the next |
1010 | * loop that uses the size information to stop | 1011 | * loop that uses the size information to stop |
1011 | * searching for matching vma's. | 1012 | * searching for matching vma's. |
1012 | */ | 1013 | */ |
1013 | retval = 0; | 1014 | retval = 0; |
1014 | vma = next; | 1015 | vma = next; |
1015 | break; | 1016 | break; |
1016 | } | 1017 | } |
1017 | vma = next; | 1018 | vma = next; |
1018 | } | 1019 | } |
1019 | 1020 | ||
1020 | /* | 1021 | /* |
1021 | * We need look no further than the maximum address a fragment | 1022 | * We need look no further than the maximum address a fragment |
1022 | * could possibly have landed at. Also cast things to loff_t to | 1023 | * could possibly have landed at. Also cast things to loff_t to |
1023 | * prevent overflows and make comparisions vs. equal-width types. | 1024 | * prevent overflows and make comparisions vs. equal-width types. |
1024 | */ | 1025 | */ |
1025 | size = PAGE_ALIGN(size); | 1026 | size = PAGE_ALIGN(size); |
1026 | while (vma && (loff_t)(vma->vm_end - addr) <= size) { | 1027 | while (vma && (loff_t)(vma->vm_end - addr) <= size) { |
1027 | next = vma->vm_next; | 1028 | next = vma->vm_next; |
1028 | 1029 | ||
1029 | /* finding a matching vma now does not alter retval */ | 1030 | /* finding a matching vma now does not alter retval */ |
1030 | if ((vma->vm_ops == &shm_vm_ops) && | 1031 | if ((vma->vm_ops == &shm_vm_ops) && |
1031 | (vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) | 1032 | (vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) |
1032 | 1033 | ||
1033 | do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start); | 1034 | do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start); |
1034 | vma = next; | 1035 | vma = next; |
1035 | } | 1036 | } |
1037 | |||
1038 | #else /* CONFIG_MMU */ | ||
1039 | /* under NOMMU conditions, the exact address to be destroyed must be | ||
1040 | * given */ | ||
1041 | retval = -EINVAL; | ||
1042 | if (vma->vm_start == addr && vma->vm_ops == &shm_vm_ops) { | ||
1043 | do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start); | ||
1044 | retval = 0; | ||
1045 | } | ||
1046 | |||
1047 | #endif | ||
1036 | 1048 | ||
1037 | up_write(&mm->mmap_sem); | 1049 | up_write(&mm->mmap_sem); |
1038 | return retval; | 1050 | return retval; |
1039 | } | 1051 | } |
1040 | 1052 | ||
#ifdef CONFIG_PROC_FS
/*
 * seq_file ->show() callback: format one SysV shm segment as a single
 * row of /proc/sysvipc/shm.  @it points at the shmid_kernel to render.
 */
static int sysvipc_shm_proc_show(struct seq_file *s, void *it)
{
	struct shmid_kernel *shp = it;

/* widen the segment-size column on 64-bit longs so it cannot overflow */
#if BITS_PER_LONG <= 32
#define SIZE_SPEC "%10lu"
#else
#define SIZE_SPEC "%21lu"
#endif

	return seq_printf(s,
			  "%10d %10d %4o " SIZE_SPEC " %5u %5u "
			  "%5lu %5u %5u %5u %5u %10lu %10lu %10lu\n",
			  shp->shm_perm.key,
			  shp->shm_perm.id,
			  shp->shm_perm.mode,
			  shp->shm_segsz,
			  shp->shm_cprid,
			  shp->shm_lprid,
			  shp->shm_nattch,
			  shp->shm_perm.uid,
			  shp->shm_perm.gid,
			  shp->shm_perm.cuid,
			  shp->shm_perm.cgid,
			  shp->shm_atim,
			  shp->shm_dtim,
			  shp->shm_ctim);
}
#endif
1071 | 1083 |
kernel/fork.c
1 | /* | 1 | /* |
2 | * linux/kernel/fork.c | 2 | * linux/kernel/fork.c |
3 | * | 3 | * |
4 | * Copyright (C) 1991, 1992 Linus Torvalds | 4 | * Copyright (C) 1991, 1992 Linus Torvalds |
5 | */ | 5 | */ |
6 | 6 | ||
7 | /* | 7 | /* |
8 | * 'fork.c' contains the help-routines for the 'fork' system call | 8 | * 'fork.c' contains the help-routines for the 'fork' system call |
9 | * (see also entry.S and others). | 9 | * (see also entry.S and others). |
10 | * Fork is rather simple, once you get the hang of it, but the memory | 10 | * Fork is rather simple, once you get the hang of it, but the memory |
11 | * management can be a bitch. See 'mm/memory.c': 'copy_page_range()' | 11 | * management can be a bitch. See 'mm/memory.c': 'copy_page_range()' |
12 | */ | 12 | */ |
13 | 13 | ||
14 | #include <linux/slab.h> | 14 | #include <linux/slab.h> |
15 | #include <linux/init.h> | 15 | #include <linux/init.h> |
16 | #include <linux/unistd.h> | 16 | #include <linux/unistd.h> |
17 | #include <linux/module.h> | 17 | #include <linux/module.h> |
18 | #include <linux/vmalloc.h> | 18 | #include <linux/vmalloc.h> |
19 | #include <linux/completion.h> | 19 | #include <linux/completion.h> |
20 | #include <linux/mnt_namespace.h> | 20 | #include <linux/mnt_namespace.h> |
21 | #include <linux/personality.h> | 21 | #include <linux/personality.h> |
22 | #include <linux/mempolicy.h> | 22 | #include <linux/mempolicy.h> |
23 | #include <linux/sem.h> | 23 | #include <linux/sem.h> |
24 | #include <linux/file.h> | 24 | #include <linux/file.h> |
25 | #include <linux/fdtable.h> | 25 | #include <linux/fdtable.h> |
26 | #include <linux/iocontext.h> | 26 | #include <linux/iocontext.h> |
27 | #include <linux/key.h> | 27 | #include <linux/key.h> |
28 | #include <linux/binfmts.h> | 28 | #include <linux/binfmts.h> |
29 | #include <linux/mman.h> | 29 | #include <linux/mman.h> |
30 | #include <linux/mmu_notifier.h> | 30 | #include <linux/mmu_notifier.h> |
31 | #include <linux/fs.h> | 31 | #include <linux/fs.h> |
32 | #include <linux/nsproxy.h> | 32 | #include <linux/nsproxy.h> |
33 | #include <linux/capability.h> | 33 | #include <linux/capability.h> |
34 | #include <linux/cpu.h> | 34 | #include <linux/cpu.h> |
35 | #include <linux/cgroup.h> | 35 | #include <linux/cgroup.h> |
36 | #include <linux/security.h> | 36 | #include <linux/security.h> |
37 | #include <linux/hugetlb.h> | 37 | #include <linux/hugetlb.h> |
38 | #include <linux/swap.h> | 38 | #include <linux/swap.h> |
39 | #include <linux/syscalls.h> | 39 | #include <linux/syscalls.h> |
40 | #include <linux/jiffies.h> | 40 | #include <linux/jiffies.h> |
41 | #include <linux/tracehook.h> | 41 | #include <linux/tracehook.h> |
42 | #include <linux/futex.h> | 42 | #include <linux/futex.h> |
43 | #include <linux/compat.h> | 43 | #include <linux/compat.h> |
44 | #include <linux/task_io_accounting_ops.h> | 44 | #include <linux/task_io_accounting_ops.h> |
45 | #include <linux/rcupdate.h> | 45 | #include <linux/rcupdate.h> |
46 | #include <linux/ptrace.h> | 46 | #include <linux/ptrace.h> |
47 | #include <linux/mount.h> | 47 | #include <linux/mount.h> |
48 | #include <linux/audit.h> | 48 | #include <linux/audit.h> |
49 | #include <linux/memcontrol.h> | 49 | #include <linux/memcontrol.h> |
50 | #include <linux/ftrace.h> | 50 | #include <linux/ftrace.h> |
51 | #include <linux/profile.h> | 51 | #include <linux/profile.h> |
52 | #include <linux/rmap.h> | 52 | #include <linux/rmap.h> |
53 | #include <linux/acct.h> | 53 | #include <linux/acct.h> |
54 | #include <linux/tsacct_kern.h> | 54 | #include <linux/tsacct_kern.h> |
55 | #include <linux/cn_proc.h> | 55 | #include <linux/cn_proc.h> |
56 | #include <linux/freezer.h> | 56 | #include <linux/freezer.h> |
57 | #include <linux/delayacct.h> | 57 | #include <linux/delayacct.h> |
58 | #include <linux/taskstats_kern.h> | 58 | #include <linux/taskstats_kern.h> |
59 | #include <linux/random.h> | 59 | #include <linux/random.h> |
60 | #include <linux/tty.h> | 60 | #include <linux/tty.h> |
61 | #include <linux/proc_fs.h> | 61 | #include <linux/proc_fs.h> |
62 | #include <linux/blkdev.h> | 62 | #include <linux/blkdev.h> |
63 | #include <trace/sched.h> | 63 | #include <trace/sched.h> |
64 | 64 | ||
65 | #include <asm/pgtable.h> | 65 | #include <asm/pgtable.h> |
66 | #include <asm/pgalloc.h> | 66 | #include <asm/pgalloc.h> |
67 | #include <asm/uaccess.h> | 67 | #include <asm/uaccess.h> |
68 | #include <asm/mmu_context.h> | 68 | #include <asm/mmu_context.h> |
69 | #include <asm/cacheflush.h> | 69 | #include <asm/cacheflush.h> |
70 | #include <asm/tlbflush.h> | 70 | #include <asm/tlbflush.h> |
71 | 71 | ||
/*
 * Protected counters by write_lock_irq(&tasklist_lock)
 */
unsigned long total_forks;	/* Handle normal Linux uptimes. */
int nr_threads; 		/* The idle threads do not count.. */

int max_threads;		/* tunable limit on nr_threads */

/* per-CPU count of processes created on that CPU; summed by nr_processes() */
DEFINE_PER_CPU(unsigned long, process_counts) = 0;

__cacheline_aligned DEFINE_RWLOCK(tasklist_lock);  /* outer */

/* tracepoint fired from the fork path */
DEFINE_TRACE(sched_process_fork);
85 | 85 | ||
86 | int nr_processes(void) | 86 | int nr_processes(void) |
87 | { | 87 | { |
88 | int cpu; | 88 | int cpu; |
89 | int total = 0; | 89 | int total = 0; |
90 | 90 | ||
91 | for_each_online_cpu(cpu) | 91 | for_each_online_cpu(cpu) |
92 | total += per_cpu(process_counts, cpu); | 92 | total += per_cpu(process_counts, cpu); |
93 | 93 | ||
94 | return total; | 94 | return total; |
95 | } | 95 | } |
96 | 96 | ||
#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
/* default task_struct allocator: a dedicated slab cache (set up in fork_init) */
# define alloc_task_struct()	kmem_cache_alloc(task_struct_cachep, GFP_KERNEL)
# define free_task_struct(tsk)	kmem_cache_free(task_struct_cachep, (tsk))
static struct kmem_cache *task_struct_cachep;
#endif
102 | 102 | ||
103 | #ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR | 103 | #ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR |
104 | static inline struct thread_info *alloc_thread_info(struct task_struct *tsk) | 104 | static inline struct thread_info *alloc_thread_info(struct task_struct *tsk) |
105 | { | 105 | { |
106 | #ifdef CONFIG_DEBUG_STACK_USAGE | 106 | #ifdef CONFIG_DEBUG_STACK_USAGE |
107 | gfp_t mask = GFP_KERNEL | __GFP_ZERO; | 107 | gfp_t mask = GFP_KERNEL | __GFP_ZERO; |
108 | #else | 108 | #else |
109 | gfp_t mask = GFP_KERNEL; | 109 | gfp_t mask = GFP_KERNEL; |
110 | #endif | 110 | #endif |
111 | return (struct thread_info *)__get_free_pages(mask, THREAD_SIZE_ORDER); | 111 | return (struct thread_info *)__get_free_pages(mask, THREAD_SIZE_ORDER); |
112 | } | 112 | } |
113 | 113 | ||
114 | static inline void free_thread_info(struct thread_info *ti) | 114 | static inline void free_thread_info(struct thread_info *ti) |
115 | { | 115 | { |
116 | free_pages((unsigned long)ti, THREAD_SIZE_ORDER); | 116 | free_pages((unsigned long)ti, THREAD_SIZE_ORDER); |
117 | } | 117 | } |
118 | #endif | 118 | #endif |
119 | 119 | ||
/* Slab caches for the per-task structures; the non-static ones are
 * referenced from outside this file. */

/* SLAB cache for signal_struct structures (tsk->signal) */
static struct kmem_cache *signal_cachep;

/* SLAB cache for sighand_struct structures (tsk->sighand) */
struct kmem_cache *sighand_cachep;

/* SLAB cache for files_struct structures (tsk->files) */
struct kmem_cache *files_cachep;

/* SLAB cache for fs_struct structures (tsk->fs) */
struct kmem_cache *fs_cachep;

/* SLAB cache for vm_area_struct structures */
struct kmem_cache *vm_area_cachep;

/* SLAB cache for mm_struct structures (tsk->mm) */
static struct kmem_cache *mm_cachep;
137 | 137 | ||
/*
 * Final teardown of a task: release per-task writeback accounting,
 * the kernel stack, rt-mutex debug state and ftrace graph state, then
 * free the task_struct itself.
 */
void free_task(struct task_struct *tsk)
{
	prop_local_destroy_single(&tsk->dirties);
	free_thread_info(tsk->stack);
	rt_mutex_debug_task_free(tsk);
	ftrace_graph_exit_task(tsk);
	free_task_struct(tsk);
}
EXPORT_SYMBOL(free_task);
147 | 147 | ||
/*
 * Drop the final reference to a task: the caller must guarantee the
 * usage count has reached zero, the task has an exit_state, and it is
 * not the current task (all three are sanity-checked below).
 */
void __put_task_struct(struct task_struct *tsk)
{
	WARN_ON(!tsk->exit_state);
	WARN_ON(atomic_read(&tsk->usage));
	WARN_ON(tsk == current);

	put_cred(tsk->real_cred);
	put_cred(tsk->cred);
	delayacct_tsk_free(tsk);

	/* only free here when profile_handoff_task() declines the task */
	if (!profile_handoff_task(tsk))
		free_task(tsk);
}
161 | 161 | ||
/*
 * macro override instead of weak attribute alias, to workaround
 * gcc 4.1.0 and 4.1.1 bugs with weak attribute and empty functions.
 */
#ifndef arch_task_cache_init
#define arch_task_cache_init()	/* default: nothing arch-specific to set up */
#endif
169 | 169 | ||
/*
 * Boot-time initialisation of the fork machinery: create the
 * task_struct slab (unless the arch supplies its own allocator), run
 * any arch task-cache setup, and derive the default thread limits
 * from the amount of memory (@mempages = number of pages).
 */
void __init fork_init(unsigned long mempages)
{
#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
#ifndef ARCH_MIN_TASKALIGN
#define ARCH_MIN_TASKALIGN	L1_CACHE_BYTES
#endif
	/* create a slab on which task_structs can be allocated */
	task_struct_cachep =
		kmem_cache_create("task_struct", sizeof(struct task_struct),
			ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL);
#endif

	/* do the arch specific task caches init */
	arch_task_cache_init();

	/*
	 * The default maximum number of threads is set to a safe
	 * value: the thread structures can take up at most half
	 * of memory.
	 */
	max_threads = mempages / (8 * THREAD_SIZE / PAGE_SIZE);

	/*
	 * we need to allow at least 20 threads to boot a system
	 */
	if(max_threads < 20)
		max_threads = 20;

	/* seed init's rlimits from max_threads; children inherit these */
	init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
	init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
	init_task.signal->rlim[RLIMIT_SIGPENDING] =
		init_task.signal->rlim[RLIMIT_NPROC];
}
203 | 203 | ||
/*
 * Weakly-defined default for duplicating a task_struct: a plain struct
 * assignment.  Architectures override this when extra per-task state
 * needs special copying.  Returns 0 on success.
 */
int __attribute__((weak)) arch_dup_task_struct(struct task_struct *dst,
					       struct task_struct *src)
{
	*dst = *src;
	return 0;
}
210 | 210 | ||
/*
 * Allocate a task_struct + thread_info pair and copy @orig into it.
 * Returns the new task with its usage count already set to 2, or NULL
 * on any allocation failure (all partial allocations are released).
 */
static struct task_struct *dup_task_struct(struct task_struct *orig)
{
	struct task_struct *tsk;
	struct thread_info *ti;
	int err;

	prepare_to_copy(orig);

	tsk = alloc_task_struct();
	if (!tsk)
		return NULL;

	ti = alloc_thread_info(tsk);
	if (!ti) {
		free_task_struct(tsk);
		return NULL;
	}

	err = arch_dup_task_struct(tsk, orig);
	if (err)
		goto out;

	/* the copy inherited orig's stack pointer; point it at its own */
	tsk->stack = ti;

	err = prop_local_init_single(&tsk->dirties);
	if (err)
		goto out;

	setup_thread_stack(tsk, orig);

#ifdef CONFIG_CC_STACKPROTECTOR
	/* fresh canary per task so tasks don't share a guard value */
	tsk->stack_canary = get_random_int();
#endif

	/* One for us, one for whoever does the "release_task()" (usually parent) */
	atomic_set(&tsk->usage,2);
	atomic_set(&tsk->fs_excl, 0);
#ifdef CONFIG_BLK_DEV_IO_TRACE
	tsk->btrace_seq = 0;
#endif
	tsk->splice_pipe = NULL;
	return tsk;

out:
	free_thread_info(ti);
	free_task_struct(tsk);
	return NULL;
}
259 | 259 | ||
#ifdef CONFIG_MMU
/*
 * Duplicate @oldmm's memory layout into the freshly created @mm at
 * fork time: the VMA list, the rbtree, the file-mapping links and the
 * page-table contents.  Returns 0 or a negative errno (-ENOMEM).
 */
static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
{
	struct vm_area_struct *mpnt, *tmp, **pprev;
	struct rb_node **rb_link, *rb_parent;
	int retval;
	unsigned long charge;
	struct mempolicy *pol;

	down_write(&oldmm->mmap_sem);
	flush_cache_dup_mm(oldmm);
	/*
	 * Not linked in yet - no deadlock potential:
	 */
	down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);

	/* start the child with an empty map and a fresh rbtree */
	mm->locked_vm = 0;
	mm->mmap = NULL;
	mm->mmap_cache = NULL;
	mm->free_area_cache = oldmm->mmap_base;
	mm->cached_hole_size = ~0UL;
	mm->map_count = 0;
	cpus_clear(mm->cpu_vm_mask);
	mm->mm_rb = RB_ROOT;
	rb_link = &mm->mm_rb.rb_node;
	rb_parent = NULL;
	pprev = &mm->mmap;

	/* walk every VMA of the parent and clone it into the child */
	for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
		struct file *file;

		if (mpnt->vm_flags & VM_DONTCOPY) {
			long pages = vma_pages(mpnt);
			mm->total_vm -= pages;
			vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
								-pages);
			continue;
		}
		charge = 0;
		if (mpnt->vm_flags & VM_ACCOUNT) {
			unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
			if (security_vm_enough_memory(len))
				goto fail_nomem;
			charge = len;
		}
		tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
		if (!tmp)
			goto fail_nomem;
		*tmp = *mpnt;
		pol = mpol_dup(vma_policy(mpnt));
		retval = PTR_ERR(pol);
		if (IS_ERR(pol))
			goto fail_nomem_policy;
		vma_set_policy(tmp, pol);
		tmp->vm_flags &= ~VM_LOCKED;	/* mlock state is not inherited */
		tmp->vm_mm = mm;
		tmp->vm_next = NULL;
		anon_vma_link(tmp);
		file = tmp->vm_file;
		if (file) {
			struct inode *inode = file->f_path.dentry->d_inode;
			struct address_space *mapping = file->f_mapping;

			get_file(file);
			if (tmp->vm_flags & VM_DENYWRITE)
				atomic_dec(&inode->i_writecount);
			spin_lock(&mapping->i_mmap_lock);
			if (tmp->vm_flags & VM_SHARED)
				mapping->i_mmap_writable++;
			tmp->vm_truncate_count = mpnt->vm_truncate_count;
			flush_dcache_mmap_lock(mapping);
			/* insert tmp into the share list, just after mpnt */
			vma_prio_tree_add(tmp, mpnt);
			flush_dcache_mmap_unlock(mapping);
			spin_unlock(&mapping->i_mmap_lock);
		}

		/*
		 * Clear hugetlb-related page reserves for children. This only
		 * affects MAP_PRIVATE mappings. Faults generated by the child
		 * are not guaranteed to succeed, even if read-only
		 */
		if (is_vm_hugetlb_page(tmp))
			reset_vma_resv_huge_pages(tmp);

		/*
		 * Link in the new vma and copy the page table entries.
		 */
		*pprev = tmp;
		pprev = &tmp->vm_next;

		__vma_link_rb(mm, tmp, rb_link, rb_parent);
		rb_link = &tmp->vm_rb.rb_right;
		rb_parent = &tmp->vm_rb;

		mm->map_count++;
		retval = copy_page_range(mm, oldmm, mpnt);

		if (tmp->vm_ops && tmp->vm_ops->open)
			tmp->vm_ops->open(tmp);

		if (retval)
			goto out;
	}
	/* a new mm has just been created */
	arch_dup_mmap(oldmm, mm);
	retval = 0;
out:
	up_write(&mm->mmap_sem);
	flush_tlb_mm(oldmm);
	up_write(&oldmm->mmap_sem);
	return retval;
fail_nomem_policy:
	kmem_cache_free(vm_area_cachep, tmp);
fail_nomem:
	retval = -ENOMEM;
	vm_unacct_memory(charge);	/* undo security_vm_enough_memory() */
	goto out;
}
379 | 379 | ||
380 | static inline int mm_alloc_pgd(struct mm_struct * mm) | 380 | static inline int mm_alloc_pgd(struct mm_struct * mm) |
381 | { | 381 | { |
382 | mm->pgd = pgd_alloc(mm); | 382 | mm->pgd = pgd_alloc(mm); |
383 | if (unlikely(!mm->pgd)) | 383 | if (unlikely(!mm->pgd)) |
384 | return -ENOMEM; | 384 | return -ENOMEM; |
385 | return 0; | 385 | return 0; |
386 | } | 386 | } |
387 | 387 | ||
/* Release the top-level page directory previously set up by mm_alloc_pgd(). */
static inline void mm_free_pgd(struct mm_struct * mm)
{
	pgd_free(mm, mm->pgd);
}
#else
/*
 * !CONFIG_MMU: there are no page tables to copy or allocate, so the
 * dup/alloc/free helpers collapse to no-ops that report success.
 */
#define dup_mmap(mm, oldmm)	(0)
#define mm_alloc_pgd(mm)	(0)
#define mm_free_pgd(mm)
#endif /* CONFIG_MMU */

__cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);

/* Allocate/free an mm_struct from its dedicated slab cache. */
#define allocate_mm()	(kmem_cache_alloc(mm_cachep, GFP_KERNEL))
#define free_mm(mm)	(kmem_cache_free(mm_cachep, (mm)))

/* Default coredump filter bits for a new mm; overridable at boot (below). */
static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT;
404 | 404 | ||
405 | static int __init coredump_filter_setup(char *s) | 405 | static int __init coredump_filter_setup(char *s) |
406 | { | 406 | { |
407 | default_dump_filter = | 407 | default_dump_filter = |
408 | (simple_strtoul(s, NULL, 0) << MMF_DUMP_FILTER_SHIFT) & | 408 | (simple_strtoul(s, NULL, 0) << MMF_DUMP_FILTER_SHIFT) & |
409 | MMF_DUMP_FILTER_MASK; | 409 | MMF_DUMP_FILTER_MASK; |
410 | return 1; | 410 | return 1; |
411 | } | 411 | } |
412 | 412 | ||
413 | __setup("coredump_filter=", coredump_filter_setup); | 413 | __setup("coredump_filter=", coredump_filter_setup); |
414 | 414 | ||
415 | #include <linux/init_task.h> | 415 | #include <linux/init_task.h> |
416 | 416 | ||
/*
 * Initialize a freshly allocated (and zeroed or bitwise-copied) mm_struct:
 * reference counts, locks, lists, RSS counters and the page directory.
 * Returns @mm on success; on pgd allocation failure the mm is freed and
 * NULL is returned, so the caller must not touch @mm afterwards.
 */
static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
{
	atomic_set(&mm->mm_users, 1);
	atomic_set(&mm->mm_count, 1);
	init_rwsem(&mm->mmap_sem);
	INIT_LIST_HEAD(&mm->mmlist);
	/* Inherit the coredump filter from the parent, or use the boot default. */
	mm->flags = (current->mm) ? current->mm->flags : default_dump_filter;
	mm->core_state = NULL;
	mm->nr_ptes = 0;
	set_mm_counter(mm, file_rss, 0);
	set_mm_counter(mm, anon_rss, 0);
	spin_lock_init(&mm->page_table_lock);
	spin_lock_init(&mm->ioctx_lock);
	INIT_HLIST_HEAD(&mm->ioctx_list);
	mm->free_area_cache = TASK_UNMAPPED_BASE;
	mm->cached_hole_size = ~0UL;
	mm_init_owner(mm, p);

	if (likely(!mm_alloc_pgd(mm))) {
		mm->def_flags = 0;
		mmu_notifier_mm_init(mm);
		return mm;
	}

	/* pgd allocation failed: release the mm itself. */
	free_mm(mm);
	return NULL;
}
444 | 444 | ||
445 | /* | 445 | /* |
446 | * Allocate and initialize an mm_struct. | 446 | * Allocate and initialize an mm_struct. |
447 | */ | 447 | */ |
448 | struct mm_struct * mm_alloc(void) | 448 | struct mm_struct * mm_alloc(void) |
449 | { | 449 | { |
450 | struct mm_struct * mm; | 450 | struct mm_struct * mm; |
451 | 451 | ||
452 | mm = allocate_mm(); | 452 | mm = allocate_mm(); |
453 | if (mm) { | 453 | if (mm) { |
454 | memset(mm, 0, sizeof(*mm)); | 454 | memset(mm, 0, sizeof(*mm)); |
455 | mm = mm_init(mm, current); | 455 | mm = mm_init(mm, current); |
456 | } | 456 | } |
457 | return mm; | 457 | return mm; |
458 | } | 458 | } |
459 | 459 | ||
460 | /* | 460 | /* |
461 | * Called when the last reference to the mm | 461 | * Called when the last reference to the mm |
462 | * is dropped: either by a lazy thread or by | 462 | * is dropped: either by a lazy thread or by |
463 | * mmput. Free the page directory and the mm. | 463 | * mmput. Free the page directory and the mm. |
464 | */ | 464 | */ |
void __mmdrop(struct mm_struct *mm)
{
	/* init_mm is statically allocated and must never be freed. */
	BUG_ON(mm == &init_mm);
	mm_free_pgd(mm);
	destroy_context(mm);
	mmu_notifier_mm_destroy(mm);
	free_mm(mm);
}
EXPORT_SYMBOL_GPL(__mmdrop);
474 | 474 | ||
475 | /* | 475 | /* |
476 | * Decrement the use count and release all resources for an mm. | 476 | * Decrement the use count and release all resources for an mm. |
477 | */ | 477 | */ |
void mmput(struct mm_struct *mm)
{
	might_sleep();

	if (atomic_dec_and_test(&mm->mm_users)) {
		/* Last user: tear down AIO contexts and all mappings. */
		exit_aio(mm);
		exit_mmap(mm);
		set_mm_exe_file(mm, NULL);
		/* Unhook from the global mmlist if we were ever on it. */
		if (!list_empty(&mm->mmlist)) {
			spin_lock(&mmlist_lock);
			list_del(&mm->mmlist);
			spin_unlock(&mmlist_lock);
		}
		put_swap_token(mm);
		/* Drop the mm_count reference taken at init (see __mmdrop()). */
		mmdrop(mm);
	}
}
EXPORT_SYMBOL_GPL(mmput);
496 | 496 | ||
497 | /** | 497 | /** |
498 | * get_task_mm - acquire a reference to the task's mm | 498 | * get_task_mm - acquire a reference to the task's mm |
499 | * | 499 | * |
500 | * Returns %NULL if the task has no mm. Checks PF_KTHREAD (meaning | 500 | * Returns %NULL if the task has no mm. Checks PF_KTHREAD (meaning |
501 | * this kernel workthread has transiently adopted a user mm with use_mm, | 501 | * this kernel workthread has transiently adopted a user mm with use_mm, |
502 | * to do its AIO) is not set and if so returns a reference to it, after | 502 | * to do its AIO) is not set and if so returns a reference to it, after |
503 | * bumping up the use count. User must release the mm via mmput() | 503 | * bumping up the use count. User must release the mm via mmput() |
504 | * after use. Typically used by /proc and ptrace. | 504 | * after use. Typically used by /proc and ptrace. |
505 | */ | 505 | */ |
506 | struct mm_struct *get_task_mm(struct task_struct *task) | 506 | struct mm_struct *get_task_mm(struct task_struct *task) |
507 | { | 507 | { |
508 | struct mm_struct *mm; | 508 | struct mm_struct *mm; |
509 | 509 | ||
510 | task_lock(task); | 510 | task_lock(task); |
511 | mm = task->mm; | 511 | mm = task->mm; |
512 | if (mm) { | 512 | if (mm) { |
513 | if (task->flags & PF_KTHREAD) | 513 | if (task->flags & PF_KTHREAD) |
514 | mm = NULL; | 514 | mm = NULL; |
515 | else | 515 | else |
516 | atomic_inc(&mm->mm_users); | 516 | atomic_inc(&mm->mm_users); |
517 | } | 517 | } |
518 | task_unlock(task); | 518 | task_unlock(task); |
519 | return mm; | 519 | return mm; |
520 | } | 520 | } |
521 | EXPORT_SYMBOL_GPL(get_task_mm); | 521 | EXPORT_SYMBOL_GPL(get_task_mm); |
522 | 522 | ||
523 | /* Please note the differences between mmput and mm_release. | 523 | /* Please note the differences between mmput and mm_release. |
524 | * mmput is called whenever we stop holding onto a mm_struct, | 524 | * mmput is called whenever we stop holding onto a mm_struct, |
525 | * error success whatever. | 525 | * error success whatever. |
526 | * | 526 | * |
527 | * mm_release is called after a mm_struct has been removed | 527 | * mm_release is called after a mm_struct has been removed |
528 | * from the current process. | 528 | * from the current process. |
529 | * | 529 | * |
530 | * This difference is important for error handling, when we | 530 | * This difference is important for error handling, when we |
531 | * only half set up a mm_struct for a new process and need to restore | 531 | * only half set up a mm_struct for a new process and need to restore |
532 | * the old one. Because we mmput the new mm_struct before | 532 | * the old one. Because we mmput the new mm_struct before |
533 | * restoring the old one. . . | 533 | * restoring the old one. . . |
534 | * Eric Biederman 10 January 1998 | 534 | * Eric Biederman 10 January 1998 |
535 | */ | 535 | */ |
void mm_release(struct task_struct *tsk, struct mm_struct *mm)
{
	struct completion *vfork_done = tsk->vfork_done;

	/* Get rid of any futexes when releasing the mm */
#ifdef CONFIG_FUTEX
	if (unlikely(tsk->robust_list))
		exit_robust_list(tsk);
#ifdef CONFIG_COMPAT
	if (unlikely(tsk->compat_robust_list))
		compat_exit_robust_list(tsk);
#endif
#endif

	/* Get rid of any cached register state */
	deactivate_mm(tsk, mm);

	/* notify parent sleeping on vfork() */
	if (vfork_done) {
		tsk->vfork_done = NULL;
		complete(vfork_done);
	}

	/*
	 * If we're exiting normally, clear a user-space tid field if
	 * requested. We leave this alone when dying by signal, to leave
	 * the value intact in a core dump, and to save the unnecessary
	 * trouble otherwise. Userland only wants this done for a sys_exit.
	 */
	if (tsk->clear_child_tid
	    && !(tsk->flags & PF_SIGNALED)
	    && atomic_read(&mm->mm_users) > 1) {
		u32 __user * tidptr = tsk->clear_child_tid;
		tsk->clear_child_tid = NULL;

		/*
		 * We don't check the error code - if userspace has
		 * not set up a proper pointer then tough luck.
		 */
		put_user(0, tidptr);
		/* Wake anyone waiting on the tid address registered via
		 * sys_set_tid_address() (or the clone TID options). */
		sys_futex(tidptr, FUTEX_WAKE, 1, NULL, NULL, 0);
	}
}
579 | 579 | ||
580 | /* | 580 | /* |
581 | * Allocate a new mm structure and copy contents from the | 581 | * Allocate a new mm structure and copy contents from the |
582 | * mm structure of the passed in task structure. | 582 | * mm structure of the passed in task structure. |
583 | */ | 583 | */ |
struct mm_struct *dup_mm(struct task_struct *tsk)
{
	struct mm_struct *mm, *oldmm = current->mm;
	int err;

	/* Kernel threads have no mm to duplicate. */
	if (!oldmm)
		return NULL;

	mm = allocate_mm();
	if (!mm)
		goto fail_nomem;

	/* Start from a bitwise copy of the parent's mm, then re-initialize
	 * everything that must not be shared. */
	memcpy(mm, oldmm, sizeof(*mm));

	/* Initializing for Swap token stuff */
	mm->token_priority = 0;
	mm->last_interval = 0;

	/* On failure mm_init() has already freed mm for us. */
	if (!mm_init(mm, tsk))
		goto fail_nomem;

	if (init_new_context(tsk, mm))
		goto fail_nocontext;

	dup_mm_exe_file(oldmm, mm);

	err = dup_mmap(mm, oldmm);
	if (err)
		goto free_pt;

	mm->hiwater_rss = get_mm_rss(mm);
	mm->hiwater_vm = mm->total_vm;

	return mm;

free_pt:
	/* The mm is fully set up at this point, so mmput() can tear it down. */
	mmput(mm);

fail_nomem:
	return NULL;

fail_nocontext:
	/*
	 * If init_new_context() failed, we cannot use mmput() to free the mm
	 * because it calls destroy_context()
	 */
	mm_free_pgd(mm);
	free_mm(mm);
	return NULL;
}
634 | 634 | ||
/*
 * Set up the child's mm for fork/clone: share the parent's mm for
 * CLONE_VM, duplicate it otherwise, or leave it NULL for kernel threads.
 * Returns 0 on success or -ENOMEM.
 */
static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
{
	struct mm_struct * mm, *oldmm;
	int retval;

	tsk->min_flt = tsk->maj_flt = 0;
	tsk->nvcsw = tsk->nivcsw = 0;

	tsk->mm = NULL;
	tsk->active_mm = NULL;

	/*
	 * Are we cloning a kernel thread?
	 *
	 * We need to steal a active VM for that..
	 */
	oldmm = current->mm;
	if (!oldmm)
		return 0;

	if (clone_flags & CLONE_VM) {
		/* Share the address space: just take another mm_users ref. */
		atomic_inc(&oldmm->mm_users);
		mm = oldmm;
		goto good_mm;
	}

	retval = -ENOMEM;
	mm = dup_mm(tsk);
	if (!mm)
		goto fail_nomem;

good_mm:
	/* Initializing for Swap token stuff */
	mm->token_priority = 0;
	mm->last_interval = 0;

	tsk->mm = mm;
	tsk->active_mm = mm;
	return 0;

fail_nomem:
	return retval;
}
678 | 678 | ||
679 | static struct fs_struct *__copy_fs_struct(struct fs_struct *old) | 679 | static struct fs_struct *__copy_fs_struct(struct fs_struct *old) |
680 | { | 680 | { |
681 | struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL); | 681 | struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL); |
682 | /* We don't need to lock fs - think why ;-) */ | 682 | /* We don't need to lock fs - think why ;-) */ |
683 | if (fs) { | 683 | if (fs) { |
684 | atomic_set(&fs->count, 1); | 684 | atomic_set(&fs->count, 1); |
685 | rwlock_init(&fs->lock); | 685 | rwlock_init(&fs->lock); |
686 | fs->umask = old->umask; | 686 | fs->umask = old->umask; |
687 | read_lock(&old->lock); | 687 | read_lock(&old->lock); |
688 | fs->root = old->root; | 688 | fs->root = old->root; |
689 | path_get(&old->root); | 689 | path_get(&old->root); |
690 | fs->pwd = old->pwd; | 690 | fs->pwd = old->pwd; |
691 | path_get(&old->pwd); | 691 | path_get(&old->pwd); |
692 | read_unlock(&old->lock); | 692 | read_unlock(&old->lock); |
693 | } | 693 | } |
694 | return fs; | 694 | return fs; |
695 | } | 695 | } |
696 | 696 | ||
/* Exported wrapper around __copy_fs_struct(). */
struct fs_struct *copy_fs_struct(struct fs_struct *old)
{
	return __copy_fs_struct(old);
}

EXPORT_SYMBOL_GPL(copy_fs_struct);
703 | 703 | ||
704 | static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) | 704 | static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) |
705 | { | 705 | { |
706 | if (clone_flags & CLONE_FS) { | 706 | if (clone_flags & CLONE_FS) { |
707 | atomic_inc(¤t->fs->count); | 707 | atomic_inc(¤t->fs->count); |
708 | return 0; | 708 | return 0; |
709 | } | 709 | } |
710 | tsk->fs = __copy_fs_struct(current->fs); | 710 | tsk->fs = __copy_fs_struct(current->fs); |
711 | if (!tsk->fs) | 711 | if (!tsk->fs) |
712 | return -ENOMEM; | 712 | return -ENOMEM; |
713 | return 0; | 713 | return 0; |
714 | } | 714 | } |
715 | 715 | ||
716 | static int copy_files(unsigned long clone_flags, struct task_struct * tsk) | 716 | static int copy_files(unsigned long clone_flags, struct task_struct * tsk) |
717 | { | 717 | { |
718 | struct files_struct *oldf, *newf; | 718 | struct files_struct *oldf, *newf; |
719 | int error = 0; | 719 | int error = 0; |
720 | 720 | ||
721 | /* | 721 | /* |
722 | * A background process may not have any files ... | 722 | * A background process may not have any files ... |
723 | */ | 723 | */ |
724 | oldf = current->files; | 724 | oldf = current->files; |
725 | if (!oldf) | 725 | if (!oldf) |
726 | goto out; | 726 | goto out; |
727 | 727 | ||
728 | if (clone_flags & CLONE_FILES) { | 728 | if (clone_flags & CLONE_FILES) { |
729 | atomic_inc(&oldf->count); | 729 | atomic_inc(&oldf->count); |
730 | goto out; | 730 | goto out; |
731 | } | 731 | } |
732 | 732 | ||
733 | newf = dup_fd(oldf, &error); | 733 | newf = dup_fd(oldf, &error); |
734 | if (!newf) | 734 | if (!newf) |
735 | goto out; | 735 | goto out; |
736 | 736 | ||
737 | tsk->files = newf; | 737 | tsk->files = newf; |
738 | error = 0; | 738 | error = 0; |
739 | out: | 739 | out: |
740 | return error; | 740 | return error; |
741 | } | 741 | } |
742 | 742 | ||
/*
 * Set up the child's block-I/O context: shared for CLONE_IO, otherwise
 * a fresh context inheriting only the parent's I/O priority.  A no-op
 * without CONFIG_BLOCK.  Returns 0 or -ENOMEM.
 */
static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
{
#ifdef CONFIG_BLOCK
	struct io_context *ioc = current->io_context;

	/* Parent has no io context: nothing to share or inherit. */
	if (!ioc)
		return 0;
	/*
	 * Share io context with parent, if CLONE_IO is set
	 */
	if (clone_flags & CLONE_IO) {
		tsk->io_context = ioc_task_link(ioc);
		if (unlikely(!tsk->io_context))
			return -ENOMEM;
	} else if (ioprio_valid(ioc->ioprio)) {
		/* Give the child its own context, carrying over ioprio only. */
		tsk->io_context = alloc_io_context(GFP_KERNEL, -1);
		if (unlikely(!tsk->io_context))
			return -ENOMEM;

		tsk->io_context->ioprio = ioc->ioprio;
	}
#endif
	return 0;
}
767 | 767 | ||
/*
 * Give the child its signal-handler table: shared by reference for
 * CLONE_SIGHAND, otherwise a copy of the parent's handlers.
 * Returns 0 or -ENOMEM.
 */
static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
{
	struct sighand_struct *sig;

	if (clone_flags & CLONE_SIGHAND) {
		atomic_inc(&current->sighand->count);
		return 0;
	}
	sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
	/* Publish the pointer (possibly NULL) before the check; the caller
	 * cleans up a NULL tsk->sighand on the -ENOMEM path. */
	rcu_assign_pointer(tsk->sighand, sig);
	if (!sig)
		return -ENOMEM;
	atomic_set(&sig->count, 1);
	memcpy(sig->action, current->sighand->action, sizeof(sig->action));
	return 0;
}
784 | 784 | ||
/* Drop a reference on @sighand and free it when the last one goes away. */
void __cleanup_sighand(struct sighand_struct *sighand)
{
	if (atomic_dec_and_test(&sighand->count))
		kmem_cache_free(sighand_cachep, sighand);
}
790 | 790 | ||
791 | 791 | ||
792 | /* | 792 | /* |
793 | * Initialize POSIX timer handling for a thread group. | 793 | * Initialize POSIX timer handling for a thread group. |
794 | */ | 794 | */ |
795 | static void posix_cpu_timers_init_group(struct signal_struct *sig) | 795 | static void posix_cpu_timers_init_group(struct signal_struct *sig) |
796 | { | 796 | { |
797 | /* Thread group counters. */ | 797 | /* Thread group counters. */ |
798 | thread_group_cputime_init(sig); | 798 | thread_group_cputime_init(sig); |
799 | 799 | ||
800 | /* Expiration times and increments. */ | 800 | /* Expiration times and increments. */ |
801 | sig->it_virt_expires = cputime_zero; | 801 | sig->it_virt_expires = cputime_zero; |
802 | sig->it_virt_incr = cputime_zero; | 802 | sig->it_virt_incr = cputime_zero; |
803 | sig->it_prof_expires = cputime_zero; | 803 | sig->it_prof_expires = cputime_zero; |
804 | sig->it_prof_incr = cputime_zero; | 804 | sig->it_prof_incr = cputime_zero; |
805 | 805 | ||
806 | /* Cached expiration times. */ | 806 | /* Cached expiration times. */ |
807 | sig->cputime_expires.prof_exp = cputime_zero; | 807 | sig->cputime_expires.prof_exp = cputime_zero; |
808 | sig->cputime_expires.virt_exp = cputime_zero; | 808 | sig->cputime_expires.virt_exp = cputime_zero; |
809 | sig->cputime_expires.sched_exp = 0; | 809 | sig->cputime_expires.sched_exp = 0; |
810 | 810 | ||
811 | /* The timer lists. */ | 811 | /* The timer lists. */ |
812 | INIT_LIST_HEAD(&sig->cpu_timers[0]); | 812 | INIT_LIST_HEAD(&sig->cpu_timers[0]); |
813 | INIT_LIST_HEAD(&sig->cpu_timers[1]); | 813 | INIT_LIST_HEAD(&sig->cpu_timers[1]); |
814 | INIT_LIST_HEAD(&sig->cpu_timers[2]); | 814 | INIT_LIST_HEAD(&sig->cpu_timers[2]); |
815 | } | 815 | } |
816 | 816 | ||
/*
 * Give the child its signal_struct.  CLONE_THREAD shares the parent's
 * (taking count/live references); otherwise a fresh, fully initialized
 * signal_struct is allocated.  Returns 0 or a negative errno.
 */
static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
{
	struct signal_struct *sig;
	int ret;

	if (clone_flags & CLONE_THREAD) {
		/* New thread in an existing group: share the signal_struct. */
		ret = thread_group_cputime_clone_thread(current);
		if (likely(!ret)) {
			atomic_inc(&current->signal->count);
			atomic_inc(&current->signal->live);
		}
		return ret;
	}
	sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
	/* Publish sig (possibly NULL) before the check; the caller handles
	 * a NULL tsk->signal on the -ENOMEM path. */
	tsk->signal = sig;
	if (!sig)
		return -ENOMEM;

	atomic_set(&sig->count, 1);
	atomic_set(&sig->live, 1);
	init_waitqueue_head(&sig->wait_chldexit);
	sig->flags = 0;
	sig->group_exit_code = 0;
	sig->group_exit_task = NULL;
	sig->group_stop_count = 0;
	sig->curr_target = tsk;
	init_sigpending(&sig->shared_pending);
	INIT_LIST_HEAD(&sig->posix_timers);

	/* ITIMER_REAL is driven by an hrtimer firing it_real_fn. */
	hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	sig->it_real_incr.tv64 = 0;
	sig->real_timer.function = it_real_fn;

	sig->leader = 0;	/* session leadership doesn't inherit */
	sig->tty_old_pgrp = NULL;
	sig->tty = NULL;

	/* Accumulated times and counters start from zero for a new group. */
	sig->cutime = sig->cstime = cputime_zero;
	sig->gtime = cputime_zero;
	sig->cgtime = cputime_zero;
	sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
	sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
	sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
	task_io_accounting_init(&sig->ioac);
	taskstats_tgid_init(sig);

	/* Snapshot the parent's rlimits under its group leader's task lock. */
	task_lock(current->group_leader);
	memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
	task_unlock(current->group_leader);

	posix_cpu_timers_init_group(sig);

	acct_init_pacct(&sig->pacct);

	tty_audit_fork(sig);

	return 0;
}
875 | 875 | ||
/* Free a signal_struct whose last reference has been dropped. */
void __cleanup_signal(struct signal_struct *sig)
{
	thread_group_cputime_free(sig);
	tty_kref_put(sig->tty);
	kmem_cache_free(signal_cachep, sig);
}
882 | 882 | ||
/*
 * Drop this task's references on its signal_struct: one "live" thread
 * fewer, and free the structure when the last "count" reference goes.
 */
static void cleanup_signal(struct task_struct *tsk)
{
	struct signal_struct *sig = tsk->signal;

	atomic_dec(&sig->live);

	if (atomic_dec_and_test(&sig->count))
		__cleanup_signal(sig);
}
892 | 892 | ||
893 | static void copy_flags(unsigned long clone_flags, struct task_struct *p) | 893 | static void copy_flags(unsigned long clone_flags, struct task_struct *p) |
894 | { | 894 | { |
895 | unsigned long new_flags = p->flags; | 895 | unsigned long new_flags = p->flags; |
896 | 896 | ||
897 | new_flags &= ~PF_SUPERPRIV; | 897 | new_flags &= ~PF_SUPERPRIV; |
898 | new_flags |= PF_FORKNOEXEC; | 898 | new_flags |= PF_FORKNOEXEC; |
899 | new_flags |= PF_STARTING; | 899 | new_flags |= PF_STARTING; |
900 | p->flags = new_flags; | 900 | p->flags = new_flags; |
901 | clear_freeze_flag(p); | 901 | clear_freeze_flag(p); |
902 | } | 902 | } |
903 | 903 | ||
904 | asmlinkage long sys_set_tid_address(int __user *tidptr) | 904 | asmlinkage long sys_set_tid_address(int __user *tidptr) |
905 | { | 905 | { |
906 | current->clear_child_tid = tidptr; | 906 | current->clear_child_tid = tidptr; |
907 | 907 | ||
908 | return task_pid_vnr(current); | 908 | return task_pid_vnr(current); |
909 | } | 909 | } |
910 | 910 | ||
/* Initialize the priority-inheritance state embedded in a new task. */
static void rt_mutex_init_task(struct task_struct *p)
{
	spin_lock_init(&p->pi_lock);
#ifdef CONFIG_RT_MUTEXES
	/* Empty PI waiter list; the task is not blocked on any rt_mutex. */
	plist_head_init(&p->pi_waiters, &p->pi_lock);
	p->pi_blocked_on = NULL;
#endif
}
919 | 919 | ||
#ifdef CONFIG_MM_OWNER
/* Record @p as the task that owns @mm. */
void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
{
	mm->owner = p;
}
#endif /* CONFIG_MM_OWNER */
926 | 926 | ||
927 | /* | 927 | /* |
928 | * Initialize POSIX timer handling for a single task. | 928 | * Initialize POSIX timer handling for a single task. |
929 | */ | 929 | */ |
930 | static void posix_cpu_timers_init(struct task_struct *tsk) | 930 | static void posix_cpu_timers_init(struct task_struct *tsk) |
931 | { | 931 | { |
932 | tsk->cputime_expires.prof_exp = cputime_zero; | 932 | tsk->cputime_expires.prof_exp = cputime_zero; |
933 | tsk->cputime_expires.virt_exp = cputime_zero; | 933 | tsk->cputime_expires.virt_exp = cputime_zero; |
934 | tsk->cputime_expires.sched_exp = 0; | 934 | tsk->cputime_expires.sched_exp = 0; |
935 | INIT_LIST_HEAD(&tsk->cpu_timers[0]); | 935 | INIT_LIST_HEAD(&tsk->cpu_timers[0]); |
936 | INIT_LIST_HEAD(&tsk->cpu_timers[1]); | 936 | INIT_LIST_HEAD(&tsk->cpu_timers[1]); |
937 | INIT_LIST_HEAD(&tsk->cpu_timers[2]); | 937 | INIT_LIST_HEAD(&tsk->cpu_timers[2]); |
938 | } | 938 | } |
939 | 939 | ||
/*
 * This creates a new process as a copy of the old one,
 * but does not actually start it yet.
 *
 * It copies the registers, and all the appropriate
 * parts of the process environment (as per the clone
 * flags). The actual kick-off is left to the caller.
 *
 * Returns the new task_struct on success, or an ERR_PTR() on failure;
 * on failure everything acquired so far is unwound through the
 * bad_fork_* labels at the bottom.
 */
static struct task_struct *copy_process(unsigned long clone_flags,
					unsigned long stack_start,
					struct pt_regs *regs,
					unsigned long stack_size,
					int __user *child_tidptr,
					struct pid *pid,
					int trace)
{
	int retval;
	struct task_struct *p;
	int cgroup_callbacks_done = 0;

	/* A new mount namespace cannot share fs state with its parent. */
	if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
		return ERR_PTR(-EINVAL);

	/*
	 * Thread groups must share signals as well, and detached threads
	 * can only be started up within the thread group.
	 */
	if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
		return ERR_PTR(-EINVAL);

	/*
	 * Shared signal handlers imply shared VM. By way of the above,
	 * thread groups also imply shared VM. Blocking this case allows
	 * for various simplifications in other code.
	 */
	if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
		return ERR_PTR(-EINVAL);

	retval = security_task_create(clone_flags);
	if (retval)
		goto fork_out;

	retval = -ENOMEM;
	p = dup_task_struct(current);
	if (!p)
		goto fork_out;

	rt_mutex_init_task(p);

#ifdef CONFIG_PROVE_LOCKING
	DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
	DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
#endif
	/* Enforce RLIMIT_NPROC unless the caller is suitably privileged. */
	retval = -EAGAIN;
	if (atomic_read(&p->real_cred->user->processes) >=
			p->signal->rlim[RLIMIT_NPROC].rlim_cur) {
		if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
		    p->real_cred->user != INIT_USER)
			goto bad_fork_free;
	}

	retval = copy_creds(p, clone_flags);
	if (retval < 0)
		goto bad_fork_free;

	/*
	 * If multiple threads are within copy_process(), then this check
	 * triggers too late. This doesn't hurt, the check is only there
	 * to stop root fork bombs.
	 */
	if (nr_threads >= max_threads)
		goto bad_fork_cleanup_count;

	if (!try_module_get(task_thread_info(p)->exec_domain->module))
		goto bad_fork_cleanup_count;

	if (p->binfmt && !try_module_get(p->binfmt->module))
		goto bad_fork_cleanup_put_domain;

	p->did_exec = 0;
	delayacct_tsk_init(p);	/* Must remain after dup_task_struct() */
	copy_flags(clone_flags, p);
	INIT_LIST_HEAD(&p->children);
	INIT_LIST_HEAD(&p->sibling);
#ifdef CONFIG_PREEMPT_RCU
	p->rcu_read_lock_nesting = 0;
	p->rcu_flipctr_idx = 0;
#endif /* #ifdef CONFIG_PREEMPT_RCU */
	p->vfork_done = NULL;
	spin_lock_init(&p->alloc_lock);

	clear_tsk_thread_flag(p, TIF_SIGPENDING);
	init_sigpending(&p->pending);

	/* Fresh accounting state: the child starts with zero CPU time. */
	p->utime = cputime_zero;
	p->stime = cputime_zero;
	p->gtime = cputime_zero;
	p->utimescaled = cputime_zero;
	p->stimescaled = cputime_zero;
	p->prev_utime = cputime_zero;
	p->prev_stime = cputime_zero;

	p->default_timer_slack_ns = current->timer_slack_ns;

#ifdef CONFIG_DETECT_SOFTLOCKUP
	p->last_switch_count = 0;
	p->last_switch_timestamp = 0;
#endif

	task_io_accounting_init(&p->ioac);
	acct_clear_integrals(p);

	posix_cpu_timers_init(p);

	p->lock_depth = -1;		/* -1 = no lock */
	do_posix_clock_monotonic_gettime(&p->start_time);
	p->real_start_time = p->start_time;
	monotonic_to_bootbased(&p->real_start_time);
	p->io_context = NULL;
	p->audit_context = NULL;
	cgroup_fork(p);
#ifdef CONFIG_NUMA
	p->mempolicy = mpol_dup(p->mempolicy);
	if (IS_ERR(p->mempolicy)) {
		retval = PTR_ERR(p->mempolicy);
		p->mempolicy = NULL;
		goto bad_fork_cleanup_cgroup;
	}
	mpol_fix_fork_child_flag(p);
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
	p->irq_events = 0;
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
	p->hardirqs_enabled = 1;
#else
	p->hardirqs_enabled = 0;
#endif
	p->hardirq_enable_ip = 0;
	p->hardirq_enable_event = 0;
	p->hardirq_disable_ip = _THIS_IP_;
	p->hardirq_disable_event = 0;
	p->softirqs_enabled = 1;
	p->softirq_enable_ip = _THIS_IP_;
	p->softirq_enable_event = 0;
	p->softirq_disable_ip = 0;
	p->softirq_disable_event = 0;
	p->hardirq_context = 0;
	p->softirq_context = 0;
#endif
#ifdef CONFIG_LOCKDEP
	p->lockdep_depth = 0; /* no locks held yet */
	p->curr_chain_key = 0;
	p->lockdep_recursion = 0;
#endif

#ifdef CONFIG_DEBUG_MUTEXES
	p->blocked_on = NULL; /* not blocked yet */
#endif
	if (unlikely(ptrace_reparented(current)))
		ptrace_fork(p, clone_flags);

	/* Perform scheduler related setup. Assign this task to a CPU. */
	sched_fork(p, clone_flags);

	/*
	 * copy all the process information; each failing copy_*() jumps to
	 * the label that undoes everything copied before it.
	 */
	if ((retval = audit_alloc(p)))
		goto bad_fork_cleanup_policy;
	if ((retval = copy_semundo(clone_flags, p)))
		goto bad_fork_cleanup_audit;
	if ((retval = copy_files(clone_flags, p)))
		goto bad_fork_cleanup_semundo;
	if ((retval = copy_fs(clone_flags, p)))
		goto bad_fork_cleanup_files;
	if ((retval = copy_sighand(clone_flags, p)))
		goto bad_fork_cleanup_fs;
	if ((retval = copy_signal(clone_flags, p)))
		goto bad_fork_cleanup_sighand;
	if ((retval = copy_mm(clone_flags, p)))
		goto bad_fork_cleanup_signal;
	if ((retval = copy_namespaces(clone_flags, p)))
		goto bad_fork_cleanup_mm;
	if ((retval = copy_io(clone_flags, p)))
		goto bad_fork_cleanup_namespaces;
	retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
	if (retval)
		goto bad_fork_cleanup_io;

	/* &init_struct_pid means "idle task": no pid is allocated for it. */
	if (pid != &init_struct_pid) {
		retval = -ENOMEM;
		pid = alloc_pid(p->nsproxy->pid_ns);
		if (!pid)
			goto bad_fork_cleanup_io;

		if (clone_flags & CLONE_NEWPID) {
			retval = pid_ns_prepare_proc(p->nsproxy->pid_ns);
			if (retval < 0)
				goto bad_fork_free_pid;
		}
	}

	ftrace_graph_init_task(p);

	p->pid = pid_nr(pid);
	p->tgid = p->pid;
	if (clone_flags & CLONE_THREAD)
		p->tgid = current->tgid;

	if (current->nsproxy != p->nsproxy) {
		retval = ns_cgroup_clone(p, pid);
		if (retval)
			goto bad_fork_free_graph;
	}

	p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
	/*
	 * Clear TID on mm_release()?
	 */
	p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;
#ifdef CONFIG_FUTEX
	p->robust_list = NULL;
#ifdef CONFIG_COMPAT
	p->compat_robust_list = NULL;
#endif
	INIT_LIST_HEAD(&p->pi_state_list);
	p->pi_state_cache = NULL;
#endif
	/*
	 * sigaltstack should be cleared when sharing the same VM
	 */
	if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
		p->sas_ss_sp = p->sas_ss_size = 0;

	/*
	 * Syscall tracing should be turned off in the child regardless
	 * of CLONE_PTRACE.
	 */
	clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
#ifdef TIF_SYSCALL_EMU
	clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
#endif
	clear_all_latency_tracing(p);

	/* Our parent execution domain becomes current domain
	   These must match for thread signalling to apply */
	p->parent_exec_id = p->self_exec_id;

	/* ok, now we should be set up.. */
	p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
	p->pdeath_signal = 0;
	p->exit_state = 0;

	/*
	 * Ok, make it visible to the rest of the system.
	 * We dont wake it up yet.
	 */
	p->group_leader = p;
	INIT_LIST_HEAD(&p->thread_group);

	/* Now that the task is set up, run cgroup callbacks if
	 * necessary. We need to run them before the task is visible
	 * on the tasklist. */
	cgroup_fork_callbacks(p);
	cgroup_callbacks_done = 1;

	/* Need tasklist lock for parent etc handling! */
	write_lock_irq(&tasklist_lock);

	/*
	 * The task hasn't been attached yet, so its cpus_allowed mask will
	 * not be changed, nor will its assigned CPU.
	 *
	 * The cpus_allowed mask of the parent may have changed after it was
	 * copied first time - so re-copy it here, then check the child's CPU
	 * to ensure it is on a valid CPU (and if not, just force it back to
	 * parent's CPU). This avoids alot of nasty races.
	 */
	p->cpus_allowed = current->cpus_allowed;
	p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed;
	if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) ||
			!cpu_online(task_cpu(p))))
		set_task_cpu(p, smp_processor_id());

	/* CLONE_PARENT re-uses the old parent */
	if (clone_flags & (CLONE_PARENT|CLONE_THREAD))
		p->real_parent = current->real_parent;
	else
		p->real_parent = current;

	spin_lock(&current->sighand->siglock);

	/*
	 * Process group and session signals need to be delivered to just the
	 * parent before the fork or both the parent and the child after the
	 * fork. Restart if a signal comes in before we add the new process to
	 * it's process group.
	 * A fatal signal pending means that current will exit, so the new
	 * thread can't slip out of an OOM kill (or normal SIGKILL).
	 */
	recalc_sigpending();
	if (signal_pending(current)) {
		spin_unlock(&current->sighand->siglock);
		write_unlock_irq(&tasklist_lock);
		retval = -ERESTARTNOINTR;
		goto bad_fork_free_graph;
	}

	if (clone_flags & CLONE_THREAD) {
		p->group_leader = current->group_leader;
		list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
	}

	if (likely(p->pid)) {
		list_add_tail(&p->sibling, &p->real_parent->children);
		tracehook_finish_clone(p, clone_flags, trace);

		if (thread_group_leader(p)) {
			if (clone_flags & CLONE_NEWPID)
				p->nsproxy->pid_ns->child_reaper = p;

			p->signal->leader_pid = pid;
			tty_kref_put(p->signal->tty);
			p->signal->tty = tty_kref_get(current->signal->tty);
			set_task_pgrp(p, task_pgrp_nr(current));
			set_task_session(p, task_session_nr(current));
			attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
			attach_pid(p, PIDTYPE_SID, task_session(current));
			list_add_tail_rcu(&p->tasks, &init_task.tasks);
			__get_cpu_var(process_counts)++;
		}
		attach_pid(p, PIDTYPE_PID, pid);
		nr_threads++;
	}

	total_forks++;
	spin_unlock(&current->sighand->siglock);
	write_unlock_irq(&tasklist_lock);
	proc_fork_connector(p);
	cgroup_post_fork(p);
	return p;

	/*
	 * Error unwinding: each label below releases the resource acquired
	 * immediately before the goto that targets it, falling through into
	 * the earlier labels.  The ordering mirrors, in reverse, the
	 * acquisition order above and must be kept in sync with it.
	 */
bad_fork_free_graph:
	ftrace_graph_exit_task(p);
bad_fork_free_pid:
	if (pid != &init_struct_pid)
		free_pid(pid);
bad_fork_cleanup_io:
	put_io_context(p->io_context);
bad_fork_cleanup_namespaces:
	exit_task_namespaces(p);
bad_fork_cleanup_mm:
	if (p->mm)
		mmput(p->mm);
bad_fork_cleanup_signal:
	cleanup_signal(p);
bad_fork_cleanup_sighand:
	__cleanup_sighand(p->sighand);
bad_fork_cleanup_fs:
	exit_fs(p); /* blocking */
bad_fork_cleanup_files:
	exit_files(p); /* blocking */
bad_fork_cleanup_semundo:
	exit_sem(p);
bad_fork_cleanup_audit:
	audit_free(p);
bad_fork_cleanup_policy:
#ifdef CONFIG_NUMA
	mpol_put(p->mempolicy);
bad_fork_cleanup_cgroup:
#endif
	cgroup_exit(p, cgroup_callbacks_done);
	delayacct_tsk_free(p);
	if (p->binfmt)
		module_put(p->binfmt->module);
bad_fork_cleanup_put_domain:
	module_put(task_thread_info(p)->exec_domain->module);
bad_fork_cleanup_count:
	atomic_dec(&p->cred->user->processes);
	put_cred(p->real_cred);
	put_cred(p->cred);
bad_fork_free:
	free_task(p);
fork_out:
	return ERR_PTR(retval);
}
1324 | 1324 | ||
1325 | noinline struct pt_regs * __cpuinit __attribute__((weak)) idle_regs(struct pt_regs *regs) | 1325 | noinline struct pt_regs * __cpuinit __attribute__((weak)) idle_regs(struct pt_regs *regs) |
1326 | { | 1326 | { |
1327 | memset(regs, 0, sizeof(struct pt_regs)); | 1327 | memset(regs, 0, sizeof(struct pt_regs)); |
1328 | return regs; | 1328 | return regs; |
1329 | } | 1329 | } |
1330 | 1330 | ||
1331 | struct task_struct * __cpuinit fork_idle(int cpu) | 1331 | struct task_struct * __cpuinit fork_idle(int cpu) |
1332 | { | 1332 | { |
1333 | struct task_struct *task; | 1333 | struct task_struct *task; |
1334 | struct pt_regs regs; | 1334 | struct pt_regs regs; |
1335 | 1335 | ||
1336 | task = copy_process(CLONE_VM, 0, idle_regs(®s), 0, NULL, | 1336 | task = copy_process(CLONE_VM, 0, idle_regs(®s), 0, NULL, |
1337 | &init_struct_pid, 0); | 1337 | &init_struct_pid, 0); |
1338 | if (!IS_ERR(task)) | 1338 | if (!IS_ERR(task)) |
1339 | init_idle(task, cpu); | 1339 | init_idle(task, cpu); |
1340 | 1340 | ||
1341 | return task; | 1341 | return task; |
1342 | } | 1342 | } |
1343 | 1343 | ||
1344 | /* | 1344 | /* |
1345 | * Ok, this is the main fork-routine. | 1345 | * Ok, this is the main fork-routine. |
1346 | * | 1346 | * |
1347 | * It copies the process, and if successful kick-starts | 1347 | * It copies the process, and if successful kick-starts |
1348 | * it and waits for it to finish using the VM if required. | 1348 | * it and waits for it to finish using the VM if required. |
1349 | */ | 1349 | */ |
1350 | long do_fork(unsigned long clone_flags, | 1350 | long do_fork(unsigned long clone_flags, |
1351 | unsigned long stack_start, | 1351 | unsigned long stack_start, |
1352 | struct pt_regs *regs, | 1352 | struct pt_regs *regs, |
1353 | unsigned long stack_size, | 1353 | unsigned long stack_size, |
1354 | int __user *parent_tidptr, | 1354 | int __user *parent_tidptr, |
1355 | int __user *child_tidptr) | 1355 | int __user *child_tidptr) |
1356 | { | 1356 | { |
1357 | struct task_struct *p; | 1357 | struct task_struct *p; |
1358 | int trace = 0; | 1358 | int trace = 0; |
1359 | long nr; | 1359 | long nr; |
1360 | 1360 | ||
1361 | /* | 1361 | /* |
1362 | * Do some preliminary argument and permissions checking before we | 1362 | * Do some preliminary argument and permissions checking before we |
1363 | * actually start allocating stuff | 1363 | * actually start allocating stuff |
1364 | */ | 1364 | */ |
1365 | if (clone_flags & CLONE_NEWUSER) { | 1365 | if (clone_flags & CLONE_NEWUSER) { |
1366 | if (clone_flags & CLONE_THREAD) | 1366 | if (clone_flags & CLONE_THREAD) |
1367 | return -EINVAL; | 1367 | return -EINVAL; |
1368 | /* hopefully this check will go away when userns support is | 1368 | /* hopefully this check will go away when userns support is |
1369 | * complete | 1369 | * complete |
1370 | */ | 1370 | */ |
1371 | if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SETUID) || | 1371 | if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SETUID) || |
1372 | !capable(CAP_SETGID)) | 1372 | !capable(CAP_SETGID)) |
1373 | return -EPERM; | 1373 | return -EPERM; |
1374 | } | 1374 | } |
1375 | 1375 | ||
1376 | /* | 1376 | /* |
1377 | * We hope to recycle these flags after 2.6.26 | 1377 | * We hope to recycle these flags after 2.6.26 |
1378 | */ | 1378 | */ |
1379 | if (unlikely(clone_flags & CLONE_STOPPED)) { | 1379 | if (unlikely(clone_flags & CLONE_STOPPED)) { |
1380 | static int __read_mostly count = 100; | 1380 | static int __read_mostly count = 100; |
1381 | 1381 | ||
1382 | if (count > 0 && printk_ratelimit()) { | 1382 | if (count > 0 && printk_ratelimit()) { |
1383 | char comm[TASK_COMM_LEN]; | 1383 | char comm[TASK_COMM_LEN]; |
1384 | 1384 | ||
1385 | count--; | 1385 | count--; |
1386 | printk(KERN_INFO "fork(): process `%s' used deprecated " | 1386 | printk(KERN_INFO "fork(): process `%s' used deprecated " |
1387 | "clone flags 0x%lx\n", | 1387 | "clone flags 0x%lx\n", |
1388 | get_task_comm(comm, current), | 1388 | get_task_comm(comm, current), |
1389 | clone_flags & CLONE_STOPPED); | 1389 | clone_flags & CLONE_STOPPED); |
1390 | } | 1390 | } |
1391 | } | 1391 | } |
1392 | 1392 | ||
1393 | /* | 1393 | /* |
1394 | * When called from kernel_thread, don't do user tracing stuff. | 1394 | * When called from kernel_thread, don't do user tracing stuff. |
1395 | */ | 1395 | */ |
1396 | if (likely(user_mode(regs))) | 1396 | if (likely(user_mode(regs))) |
1397 | trace = tracehook_prepare_clone(clone_flags); | 1397 | trace = tracehook_prepare_clone(clone_flags); |
1398 | 1398 | ||
1399 | p = copy_process(clone_flags, stack_start, regs, stack_size, | 1399 | p = copy_process(clone_flags, stack_start, regs, stack_size, |
1400 | child_tidptr, NULL, trace); | 1400 | child_tidptr, NULL, trace); |
1401 | /* | 1401 | /* |
1402 | * Do this prior waking up the new thread - the thread pointer | 1402 | * Do this prior waking up the new thread - the thread pointer |
1403 | * might get invalid after that point, if the thread exits quickly. | 1403 | * might get invalid after that point, if the thread exits quickly. |
1404 | */ | 1404 | */ |
1405 | if (!IS_ERR(p)) { | 1405 | if (!IS_ERR(p)) { |
1406 | struct completion vfork; | 1406 | struct completion vfork; |
1407 | 1407 | ||
1408 | trace_sched_process_fork(current, p); | 1408 | trace_sched_process_fork(current, p); |
1409 | 1409 | ||
1410 | nr = task_pid_vnr(p); | 1410 | nr = task_pid_vnr(p); |
1411 | 1411 | ||
1412 | if (clone_flags & CLONE_PARENT_SETTID) | 1412 | if (clone_flags & CLONE_PARENT_SETTID) |
1413 | put_user(nr, parent_tidptr); | 1413 | put_user(nr, parent_tidptr); |
1414 | 1414 | ||
1415 | if (clone_flags & CLONE_VFORK) { | 1415 | if (clone_flags & CLONE_VFORK) { |
1416 | p->vfork_done = &vfork; | 1416 | p->vfork_done = &vfork; |
1417 | init_completion(&vfork); | 1417 | init_completion(&vfork); |
1418 | } | 1418 | } |
1419 | 1419 | ||
1420 | audit_finish_fork(p); | 1420 | audit_finish_fork(p); |
1421 | tracehook_report_clone(trace, regs, clone_flags, nr, p); | 1421 | tracehook_report_clone(trace, regs, clone_flags, nr, p); |
1422 | 1422 | ||
1423 | /* | 1423 | /* |
1424 | * We set PF_STARTING at creation in case tracing wants to | 1424 | * We set PF_STARTING at creation in case tracing wants to |
1425 | * use this to distinguish a fully live task from one that | 1425 | * use this to distinguish a fully live task from one that |
1426 | * hasn't gotten to tracehook_report_clone() yet. Now we | 1426 | * hasn't gotten to tracehook_report_clone() yet. Now we |
1427 | * clear it and set the child going. | 1427 | * clear it and set the child going. |
1428 | */ | 1428 | */ |
1429 | p->flags &= ~PF_STARTING; | 1429 | p->flags &= ~PF_STARTING; |
1430 | 1430 | ||
1431 | if (unlikely(clone_flags & CLONE_STOPPED)) { | 1431 | if (unlikely(clone_flags & CLONE_STOPPED)) { |
1432 | /* | 1432 | /* |
1433 | * We'll start up with an immediate SIGSTOP. | 1433 | * We'll start up with an immediate SIGSTOP. |
1434 | */ | 1434 | */ |
1435 | sigaddset(&p->pending.signal, SIGSTOP); | 1435 | sigaddset(&p->pending.signal, SIGSTOP); |
1436 | set_tsk_thread_flag(p, TIF_SIGPENDING); | 1436 | set_tsk_thread_flag(p, TIF_SIGPENDING); |
1437 | __set_task_state(p, TASK_STOPPED); | 1437 | __set_task_state(p, TASK_STOPPED); |
1438 | } else { | 1438 | } else { |
1439 | wake_up_new_task(p, clone_flags); | 1439 | wake_up_new_task(p, clone_flags); |
1440 | } | 1440 | } |
1441 | 1441 | ||
1442 | tracehook_report_clone_complete(trace, regs, | 1442 | tracehook_report_clone_complete(trace, regs, |
1443 | clone_flags, nr, p); | 1443 | clone_flags, nr, p); |
1444 | 1444 | ||
1445 | if (clone_flags & CLONE_VFORK) { | 1445 | if (clone_flags & CLONE_VFORK) { |
1446 | freezer_do_not_count(); | 1446 | freezer_do_not_count(); |
1447 | wait_for_completion(&vfork); | 1447 | wait_for_completion(&vfork); |
1448 | freezer_count(); | 1448 | freezer_count(); |
1449 | tracehook_report_vfork_done(p, nr); | 1449 | tracehook_report_vfork_done(p, nr); |
1450 | } | 1450 | } |
1451 | } else { | 1451 | } else { |
1452 | nr = PTR_ERR(p); | 1452 | nr = PTR_ERR(p); |
1453 | } | 1453 | } |
1454 | return nr; | 1454 | return nr; |
1455 | } | 1455 | } |
1456 | 1456 | ||
1457 | #ifndef ARCH_MIN_MMSTRUCT_ALIGN | 1457 | #ifndef ARCH_MIN_MMSTRUCT_ALIGN |
1458 | #define ARCH_MIN_MMSTRUCT_ALIGN 0 | 1458 | #define ARCH_MIN_MMSTRUCT_ALIGN 0 |
1459 | #endif | 1459 | #endif |
1460 | 1460 | ||
1461 | static void sighand_ctor(void *data) | 1461 | static void sighand_ctor(void *data) |
1462 | { | 1462 | { |
1463 | struct sighand_struct *sighand = data; | 1463 | struct sighand_struct *sighand = data; |
1464 | 1464 | ||
1465 | spin_lock_init(&sighand->siglock); | 1465 | spin_lock_init(&sighand->siglock); |
1466 | init_waitqueue_head(&sighand->signalfd_wqh); | 1466 | init_waitqueue_head(&sighand->signalfd_wqh); |
1467 | } | 1467 | } |
1468 | 1468 | ||
1469 | void __init proc_caches_init(void) | 1469 | void __init proc_caches_init(void) |
1470 | { | 1470 | { |
1471 | sighand_cachep = kmem_cache_create("sighand_cache", | 1471 | sighand_cachep = kmem_cache_create("sighand_cache", |
1472 | sizeof(struct sighand_struct), 0, | 1472 | sizeof(struct sighand_struct), 0, |
1473 | SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU, | 1473 | SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU, |
1474 | sighand_ctor); | 1474 | sighand_ctor); |
1475 | signal_cachep = kmem_cache_create("signal_cache", | 1475 | signal_cachep = kmem_cache_create("signal_cache", |
1476 | sizeof(struct signal_struct), 0, | 1476 | sizeof(struct signal_struct), 0, |
1477 | SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); | 1477 | SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); |
1478 | files_cachep = kmem_cache_create("files_cache", | 1478 | files_cachep = kmem_cache_create("files_cache", |
1479 | sizeof(struct files_struct), 0, | 1479 | sizeof(struct files_struct), 0, |
1480 | SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); | 1480 | SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); |
1481 | fs_cachep = kmem_cache_create("fs_cache", | 1481 | fs_cachep = kmem_cache_create("fs_cache", |
1482 | sizeof(struct fs_struct), 0, | 1482 | sizeof(struct fs_struct), 0, |
1483 | SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); | 1483 | SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); |
1484 | vm_area_cachep = kmem_cache_create("vm_area_struct", | ||
1485 | sizeof(struct vm_area_struct), 0, | ||
1486 | SLAB_PANIC, NULL); | ||
1487 | mm_cachep = kmem_cache_create("mm_struct", | 1484 | mm_cachep = kmem_cache_create("mm_struct", |
1488 | sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, | 1485 | sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, |
1489 | SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); | 1486 | SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); |
1487 | mmap_init(); | ||
1490 | } | 1488 | } |
1491 | 1489 | ||
1492 | /* | 1490 | /* |
1493 | * Check constraints on flags passed to the unshare system call and | 1491 | * Check constraints on flags passed to the unshare system call and |
1494 | * force unsharing of additional process context as appropriate. | 1492 | * force unsharing of additional process context as appropriate. |
1495 | */ | 1493 | */ |
1496 | static void check_unshare_flags(unsigned long *flags_ptr) | 1494 | static void check_unshare_flags(unsigned long *flags_ptr) |
1497 | { | 1495 | { |
1498 | /* | 1496 | /* |
1499 | * If unsharing a thread from a thread group, must also | 1497 | * If unsharing a thread from a thread group, must also |
1500 | * unshare vm. | 1498 | * unshare vm. |
1501 | */ | 1499 | */ |
1502 | if (*flags_ptr & CLONE_THREAD) | 1500 | if (*flags_ptr & CLONE_THREAD) |
1503 | *flags_ptr |= CLONE_VM; | 1501 | *flags_ptr |= CLONE_VM; |
1504 | 1502 | ||
1505 | /* | 1503 | /* |
1506 | * If unsharing vm, must also unshare signal handlers. | 1504 | * If unsharing vm, must also unshare signal handlers. |
1507 | */ | 1505 | */ |
1508 | if (*flags_ptr & CLONE_VM) | 1506 | if (*flags_ptr & CLONE_VM) |
1509 | *flags_ptr |= CLONE_SIGHAND; | 1507 | *flags_ptr |= CLONE_SIGHAND; |
1510 | 1508 | ||
1511 | /* | 1509 | /* |
1512 | * If unsharing signal handlers and the task was created | 1510 | * If unsharing signal handlers and the task was created |
1513 | * using CLONE_THREAD, then must unshare the thread | 1511 | * using CLONE_THREAD, then must unshare the thread |
1514 | */ | 1512 | */ |
1515 | if ((*flags_ptr & CLONE_SIGHAND) && | 1513 | if ((*flags_ptr & CLONE_SIGHAND) && |
1516 | (atomic_read(¤t->signal->count) > 1)) | 1514 | (atomic_read(¤t->signal->count) > 1)) |
1517 | *flags_ptr |= CLONE_THREAD; | 1515 | *flags_ptr |= CLONE_THREAD; |
1518 | 1516 | ||
1519 | /* | 1517 | /* |
1520 | * If unsharing namespace, must also unshare filesystem information. | 1518 | * If unsharing namespace, must also unshare filesystem information. |
1521 | */ | 1519 | */ |
1522 | if (*flags_ptr & CLONE_NEWNS) | 1520 | if (*flags_ptr & CLONE_NEWNS) |
1523 | *flags_ptr |= CLONE_FS; | 1521 | *flags_ptr |= CLONE_FS; |
1524 | } | 1522 | } |
1525 | 1523 | ||
1526 | /* | 1524 | /* |
1527 | * Unsharing of tasks created with CLONE_THREAD is not supported yet | 1525 | * Unsharing of tasks created with CLONE_THREAD is not supported yet |
1528 | */ | 1526 | */ |
1529 | static int unshare_thread(unsigned long unshare_flags) | 1527 | static int unshare_thread(unsigned long unshare_flags) |
1530 | { | 1528 | { |
1531 | if (unshare_flags & CLONE_THREAD) | 1529 | if (unshare_flags & CLONE_THREAD) |
1532 | return -EINVAL; | 1530 | return -EINVAL; |
1533 | 1531 | ||
1534 | return 0; | 1532 | return 0; |
1535 | } | 1533 | } |
1536 | 1534 | ||
1537 | /* | 1535 | /* |
1538 | * Unshare the filesystem structure if it is being shared | 1536 | * Unshare the filesystem structure if it is being shared |
1539 | */ | 1537 | */ |
1540 | static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) | 1538 | static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) |
1541 | { | 1539 | { |
1542 | struct fs_struct *fs = current->fs; | 1540 | struct fs_struct *fs = current->fs; |
1543 | 1541 | ||
1544 | if ((unshare_flags & CLONE_FS) && | 1542 | if ((unshare_flags & CLONE_FS) && |
1545 | (fs && atomic_read(&fs->count) > 1)) { | 1543 | (fs && atomic_read(&fs->count) > 1)) { |
1546 | *new_fsp = __copy_fs_struct(current->fs); | 1544 | *new_fsp = __copy_fs_struct(current->fs); |
1547 | if (!*new_fsp) | 1545 | if (!*new_fsp) |
1548 | return -ENOMEM; | 1546 | return -ENOMEM; |
1549 | } | 1547 | } |
1550 | 1548 | ||
1551 | return 0; | 1549 | return 0; |
1552 | } | 1550 | } |
1553 | 1551 | ||
1554 | /* | 1552 | /* |
1555 | * Unsharing of sighand is not supported yet | 1553 | * Unsharing of sighand is not supported yet |
1556 | */ | 1554 | */ |
1557 | static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp) | 1555 | static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp) |
1558 | { | 1556 | { |
1559 | struct sighand_struct *sigh = current->sighand; | 1557 | struct sighand_struct *sigh = current->sighand; |
1560 | 1558 | ||
1561 | if ((unshare_flags & CLONE_SIGHAND) && atomic_read(&sigh->count) > 1) | 1559 | if ((unshare_flags & CLONE_SIGHAND) && atomic_read(&sigh->count) > 1) |
1562 | return -EINVAL; | 1560 | return -EINVAL; |
1563 | else | 1561 | else |
1564 | return 0; | 1562 | return 0; |
1565 | } | 1563 | } |
1566 | 1564 | ||
1567 | /* | 1565 | /* |
1568 | * Unshare vm if it is being shared | 1566 | * Unshare vm if it is being shared |
1569 | */ | 1567 | */ |
1570 | static int unshare_vm(unsigned long unshare_flags, struct mm_struct **new_mmp) | 1568 | static int unshare_vm(unsigned long unshare_flags, struct mm_struct **new_mmp) |
1571 | { | 1569 | { |
1572 | struct mm_struct *mm = current->mm; | 1570 | struct mm_struct *mm = current->mm; |
1573 | 1571 | ||
1574 | if ((unshare_flags & CLONE_VM) && | 1572 | if ((unshare_flags & CLONE_VM) && |
1575 | (mm && atomic_read(&mm->mm_users) > 1)) { | 1573 | (mm && atomic_read(&mm->mm_users) > 1)) { |
1576 | return -EINVAL; | 1574 | return -EINVAL; |
1577 | } | 1575 | } |
1578 | 1576 | ||
1579 | return 0; | 1577 | return 0; |
1580 | } | 1578 | } |
1581 | 1579 | ||
1582 | /* | 1580 | /* |
1583 | * Unshare file descriptor table if it is being shared | 1581 | * Unshare file descriptor table if it is being shared |
1584 | */ | 1582 | */ |
1585 | static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp) | 1583 | static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp) |
1586 | { | 1584 | { |
1587 | struct files_struct *fd = current->files; | 1585 | struct files_struct *fd = current->files; |
1588 | int error = 0; | 1586 | int error = 0; |
1589 | 1587 | ||
1590 | if ((unshare_flags & CLONE_FILES) && | 1588 | if ((unshare_flags & CLONE_FILES) && |
1591 | (fd && atomic_read(&fd->count) > 1)) { | 1589 | (fd && atomic_read(&fd->count) > 1)) { |
1592 | *new_fdp = dup_fd(fd, &error); | 1590 | *new_fdp = dup_fd(fd, &error); |
1593 | if (!*new_fdp) | 1591 | if (!*new_fdp) |
1594 | return error; | 1592 | return error; |
1595 | } | 1593 | } |
1596 | 1594 | ||
1597 | return 0; | 1595 | return 0; |
1598 | } | 1596 | } |
1599 | 1597 | ||
1600 | /* | 1598 | /* |
1601 | * unshare allows a process to 'unshare' part of the process | 1599 | * unshare allows a process to 'unshare' part of the process |
1602 | * context which was originally shared using clone. copy_* | 1600 | * context which was originally shared using clone. copy_* |
1603 | * functions used by do_fork() cannot be used here directly | 1601 | * functions used by do_fork() cannot be used here directly |
1604 | * because they modify an inactive task_struct that is being | 1602 | * because they modify an inactive task_struct that is being |
1605 | * constructed. Here we are modifying the current, active, | 1603 | * constructed. Here we are modifying the current, active, |
1606 | * task_struct. | 1604 | * task_struct. |
1607 | */ | 1605 | */ |
1608 | asmlinkage long sys_unshare(unsigned long unshare_flags) | 1606 | asmlinkage long sys_unshare(unsigned long unshare_flags) |
1609 | { | 1607 | { |
1610 | int err = 0; | 1608 | int err = 0; |
1611 | struct fs_struct *fs, *new_fs = NULL; | 1609 | struct fs_struct *fs, *new_fs = NULL; |
1612 | struct sighand_struct *new_sigh = NULL; | 1610 | struct sighand_struct *new_sigh = NULL; |
1613 | struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; | 1611 | struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; |
1614 | struct files_struct *fd, *new_fd = NULL; | 1612 | struct files_struct *fd, *new_fd = NULL; |
1615 | struct nsproxy *new_nsproxy = NULL; | 1613 | struct nsproxy *new_nsproxy = NULL; |
1616 | int do_sysvsem = 0; | 1614 | int do_sysvsem = 0; |
1617 | 1615 | ||
1618 | check_unshare_flags(&unshare_flags); | 1616 | check_unshare_flags(&unshare_flags); |
1619 | 1617 | ||
1620 | /* Return -EINVAL for all unsupported flags */ | 1618 | /* Return -EINVAL for all unsupported flags */ |
1621 | err = -EINVAL; | 1619 | err = -EINVAL; |
1622 | if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| | 1620 | if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| |
1623 | CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| | 1621 | CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| |
1624 | CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET)) | 1622 | CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET)) |
1625 | goto bad_unshare_out; | 1623 | goto bad_unshare_out; |
1626 | 1624 | ||
1627 | /* | 1625 | /* |
1628 | * CLONE_NEWIPC must also detach from the undolist: after switching | 1626 | * CLONE_NEWIPC must also detach from the undolist: after switching |
1629 | * to a new ipc namespace, the semaphore arrays from the old | 1627 | * to a new ipc namespace, the semaphore arrays from the old |
1630 | * namespace are unreachable. | 1628 | * namespace are unreachable. |
1631 | */ | 1629 | */ |
1632 | if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM)) | 1630 | if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM)) |
1633 | do_sysvsem = 1; | 1631 | do_sysvsem = 1; |
1634 | if ((err = unshare_thread(unshare_flags))) | 1632 | if ((err = unshare_thread(unshare_flags))) |
1635 | goto bad_unshare_out; | 1633 | goto bad_unshare_out; |
1636 | if ((err = unshare_fs(unshare_flags, &new_fs))) | 1634 | if ((err = unshare_fs(unshare_flags, &new_fs))) |
1637 | goto bad_unshare_cleanup_thread; | 1635 | goto bad_unshare_cleanup_thread; |
1638 | if ((err = unshare_sighand(unshare_flags, &new_sigh))) | 1636 | if ((err = unshare_sighand(unshare_flags, &new_sigh))) |
1639 | goto bad_unshare_cleanup_fs; | 1637 | goto bad_unshare_cleanup_fs; |
1640 | if ((err = unshare_vm(unshare_flags, &new_mm))) | 1638 | if ((err = unshare_vm(unshare_flags, &new_mm))) |
1641 | goto bad_unshare_cleanup_sigh; | 1639 | goto bad_unshare_cleanup_sigh; |
1642 | if ((err = unshare_fd(unshare_flags, &new_fd))) | 1640 | if ((err = unshare_fd(unshare_flags, &new_fd))) |
1643 | goto bad_unshare_cleanup_vm; | 1641 | goto bad_unshare_cleanup_vm; |
1644 | if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, | 1642 | if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, |
1645 | new_fs))) | 1643 | new_fs))) |
1646 | goto bad_unshare_cleanup_fd; | 1644 | goto bad_unshare_cleanup_fd; |
1647 | 1645 | ||
1648 | if (new_fs || new_mm || new_fd || do_sysvsem || new_nsproxy) { | 1646 | if (new_fs || new_mm || new_fd || do_sysvsem || new_nsproxy) { |
1649 | if (do_sysvsem) { | 1647 | if (do_sysvsem) { |
1650 | /* | 1648 | /* |
1651 | * CLONE_SYSVSEM is equivalent to sys_exit(). | 1649 | * CLONE_SYSVSEM is equivalent to sys_exit(). |
1652 | */ | 1650 | */ |
1653 | exit_sem(current); | 1651 | exit_sem(current); |
1654 | } | 1652 | } |
1655 | 1653 | ||
1656 | if (new_nsproxy) { | 1654 | if (new_nsproxy) { |
1657 | switch_task_namespaces(current, new_nsproxy); | 1655 | switch_task_namespaces(current, new_nsproxy); |
1658 | new_nsproxy = NULL; | 1656 | new_nsproxy = NULL; |
1659 | } | 1657 | } |
1660 | 1658 | ||
1661 | task_lock(current); | 1659 | task_lock(current); |
1662 | 1660 | ||
1663 | if (new_fs) { | 1661 | if (new_fs) { |
1664 | fs = current->fs; | 1662 | fs = current->fs; |
1665 | current->fs = new_fs; | 1663 | current->fs = new_fs; |
1666 | new_fs = fs; | 1664 | new_fs = fs; |
1667 | } | 1665 | } |
1668 | 1666 | ||
1669 | if (new_mm) { | 1667 | if (new_mm) { |
1670 | mm = current->mm; | 1668 | mm = current->mm; |
1671 | active_mm = current->active_mm; | 1669 | active_mm = current->active_mm; |
1672 | current->mm = new_mm; | 1670 | current->mm = new_mm; |
1673 | current->active_mm = new_mm; | 1671 | current->active_mm = new_mm; |
1674 | activate_mm(active_mm, new_mm); | 1672 | activate_mm(active_mm, new_mm); |
1675 | new_mm = mm; | 1673 | new_mm = mm; |
1676 | } | 1674 | } |
1677 | 1675 | ||
1678 | if (new_fd) { | 1676 | if (new_fd) { |
1679 | fd = current->files; | 1677 | fd = current->files; |
1680 | current->files = new_fd; | 1678 | current->files = new_fd; |
1681 | new_fd = fd; | 1679 | new_fd = fd; |
1682 | } | 1680 | } |
1683 | 1681 | ||
1684 | task_unlock(current); | 1682 | task_unlock(current); |
1685 | } | 1683 | } |
1686 | 1684 | ||
1687 | if (new_nsproxy) | 1685 | if (new_nsproxy) |
1688 | put_nsproxy(new_nsproxy); | 1686 | put_nsproxy(new_nsproxy); |
1689 | 1687 | ||
1690 | bad_unshare_cleanup_fd: | 1688 | bad_unshare_cleanup_fd: |
1691 | if (new_fd) | 1689 | if (new_fd) |
1692 | put_files_struct(new_fd); | 1690 | put_files_struct(new_fd); |
1693 | 1691 | ||
1694 | bad_unshare_cleanup_vm: | 1692 | bad_unshare_cleanup_vm: |
1695 | if (new_mm) | 1693 | if (new_mm) |
1696 | mmput(new_mm); | 1694 | mmput(new_mm); |
1697 | 1695 | ||
1698 | bad_unshare_cleanup_sigh: | 1696 | bad_unshare_cleanup_sigh: |
1699 | if (new_sigh) | 1697 | if (new_sigh) |
1700 | if (atomic_dec_and_test(&new_sigh->count)) | 1698 | if (atomic_dec_and_test(&new_sigh->count)) |
1701 | kmem_cache_free(sighand_cachep, new_sigh); | 1699 | kmem_cache_free(sighand_cachep, new_sigh); |
1702 | 1700 | ||
1703 | bad_unshare_cleanup_fs: | 1701 | bad_unshare_cleanup_fs: |
1704 | if (new_fs) | 1702 | if (new_fs) |
1705 | put_fs_struct(new_fs); | 1703 | put_fs_struct(new_fs); |
1706 | 1704 | ||
1707 | bad_unshare_cleanup_thread: | 1705 | bad_unshare_cleanup_thread: |
1708 | bad_unshare_out: | 1706 | bad_unshare_out: |
1709 | return err; | 1707 | return err; |
1710 | } | 1708 | } |
1711 | 1709 | ||
1712 | /* | 1710 | /* |
1713 | * Helper to unshare the files of the current task. | 1711 | * Helper to unshare the files of the current task. |
1714 | * We don't want to expose copy_files internals to | 1712 | * We don't want to expose copy_files internals to |
1715 | * the exec layer of the kernel. | 1713 | * the exec layer of the kernel. |
1716 | */ | 1714 | */ |
1717 | 1715 | ||
1718 | int unshare_files(struct files_struct **displaced) | 1716 | int unshare_files(struct files_struct **displaced) |
1719 | { | 1717 | { |
1720 | struct task_struct *task = current; | 1718 | struct task_struct *task = current; |
1721 | struct files_struct *copy = NULL; | 1719 | struct files_struct *copy = NULL; |
1722 | int error; | 1720 | int error; |
1723 | 1721 | ||
1724 | error = unshare_fd(CLONE_FILES, ©); | 1722 | error = unshare_fd(CLONE_FILES, ©); |
1725 | if (error || !copy) { | 1723 | if (error || !copy) { |
1726 | *displaced = NULL; | 1724 | *displaced = NULL; |
1727 | return error; | 1725 | return error; |
1728 | } | 1726 | } |
1729 | *displaced = task->files; | 1727 | *displaced = task->files; |
1730 | task_lock(task); | 1728 | task_lock(task); |
1731 | task->files = copy; | 1729 | task->files = copy; |
1732 | task_unlock(task); | 1730 | task_unlock(task); |
1733 | return 0; | 1731 | return 0; |
1734 | } | 1732 | } |
kernel/sysctl.c
1 | /* | 1 | /* |
2 | * sysctl.c: General linux system control interface | 2 | * sysctl.c: General linux system control interface |
3 | * | 3 | * |
4 | * Begun 24 March 1995, Stephen Tweedie | 4 | * Begun 24 March 1995, Stephen Tweedie |
5 | * Added /proc support, Dec 1995 | 5 | * Added /proc support, Dec 1995 |
6 | * Added bdflush entry and intvec min/max checking, 2/23/96, Tom Dyas. | 6 | * Added bdflush entry and intvec min/max checking, 2/23/96, Tom Dyas. |
7 | * Added hooks for /proc/sys/net (minor, minor patch), 96/4/1, Mike Shaver. | 7 | * Added hooks for /proc/sys/net (minor, minor patch), 96/4/1, Mike Shaver. |
8 | * Added kernel/java-{interpreter,appletviewer}, 96/5/10, Mike Shaver. | 8 | * Added kernel/java-{interpreter,appletviewer}, 96/5/10, Mike Shaver. |
9 | * Dynamic registration fixes, Stephen Tweedie. | 9 | * Dynamic registration fixes, Stephen Tweedie. |
10 | * Added kswapd-interval, ctrl-alt-del, printk stuff, 1/8/97, Chris Horn. | 10 | * Added kswapd-interval, ctrl-alt-del, printk stuff, 1/8/97, Chris Horn. |
11 | * Made sysctl support optional via CONFIG_SYSCTL, 1/10/97, Chris | 11 | * Made sysctl support optional via CONFIG_SYSCTL, 1/10/97, Chris |
12 | * Horn. | 12 | * Horn. |
13 | * Added proc_doulongvec_ms_jiffies_minmax, 09/08/99, Carlos H. Bauer. | 13 | * Added proc_doulongvec_ms_jiffies_minmax, 09/08/99, Carlos H. Bauer. |
14 | * Added proc_doulongvec_minmax, 09/08/99, Carlos H. Bauer. | 14 | * Added proc_doulongvec_minmax, 09/08/99, Carlos H. Bauer. |
15 | * Changed linked lists to use list.h instead of lists.h, 02/24/00, Bill | 15 | * Changed linked lists to use list.h instead of lists.h, 02/24/00, Bill |
16 | * Wendling. | 16 | * Wendling. |
17 | * The list_for_each() macro wasn't appropriate for the sysctl loop. | 17 | * The list_for_each() macro wasn't appropriate for the sysctl loop. |
18 | * Removed it and replaced it with older style, 03/23/00, Bill Wendling | 18 | * Removed it and replaced it with older style, 03/23/00, Bill Wendling |
19 | */ | 19 | */ |
20 | 20 | ||
21 | #include <linux/module.h> | 21 | #include <linux/module.h> |
22 | #include <linux/mm.h> | 22 | #include <linux/mm.h> |
23 | #include <linux/swap.h> | 23 | #include <linux/swap.h> |
24 | #include <linux/slab.h> | 24 | #include <linux/slab.h> |
25 | #include <linux/sysctl.h> | 25 | #include <linux/sysctl.h> |
26 | #include <linux/proc_fs.h> | 26 | #include <linux/proc_fs.h> |
27 | #include <linux/security.h> | 27 | #include <linux/security.h> |
28 | #include <linux/ctype.h> | 28 | #include <linux/ctype.h> |
29 | #include <linux/utsname.h> | 29 | #include <linux/utsname.h> |
30 | #include <linux/smp_lock.h> | 30 | #include <linux/smp_lock.h> |
31 | #include <linux/fs.h> | 31 | #include <linux/fs.h> |
32 | #include <linux/init.h> | 32 | #include <linux/init.h> |
33 | #include <linux/kernel.h> | 33 | #include <linux/kernel.h> |
34 | #include <linux/kobject.h> | 34 | #include <linux/kobject.h> |
35 | #include <linux/net.h> | 35 | #include <linux/net.h> |
36 | #include <linux/sysrq.h> | 36 | #include <linux/sysrq.h> |
37 | #include <linux/highuid.h> | 37 | #include <linux/highuid.h> |
38 | #include <linux/writeback.h> | 38 | #include <linux/writeback.h> |
39 | #include <linux/hugetlb.h> | 39 | #include <linux/hugetlb.h> |
40 | #include <linux/initrd.h> | 40 | #include <linux/initrd.h> |
41 | #include <linux/key.h> | 41 | #include <linux/key.h> |
42 | #include <linux/times.h> | 42 | #include <linux/times.h> |
43 | #include <linux/limits.h> | 43 | #include <linux/limits.h> |
44 | #include <linux/dcache.h> | 44 | #include <linux/dcache.h> |
45 | #include <linux/syscalls.h> | 45 | #include <linux/syscalls.h> |
46 | #include <linux/vmstat.h> | 46 | #include <linux/vmstat.h> |
47 | #include <linux/nfs_fs.h> | 47 | #include <linux/nfs_fs.h> |
48 | #include <linux/acpi.h> | 48 | #include <linux/acpi.h> |
49 | #include <linux/reboot.h> | 49 | #include <linux/reboot.h> |
50 | #include <linux/ftrace.h> | 50 | #include <linux/ftrace.h> |
51 | 51 | ||
52 | #include <asm/uaccess.h> | 52 | #include <asm/uaccess.h> |
53 | #include <asm/processor.h> | 53 | #include <asm/processor.h> |
54 | 54 | ||
55 | #ifdef CONFIG_X86 | 55 | #ifdef CONFIG_X86 |
56 | #include <asm/nmi.h> | 56 | #include <asm/nmi.h> |
57 | #include <asm/stacktrace.h> | 57 | #include <asm/stacktrace.h> |
58 | #include <asm/io.h> | 58 | #include <asm/io.h> |
59 | #endif | 59 | #endif |
60 | 60 | ||
61 | static int deprecated_sysctl_warning(struct __sysctl_args *args); | 61 | static int deprecated_sysctl_warning(struct __sysctl_args *args); |
62 | 62 | ||
63 | #if defined(CONFIG_SYSCTL) | 63 | #if defined(CONFIG_SYSCTL) |
64 | 64 | ||
65 | /* External variables not in a header file. */ | 65 | /* External variables not in a header file. */ |
66 | extern int C_A_D; | 66 | extern int C_A_D; |
67 | extern int print_fatal_signals; | 67 | extern int print_fatal_signals; |
68 | extern int sysctl_overcommit_memory; | 68 | extern int sysctl_overcommit_memory; |
69 | extern int sysctl_overcommit_ratio; | 69 | extern int sysctl_overcommit_ratio; |
70 | extern int sysctl_panic_on_oom; | 70 | extern int sysctl_panic_on_oom; |
71 | extern int sysctl_oom_kill_allocating_task; | 71 | extern int sysctl_oom_kill_allocating_task; |
72 | extern int sysctl_oom_dump_tasks; | 72 | extern int sysctl_oom_dump_tasks; |
73 | extern int max_threads; | 73 | extern int max_threads; |
74 | extern int core_uses_pid; | 74 | extern int core_uses_pid; |
75 | extern int suid_dumpable; | 75 | extern int suid_dumpable; |
76 | extern char core_pattern[]; | 76 | extern char core_pattern[]; |
77 | extern int pid_max; | 77 | extern int pid_max; |
78 | extern int min_free_kbytes; | 78 | extern int min_free_kbytes; |
79 | extern int pid_max_min, pid_max_max; | 79 | extern int pid_max_min, pid_max_max; |
80 | extern int sysctl_drop_caches; | 80 | extern int sysctl_drop_caches; |
81 | extern int percpu_pagelist_fraction; | 81 | extern int percpu_pagelist_fraction; |
82 | extern int compat_log; | 82 | extern int compat_log; |
83 | extern int latencytop_enabled; | 83 | extern int latencytop_enabled; |
84 | extern int sysctl_nr_open_min, sysctl_nr_open_max; | 84 | extern int sysctl_nr_open_min, sysctl_nr_open_max; |
85 | #ifndef CONFIG_MMU | ||
86 | extern int sysctl_nr_trim_pages; | ||
87 | #endif | ||
85 | #ifdef CONFIG_RCU_TORTURE_TEST | 88 | #ifdef CONFIG_RCU_TORTURE_TEST |
86 | extern int rcutorture_runnable; | 89 | extern int rcutorture_runnable; |
87 | #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ | 90 | #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ |
88 | 91 | ||
89 | /* Constants used for minimum and maximum */ | 92 | /* Constants used for minimum and maximum */ |
90 | #ifdef CONFIG_DETECT_SOFTLOCKUP | 93 | #ifdef CONFIG_DETECT_SOFTLOCKUP |
91 | static int sixty = 60; | 94 | static int sixty = 60; |
92 | static int neg_one = -1; | 95 | static int neg_one = -1; |
93 | #endif | 96 | #endif |
94 | 97 | ||
95 | #if defined(CONFIG_MMU) && defined(CONFIG_FILE_LOCKING) | 98 | #if defined(CONFIG_MMU) && defined(CONFIG_FILE_LOCKING) |
96 | static int two = 2; | 99 | static int two = 2; |
97 | #endif | 100 | #endif |
98 | 101 | ||
99 | static int zero; | 102 | static int zero; |
100 | static int one = 1; | 103 | static int one = 1; |
101 | static int one_hundred = 100; | 104 | static int one_hundred = 100; |
102 | 105 | ||
103 | /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ | 106 | /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ |
104 | static int maxolduid = 65535; | 107 | static int maxolduid = 65535; |
105 | static int minolduid; | 108 | static int minolduid; |
106 | static int min_percpu_pagelist_fract = 8; | 109 | static int min_percpu_pagelist_fract = 8; |
107 | 110 | ||
108 | static int ngroups_max = NGROUPS_MAX; | 111 | static int ngroups_max = NGROUPS_MAX; |
109 | 112 | ||
110 | #ifdef CONFIG_MODULES | 113 | #ifdef CONFIG_MODULES |
111 | extern char modprobe_path[]; | 114 | extern char modprobe_path[]; |
112 | #endif | 115 | #endif |
113 | #ifdef CONFIG_CHR_DEV_SG | 116 | #ifdef CONFIG_CHR_DEV_SG |
114 | extern int sg_big_buff; | 117 | extern int sg_big_buff; |
115 | #endif | 118 | #endif |
116 | 119 | ||
117 | #ifdef CONFIG_SPARC | 120 | #ifdef CONFIG_SPARC |
118 | #include <asm/system.h> | 121 | #include <asm/system.h> |
119 | #endif | 122 | #endif |
120 | 123 | ||
121 | #ifdef CONFIG_SPARC64 | 124 | #ifdef CONFIG_SPARC64 |
122 | extern int sysctl_tsb_ratio; | 125 | extern int sysctl_tsb_ratio; |
123 | #endif | 126 | #endif |
124 | 127 | ||
125 | #ifdef __hppa__ | 128 | #ifdef __hppa__ |
126 | extern int pwrsw_enabled; | 129 | extern int pwrsw_enabled; |
127 | extern int unaligned_enabled; | 130 | extern int unaligned_enabled; |
128 | #endif | 131 | #endif |
129 | 132 | ||
130 | #ifdef CONFIG_S390 | 133 | #ifdef CONFIG_S390 |
131 | #ifdef CONFIG_MATHEMU | 134 | #ifdef CONFIG_MATHEMU |
132 | extern int sysctl_ieee_emulation_warnings; | 135 | extern int sysctl_ieee_emulation_warnings; |
133 | #endif | 136 | #endif |
134 | extern int sysctl_userprocess_debug; | 137 | extern int sysctl_userprocess_debug; |
135 | extern int spin_retry; | 138 | extern int spin_retry; |
136 | #endif | 139 | #endif |
137 | 140 | ||
138 | #ifdef CONFIG_BSD_PROCESS_ACCT | 141 | #ifdef CONFIG_BSD_PROCESS_ACCT |
139 | extern int acct_parm[]; | 142 | extern int acct_parm[]; |
140 | #endif | 143 | #endif |
141 | 144 | ||
142 | #ifdef CONFIG_IA64 | 145 | #ifdef CONFIG_IA64 |
143 | extern int no_unaligned_warning; | 146 | extern int no_unaligned_warning; |
144 | #endif | 147 | #endif |
145 | 148 | ||
146 | #ifdef CONFIG_RT_MUTEXES | 149 | #ifdef CONFIG_RT_MUTEXES |
147 | extern int max_lock_depth; | 150 | extern int max_lock_depth; |
148 | #endif | 151 | #endif |
149 | 152 | ||
150 | #ifdef CONFIG_PROC_SYSCTL | 153 | #ifdef CONFIG_PROC_SYSCTL |
151 | static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp, | 154 | static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp, |
152 | void __user *buffer, size_t *lenp, loff_t *ppos); | 155 | void __user *buffer, size_t *lenp, loff_t *ppos); |
153 | static int proc_taint(struct ctl_table *table, int write, struct file *filp, | 156 | static int proc_taint(struct ctl_table *table, int write, struct file *filp, |
154 | void __user *buffer, size_t *lenp, loff_t *ppos); | 157 | void __user *buffer, size_t *lenp, loff_t *ppos); |
155 | #endif | 158 | #endif |
156 | 159 | ||
157 | static struct ctl_table root_table[]; | 160 | static struct ctl_table root_table[]; |
158 | static struct ctl_table_root sysctl_table_root; | 161 | static struct ctl_table_root sysctl_table_root; |
159 | static struct ctl_table_header root_table_header = { | 162 | static struct ctl_table_header root_table_header = { |
160 | .count = 1, | 163 | .count = 1, |
161 | .ctl_table = root_table, | 164 | .ctl_table = root_table, |
162 | .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list), | 165 | .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list), |
163 | .root = &sysctl_table_root, | 166 | .root = &sysctl_table_root, |
164 | .set = &sysctl_table_root.default_set, | 167 | .set = &sysctl_table_root.default_set, |
165 | }; | 168 | }; |
166 | static struct ctl_table_root sysctl_table_root = { | 169 | static struct ctl_table_root sysctl_table_root = { |
167 | .root_list = LIST_HEAD_INIT(sysctl_table_root.root_list), | 170 | .root_list = LIST_HEAD_INIT(sysctl_table_root.root_list), |
168 | .default_set.list = LIST_HEAD_INIT(root_table_header.ctl_entry), | 171 | .default_set.list = LIST_HEAD_INIT(root_table_header.ctl_entry), |
169 | }; | 172 | }; |
170 | 173 | ||
171 | static struct ctl_table kern_table[]; | 174 | static struct ctl_table kern_table[]; |
172 | static struct ctl_table vm_table[]; | 175 | static struct ctl_table vm_table[]; |
173 | static struct ctl_table fs_table[]; | 176 | static struct ctl_table fs_table[]; |
174 | static struct ctl_table debug_table[]; | 177 | static struct ctl_table debug_table[]; |
175 | static struct ctl_table dev_table[]; | 178 | static struct ctl_table dev_table[]; |
176 | extern struct ctl_table random_table[]; | 179 | extern struct ctl_table random_table[]; |
177 | #ifdef CONFIG_INOTIFY_USER | 180 | #ifdef CONFIG_INOTIFY_USER |
178 | extern struct ctl_table inotify_table[]; | 181 | extern struct ctl_table inotify_table[]; |
179 | #endif | 182 | #endif |
180 | #ifdef CONFIG_EPOLL | 183 | #ifdef CONFIG_EPOLL |
181 | extern struct ctl_table epoll_table[]; | 184 | extern struct ctl_table epoll_table[]; |
182 | #endif | 185 | #endif |
183 | 186 | ||
184 | #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT | 187 | #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT |
185 | int sysctl_legacy_va_layout; | 188 | int sysctl_legacy_va_layout; |
186 | #endif | 189 | #endif |
187 | 190 | ||
188 | extern int prove_locking; | 191 | extern int prove_locking; |
189 | extern int lock_stat; | 192 | extern int lock_stat; |
190 | 193 | ||
191 | /* The default sysctl tables: */ | 194 | /* The default sysctl tables: */ |
192 | 195 | ||
193 | static struct ctl_table root_table[] = { | 196 | static struct ctl_table root_table[] = { |
194 | { | 197 | { |
195 | .ctl_name = CTL_KERN, | 198 | .ctl_name = CTL_KERN, |
196 | .procname = "kernel", | 199 | .procname = "kernel", |
197 | .mode = 0555, | 200 | .mode = 0555, |
198 | .child = kern_table, | 201 | .child = kern_table, |
199 | }, | 202 | }, |
200 | { | 203 | { |
201 | .ctl_name = CTL_VM, | 204 | .ctl_name = CTL_VM, |
202 | .procname = "vm", | 205 | .procname = "vm", |
203 | .mode = 0555, | 206 | .mode = 0555, |
204 | .child = vm_table, | 207 | .child = vm_table, |
205 | }, | 208 | }, |
206 | { | 209 | { |
207 | .ctl_name = CTL_FS, | 210 | .ctl_name = CTL_FS, |
208 | .procname = "fs", | 211 | .procname = "fs", |
209 | .mode = 0555, | 212 | .mode = 0555, |
210 | .child = fs_table, | 213 | .child = fs_table, |
211 | }, | 214 | }, |
212 | { | 215 | { |
213 | .ctl_name = CTL_DEBUG, | 216 | .ctl_name = CTL_DEBUG, |
214 | .procname = "debug", | 217 | .procname = "debug", |
215 | .mode = 0555, | 218 | .mode = 0555, |
216 | .child = debug_table, | 219 | .child = debug_table, |
217 | }, | 220 | }, |
218 | { | 221 | { |
219 | .ctl_name = CTL_DEV, | 222 | .ctl_name = CTL_DEV, |
220 | .procname = "dev", | 223 | .procname = "dev", |
221 | .mode = 0555, | 224 | .mode = 0555, |
222 | .child = dev_table, | 225 | .child = dev_table, |
223 | }, | 226 | }, |
224 | /* | 227 | /* |
225 | * NOTE: do not add new entries to this table unless you have read | 228 | * NOTE: do not add new entries to this table unless you have read |
226 | * Documentation/sysctl/ctl_unnumbered.txt | 229 | * Documentation/sysctl/ctl_unnumbered.txt |
227 | */ | 230 | */ |
228 | { .ctl_name = 0 } | 231 | { .ctl_name = 0 } |
229 | }; | 232 | }; |
230 | 233 | ||
#ifdef CONFIG_SCHED_DEBUG
/*
 * Clamp values (extra1/extra2) for the scheduler granularity sysctls in
 * kern_table below.  All are in nanoseconds.
 */
static int min_sched_granularity_ns = 100000;		/* 100 usecs */
static int max_sched_granularity_ns = NSEC_PER_SEC;	/* 1 second */
static int min_wakeup_granularity_ns;			/* 0 usecs */
static int max_wakeup_granularity_ns = NSEC_PER_SEC;	/* 1 second */
#endif
237 | 240 | ||
238 | static struct ctl_table kern_table[] = { | 241 | static struct ctl_table kern_table[] = { |
239 | #ifdef CONFIG_SCHED_DEBUG | 242 | #ifdef CONFIG_SCHED_DEBUG |
240 | { | 243 | { |
241 | .ctl_name = CTL_UNNUMBERED, | 244 | .ctl_name = CTL_UNNUMBERED, |
242 | .procname = "sched_min_granularity_ns", | 245 | .procname = "sched_min_granularity_ns", |
243 | .data = &sysctl_sched_min_granularity, | 246 | .data = &sysctl_sched_min_granularity, |
244 | .maxlen = sizeof(unsigned int), | 247 | .maxlen = sizeof(unsigned int), |
245 | .mode = 0644, | 248 | .mode = 0644, |
246 | .proc_handler = &sched_nr_latency_handler, | 249 | .proc_handler = &sched_nr_latency_handler, |
247 | .strategy = &sysctl_intvec, | 250 | .strategy = &sysctl_intvec, |
248 | .extra1 = &min_sched_granularity_ns, | 251 | .extra1 = &min_sched_granularity_ns, |
249 | .extra2 = &max_sched_granularity_ns, | 252 | .extra2 = &max_sched_granularity_ns, |
250 | }, | 253 | }, |
251 | { | 254 | { |
252 | .ctl_name = CTL_UNNUMBERED, | 255 | .ctl_name = CTL_UNNUMBERED, |
253 | .procname = "sched_latency_ns", | 256 | .procname = "sched_latency_ns", |
254 | .data = &sysctl_sched_latency, | 257 | .data = &sysctl_sched_latency, |
255 | .maxlen = sizeof(unsigned int), | 258 | .maxlen = sizeof(unsigned int), |
256 | .mode = 0644, | 259 | .mode = 0644, |
257 | .proc_handler = &sched_nr_latency_handler, | 260 | .proc_handler = &sched_nr_latency_handler, |
258 | .strategy = &sysctl_intvec, | 261 | .strategy = &sysctl_intvec, |
259 | .extra1 = &min_sched_granularity_ns, | 262 | .extra1 = &min_sched_granularity_ns, |
260 | .extra2 = &max_sched_granularity_ns, | 263 | .extra2 = &max_sched_granularity_ns, |
261 | }, | 264 | }, |
262 | { | 265 | { |
263 | .ctl_name = CTL_UNNUMBERED, | 266 | .ctl_name = CTL_UNNUMBERED, |
264 | .procname = "sched_wakeup_granularity_ns", | 267 | .procname = "sched_wakeup_granularity_ns", |
265 | .data = &sysctl_sched_wakeup_granularity, | 268 | .data = &sysctl_sched_wakeup_granularity, |
266 | .maxlen = sizeof(unsigned int), | 269 | .maxlen = sizeof(unsigned int), |
267 | .mode = 0644, | 270 | .mode = 0644, |
268 | .proc_handler = &proc_dointvec_minmax, | 271 | .proc_handler = &proc_dointvec_minmax, |
269 | .strategy = &sysctl_intvec, | 272 | .strategy = &sysctl_intvec, |
270 | .extra1 = &min_wakeup_granularity_ns, | 273 | .extra1 = &min_wakeup_granularity_ns, |
271 | .extra2 = &max_wakeup_granularity_ns, | 274 | .extra2 = &max_wakeup_granularity_ns, |
272 | }, | 275 | }, |
273 | { | 276 | { |
274 | .ctl_name = CTL_UNNUMBERED, | 277 | .ctl_name = CTL_UNNUMBERED, |
275 | .procname = "sched_shares_ratelimit", | 278 | .procname = "sched_shares_ratelimit", |
276 | .data = &sysctl_sched_shares_ratelimit, | 279 | .data = &sysctl_sched_shares_ratelimit, |
277 | .maxlen = sizeof(unsigned int), | 280 | .maxlen = sizeof(unsigned int), |
278 | .mode = 0644, | 281 | .mode = 0644, |
279 | .proc_handler = &proc_dointvec, | 282 | .proc_handler = &proc_dointvec, |
280 | }, | 283 | }, |
281 | { | 284 | { |
282 | .ctl_name = CTL_UNNUMBERED, | 285 | .ctl_name = CTL_UNNUMBERED, |
283 | .procname = "sched_shares_thresh", | 286 | .procname = "sched_shares_thresh", |
284 | .data = &sysctl_sched_shares_thresh, | 287 | .data = &sysctl_sched_shares_thresh, |
285 | .maxlen = sizeof(unsigned int), | 288 | .maxlen = sizeof(unsigned int), |
286 | .mode = 0644, | 289 | .mode = 0644, |
287 | .proc_handler = &proc_dointvec_minmax, | 290 | .proc_handler = &proc_dointvec_minmax, |
288 | .strategy = &sysctl_intvec, | 291 | .strategy = &sysctl_intvec, |
289 | .extra1 = &zero, | 292 | .extra1 = &zero, |
290 | }, | 293 | }, |
291 | { | 294 | { |
292 | .ctl_name = CTL_UNNUMBERED, | 295 | .ctl_name = CTL_UNNUMBERED, |
293 | .procname = "sched_child_runs_first", | 296 | .procname = "sched_child_runs_first", |
294 | .data = &sysctl_sched_child_runs_first, | 297 | .data = &sysctl_sched_child_runs_first, |
295 | .maxlen = sizeof(unsigned int), | 298 | .maxlen = sizeof(unsigned int), |
296 | .mode = 0644, | 299 | .mode = 0644, |
297 | .proc_handler = &proc_dointvec, | 300 | .proc_handler = &proc_dointvec, |
298 | }, | 301 | }, |
299 | { | 302 | { |
300 | .ctl_name = CTL_UNNUMBERED, | 303 | .ctl_name = CTL_UNNUMBERED, |
301 | .procname = "sched_features", | 304 | .procname = "sched_features", |
302 | .data = &sysctl_sched_features, | 305 | .data = &sysctl_sched_features, |
303 | .maxlen = sizeof(unsigned int), | 306 | .maxlen = sizeof(unsigned int), |
304 | .mode = 0644, | 307 | .mode = 0644, |
305 | .proc_handler = &proc_dointvec, | 308 | .proc_handler = &proc_dointvec, |
306 | }, | 309 | }, |
307 | { | 310 | { |
308 | .ctl_name = CTL_UNNUMBERED, | 311 | .ctl_name = CTL_UNNUMBERED, |
309 | .procname = "sched_migration_cost", | 312 | .procname = "sched_migration_cost", |
310 | .data = &sysctl_sched_migration_cost, | 313 | .data = &sysctl_sched_migration_cost, |
311 | .maxlen = sizeof(unsigned int), | 314 | .maxlen = sizeof(unsigned int), |
312 | .mode = 0644, | 315 | .mode = 0644, |
313 | .proc_handler = &proc_dointvec, | 316 | .proc_handler = &proc_dointvec, |
314 | }, | 317 | }, |
315 | { | 318 | { |
316 | .ctl_name = CTL_UNNUMBERED, | 319 | .ctl_name = CTL_UNNUMBERED, |
317 | .procname = "sched_nr_migrate", | 320 | .procname = "sched_nr_migrate", |
318 | .data = &sysctl_sched_nr_migrate, | 321 | .data = &sysctl_sched_nr_migrate, |
319 | .maxlen = sizeof(unsigned int), | 322 | .maxlen = sizeof(unsigned int), |
320 | .mode = 0644, | 323 | .mode = 0644, |
321 | .proc_handler = &proc_dointvec, | 324 | .proc_handler = &proc_dointvec, |
322 | }, | 325 | }, |
323 | #endif | 326 | #endif |
324 | { | 327 | { |
325 | .ctl_name = CTL_UNNUMBERED, | 328 | .ctl_name = CTL_UNNUMBERED, |
326 | .procname = "sched_rt_period_us", | 329 | .procname = "sched_rt_period_us", |
327 | .data = &sysctl_sched_rt_period, | 330 | .data = &sysctl_sched_rt_period, |
328 | .maxlen = sizeof(unsigned int), | 331 | .maxlen = sizeof(unsigned int), |
329 | .mode = 0644, | 332 | .mode = 0644, |
330 | .proc_handler = &sched_rt_handler, | 333 | .proc_handler = &sched_rt_handler, |
331 | }, | 334 | }, |
332 | { | 335 | { |
333 | .ctl_name = CTL_UNNUMBERED, | 336 | .ctl_name = CTL_UNNUMBERED, |
334 | .procname = "sched_rt_runtime_us", | 337 | .procname = "sched_rt_runtime_us", |
335 | .data = &sysctl_sched_rt_runtime, | 338 | .data = &sysctl_sched_rt_runtime, |
336 | .maxlen = sizeof(int), | 339 | .maxlen = sizeof(int), |
337 | .mode = 0644, | 340 | .mode = 0644, |
338 | .proc_handler = &sched_rt_handler, | 341 | .proc_handler = &sched_rt_handler, |
339 | }, | 342 | }, |
340 | { | 343 | { |
341 | .ctl_name = CTL_UNNUMBERED, | 344 | .ctl_name = CTL_UNNUMBERED, |
342 | .procname = "sched_compat_yield", | 345 | .procname = "sched_compat_yield", |
343 | .data = &sysctl_sched_compat_yield, | 346 | .data = &sysctl_sched_compat_yield, |
344 | .maxlen = sizeof(unsigned int), | 347 | .maxlen = sizeof(unsigned int), |
345 | .mode = 0644, | 348 | .mode = 0644, |
346 | .proc_handler = &proc_dointvec, | 349 | .proc_handler = &proc_dointvec, |
347 | }, | 350 | }, |
348 | #ifdef CONFIG_PROVE_LOCKING | 351 | #ifdef CONFIG_PROVE_LOCKING |
349 | { | 352 | { |
350 | .ctl_name = CTL_UNNUMBERED, | 353 | .ctl_name = CTL_UNNUMBERED, |
351 | .procname = "prove_locking", | 354 | .procname = "prove_locking", |
352 | .data = &prove_locking, | 355 | .data = &prove_locking, |
353 | .maxlen = sizeof(int), | 356 | .maxlen = sizeof(int), |
354 | .mode = 0644, | 357 | .mode = 0644, |
355 | .proc_handler = &proc_dointvec, | 358 | .proc_handler = &proc_dointvec, |
356 | }, | 359 | }, |
357 | #endif | 360 | #endif |
358 | #ifdef CONFIG_LOCK_STAT | 361 | #ifdef CONFIG_LOCK_STAT |
359 | { | 362 | { |
360 | .ctl_name = CTL_UNNUMBERED, | 363 | .ctl_name = CTL_UNNUMBERED, |
361 | .procname = "lock_stat", | 364 | .procname = "lock_stat", |
362 | .data = &lock_stat, | 365 | .data = &lock_stat, |
363 | .maxlen = sizeof(int), | 366 | .maxlen = sizeof(int), |
364 | .mode = 0644, | 367 | .mode = 0644, |
365 | .proc_handler = &proc_dointvec, | 368 | .proc_handler = &proc_dointvec, |
366 | }, | 369 | }, |
367 | #endif | 370 | #endif |
368 | { | 371 | { |
369 | .ctl_name = KERN_PANIC, | 372 | .ctl_name = KERN_PANIC, |
370 | .procname = "panic", | 373 | .procname = "panic", |
371 | .data = &panic_timeout, | 374 | .data = &panic_timeout, |
372 | .maxlen = sizeof(int), | 375 | .maxlen = sizeof(int), |
373 | .mode = 0644, | 376 | .mode = 0644, |
374 | .proc_handler = &proc_dointvec, | 377 | .proc_handler = &proc_dointvec, |
375 | }, | 378 | }, |
376 | { | 379 | { |
377 | .ctl_name = KERN_CORE_USES_PID, | 380 | .ctl_name = KERN_CORE_USES_PID, |
378 | .procname = "core_uses_pid", | 381 | .procname = "core_uses_pid", |
379 | .data = &core_uses_pid, | 382 | .data = &core_uses_pid, |
380 | .maxlen = sizeof(int), | 383 | .maxlen = sizeof(int), |
381 | .mode = 0644, | 384 | .mode = 0644, |
382 | .proc_handler = &proc_dointvec, | 385 | .proc_handler = &proc_dointvec, |
383 | }, | 386 | }, |
384 | { | 387 | { |
385 | .ctl_name = KERN_CORE_PATTERN, | 388 | .ctl_name = KERN_CORE_PATTERN, |
386 | .procname = "core_pattern", | 389 | .procname = "core_pattern", |
387 | .data = core_pattern, | 390 | .data = core_pattern, |
388 | .maxlen = CORENAME_MAX_SIZE, | 391 | .maxlen = CORENAME_MAX_SIZE, |
389 | .mode = 0644, | 392 | .mode = 0644, |
390 | .proc_handler = &proc_dostring, | 393 | .proc_handler = &proc_dostring, |
391 | .strategy = &sysctl_string, | 394 | .strategy = &sysctl_string, |
392 | }, | 395 | }, |
393 | #ifdef CONFIG_PROC_SYSCTL | 396 | #ifdef CONFIG_PROC_SYSCTL |
394 | { | 397 | { |
395 | .procname = "tainted", | 398 | .procname = "tainted", |
396 | .maxlen = sizeof(long), | 399 | .maxlen = sizeof(long), |
397 | .mode = 0644, | 400 | .mode = 0644, |
398 | .proc_handler = &proc_taint, | 401 | .proc_handler = &proc_taint, |
399 | }, | 402 | }, |
400 | #endif | 403 | #endif |
401 | #ifdef CONFIG_LATENCYTOP | 404 | #ifdef CONFIG_LATENCYTOP |
402 | { | 405 | { |
403 | .procname = "latencytop", | 406 | .procname = "latencytop", |
404 | .data = &latencytop_enabled, | 407 | .data = &latencytop_enabled, |
405 | .maxlen = sizeof(int), | 408 | .maxlen = sizeof(int), |
406 | .mode = 0644, | 409 | .mode = 0644, |
407 | .proc_handler = &proc_dointvec, | 410 | .proc_handler = &proc_dointvec, |
408 | }, | 411 | }, |
409 | #endif | 412 | #endif |
410 | #ifdef CONFIG_BLK_DEV_INITRD | 413 | #ifdef CONFIG_BLK_DEV_INITRD |
411 | { | 414 | { |
412 | .ctl_name = KERN_REALROOTDEV, | 415 | .ctl_name = KERN_REALROOTDEV, |
413 | .procname = "real-root-dev", | 416 | .procname = "real-root-dev", |
414 | .data = &real_root_dev, | 417 | .data = &real_root_dev, |
415 | .maxlen = sizeof(int), | 418 | .maxlen = sizeof(int), |
416 | .mode = 0644, | 419 | .mode = 0644, |
417 | .proc_handler = &proc_dointvec, | 420 | .proc_handler = &proc_dointvec, |
418 | }, | 421 | }, |
419 | #endif | 422 | #endif |
420 | { | 423 | { |
421 | .ctl_name = CTL_UNNUMBERED, | 424 | .ctl_name = CTL_UNNUMBERED, |
422 | .procname = "print-fatal-signals", | 425 | .procname = "print-fatal-signals", |
423 | .data = &print_fatal_signals, | 426 | .data = &print_fatal_signals, |
424 | .maxlen = sizeof(int), | 427 | .maxlen = sizeof(int), |
425 | .mode = 0644, | 428 | .mode = 0644, |
426 | .proc_handler = &proc_dointvec, | 429 | .proc_handler = &proc_dointvec, |
427 | }, | 430 | }, |
428 | #ifdef CONFIG_SPARC | 431 | #ifdef CONFIG_SPARC |
429 | { | 432 | { |
430 | .ctl_name = KERN_SPARC_REBOOT, | 433 | .ctl_name = KERN_SPARC_REBOOT, |
431 | .procname = "reboot-cmd", | 434 | .procname = "reboot-cmd", |
432 | .data = reboot_command, | 435 | .data = reboot_command, |
433 | .maxlen = 256, | 436 | .maxlen = 256, |
434 | .mode = 0644, | 437 | .mode = 0644, |
435 | .proc_handler = &proc_dostring, | 438 | .proc_handler = &proc_dostring, |
436 | .strategy = &sysctl_string, | 439 | .strategy = &sysctl_string, |
437 | }, | 440 | }, |
438 | { | 441 | { |
439 | .ctl_name = KERN_SPARC_STOP_A, | 442 | .ctl_name = KERN_SPARC_STOP_A, |
440 | .procname = "stop-a", | 443 | .procname = "stop-a", |
441 | .data = &stop_a_enabled, | 444 | .data = &stop_a_enabled, |
442 | .maxlen = sizeof (int), | 445 | .maxlen = sizeof (int), |
443 | .mode = 0644, | 446 | .mode = 0644, |
444 | .proc_handler = &proc_dointvec, | 447 | .proc_handler = &proc_dointvec, |
445 | }, | 448 | }, |
446 | { | 449 | { |
447 | .ctl_name = KERN_SPARC_SCONS_PWROFF, | 450 | .ctl_name = KERN_SPARC_SCONS_PWROFF, |
448 | .procname = "scons-poweroff", | 451 | .procname = "scons-poweroff", |
449 | .data = &scons_pwroff, | 452 | .data = &scons_pwroff, |
450 | .maxlen = sizeof (int), | 453 | .maxlen = sizeof (int), |
451 | .mode = 0644, | 454 | .mode = 0644, |
452 | .proc_handler = &proc_dointvec, | 455 | .proc_handler = &proc_dointvec, |
453 | }, | 456 | }, |
454 | #endif | 457 | #endif |
455 | #ifdef CONFIG_SPARC64 | 458 | #ifdef CONFIG_SPARC64 |
456 | { | 459 | { |
457 | .ctl_name = CTL_UNNUMBERED, | 460 | .ctl_name = CTL_UNNUMBERED, |
458 | .procname = "tsb-ratio", | 461 | .procname = "tsb-ratio", |
459 | .data = &sysctl_tsb_ratio, | 462 | .data = &sysctl_tsb_ratio, |
460 | .maxlen = sizeof (int), | 463 | .maxlen = sizeof (int), |
461 | .mode = 0644, | 464 | .mode = 0644, |
462 | .proc_handler = &proc_dointvec, | 465 | .proc_handler = &proc_dointvec, |
463 | }, | 466 | }, |
464 | #endif | 467 | #endif |
465 | #ifdef __hppa__ | 468 | #ifdef __hppa__ |
466 | { | 469 | { |
467 | .ctl_name = KERN_HPPA_PWRSW, | 470 | .ctl_name = KERN_HPPA_PWRSW, |
468 | .procname = "soft-power", | 471 | .procname = "soft-power", |
469 | .data = &pwrsw_enabled, | 472 | .data = &pwrsw_enabled, |
470 | .maxlen = sizeof (int), | 473 | .maxlen = sizeof (int), |
471 | .mode = 0644, | 474 | .mode = 0644, |
472 | .proc_handler = &proc_dointvec, | 475 | .proc_handler = &proc_dointvec, |
473 | }, | 476 | }, |
474 | { | 477 | { |
475 | .ctl_name = KERN_HPPA_UNALIGNED, | 478 | .ctl_name = KERN_HPPA_UNALIGNED, |
476 | .procname = "unaligned-trap", | 479 | .procname = "unaligned-trap", |
477 | .data = &unaligned_enabled, | 480 | .data = &unaligned_enabled, |
478 | .maxlen = sizeof (int), | 481 | .maxlen = sizeof (int), |
479 | .mode = 0644, | 482 | .mode = 0644, |
480 | .proc_handler = &proc_dointvec, | 483 | .proc_handler = &proc_dointvec, |
481 | }, | 484 | }, |
482 | #endif | 485 | #endif |
483 | { | 486 | { |
484 | .ctl_name = KERN_CTLALTDEL, | 487 | .ctl_name = KERN_CTLALTDEL, |
485 | .procname = "ctrl-alt-del", | 488 | .procname = "ctrl-alt-del", |
486 | .data = &C_A_D, | 489 | .data = &C_A_D, |
487 | .maxlen = sizeof(int), | 490 | .maxlen = sizeof(int), |
488 | .mode = 0644, | 491 | .mode = 0644, |
489 | .proc_handler = &proc_dointvec, | 492 | .proc_handler = &proc_dointvec, |
490 | }, | 493 | }, |
491 | #ifdef CONFIG_FUNCTION_TRACER | 494 | #ifdef CONFIG_FUNCTION_TRACER |
492 | { | 495 | { |
493 | .ctl_name = CTL_UNNUMBERED, | 496 | .ctl_name = CTL_UNNUMBERED, |
494 | .procname = "ftrace_enabled", | 497 | .procname = "ftrace_enabled", |
495 | .data = &ftrace_enabled, | 498 | .data = &ftrace_enabled, |
496 | .maxlen = sizeof(int), | 499 | .maxlen = sizeof(int), |
497 | .mode = 0644, | 500 | .mode = 0644, |
498 | .proc_handler = &ftrace_enable_sysctl, | 501 | .proc_handler = &ftrace_enable_sysctl, |
499 | }, | 502 | }, |
500 | #endif | 503 | #endif |
501 | #ifdef CONFIG_STACK_TRACER | 504 | #ifdef CONFIG_STACK_TRACER |
502 | { | 505 | { |
503 | .ctl_name = CTL_UNNUMBERED, | 506 | .ctl_name = CTL_UNNUMBERED, |
504 | .procname = "stack_tracer_enabled", | 507 | .procname = "stack_tracer_enabled", |
505 | .data = &stack_tracer_enabled, | 508 | .data = &stack_tracer_enabled, |
506 | .maxlen = sizeof(int), | 509 | .maxlen = sizeof(int), |
507 | .mode = 0644, | 510 | .mode = 0644, |
508 | .proc_handler = &stack_trace_sysctl, | 511 | .proc_handler = &stack_trace_sysctl, |
509 | }, | 512 | }, |
510 | #endif | 513 | #endif |
511 | #ifdef CONFIG_TRACING | 514 | #ifdef CONFIG_TRACING |
512 | { | 515 | { |
513 | .ctl_name = CTL_UNNUMBERED, | 516 | .ctl_name = CTL_UNNUMBERED, |
514 | .procname = "ftrace_dump_on_oops", | 517 | .procname = "ftrace_dump_on_oops", |
515 | .data = &ftrace_dump_on_oops, | 518 | .data = &ftrace_dump_on_oops, |
516 | .maxlen = sizeof(int), | 519 | .maxlen = sizeof(int), |
517 | .mode = 0644, | 520 | .mode = 0644, |
518 | .proc_handler = &proc_dointvec, | 521 | .proc_handler = &proc_dointvec, |
519 | }, | 522 | }, |
520 | #endif | 523 | #endif |
521 | #ifdef CONFIG_MODULES | 524 | #ifdef CONFIG_MODULES |
522 | { | 525 | { |
523 | .ctl_name = KERN_MODPROBE, | 526 | .ctl_name = KERN_MODPROBE, |
524 | .procname = "modprobe", | 527 | .procname = "modprobe", |
525 | .data = &modprobe_path, | 528 | .data = &modprobe_path, |
526 | .maxlen = KMOD_PATH_LEN, | 529 | .maxlen = KMOD_PATH_LEN, |
527 | .mode = 0644, | 530 | .mode = 0644, |
528 | .proc_handler = &proc_dostring, | 531 | .proc_handler = &proc_dostring, |
529 | .strategy = &sysctl_string, | 532 | .strategy = &sysctl_string, |
530 | }, | 533 | }, |
531 | #endif | 534 | #endif |
532 | #if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) | 535 | #if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) |
533 | { | 536 | { |
534 | .ctl_name = KERN_HOTPLUG, | 537 | .ctl_name = KERN_HOTPLUG, |
535 | .procname = "hotplug", | 538 | .procname = "hotplug", |
536 | .data = &uevent_helper, | 539 | .data = &uevent_helper, |
537 | .maxlen = UEVENT_HELPER_PATH_LEN, | 540 | .maxlen = UEVENT_HELPER_PATH_LEN, |
538 | .mode = 0644, | 541 | .mode = 0644, |
539 | .proc_handler = &proc_dostring, | 542 | .proc_handler = &proc_dostring, |
540 | .strategy = &sysctl_string, | 543 | .strategy = &sysctl_string, |
541 | }, | 544 | }, |
542 | #endif | 545 | #endif |
543 | #ifdef CONFIG_CHR_DEV_SG | 546 | #ifdef CONFIG_CHR_DEV_SG |
544 | { | 547 | { |
545 | .ctl_name = KERN_SG_BIG_BUFF, | 548 | .ctl_name = KERN_SG_BIG_BUFF, |
546 | .procname = "sg-big-buff", | 549 | .procname = "sg-big-buff", |
547 | .data = &sg_big_buff, | 550 | .data = &sg_big_buff, |
548 | .maxlen = sizeof (int), | 551 | .maxlen = sizeof (int), |
549 | .mode = 0444, | 552 | .mode = 0444, |
550 | .proc_handler = &proc_dointvec, | 553 | .proc_handler = &proc_dointvec, |
551 | }, | 554 | }, |
552 | #endif | 555 | #endif |
553 | #ifdef CONFIG_BSD_PROCESS_ACCT | 556 | #ifdef CONFIG_BSD_PROCESS_ACCT |
554 | { | 557 | { |
555 | .ctl_name = KERN_ACCT, | 558 | .ctl_name = KERN_ACCT, |
556 | .procname = "acct", | 559 | .procname = "acct", |
557 | .data = &acct_parm, | 560 | .data = &acct_parm, |
558 | .maxlen = 3*sizeof(int), | 561 | .maxlen = 3*sizeof(int), |
559 | .mode = 0644, | 562 | .mode = 0644, |
560 | .proc_handler = &proc_dointvec, | 563 | .proc_handler = &proc_dointvec, |
561 | }, | 564 | }, |
562 | #endif | 565 | #endif |
563 | #ifdef CONFIG_MAGIC_SYSRQ | 566 | #ifdef CONFIG_MAGIC_SYSRQ |
564 | { | 567 | { |
565 | .ctl_name = KERN_SYSRQ, | 568 | .ctl_name = KERN_SYSRQ, |
566 | .procname = "sysrq", | 569 | .procname = "sysrq", |
567 | .data = &__sysrq_enabled, | 570 | .data = &__sysrq_enabled, |
568 | .maxlen = sizeof (int), | 571 | .maxlen = sizeof (int), |
569 | .mode = 0644, | 572 | .mode = 0644, |
570 | .proc_handler = &proc_dointvec, | 573 | .proc_handler = &proc_dointvec, |
571 | }, | 574 | }, |
572 | #endif | 575 | #endif |
573 | #ifdef CONFIG_PROC_SYSCTL | 576 | #ifdef CONFIG_PROC_SYSCTL |
574 | { | 577 | { |
575 | .procname = "cad_pid", | 578 | .procname = "cad_pid", |
576 | .data = NULL, | 579 | .data = NULL, |
577 | .maxlen = sizeof (int), | 580 | .maxlen = sizeof (int), |
578 | .mode = 0600, | 581 | .mode = 0600, |
579 | .proc_handler = &proc_do_cad_pid, | 582 | .proc_handler = &proc_do_cad_pid, |
580 | }, | 583 | }, |
581 | #endif | 584 | #endif |
582 | { | 585 | { |
583 | .ctl_name = KERN_MAX_THREADS, | 586 | .ctl_name = KERN_MAX_THREADS, |
584 | .procname = "threads-max", | 587 | .procname = "threads-max", |
585 | .data = &max_threads, | 588 | .data = &max_threads, |
586 | .maxlen = sizeof(int), | 589 | .maxlen = sizeof(int), |
587 | .mode = 0644, | 590 | .mode = 0644, |
588 | .proc_handler = &proc_dointvec, | 591 | .proc_handler = &proc_dointvec, |
589 | }, | 592 | }, |
590 | { | 593 | { |
591 | .ctl_name = KERN_RANDOM, | 594 | .ctl_name = KERN_RANDOM, |
592 | .procname = "random", | 595 | .procname = "random", |
593 | .mode = 0555, | 596 | .mode = 0555, |
594 | .child = random_table, | 597 | .child = random_table, |
595 | }, | 598 | }, |
596 | { | 599 | { |
597 | .ctl_name = KERN_OVERFLOWUID, | 600 | .ctl_name = KERN_OVERFLOWUID, |
598 | .procname = "overflowuid", | 601 | .procname = "overflowuid", |
599 | .data = &overflowuid, | 602 | .data = &overflowuid, |
600 | .maxlen = sizeof(int), | 603 | .maxlen = sizeof(int), |
601 | .mode = 0644, | 604 | .mode = 0644, |
602 | .proc_handler = &proc_dointvec_minmax, | 605 | .proc_handler = &proc_dointvec_minmax, |
603 | .strategy = &sysctl_intvec, | 606 | .strategy = &sysctl_intvec, |
604 | .extra1 = &minolduid, | 607 | .extra1 = &minolduid, |
605 | .extra2 = &maxolduid, | 608 | .extra2 = &maxolduid, |
606 | }, | 609 | }, |
607 | { | 610 | { |
608 | .ctl_name = KERN_OVERFLOWGID, | 611 | .ctl_name = KERN_OVERFLOWGID, |
609 | .procname = "overflowgid", | 612 | .procname = "overflowgid", |
610 | .data = &overflowgid, | 613 | .data = &overflowgid, |
611 | .maxlen = sizeof(int), | 614 | .maxlen = sizeof(int), |
612 | .mode = 0644, | 615 | .mode = 0644, |
613 | .proc_handler = &proc_dointvec_minmax, | 616 | .proc_handler = &proc_dointvec_minmax, |
614 | .strategy = &sysctl_intvec, | 617 | .strategy = &sysctl_intvec, |
615 | .extra1 = &minolduid, | 618 | .extra1 = &minolduid, |
616 | .extra2 = &maxolduid, | 619 | .extra2 = &maxolduid, |
617 | }, | 620 | }, |
618 | #ifdef CONFIG_S390 | 621 | #ifdef CONFIG_S390 |
619 | #ifdef CONFIG_MATHEMU | 622 | #ifdef CONFIG_MATHEMU |
620 | { | 623 | { |
621 | .ctl_name = KERN_IEEE_EMULATION_WARNINGS, | 624 | .ctl_name = KERN_IEEE_EMULATION_WARNINGS, |
622 | .procname = "ieee_emulation_warnings", | 625 | .procname = "ieee_emulation_warnings", |
623 | .data = &sysctl_ieee_emulation_warnings, | 626 | .data = &sysctl_ieee_emulation_warnings, |
624 | .maxlen = sizeof(int), | 627 | .maxlen = sizeof(int), |
625 | .mode = 0644, | 628 | .mode = 0644, |
626 | .proc_handler = &proc_dointvec, | 629 | .proc_handler = &proc_dointvec, |
627 | }, | 630 | }, |
628 | #endif | 631 | #endif |
629 | { | 632 | { |
630 | .ctl_name = KERN_S390_USER_DEBUG_LOGGING, | 633 | .ctl_name = KERN_S390_USER_DEBUG_LOGGING, |
631 | .procname = "userprocess_debug", | 634 | .procname = "userprocess_debug", |
632 | .data = &sysctl_userprocess_debug, | 635 | .data = &sysctl_userprocess_debug, |
633 | .maxlen = sizeof(int), | 636 | .maxlen = sizeof(int), |
634 | .mode = 0644, | 637 | .mode = 0644, |
635 | .proc_handler = &proc_dointvec, | 638 | .proc_handler = &proc_dointvec, |
636 | }, | 639 | }, |
637 | #endif | 640 | #endif |
638 | { | 641 | { |
639 | .ctl_name = KERN_PIDMAX, | 642 | .ctl_name = KERN_PIDMAX, |
640 | .procname = "pid_max", | 643 | .procname = "pid_max", |
641 | .data = &pid_max, | 644 | .data = &pid_max, |
642 | .maxlen = sizeof (int), | 645 | .maxlen = sizeof (int), |
643 | .mode = 0644, | 646 | .mode = 0644, |
644 | .proc_handler = &proc_dointvec_minmax, | 647 | .proc_handler = &proc_dointvec_minmax, |
645 | .strategy = sysctl_intvec, | 648 | .strategy = sysctl_intvec, |
646 | .extra1 = &pid_max_min, | 649 | .extra1 = &pid_max_min, |
647 | .extra2 = &pid_max_max, | 650 | .extra2 = &pid_max_max, |
648 | }, | 651 | }, |
649 | { | 652 | { |
650 | .ctl_name = KERN_PANIC_ON_OOPS, | 653 | .ctl_name = KERN_PANIC_ON_OOPS, |
651 | .procname = "panic_on_oops", | 654 | .procname = "panic_on_oops", |
652 | .data = &panic_on_oops, | 655 | .data = &panic_on_oops, |
653 | .maxlen = sizeof(int), | 656 | .maxlen = sizeof(int), |
654 | .mode = 0644, | 657 | .mode = 0644, |
655 | .proc_handler = &proc_dointvec, | 658 | .proc_handler = &proc_dointvec, |
656 | }, | 659 | }, |
657 | #if defined CONFIG_PRINTK | 660 | #if defined CONFIG_PRINTK |
658 | { | 661 | { |
659 | .ctl_name = KERN_PRINTK, | 662 | .ctl_name = KERN_PRINTK, |
660 | .procname = "printk", | 663 | .procname = "printk", |
661 | .data = &console_loglevel, | 664 | .data = &console_loglevel, |
662 | .maxlen = 4*sizeof(int), | 665 | .maxlen = 4*sizeof(int), |
663 | .mode = 0644, | 666 | .mode = 0644, |
664 | .proc_handler = &proc_dointvec, | 667 | .proc_handler = &proc_dointvec, |
665 | }, | 668 | }, |
666 | { | 669 | { |
667 | .ctl_name = KERN_PRINTK_RATELIMIT, | 670 | .ctl_name = KERN_PRINTK_RATELIMIT, |
668 | .procname = "printk_ratelimit", | 671 | .procname = "printk_ratelimit", |
669 | .data = &printk_ratelimit_state.interval, | 672 | .data = &printk_ratelimit_state.interval, |
670 | .maxlen = sizeof(int), | 673 | .maxlen = sizeof(int), |
671 | .mode = 0644, | 674 | .mode = 0644, |
672 | .proc_handler = &proc_dointvec_jiffies, | 675 | .proc_handler = &proc_dointvec_jiffies, |
673 | .strategy = &sysctl_jiffies, | 676 | .strategy = &sysctl_jiffies, |
674 | }, | 677 | }, |
675 | { | 678 | { |
676 | .ctl_name = KERN_PRINTK_RATELIMIT_BURST, | 679 | .ctl_name = KERN_PRINTK_RATELIMIT_BURST, |
677 | .procname = "printk_ratelimit_burst", | 680 | .procname = "printk_ratelimit_burst", |
678 | .data = &printk_ratelimit_state.burst, | 681 | .data = &printk_ratelimit_state.burst, |
679 | .maxlen = sizeof(int), | 682 | .maxlen = sizeof(int), |
680 | .mode = 0644, | 683 | .mode = 0644, |
681 | .proc_handler = &proc_dointvec, | 684 | .proc_handler = &proc_dointvec, |
682 | }, | 685 | }, |
683 | #endif | 686 | #endif |
684 | { | 687 | { |
685 | .ctl_name = KERN_NGROUPS_MAX, | 688 | .ctl_name = KERN_NGROUPS_MAX, |
686 | .procname = "ngroups_max", | 689 | .procname = "ngroups_max", |
687 | .data = &ngroups_max, | 690 | .data = &ngroups_max, |
688 | .maxlen = sizeof (int), | 691 | .maxlen = sizeof (int), |
689 | .mode = 0444, | 692 | .mode = 0444, |
690 | .proc_handler = &proc_dointvec, | 693 | .proc_handler = &proc_dointvec, |
691 | }, | 694 | }, |
692 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) | 695 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) |
693 | { | 696 | { |
694 | .ctl_name = KERN_UNKNOWN_NMI_PANIC, | 697 | .ctl_name = KERN_UNKNOWN_NMI_PANIC, |
695 | .procname = "unknown_nmi_panic", | 698 | .procname = "unknown_nmi_panic", |
696 | .data = &unknown_nmi_panic, | 699 | .data = &unknown_nmi_panic, |
697 | .maxlen = sizeof (int), | 700 | .maxlen = sizeof (int), |
698 | .mode = 0644, | 701 | .mode = 0644, |
699 | .proc_handler = &proc_dointvec, | 702 | .proc_handler = &proc_dointvec, |
700 | }, | 703 | }, |
701 | { | 704 | { |
702 | .procname = "nmi_watchdog", | 705 | .procname = "nmi_watchdog", |
703 | .data = &nmi_watchdog_enabled, | 706 | .data = &nmi_watchdog_enabled, |
704 | .maxlen = sizeof (int), | 707 | .maxlen = sizeof (int), |
705 | .mode = 0644, | 708 | .mode = 0644, |
706 | .proc_handler = &proc_nmi_enabled, | 709 | .proc_handler = &proc_nmi_enabled, |
707 | }, | 710 | }, |
708 | #endif | 711 | #endif |
709 | #if defined(CONFIG_X86) | 712 | #if defined(CONFIG_X86) |
710 | { | 713 | { |
711 | .ctl_name = KERN_PANIC_ON_NMI, | 714 | .ctl_name = KERN_PANIC_ON_NMI, |
712 | .procname = "panic_on_unrecovered_nmi", | 715 | .procname = "panic_on_unrecovered_nmi", |
713 | .data = &panic_on_unrecovered_nmi, | 716 | .data = &panic_on_unrecovered_nmi, |
714 | .maxlen = sizeof(int), | 717 | .maxlen = sizeof(int), |
715 | .mode = 0644, | 718 | .mode = 0644, |
716 | .proc_handler = &proc_dointvec, | 719 | .proc_handler = &proc_dointvec, |
717 | }, | 720 | }, |
718 | { | 721 | { |
719 | .ctl_name = KERN_BOOTLOADER_TYPE, | 722 | .ctl_name = KERN_BOOTLOADER_TYPE, |
720 | .procname = "bootloader_type", | 723 | .procname = "bootloader_type", |
721 | .data = &bootloader_type, | 724 | .data = &bootloader_type, |
722 | .maxlen = sizeof (int), | 725 | .maxlen = sizeof (int), |
723 | .mode = 0444, | 726 | .mode = 0444, |
724 | .proc_handler = &proc_dointvec, | 727 | .proc_handler = &proc_dointvec, |
725 | }, | 728 | }, |
726 | { | 729 | { |
727 | .ctl_name = CTL_UNNUMBERED, | 730 | .ctl_name = CTL_UNNUMBERED, |
728 | .procname = "kstack_depth_to_print", | 731 | .procname = "kstack_depth_to_print", |
729 | .data = &kstack_depth_to_print, | 732 | .data = &kstack_depth_to_print, |
730 | .maxlen = sizeof(int), | 733 | .maxlen = sizeof(int), |
731 | .mode = 0644, | 734 | .mode = 0644, |
732 | .proc_handler = &proc_dointvec, | 735 | .proc_handler = &proc_dointvec, |
733 | }, | 736 | }, |
734 | { | 737 | { |
735 | .ctl_name = CTL_UNNUMBERED, | 738 | .ctl_name = CTL_UNNUMBERED, |
736 | .procname = "io_delay_type", | 739 | .procname = "io_delay_type", |
737 | .data = &io_delay_type, | 740 | .data = &io_delay_type, |
738 | .maxlen = sizeof(int), | 741 | .maxlen = sizeof(int), |
739 | .mode = 0644, | 742 | .mode = 0644, |
740 | .proc_handler = &proc_dointvec, | 743 | .proc_handler = &proc_dointvec, |
741 | }, | 744 | }, |
742 | #endif | 745 | #endif |
743 | #if defined(CONFIG_MMU) | 746 | #if defined(CONFIG_MMU) |
744 | { | 747 | { |
745 | .ctl_name = KERN_RANDOMIZE, | 748 | .ctl_name = KERN_RANDOMIZE, |
746 | .procname = "randomize_va_space", | 749 | .procname = "randomize_va_space", |
747 | .data = &randomize_va_space, | 750 | .data = &randomize_va_space, |
748 | .maxlen = sizeof(int), | 751 | .maxlen = sizeof(int), |
749 | .mode = 0644, | 752 | .mode = 0644, |
750 | .proc_handler = &proc_dointvec, | 753 | .proc_handler = &proc_dointvec, |
751 | }, | 754 | }, |
752 | #endif | 755 | #endif |
753 | #if defined(CONFIG_S390) && defined(CONFIG_SMP) | 756 | #if defined(CONFIG_S390) && defined(CONFIG_SMP) |
754 | { | 757 | { |
755 | .ctl_name = KERN_SPIN_RETRY, | 758 | .ctl_name = KERN_SPIN_RETRY, |
756 | .procname = "spin_retry", | 759 | .procname = "spin_retry", |
757 | .data = &spin_retry, | 760 | .data = &spin_retry, |
758 | .maxlen = sizeof (int), | 761 | .maxlen = sizeof (int), |
759 | .mode = 0644, | 762 | .mode = 0644, |
760 | .proc_handler = &proc_dointvec, | 763 | .proc_handler = &proc_dointvec, |
761 | }, | 764 | }, |
762 | #endif | 765 | #endif |
763 | #if defined(CONFIG_ACPI_SLEEP) && defined(CONFIG_X86) | 766 | #if defined(CONFIG_ACPI_SLEEP) && defined(CONFIG_X86) |
764 | { | 767 | { |
765 | .procname = "acpi_video_flags", | 768 | .procname = "acpi_video_flags", |
766 | .data = &acpi_realmode_flags, | 769 | .data = &acpi_realmode_flags, |
767 | .maxlen = sizeof (unsigned long), | 770 | .maxlen = sizeof (unsigned long), |
768 | .mode = 0644, | 771 | .mode = 0644, |
769 | .proc_handler = &proc_doulongvec_minmax, | 772 | .proc_handler = &proc_doulongvec_minmax, |
770 | }, | 773 | }, |
771 | #endif | 774 | #endif |
772 | #ifdef CONFIG_IA64 | 775 | #ifdef CONFIG_IA64 |
773 | { | 776 | { |
774 | .ctl_name = KERN_IA64_UNALIGNED, | 777 | .ctl_name = KERN_IA64_UNALIGNED, |
775 | .procname = "ignore-unaligned-usertrap", | 778 | .procname = "ignore-unaligned-usertrap", |
776 | .data = &no_unaligned_warning, | 779 | .data = &no_unaligned_warning, |
777 | .maxlen = sizeof (int), | 780 | .maxlen = sizeof (int), |
778 | .mode = 0644, | 781 | .mode = 0644, |
779 | .proc_handler = &proc_dointvec, | 782 | .proc_handler = &proc_dointvec, |
780 | }, | 783 | }, |
781 | #endif | 784 | #endif |
782 | #ifdef CONFIG_DETECT_SOFTLOCKUP | 785 | #ifdef CONFIG_DETECT_SOFTLOCKUP |
783 | { | 786 | { |
784 | .ctl_name = CTL_UNNUMBERED, | 787 | .ctl_name = CTL_UNNUMBERED, |
785 | .procname = "softlockup_panic", | 788 | .procname = "softlockup_panic", |
786 | .data = &softlockup_panic, | 789 | .data = &softlockup_panic, |
787 | .maxlen = sizeof(int), | 790 | .maxlen = sizeof(int), |
788 | .mode = 0644, | 791 | .mode = 0644, |
789 | .proc_handler = &proc_dointvec_minmax, | 792 | .proc_handler = &proc_dointvec_minmax, |
790 | .strategy = &sysctl_intvec, | 793 | .strategy = &sysctl_intvec, |
791 | .extra1 = &zero, | 794 | .extra1 = &zero, |
792 | .extra2 = &one, | 795 | .extra2 = &one, |
793 | }, | 796 | }, |
794 | { | 797 | { |
795 | .ctl_name = CTL_UNNUMBERED, | 798 | .ctl_name = CTL_UNNUMBERED, |
796 | .procname = "softlockup_thresh", | 799 | .procname = "softlockup_thresh", |
797 | .data = &softlockup_thresh, | 800 | .data = &softlockup_thresh, |
798 | .maxlen = sizeof(int), | 801 | .maxlen = sizeof(int), |
799 | .mode = 0644, | 802 | .mode = 0644, |
800 | .proc_handler = &proc_dointvec_minmax, | 803 | .proc_handler = &proc_dointvec_minmax, |
801 | .strategy = &sysctl_intvec, | 804 | .strategy = &sysctl_intvec, |
802 | .extra1 = &neg_one, | 805 | .extra1 = &neg_one, |
803 | .extra2 = &sixty, | 806 | .extra2 = &sixty, |
804 | }, | 807 | }, |
805 | { | 808 | { |
806 | .ctl_name = CTL_UNNUMBERED, | 809 | .ctl_name = CTL_UNNUMBERED, |
807 | .procname = "hung_task_check_count", | 810 | .procname = "hung_task_check_count", |
808 | .data = &sysctl_hung_task_check_count, | 811 | .data = &sysctl_hung_task_check_count, |
809 | .maxlen = sizeof(unsigned long), | 812 | .maxlen = sizeof(unsigned long), |
810 | .mode = 0644, | 813 | .mode = 0644, |
811 | .proc_handler = &proc_doulongvec_minmax, | 814 | .proc_handler = &proc_doulongvec_minmax, |
812 | .strategy = &sysctl_intvec, | 815 | .strategy = &sysctl_intvec, |
813 | }, | 816 | }, |
814 | { | 817 | { |
815 | .ctl_name = CTL_UNNUMBERED, | 818 | .ctl_name = CTL_UNNUMBERED, |
816 | .procname = "hung_task_timeout_secs", | 819 | .procname = "hung_task_timeout_secs", |
817 | .data = &sysctl_hung_task_timeout_secs, | 820 | .data = &sysctl_hung_task_timeout_secs, |
818 | .maxlen = sizeof(unsigned long), | 821 | .maxlen = sizeof(unsigned long), |
819 | .mode = 0644, | 822 | .mode = 0644, |
820 | .proc_handler = &proc_doulongvec_minmax, | 823 | .proc_handler = &proc_doulongvec_minmax, |
821 | .strategy = &sysctl_intvec, | 824 | .strategy = &sysctl_intvec, |
822 | }, | 825 | }, |
823 | { | 826 | { |
824 | .ctl_name = CTL_UNNUMBERED, | 827 | .ctl_name = CTL_UNNUMBERED, |
825 | .procname = "hung_task_warnings", | 828 | .procname = "hung_task_warnings", |
826 | .data = &sysctl_hung_task_warnings, | 829 | .data = &sysctl_hung_task_warnings, |
827 | .maxlen = sizeof(unsigned long), | 830 | .maxlen = sizeof(unsigned long), |
828 | .mode = 0644, | 831 | .mode = 0644, |
829 | .proc_handler = &proc_doulongvec_minmax, | 832 | .proc_handler = &proc_doulongvec_minmax, |
830 | .strategy = &sysctl_intvec, | 833 | .strategy = &sysctl_intvec, |
831 | }, | 834 | }, |
832 | #endif | 835 | #endif |
833 | #ifdef CONFIG_COMPAT | 836 | #ifdef CONFIG_COMPAT |
834 | { | 837 | { |
835 | .ctl_name = KERN_COMPAT_LOG, | 838 | .ctl_name = KERN_COMPAT_LOG, |
836 | .procname = "compat-log", | 839 | .procname = "compat-log", |
837 | .data = &compat_log, | 840 | .data = &compat_log, |
838 | .maxlen = sizeof (int), | 841 | .maxlen = sizeof (int), |
839 | .mode = 0644, | 842 | .mode = 0644, |
840 | .proc_handler = &proc_dointvec, | 843 | .proc_handler = &proc_dointvec, |
841 | }, | 844 | }, |
842 | #endif | 845 | #endif |
843 | #ifdef CONFIG_RT_MUTEXES | 846 | #ifdef CONFIG_RT_MUTEXES |
844 | { | 847 | { |
845 | .ctl_name = KERN_MAX_LOCK_DEPTH, | 848 | .ctl_name = KERN_MAX_LOCK_DEPTH, |
846 | .procname = "max_lock_depth", | 849 | .procname = "max_lock_depth", |
847 | .data = &max_lock_depth, | 850 | .data = &max_lock_depth, |
848 | .maxlen = sizeof(int), | 851 | .maxlen = sizeof(int), |
849 | .mode = 0644, | 852 | .mode = 0644, |
850 | .proc_handler = &proc_dointvec, | 853 | .proc_handler = &proc_dointvec, |
851 | }, | 854 | }, |
852 | #endif | 855 | #endif |
853 | { | 856 | { |
854 | .ctl_name = CTL_UNNUMBERED, | 857 | .ctl_name = CTL_UNNUMBERED, |
855 | .procname = "poweroff_cmd", | 858 | .procname = "poweroff_cmd", |
856 | .data = &poweroff_cmd, | 859 | .data = &poweroff_cmd, |
857 | .maxlen = POWEROFF_CMD_PATH_LEN, | 860 | .maxlen = POWEROFF_CMD_PATH_LEN, |
858 | .mode = 0644, | 861 | .mode = 0644, |
859 | .proc_handler = &proc_dostring, | 862 | .proc_handler = &proc_dostring, |
860 | .strategy = &sysctl_string, | 863 | .strategy = &sysctl_string, |
861 | }, | 864 | }, |
862 | #ifdef CONFIG_KEYS | 865 | #ifdef CONFIG_KEYS |
863 | { | 866 | { |
864 | .ctl_name = CTL_UNNUMBERED, | 867 | .ctl_name = CTL_UNNUMBERED, |
865 | .procname = "keys", | 868 | .procname = "keys", |
866 | .mode = 0555, | 869 | .mode = 0555, |
867 | .child = key_sysctls, | 870 | .child = key_sysctls, |
868 | }, | 871 | }, |
869 | #endif | 872 | #endif |
870 | #ifdef CONFIG_RCU_TORTURE_TEST | 873 | #ifdef CONFIG_RCU_TORTURE_TEST |
871 | { | 874 | { |
872 | .ctl_name = CTL_UNNUMBERED, | 875 | .ctl_name = CTL_UNNUMBERED, |
873 | .procname = "rcutorture_runnable", | 876 | .procname = "rcutorture_runnable", |
874 | .data = &rcutorture_runnable, | 877 | .data = &rcutorture_runnable, |
875 | .maxlen = sizeof(int), | 878 | .maxlen = sizeof(int), |
876 | .mode = 0644, | 879 | .mode = 0644, |
877 | .proc_handler = &proc_dointvec, | 880 | .proc_handler = &proc_dointvec, |
878 | }, | 881 | }, |
879 | #endif | 882 | #endif |
880 | #ifdef CONFIG_UNEVICTABLE_LRU | 883 | #ifdef CONFIG_UNEVICTABLE_LRU |
881 | { | 884 | { |
882 | .ctl_name = CTL_UNNUMBERED, | 885 | .ctl_name = CTL_UNNUMBERED, |
883 | .procname = "scan_unevictable_pages", | 886 | .procname = "scan_unevictable_pages", |
884 | .data = &scan_unevictable_pages, | 887 | .data = &scan_unevictable_pages, |
885 | .maxlen = sizeof(scan_unevictable_pages), | 888 | .maxlen = sizeof(scan_unevictable_pages), |
886 | .mode = 0644, | 889 | .mode = 0644, |
887 | .proc_handler = &scan_unevictable_handler, | 890 | .proc_handler = &scan_unevictable_handler, |
888 | }, | 891 | }, |
889 | #endif | 892 | #endif |
890 | /* | 893 | /* |
891 | * NOTE: do not add new entries to this table unless you have read | 894 | * NOTE: do not add new entries to this table unless you have read |
892 | * Documentation/sysctl/ctl_unnumbered.txt | 895 | * Documentation/sysctl/ctl_unnumbered.txt |
893 | */ | 896 | */ |
894 | { .ctl_name = 0 } | 897 | { .ctl_name = 0 } |
895 | }; | 898 | }; |
896 | 899 | ||
897 | static struct ctl_table vm_table[] = { | 900 | static struct ctl_table vm_table[] = { |
898 | { | 901 | { |
899 | .ctl_name = VM_OVERCOMMIT_MEMORY, | 902 | .ctl_name = VM_OVERCOMMIT_MEMORY, |
900 | .procname = "overcommit_memory", | 903 | .procname = "overcommit_memory", |
901 | .data = &sysctl_overcommit_memory, | 904 | .data = &sysctl_overcommit_memory, |
902 | .maxlen = sizeof(sysctl_overcommit_memory), | 905 | .maxlen = sizeof(sysctl_overcommit_memory), |
903 | .mode = 0644, | 906 | .mode = 0644, |
904 | .proc_handler = &proc_dointvec, | 907 | .proc_handler = &proc_dointvec, |
905 | }, | 908 | }, |
906 | { | 909 | { |
907 | .ctl_name = VM_PANIC_ON_OOM, | 910 | .ctl_name = VM_PANIC_ON_OOM, |
908 | .procname = "panic_on_oom", | 911 | .procname = "panic_on_oom", |
909 | .data = &sysctl_panic_on_oom, | 912 | .data = &sysctl_panic_on_oom, |
910 | .maxlen = sizeof(sysctl_panic_on_oom), | 913 | .maxlen = sizeof(sysctl_panic_on_oom), |
911 | .mode = 0644, | 914 | .mode = 0644, |
912 | .proc_handler = &proc_dointvec, | 915 | .proc_handler = &proc_dointvec, |
913 | }, | 916 | }, |
914 | { | 917 | { |
915 | .ctl_name = CTL_UNNUMBERED, | 918 | .ctl_name = CTL_UNNUMBERED, |
916 | .procname = "oom_kill_allocating_task", | 919 | .procname = "oom_kill_allocating_task", |
917 | .data = &sysctl_oom_kill_allocating_task, | 920 | .data = &sysctl_oom_kill_allocating_task, |
918 | .maxlen = sizeof(sysctl_oom_kill_allocating_task), | 921 | .maxlen = sizeof(sysctl_oom_kill_allocating_task), |
919 | .mode = 0644, | 922 | .mode = 0644, |
920 | .proc_handler = &proc_dointvec, | 923 | .proc_handler = &proc_dointvec, |
921 | }, | 924 | }, |
922 | { | 925 | { |
923 | .ctl_name = CTL_UNNUMBERED, | 926 | .ctl_name = CTL_UNNUMBERED, |
924 | .procname = "oom_dump_tasks", | 927 | .procname = "oom_dump_tasks", |
925 | .data = &sysctl_oom_dump_tasks, | 928 | .data = &sysctl_oom_dump_tasks, |
926 | .maxlen = sizeof(sysctl_oom_dump_tasks), | 929 | .maxlen = sizeof(sysctl_oom_dump_tasks), |
927 | .mode = 0644, | 930 | .mode = 0644, |
928 | .proc_handler = &proc_dointvec, | 931 | .proc_handler = &proc_dointvec, |
929 | }, | 932 | }, |
930 | { | 933 | { |
931 | .ctl_name = VM_OVERCOMMIT_RATIO, | 934 | .ctl_name = VM_OVERCOMMIT_RATIO, |
932 | .procname = "overcommit_ratio", | 935 | .procname = "overcommit_ratio", |
933 | .data = &sysctl_overcommit_ratio, | 936 | .data = &sysctl_overcommit_ratio, |
934 | .maxlen = sizeof(sysctl_overcommit_ratio), | 937 | .maxlen = sizeof(sysctl_overcommit_ratio), |
935 | .mode = 0644, | 938 | .mode = 0644, |
936 | .proc_handler = &proc_dointvec, | 939 | .proc_handler = &proc_dointvec, |
937 | }, | 940 | }, |
938 | { | 941 | { |
939 | .ctl_name = VM_PAGE_CLUSTER, | 942 | .ctl_name = VM_PAGE_CLUSTER, |
940 | .procname = "page-cluster", | 943 | .procname = "page-cluster", |
941 | .data = &page_cluster, | 944 | .data = &page_cluster, |
942 | .maxlen = sizeof(int), | 945 | .maxlen = sizeof(int), |
943 | .mode = 0644, | 946 | .mode = 0644, |
944 | .proc_handler = &proc_dointvec, | 947 | .proc_handler = &proc_dointvec, |
945 | }, | 948 | }, |
946 | { | 949 | { |
947 | .ctl_name = VM_DIRTY_BACKGROUND, | 950 | .ctl_name = VM_DIRTY_BACKGROUND, |
948 | .procname = "dirty_background_ratio", | 951 | .procname = "dirty_background_ratio", |
949 | .data = &dirty_background_ratio, | 952 | .data = &dirty_background_ratio, |
950 | .maxlen = sizeof(dirty_background_ratio), | 953 | .maxlen = sizeof(dirty_background_ratio), |
951 | .mode = 0644, | 954 | .mode = 0644, |
952 | .proc_handler = &dirty_background_ratio_handler, | 955 | .proc_handler = &dirty_background_ratio_handler, |
953 | .strategy = &sysctl_intvec, | 956 | .strategy = &sysctl_intvec, |
954 | .extra1 = &zero, | 957 | .extra1 = &zero, |
955 | .extra2 = &one_hundred, | 958 | .extra2 = &one_hundred, |
956 | }, | 959 | }, |
957 | { | 960 | { |
958 | .ctl_name = CTL_UNNUMBERED, | 961 | .ctl_name = CTL_UNNUMBERED, |
959 | .procname = "dirty_background_bytes", | 962 | .procname = "dirty_background_bytes", |
960 | .data = &dirty_background_bytes, | 963 | .data = &dirty_background_bytes, |
961 | .maxlen = sizeof(dirty_background_bytes), | 964 | .maxlen = sizeof(dirty_background_bytes), |
962 | .mode = 0644, | 965 | .mode = 0644, |
963 | .proc_handler = &dirty_background_bytes_handler, | 966 | .proc_handler = &dirty_background_bytes_handler, |
964 | .strategy = &sysctl_intvec, | 967 | .strategy = &sysctl_intvec, |
965 | .extra1 = &one, | 968 | .extra1 = &one, |
966 | }, | 969 | }, |
967 | { | 970 | { |
968 | .ctl_name = VM_DIRTY_RATIO, | 971 | .ctl_name = VM_DIRTY_RATIO, |
969 | .procname = "dirty_ratio", | 972 | .procname = "dirty_ratio", |
970 | .data = &vm_dirty_ratio, | 973 | .data = &vm_dirty_ratio, |
971 | .maxlen = sizeof(vm_dirty_ratio), | 974 | .maxlen = sizeof(vm_dirty_ratio), |
972 | .mode = 0644, | 975 | .mode = 0644, |
973 | .proc_handler = &dirty_ratio_handler, | 976 | .proc_handler = &dirty_ratio_handler, |
974 | .strategy = &sysctl_intvec, | 977 | .strategy = &sysctl_intvec, |
975 | .extra1 = &zero, | 978 | .extra1 = &zero, |
976 | .extra2 = &one_hundred, | 979 | .extra2 = &one_hundred, |
977 | }, | 980 | }, |
978 | { | 981 | { |
979 | .ctl_name = CTL_UNNUMBERED, | 982 | .ctl_name = CTL_UNNUMBERED, |
980 | .procname = "dirty_bytes", | 983 | .procname = "dirty_bytes", |
981 | .data = &vm_dirty_bytes, | 984 | .data = &vm_dirty_bytes, |
982 | .maxlen = sizeof(vm_dirty_bytes), | 985 | .maxlen = sizeof(vm_dirty_bytes), |
983 | .mode = 0644, | 986 | .mode = 0644, |
984 | .proc_handler = &dirty_bytes_handler, | 987 | .proc_handler = &dirty_bytes_handler, |
985 | .strategy = &sysctl_intvec, | 988 | .strategy = &sysctl_intvec, |
986 | .extra1 = &one, | 989 | .extra1 = &one, |
987 | }, | 990 | }, |
988 | { | 991 | { |
989 | .procname = "dirty_writeback_centisecs", | 992 | .procname = "dirty_writeback_centisecs", |
990 | .data = &dirty_writeback_interval, | 993 | .data = &dirty_writeback_interval, |
991 | .maxlen = sizeof(dirty_writeback_interval), | 994 | .maxlen = sizeof(dirty_writeback_interval), |
992 | .mode = 0644, | 995 | .mode = 0644, |
993 | .proc_handler = &dirty_writeback_centisecs_handler, | 996 | .proc_handler = &dirty_writeback_centisecs_handler, |
994 | }, | 997 | }, |
995 | { | 998 | { |
996 | .procname = "dirty_expire_centisecs", | 999 | .procname = "dirty_expire_centisecs", |
997 | .data = &dirty_expire_interval, | 1000 | .data = &dirty_expire_interval, |
998 | .maxlen = sizeof(dirty_expire_interval), | 1001 | .maxlen = sizeof(dirty_expire_interval), |
999 | .mode = 0644, | 1002 | .mode = 0644, |
1000 | .proc_handler = &proc_dointvec_userhz_jiffies, | 1003 | .proc_handler = &proc_dointvec_userhz_jiffies, |
1001 | }, | 1004 | }, |
1002 | { | 1005 | { |
1003 | .ctl_name = VM_NR_PDFLUSH_THREADS, | 1006 | .ctl_name = VM_NR_PDFLUSH_THREADS, |
1004 | .procname = "nr_pdflush_threads", | 1007 | .procname = "nr_pdflush_threads", |
1005 | .data = &nr_pdflush_threads, | 1008 | .data = &nr_pdflush_threads, |
1006 | .maxlen = sizeof nr_pdflush_threads, | 1009 | .maxlen = sizeof nr_pdflush_threads, |
1007 | .mode = 0444 /* read-only*/, | 1010 | .mode = 0444 /* read-only*/, |
1008 | .proc_handler = &proc_dointvec, | 1011 | .proc_handler = &proc_dointvec, |
1009 | }, | 1012 | }, |
1010 | { | 1013 | { |
1011 | .ctl_name = VM_SWAPPINESS, | 1014 | .ctl_name = VM_SWAPPINESS, |
1012 | .procname = "swappiness", | 1015 | .procname = "swappiness", |
1013 | .data = &vm_swappiness, | 1016 | .data = &vm_swappiness, |
1014 | .maxlen = sizeof(vm_swappiness), | 1017 | .maxlen = sizeof(vm_swappiness), |
1015 | .mode = 0644, | 1018 | .mode = 0644, |
1016 | .proc_handler = &proc_dointvec_minmax, | 1019 | .proc_handler = &proc_dointvec_minmax, |
1017 | .strategy = &sysctl_intvec, | 1020 | .strategy = &sysctl_intvec, |
1018 | .extra1 = &zero, | 1021 | .extra1 = &zero, |
1019 | .extra2 = &one_hundred, | 1022 | .extra2 = &one_hundred, |
1020 | }, | 1023 | }, |
1021 | #ifdef CONFIG_HUGETLB_PAGE | 1024 | #ifdef CONFIG_HUGETLB_PAGE |
1022 | { | 1025 | { |
1023 | .procname = "nr_hugepages", | 1026 | .procname = "nr_hugepages", |
1024 | .data = NULL, | 1027 | .data = NULL, |
1025 | .maxlen = sizeof(unsigned long), | 1028 | .maxlen = sizeof(unsigned long), |
1026 | .mode = 0644, | 1029 | .mode = 0644, |
1027 | .proc_handler = &hugetlb_sysctl_handler, | 1030 | .proc_handler = &hugetlb_sysctl_handler, |
1028 | .extra1 = (void *)&hugetlb_zero, | 1031 | .extra1 = (void *)&hugetlb_zero, |
1029 | .extra2 = (void *)&hugetlb_infinity, | 1032 | .extra2 = (void *)&hugetlb_infinity, |
1030 | }, | 1033 | }, |
1031 | { | 1034 | { |
1032 | .ctl_name = VM_HUGETLB_GROUP, | 1035 | .ctl_name = VM_HUGETLB_GROUP, |
1033 | .procname = "hugetlb_shm_group", | 1036 | .procname = "hugetlb_shm_group", |
1034 | .data = &sysctl_hugetlb_shm_group, | 1037 | .data = &sysctl_hugetlb_shm_group, |
1035 | .maxlen = sizeof(gid_t), | 1038 | .maxlen = sizeof(gid_t), |
1036 | .mode = 0644, | 1039 | .mode = 0644, |
1037 | .proc_handler = &proc_dointvec, | 1040 | .proc_handler = &proc_dointvec, |
1038 | }, | 1041 | }, |
1039 | { | 1042 | { |
1040 | .ctl_name = CTL_UNNUMBERED, | 1043 | .ctl_name = CTL_UNNUMBERED, |
1041 | .procname = "hugepages_treat_as_movable", | 1044 | .procname = "hugepages_treat_as_movable", |
1042 | .data = &hugepages_treat_as_movable, | 1045 | .data = &hugepages_treat_as_movable, |
1043 | .maxlen = sizeof(int), | 1046 | .maxlen = sizeof(int), |
1044 | .mode = 0644, | 1047 | .mode = 0644, |
1045 | .proc_handler = &hugetlb_treat_movable_handler, | 1048 | .proc_handler = &hugetlb_treat_movable_handler, |
1046 | }, | 1049 | }, |
1047 | { | 1050 | { |
1048 | .ctl_name = CTL_UNNUMBERED, | 1051 | .ctl_name = CTL_UNNUMBERED, |
1049 | .procname = "nr_overcommit_hugepages", | 1052 | .procname = "nr_overcommit_hugepages", |
1050 | .data = NULL, | 1053 | .data = NULL, |
1051 | .maxlen = sizeof(unsigned long), | 1054 | .maxlen = sizeof(unsigned long), |
1052 | .mode = 0644, | 1055 | .mode = 0644, |
1053 | .proc_handler = &hugetlb_overcommit_handler, | 1056 | .proc_handler = &hugetlb_overcommit_handler, |
1054 | .extra1 = (void *)&hugetlb_zero, | 1057 | .extra1 = (void *)&hugetlb_zero, |
1055 | .extra2 = (void *)&hugetlb_infinity, | 1058 | .extra2 = (void *)&hugetlb_infinity, |
1056 | }, | 1059 | }, |
1057 | #endif | 1060 | #endif |
1058 | { | 1061 | { |
1059 | .ctl_name = VM_LOWMEM_RESERVE_RATIO, | 1062 | .ctl_name = VM_LOWMEM_RESERVE_RATIO, |
1060 | .procname = "lowmem_reserve_ratio", | 1063 | .procname = "lowmem_reserve_ratio", |
1061 | .data = &sysctl_lowmem_reserve_ratio, | 1064 | .data = &sysctl_lowmem_reserve_ratio, |
1062 | .maxlen = sizeof(sysctl_lowmem_reserve_ratio), | 1065 | .maxlen = sizeof(sysctl_lowmem_reserve_ratio), |
1063 | .mode = 0644, | 1066 | .mode = 0644, |
1064 | .proc_handler = &lowmem_reserve_ratio_sysctl_handler, | 1067 | .proc_handler = &lowmem_reserve_ratio_sysctl_handler, |
1065 | .strategy = &sysctl_intvec, | 1068 | .strategy = &sysctl_intvec, |
1066 | }, | 1069 | }, |
1067 | { | 1070 | { |
1068 | .ctl_name = VM_DROP_PAGECACHE, | 1071 | .ctl_name = VM_DROP_PAGECACHE, |
1069 | .procname = "drop_caches", | 1072 | .procname = "drop_caches", |
1070 | .data = &sysctl_drop_caches, | 1073 | .data = &sysctl_drop_caches, |
1071 | .maxlen = sizeof(int), | 1074 | .maxlen = sizeof(int), |
1072 | .mode = 0644, | 1075 | .mode = 0644, |
1073 | .proc_handler = drop_caches_sysctl_handler, | 1076 | .proc_handler = drop_caches_sysctl_handler, |
1074 | .strategy = &sysctl_intvec, | 1077 | .strategy = &sysctl_intvec, |
1075 | }, | 1078 | }, |
1076 | { | 1079 | { |
1077 | .ctl_name = VM_MIN_FREE_KBYTES, | 1080 | .ctl_name = VM_MIN_FREE_KBYTES, |
1078 | .procname = "min_free_kbytes", | 1081 | .procname = "min_free_kbytes", |
1079 | .data = &min_free_kbytes, | 1082 | .data = &min_free_kbytes, |
1080 | .maxlen = sizeof(min_free_kbytes), | 1083 | .maxlen = sizeof(min_free_kbytes), |
1081 | .mode = 0644, | 1084 | .mode = 0644, |
1082 | .proc_handler = &min_free_kbytes_sysctl_handler, | 1085 | .proc_handler = &min_free_kbytes_sysctl_handler, |
1083 | .strategy = &sysctl_intvec, | 1086 | .strategy = &sysctl_intvec, |
1084 | .extra1 = &zero, | 1087 | .extra1 = &zero, |
1085 | }, | 1088 | }, |
1086 | { | 1089 | { |
1087 | .ctl_name = VM_PERCPU_PAGELIST_FRACTION, | 1090 | .ctl_name = VM_PERCPU_PAGELIST_FRACTION, |
1088 | .procname = "percpu_pagelist_fraction", | 1091 | .procname = "percpu_pagelist_fraction", |
1089 | .data = &percpu_pagelist_fraction, | 1092 | .data = &percpu_pagelist_fraction, |
1090 | .maxlen = sizeof(percpu_pagelist_fraction), | 1093 | .maxlen = sizeof(percpu_pagelist_fraction), |
1091 | .mode = 0644, | 1094 | .mode = 0644, |
1092 | .proc_handler = &percpu_pagelist_fraction_sysctl_handler, | 1095 | .proc_handler = &percpu_pagelist_fraction_sysctl_handler, |
1093 | .strategy = &sysctl_intvec, | 1096 | .strategy = &sysctl_intvec, |
1094 | .extra1 = &min_percpu_pagelist_fract, | 1097 | .extra1 = &min_percpu_pagelist_fract, |
1095 | }, | 1098 | }, |
1096 | #ifdef CONFIG_MMU | 1099 | #ifdef CONFIG_MMU |
1097 | { | 1100 | { |
1098 | .ctl_name = VM_MAX_MAP_COUNT, | 1101 | .ctl_name = VM_MAX_MAP_COUNT, |
1099 | .procname = "max_map_count", | 1102 | .procname = "max_map_count", |
1100 | .data = &sysctl_max_map_count, | 1103 | .data = &sysctl_max_map_count, |
1101 | .maxlen = sizeof(sysctl_max_map_count), | 1104 | .maxlen = sizeof(sysctl_max_map_count), |
1102 | .mode = 0644, | 1105 | .mode = 0644, |
1103 | .proc_handler = &proc_dointvec | 1106 | .proc_handler = &proc_dointvec |
1107 | }, | ||
1108 | #else | ||
1109 | { | ||
1110 | .ctl_name = CTL_UNNUMBERED, | ||
1111 | .procname = "nr_trim_pages", | ||
1112 | .data = &sysctl_nr_trim_pages, | ||
1113 | .maxlen = sizeof(sysctl_nr_trim_pages), | ||
1114 | .mode = 0644, | ||
1115 | .proc_handler = &proc_dointvec_minmax, | ||
1116 | .strategy = &sysctl_intvec, | ||
1117 | .extra1 = &zero, | ||
1104 | }, | 1118 | }, |
1105 | #endif | 1119 | #endif |
1106 | { | 1120 | { |
1107 | .ctl_name = VM_LAPTOP_MODE, | 1121 | .ctl_name = VM_LAPTOP_MODE, |
1108 | .procname = "laptop_mode", | 1122 | .procname = "laptop_mode", |
1109 | .data = &laptop_mode, | 1123 | .data = &laptop_mode, |
1110 | .maxlen = sizeof(laptop_mode), | 1124 | .maxlen = sizeof(laptop_mode), |
1111 | .mode = 0644, | 1125 | .mode = 0644, |
1112 | .proc_handler = &proc_dointvec_jiffies, | 1126 | .proc_handler = &proc_dointvec_jiffies, |
1113 | .strategy = &sysctl_jiffies, | 1127 | .strategy = &sysctl_jiffies, |
1114 | }, | 1128 | }, |
1115 | { | 1129 | { |
1116 | .ctl_name = VM_BLOCK_DUMP, | 1130 | .ctl_name = VM_BLOCK_DUMP, |
1117 | .procname = "block_dump", | 1131 | .procname = "block_dump", |
1118 | .data = &block_dump, | 1132 | .data = &block_dump, |
1119 | .maxlen = sizeof(block_dump), | 1133 | .maxlen = sizeof(block_dump), |
1120 | .mode = 0644, | 1134 | .mode = 0644, |
1121 | .proc_handler = &proc_dointvec, | 1135 | .proc_handler = &proc_dointvec, |
1122 | .strategy = &sysctl_intvec, | 1136 | .strategy = &sysctl_intvec, |
1123 | .extra1 = &zero, | 1137 | .extra1 = &zero, |
1124 | }, | 1138 | }, |
1125 | { | 1139 | { |
1126 | .ctl_name = VM_VFS_CACHE_PRESSURE, | 1140 | .ctl_name = VM_VFS_CACHE_PRESSURE, |
1127 | .procname = "vfs_cache_pressure", | 1141 | .procname = "vfs_cache_pressure", |
1128 | .data = &sysctl_vfs_cache_pressure, | 1142 | .data = &sysctl_vfs_cache_pressure, |
1129 | .maxlen = sizeof(sysctl_vfs_cache_pressure), | 1143 | .maxlen = sizeof(sysctl_vfs_cache_pressure), |
1130 | .mode = 0644, | 1144 | .mode = 0644, |
1131 | .proc_handler = &proc_dointvec, | 1145 | .proc_handler = &proc_dointvec, |
1132 | .strategy = &sysctl_intvec, | 1146 | .strategy = &sysctl_intvec, |
1133 | .extra1 = &zero, | 1147 | .extra1 = &zero, |
1134 | }, | 1148 | }, |
1135 | #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT | 1149 | #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT |
1136 | { | 1150 | { |
1137 | .ctl_name = VM_LEGACY_VA_LAYOUT, | 1151 | .ctl_name = VM_LEGACY_VA_LAYOUT, |
1138 | .procname = "legacy_va_layout", | 1152 | .procname = "legacy_va_layout", |
1139 | .data = &sysctl_legacy_va_layout, | 1153 | .data = &sysctl_legacy_va_layout, |
1140 | .maxlen = sizeof(sysctl_legacy_va_layout), | 1154 | .maxlen = sizeof(sysctl_legacy_va_layout), |
1141 | .mode = 0644, | 1155 | .mode = 0644, |
1142 | .proc_handler = &proc_dointvec, | 1156 | .proc_handler = &proc_dointvec, |
1143 | .strategy = &sysctl_intvec, | 1157 | .strategy = &sysctl_intvec, |
1144 | .extra1 = &zero, | 1158 | .extra1 = &zero, |
1145 | }, | 1159 | }, |
1146 | #endif | 1160 | #endif |
1147 | #ifdef CONFIG_NUMA | 1161 | #ifdef CONFIG_NUMA |
1148 | { | 1162 | { |
1149 | .ctl_name = VM_ZONE_RECLAIM_MODE, | 1163 | .ctl_name = VM_ZONE_RECLAIM_MODE, |
1150 | .procname = "zone_reclaim_mode", | 1164 | .procname = "zone_reclaim_mode", |
1151 | .data = &zone_reclaim_mode, | 1165 | .data = &zone_reclaim_mode, |
1152 | .maxlen = sizeof(zone_reclaim_mode), | 1166 | .maxlen = sizeof(zone_reclaim_mode), |
1153 | .mode = 0644, | 1167 | .mode = 0644, |
1154 | .proc_handler = &proc_dointvec, | 1168 | .proc_handler = &proc_dointvec, |
1155 | .strategy = &sysctl_intvec, | 1169 | .strategy = &sysctl_intvec, |
1156 | .extra1 = &zero, | 1170 | .extra1 = &zero, |
1157 | }, | 1171 | }, |
1158 | { | 1172 | { |
1159 | .ctl_name = VM_MIN_UNMAPPED, | 1173 | .ctl_name = VM_MIN_UNMAPPED, |
1160 | .procname = "min_unmapped_ratio", | 1174 | .procname = "min_unmapped_ratio", |
1161 | .data = &sysctl_min_unmapped_ratio, | 1175 | .data = &sysctl_min_unmapped_ratio, |
1162 | .maxlen = sizeof(sysctl_min_unmapped_ratio), | 1176 | .maxlen = sizeof(sysctl_min_unmapped_ratio), |
1163 | .mode = 0644, | 1177 | .mode = 0644, |
1164 | .proc_handler = &sysctl_min_unmapped_ratio_sysctl_handler, | 1178 | .proc_handler = &sysctl_min_unmapped_ratio_sysctl_handler, |
1165 | .strategy = &sysctl_intvec, | 1179 | .strategy = &sysctl_intvec, |
1166 | .extra1 = &zero, | 1180 | .extra1 = &zero, |
1167 | .extra2 = &one_hundred, | 1181 | .extra2 = &one_hundred, |
1168 | }, | 1182 | }, |
1169 | { | 1183 | { |
1170 | .ctl_name = VM_MIN_SLAB, | 1184 | .ctl_name = VM_MIN_SLAB, |
1171 | .procname = "min_slab_ratio", | 1185 | .procname = "min_slab_ratio", |
1172 | .data = &sysctl_min_slab_ratio, | 1186 | .data = &sysctl_min_slab_ratio, |
1173 | .maxlen = sizeof(sysctl_min_slab_ratio), | 1187 | .maxlen = sizeof(sysctl_min_slab_ratio), |
1174 | .mode = 0644, | 1188 | .mode = 0644, |
1175 | .proc_handler = &sysctl_min_slab_ratio_sysctl_handler, | 1189 | .proc_handler = &sysctl_min_slab_ratio_sysctl_handler, |
1176 | .strategy = &sysctl_intvec, | 1190 | .strategy = &sysctl_intvec, |
1177 | .extra1 = &zero, | 1191 | .extra1 = &zero, |
1178 | .extra2 = &one_hundred, | 1192 | .extra2 = &one_hundred, |
1179 | }, | 1193 | }, |
1180 | #endif | 1194 | #endif |
1181 | #ifdef CONFIG_SMP | 1195 | #ifdef CONFIG_SMP |
1182 | { | 1196 | { |
1183 | .ctl_name = CTL_UNNUMBERED, | 1197 | .ctl_name = CTL_UNNUMBERED, |
1184 | .procname = "stat_interval", | 1198 | .procname = "stat_interval", |
1185 | .data = &sysctl_stat_interval, | 1199 | .data = &sysctl_stat_interval, |
1186 | .maxlen = sizeof(sysctl_stat_interval), | 1200 | .maxlen = sizeof(sysctl_stat_interval), |
1187 | .mode = 0644, | 1201 | .mode = 0644, |
1188 | .proc_handler = &proc_dointvec_jiffies, | 1202 | .proc_handler = &proc_dointvec_jiffies, |
1189 | .strategy = &sysctl_jiffies, | 1203 | .strategy = &sysctl_jiffies, |
1190 | }, | 1204 | }, |
1191 | #endif | 1205 | #endif |
1192 | #ifdef CONFIG_SECURITY | 1206 | #ifdef CONFIG_SECURITY |
1193 | { | 1207 | { |
1194 | .ctl_name = CTL_UNNUMBERED, | 1208 | .ctl_name = CTL_UNNUMBERED, |
1195 | .procname = "mmap_min_addr", | 1209 | .procname = "mmap_min_addr", |
1196 | .data = &mmap_min_addr, | 1210 | .data = &mmap_min_addr, |
1197 | .maxlen = sizeof(unsigned long), | 1211 | .maxlen = sizeof(unsigned long), |
1198 | .mode = 0644, | 1212 | .mode = 0644, |
1199 | .proc_handler = &proc_doulongvec_minmax, | 1213 | .proc_handler = &proc_doulongvec_minmax, |
1200 | }, | 1214 | }, |
1201 | #endif | 1215 | #endif |
1202 | #ifdef CONFIG_NUMA | 1216 | #ifdef CONFIG_NUMA |
1203 | { | 1217 | { |
1204 | .ctl_name = CTL_UNNUMBERED, | 1218 | .ctl_name = CTL_UNNUMBERED, |
1205 | .procname = "numa_zonelist_order", | 1219 | .procname = "numa_zonelist_order", |
1206 | .data = &numa_zonelist_order, | 1220 | .data = &numa_zonelist_order, |
1207 | .maxlen = NUMA_ZONELIST_ORDER_LEN, | 1221 | .maxlen = NUMA_ZONELIST_ORDER_LEN, |
1208 | .mode = 0644, | 1222 | .mode = 0644, |
1209 | .proc_handler = &numa_zonelist_order_handler, | 1223 | .proc_handler = &numa_zonelist_order_handler, |
1210 | .strategy = &sysctl_string, | 1224 | .strategy = &sysctl_string, |
1211 | }, | 1225 | }, |
1212 | #endif | 1226 | #endif |
1213 | #if (defined(CONFIG_X86_32) && !defined(CONFIG_UML))|| \ | 1227 | #if (defined(CONFIG_X86_32) && !defined(CONFIG_UML))|| \ |
1214 | (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) | 1228 | (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) |
1215 | { | 1229 | { |
1216 | .ctl_name = VM_VDSO_ENABLED, | 1230 | .ctl_name = VM_VDSO_ENABLED, |
1217 | .procname = "vdso_enabled", | 1231 | .procname = "vdso_enabled", |
1218 | .data = &vdso_enabled, | 1232 | .data = &vdso_enabled, |
1219 | .maxlen = sizeof(vdso_enabled), | 1233 | .maxlen = sizeof(vdso_enabled), |
1220 | .mode = 0644, | 1234 | .mode = 0644, |
1221 | .proc_handler = &proc_dointvec, | 1235 | .proc_handler = &proc_dointvec, |
1222 | .strategy = &sysctl_intvec, | 1236 | .strategy = &sysctl_intvec, |
1223 | .extra1 = &zero, | 1237 | .extra1 = &zero, |
1224 | }, | 1238 | }, |
1225 | #endif | 1239 | #endif |
1226 | #ifdef CONFIG_HIGHMEM | 1240 | #ifdef CONFIG_HIGHMEM |
1227 | { | 1241 | { |
1228 | .ctl_name = CTL_UNNUMBERED, | 1242 | .ctl_name = CTL_UNNUMBERED, |
1229 | .procname = "highmem_is_dirtyable", | 1243 | .procname = "highmem_is_dirtyable", |
1230 | .data = &vm_highmem_is_dirtyable, | 1244 | .data = &vm_highmem_is_dirtyable, |
1231 | .maxlen = sizeof(vm_highmem_is_dirtyable), | 1245 | .maxlen = sizeof(vm_highmem_is_dirtyable), |
1232 | .mode = 0644, | 1246 | .mode = 0644, |
1233 | .proc_handler = &proc_dointvec_minmax, | 1247 | .proc_handler = &proc_dointvec_minmax, |
1234 | .strategy = &sysctl_intvec, | 1248 | .strategy = &sysctl_intvec, |
1235 | .extra1 = &zero, | 1249 | .extra1 = &zero, |
1236 | .extra2 = &one, | 1250 | .extra2 = &one, |
1237 | }, | 1251 | }, |
1238 | #endif | 1252 | #endif |
1239 | /* | 1253 | /* |
1240 | * NOTE: do not add new entries to this table unless you have read | 1254 | * NOTE: do not add new entries to this table unless you have read |
1241 | * Documentation/sysctl/ctl_unnumbered.txt | 1255 | * Documentation/sysctl/ctl_unnumbered.txt |
1242 | */ | 1256 | */ |
1243 | { .ctl_name = 0 } | 1257 | { .ctl_name = 0 } |
1244 | }; | 1258 | }; |
1245 | 1259 | ||
#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE)
/*
 * Placeholder directory for /proc/sys/fs/binfmt_misc; the binfmt_misc
 * module populates it at runtime.  Only the terminator lives here.
 */
static struct ctl_table binfmt_misc_table[] = {
	{ .ctl_name = 0 }
};
#endif
1251 | 1265 | ||
1252 | static struct ctl_table fs_table[] = { | 1266 | static struct ctl_table fs_table[] = { |
1253 | { | 1267 | { |
1254 | .ctl_name = FS_NRINODE, | 1268 | .ctl_name = FS_NRINODE, |
1255 | .procname = "inode-nr", | 1269 | .procname = "inode-nr", |
1256 | .data = &inodes_stat, | 1270 | .data = &inodes_stat, |
1257 | .maxlen = 2*sizeof(int), | 1271 | .maxlen = 2*sizeof(int), |
1258 | .mode = 0444, | 1272 | .mode = 0444, |
1259 | .proc_handler = &proc_dointvec, | 1273 | .proc_handler = &proc_dointvec, |
1260 | }, | 1274 | }, |
1261 | { | 1275 | { |
1262 | .ctl_name = FS_STATINODE, | 1276 | .ctl_name = FS_STATINODE, |
1263 | .procname = "inode-state", | 1277 | .procname = "inode-state", |
1264 | .data = &inodes_stat, | 1278 | .data = &inodes_stat, |
1265 | .maxlen = 7*sizeof(int), | 1279 | .maxlen = 7*sizeof(int), |
1266 | .mode = 0444, | 1280 | .mode = 0444, |
1267 | .proc_handler = &proc_dointvec, | 1281 | .proc_handler = &proc_dointvec, |
1268 | }, | 1282 | }, |
1269 | { | 1283 | { |
1270 | .procname = "file-nr", | 1284 | .procname = "file-nr", |
1271 | .data = &files_stat, | 1285 | .data = &files_stat, |
1272 | .maxlen = 3*sizeof(int), | 1286 | .maxlen = 3*sizeof(int), |
1273 | .mode = 0444, | 1287 | .mode = 0444, |
1274 | .proc_handler = &proc_nr_files, | 1288 | .proc_handler = &proc_nr_files, |
1275 | }, | 1289 | }, |
1276 | { | 1290 | { |
1277 | .ctl_name = FS_MAXFILE, | 1291 | .ctl_name = FS_MAXFILE, |
1278 | .procname = "file-max", | 1292 | .procname = "file-max", |
1279 | .data = &files_stat.max_files, | 1293 | .data = &files_stat.max_files, |
1280 | .maxlen = sizeof(int), | 1294 | .maxlen = sizeof(int), |
1281 | .mode = 0644, | 1295 | .mode = 0644, |
1282 | .proc_handler = &proc_dointvec, | 1296 | .proc_handler = &proc_dointvec, |
1283 | }, | 1297 | }, |
1284 | { | 1298 | { |
1285 | .ctl_name = CTL_UNNUMBERED, | 1299 | .ctl_name = CTL_UNNUMBERED, |
1286 | .procname = "nr_open", | 1300 | .procname = "nr_open", |
1287 | .data = &sysctl_nr_open, | 1301 | .data = &sysctl_nr_open, |
1288 | .maxlen = sizeof(int), | 1302 | .maxlen = sizeof(int), |
1289 | .mode = 0644, | 1303 | .mode = 0644, |
1290 | .proc_handler = &proc_dointvec_minmax, | 1304 | .proc_handler = &proc_dointvec_minmax, |
1291 | .extra1 = &sysctl_nr_open_min, | 1305 | .extra1 = &sysctl_nr_open_min, |
1292 | .extra2 = &sysctl_nr_open_max, | 1306 | .extra2 = &sysctl_nr_open_max, |
1293 | }, | 1307 | }, |
1294 | { | 1308 | { |
1295 | .ctl_name = FS_DENTRY, | 1309 | .ctl_name = FS_DENTRY, |
1296 | .procname = "dentry-state", | 1310 | .procname = "dentry-state", |
1297 | .data = &dentry_stat, | 1311 | .data = &dentry_stat, |
1298 | .maxlen = 6*sizeof(int), | 1312 | .maxlen = 6*sizeof(int), |
1299 | .mode = 0444, | 1313 | .mode = 0444, |
1300 | .proc_handler = &proc_dointvec, | 1314 | .proc_handler = &proc_dointvec, |
1301 | }, | 1315 | }, |
1302 | { | 1316 | { |
1303 | .ctl_name = FS_OVERFLOWUID, | 1317 | .ctl_name = FS_OVERFLOWUID, |
1304 | .procname = "overflowuid", | 1318 | .procname = "overflowuid", |
1305 | .data = &fs_overflowuid, | 1319 | .data = &fs_overflowuid, |
1306 | .maxlen = sizeof(int), | 1320 | .maxlen = sizeof(int), |
1307 | .mode = 0644, | 1321 | .mode = 0644, |
1308 | .proc_handler = &proc_dointvec_minmax, | 1322 | .proc_handler = &proc_dointvec_minmax, |
1309 | .strategy = &sysctl_intvec, | 1323 | .strategy = &sysctl_intvec, |
1310 | .extra1 = &minolduid, | 1324 | .extra1 = &minolduid, |
1311 | .extra2 = &maxolduid, | 1325 | .extra2 = &maxolduid, |
1312 | }, | 1326 | }, |
1313 | { | 1327 | { |
1314 | .ctl_name = FS_OVERFLOWGID, | 1328 | .ctl_name = FS_OVERFLOWGID, |
1315 | .procname = "overflowgid", | 1329 | .procname = "overflowgid", |
1316 | .data = &fs_overflowgid, | 1330 | .data = &fs_overflowgid, |
1317 | .maxlen = sizeof(int), | 1331 | .maxlen = sizeof(int), |
1318 | .mode = 0644, | 1332 | .mode = 0644, |
1319 | .proc_handler = &proc_dointvec_minmax, | 1333 | .proc_handler = &proc_dointvec_minmax, |
1320 | .strategy = &sysctl_intvec, | 1334 | .strategy = &sysctl_intvec, |
1321 | .extra1 = &minolduid, | 1335 | .extra1 = &minolduid, |
1322 | .extra2 = &maxolduid, | 1336 | .extra2 = &maxolduid, |
1323 | }, | 1337 | }, |
1324 | #ifdef CONFIG_FILE_LOCKING | 1338 | #ifdef CONFIG_FILE_LOCKING |
1325 | { | 1339 | { |
1326 | .ctl_name = FS_LEASES, | 1340 | .ctl_name = FS_LEASES, |
1327 | .procname = "leases-enable", | 1341 | .procname = "leases-enable", |
1328 | .data = &leases_enable, | 1342 | .data = &leases_enable, |
1329 | .maxlen = sizeof(int), | 1343 | .maxlen = sizeof(int), |
1330 | .mode = 0644, | 1344 | .mode = 0644, |
1331 | .proc_handler = &proc_dointvec, | 1345 | .proc_handler = &proc_dointvec, |
1332 | }, | 1346 | }, |
1333 | #endif | 1347 | #endif |
1334 | #ifdef CONFIG_DNOTIFY | 1348 | #ifdef CONFIG_DNOTIFY |
1335 | { | 1349 | { |
1336 | .ctl_name = FS_DIR_NOTIFY, | 1350 | .ctl_name = FS_DIR_NOTIFY, |
1337 | .procname = "dir-notify-enable", | 1351 | .procname = "dir-notify-enable", |
1338 | .data = &dir_notify_enable, | 1352 | .data = &dir_notify_enable, |
1339 | .maxlen = sizeof(int), | 1353 | .maxlen = sizeof(int), |
1340 | .mode = 0644, | 1354 | .mode = 0644, |
1341 | .proc_handler = &proc_dointvec, | 1355 | .proc_handler = &proc_dointvec, |
1342 | }, | 1356 | }, |
1343 | #endif | 1357 | #endif |
1344 | #ifdef CONFIG_MMU | 1358 | #ifdef CONFIG_MMU |
1345 | #ifdef CONFIG_FILE_LOCKING | 1359 | #ifdef CONFIG_FILE_LOCKING |
1346 | { | 1360 | { |
1347 | .ctl_name = FS_LEASE_TIME, | 1361 | .ctl_name = FS_LEASE_TIME, |
1348 | .procname = "lease-break-time", | 1362 | .procname = "lease-break-time", |
1349 | .data = &lease_break_time, | 1363 | .data = &lease_break_time, |
1350 | .maxlen = sizeof(int), | 1364 | .maxlen = sizeof(int), |
1351 | .mode = 0644, | 1365 | .mode = 0644, |
1352 | .proc_handler = &proc_dointvec_minmax, | 1366 | .proc_handler = &proc_dointvec_minmax, |
1353 | .strategy = &sysctl_intvec, | 1367 | .strategy = &sysctl_intvec, |
1354 | .extra1 = &zero, | 1368 | .extra1 = &zero, |
1355 | .extra2 = &two, | 1369 | .extra2 = &two, |
1356 | }, | 1370 | }, |
1357 | #endif | 1371 | #endif |
1358 | #ifdef CONFIG_AIO | 1372 | #ifdef CONFIG_AIO |
1359 | { | 1373 | { |
1360 | .procname = "aio-nr", | 1374 | .procname = "aio-nr", |
1361 | .data = &aio_nr, | 1375 | .data = &aio_nr, |
1362 | .maxlen = sizeof(aio_nr), | 1376 | .maxlen = sizeof(aio_nr), |
1363 | .mode = 0444, | 1377 | .mode = 0444, |
1364 | .proc_handler = &proc_doulongvec_minmax, | 1378 | .proc_handler = &proc_doulongvec_minmax, |
1365 | }, | 1379 | }, |
1366 | { | 1380 | { |
1367 | .procname = "aio-max-nr", | 1381 | .procname = "aio-max-nr", |
1368 | .data = &aio_max_nr, | 1382 | .data = &aio_max_nr, |
1369 | .maxlen = sizeof(aio_max_nr), | 1383 | .maxlen = sizeof(aio_max_nr), |
1370 | .mode = 0644, | 1384 | .mode = 0644, |
1371 | .proc_handler = &proc_doulongvec_minmax, | 1385 | .proc_handler = &proc_doulongvec_minmax, |
1372 | }, | 1386 | }, |
1373 | #endif /* CONFIG_AIO */ | 1387 | #endif /* CONFIG_AIO */ |
1374 | #ifdef CONFIG_INOTIFY_USER | 1388 | #ifdef CONFIG_INOTIFY_USER |
1375 | { | 1389 | { |
1376 | .ctl_name = FS_INOTIFY, | 1390 | .ctl_name = FS_INOTIFY, |
1377 | .procname = "inotify", | 1391 | .procname = "inotify", |
1378 | .mode = 0555, | 1392 | .mode = 0555, |
1379 | .child = inotify_table, | 1393 | .child = inotify_table, |
1380 | }, | 1394 | }, |
1381 | #endif | 1395 | #endif |
1382 | #ifdef CONFIG_EPOLL | 1396 | #ifdef CONFIG_EPOLL |
1383 | { | 1397 | { |
1384 | .procname = "epoll", | 1398 | .procname = "epoll", |
1385 | .mode = 0555, | 1399 | .mode = 0555, |
1386 | .child = epoll_table, | 1400 | .child = epoll_table, |
1387 | }, | 1401 | }, |
1388 | #endif | 1402 | #endif |
1389 | #endif | 1403 | #endif |
1390 | { | 1404 | { |
1391 | .ctl_name = KERN_SETUID_DUMPABLE, | 1405 | .ctl_name = KERN_SETUID_DUMPABLE, |
1392 | .procname = "suid_dumpable", | 1406 | .procname = "suid_dumpable", |
1393 | .data = &suid_dumpable, | 1407 | .data = &suid_dumpable, |
1394 | .maxlen = sizeof(int), | 1408 | .maxlen = sizeof(int), |
1395 | .mode = 0644, | 1409 | .mode = 0644, |
1396 | .proc_handler = &proc_dointvec, | 1410 | .proc_handler = &proc_dointvec, |
1397 | }, | 1411 | }, |
1398 | #if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE) | 1412 | #if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE) |
1399 | { | 1413 | { |
1400 | .ctl_name = CTL_UNNUMBERED, | 1414 | .ctl_name = CTL_UNNUMBERED, |
1401 | .procname = "binfmt_misc", | 1415 | .procname = "binfmt_misc", |
1402 | .mode = 0555, | 1416 | .mode = 0555, |
1403 | .child = binfmt_misc_table, | 1417 | .child = binfmt_misc_table, |
1404 | }, | 1418 | }, |
1405 | #endif | 1419 | #endif |
1406 | /* | 1420 | /* |
1407 | * NOTE: do not add new entries to this table unless you have read | 1421 | * NOTE: do not add new entries to this table unless you have read |
1408 | * Documentation/sysctl/ctl_unnumbered.txt | 1422 | * Documentation/sysctl/ctl_unnumbered.txt |
1409 | */ | 1423 | */ |
1410 | { .ctl_name = 0 } | 1424 | { .ctl_name = 0 } |
1411 | }; | 1425 | }; |
1412 | 1426 | ||
1413 | static struct ctl_table debug_table[] = { | 1427 | static struct ctl_table debug_table[] = { |
1414 | #if defined(CONFIG_X86) || defined(CONFIG_PPC) | 1428 | #if defined(CONFIG_X86) || defined(CONFIG_PPC) |
1415 | { | 1429 | { |
1416 | .ctl_name = CTL_UNNUMBERED, | 1430 | .ctl_name = CTL_UNNUMBERED, |
1417 | .procname = "exception-trace", | 1431 | .procname = "exception-trace", |
1418 | .data = &show_unhandled_signals, | 1432 | .data = &show_unhandled_signals, |
1419 | .maxlen = sizeof(int), | 1433 | .maxlen = sizeof(int), |
1420 | .mode = 0644, | 1434 | .mode = 0644, |
1421 | .proc_handler = proc_dointvec | 1435 | .proc_handler = proc_dointvec |
1422 | }, | 1436 | }, |
1423 | #endif | 1437 | #endif |
1424 | { .ctl_name = 0 } | 1438 | { .ctl_name = 0 } |
1425 | }; | 1439 | }; |
1426 | 1440 | ||
1427 | static struct ctl_table dev_table[] = { | 1441 | static struct ctl_table dev_table[] = { |
1428 | { .ctl_name = 0 } | 1442 | { .ctl_name = 0 } |
1429 | }; | 1443 | }; |
1430 | 1444 | ||
1431 | static DEFINE_SPINLOCK(sysctl_lock); | 1445 | static DEFINE_SPINLOCK(sysctl_lock); |
1432 | 1446 | ||
1433 | /* called under sysctl_lock */ | 1447 | /* called under sysctl_lock */ |
1434 | static int use_table(struct ctl_table_header *p) | 1448 | static int use_table(struct ctl_table_header *p) |
1435 | { | 1449 | { |
1436 | if (unlikely(p->unregistering)) | 1450 | if (unlikely(p->unregistering)) |
1437 | return 0; | 1451 | return 0; |
1438 | p->used++; | 1452 | p->used++; |
1439 | return 1; | 1453 | return 1; |
1440 | } | 1454 | } |
1441 | 1455 | ||
1442 | /* called under sysctl_lock */ | 1456 | /* called under sysctl_lock */ |
1443 | static void unuse_table(struct ctl_table_header *p) | 1457 | static void unuse_table(struct ctl_table_header *p) |
1444 | { | 1458 | { |
1445 | if (!--p->used) | 1459 | if (!--p->used) |
1446 | if (unlikely(p->unregistering)) | 1460 | if (unlikely(p->unregistering)) |
1447 | complete(p->unregistering); | 1461 | complete(p->unregistering); |
1448 | } | 1462 | } |
1449 | 1463 | ||
1450 | /* called under sysctl_lock, will reacquire if has to wait */ | 1464 | /* called under sysctl_lock, will reacquire if has to wait */ |
1451 | static void start_unregistering(struct ctl_table_header *p) | 1465 | static void start_unregistering(struct ctl_table_header *p) |
1452 | { | 1466 | { |
1453 | /* | 1467 | /* |
1454 | * if p->used is 0, nobody will ever touch that entry again; | 1468 | * if p->used is 0, nobody will ever touch that entry again; |
1455 | * we'll eliminate all paths to it before dropping sysctl_lock | 1469 | * we'll eliminate all paths to it before dropping sysctl_lock |
1456 | */ | 1470 | */ |
1457 | if (unlikely(p->used)) { | 1471 | if (unlikely(p->used)) { |
1458 | struct completion wait; | 1472 | struct completion wait; |
1459 | init_completion(&wait); | 1473 | init_completion(&wait); |
1460 | p->unregistering = &wait; | 1474 | p->unregistering = &wait; |
1461 | spin_unlock(&sysctl_lock); | 1475 | spin_unlock(&sysctl_lock); |
1462 | wait_for_completion(&wait); | 1476 | wait_for_completion(&wait); |
1463 | spin_lock(&sysctl_lock); | 1477 | spin_lock(&sysctl_lock); |
1464 | } else { | 1478 | } else { |
1465 | /* anything non-NULL; we'll never dereference it */ | 1479 | /* anything non-NULL; we'll never dereference it */ |
1466 | p->unregistering = ERR_PTR(-EINVAL); | 1480 | p->unregistering = ERR_PTR(-EINVAL); |
1467 | } | 1481 | } |
1468 | /* | 1482 | /* |
1469 | * do not remove from the list until nobody holds it; walking the | 1483 | * do not remove from the list until nobody holds it; walking the |
1470 | * list in do_sysctl() relies on that. | 1484 | * list in do_sysctl() relies on that. |
1471 | */ | 1485 | */ |
1472 | list_del_init(&p->ctl_entry); | 1486 | list_del_init(&p->ctl_entry); |
1473 | } | 1487 | } |
1474 | 1488 | ||
1475 | void sysctl_head_get(struct ctl_table_header *head) | 1489 | void sysctl_head_get(struct ctl_table_header *head) |
1476 | { | 1490 | { |
1477 | spin_lock(&sysctl_lock); | 1491 | spin_lock(&sysctl_lock); |
1478 | head->count++; | 1492 | head->count++; |
1479 | spin_unlock(&sysctl_lock); | 1493 | spin_unlock(&sysctl_lock); |
1480 | } | 1494 | } |
1481 | 1495 | ||
1482 | void sysctl_head_put(struct ctl_table_header *head) | 1496 | void sysctl_head_put(struct ctl_table_header *head) |
1483 | { | 1497 | { |
1484 | spin_lock(&sysctl_lock); | 1498 | spin_lock(&sysctl_lock); |
1485 | if (!--head->count) | 1499 | if (!--head->count) |
1486 | kfree(head); | 1500 | kfree(head); |
1487 | spin_unlock(&sysctl_lock); | 1501 | spin_unlock(&sysctl_lock); |
1488 | } | 1502 | } |
1489 | 1503 | ||
1490 | struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head) | 1504 | struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head) |
1491 | { | 1505 | { |
1492 | if (!head) | 1506 | if (!head) |
1493 | BUG(); | 1507 | BUG(); |
1494 | spin_lock(&sysctl_lock); | 1508 | spin_lock(&sysctl_lock); |
1495 | if (!use_table(head)) | 1509 | if (!use_table(head)) |
1496 | head = ERR_PTR(-ENOENT); | 1510 | head = ERR_PTR(-ENOENT); |
1497 | spin_unlock(&sysctl_lock); | 1511 | spin_unlock(&sysctl_lock); |
1498 | return head; | 1512 | return head; |
1499 | } | 1513 | } |
1500 | 1514 | ||
1501 | void sysctl_head_finish(struct ctl_table_header *head) | 1515 | void sysctl_head_finish(struct ctl_table_header *head) |
1502 | { | 1516 | { |
1503 | if (!head) | 1517 | if (!head) |
1504 | return; | 1518 | return; |
1505 | spin_lock(&sysctl_lock); | 1519 | spin_lock(&sysctl_lock); |
1506 | unuse_table(head); | 1520 | unuse_table(head); |
1507 | spin_unlock(&sysctl_lock); | 1521 | spin_unlock(&sysctl_lock); |
1508 | } | 1522 | } |
1509 | 1523 | ||
1510 | static struct ctl_table_set * | 1524 | static struct ctl_table_set * |
1511 | lookup_header_set(struct ctl_table_root *root, struct nsproxy *namespaces) | 1525 | lookup_header_set(struct ctl_table_root *root, struct nsproxy *namespaces) |
1512 | { | 1526 | { |
1513 | struct ctl_table_set *set = &root->default_set; | 1527 | struct ctl_table_set *set = &root->default_set; |
1514 | if (root->lookup) | 1528 | if (root->lookup) |
1515 | set = root->lookup(root, namespaces); | 1529 | set = root->lookup(root, namespaces); |
1516 | return set; | 1530 | return set; |
1517 | } | 1531 | } |
1518 | 1532 | ||
1519 | static struct list_head * | 1533 | static struct list_head * |
1520 | lookup_header_list(struct ctl_table_root *root, struct nsproxy *namespaces) | 1534 | lookup_header_list(struct ctl_table_root *root, struct nsproxy *namespaces) |
1521 | { | 1535 | { |
1522 | struct ctl_table_set *set = lookup_header_set(root, namespaces); | 1536 | struct ctl_table_set *set = lookup_header_set(root, namespaces); |
1523 | return &set->list; | 1537 | return &set->list; |
1524 | } | 1538 | } |
1525 | 1539 | ||
1526 | struct ctl_table_header *__sysctl_head_next(struct nsproxy *namespaces, | 1540 | struct ctl_table_header *__sysctl_head_next(struct nsproxy *namespaces, |
1527 | struct ctl_table_header *prev) | 1541 | struct ctl_table_header *prev) |
1528 | { | 1542 | { |
1529 | struct ctl_table_root *root; | 1543 | struct ctl_table_root *root; |
1530 | struct list_head *header_list; | 1544 | struct list_head *header_list; |
1531 | struct ctl_table_header *head; | 1545 | struct ctl_table_header *head; |
1532 | struct list_head *tmp; | 1546 | struct list_head *tmp; |
1533 | 1547 | ||
1534 | spin_lock(&sysctl_lock); | 1548 | spin_lock(&sysctl_lock); |
1535 | if (prev) { | 1549 | if (prev) { |
1536 | head = prev; | 1550 | head = prev; |
1537 | tmp = &prev->ctl_entry; | 1551 | tmp = &prev->ctl_entry; |
1538 | unuse_table(prev); | 1552 | unuse_table(prev); |
1539 | goto next; | 1553 | goto next; |
1540 | } | 1554 | } |
1541 | tmp = &root_table_header.ctl_entry; | 1555 | tmp = &root_table_header.ctl_entry; |
1542 | for (;;) { | 1556 | for (;;) { |
1543 | head = list_entry(tmp, struct ctl_table_header, ctl_entry); | 1557 | head = list_entry(tmp, struct ctl_table_header, ctl_entry); |
1544 | 1558 | ||
1545 | if (!use_table(head)) | 1559 | if (!use_table(head)) |
1546 | goto next; | 1560 | goto next; |
1547 | spin_unlock(&sysctl_lock); | 1561 | spin_unlock(&sysctl_lock); |
1548 | return head; | 1562 | return head; |
1549 | next: | 1563 | next: |
1550 | root = head->root; | 1564 | root = head->root; |
1551 | tmp = tmp->next; | 1565 | tmp = tmp->next; |
1552 | header_list = lookup_header_list(root, namespaces); | 1566 | header_list = lookup_header_list(root, namespaces); |
1553 | if (tmp != header_list) | 1567 | if (tmp != header_list) |
1554 | continue; | 1568 | continue; |
1555 | 1569 | ||
1556 | do { | 1570 | do { |
1557 | root = list_entry(root->root_list.next, | 1571 | root = list_entry(root->root_list.next, |
1558 | struct ctl_table_root, root_list); | 1572 | struct ctl_table_root, root_list); |
1559 | if (root == &sysctl_table_root) | 1573 | if (root == &sysctl_table_root) |
1560 | goto out; | 1574 | goto out; |
1561 | header_list = lookup_header_list(root, namespaces); | 1575 | header_list = lookup_header_list(root, namespaces); |
1562 | } while (list_empty(header_list)); | 1576 | } while (list_empty(header_list)); |
1563 | tmp = header_list->next; | 1577 | tmp = header_list->next; |
1564 | } | 1578 | } |
1565 | out: | 1579 | out: |
1566 | spin_unlock(&sysctl_lock); | 1580 | spin_unlock(&sysctl_lock); |
1567 | return NULL; | 1581 | return NULL; |
1568 | } | 1582 | } |
1569 | 1583 | ||
1570 | struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev) | 1584 | struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev) |
1571 | { | 1585 | { |
1572 | return __sysctl_head_next(current->nsproxy, prev); | 1586 | return __sysctl_head_next(current->nsproxy, prev); |
1573 | } | 1587 | } |
1574 | 1588 | ||
1575 | void register_sysctl_root(struct ctl_table_root *root) | 1589 | void register_sysctl_root(struct ctl_table_root *root) |
1576 | { | 1590 | { |
1577 | spin_lock(&sysctl_lock); | 1591 | spin_lock(&sysctl_lock); |
1578 | list_add_tail(&root->root_list, &sysctl_table_root.root_list); | 1592 | list_add_tail(&root->root_list, &sysctl_table_root.root_list); |
1579 | spin_unlock(&sysctl_lock); | 1593 | spin_unlock(&sysctl_lock); |
1580 | } | 1594 | } |
1581 | 1595 | ||
1582 | #ifdef CONFIG_SYSCTL_SYSCALL | 1596 | #ifdef CONFIG_SYSCTL_SYSCALL |
1583 | /* Perform the actual read/write of a sysctl table entry. */ | 1597 | /* Perform the actual read/write of a sysctl table entry. */ |
1584 | static int do_sysctl_strategy(struct ctl_table_root *root, | 1598 | static int do_sysctl_strategy(struct ctl_table_root *root, |
1585 | struct ctl_table *table, | 1599 | struct ctl_table *table, |
1586 | void __user *oldval, size_t __user *oldlenp, | 1600 | void __user *oldval, size_t __user *oldlenp, |
1587 | void __user *newval, size_t newlen) | 1601 | void __user *newval, size_t newlen) |
1588 | { | 1602 | { |
1589 | int op = 0, rc; | 1603 | int op = 0, rc; |
1590 | 1604 | ||
1591 | if (oldval) | 1605 | if (oldval) |
1592 | op |= MAY_READ; | 1606 | op |= MAY_READ; |
1593 | if (newval) | 1607 | if (newval) |
1594 | op |= MAY_WRITE; | 1608 | op |= MAY_WRITE; |
1595 | if (sysctl_perm(root, table, op)) | 1609 | if (sysctl_perm(root, table, op)) |
1596 | return -EPERM; | 1610 | return -EPERM; |
1597 | 1611 | ||
1598 | if (table->strategy) { | 1612 | if (table->strategy) { |
1599 | rc = table->strategy(table, oldval, oldlenp, newval, newlen); | 1613 | rc = table->strategy(table, oldval, oldlenp, newval, newlen); |
1600 | if (rc < 0) | 1614 | if (rc < 0) |
1601 | return rc; | 1615 | return rc; |
1602 | if (rc > 0) | 1616 | if (rc > 0) |
1603 | return 0; | 1617 | return 0; |
1604 | } | 1618 | } |
1605 | 1619 | ||
1606 | /* If there is no strategy routine, or if the strategy returns | 1620 | /* If there is no strategy routine, or if the strategy returns |
1607 | * zero, proceed with automatic r/w */ | 1621 | * zero, proceed with automatic r/w */ |
1608 | if (table->data && table->maxlen) { | 1622 | if (table->data && table->maxlen) { |
1609 | rc = sysctl_data(table, oldval, oldlenp, newval, newlen); | 1623 | rc = sysctl_data(table, oldval, oldlenp, newval, newlen); |
1610 | if (rc < 0) | 1624 | if (rc < 0) |
1611 | return rc; | 1625 | return rc; |
1612 | } | 1626 | } |
1613 | return 0; | 1627 | return 0; |
1614 | } | 1628 | } |
1615 | 1629 | ||
1616 | static int parse_table(int __user *name, int nlen, | 1630 | static int parse_table(int __user *name, int nlen, |
1617 | void __user *oldval, size_t __user *oldlenp, | 1631 | void __user *oldval, size_t __user *oldlenp, |
1618 | void __user *newval, size_t newlen, | 1632 | void __user *newval, size_t newlen, |
1619 | struct ctl_table_root *root, | 1633 | struct ctl_table_root *root, |
1620 | struct ctl_table *table) | 1634 | struct ctl_table *table) |
1621 | { | 1635 | { |
1622 | int n; | 1636 | int n; |
1623 | repeat: | 1637 | repeat: |
1624 | if (!nlen) | 1638 | if (!nlen) |
1625 | return -ENOTDIR; | 1639 | return -ENOTDIR; |
1626 | if (get_user(n, name)) | 1640 | if (get_user(n, name)) |
1627 | return -EFAULT; | 1641 | return -EFAULT; |
1628 | for ( ; table->ctl_name || table->procname; table++) { | 1642 | for ( ; table->ctl_name || table->procname; table++) { |
1629 | if (!table->ctl_name) | 1643 | if (!table->ctl_name) |
1630 | continue; | 1644 | continue; |
1631 | if (n == table->ctl_name) { | 1645 | if (n == table->ctl_name) { |
1632 | int error; | 1646 | int error; |
1633 | if (table->child) { | 1647 | if (table->child) { |
1634 | if (sysctl_perm(root, table, MAY_EXEC)) | 1648 | if (sysctl_perm(root, table, MAY_EXEC)) |
1635 | return -EPERM; | 1649 | return -EPERM; |
1636 | name++; | 1650 | name++; |
1637 | nlen--; | 1651 | nlen--; |
1638 | table = table->child; | 1652 | table = table->child; |
1639 | goto repeat; | 1653 | goto repeat; |
1640 | } | 1654 | } |
1641 | error = do_sysctl_strategy(root, table, | 1655 | error = do_sysctl_strategy(root, table, |
1642 | oldval, oldlenp, | 1656 | oldval, oldlenp, |
1643 | newval, newlen); | 1657 | newval, newlen); |
1644 | return error; | 1658 | return error; |
1645 | } | 1659 | } |
1646 | } | 1660 | } |
1647 | return -ENOTDIR; | 1661 | return -ENOTDIR; |
1648 | } | 1662 | } |
1649 | 1663 | ||
1650 | int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, | 1664 | int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, |
1651 | void __user *newval, size_t newlen) | 1665 | void __user *newval, size_t newlen) |
1652 | { | 1666 | { |
1653 | struct ctl_table_header *head; | 1667 | struct ctl_table_header *head; |
1654 | int error = -ENOTDIR; | 1668 | int error = -ENOTDIR; |
1655 | 1669 | ||
1656 | if (nlen <= 0 || nlen >= CTL_MAXNAME) | 1670 | if (nlen <= 0 || nlen >= CTL_MAXNAME) |
1657 | return -ENOTDIR; | 1671 | return -ENOTDIR; |
1658 | if (oldval) { | 1672 | if (oldval) { |
1659 | int old_len; | 1673 | int old_len; |
1660 | if (!oldlenp || get_user(old_len, oldlenp)) | 1674 | if (!oldlenp || get_user(old_len, oldlenp)) |
1661 | return -EFAULT; | 1675 | return -EFAULT; |
1662 | } | 1676 | } |
1663 | 1677 | ||
1664 | for (head = sysctl_head_next(NULL); head; | 1678 | for (head = sysctl_head_next(NULL); head; |
1665 | head = sysctl_head_next(head)) { | 1679 | head = sysctl_head_next(head)) { |
1666 | error = parse_table(name, nlen, oldval, oldlenp, | 1680 | error = parse_table(name, nlen, oldval, oldlenp, |
1667 | newval, newlen, | 1681 | newval, newlen, |
1668 | head->root, head->ctl_table); | 1682 | head->root, head->ctl_table); |
1669 | if (error != -ENOTDIR) { | 1683 | if (error != -ENOTDIR) { |
1670 | sysctl_head_finish(head); | 1684 | sysctl_head_finish(head); |
1671 | break; | 1685 | break; |
1672 | } | 1686 | } |
1673 | } | 1687 | } |
1674 | return error; | 1688 | return error; |
1675 | } | 1689 | } |
1676 | 1690 | ||
1677 | asmlinkage long sys_sysctl(struct __sysctl_args __user *args) | 1691 | asmlinkage long sys_sysctl(struct __sysctl_args __user *args) |
1678 | { | 1692 | { |
1679 | struct __sysctl_args tmp; | 1693 | struct __sysctl_args tmp; |
1680 | int error; | 1694 | int error; |
1681 | 1695 | ||
1682 | if (copy_from_user(&tmp, args, sizeof(tmp))) | 1696 | if (copy_from_user(&tmp, args, sizeof(tmp))) |
1683 | return -EFAULT; | 1697 | return -EFAULT; |
1684 | 1698 | ||
1685 | error = deprecated_sysctl_warning(&tmp); | 1699 | error = deprecated_sysctl_warning(&tmp); |
1686 | if (error) | 1700 | if (error) |
1687 | goto out; | 1701 | goto out; |
1688 | 1702 | ||
1689 | lock_kernel(); | 1703 | lock_kernel(); |
1690 | error = do_sysctl(tmp.name, tmp.nlen, tmp.oldval, tmp.oldlenp, | 1704 | error = do_sysctl(tmp.name, tmp.nlen, tmp.oldval, tmp.oldlenp, |
1691 | tmp.newval, tmp.newlen); | 1705 | tmp.newval, tmp.newlen); |
1692 | unlock_kernel(); | 1706 | unlock_kernel(); |
1693 | out: | 1707 | out: |
1694 | return error; | 1708 | return error; |
1695 | } | 1709 | } |
1696 | #endif /* CONFIG_SYSCTL_SYSCALL */ | 1710 | #endif /* CONFIG_SYSCTL_SYSCALL */ |
1697 | 1711 | ||
1698 | /* | 1712 | /* |
1699 | * sysctl_perm does NOT grant the superuser all rights automatically, because | 1713 | * sysctl_perm does NOT grant the superuser all rights automatically, because |
1700 | * some sysctl variables are readonly even to root. | 1714 | * some sysctl variables are readonly even to root. |
1701 | */ | 1715 | */ |
1702 | 1716 | ||
1703 | static int test_perm(int mode, int op) | 1717 | static int test_perm(int mode, int op) |
1704 | { | 1718 | { |
1705 | if (!current_euid()) | 1719 | if (!current_euid()) |
1706 | mode >>= 6; | 1720 | mode >>= 6; |
1707 | else if (in_egroup_p(0)) | 1721 | else if (in_egroup_p(0)) |
1708 | mode >>= 3; | 1722 | mode >>= 3; |
1709 | if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0) | 1723 | if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0) |
1710 | return 0; | 1724 | return 0; |
1711 | return -EACCES; | 1725 | return -EACCES; |
1712 | } | 1726 | } |
1713 | 1727 | ||
1714 | int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op) | 1728 | int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op) |
1715 | { | 1729 | { |
1716 | int error; | 1730 | int error; |
1717 | int mode; | 1731 | int mode; |
1718 | 1732 | ||
1719 | error = security_sysctl(table, op & (MAY_READ | MAY_WRITE | MAY_EXEC)); | 1733 | error = security_sysctl(table, op & (MAY_READ | MAY_WRITE | MAY_EXEC)); |
1720 | if (error) | 1734 | if (error) |
1721 | return error; | 1735 | return error; |
1722 | 1736 | ||
1723 | if (root->permissions) | 1737 | if (root->permissions) |
1724 | mode = root->permissions(root, current->nsproxy, table); | 1738 | mode = root->permissions(root, current->nsproxy, table); |
1725 | else | 1739 | else |
1726 | mode = table->mode; | 1740 | mode = table->mode; |
1727 | 1741 | ||
1728 | return test_perm(mode, op); | 1742 | return test_perm(mode, op); |
1729 | } | 1743 | } |
1730 | 1744 | ||
1731 | static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table) | 1745 | static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table) |
1732 | { | 1746 | { |
1733 | for (; table->ctl_name || table->procname; table++) { | 1747 | for (; table->ctl_name || table->procname; table++) { |
1734 | table->parent = parent; | 1748 | table->parent = parent; |
1735 | if (table->child) | 1749 | if (table->child) |
1736 | sysctl_set_parent(table, table->child); | 1750 | sysctl_set_parent(table, table->child); |
1737 | } | 1751 | } |
1738 | } | 1752 | } |
1739 | 1753 | ||
1740 | static __init int sysctl_init(void) | 1754 | static __init int sysctl_init(void) |
1741 | { | 1755 | { |
1742 | sysctl_set_parent(NULL, root_table); | 1756 | sysctl_set_parent(NULL, root_table); |
1743 | #ifdef CONFIG_SYSCTL_SYSCALL_CHECK | 1757 | #ifdef CONFIG_SYSCTL_SYSCALL_CHECK |
1744 | { | 1758 | { |
1745 | int err; | 1759 | int err; |
1746 | err = sysctl_check_table(current->nsproxy, root_table); | 1760 | err = sysctl_check_table(current->nsproxy, root_table); |
1747 | } | 1761 | } |
1748 | #endif | 1762 | #endif |
1749 | return 0; | 1763 | return 0; |
1750 | } | 1764 | } |
1751 | 1765 | ||
1752 | core_initcall(sysctl_init); | 1766 | core_initcall(sysctl_init); |
1753 | 1767 | ||
1754 | static struct ctl_table *is_branch_in(struct ctl_table *branch, | 1768 | static struct ctl_table *is_branch_in(struct ctl_table *branch, |
1755 | struct ctl_table *table) | 1769 | struct ctl_table *table) |
1756 | { | 1770 | { |
1757 | struct ctl_table *p; | 1771 | struct ctl_table *p; |
1758 | const char *s = branch->procname; | 1772 | const char *s = branch->procname; |
1759 | 1773 | ||
1760 | /* branch should have named subdirectory as its first element */ | 1774 | /* branch should have named subdirectory as its first element */ |
1761 | if (!s || !branch->child) | 1775 | if (!s || !branch->child) |
1762 | return NULL; | 1776 | return NULL; |
1763 | 1777 | ||
1764 | /* ... and nothing else */ | 1778 | /* ... and nothing else */ |
1765 | if (branch[1].procname || branch[1].ctl_name) | 1779 | if (branch[1].procname || branch[1].ctl_name) |
1766 | return NULL; | 1780 | return NULL; |
1767 | 1781 | ||
1768 | /* table should contain subdirectory with the same name */ | 1782 | /* table should contain subdirectory with the same name */ |
1769 | for (p = table; p->procname || p->ctl_name; p++) { | 1783 | for (p = table; p->procname || p->ctl_name; p++) { |
1770 | if (!p->child) | 1784 | if (!p->child) |
1771 | continue; | 1785 | continue; |
1772 | if (p->procname && strcmp(p->procname, s) == 0) | 1786 | if (p->procname && strcmp(p->procname, s) == 0) |
1773 | return p; | 1787 | return p; |
1774 | } | 1788 | } |
1775 | return NULL; | 1789 | return NULL; |
1776 | } | 1790 | } |
1777 | 1791 | ||
/* see if attaching q to p would be an improvement */
static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q)
{
	struct ctl_table *to = p->ctl_table, *by = q->ctl_table;
	struct ctl_table *next;
	int is_better = 0;
	/* if p has no attachment point of its own, any descent qualifies */
	int not_in_parent = !p->attached_by;

	/*
	 * Walk matching single-subdirectory prefixes of q's table down
	 * p's table, noting whether we pass each header's current
	 * attachment point along the way.
	 */
	while ((next = is_branch_in(by, to)) != NULL) {
		if (by == q->attached_by)
			is_better = 1;
		if (to == p->attached_by)
			not_in_parent = 1;
		by = by->child;
		to = next->child;
	}

	/* adopt the deeper attachment point found under p */
	if (is_better && not_in_parent) {
		q->attached_by = by;
		q->attached_to = to;
		q->parent = p;
	}
}
1801 | 1815 | ||
1802 | /** | 1816 | /** |
1803 | * __register_sysctl_paths - register a sysctl hierarchy | 1817 | * __register_sysctl_paths - register a sysctl hierarchy |
1804 | * @root: List of sysctl headers to register on | 1818 | * @root: List of sysctl headers to register on |
1805 | * @namespaces: Data to compute which lists of sysctl entries are visible | 1819 | * @namespaces: Data to compute which lists of sysctl entries are visible |
1806 | * @path: The path to the directory the sysctl table is in. | 1820 | * @path: The path to the directory the sysctl table is in. |
1807 | * @table: the top-level table structure | 1821 | * @table: the top-level table structure |
1808 | * | 1822 | * |
1809 | * Register a sysctl table hierarchy. @table should be a filled in ctl_table | 1823 | * Register a sysctl table hierarchy. @table should be a filled in ctl_table |
1810 | * array. A completely 0 filled entry terminates the table. | 1824 | * array. A completely 0 filled entry terminates the table. |
1811 | * | 1825 | * |
1812 | * The members of the &struct ctl_table structure are used as follows: | 1826 | * The members of the &struct ctl_table structure are used as follows: |
1813 | * | 1827 | * |
1814 | * ctl_name - This is the numeric sysctl value used by sysctl(2). The number | 1828 | * ctl_name - This is the numeric sysctl value used by sysctl(2). The number |
1815 | * must be unique within that level of sysctl | 1829 | * must be unique within that level of sysctl |
1816 | * | 1830 | * |
1817 | * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not | 1831 | * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not |
1818 | * enter a sysctl file | 1832 | * enter a sysctl file |
1819 | * | 1833 | * |
1820 | * data - a pointer to data for use by proc_handler | 1834 | * data - a pointer to data for use by proc_handler |
1821 | * | 1835 | * |
1822 | * maxlen - the maximum size in bytes of the data | 1836 | * maxlen - the maximum size in bytes of the data |
1823 | * | 1837 | * |
1824 | * mode - the file permissions for the /proc/sys file, and for sysctl(2) | 1838 | * mode - the file permissions for the /proc/sys file, and for sysctl(2) |
1825 | * | 1839 | * |
1826 | * child - a pointer to the child sysctl table if this entry is a directory, or | 1840 | * child - a pointer to the child sysctl table if this entry is a directory, or |
1827 | * %NULL. | 1841 | * %NULL. |
1828 | * | 1842 | * |
1829 | * proc_handler - the text handler routine (described below) | 1843 | * proc_handler - the text handler routine (described below) |
1830 | * | 1844 | * |
1831 | * strategy - the strategy routine (described below) | 1845 | * strategy - the strategy routine (described below) |
1832 | * | 1846 | * |
1833 | * de - for internal use by the sysctl routines | 1847 | * de - for internal use by the sysctl routines |
1834 | * | 1848 | * |
1835 | * extra1, extra2 - extra pointers usable by the proc handler routines | 1849 | * extra1, extra2 - extra pointers usable by the proc handler routines |
1836 | * | 1850 | * |
1837 | * Leaf nodes in the sysctl tree will be represented by a single file | 1851 | * Leaf nodes in the sysctl tree will be represented by a single file |
1838 | * under /proc; non-leaf nodes will be represented by directories. | 1852 | * under /proc; non-leaf nodes will be represented by directories. |
1839 | * | 1853 | * |
1840 | * sysctl(2) can automatically manage read and write requests through | 1854 | * sysctl(2) can automatically manage read and write requests through |
1841 | * the sysctl table. The data and maxlen fields of the ctl_table | 1855 | * the sysctl table. The data and maxlen fields of the ctl_table |
1842 | * struct enable minimal validation of the values being written to be | 1856 | * struct enable minimal validation of the values being written to be |
1843 | * performed, and the mode field allows minimal authentication. | 1857 | * performed, and the mode field allows minimal authentication. |
1844 | * | 1858 | * |
1845 | * More sophisticated management can be enabled by the provision of a | 1859 | * More sophisticated management can be enabled by the provision of a |
1846 | * strategy routine with the table entry. This will be called before | 1860 | * strategy routine with the table entry. This will be called before |
1847 | * any automatic read or write of the data is performed. | 1861 | * any automatic read or write of the data is performed. |
1848 | * | 1862 | * |
1849 | * The strategy routine may return | 1863 | * The strategy routine may return |
1850 | * | 1864 | * |
1851 | * < 0 - Error occurred (error is passed to user process) | 1865 | * < 0 - Error occurred (error is passed to user process) |
1852 | * | 1866 | * |
1853 | * 0 - OK - proceed with automatic read or write. | 1867 | * 0 - OK - proceed with automatic read or write. |
1854 | * | 1868 | * |
1855 | * > 0 - OK - read or write has been done by the strategy routine, so | 1869 | * > 0 - OK - read or write has been done by the strategy routine, so |
1856 | * return immediately. | 1870 | * return immediately. |
1857 | * | 1871 | * |
1858 | * There must be a proc_handler routine for any terminal nodes | 1872 | * There must be a proc_handler routine for any terminal nodes |
1859 | * mirrored under /proc/sys (non-terminals are handled by a built-in | 1873 | * mirrored under /proc/sys (non-terminals are handled by a built-in |
1860 | * directory handler). Several default handlers are available to | 1874 | * directory handler). Several default handlers are available to |
1861 | * cover common cases - | 1875 | * cover common cases - |
1862 | * | 1876 | * |
1863 | * proc_dostring(), proc_dointvec(), proc_dointvec_jiffies(), | 1877 | * proc_dostring(), proc_dointvec(), proc_dointvec_jiffies(), |
1864 | * proc_dointvec_userhz_jiffies(), proc_dointvec_minmax(), | 1878 | * proc_dointvec_userhz_jiffies(), proc_dointvec_minmax(), |
1865 | * proc_doulongvec_ms_jiffies_minmax(), proc_doulongvec_minmax() | 1879 | * proc_doulongvec_ms_jiffies_minmax(), proc_doulongvec_minmax() |
1866 | * | 1880 | * |
1867 | * It is the handler's job to read the input buffer from user memory | 1881 | * It is the handler's job to read the input buffer from user memory |
1868 | * and process it. The handler should return 0 on success. | 1882 | * and process it. The handler should return 0 on success. |
1869 | * | 1883 | * |
1870 | * This routine returns %NULL on a failure to register, and a pointer | 1884 | * This routine returns %NULL on a failure to register, and a pointer |
1871 | * to the table header on success. | 1885 | * to the table header on success. |
1872 | */ | 1886 | */ |
struct ctl_table_header *__register_sysctl_paths(
	struct ctl_table_root *root,
	struct nsproxy *namespaces,
	const struct ctl_path *path, struct ctl_table *table)
{
	struct ctl_table_header *header;
	struct ctl_table *new, **prevp;
	unsigned int n, npath;
	struct ctl_table_set *set;

	/* Count the path components */
	for (npath = 0; path[npath].ctl_name || path[npath].procname; ++npath)
		;

	/*
	 * For each path component, allocate a 2-element ctl_table array.
	 * The first array element will be filled with the sysctl entry
	 * for this, the second will be the sentinel (ctl_name == 0).
	 *
	 * We allocate everything in one go so that we don't have to
	 * worry about freeing additional memory in unregister_sysctl_table.
	 */
	header = kzalloc(sizeof(struct ctl_table_header) +
			 (2 * npath * sizeof(struct ctl_table)), GFP_KERNEL);
	if (!header)
		return NULL;

	/* the per-component tables live right after the header allocation */
	new = (struct ctl_table *) (header + 1);

	/* Now connect the dots */
	prevp = &header->ctl_table;
	for (n = 0; n < npath; ++n, ++path) {
		/* Copy the procname */
		new->procname = path->procname;
		new->ctl_name = path->ctl_name;
		new->mode = 0555;	/* path components are directories */

		*prevp = new;
		prevp = &new->child;

		new += 2;	/* step over this entry and its sentinel */
	}
	/* hang the caller's table off the deepest path component */
	*prevp = table;
	header->ctl_table_arg = table;

	INIT_LIST_HEAD(&header->ctl_entry);
	header->used = 0;
	header->unregistering = NULL;
	header->root = root;
	sysctl_set_parent(NULL, header->ctl_table);
	header->count = 1;
#ifdef CONFIG_SYSCTL_SYSCALL_CHECK
	if (sysctl_check_table(namespaces, header->ctl_table)) {
		kfree(header);
		return NULL;
	}
#endif
	spin_lock(&sysctl_lock);
	header->set = lookup_header_set(root, namespaces);
	/* provisionally attach directly below the root table ... */
	header->attached_by = header->ctl_table;
	header->attached_to = root_table;
	header->parent = &root_table_header;
	/* ... then search the visible headers for a deeper attachment */
	for (set = header->set; set; set = set->parent) {
		struct ctl_table_header *p;
		list_for_each_entry(p, &set->list, ctl_entry) {
			if (p->unregistering)
				continue;
			try_attach(p, header);
		}
	}
	/* pin whichever parent try_attach() settled on */
	header->parent->count++;
	list_add_tail(&header->ctl_entry, &header->set->list);
	spin_unlock(&sysctl_lock);

	return header;
}
1949 | 1963 | ||
1950 | /** | 1964 | /** |
1951 | * register_sysctl_table_path - register a sysctl table hierarchy | 1965 | * register_sysctl_table_path - register a sysctl table hierarchy |
1952 | * @path: The path to the directory the sysctl table is in. | 1966 | * @path: The path to the directory the sysctl table is in. |
1953 | * @table: the top-level table structure | 1967 | * @table: the top-level table structure |
1954 | * | 1968 | * |
1955 | * Register a sysctl table hierarchy. @table should be a filled in ctl_table | 1969 | * Register a sysctl table hierarchy. @table should be a filled in ctl_table |
1956 | * array. A completely 0 filled entry terminates the table. | 1970 | * array. A completely 0 filled entry terminates the table. |
1957 | * | 1971 | * |
1958 | * See __register_sysctl_paths for more details. | 1972 | * See __register_sysctl_paths for more details. |
1959 | */ | 1973 | */ |
1960 | struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, | 1974 | struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, |
1961 | struct ctl_table *table) | 1975 | struct ctl_table *table) |
1962 | { | 1976 | { |
1963 | return __register_sysctl_paths(&sysctl_table_root, current->nsproxy, | 1977 | return __register_sysctl_paths(&sysctl_table_root, current->nsproxy, |
1964 | path, table); | 1978 | path, table); |
1965 | } | 1979 | } |
1966 | 1980 | ||
1967 | /** | 1981 | /** |
1968 | * register_sysctl_table - register a sysctl table hierarchy | 1982 | * register_sysctl_table - register a sysctl table hierarchy |
1969 | * @table: the top-level table structure | 1983 | * @table: the top-level table structure |
1970 | * | 1984 | * |
1971 | * Register a sysctl table hierarchy. @table should be a filled in ctl_table | 1985 | * Register a sysctl table hierarchy. @table should be a filled in ctl_table |
1972 | * array. A completely 0 filled entry terminates the table. | 1986 | * array. A completely 0 filled entry terminates the table. |
1973 | * | 1987 | * |
1974 | * See register_sysctl_paths for more details. | 1988 | * See register_sysctl_paths for more details. |
1975 | */ | 1989 | */ |
1976 | struct ctl_table_header *register_sysctl_table(struct ctl_table *table) | 1990 | struct ctl_table_header *register_sysctl_table(struct ctl_table *table) |
1977 | { | 1991 | { |
1978 | static const struct ctl_path null_path[] = { {} }; | 1992 | static const struct ctl_path null_path[] = { {} }; |
1979 | 1993 | ||
1980 | return register_sysctl_paths(null_path, table); | 1994 | return register_sysctl_paths(null_path, table); |
1981 | } | 1995 | } |
1982 | 1996 | ||
1983 | /** | 1997 | /** |
1984 | * unregister_sysctl_table - unregister a sysctl table hierarchy | 1998 | * unregister_sysctl_table - unregister a sysctl table hierarchy |
1985 | * @header: the header returned from register_sysctl_table | 1999 | * @header: the header returned from register_sysctl_table |
1986 | * | 2000 | * |
1987 | * Unregisters the sysctl table and all children. proc entries may not | 2001 | * Unregisters the sysctl table and all children. proc entries may not |
1988 | * actually be removed until they are no longer used by anyone. | 2002 | * actually be removed until they are no longer used by anyone. |
1989 | */ | 2003 | */ |
void unregister_sysctl_table(struct ctl_table_header * header)
{
	might_sleep();

	/* unregistering a NULL header is a no-op */
	if (header == NULL)
		return;

	spin_lock(&sysctl_lock);
	/* mark the header dying and wait for current users to drain */
	start_unregistering(header);
	/* drop the reference taken on the parent at registration time */
	if (!--header->parent->count) {
		/* a parent should always outlive its children */
		WARN_ON(1);
		kfree(header->parent);
	}
	if (!--header->count)
		kfree(header);
	spin_unlock(&sysctl_lock);
}
2007 | 2021 | ||
2008 | int sysctl_is_seen(struct ctl_table_header *p) | 2022 | int sysctl_is_seen(struct ctl_table_header *p) |
2009 | { | 2023 | { |
2010 | struct ctl_table_set *set = p->set; | 2024 | struct ctl_table_set *set = p->set; |
2011 | int res; | 2025 | int res; |
2012 | spin_lock(&sysctl_lock); | 2026 | spin_lock(&sysctl_lock); |
2013 | if (p->unregistering) | 2027 | if (p->unregistering) |
2014 | res = 0; | 2028 | res = 0; |
2015 | else if (!set->is_seen) | 2029 | else if (!set->is_seen) |
2016 | res = 1; | 2030 | res = 1; |
2017 | else | 2031 | else |
2018 | res = set->is_seen(set); | 2032 | res = set->is_seen(set); |
2019 | spin_unlock(&sysctl_lock); | 2033 | spin_unlock(&sysctl_lock); |
2020 | return res; | 2034 | return res; |
2021 | } | 2035 | } |
2022 | 2036 | ||
2023 | void setup_sysctl_set(struct ctl_table_set *p, | 2037 | void setup_sysctl_set(struct ctl_table_set *p, |
2024 | struct ctl_table_set *parent, | 2038 | struct ctl_table_set *parent, |
2025 | int (*is_seen)(struct ctl_table_set *)) | 2039 | int (*is_seen)(struct ctl_table_set *)) |
2026 | { | 2040 | { |
2027 | INIT_LIST_HEAD(&p->list); | 2041 | INIT_LIST_HEAD(&p->list); |
2028 | p->parent = parent ? parent : &sysctl_table_root.default_set; | 2042 | p->parent = parent ? parent : &sysctl_table_root.default_set; |
2029 | p->is_seen = is_seen; | 2043 | p->is_seen = is_seen; |
2030 | } | 2044 | } |
2031 | 2045 | ||
#else /* !CONFIG_SYSCTL */
/*
 * Stub implementations used when sysctl support is compiled out:
 * registration reports failure by returning NULL; everything else
 * is a no-op.
 */
struct ctl_table_header *register_sysctl_table(struct ctl_table * table)
{
	return NULL;
}

struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
						struct ctl_table *table)
{
	return NULL;
}

void unregister_sysctl_table(struct ctl_table_header * table)
{
}

void setup_sysctl_set(struct ctl_table_set *p,
		      struct ctl_table_set *parent,
		      int (*is_seen)(struct ctl_table_set *))
{
}

void sysctl_head_put(struct ctl_table_header *head)
{
}

#endif /* CONFIG_SYSCTL */
2059 | 2073 | ||
2060 | /* | 2074 | /* |
2061 | * /proc/sys support | 2075 | * /proc/sys support |
2062 | */ | 2076 | */ |
2063 | 2077 | ||
2064 | #ifdef CONFIG_PROC_SYSCTL | 2078 | #ifdef CONFIG_PROC_SYSCTL |
2065 | 2079 | ||
/*
 * Transfer a string between a kernel buffer and userspace for a sysctl.
 *
 * On write: copy at most maxlen-1 bytes from @buffer (stopping at NUL or
 * newline), NUL-terminate @data, and consume the whole user buffer.
 * On read: copy the string starting at *ppos and append a '\n' if the
 * user buffer has room.  *lenp is updated to the bytes transferred.
 * Returns 0 on success or -EFAULT on a userspace access failure.
 */
static int _proc_do_string(void* data, int maxlen, int write,
			   struct file *filp, void __user *buffer,
			   size_t *lenp, loff_t *ppos)
{
	size_t len;
	char __user *p;
	char c;

	/* no kernel buffer, or an empty request: transfer nothing */
	if (!data || !maxlen || !*lenp) {
		*lenp = 0;
		return 0;
	}

	if (write) {
		/* measure the incoming string up to NUL or newline */
		len = 0;
		p = buffer;
		while (len < *lenp) {
			if (get_user(c, p++))
				return -EFAULT;
			if (c == 0 || c == '\n')
				break;
			len++;
		}
		/* truncate to leave room for the terminating NUL */
		if (len >= maxlen)
			len = maxlen-1;
		if(copy_from_user(data, buffer, len))
			return -EFAULT;
		((char *) data)[len] = 0;
		/* the whole user buffer counts as consumed, even if truncated */
		*ppos += *lenp;
	} else {
		len = strlen(data);
		if (len > maxlen)
			len = maxlen;

		/* position already past the end: nothing left to read */
		if (*ppos > len) {
			*lenp = 0;
			return 0;
		}

		/* resume from the current file position */
		data += *ppos;
		len -= *ppos;

		if (len > *lenp)
			len = *lenp;
		if (len)
			if(copy_to_user(buffer, data, len))
				return -EFAULT;
		/* append a trailing newline when there is room for it */
		if (len < *lenp) {
			if(put_user('\n', ((char __user *) buffer) + len))
				return -EFAULT;
			len++;
		}
		*lenp = len;
		*ppos += len;
	}
	return 0;
}
2123 | 2137 | ||
2124 | /** | 2138 | /** |
2125 | * proc_dostring - read a string sysctl | 2139 | * proc_dostring - read a string sysctl |
2126 | * @table: the sysctl table | 2140 | * @table: the sysctl table |
2127 | * @write: %TRUE if this is a write to the sysctl file | 2141 | * @write: %TRUE if this is a write to the sysctl file |
2128 | * @filp: the file structure | 2142 | * @filp: the file structure |
2129 | * @buffer: the user buffer | 2143 | * @buffer: the user buffer |
2130 | * @lenp: the size of the user buffer | 2144 | * @lenp: the size of the user buffer |
2131 | * @ppos: file position | 2145 | * @ppos: file position |
2132 | * | 2146 | * |
2133 | * Reads/writes a string from/to the user buffer. If the kernel | 2147 | * Reads/writes a string from/to the user buffer. If the kernel |
2134 | * buffer provided is not large enough to hold the string, the | 2148 | * buffer provided is not large enough to hold the string, the |
2135 | * string is truncated. The copied string is %NULL-terminated. | 2149 | * string is truncated. The copied string is %NULL-terminated. |
2136 | * If the string is being read by the user process, it is copied | 2150 | * If the string is being read by the user process, it is copied |
2137 | * and a newline '\n' is added. It is truncated if the buffer is | 2151 | * and a newline '\n' is added. It is truncated if the buffer is |
2138 | * not large enough. | 2152 | * not large enough. |
2139 | * | 2153 | * |
2140 | * Returns 0 on success. | 2154 | * Returns 0 on success. |
2141 | */ | 2155 | */ |
2142 | int proc_dostring(struct ctl_table *table, int write, struct file *filp, | 2156 | int proc_dostring(struct ctl_table *table, int write, struct file *filp, |
2143 | void __user *buffer, size_t *lenp, loff_t *ppos) | 2157 | void __user *buffer, size_t *lenp, loff_t *ppos) |
2144 | { | 2158 | { |
2145 | return _proc_do_string(table->data, table->maxlen, write, filp, | 2159 | return _proc_do_string(table->data, table->maxlen, write, filp, |
2146 | buffer, lenp, ppos); | 2160 | buffer, lenp, ppos); |
2147 | } | 2161 | } |
2148 | 2162 | ||
2149 | 2163 | ||
/*
 * Default conversion between the int stored in a sysctl table and the
 * (sign flag, unsigned magnitude) pair used by __do_proc_dointvec().
 *
 * @negp:  in/out sign flag (non-zero means negative)
 * @lvalp: in/out magnitude
 * @valp:  pointer to the table's int value
 * @write: non-zero to store into *valp, zero to read from it
 * @data:  unused by the default conversion
 *
 * Returns 0 (this conversion cannot fail).
 */
static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp,
				 int *valp,
				 int write, void *data)
{
	if (write) {
		*valp = *negp ? -*lvalp : *lvalp;
	} else {
		int val = *valp;
		if (val < 0) {
			*negp = -1;
			/*
			 * Negate in unsigned arithmetic: "-val" would be
			 * signed overflow (undefined behaviour) when
			 * val == INT_MIN.
			 */
			*lvalp = -(unsigned long)val;
		} else {
			*negp = 0;
			*lvalp = (unsigned long)val;
		}
	}
	return 0;
}
2168 | 2182 | ||
2169 | static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, | 2183 | static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, |
2170 | int write, struct file *filp, void __user *buffer, | 2184 | int write, struct file *filp, void __user *buffer, |
2171 | size_t *lenp, loff_t *ppos, | 2185 | size_t *lenp, loff_t *ppos, |
2172 | int (*conv)(int *negp, unsigned long *lvalp, int *valp, | 2186 | int (*conv)(int *negp, unsigned long *lvalp, int *valp, |
2173 | int write, void *data), | 2187 | int write, void *data), |
2174 | void *data) | 2188 | void *data) |
2175 | { | 2189 | { |
2176 | #define TMPBUFLEN 21 | 2190 | #define TMPBUFLEN 21 |
2177 | int *i, vleft, first=1, neg, val; | 2191 | int *i, vleft, first=1, neg, val; |
2178 | unsigned long lval; | 2192 | unsigned long lval; |
2179 | size_t left, len; | 2193 | size_t left, len; |
2180 | 2194 | ||
2181 | char buf[TMPBUFLEN], *p; | 2195 | char buf[TMPBUFLEN], *p; |
2182 | char __user *s = buffer; | 2196 | char __user *s = buffer; |
2183 | 2197 | ||
2184 | if (!tbl_data || !table->maxlen || !*lenp || | 2198 | if (!tbl_data || !table->maxlen || !*lenp || |
2185 | (*ppos && !write)) { | 2199 | (*ppos && !write)) { |
2186 | *lenp = 0; | 2200 | *lenp = 0; |
2187 | return 0; | 2201 | return 0; |
2188 | } | 2202 | } |
2189 | 2203 | ||
2190 | i = (int *) tbl_data; | 2204 | i = (int *) tbl_data; |
2191 | vleft = table->maxlen / sizeof(*i); | 2205 | vleft = table->maxlen / sizeof(*i); |
2192 | left = *lenp; | 2206 | left = *lenp; |
2193 | 2207 | ||
2194 | if (!conv) | 2208 | if (!conv) |
2195 | conv = do_proc_dointvec_conv; | 2209 | conv = do_proc_dointvec_conv; |
2196 | 2210 | ||
2197 | for (; left && vleft--; i++, first=0) { | 2211 | for (; left && vleft--; i++, first=0) { |
2198 | if (write) { | 2212 | if (write) { |
2199 | while (left) { | 2213 | while (left) { |
2200 | char c; | 2214 | char c; |
2201 | if (get_user(c, s)) | 2215 | if (get_user(c, s)) |
2202 | return -EFAULT; | 2216 | return -EFAULT; |
2203 | if (!isspace(c)) | 2217 | if (!isspace(c)) |
2204 | break; | 2218 | break; |
2205 | left--; | 2219 | left--; |
2206 | s++; | 2220 | s++; |
2207 | } | 2221 | } |
2208 | if (!left) | 2222 | if (!left) |
2209 | break; | 2223 | break; |
2210 | neg = 0; | 2224 | neg = 0; |
2211 | len = left; | 2225 | len = left; |
2212 | if (len > sizeof(buf) - 1) | 2226 | if (len > sizeof(buf) - 1) |
2213 | len = sizeof(buf) - 1; | 2227 | len = sizeof(buf) - 1; |
2214 | if (copy_from_user(buf, s, len)) | 2228 | if (copy_from_user(buf, s, len)) |
2215 | return -EFAULT; | 2229 | return -EFAULT; |
2216 | buf[len] = 0; | 2230 | buf[len] = 0; |
2217 | p = buf; | 2231 | p = buf; |
2218 | if (*p == '-' && left > 1) { | 2232 | if (*p == '-' && left > 1) { |
2219 | neg = 1; | 2233 | neg = 1; |
2220 | p++; | 2234 | p++; |
2221 | } | 2235 | } |
2222 | if (*p < '0' || *p > '9') | 2236 | if (*p < '0' || *p > '9') |
2223 | break; | 2237 | break; |
2224 | 2238 | ||
2225 | lval = simple_strtoul(p, &p, 0); | 2239 | lval = simple_strtoul(p, &p, 0); |
2226 | 2240 | ||
2227 | len = p-buf; | 2241 | len = p-buf; |
2228 | if ((len < left) && *p && !isspace(*p)) | 2242 | if ((len < left) && *p && !isspace(*p)) |
2229 | break; | 2243 | break; |
2230 | if (neg) | 2244 | if (neg) |
2231 | val = -val; | 2245 | val = -val; |
2232 | s += len; | 2246 | s += len; |
2233 | left -= len; | 2247 | left -= len; |
2234 | 2248 | ||
2235 | if (conv(&neg, &lval, i, 1, data)) | 2249 | if (conv(&neg, &lval, i, 1, data)) |
2236 | break; | 2250 | break; |
2237 | } else { | 2251 | } else { |
2238 | p = buf; | 2252 | p = buf; |
2239 | if (!first) | 2253 | if (!first) |
2240 | *p++ = '\t'; | 2254 | *p++ = '\t'; |
2241 | 2255 | ||
2242 | if (conv(&neg, &lval, i, 0, data)) | 2256 | if (conv(&neg, &lval, i, 0, data)) |
2243 | break; | 2257 | break; |
2244 | 2258 | ||
2245 | sprintf(p, "%s%lu", neg ? "-" : "", lval); | 2259 | sprintf(p, "%s%lu", neg ? "-" : "", lval); |
2246 | len = strlen(buf); | 2260 | len = strlen(buf); |
2247 | if (len > left) | 2261 | if (len > left) |
2248 | len = left; | 2262 | len = left; |
2249 | if(copy_to_user(s, buf, len)) | 2263 | if(copy_to_user(s, buf, len)) |
2250 | return -EFAULT; | 2264 | return -EFAULT; |
2251 | left -= len; | 2265 | left -= len; |
2252 | s += len; | 2266 | s += len; |
2253 | } | 2267 | } |
2254 | } | 2268 | } |
2255 | 2269 | ||
2256 | if (!write && !first && left) { | 2270 | if (!write && !first && left) { |
2257 | if(put_user('\n', s)) | 2271 | if(put_user('\n', s)) |
2258 | return -EFAULT; | 2272 | return -EFAULT; |
2259 | left--, s++; | 2273 | left--, s++; |
2260 | } | 2274 | } |
2261 | if (write) { | 2275 | if (write) { |
2262 | while (left) { | 2276 | while (left) { |
2263 | char c; | 2277 | char c; |
2264 | if (get_user(c, s++)) | 2278 | if (get_user(c, s++)) |
2265 | return -EFAULT; | 2279 | return -EFAULT; |
2266 | if (!isspace(c)) | 2280 | if (!isspace(c)) |
2267 | break; | 2281 | break; |
2268 | left--; | 2282 | left--; |
2269 | } | 2283 | } |
2270 | } | 2284 | } |
2271 | if (write && first) | 2285 | if (write && first) |
2272 | return -EINVAL; | 2286 | return -EINVAL; |
2273 | *lenp -= left; | 2287 | *lenp -= left; |
2274 | *ppos += *lenp; | 2288 | *ppos += *lenp; |
2275 | return 0; | 2289 | return 0; |
2276 | #undef TMPBUFLEN | 2290 | #undef TMPBUFLEN |
2277 | } | 2291 | } |
2278 | 2292 | ||
2279 | static int do_proc_dointvec(struct ctl_table *table, int write, struct file *filp, | 2293 | static int do_proc_dointvec(struct ctl_table *table, int write, struct file *filp, |
2280 | void __user *buffer, size_t *lenp, loff_t *ppos, | 2294 | void __user *buffer, size_t *lenp, loff_t *ppos, |
2281 | int (*conv)(int *negp, unsigned long *lvalp, int *valp, | 2295 | int (*conv)(int *negp, unsigned long *lvalp, int *valp, |
2282 | int write, void *data), | 2296 | int write, void *data), |
2283 | void *data) | 2297 | void *data) |
2284 | { | 2298 | { |
2285 | return __do_proc_dointvec(table->data, table, write, filp, | 2299 | return __do_proc_dointvec(table->data, table, write, filp, |
2286 | buffer, lenp, ppos, conv, data); | 2300 | buffer, lenp, ppos, conv, data); |
2287 | } | 2301 | } |
2288 | 2302 | ||
2289 | /** | 2303 | /** |
2290 | * proc_dointvec - read a vector of integers | 2304 | * proc_dointvec - read a vector of integers |
2291 | * @table: the sysctl table | 2305 | * @table: the sysctl table |
2292 | * @write: %TRUE if this is a write to the sysctl file | 2306 | * @write: %TRUE if this is a write to the sysctl file |
2293 | * @filp: the file structure | 2307 | * @filp: the file structure |
2294 | * @buffer: the user buffer | 2308 | * @buffer: the user buffer |
2295 | * @lenp: the size of the user buffer | 2309 | * @lenp: the size of the user buffer |
2296 | * @ppos: file position | 2310 | * @ppos: file position |
2297 | * | 2311 | * |
2298 | * Reads/writes up to table->maxlen/sizeof(unsigned int) integer | 2312 | * Reads/writes up to table->maxlen/sizeof(unsigned int) integer |
2299 | * values from/to the user buffer, treated as an ASCII string. | 2313 | * values from/to the user buffer, treated as an ASCII string. |
2300 | * | 2314 | * |
2301 | * Returns 0 on success. | 2315 | * Returns 0 on success. |
2302 | */ | 2316 | */ |
2303 | int proc_dointvec(struct ctl_table *table, int write, struct file *filp, | 2317 | int proc_dointvec(struct ctl_table *table, int write, struct file *filp, |
2304 | void __user *buffer, size_t *lenp, loff_t *ppos) | 2318 | void __user *buffer, size_t *lenp, loff_t *ppos) |
2305 | { | 2319 | { |
2306 | return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, | 2320 | return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, |
2307 | NULL,NULL); | 2321 | NULL,NULL); |
2308 | } | 2322 | } |
2309 | 2323 | ||
2310 | /* | 2324 | /* |
2311 | * Taint values can only be increased | 2325 | * Taint values can only be increased |
2312 | * This means we can safely use a temporary. | 2326 | * This means we can safely use a temporary. |
2313 | */ | 2327 | */ |
2314 | static int proc_taint(struct ctl_table *table, int write, struct file *filp, | 2328 | static int proc_taint(struct ctl_table *table, int write, struct file *filp, |
2315 | void __user *buffer, size_t *lenp, loff_t *ppos) | 2329 | void __user *buffer, size_t *lenp, loff_t *ppos) |
2316 | { | 2330 | { |
2317 | struct ctl_table t; | 2331 | struct ctl_table t; |
2318 | unsigned long tmptaint = get_taint(); | 2332 | unsigned long tmptaint = get_taint(); |
2319 | int err; | 2333 | int err; |
2320 | 2334 | ||
2321 | if (write && !capable(CAP_SYS_ADMIN)) | 2335 | if (write && !capable(CAP_SYS_ADMIN)) |
2322 | return -EPERM; | 2336 | return -EPERM; |
2323 | 2337 | ||
2324 | t = *table; | 2338 | t = *table; |
2325 | t.data = &tmptaint; | 2339 | t.data = &tmptaint; |
2326 | err = proc_doulongvec_minmax(&t, write, filp, buffer, lenp, ppos); | 2340 | err = proc_doulongvec_minmax(&t, write, filp, buffer, lenp, ppos); |
2327 | if (err < 0) | 2341 | if (err < 0) |
2328 | return err; | 2342 | return err; |
2329 | 2343 | ||
2330 | if (write) { | 2344 | if (write) { |
2331 | /* | 2345 | /* |
2332 | * Poor man's atomic or. Not worth adding a primitive | 2346 | * Poor man's atomic or. Not worth adding a primitive |
2333 | * to everyone's atomic.h for this | 2347 | * to everyone's atomic.h for this |
2334 | */ | 2348 | */ |
2335 | int i; | 2349 | int i; |
2336 | for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) { | 2350 | for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) { |
2337 | if ((tmptaint >> i) & 1) | 2351 | if ((tmptaint >> i) & 1) |
2338 | add_taint(i); | 2352 | add_taint(i); |
2339 | } | 2353 | } |
2340 | } | 2354 | } |
2341 | 2355 | ||
2342 | return err; | 2356 | return err; |
2343 | } | 2357 | } |
2344 | 2358 | ||
/*
 * Bounds for do_proc_dointvec_minmax_conv(); either pointer may be NULL,
 * in which case that side of the range is not checked.
 */
struct do_proc_dointvec_minmax_conv_param {
	int *min;
	int *max;
};

/*
 * Conversion callback for proc_dointvec_minmax().
 *
 * On write, reject any value outside [*min, *max] with -EINVAL and leave
 * *valp untouched.  On read, split *valp into sign flag (*negp) and
 * magnitude (*lvalp).  Returns 0 on success.
 */
static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp,
					int *valp,
					int write, void *data)
{
	struct do_proc_dointvec_minmax_conv_param *param = data;
	if (write) {
		int val = *negp ? -*lvalp : *lvalp;
		if ((param->min && *param->min > val) ||
		    (param->max && *param->max < val))
			return -EINVAL;
		*valp = val;
	} else {
		int val = *valp;
		if (val < 0) {
			*negp = -1;
			/*
			 * Negate in unsigned arithmetic: -val would be
			 * signed-overflow UB when val == INT_MIN.
			 */
			*lvalp = -(unsigned long)val;
		} else {
			*negp = 0;
			*lvalp = (unsigned long)val;
		}
	}
	return 0;
}
2373 | 2387 | ||
2374 | /** | 2388 | /** |
2375 | * proc_dointvec_minmax - read a vector of integers with min/max values | 2389 | * proc_dointvec_minmax - read a vector of integers with min/max values |
2376 | * @table: the sysctl table | 2390 | * @table: the sysctl table |
2377 | * @write: %TRUE if this is a write to the sysctl file | 2391 | * @write: %TRUE if this is a write to the sysctl file |
2378 | * @filp: the file structure | 2392 | * @filp: the file structure |
2379 | * @buffer: the user buffer | 2393 | * @buffer: the user buffer |
2380 | * @lenp: the size of the user buffer | 2394 | * @lenp: the size of the user buffer |
2381 | * @ppos: file position | 2395 | * @ppos: file position |
2382 | * | 2396 | * |
2383 | * Reads/writes up to table->maxlen/sizeof(unsigned int) integer | 2397 | * Reads/writes up to table->maxlen/sizeof(unsigned int) integer |
2384 | * values from/to the user buffer, treated as an ASCII string. | 2398 | * values from/to the user buffer, treated as an ASCII string. |
2385 | * | 2399 | * |
2386 | * This routine will ensure the values are within the range specified by | 2400 | * This routine will ensure the values are within the range specified by |
2387 | * table->extra1 (min) and table->extra2 (max). | 2401 | * table->extra1 (min) and table->extra2 (max). |
2388 | * | 2402 | * |
2389 | * Returns 0 on success. | 2403 | * Returns 0 on success. |
2390 | */ | 2404 | */ |
2391 | int proc_dointvec_minmax(struct ctl_table *table, int write, struct file *filp, | 2405 | int proc_dointvec_minmax(struct ctl_table *table, int write, struct file *filp, |
2392 | void __user *buffer, size_t *lenp, loff_t *ppos) | 2406 | void __user *buffer, size_t *lenp, loff_t *ppos) |
2393 | { | 2407 | { |
2394 | struct do_proc_dointvec_minmax_conv_param param = { | 2408 | struct do_proc_dointvec_minmax_conv_param param = { |
2395 | .min = (int *) table->extra1, | 2409 | .min = (int *) table->extra1, |
2396 | .max = (int *) table->extra2, | 2410 | .max = (int *) table->extra2, |
2397 | }; | 2411 | }; |
2398 | return do_proc_dointvec(table, write, filp, buffer, lenp, ppos, | 2412 | return do_proc_dointvec(table, write, filp, buffer, lenp, ppos, |
2399 | do_proc_dointvec_minmax_conv, ¶m); | 2413 | do_proc_dointvec_minmax_conv, ¶m); |
2400 | } | 2414 | } |
2401 | 2415 | ||
/*
 * Common worker for the unsigned-long-vector sysctl handlers.
 *
 * Reads/writes up to table->maxlen/sizeof(unsigned long) values between
 * @data and the user buffer, as an ASCII string of whitespace-separated
 * numbers.  On write each parsed value is scaled by convmul/convdiv before
 * being stored; on read the stored value is scaled by convdiv/convmul
 * before printing.  @min/@max bounds come from table->extra1/extra2.
 *
 * Note the write-side policy visible below: negative entries and entries
 * outside [*min, *max] are silently discarded (continue), not rejected.
 * Returns 0 on success, -EFAULT on user-copy failure, or -EINVAL if a
 * write contained no parsable value at all.
 */
static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write,
				     struct file *filp,
				     void __user *buffer,
				     size_t *lenp, loff_t *ppos,
				     unsigned long convmul,
				     unsigned long convdiv)
{
#define TMPBUFLEN 21
	unsigned long *i, *min, *max, val;
	int vleft, first=1, neg;
	size_t len, left;
	char buf[TMPBUFLEN], *p;
	char __user *s = buffer;

	/* Nothing to do without backing data, or when re-reading past EOF. */
	if (!data || !table->maxlen || !*lenp ||
	    (*ppos && !write)) {
		*lenp = 0;
		return 0;
	}

	i = (unsigned long *) data;
	min = (unsigned long *) table->extra1;
	max = (unsigned long *) table->extra2;
	vleft = table->maxlen / sizeof(unsigned long);	/* element count */
	left = *lenp;					/* bytes remaining */

	for (; left && vleft--; i++, min++, max++, first=0) {
		if (write) {
			/* Skip leading whitespace before the next number. */
			while (left) {
				char c;
				if (get_user(c, s))
					return -EFAULT;
				if (!isspace(c))
					break;
				left--;
				s++;
			}
			if (!left)
				break;
			neg = 0;
			/* Copy at most TMPBUFLEN-1 bytes into a local,
			 * NUL-terminated parse buffer. */
			len = left;
			if (len > TMPBUFLEN-1)
				len = TMPBUFLEN-1;
			if (copy_from_user(buf, s, len))
				return -EFAULT;
			buf[len] = 0;
			p = buf;
			if (*p == '-' && left > 1) {
				neg = 1;
				p++;
			}
			/* Stop at the first token that is not a number. */
			if (*p < '0' || *p > '9')
				break;
			val = simple_strtoul(p, &p, 0) * convmul / convdiv;
			len = p-buf;
			/* A number followed by garbage (not whitespace or
			 * end of input) terminates parsing. */
			if ((len < left) && *p && !isspace(*p))
				break;
			if (neg)
				val = -val;
			s += len;
			left -= len;

			/* Negative and out-of-range values are silently
			 * dropped; the stored value is left unchanged. */
			if (neg)
				continue;
			if ((min && val < *min) || (max && val > *max))
				continue;
			*i = val;
		} else {
			/* Read: print the scaled value, tab-separated. */
			p = buf;
			if (!first)
				*p++ = '\t';
			sprintf(p, "%lu", convdiv * (*i) / convmul);
			len = strlen(buf);
			if (len > left)
				len = left;	/* truncate to user buffer */
			if (copy_to_user(s, buf, len))
				return -EFAULT;
			left -= len;
			s += len;
		}
	}

	/* Terminate read output with a newline if there is room. */
	if (!write && !first && left) {
		if (put_user('\n', s))
			return -EFAULT;
		left--, s++;
	}
	if (write) {
		/* Consume trailing whitespace so *lenp reflects it. */
		while (left) {
			char c;
			if (get_user(c, s++))
				return -EFAULT;
			if (!isspace(c))
				break;
			left--;
		}
	}
	/* A write that produced no value at all is an error. */
	if (write && first)
		return -EINVAL;
	*lenp -= left;
	*ppos += *lenp;
	return 0;
#undef TMPBUFLEN
}
2506 | 2520 | ||
2507 | static int do_proc_doulongvec_minmax(struct ctl_table *table, int write, | 2521 | static int do_proc_doulongvec_minmax(struct ctl_table *table, int write, |
2508 | struct file *filp, | 2522 | struct file *filp, |
2509 | void __user *buffer, | 2523 | void __user *buffer, |
2510 | size_t *lenp, loff_t *ppos, | 2524 | size_t *lenp, loff_t *ppos, |
2511 | unsigned long convmul, | 2525 | unsigned long convmul, |
2512 | unsigned long convdiv) | 2526 | unsigned long convdiv) |
2513 | { | 2527 | { |
2514 | return __do_proc_doulongvec_minmax(table->data, table, write, | 2528 | return __do_proc_doulongvec_minmax(table->data, table, write, |
2515 | filp, buffer, lenp, ppos, convmul, convdiv); | 2529 | filp, buffer, lenp, ppos, convmul, convdiv); |
2516 | } | 2530 | } |
2517 | 2531 | ||
2518 | /** | 2532 | /** |
2519 | * proc_doulongvec_minmax - read a vector of long integers with min/max values | 2533 | * proc_doulongvec_minmax - read a vector of long integers with min/max values |
2520 | * @table: the sysctl table | 2534 | * @table: the sysctl table |
2521 | * @write: %TRUE if this is a write to the sysctl file | 2535 | * @write: %TRUE if this is a write to the sysctl file |
2522 | * @filp: the file structure | 2536 | * @filp: the file structure |
2523 | * @buffer: the user buffer | 2537 | * @buffer: the user buffer |
2524 | * @lenp: the size of the user buffer | 2538 | * @lenp: the size of the user buffer |
2525 | * @ppos: file position | 2539 | * @ppos: file position |
2526 | * | 2540 | * |
2527 | * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long | 2541 | * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long |
2528 | * values from/to the user buffer, treated as an ASCII string. | 2542 | * values from/to the user buffer, treated as an ASCII string. |
2529 | * | 2543 | * |
2530 | * This routine will ensure the values are within the range specified by | 2544 | * This routine will ensure the values are within the range specified by |
2531 | * table->extra1 (min) and table->extra2 (max). | 2545 | * table->extra1 (min) and table->extra2 (max). |
2532 | * | 2546 | * |
2533 | * Returns 0 on success. | 2547 | * Returns 0 on success. |
2534 | */ | 2548 | */ |
2535 | int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp, | 2549 | int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp, |
2536 | void __user *buffer, size_t *lenp, loff_t *ppos) | 2550 | void __user *buffer, size_t *lenp, loff_t *ppos) |
2537 | { | 2551 | { |
2538 | return do_proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos, 1l, 1l); | 2552 | return do_proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos, 1l, 1l); |
2539 | } | 2553 | } |
2540 | 2554 | ||
2541 | /** | 2555 | /** |
2542 | * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values | 2556 | * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values |
2543 | * @table: the sysctl table | 2557 | * @table: the sysctl table |
2544 | * @write: %TRUE if this is a write to the sysctl file | 2558 | * @write: %TRUE if this is a write to the sysctl file |
2545 | * @filp: the file structure | 2559 | * @filp: the file structure |
2546 | * @buffer: the user buffer | 2560 | * @buffer: the user buffer |
2547 | * @lenp: the size of the user buffer | 2561 | * @lenp: the size of the user buffer |
2548 | * @ppos: file position | 2562 | * @ppos: file position |
2549 | * | 2563 | * |
2550 | * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long | 2564 | * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long |
2551 | * values from/to the user buffer, treated as an ASCII string. The values | 2565 | * values from/to the user buffer, treated as an ASCII string. The values |
2552 | * are treated as milliseconds, and converted to jiffies when they are stored. | 2566 | * are treated as milliseconds, and converted to jiffies when they are stored. |
2553 | * | 2567 | * |
2554 | * This routine will ensure the values are within the range specified by | 2568 | * This routine will ensure the values are within the range specified by |
2555 | * table->extra1 (min) and table->extra2 (max). | 2569 | * table->extra1 (min) and table->extra2 (max). |
2556 | * | 2570 | * |
2557 | * Returns 0 on success. | 2571 | * Returns 0 on success. |
2558 | */ | 2572 | */ |
2559 | int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, | 2573 | int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, |
2560 | struct file *filp, | 2574 | struct file *filp, |
2561 | void __user *buffer, | 2575 | void __user *buffer, |
2562 | size_t *lenp, loff_t *ppos) | 2576 | size_t *lenp, loff_t *ppos) |
2563 | { | 2577 | { |
2564 | return do_proc_doulongvec_minmax(table, write, filp, buffer, | 2578 | return do_proc_doulongvec_minmax(table, write, filp, buffer, |
2565 | lenp, ppos, HZ, 1000l); | 2579 | lenp, ppos, HZ, 1000l); |
2566 | } | 2580 | } |
2567 | 2581 | ||
2568 | 2582 | ||
2569 | static int do_proc_dointvec_jiffies_conv(int *negp, unsigned long *lvalp, | 2583 | static int do_proc_dointvec_jiffies_conv(int *negp, unsigned long *lvalp, |
2570 | int *valp, | 2584 | int *valp, |
2571 | int write, void *data) | 2585 | int write, void *data) |
2572 | { | 2586 | { |
2573 | if (write) { | 2587 | if (write) { |
2574 | if (*lvalp > LONG_MAX / HZ) | 2588 | if (*lvalp > LONG_MAX / HZ) |
2575 | return 1; | 2589 | return 1; |
2576 | *valp = *negp ? -(*lvalp*HZ) : (*lvalp*HZ); | 2590 | *valp = *negp ? -(*lvalp*HZ) : (*lvalp*HZ); |
2577 | } else { | 2591 | } else { |
2578 | int val = *valp; | 2592 | int val = *valp; |
2579 | unsigned long lval; | 2593 | unsigned long lval; |
2580 | if (val < 0) { | 2594 | if (val < 0) { |
2581 | *negp = -1; | 2595 | *negp = -1; |
2582 | lval = (unsigned long)-val; | 2596 | lval = (unsigned long)-val; |
2583 | } else { | 2597 | } else { |
2584 | *negp = 0; | 2598 | *negp = 0; |
2585 | lval = (unsigned long)val; | 2599 | lval = (unsigned long)val; |
2586 | } | 2600 | } |
2587 | *lvalp = lval / HZ; | 2601 | *lvalp = lval / HZ; |
2588 | } | 2602 | } |
2589 | return 0; | 2603 | return 0; |
2590 | } | 2604 | } |
2591 | 2605 | ||
2592 | static int do_proc_dointvec_userhz_jiffies_conv(int *negp, unsigned long *lvalp, | 2606 | static int do_proc_dointvec_userhz_jiffies_conv(int *negp, unsigned long *lvalp, |
2593 | int *valp, | 2607 | int *valp, |
2594 | int write, void *data) | 2608 | int write, void *data) |
2595 | { | 2609 | { |
2596 | if (write) { | 2610 | if (write) { |
2597 | if (USER_HZ < HZ && *lvalp > (LONG_MAX / HZ) * USER_HZ) | 2611 | if (USER_HZ < HZ && *lvalp > (LONG_MAX / HZ) * USER_HZ) |
2598 | return 1; | 2612 | return 1; |
2599 | *valp = clock_t_to_jiffies(*negp ? -*lvalp : *lvalp); | 2613 | *valp = clock_t_to_jiffies(*negp ? -*lvalp : *lvalp); |
2600 | } else { | 2614 | } else { |
2601 | int val = *valp; | 2615 | int val = *valp; |
2602 | unsigned long lval; | 2616 | unsigned long lval; |
2603 | if (val < 0) { | 2617 | if (val < 0) { |
2604 | *negp = -1; | 2618 | *negp = -1; |
2605 | lval = (unsigned long)-val; | 2619 | lval = (unsigned long)-val; |
2606 | } else { | 2620 | } else { |
2607 | *negp = 0; | 2621 | *negp = 0; |
2608 | lval = (unsigned long)val; | 2622 | lval = (unsigned long)val; |
2609 | } | 2623 | } |
2610 | *lvalp = jiffies_to_clock_t(lval); | 2624 | *lvalp = jiffies_to_clock_t(lval); |
2611 | } | 2625 | } |
2612 | return 0; | 2626 | return 0; |
2613 | } | 2627 | } |
2614 | 2628 | ||
/*
 * Conversion callback for proc_dointvec_ms_jiffies().
 *
 * On write, convert a user-supplied value in milliseconds to jiffies.
 * On read, convert the stored jiffies value back to milliseconds,
 * splitting it into sign (*negp) and magnitude (*lvalp).
 * Returns 0 on success.
 */
static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp,
					    int *valp,
					    int write, void *data)
{
	if (write) {
		*valp = msecs_to_jiffies(*negp ? -*lvalp : *lvalp);
	} else {
		int val = *valp;
		unsigned long lval;
		if (val < 0) {
			*negp = -1;
			/*
			 * Negate in unsigned arithmetic: -val would be
			 * signed-overflow UB when val == INT_MIN.
			 */
			lval = -(unsigned long)val;
		} else {
			*negp = 0;
			lval = (unsigned long)val;
		}
		*lvalp = jiffies_to_msecs(lval);
	}
	return 0;
}
2635 | 2649 | ||
2636 | /** | 2650 | /** |
2637 | * proc_dointvec_jiffies - read a vector of integers as seconds | 2651 | * proc_dointvec_jiffies - read a vector of integers as seconds |
2638 | * @table: the sysctl table | 2652 | * @table: the sysctl table |
2639 | * @write: %TRUE if this is a write to the sysctl file | 2653 | * @write: %TRUE if this is a write to the sysctl file |
2640 | * @filp: the file structure | 2654 | * @filp: the file structure |
2641 | * @buffer: the user buffer | 2655 | * @buffer: the user buffer |
2642 | * @lenp: the size of the user buffer | 2656 | * @lenp: the size of the user buffer |
2643 | * @ppos: file position | 2657 | * @ppos: file position |
2644 | * | 2658 | * |
2645 | * Reads/writes up to table->maxlen/sizeof(unsigned int) integer | 2659 | * Reads/writes up to table->maxlen/sizeof(unsigned int) integer |
2646 | * values from/to the user buffer, treated as an ASCII string. | 2660 | * values from/to the user buffer, treated as an ASCII string. |
2647 | * The values read are assumed to be in seconds, and are converted into | 2661 | * The values read are assumed to be in seconds, and are converted into |
2648 | * jiffies. | 2662 | * jiffies. |
2649 | * | 2663 | * |
2650 | * Returns 0 on success. | 2664 | * Returns 0 on success. |
2651 | */ | 2665 | */ |
2652 | int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp, | 2666 | int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp, |
2653 | void __user *buffer, size_t *lenp, loff_t *ppos) | 2667 | void __user *buffer, size_t *lenp, loff_t *ppos) |
2654 | { | 2668 | { |
2655 | return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, | 2669 | return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, |
2656 | do_proc_dointvec_jiffies_conv,NULL); | 2670 | do_proc_dointvec_jiffies_conv,NULL); |
2657 | } | 2671 | } |
2658 | 2672 | ||
2659 | /** | 2673 | /** |
2660 | * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds | 2674 | * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds |
2661 | * @table: the sysctl table | 2675 | * @table: the sysctl table |
2662 | * @write: %TRUE if this is a write to the sysctl file | 2676 | * @write: %TRUE if this is a write to the sysctl file |
2663 | * @filp: the file structure | 2677 | * @filp: the file structure |
2664 | * @buffer: the user buffer | 2678 | * @buffer: the user buffer |
2665 | * @lenp: the size of the user buffer | 2679 | * @lenp: the size of the user buffer |
2666 | * @ppos: pointer to the file position | 2680 | * @ppos: pointer to the file position |
2667 | * | 2681 | * |
2668 | * Reads/writes up to table->maxlen/sizeof(unsigned int) integer | 2682 | * Reads/writes up to table->maxlen/sizeof(unsigned int) integer |
2669 | * values from/to the user buffer, treated as an ASCII string. | 2683 | * values from/to the user buffer, treated as an ASCII string. |
2670 | * The values read are assumed to be in 1/USER_HZ seconds, and | 2684 | * The values read are assumed to be in 1/USER_HZ seconds, and |
2671 | * are converted into jiffies. | 2685 | * are converted into jiffies. |
2672 | * | 2686 | * |
2673 | * Returns 0 on success. | 2687 | * Returns 0 on success. |
2674 | */ | 2688 | */ |
2675 | int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file *filp, | 2689 | int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file *filp, |
2676 | void __user *buffer, size_t *lenp, loff_t *ppos) | 2690 | void __user *buffer, size_t *lenp, loff_t *ppos) |
2677 | { | 2691 | { |
2678 | return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, | 2692 | return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, |
2679 | do_proc_dointvec_userhz_jiffies_conv,NULL); | 2693 | do_proc_dointvec_userhz_jiffies_conv,NULL); |
2680 | } | 2694 | } |
2681 | 2695 | ||
2682 | /** | 2696 | /** |
2683 | * proc_dointvec_ms_jiffies - read a vector of integers as 1 milliseconds | 2697 | * proc_dointvec_ms_jiffies - read a vector of integers as 1 milliseconds |
2684 | * @table: the sysctl table | 2698 | * @table: the sysctl table |
2685 | * @write: %TRUE if this is a write to the sysctl file | 2699 | * @write: %TRUE if this is a write to the sysctl file |
2686 | * @filp: the file structure | 2700 | * @filp: the file structure |
2687 | * @buffer: the user buffer | 2701 | * @buffer: the user buffer |
2688 | * @lenp: the size of the user buffer | 2702 | * @lenp: the size of the user buffer |
2689 | * @ppos: file position | 2703 | * @ppos: file position |
2690 | * @ppos: the current position in the file | 2704 | * @ppos: the current position in the file |
2691 | * | 2705 | * |
2692 | * Reads/writes up to table->maxlen/sizeof(unsigned int) integer | 2706 | * Reads/writes up to table->maxlen/sizeof(unsigned int) integer |
2693 | * values from/to the user buffer, treated as an ASCII string. | 2707 | * values from/to the user buffer, treated as an ASCII string. |
2694 | * The values read are assumed to be in 1/1000 seconds, and | 2708 | * The values read are assumed to be in 1/1000 seconds, and |
2695 | * are converted into jiffies. | 2709 | * are converted into jiffies. |
2696 | * | 2710 | * |
2697 | * Returns 0 on success. | 2711 | * Returns 0 on success. |
2698 | */ | 2712 | */ |
2699 | int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, struct file *filp, | 2713 | int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, struct file *filp, |
2700 | void __user *buffer, size_t *lenp, loff_t *ppos) | 2714 | void __user *buffer, size_t *lenp, loff_t *ppos) |
2701 | { | 2715 | { |
2702 | return do_proc_dointvec(table, write, filp, buffer, lenp, ppos, | 2716 | return do_proc_dointvec(table, write, filp, buffer, lenp, ppos, |
2703 | do_proc_dointvec_ms_jiffies_conv, NULL); | 2717 | do_proc_dointvec_ms_jiffies_conv, NULL); |
2704 | } | 2718 | } |
2705 | 2719 | ||
2706 | static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp, | 2720 | static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp, |
2707 | void __user *buffer, size_t *lenp, loff_t *ppos) | 2721 | void __user *buffer, size_t *lenp, loff_t *ppos) |
2708 | { | 2722 | { |
2709 | struct pid *new_pid; | 2723 | struct pid *new_pid; |
2710 | pid_t tmp; | 2724 | pid_t tmp; |
2711 | int r; | 2725 | int r; |
2712 | 2726 | ||
2713 | tmp = pid_vnr(cad_pid); | 2727 | tmp = pid_vnr(cad_pid); |
2714 | 2728 | ||
2715 | r = __do_proc_dointvec(&tmp, table, write, filp, buffer, | 2729 | r = __do_proc_dointvec(&tmp, table, write, filp, buffer, |
2716 | lenp, ppos, NULL, NULL); | 2730 | lenp, ppos, NULL, NULL); |
2717 | if (r || !write) | 2731 | if (r || !write) |
2718 | return r; | 2732 | return r; |
2719 | 2733 | ||
2720 | new_pid = find_get_pid(tmp); | 2734 | new_pid = find_get_pid(tmp); |
2721 | if (!new_pid) | 2735 | if (!new_pid) |
2722 | return -ESRCH; | 2736 | return -ESRCH; |
2723 | 2737 | ||
2724 | put_pid(xchg(&cad_pid, new_pid)); | 2738 | put_pid(xchg(&cad_pid, new_pid)); |
2725 | return 0; | 2739 | return 0; |
2726 | } | 2740 | } |
2727 | 2741 | ||
2728 | #else /* CONFIG_PROC_FS */ | 2742 | #else /* CONFIG_PROC_FS */ |
2729 | 2743 | ||
/* CONFIG_PROC_FS is disabled: no /proc interface, so fail with -ENOSYS. */
int proc_dostring(struct ctl_table *table, int write, struct file *filp,
		  void __user *buffer, size_t *lenp, loff_t *ppos)
{
	return -ENOSYS;
}
2735 | 2749 | ||
/* CONFIG_PROC_FS is disabled: no /proc interface, so fail with -ENOSYS. */
int proc_dointvec(struct ctl_table *table, int write, struct file *filp,
		  void __user *buffer, size_t *lenp, loff_t *ppos)
{
	return -ENOSYS;
}
2741 | 2755 | ||
/* CONFIG_PROC_FS is disabled: no /proc interface, so fail with -ENOSYS. */
int proc_dointvec_minmax(struct ctl_table *table, int write, struct file *filp,
		    void __user *buffer, size_t *lenp, loff_t *ppos)
{
	return -ENOSYS;
}
2747 | 2761 | ||
/* CONFIG_PROC_FS is disabled: no /proc interface, so fail with -ENOSYS. */
int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp,
		    void __user *buffer, size_t *lenp, loff_t *ppos)
{
	return -ENOSYS;
}
2753 | 2767 | ||
/* CONFIG_PROC_FS is disabled: no /proc interface, so fail with -ENOSYS. */
int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file *filp,
		    void __user *buffer, size_t *lenp, loff_t *ppos)
{
	return -ENOSYS;
}
2759 | 2773 | ||
2760 | int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, struct file *filp, | 2774 | int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, struct file *filp, |
2761 | void __user *buffer, size_t *lenp, loff_t *ppos) | 2775 | void __user *buffer, size_t *lenp, loff_t *ppos) |
2762 | { | 2776 | { |
2763 | return -ENOSYS; | 2777 | return -ENOSYS; |
2764 | } | 2778 | } |
2765 | 2779 | ||
2766 | int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp, | 2780 | int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp, |
2767 | void __user *buffer, size_t *lenp, loff_t *ppos) | 2781 | void __user *buffer, size_t *lenp, loff_t *ppos) |
2768 | { | 2782 | { |
2769 | return -ENOSYS; | 2783 | return -ENOSYS; |
2770 | } | 2784 | } |
2771 | 2785 | ||
2772 | int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, | 2786 | int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, |
2773 | struct file *filp, | 2787 | struct file *filp, |
2774 | void __user *buffer, | 2788 | void __user *buffer, |
2775 | size_t *lenp, loff_t *ppos) | 2789 | size_t *lenp, loff_t *ppos) |
2776 | { | 2790 | { |
2777 | return -ENOSYS; | 2791 | return -ENOSYS; |
2778 | } | 2792 | } |
2779 | 2793 | ||
2780 | 2794 | ||
2781 | #endif /* CONFIG_PROC_FS */ | 2795 | #endif /* CONFIG_PROC_FS */ |
2782 | 2796 | ||
2783 | 2797 | ||
2784 | #ifdef CONFIG_SYSCTL_SYSCALL | 2798 | #ifdef CONFIG_SYSCTL_SYSCALL |
2785 | /* | 2799 | /* |
2786 | * General sysctl support routines | 2800 | * General sysctl support routines |
2787 | */ | 2801 | */ |
2788 | 2802 | ||
2789 | /* The generic sysctl data routine (used if no strategy routine supplied) */ | 2803 | /* The generic sysctl data routine (used if no strategy routine supplied) */ |
2790 | int sysctl_data(struct ctl_table *table, | 2804 | int sysctl_data(struct ctl_table *table, |
2791 | void __user *oldval, size_t __user *oldlenp, | 2805 | void __user *oldval, size_t __user *oldlenp, |
2792 | void __user *newval, size_t newlen) | 2806 | void __user *newval, size_t newlen) |
2793 | { | 2807 | { |
2794 | size_t len; | 2808 | size_t len; |
2795 | 2809 | ||
2796 | /* Get out of I don't have a variable */ | 2810 | /* Get out of I don't have a variable */ |
2797 | if (!table->data || !table->maxlen) | 2811 | if (!table->data || !table->maxlen) |
2798 | return -ENOTDIR; | 2812 | return -ENOTDIR; |
2799 | 2813 | ||
2800 | if (oldval && oldlenp) { | 2814 | if (oldval && oldlenp) { |
2801 | if (get_user(len, oldlenp)) | 2815 | if (get_user(len, oldlenp)) |
2802 | return -EFAULT; | 2816 | return -EFAULT; |
2803 | if (len) { | 2817 | if (len) { |
2804 | if (len > table->maxlen) | 2818 | if (len > table->maxlen) |
2805 | len = table->maxlen; | 2819 | len = table->maxlen; |
2806 | if (copy_to_user(oldval, table->data, len)) | 2820 | if (copy_to_user(oldval, table->data, len)) |
2807 | return -EFAULT; | 2821 | return -EFAULT; |
2808 | if (put_user(len, oldlenp)) | 2822 | if (put_user(len, oldlenp)) |
2809 | return -EFAULT; | 2823 | return -EFAULT; |
2810 | } | 2824 | } |
2811 | } | 2825 | } |
2812 | 2826 | ||
2813 | if (newval && newlen) { | 2827 | if (newval && newlen) { |
2814 | if (newlen > table->maxlen) | 2828 | if (newlen > table->maxlen) |
2815 | newlen = table->maxlen; | 2829 | newlen = table->maxlen; |
2816 | 2830 | ||
2817 | if (copy_from_user(table->data, newval, newlen)) | 2831 | if (copy_from_user(table->data, newval, newlen)) |
2818 | return -EFAULT; | 2832 | return -EFAULT; |
2819 | } | 2833 | } |
2820 | return 1; | 2834 | return 1; |
2821 | } | 2835 | } |
2822 | 2836 | ||
2823 | /* The generic string strategy routine: */ | 2837 | /* The generic string strategy routine: */ |
2824 | int sysctl_string(struct ctl_table *table, | 2838 | int sysctl_string(struct ctl_table *table, |
2825 | void __user *oldval, size_t __user *oldlenp, | 2839 | void __user *oldval, size_t __user *oldlenp, |
2826 | void __user *newval, size_t newlen) | 2840 | void __user *newval, size_t newlen) |
2827 | { | 2841 | { |
2828 | if (!table->data || !table->maxlen) | 2842 | if (!table->data || !table->maxlen) |
2829 | return -ENOTDIR; | 2843 | return -ENOTDIR; |
2830 | 2844 | ||
2831 | if (oldval && oldlenp) { | 2845 | if (oldval && oldlenp) { |
2832 | size_t bufsize; | 2846 | size_t bufsize; |
2833 | if (get_user(bufsize, oldlenp)) | 2847 | if (get_user(bufsize, oldlenp)) |
2834 | return -EFAULT; | 2848 | return -EFAULT; |
2835 | if (bufsize) { | 2849 | if (bufsize) { |
2836 | size_t len = strlen(table->data), copied; | 2850 | size_t len = strlen(table->data), copied; |
2837 | 2851 | ||
2838 | /* This shouldn't trigger for a well-formed sysctl */ | 2852 | /* This shouldn't trigger for a well-formed sysctl */ |
2839 | if (len > table->maxlen) | 2853 | if (len > table->maxlen) |
2840 | len = table->maxlen; | 2854 | len = table->maxlen; |
2841 | 2855 | ||
2842 | /* Copy up to a max of bufsize-1 bytes of the string */ | 2856 | /* Copy up to a max of bufsize-1 bytes of the string */ |
2843 | copied = (len >= bufsize) ? bufsize - 1 : len; | 2857 | copied = (len >= bufsize) ? bufsize - 1 : len; |
2844 | 2858 | ||
2845 | if (copy_to_user(oldval, table->data, copied) || | 2859 | if (copy_to_user(oldval, table->data, copied) || |
2846 | put_user(0, (char __user *)(oldval + copied))) | 2860 | put_user(0, (char __user *)(oldval + copied))) |
2847 | return -EFAULT; | 2861 | return -EFAULT; |
2848 | if (put_user(len, oldlenp)) | 2862 | if (put_user(len, oldlenp)) |
2849 | return -EFAULT; | 2863 | return -EFAULT; |
2850 | } | 2864 | } |
2851 | } | 2865 | } |
2852 | if (newval && newlen) { | 2866 | if (newval && newlen) { |
2853 | size_t len = newlen; | 2867 | size_t len = newlen; |
2854 | if (len > table->maxlen) | 2868 | if (len > table->maxlen) |
2855 | len = table->maxlen; | 2869 | len = table->maxlen; |
2856 | if(copy_from_user(table->data, newval, len)) | 2870 | if(copy_from_user(table->data, newval, len)) |
2857 | return -EFAULT; | 2871 | return -EFAULT; |
2858 | if (len == table->maxlen) | 2872 | if (len == table->maxlen) |
2859 | len--; | 2873 | len--; |
2860 | ((char *) table->data)[len] = 0; | 2874 | ((char *) table->data)[len] = 0; |
2861 | } | 2875 | } |
2862 | return 1; | 2876 | return 1; |
2863 | } | 2877 | } |
2864 | 2878 | ||
2865 | /* | 2879 | /* |
2866 | * This function makes sure that all of the integers in the vector | 2880 | * This function makes sure that all of the integers in the vector |
2867 | * are between the minimum and maximum values given in the arrays | 2881 | * are between the minimum and maximum values given in the arrays |
2868 | * table->extra1 and table->extra2, respectively. | 2882 | * table->extra1 and table->extra2, respectively. |
2869 | */ | 2883 | */ |
2870 | int sysctl_intvec(struct ctl_table *table, | 2884 | int sysctl_intvec(struct ctl_table *table, |
2871 | void __user *oldval, size_t __user *oldlenp, | 2885 | void __user *oldval, size_t __user *oldlenp, |
2872 | void __user *newval, size_t newlen) | 2886 | void __user *newval, size_t newlen) |
2873 | { | 2887 | { |
2874 | 2888 | ||
2875 | if (newval && newlen) { | 2889 | if (newval && newlen) { |
2876 | int __user *vec = (int __user *) newval; | 2890 | int __user *vec = (int __user *) newval; |
2877 | int *min = (int *) table->extra1; | 2891 | int *min = (int *) table->extra1; |
2878 | int *max = (int *) table->extra2; | 2892 | int *max = (int *) table->extra2; |
2879 | size_t length; | 2893 | size_t length; |
2880 | int i; | 2894 | int i; |
2881 | 2895 | ||
2882 | if (newlen % sizeof(int) != 0) | 2896 | if (newlen % sizeof(int) != 0) |
2883 | return -EINVAL; | 2897 | return -EINVAL; |
2884 | 2898 | ||
2885 | if (!table->extra1 && !table->extra2) | 2899 | if (!table->extra1 && !table->extra2) |
2886 | return 0; | 2900 | return 0; |
2887 | 2901 | ||
2888 | if (newlen > table->maxlen) | 2902 | if (newlen > table->maxlen) |
2889 | newlen = table->maxlen; | 2903 | newlen = table->maxlen; |
2890 | length = newlen / sizeof(int); | 2904 | length = newlen / sizeof(int); |
2891 | 2905 | ||
2892 | for (i = 0; i < length; i++) { | 2906 | for (i = 0; i < length; i++) { |
2893 | int value; | 2907 | int value; |
2894 | if (get_user(value, vec + i)) | 2908 | if (get_user(value, vec + i)) |
2895 | return -EFAULT; | 2909 | return -EFAULT; |
2896 | if (min && value < min[i]) | 2910 | if (min && value < min[i]) |
2897 | return -EINVAL; | 2911 | return -EINVAL; |
2898 | if (max && value > max[i]) | 2912 | if (max && value > max[i]) |
2899 | return -EINVAL; | 2913 | return -EINVAL; |
2900 | } | 2914 | } |
2901 | } | 2915 | } |
2902 | return 0; | 2916 | return 0; |
2903 | } | 2917 | } |
2904 | 2918 | ||
2905 | /* Strategy function to convert jiffies to seconds */ | 2919 | /* Strategy function to convert jiffies to seconds */ |
2906 | int sysctl_jiffies(struct ctl_table *table, | 2920 | int sysctl_jiffies(struct ctl_table *table, |
2907 | void __user *oldval, size_t __user *oldlenp, | 2921 | void __user *oldval, size_t __user *oldlenp, |
2908 | void __user *newval, size_t newlen) | 2922 | void __user *newval, size_t newlen) |
2909 | { | 2923 | { |
2910 | if (oldval && oldlenp) { | 2924 | if (oldval && oldlenp) { |
2911 | size_t olen; | 2925 | size_t olen; |
2912 | 2926 | ||
2913 | if (get_user(olen, oldlenp)) | 2927 | if (get_user(olen, oldlenp)) |
2914 | return -EFAULT; | 2928 | return -EFAULT; |
2915 | if (olen) { | 2929 | if (olen) { |
2916 | int val; | 2930 | int val; |
2917 | 2931 | ||
2918 | if (olen < sizeof(int)) | 2932 | if (olen < sizeof(int)) |
2919 | return -EINVAL; | 2933 | return -EINVAL; |
2920 | 2934 | ||
2921 | val = *(int *)(table->data) / HZ; | 2935 | val = *(int *)(table->data) / HZ; |
2922 | if (put_user(val, (int __user *)oldval)) | 2936 | if (put_user(val, (int __user *)oldval)) |
2923 | return -EFAULT; | 2937 | return -EFAULT; |
2924 | if (put_user(sizeof(int), oldlenp)) | 2938 | if (put_user(sizeof(int), oldlenp)) |
2925 | return -EFAULT; | 2939 | return -EFAULT; |
2926 | } | 2940 | } |
2927 | } | 2941 | } |
2928 | if (newval && newlen) { | 2942 | if (newval && newlen) { |
2929 | int new; | 2943 | int new; |
2930 | if (newlen != sizeof(int)) | 2944 | if (newlen != sizeof(int)) |
2931 | return -EINVAL; | 2945 | return -EINVAL; |
2932 | if (get_user(new, (int __user *)newval)) | 2946 | if (get_user(new, (int __user *)newval)) |
2933 | return -EFAULT; | 2947 | return -EFAULT; |
2934 | *(int *)(table->data) = new*HZ; | 2948 | *(int *)(table->data) = new*HZ; |
2935 | } | 2949 | } |
2936 | return 1; | 2950 | return 1; |
2937 | } | 2951 | } |
2938 | 2952 | ||
2939 | /* Strategy function to convert jiffies to seconds */ | 2953 | /* Strategy function to convert jiffies to seconds */ |
2940 | int sysctl_ms_jiffies(struct ctl_table *table, | 2954 | int sysctl_ms_jiffies(struct ctl_table *table, |
2941 | void __user *oldval, size_t __user *oldlenp, | 2955 | void __user *oldval, size_t __user *oldlenp, |
2942 | void __user *newval, size_t newlen) | 2956 | void __user *newval, size_t newlen) |
2943 | { | 2957 | { |
2944 | if (oldval && oldlenp) { | 2958 | if (oldval && oldlenp) { |
2945 | size_t olen; | 2959 | size_t olen; |
2946 | 2960 | ||
2947 | if (get_user(olen, oldlenp)) | 2961 | if (get_user(olen, oldlenp)) |
2948 | return -EFAULT; | 2962 | return -EFAULT; |
2949 | if (olen) { | 2963 | if (olen) { |
2950 | int val; | 2964 | int val; |
2951 | 2965 | ||
2952 | if (olen < sizeof(int)) | 2966 | if (olen < sizeof(int)) |
2953 | return -EINVAL; | 2967 | return -EINVAL; |
2954 | 2968 | ||
2955 | val = jiffies_to_msecs(*(int *)(table->data)); | 2969 | val = jiffies_to_msecs(*(int *)(table->data)); |
2956 | if (put_user(val, (int __user *)oldval)) | 2970 | if (put_user(val, (int __user *)oldval)) |
2957 | return -EFAULT; | 2971 | return -EFAULT; |
2958 | if (put_user(sizeof(int), oldlenp)) | 2972 | if (put_user(sizeof(int), oldlenp)) |
2959 | return -EFAULT; | 2973 | return -EFAULT; |
2960 | } | 2974 | } |
2961 | } | 2975 | } |
2962 | if (newval && newlen) { | 2976 | if (newval && newlen) { |
2963 | int new; | 2977 | int new; |
2964 | if (newlen != sizeof(int)) | 2978 | if (newlen != sizeof(int)) |
2965 | return -EINVAL; | 2979 | return -EINVAL; |
2966 | if (get_user(new, (int __user *)newval)) | 2980 | if (get_user(new, (int __user *)newval)) |
2967 | return -EFAULT; | 2981 | return -EFAULT; |
2968 | *(int *)(table->data) = msecs_to_jiffies(new); | 2982 | *(int *)(table->data) = msecs_to_jiffies(new); |
2969 | } | 2983 | } |
2970 | return 1; | 2984 | return 1; |
2971 | } | 2985 | } |
2972 | 2986 | ||
2973 | 2987 | ||
2974 | 2988 | ||
2975 | #else /* CONFIG_SYSCTL_SYSCALL */ | 2989 | #else /* CONFIG_SYSCTL_SYSCALL */ |
2976 | 2990 | ||
2977 | 2991 | ||
/*
 * Binary sysctl(2) entry point for kernels built without
 * CONFIG_SYSCTL_SYSCALL: validate and log the request, then fail
 * with -ENOSYS.
 */
asmlinkage long sys_sysctl(struct __sysctl_args __user *args)
{
	struct __sysctl_args tmp;
	int error;

	if (copy_from_user(&tmp, args, sizeof(tmp)))
		return -EFAULT;

	/* Log use of the deprecated interface (rate-limited). */
	error = deprecated_sysctl_warning(&tmp);

	/* If no error reading the parameters then just -ENOSYS ... */
	if (!error)
		error = -ENOSYS;

	return error;
}
2994 | 3008 | ||
/*
 * CONFIG_SYSCTL_SYSCALL is disabled: the strategy routines below are
 * only reachable through the binary sysctl(2) interface, so provide
 * stubs that report the operation as unimplemented.
 */
int sysctl_data(struct ctl_table *table,
		void __user *oldval, size_t __user *oldlenp,
		void __user *newval, size_t newlen)
{
	return -ENOSYS;
}

int sysctl_string(struct ctl_table *table,
		  void __user *oldval, size_t __user *oldlenp,
		  void __user *newval, size_t newlen)
{
	return -ENOSYS;
}

int sysctl_intvec(struct ctl_table *table,
		void __user *oldval, size_t __user *oldlenp,
		void __user *newval, size_t newlen)
{
	return -ENOSYS;
}

int sysctl_jiffies(struct ctl_table *table,
		void __user *oldval, size_t __user *oldlenp,
		void __user *newval, size_t newlen)
{
	return -ENOSYS;
}

int sysctl_ms_jiffies(struct ctl_table *table,
		void __user *oldval, size_t __user *oldlenp,
		void __user *newval, size_t newlen)
{
	return -ENOSYS;
}
3029 | 3043 | ||
3030 | #endif /* CONFIG_SYSCTL_SYSCALL */ | 3044 | #endif /* CONFIG_SYSCTL_SYSCALL */ |
3031 | 3045 | ||
/*
 * Emit a warning (at most five times per boot) when a process uses the
 * deprecated binary sysctl(2) interface, identifying the process and
 * the numeric sysctl name it asked for.
 *
 * Returns 0 if the arguments were readable (whether or not a warning
 * was printed), or a -errno if the name could not be validated/copied.
 */
static int deprecated_sysctl_warning(struct __sysctl_args *args)
{
	static int msg_count;	/* warnings printed so far */
	int name[CTL_MAXNAME];
	int i;

	/* Check args->nlen: reject out-of-range name lengths. */
	if (args->nlen < 0 || args->nlen > CTL_MAXNAME)
		return -ENOTDIR;

	/* Read in the sysctl name for better debug message logging */
	for (i = 0; i < args->nlen; i++)
		if (get_user(name[i], args->name + i))
			return -EFAULT;

	/* Ignore accesses to kernel.version */
	if ((args->nlen == 2) && (name[0] == CTL_KERN) && (name[1] == KERN_VERSION))
		return 0;

	/* Warn at most five times to avoid flooding the log. */
	if (msg_count < 5) {
		msg_count++;
		printk(KERN_INFO
			"warning: process `%s' used the deprecated sysctl "
			"system call with ", current->comm);
		for (i = 0; i < args->nlen; i++)
			printk("%d.", name[i]);
		printk("\n");
	}
	return 0;
}
3062 | 3076 | ||
/*
 * No sense putting this after each symbol definition, twice,
 * exception granted :-)
 */
/* procfs read/write handlers */
EXPORT_SYMBOL(proc_dointvec);
EXPORT_SYMBOL(proc_dointvec_jiffies);
EXPORT_SYMBOL(proc_dointvec_minmax);
EXPORT_SYMBOL(proc_dointvec_userhz_jiffies);
EXPORT_SYMBOL(proc_dointvec_ms_jiffies);
EXPORT_SYMBOL(proc_dostring);
EXPORT_SYMBOL(proc_doulongvec_minmax);
EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax);
/* table registration */
EXPORT_SYMBOL(register_sysctl_table);
EXPORT_SYMBOL(register_sysctl_paths);
/* binary-interface strategy routines */
EXPORT_SYMBOL(sysctl_intvec);
EXPORT_SYMBOL(sysctl_jiffies);
EXPORT_SYMBOL(sysctl_ms_jiffies);
EXPORT_SYMBOL(sysctl_string);
EXPORT_SYMBOL(sysctl_data);
EXPORT_SYMBOL(unregister_sysctl_table);
3083 | 3097 |
lib/Kconfig.debug
1 | 1 | ||
2 | config PRINTK_TIME | 2 | config PRINTK_TIME |
3 | bool "Show timing information on printks" | 3 | bool "Show timing information on printks" |
4 | depends on PRINTK | 4 | depends on PRINTK |
5 | help | 5 | help |
6 | Selecting this option causes timing information to be | 6 | Selecting this option causes timing information to be |
7 | included in printk output. This allows you to measure | 7 | included in printk output. This allows you to measure |
8 | the interval between kernel operations, including bootup | 8 | the interval between kernel operations, including bootup |
9 | operations. This is useful for identifying long delays | 9 | operations. This is useful for identifying long delays |
10 | in kernel startup. | 10 | in kernel startup. |
11 | 11 | ||
12 | config ENABLE_WARN_DEPRECATED | 12 | config ENABLE_WARN_DEPRECATED |
13 | bool "Enable __deprecated logic" | 13 | bool "Enable __deprecated logic" |
14 | default y | 14 | default y |
15 | help | 15 | help |
16 | Enable the __deprecated logic in the kernel build. | 16 | Enable the __deprecated logic in the kernel build. |
17 | Disable this to suppress the "warning: 'foo' is deprecated | 17 | Disable this to suppress the "warning: 'foo' is deprecated |
18 | (declared at kernel/power/somefile.c:1234)" messages. | 18 | (declared at kernel/power/somefile.c:1234)" messages. |
19 | 19 | ||
20 | config ENABLE_MUST_CHECK | 20 | config ENABLE_MUST_CHECK |
21 | bool "Enable __must_check logic" | 21 | bool "Enable __must_check logic" |
22 | default y | 22 | default y |
23 | help | 23 | help |
24 | Enable the __must_check logic in the kernel build. Disable this to | 24 | Enable the __must_check logic in the kernel build. Disable this to |
25 | suppress the "warning: ignoring return value of 'foo', declared with | 25 | suppress the "warning: ignoring return value of 'foo', declared with |
26 | attribute warn_unused_result" messages. | 26 | attribute warn_unused_result" messages. |
27 | 27 | ||
28 | config FRAME_WARN | 28 | config FRAME_WARN |
29 | int "Warn for stack frames larger than (needs gcc 4.4)" | 29 | int "Warn for stack frames larger than (needs gcc 4.4)" |
30 | range 0 8192 | 30 | range 0 8192 |
31 | default 1024 if !64BIT | 31 | default 1024 if !64BIT |
32 | default 2048 if 64BIT | 32 | default 2048 if 64BIT |
33 | help | 33 | help |
34 | Tell gcc to warn at build time for stack frames larger than this. | 34 | Tell gcc to warn at build time for stack frames larger than this. |
35 | Setting this too low will cause a lot of warnings. | 35 | Setting this too low will cause a lot of warnings. |
36 | Setting it to 0 disables the warning. | 36 | Setting it to 0 disables the warning. |
37 | Requires gcc 4.4 | 37 | Requires gcc 4.4 |
38 | 38 | ||
39 | config MAGIC_SYSRQ | 39 | config MAGIC_SYSRQ |
40 | bool "Magic SysRq key" | 40 | bool "Magic SysRq key" |
41 | depends on !UML | 41 | depends on !UML |
42 | help | 42 | help |
43 | If you say Y here, you will have some control over the system even | 43 | If you say Y here, you will have some control over the system even |
44 | if the system crashes for example during kernel debugging (e.g., you | 44 | if the system crashes for example during kernel debugging (e.g., you |
45 | will be able to flush the buffer cache to disk, reboot the system | 45 | will be able to flush the buffer cache to disk, reboot the system |
46 | immediately or dump some status information). This is accomplished | 46 | immediately or dump some status information). This is accomplished |
47 | by pressing various keys while holding SysRq (Alt+PrintScreen). It | 47 | by pressing various keys while holding SysRq (Alt+PrintScreen). It |
48 | also works on a serial console (on PC hardware at least), if you | 48 | also works on a serial console (on PC hardware at least), if you |
49 | send a BREAK and then within 5 seconds a command keypress. The | 49 | send a BREAK and then within 5 seconds a command keypress. The |
50 | keys are documented in <file:Documentation/sysrq.txt>. Don't say Y | 50 | keys are documented in <file:Documentation/sysrq.txt>. Don't say Y |
51 | unless you really know what this hack does. | 51 | unless you really know what this hack does. |
52 | 52 | ||
53 | config UNUSED_SYMBOLS | 53 | config UNUSED_SYMBOLS |
54 | bool "Enable unused/obsolete exported symbols" | 54 | bool "Enable unused/obsolete exported symbols" |
55 | default y if X86 | 55 | default y if X86 |
56 | help | 56 | help |
57 | Unused but exported symbols make the kernel needlessly bigger. For | 57 | Unused but exported symbols make the kernel needlessly bigger. For |
58 | that reason most of these unused exports will soon be removed. This | 58 | that reason most of these unused exports will soon be removed. This |
59 | option is provided temporarily to provide a transition period in case | 59 | option is provided temporarily to provide a transition period in case |
60 | some external kernel module needs one of these symbols anyway. If you | 60 | some external kernel module needs one of these symbols anyway. If you |
61 | encounter such a case in your module, consider if you are actually | 61 | encounter such a case in your module, consider if you are actually |
62 | using the right API. (rationale: since nobody in the kernel is using | 62 | using the right API. (rationale: since nobody in the kernel is using |
63 | this in a module, there is a pretty good chance it's actually the | 63 | this in a module, there is a pretty good chance it's actually the |
64 | wrong interface to use). If you really need the symbol, please send a | 64 | wrong interface to use). If you really need the symbol, please send a |
65 | mail to the linux kernel mailing list mentioning the symbol and why | 65 | mail to the linux kernel mailing list mentioning the symbol and why |
66 | you really need it, and what the merge plan to the mainline kernel for | 66 | you really need it, and what the merge plan to the mainline kernel for |
67 | your module is. | 67 | your module is. |
68 | 68 | ||
69 | config DEBUG_FS | 69 | config DEBUG_FS |
70 | bool "Debug Filesystem" | 70 | bool "Debug Filesystem" |
71 | depends on SYSFS | 71 | depends on SYSFS |
72 | help | 72 | help |
73 | debugfs is a virtual file system that kernel developers use to put | 73 | debugfs is a virtual file system that kernel developers use to put |
74 | debugging files into. Enable this option to be able to read and | 74 | debugging files into. Enable this option to be able to read and |
75 | write to these files. | 75 | write to these files. |
76 | 76 | ||
77 | For detailed documentation on the debugfs API, see | 77 | For detailed documentation on the debugfs API, see |
78 | Documentation/DocBook/filesystems. | 78 | Documentation/DocBook/filesystems. |
79 | 79 | ||
80 | If unsure, say N. | 80 | If unsure, say N. |
81 | 81 | ||
82 | config HEADERS_CHECK | 82 | config HEADERS_CHECK |
83 | bool "Run 'make headers_check' when building vmlinux" | 83 | bool "Run 'make headers_check' when building vmlinux" |
84 | depends on !UML | 84 | depends on !UML |
85 | help | 85 | help |
86 | This option will extract the user-visible kernel headers whenever | 86 | This option will extract the user-visible kernel headers whenever |
87 | building the kernel, and will run basic sanity checks on them to | 87 | building the kernel, and will run basic sanity checks on them to |
88 | ensure that exported files do not attempt to include files which | 88 | ensure that exported files do not attempt to include files which |
89 | were not exported, etc. | 89 | were not exported, etc. |
90 | 90 | ||
91 | If you're making modifications to header files which are | 91 | If you're making modifications to header files which are |
92 | relevant for userspace, say 'Y', and check the headers | 92 | relevant for userspace, say 'Y', and check the headers |
93 | exported to $(INSTALL_HDR_PATH) (usually 'usr/include' in | 93 | exported to $(INSTALL_HDR_PATH) (usually 'usr/include' in |
94 | your build tree), to make sure they're suitable. | 94 | your build tree), to make sure they're suitable. |
95 | 95 | ||
96 | config DEBUG_SECTION_MISMATCH | 96 | config DEBUG_SECTION_MISMATCH |
97 | bool "Enable full Section mismatch analysis" | 97 | bool "Enable full Section mismatch analysis" |
98 | depends on UNDEFINED | 98 | depends on UNDEFINED |
99 | # This option is on purpose disabled for now. | 99 | # This option is on purpose disabled for now. |
100 | # It will be enabled when we are down to a reasonable number | 100 | # It will be enabled when we are down to a reasonable number |
101 | # of section mismatch warnings (< 10 for an allyesconfig build) | 101 | # of section mismatch warnings (< 10 for an allyesconfig build) |
102 | help | 102 | help |
103 | The section mismatch analysis checks if there are illegal | 103 | The section mismatch analysis checks if there are illegal |
104 | references from one section to another section. | 104 | references from one section to another section. |
105 | Linux will during link or during runtime drop some sections | 105 | Linux will during link or during runtime drop some sections |
106 | and any use of code/data previously in these sections will | 106 | and any use of code/data previously in these sections will |
107 | most likely result in an oops. | 107 | most likely result in an oops. |
108 | In the code functions and variables are annotated with | 108 | In the code functions and variables are annotated with |
109 | __init, __devinit etc. (see full list in include/linux/init.h) | 109 | __init, __devinit etc. (see full list in include/linux/init.h) |
110 | which results in the code/data being placed in specific sections. | 110 | which results in the code/data being placed in specific sections. |
111 | The section mismatch analysis is always done after a full | 111 | The section mismatch analysis is always done after a full |
112 | kernel build but enabling this option will in addition | 112 | kernel build but enabling this option will in addition |
113 | do the following: | 113 | do the following: |
114 | - Add the option -fno-inline-functions-called-once to gcc | 114 | - Add the option -fno-inline-functions-called-once to gcc |
115 | When inlining a function annotated __init in a non-init | 115 | When inlining a function annotated __init in a non-init |
116 | function we would lose the section information and thus | 116 | function we would lose the section information and thus |
117 | the analysis would not catch the illegal reference. | 117 | the analysis would not catch the illegal reference. |
118 | This option tells gcc to inline less but will also | 118 | This option tells gcc to inline less but will also |
119 | result in a larger kernel. | 119 | result in a larger kernel. |
120 | - Run the section mismatch analysis for each module/built-in.o | 120 | - Run the section mismatch analysis for each module/built-in.o |
121 | When we run the section mismatch analysis on vmlinux.o we | 121 | When we run the section mismatch analysis on vmlinux.o we |
122 | lose valuable information about where the mismatch was | 122 | lose valuable information about where the mismatch was |
123 | introduced. | 123 | introduced. |
124 | Running the analysis for each module/built-in.o file | 124 | Running the analysis for each module/built-in.o file |
125 | will tell where the mismatch happens much closer to the | 125 | will tell where the mismatch happens much closer to the |
126 | source. The drawback is that we will report the same | 126 | source. The drawback is that we will report the same |
127 | mismatch at least twice. | 127 | mismatch at least twice. |
128 | - Enable verbose reporting from modpost to help solving | 128 | - Enable verbose reporting from modpost to help solving |
129 | the section mismatches reported. | 129 | the section mismatches reported. |
130 | 130 | ||
131 | config DEBUG_KERNEL | 131 | config DEBUG_KERNEL |
132 | bool "Kernel debugging" | 132 | bool "Kernel debugging" |
133 | help | 133 | help |
134 | Say Y here if you are developing drivers or trying to debug and | 134 | Say Y here if you are developing drivers or trying to debug and |
135 | identify kernel problems. | 135 | identify kernel problems. |
136 | 136 | ||
137 | config DEBUG_SHIRQ | 137 | config DEBUG_SHIRQ |
138 | bool "Debug shared IRQ handlers" | 138 | bool "Debug shared IRQ handlers" |
139 | depends on DEBUG_KERNEL && GENERIC_HARDIRQS | 139 | depends on DEBUG_KERNEL && GENERIC_HARDIRQS |
140 | help | 140 | help |
141 | Enable this to generate a spurious interrupt as soon as a shared | 141 | Enable this to generate a spurious interrupt as soon as a shared |
142 | interrupt handler is registered, and just before one is deregistered. | 142 | interrupt handler is registered, and just before one is deregistered. |
143 | Drivers ought to be able to handle interrupts coming in at those | 143 | Drivers ought to be able to handle interrupts coming in at those |
144 | points; some don't and need to be caught. | 144 | points; some don't and need to be caught. |
145 | 145 | ||
146 | config DETECT_SOFTLOCKUP | 146 | config DETECT_SOFTLOCKUP |
147 | bool "Detect Soft Lockups" | 147 | bool "Detect Soft Lockups" |
148 | depends on DEBUG_KERNEL && !S390 | 148 | depends on DEBUG_KERNEL && !S390 |
149 | default y | 149 | default y |
150 | help | 150 | help |
151 | Say Y here to enable the kernel to detect "soft lockups", | 151 | Say Y here to enable the kernel to detect "soft lockups", |
152 | which are bugs that cause the kernel to loop in kernel | 152 | which are bugs that cause the kernel to loop in kernel |
153 | mode for more than 60 seconds, without giving other tasks a | 153 | mode for more than 60 seconds, without giving other tasks a |
154 | chance to run. | 154 | chance to run. |
155 | 155 | ||
156 | When a soft-lockup is detected, the kernel will print the | 156 | When a soft-lockup is detected, the kernel will print the |
157 | current stack trace (which you should report), but the | 157 | current stack trace (which you should report), but the |
158 | system will stay locked up. This feature has negligible | 158 | system will stay locked up. This feature has negligible |
159 | overhead. | 159 | overhead. |
160 | 160 | ||
161 | (Note that "hard lockups" are a separate type of bug that | 161 | (Note that "hard lockups" are a separate type of bug that |
162 | can be detected via the NMI-watchdog, on platforms that | 162 | can be detected via the NMI-watchdog, on platforms that |
163 | support it.) | 163 | support it.) |
164 | 164 | ||
165 | config BOOTPARAM_SOFTLOCKUP_PANIC | 165 | config BOOTPARAM_SOFTLOCKUP_PANIC |
166 | bool "Panic (Reboot) On Soft Lockups" | 166 | bool "Panic (Reboot) On Soft Lockups" |
167 | depends on DETECT_SOFTLOCKUP | 167 | depends on DETECT_SOFTLOCKUP |
168 | help | 168 | help |
169 | Say Y here to enable the kernel to panic on "soft lockups", | 169 | Say Y here to enable the kernel to panic on "soft lockups", |
170 | which are bugs that cause the kernel to loop in kernel | 170 | which are bugs that cause the kernel to loop in kernel |
171 | mode for more than 60 seconds, without giving other tasks a | 171 | mode for more than 60 seconds, without giving other tasks a |
172 | chance to run. | 172 | chance to run. |
173 | 173 | ||
174 | The panic can be used in combination with panic_timeout, | 174 | The panic can be used in combination with panic_timeout, |
175 | to cause the system to reboot automatically after a | 175 | to cause the system to reboot automatically after a |
176 | lockup has been detected. This feature is useful for | 176 | lockup has been detected. This feature is useful for |
177 | high-availability systems that have uptime guarantees and | 177 | high-availability systems that have uptime guarantees and |
178 | where a lockup must be resolved ASAP. | 178 | where a lockup must be resolved ASAP. |
179 | 179 | ||
180 | Say N if unsure. | 180 | Say N if unsure. |
181 | 181 | ||
182 | config BOOTPARAM_SOFTLOCKUP_PANIC_VALUE | 182 | config BOOTPARAM_SOFTLOCKUP_PANIC_VALUE |
183 | int | 183 | int |
184 | depends on DETECT_SOFTLOCKUP | 184 | depends on DETECT_SOFTLOCKUP |
185 | range 0 1 | 185 | range 0 1 |
186 | default 0 if !BOOTPARAM_SOFTLOCKUP_PANIC | 186 | default 0 if !BOOTPARAM_SOFTLOCKUP_PANIC |
187 | default 1 if BOOTPARAM_SOFTLOCKUP_PANIC | 187 | default 1 if BOOTPARAM_SOFTLOCKUP_PANIC |
188 | 188 | ||
189 | config SCHED_DEBUG | 189 | config SCHED_DEBUG |
190 | bool "Collect scheduler debugging info" | 190 | bool "Collect scheduler debugging info" |
191 | depends on DEBUG_KERNEL && PROC_FS | 191 | depends on DEBUG_KERNEL && PROC_FS |
192 | default y | 192 | default y |
193 | help | 193 | help |
194 | If you say Y here, the /proc/sched_debug file will be provided | 194 | If you say Y here, the /proc/sched_debug file will be provided |
195 | that can help debug the scheduler. The runtime overhead of this | 195 | that can help debug the scheduler. The runtime overhead of this |
196 | option is minimal. | 196 | option is minimal. |
197 | 197 | ||
198 | config SCHEDSTATS | 198 | config SCHEDSTATS |
199 | bool "Collect scheduler statistics" | 199 | bool "Collect scheduler statistics" |
200 | depends on DEBUG_KERNEL && PROC_FS | 200 | depends on DEBUG_KERNEL && PROC_FS |
201 | help | 201 | help |
202 | If you say Y here, additional code will be inserted into the | 202 | If you say Y here, additional code will be inserted into the |
203 | scheduler and related routines to collect statistics about | 203 | scheduler and related routines to collect statistics about |
204 | scheduler behavior and provide them in /proc/schedstat. These | 204 | scheduler behavior and provide them in /proc/schedstat. These |
205 | stats may be useful for both tuning and debugging the scheduler. | 205 | stats may be useful for both tuning and debugging the scheduler. |
206 | If you aren't debugging the scheduler or trying to tune a specific | 206 | If you aren't debugging the scheduler or trying to tune a specific |
207 | application, you can say N to avoid the very slight overhead | 207 | application, you can say N to avoid the very slight overhead |
208 | this adds. | 208 | this adds. |
209 | 209 | ||
210 | config TIMER_STATS | 210 | config TIMER_STATS |
211 | bool "Collect kernel timers statistics" | 211 | bool "Collect kernel timers statistics" |
212 | depends on DEBUG_KERNEL && PROC_FS | 212 | depends on DEBUG_KERNEL && PROC_FS |
213 | help | 213 | help |
214 | If you say Y here, additional code will be inserted into the | 214 | If you say Y here, additional code will be inserted into the |
215 | timer routines to collect statistics about kernel timers being | 215 | timer routines to collect statistics about kernel timers being |
216 | reprogrammed. The statistics can be read from /proc/timer_stats. | 216 | reprogrammed. The statistics can be read from /proc/timer_stats. |
217 | The statistics collection is started by writing 1 to /proc/timer_stats, | 217 | The statistics collection is started by writing 1 to /proc/timer_stats, |
218 | writing 0 stops it. This feature is useful to collect information | 218 | writing 0 stops it. This feature is useful to collect information |
219 | about timer usage patterns in kernel and userspace. This feature | 219 | about timer usage patterns in kernel and userspace. This feature |
220 | is lightweight if enabled in the kernel config but not activated | 220 | is lightweight if enabled in the kernel config but not activated |
221 | (it defaults to deactivated on bootup and will only be activated | 221 | (it defaults to deactivated on bootup and will only be activated |
222 | if some application like powertop activates it explicitly). | 222 | if some application like powertop activates it explicitly). |
223 | 223 | ||
224 | config DEBUG_OBJECTS | 224 | config DEBUG_OBJECTS |
225 | bool "Debug object operations" | 225 | bool "Debug object operations" |
226 | depends on DEBUG_KERNEL | 226 | depends on DEBUG_KERNEL |
227 | help | 227 | help |
228 | If you say Y here, additional code will be inserted into the | 228 | If you say Y here, additional code will be inserted into the |
229 | kernel to track the life time of various objects and validate | 229 | kernel to track the life time of various objects and validate |
230 | the operations on those objects. | 230 | the operations on those objects. |
231 | 231 | ||
232 | config DEBUG_OBJECTS_SELFTEST | 232 | config DEBUG_OBJECTS_SELFTEST |
233 | bool "Debug objects selftest" | 233 | bool "Debug objects selftest" |
234 | depends on DEBUG_OBJECTS | 234 | depends on DEBUG_OBJECTS |
235 | help | 235 | help |
236 | This enables the selftest of the object debug code. | 236 | This enables the selftest of the object debug code. |
237 | 237 | ||
238 | config DEBUG_OBJECTS_FREE | 238 | config DEBUG_OBJECTS_FREE |
239 | bool "Debug objects in freed memory" | 239 | bool "Debug objects in freed memory" |
240 | depends on DEBUG_OBJECTS | 240 | depends on DEBUG_OBJECTS |
241 | help | 241 | help |
242 | This enables checks whether a k/v free operation frees an area | 242 | This enables checks whether a k/v free operation frees an area |
243 | which contains an object which has not been deactivated | 243 | which contains an object which has not been deactivated |
244 | properly. This can make kmalloc/kfree-intensive workloads | 244 | properly. This can make kmalloc/kfree-intensive workloads |
245 | much slower. | 245 | much slower. |
246 | 246 | ||
247 | config DEBUG_OBJECTS_TIMERS | 247 | config DEBUG_OBJECTS_TIMERS |
248 | bool "Debug timer objects" | 248 | bool "Debug timer objects" |
249 | depends on DEBUG_OBJECTS | 249 | depends on DEBUG_OBJECTS |
250 | help | 250 | help |
251 | If you say Y here, additional code will be inserted into the | 251 | If you say Y here, additional code will be inserted into the |
252 | timer routines to track the life time of timer objects and | 252 | timer routines to track the life time of timer objects and |
253 | validate the timer operations. | 253 | validate the timer operations. |
254 | 254 | ||
255 | config DEBUG_OBJECTS_ENABLE_DEFAULT | 255 | config DEBUG_OBJECTS_ENABLE_DEFAULT |
256 | int "debug_objects bootup default value (0-1)" | 256 | int "debug_objects bootup default value (0-1)" |
257 | range 0 1 | 257 | range 0 1 |
258 | default "1" | 258 | default "1" |
259 | depends on DEBUG_OBJECTS | 259 | depends on DEBUG_OBJECTS |
260 | help | 260 | help |
261 | Debug objects boot parameter default value | 261 | Debug objects boot parameter default value |
262 | 262 | ||
263 | config DEBUG_SLAB | 263 | config DEBUG_SLAB |
264 | bool "Debug slab memory allocations" | 264 | bool "Debug slab memory allocations" |
265 | depends on DEBUG_KERNEL && SLAB | 265 | depends on DEBUG_KERNEL && SLAB |
266 | help | 266 | help |
267 | Say Y here to have the kernel do limited verification on memory | 267 | Say Y here to have the kernel do limited verification on memory |
268 | allocation as well as poisoning memory on free to catch use of freed | 268 | allocation as well as poisoning memory on free to catch use of freed |
269 | memory. This can make kmalloc/kfree-intensive workloads much slower. | 269 | memory. This can make kmalloc/kfree-intensive workloads much slower. |
270 | 270 | ||
271 | config DEBUG_SLAB_LEAK | 271 | config DEBUG_SLAB_LEAK |
272 | bool "Memory leak debugging" | 272 | bool "Memory leak debugging" |
273 | depends on DEBUG_SLAB | 273 | depends on DEBUG_SLAB |
274 | 274 | ||
275 | config SLUB_DEBUG_ON | 275 | config SLUB_DEBUG_ON |
276 | bool "SLUB debugging on by default" | 276 | bool "SLUB debugging on by default" |
277 | depends on SLUB && SLUB_DEBUG | 277 | depends on SLUB && SLUB_DEBUG |
278 | default n | 278 | default n |
279 | help | 279 | help |
280 | Boot with debugging on by default. SLUB boots by default with | 280 | Boot with debugging on by default. SLUB boots by default with |
281 | the runtime debug capabilities switched off. Enabling this is | 281 | the runtime debug capabilities switched off. Enabling this is |
282 | equivalent to specifying the "slub_debug" parameter on boot. | 282 | equivalent to specifying the "slub_debug" parameter on boot. |
283 | There is no support for more fine grained debug control like | 283 | There is no support for more fine grained debug control like |
284 | possible with slub_debug=xxx. SLUB debugging may be switched | 284 | possible with slub_debug=xxx. SLUB debugging may be switched |
285 | off in a kernel built with CONFIG_SLUB_DEBUG_ON by specifying | 285 | off in a kernel built with CONFIG_SLUB_DEBUG_ON by specifying |
286 | "slub_debug=-". | 286 | "slub_debug=-". |
287 | 287 | ||
288 | config SLUB_STATS | 288 | config SLUB_STATS |
289 | default n | 289 | default n |
290 | bool "Enable SLUB performance statistics" | 290 | bool "Enable SLUB performance statistics" |
291 | depends on SLUB && SLUB_DEBUG && SYSFS | 291 | depends on SLUB && SLUB_DEBUG && SYSFS |
292 | help | 292 | help |
293 | SLUB statistics are useful to debug SLUBs allocation behavior in | 293 | SLUB statistics are useful to debug SLUBs allocation behavior in |
294 | order to find ways to optimize the allocator. This should never be | 294 | order to find ways to optimize the allocator. This should never be |
295 | enabled for production use since keeping statistics slows down | 295 | enabled for production use since keeping statistics slows down |
296 | the allocator by a few percentage points. The slabinfo command | 296 | the allocator by a few percentage points. The slabinfo command |
297 | supports the determination of the most active slabs to figure | 297 | supports the determination of the most active slabs to figure |
298 | out which slabs are relevant to a particular load. | 298 | out which slabs are relevant to a particular load. |
299 | Try running: slabinfo -DA | 299 | Try running: slabinfo -DA |
300 | 300 | ||
301 | config DEBUG_PREEMPT | 301 | config DEBUG_PREEMPT |
302 | bool "Debug preemptible kernel" | 302 | bool "Debug preemptible kernel" |
303 | depends on DEBUG_KERNEL && PREEMPT && (TRACE_IRQFLAGS_SUPPORT || PPC64) | 303 | depends on DEBUG_KERNEL && PREEMPT && (TRACE_IRQFLAGS_SUPPORT || PPC64) |
304 | default y | 304 | default y |
305 | help | 305 | help |
306 | If you say Y here then the kernel will use a debug variant of the | 306 | If you say Y here then the kernel will use a debug variant of the |
307 | commonly used smp_processor_id() function and will print warnings | 307 | commonly used smp_processor_id() function and will print warnings |
308 | if kernel code uses it in a preemption-unsafe way. Also, the kernel | 308 | if kernel code uses it in a preemption-unsafe way. Also, the kernel |
309 | will detect preemption count underflows. | 309 | will detect preemption count underflows. |
310 | 310 | ||
311 | config DEBUG_RT_MUTEXES | 311 | config DEBUG_RT_MUTEXES |
312 | bool "RT Mutex debugging, deadlock detection" | 312 | bool "RT Mutex debugging, deadlock detection" |
313 | depends on DEBUG_KERNEL && RT_MUTEXES | 313 | depends on DEBUG_KERNEL && RT_MUTEXES |
314 | help | 314 | help |
315 | This allows rt mutex semantics violations and rt mutex related | 315 | This allows rt mutex semantics violations and rt mutex related |
316 | deadlocks (lockups) to be detected and reported automatically. | 316 | deadlocks (lockups) to be detected and reported automatically. |
317 | 317 | ||
318 | config DEBUG_PI_LIST | 318 | config DEBUG_PI_LIST |
319 | bool | 319 | bool |
320 | default y | 320 | default y |
321 | depends on DEBUG_RT_MUTEXES | 321 | depends on DEBUG_RT_MUTEXES |
322 | 322 | ||
323 | config RT_MUTEX_TESTER | 323 | config RT_MUTEX_TESTER |
324 | bool "Built-in scriptable tester for rt-mutexes" | 324 | bool "Built-in scriptable tester for rt-mutexes" |
325 | depends on DEBUG_KERNEL && RT_MUTEXES | 325 | depends on DEBUG_KERNEL && RT_MUTEXES |
326 | help | 326 | help |
327 | This option enables a rt-mutex tester. | 327 | This option enables a rt-mutex tester. |
328 | 328 | ||
329 | config DEBUG_SPINLOCK | 329 | config DEBUG_SPINLOCK |
330 | bool "Spinlock and rw-lock debugging: basic checks" | 330 | bool "Spinlock and rw-lock debugging: basic checks" |
331 | depends on DEBUG_KERNEL | 331 | depends on DEBUG_KERNEL |
332 | help | 332 | help |
333 | Say Y here and build SMP to catch missing spinlock initialization | 333 | Say Y here and build SMP to catch missing spinlock initialization |
334 | and certain other kinds of spinlock errors commonly made. This is | 334 | and certain other kinds of spinlock errors commonly made. This is |
335 | best used in conjunction with the NMI watchdog so that spinlock | 335 | best used in conjunction with the NMI watchdog so that spinlock |
336 | deadlocks are also debuggable. | 336 | deadlocks are also debuggable. |
337 | 337 | ||
338 | config DEBUG_MUTEXES | 338 | config DEBUG_MUTEXES |
339 | bool "Mutex debugging: basic checks" | 339 | bool "Mutex debugging: basic checks" |
340 | depends on DEBUG_KERNEL | 340 | depends on DEBUG_KERNEL |
341 | help | 341 | help |
342 | This feature allows mutex semantics violations to be detected and | 342 | This feature allows mutex semantics violations to be detected and |
343 | reported. | 343 | reported. |
344 | 344 | ||
345 | config DEBUG_LOCK_ALLOC | 345 | config DEBUG_LOCK_ALLOC |
346 | bool "Lock debugging: detect incorrect freeing of live locks" | 346 | bool "Lock debugging: detect incorrect freeing of live locks" |
347 | depends on DEBUG_KERNEL && TRACE_IRQFLAGS_SUPPORT && STACKTRACE_SUPPORT && LOCKDEP_SUPPORT | 347 | depends on DEBUG_KERNEL && TRACE_IRQFLAGS_SUPPORT && STACKTRACE_SUPPORT && LOCKDEP_SUPPORT |
348 | select DEBUG_SPINLOCK | 348 | select DEBUG_SPINLOCK |
349 | select DEBUG_MUTEXES | 349 | select DEBUG_MUTEXES |
350 | select LOCKDEP | 350 | select LOCKDEP |
351 | help | 351 | help |
352 | This feature will check whether any held lock (spinlock, rwlock, | 352 | This feature will check whether any held lock (spinlock, rwlock, |
353 | mutex or rwsem) is incorrectly freed by the kernel, via any of the | 353 | mutex or rwsem) is incorrectly freed by the kernel, via any of the |
354 | memory-freeing routines (kfree(), kmem_cache_free(), free_pages(), | 354 | memory-freeing routines (kfree(), kmem_cache_free(), free_pages(), |
355 | vfree(), etc.), whether a live lock is incorrectly reinitialized via | 355 | vfree(), etc.), whether a live lock is incorrectly reinitialized via |
356 | spin_lock_init()/mutex_init()/etc., or whether there is any lock | 356 | spin_lock_init()/mutex_init()/etc., or whether there is any lock |
357 | held during task exit. | 357 | held during task exit. |
358 | 358 | ||
359 | config PROVE_LOCKING | 359 | config PROVE_LOCKING |
360 | bool "Lock debugging: prove locking correctness" | 360 | bool "Lock debugging: prove locking correctness" |
361 | depends on DEBUG_KERNEL && TRACE_IRQFLAGS_SUPPORT && STACKTRACE_SUPPORT && LOCKDEP_SUPPORT | 361 | depends on DEBUG_KERNEL && TRACE_IRQFLAGS_SUPPORT && STACKTRACE_SUPPORT && LOCKDEP_SUPPORT |
362 | select LOCKDEP | 362 | select LOCKDEP |
363 | select DEBUG_SPINLOCK | 363 | select DEBUG_SPINLOCK |
364 | select DEBUG_MUTEXES | 364 | select DEBUG_MUTEXES |
365 | select DEBUG_LOCK_ALLOC | 365 | select DEBUG_LOCK_ALLOC |
366 | default n | 366 | default n |
367 | help | 367 | help |
368 | This feature enables the kernel to prove that all locking | 368 | This feature enables the kernel to prove that all locking |
369 | that occurs in the kernel runtime is mathematically | 369 | that occurs in the kernel runtime is mathematically |
370 | correct: that under no circumstance could an arbitrary (and | 370 | correct: that under no circumstance could an arbitrary (and |
371 | not yet triggered) combination of observed locking | 371 | not yet triggered) combination of observed locking |
372 | sequences (on an arbitrary number of CPUs, running an | 372 | sequences (on an arbitrary number of CPUs, running an |
373 | arbitrary number of tasks and interrupt contexts) cause a | 373 | arbitrary number of tasks and interrupt contexts) cause a |
374 | deadlock. | 374 | deadlock. |
375 | 375 | ||
376 | In short, this feature enables the kernel to report locking | 376 | In short, this feature enables the kernel to report locking |
377 | related deadlocks before they actually occur. | 377 | related deadlocks before they actually occur. |
378 | 378 | ||
379 | The proof does not depend on how hard and complex a | 379 | The proof does not depend on how hard and complex a |
380 | deadlock scenario would be to trigger: how many | 380 | deadlock scenario would be to trigger: how many |
381 | participant CPUs, tasks and irq-contexts would be needed | 381 | participant CPUs, tasks and irq-contexts would be needed |
382 | for it to trigger. The proof also does not depend on | 382 | for it to trigger. The proof also does not depend on |
383 | timing: if a race and a resulting deadlock is possible | 383 | timing: if a race and a resulting deadlock is possible |
384 | theoretically (no matter how unlikely the race scenario | 384 | theoretically (no matter how unlikely the race scenario |
385 | is), it will be proven so and will immediately be | 385 | is), it will be proven so and will immediately be |
386 | reported by the kernel (once the event is observed that | 386 | reported by the kernel (once the event is observed that |
387 | makes the deadlock theoretically possible). | 387 | makes the deadlock theoretically possible). |
388 | 388 | ||
389 | If a deadlock is impossible (i.e. the locking rules, as | 389 | If a deadlock is impossible (i.e. the locking rules, as |
390 | observed by the kernel, are mathematically correct), the | 390 | observed by the kernel, are mathematically correct), the |
391 | kernel reports nothing. | 391 | kernel reports nothing. |
392 | 392 | ||
393 | NOTE: this feature can also be enabled for rwlocks, mutexes | 393 | NOTE: this feature can also be enabled for rwlocks, mutexes |
394 | and rwsems - in which case all dependencies between these | 394 | and rwsems - in which case all dependencies between these |
395 | different locking variants are observed and mapped too, and | 395 | different locking variants are observed and mapped too, and |
396 | the proof of observed correctness is also maintained for an | 396 | the proof of observed correctness is also maintained for an |
397 | arbitrary combination of these separate locking variants. | 397 | arbitrary combination of these separate locking variants. |
398 | 398 | ||
399 | For more details, see Documentation/lockdep-design.txt. | 399 | For more details, see Documentation/lockdep-design.txt. |
400 | 400 | ||
401 | config LOCKDEP | 401 | config LOCKDEP |
402 | bool | 402 | bool |
403 | depends on DEBUG_KERNEL && TRACE_IRQFLAGS_SUPPORT && STACKTRACE_SUPPORT && LOCKDEP_SUPPORT | 403 | depends on DEBUG_KERNEL && TRACE_IRQFLAGS_SUPPORT && STACKTRACE_SUPPORT && LOCKDEP_SUPPORT |
404 | select STACKTRACE | 404 | select STACKTRACE |
405 | select FRAME_POINTER if !X86 && !MIPS && !PPC | 405 | select FRAME_POINTER if !X86 && !MIPS && !PPC |
406 | select KALLSYMS | 406 | select KALLSYMS |
407 | select KALLSYMS_ALL | 407 | select KALLSYMS_ALL |
408 | 408 | ||
409 | config LOCK_STAT | 409 | config LOCK_STAT |
410 | bool "Lock usage statistics" | 410 | bool "Lock usage statistics" |
411 | depends on DEBUG_KERNEL && TRACE_IRQFLAGS_SUPPORT && STACKTRACE_SUPPORT && LOCKDEP_SUPPORT | 411 | depends on DEBUG_KERNEL && TRACE_IRQFLAGS_SUPPORT && STACKTRACE_SUPPORT && LOCKDEP_SUPPORT |
412 | select LOCKDEP | 412 | select LOCKDEP |
413 | select DEBUG_SPINLOCK | 413 | select DEBUG_SPINLOCK |
414 | select DEBUG_MUTEXES | 414 | select DEBUG_MUTEXES |
415 | select DEBUG_LOCK_ALLOC | 415 | select DEBUG_LOCK_ALLOC |
416 | default n | 416 | default n |
417 | help | 417 | help |
418 | This feature enables tracking lock contention points | 418 | This feature enables tracking lock contention points |
419 | 419 | ||
420 | For more details, see Documentation/lockstat.txt | 420 | For more details, see Documentation/lockstat.txt |
421 | 421 | ||
422 | config DEBUG_LOCKDEP | 422 | config DEBUG_LOCKDEP |
423 | bool "Lock dependency engine debugging" | 423 | bool "Lock dependency engine debugging" |
424 | depends on DEBUG_KERNEL && LOCKDEP | 424 | depends on DEBUG_KERNEL && LOCKDEP |
425 | help | 425 | help |
426 | If you say Y here, the lock dependency engine will do | 426 | If you say Y here, the lock dependency engine will do |
427 | additional runtime checks to debug itself, at the price | 427 | additional runtime checks to debug itself, at the price |
428 | of more runtime overhead. | 428 | of more runtime overhead. |
429 | 429 | ||
430 | config TRACE_IRQFLAGS | 430 | config TRACE_IRQFLAGS |
431 | depends on DEBUG_KERNEL | 431 | depends on DEBUG_KERNEL |
432 | bool | 432 | bool |
433 | default y | 433 | default y |
434 | depends on TRACE_IRQFLAGS_SUPPORT | 434 | depends on TRACE_IRQFLAGS_SUPPORT |
435 | depends on PROVE_LOCKING | 435 | depends on PROVE_LOCKING |
436 | 436 | ||
437 | config DEBUG_SPINLOCK_SLEEP | 437 | config DEBUG_SPINLOCK_SLEEP |
438 | bool "Spinlock debugging: sleep-inside-spinlock checking" | 438 | bool "Spinlock debugging: sleep-inside-spinlock checking" |
439 | depends on DEBUG_KERNEL | 439 | depends on DEBUG_KERNEL |
440 | help | 440 | help |
441 | If you say Y here, various routines which may sleep will become very | 441 | If you say Y here, various routines which may sleep will become very |
442 | noisy if they are called with a spinlock held. | 442 | noisy if they are called with a spinlock held. |
443 | 443 | ||
444 | config DEBUG_LOCKING_API_SELFTESTS | 444 | config DEBUG_LOCKING_API_SELFTESTS |
445 | bool "Locking API boot-time self-tests" | 445 | bool "Locking API boot-time self-tests" |
446 | depends on DEBUG_KERNEL | 446 | depends on DEBUG_KERNEL |
447 | help | 447 | help |
448 | Say Y here if you want the kernel to run a short self-test during | 448 | Say Y here if you want the kernel to run a short self-test during |
449 | bootup. The self-test checks whether common types of locking bugs | 449 | bootup. The self-test checks whether common types of locking bugs |
450 | are detected by debugging mechanisms or not. (if you disable | 450 | are detected by debugging mechanisms or not. (if you disable |
451 | lock debugging then those bugs won't be detected of course.) | 451 | lock debugging then those bugs won't be detected of course.) |
452 | The following locking APIs are covered: spinlocks, rwlocks, | 452 | The following locking APIs are covered: spinlocks, rwlocks, |
453 | mutexes and rwsems. | 453 | mutexes and rwsems. |
454 | 454 | ||
455 | config STACKTRACE | 455 | config STACKTRACE |
456 | bool | 456 | bool |
457 | depends on STACKTRACE_SUPPORT | 457 | depends on STACKTRACE_SUPPORT |
458 | 458 | ||
459 | config DEBUG_KOBJECT | 459 | config DEBUG_KOBJECT |
460 | bool "kobject debugging" | 460 | bool "kobject debugging" |
461 | depends on DEBUG_KERNEL | 461 | depends on DEBUG_KERNEL |
462 | help | 462 | help |
463 | If you say Y here, some extra kobject debugging messages will be sent | 463 | If you say Y here, some extra kobject debugging messages will be sent |
464 | to the syslog. | 464 | to the syslog. |
465 | 465 | ||
466 | config DEBUG_HIGHMEM | 466 | config DEBUG_HIGHMEM |
467 | bool "Highmem debugging" | 467 | bool "Highmem debugging" |
468 | depends on DEBUG_KERNEL && HIGHMEM | 468 | depends on DEBUG_KERNEL && HIGHMEM |
469 | help | 469 | help |
470 | This option enables additional error checking for high memory systems. | 470 | This option enables additional error checking for high memory systems. |
471 | Disable for production systems. | 471 | Disable for production systems. |
472 | 472 | ||
473 | config DEBUG_BUGVERBOSE | 473 | config DEBUG_BUGVERBOSE |
474 | bool "Verbose BUG() reporting (adds 70K)" if DEBUG_KERNEL && EMBEDDED | 474 | bool "Verbose BUG() reporting (adds 70K)" if DEBUG_KERNEL && EMBEDDED |
475 | depends on BUG | 475 | depends on BUG |
476 | depends on ARM || AVR32 || M32R || M68K || SPARC32 || SPARC64 || \ | 476 | depends on ARM || AVR32 || M32R || M68K || SPARC32 || SPARC64 || \ |
477 | FRV || SUPERH || GENERIC_BUG || BLACKFIN || MN10300 | 477 | FRV || SUPERH || GENERIC_BUG || BLACKFIN || MN10300 |
478 | default !EMBEDDED | 478 | default !EMBEDDED |
479 | help | 479 | help |
480 | Say Y here to make BUG() panics output the file name and line number | 480 | Say Y here to make BUG() panics output the file name and line number |
481 | of the BUG call as well as the EIP and oops trace. This aids | 481 | of the BUG call as well as the EIP and oops trace. This aids |
482 | debugging but costs about 70-100K of memory. | 482 | debugging but costs about 70-100K of memory. |
483 | 483 | ||
484 | config DEBUG_INFO | 484 | config DEBUG_INFO |
485 | bool "Compile the kernel with debug info" | 485 | bool "Compile the kernel with debug info" |
486 | depends on DEBUG_KERNEL | 486 | depends on DEBUG_KERNEL |
487 | help | 487 | help |
488 | If you say Y here the resulting kernel image will include | 488 | If you say Y here the resulting kernel image will include |
489 | debugging info resulting in a larger kernel image. | 489 | debugging info resulting in a larger kernel image. |
490 | This adds debug symbols to the kernel and modules (gcc -g), and | 490 | This adds debug symbols to the kernel and modules (gcc -g), and |
491 | is needed if you intend to use kernel crashdump or binary object | 491 | is needed if you intend to use kernel crashdump or binary object |
492 | tools like crash, kgdb, LKCD, gdb, etc on the kernel. | 492 | tools like crash, kgdb, LKCD, gdb, etc on the kernel. |
493 | Say Y here only if you plan to debug the kernel. | 493 | Say Y here only if you plan to debug the kernel. |
494 | 494 | ||
495 | If unsure, say N. | 495 | If unsure, say N. |
496 | 496 | ||
497 | config DEBUG_VM | 497 | config DEBUG_VM |
498 | bool "Debug VM" | 498 | bool "Debug VM" |
499 | depends on DEBUG_KERNEL | 499 | depends on DEBUG_KERNEL |
500 | help | 500 | help |
501 | Enable this to turn on extended checks in the virtual-memory system | 501 | Enable this to turn on extended checks in the virtual-memory system |
502 | that may impact performance. | 502 | that may impact performance. |
503 | 503 | ||
504 | If unsure, say N. | 504 | If unsure, say N. |
505 | 505 | ||
506 | config DEBUG_VIRTUAL | 506 | config DEBUG_VIRTUAL |
507 | bool "Debug VM translations" | 507 | bool "Debug VM translations" |
508 | depends on DEBUG_KERNEL && X86 | 508 | depends on DEBUG_KERNEL && X86 |
509 | help | 509 | help |
510 | Enable some costly sanity checks in virtual to page code. This can | 510 | Enable some costly sanity checks in virtual to page code. This can |
511 | catch mistakes with virt_to_page() and friends. | 511 | catch mistakes with virt_to_page() and friends. |
512 | 512 | ||
513 | If unsure, say N. | 513 | If unsure, say N. |
514 | 514 | ||
515 | config DEBUG_NOMMU_REGIONS | ||
516 | bool "Debug the global anon/private NOMMU mapping region tree" | ||
517 | depends on DEBUG_KERNEL && !MMU | ||
518 | help | ||
519 | This option causes the global tree of anonymous and private mapping | ||
520 | regions to be regularly checked for invalid topology. | ||
521 | |||
515 | config DEBUG_WRITECOUNT | 522 | config DEBUG_WRITECOUNT |
516 | bool "Debug filesystem writers count" | 523 | bool "Debug filesystem writers count" |
517 | depends on DEBUG_KERNEL | 524 | depends on DEBUG_KERNEL |
518 | help | 525 | help |
519 | Enable this to catch wrong use of the writers count in struct | 526 | Enable this to catch wrong use of the writers count in struct |
520 | vfsmount. This will increase the size of each file struct by | 527 | vfsmount. This will increase the size of each file struct by |
521 | 32 bits. | 528 | 32 bits. |
522 | 529 | ||
523 | If unsure, say N. | 530 | If unsure, say N. |
524 | 531 | ||
525 | config DEBUG_MEMORY_INIT | 532 | config DEBUG_MEMORY_INIT |
526 | bool "Debug memory initialisation" if EMBEDDED | 533 | bool "Debug memory initialisation" if EMBEDDED |
527 | default !EMBEDDED | 534 | default !EMBEDDED |
528 | help | 535 | help |
529 | Enable this for additional checks during memory initialisation. | 536 | Enable this for additional checks during memory initialisation. |
530 | The sanity checks verify aspects of the VM such as the memory model | 537 | The sanity checks verify aspects of the VM such as the memory model |
531 | and other information provided by the architecture. Verbose | 538 | and other information provided by the architecture. Verbose |
532 | information will be printed at KERN_DEBUG loglevel depending | 539 | information will be printed at KERN_DEBUG loglevel depending |
533 | on the mminit_loglevel= command-line option. | 540 | on the mminit_loglevel= command-line option. |
534 | 541 | ||
535 | If unsure, say Y | 542 | If unsure, say Y |
536 | 543 | ||
537 | config DEBUG_LIST | 544 | config DEBUG_LIST |
538 | bool "Debug linked list manipulation" | 545 | bool "Debug linked list manipulation" |
539 | depends on DEBUG_KERNEL | 546 | depends on DEBUG_KERNEL |
540 | help | 547 | help |
541 | Enable this to turn on extended checks in the linked-list | 548 | Enable this to turn on extended checks in the linked-list |
542 | walking routines. | 549 | walking routines. |
543 | 550 | ||
544 | If unsure, say N. | 551 | If unsure, say N. |
545 | 552 | ||
546 | config DEBUG_SG | 553 | config DEBUG_SG |
547 | bool "Debug SG table operations" | 554 | bool "Debug SG table operations" |
548 | depends on DEBUG_KERNEL | 555 | depends on DEBUG_KERNEL |
549 | help | 556 | help |
550 | Enable this to turn on checks on scatter-gather tables. This can | 557 | Enable this to turn on checks on scatter-gather tables. This can |
551 | help find problems with drivers that do not properly initialize | 558 | help find problems with drivers that do not properly initialize |
552 | their sg tables. | 559 | their sg tables. |
553 | 560 | ||
554 | If unsure, say N. | 561 | If unsure, say N. |
555 | 562 | ||
556 | config DEBUG_NOTIFIERS | 563 | config DEBUG_NOTIFIERS |
557 | bool "Debug notifier call chains" | 564 | bool "Debug notifier call chains" |
558 | depends on DEBUG_KERNEL | 565 | depends on DEBUG_KERNEL |
559 | help | 566 | help |
560 | Enable this to turn on sanity checking for notifier call chains. | 567 | Enable this to turn on sanity checking for notifier call chains. |
561 | This is most useful for kernel developers to make sure that | 568 | This is most useful for kernel developers to make sure that |
562 | modules properly unregister themselves from notifier chains. | 569 | modules properly unregister themselves from notifier chains. |
563 | This is a relatively cheap check but if you care about maximum | 570 | This is a relatively cheap check but if you care about maximum |
564 | performance, say N. | 571 | performance, say N. |
565 | 572 | ||
566 | config FRAME_POINTER | 573 | config FRAME_POINTER |
567 | bool "Compile the kernel with frame pointers" | 574 | bool "Compile the kernel with frame pointers" |
568 | depends on DEBUG_KERNEL && \ | 575 | depends on DEBUG_KERNEL && \ |
569 | (X86 || CRIS || M68K || M68KNOMMU || FRV || UML || S390 || \ | 576 | (X86 || CRIS || M68K || M68KNOMMU || FRV || UML || S390 || \ |
570 | AVR32 || SUPERH || BLACKFIN || MN10300) | 577 | AVR32 || SUPERH || BLACKFIN || MN10300) |
571 | default y if DEBUG_INFO && UML | 578 | default y if DEBUG_INFO && UML |
572 | help | 579 | help |
573 | If you say Y here the resulting kernel image will be slightly larger | 580 | If you say Y here the resulting kernel image will be slightly larger |
574 | and slower, but it might give very useful debugging information on | 581 | and slower, but it might give very useful debugging information on |
575 | some architectures or if you use external debuggers. | 582 | some architectures or if you use external debuggers. |
576 | If you don't debug the kernel, you can say N. | 583 | If you don't debug the kernel, you can say N. |
577 | 584 | ||
578 | config BOOT_PRINTK_DELAY | 585 | config BOOT_PRINTK_DELAY |
579 | bool "Delay each boot printk message by N milliseconds" | 586 | bool "Delay each boot printk message by N milliseconds" |
580 | depends on DEBUG_KERNEL && PRINTK && GENERIC_CALIBRATE_DELAY | 587 | depends on DEBUG_KERNEL && PRINTK && GENERIC_CALIBRATE_DELAY |
581 | help | 588 | help |
582 | This build option allows you to read kernel boot messages | 589 | This build option allows you to read kernel boot messages |
583 | by inserting a short delay after each one. The delay is | 590 | by inserting a short delay after each one. The delay is |
584 | specified in milliseconds on the kernel command line, | 591 | specified in milliseconds on the kernel command line, |
585 | using "boot_delay=N". | 592 | using "boot_delay=N". |
586 | 593 | ||
587 | It is likely that you would also need to use "lpj=M" to preset | 594 | It is likely that you would also need to use "lpj=M" to preset |
588 | the "loops per jiffie" value. | 595 | the "loops per jiffie" value. |
589 | See a previous boot log for the "lpj" value to use for your | 596 | See a previous boot log for the "lpj" value to use for your |
590 | system, and then set "lpj=M" before setting "boot_delay=N". | 597 | system, and then set "lpj=M" before setting "boot_delay=N". |
591 | NOTE: Using this option may adversely affect SMP systems. | 598 | NOTE: Using this option may adversely affect SMP systems. |
592 | I.e., processors other than the first one may not boot up. | 599 | I.e., processors other than the first one may not boot up. |
593 | BOOT_PRINTK_DELAY also may cause DETECT_SOFTLOCKUP to detect | 600 | BOOT_PRINTK_DELAY also may cause DETECT_SOFTLOCKUP to detect |
594 | what it believes to be lockup conditions. | 601 | what it believes to be lockup conditions. |
595 | 602 | ||
596 | config RCU_TORTURE_TEST | 603 | config RCU_TORTURE_TEST |
597 | tristate "torture tests for RCU" | 604 | tristate "torture tests for RCU" |
598 | depends on DEBUG_KERNEL | 605 | depends on DEBUG_KERNEL |
599 | default n | 606 | default n |
600 | help | 607 | help |
601 | This option provides a kernel module that runs torture tests | 608 | This option provides a kernel module that runs torture tests |
602 | on the RCU infrastructure. The kernel module may be built | 609 | on the RCU infrastructure. The kernel module may be built |
603 | after the fact on the running kernel to be tested, if desired. | 610 | after the fact on the running kernel to be tested, if desired. |
604 | 611 | ||
605 | Say Y here if you want RCU torture tests to be built into | 612 | Say Y here if you want RCU torture tests to be built into |
606 | the kernel. | 613 | the kernel. |
607 | Say M if you want the RCU torture tests to build as a module. | 614 | Say M if you want the RCU torture tests to build as a module. |
608 | Say N if you are unsure. | 615 | Say N if you are unsure. |
609 | 616 | ||
610 | config RCU_TORTURE_TEST_RUNNABLE | 617 | config RCU_TORTURE_TEST_RUNNABLE |
611 | bool "torture tests for RCU runnable by default" | 618 | bool "torture tests for RCU runnable by default" |
612 | depends on RCU_TORTURE_TEST = y | 619 | depends on RCU_TORTURE_TEST = y |
613 | default n | 620 | default n |
614 | help | 621 | help |
615 | This option provides a way to build the RCU torture tests | 622 | This option provides a way to build the RCU torture tests |
616 | directly into the kernel without them starting up at boot | 623 | directly into the kernel without them starting up at boot |
617 | time. You can use /proc/sys/kernel/rcutorture_runnable | 624 | time. You can use /proc/sys/kernel/rcutorture_runnable |
618 | to manually override this setting. This /proc file is | 625 | to manually override this setting. This /proc file is |
619 | available only when the RCU torture tests have been built | 626 | available only when the RCU torture tests have been built |
620 | into the kernel. | 627 | into the kernel. |
621 | 628 | ||
622 | Say Y here if you want the RCU torture tests to start during | 629 | Say Y here if you want the RCU torture tests to start during |
623 | boot (you probably don't). | 630 | boot (you probably don't). |
624 | Say N here if you want the RCU torture tests to start only | 631 | Say N here if you want the RCU torture tests to start only |
625 | after being manually enabled via /proc. | 632 | after being manually enabled via /proc. |
626 | 633 | ||
627 | config RCU_CPU_STALL_DETECTOR | 634 | config RCU_CPU_STALL_DETECTOR |
628 | bool "Check for stalled CPUs delaying RCU grace periods" | 635 | bool "Check for stalled CPUs delaying RCU grace periods" |
629 | depends on CLASSIC_RCU | 636 | depends on CLASSIC_RCU |
630 | default n | 637 | default n |
631 | help | 638 | help |
632 | This option causes RCU to printk information on which | 639 | This option causes RCU to printk information on which |
633 | CPUs are delaying the current grace period, but only when | 640 | CPUs are delaying the current grace period, but only when |
634 | the grace period extends for excessive time periods. | 641 | the grace period extends for excessive time periods. |
635 | 642 | ||
636 | Say Y if you want RCU to perform such checks. | 643 | Say Y if you want RCU to perform such checks. |
637 | 644 | ||
638 | Say N if you are unsure. | 645 | Say N if you are unsure. |
639 | 646 | ||
640 | config RCU_CPU_STALL_DETECTOR | 647 | config RCU_CPU_STALL_DETECTOR |
641 | bool "Check for stalled CPUs delaying RCU grace periods" | 648 | bool "Check for stalled CPUs delaying RCU grace periods" |
642 | depends on CLASSIC_RCU || TREE_RCU | 649 | depends on CLASSIC_RCU || TREE_RCU |
643 | default n | 650 | default n |
644 | help | 651 | help |
645 | This option causes RCU to printk information on which | 652 | This option causes RCU to printk information on which |
646 | CPUs are delaying the current grace period, but only when | 653 | CPUs are delaying the current grace period, but only when |
647 | the grace period extends for excessive time periods. | 654 | the grace period extends for excessive time periods. |
648 | 655 | ||
649 | Say Y if you want RCU to perform such checks. | 656 | Say Y if you want RCU to perform such checks. |
650 | 657 | ||
651 | Say N if you are unsure. | 658 | Say N if you are unsure. |
652 | 659 | ||
653 | config KPROBES_SANITY_TEST | 660 | config KPROBES_SANITY_TEST |
654 | bool "Kprobes sanity tests" | 661 | bool "Kprobes sanity tests" |
655 | depends on DEBUG_KERNEL | 662 | depends on DEBUG_KERNEL |
656 | depends on KPROBES | 663 | depends on KPROBES |
657 | default n | 664 | default n |
658 | help | 665 | help |
659 | This option provides for testing basic kprobes functionality on | 666 | This option provides for testing basic kprobes functionality on |
660 | boot. A sample kprobe, jprobe and kretprobe are inserted and | 667 | boot. A sample kprobe, jprobe and kretprobe are inserted and |
661 | verified for functionality. | 668 | verified for functionality. |
662 | 669 | ||
663 | Say N if you are unsure. | 670 | Say N if you are unsure. |
664 | 671 | ||
665 | config BACKTRACE_SELF_TEST | 672 | config BACKTRACE_SELF_TEST |
666 | tristate "Self test for the backtrace code" | 673 | tristate "Self test for the backtrace code" |
667 | depends on DEBUG_KERNEL | 674 | depends on DEBUG_KERNEL |
668 | default n | 675 | default n |
669 | help | 676 | help |
670 | This option provides a kernel module that can be used to test | 677 | This option provides a kernel module that can be used to test |
671 | the kernel stack backtrace code. This option is not useful | 678 | the kernel stack backtrace code. This option is not useful |
672 | for distributions or general kernels, but only for kernel | 679 | for distributions or general kernels, but only for kernel |
673 | developers working on architecture code. | 680 | developers working on architecture code. |
674 | 681 | ||
675 | Note that if you want to also test saved backtraces, you will | 682 | Note that if you want to also test saved backtraces, you will |
676 | have to enable STACKTRACE as well. | 683 | have to enable STACKTRACE as well. |
677 | 684 | ||
678 | Say N if you are unsure. | 685 | Say N if you are unsure. |
679 | 686 | ||
680 | config DEBUG_BLOCK_EXT_DEVT | 687 | config DEBUG_BLOCK_EXT_DEVT |
681 | bool "Force extended block device numbers and spread them" | 688 | bool "Force extended block device numbers and spread them" |
682 | depends on DEBUG_KERNEL | 689 | depends on DEBUG_KERNEL |
683 | depends on BLOCK | 690 | depends on BLOCK |
684 | default n | 691 | default n |
685 | help | 692 | help |
686 | BIG FAT WARNING: ENABLING THIS OPTION MIGHT BREAK BOOTING ON | 693 | BIG FAT WARNING: ENABLING THIS OPTION MIGHT BREAK BOOTING ON |
687 | SOME DISTRIBUTIONS. DO NOT ENABLE THIS UNLESS YOU KNOW WHAT | 694 | SOME DISTRIBUTIONS. DO NOT ENABLE THIS UNLESS YOU KNOW WHAT |
688 | YOU ARE DOING. Distros, please enable this and fix whatever | 695 | YOU ARE DOING. Distros, please enable this and fix whatever |
689 | is broken. | 696 | is broken. |
690 | 697 | ||
691 | Conventionally, block device numbers are allocated from | 698 | Conventionally, block device numbers are allocated from |
692 | predetermined contiguous area. However, extended block area | 699 | predetermined contiguous area. However, extended block area |
693 | may introduce non-contiguous block device numbers. This | 700 | may introduce non-contiguous block device numbers. This |
694 | option forces most block device numbers to be allocated from | 701 | option forces most block device numbers to be allocated from |
695 | the extended space and spreads them to discover kernel or | 702 | the extended space and spreads them to discover kernel or |
696 | userland code paths which assume predetermined contiguous | 703 | userland code paths which assume predetermined contiguous |
697 | device number allocation. | 704 | device number allocation. |
698 | 705 | ||
699 | Note that turning on this debug option shuffles all the | 706 | Note that turning on this debug option shuffles all the |
700 | device numbers for all IDE and SCSI devices including libata | 707 | device numbers for all IDE and SCSI devices including libata |
701 | ones, so root partition specified using device number | 708 | ones, so root partition specified using device number |
702 | directly (via rdev or root=MAJ:MIN) won't work anymore. | 709 | directly (via rdev or root=MAJ:MIN) won't work anymore. |
703 | Textual device names (root=/dev/sdXn) will continue to work. | 710 | Textual device names (root=/dev/sdXn) will continue to work. |
704 | 711 | ||
705 | Say N if you are unsure. | 712 | Say N if you are unsure. |
706 | 713 | ||
707 | config LKDTM | 714 | config LKDTM |
708 | tristate "Linux Kernel Dump Test Tool Module" | 715 | tristate "Linux Kernel Dump Test Tool Module" |
709 | depends on DEBUG_KERNEL | 716 | depends on DEBUG_KERNEL |
710 | depends on KPROBES | 717 | depends on KPROBES |
711 | depends on BLOCK | 718 | depends on BLOCK |
712 | default n | 719 | default n |
713 | help | 720 | help |
714 | This module enables testing of the different dumping mechanisms by | 721 | This module enables testing of the different dumping mechanisms by |
715 | inducing system failures at predefined crash points. | 722 | inducing system failures at predefined crash points. |
716 | If you don't need it: say N | 723 | If you don't need it: say N |
717 | Choose M here to compile this code as a module. The module will be | 724 | Choose M here to compile this code as a module. The module will be |
718 | called lkdtm. | 725 | called lkdtm. |
719 | 726 | ||
720 | Documentation on how to use the module can be found in | 727 | Documentation on how to use the module can be found in |
721 | drivers/misc/lkdtm.c | 728 | drivers/misc/lkdtm.c |
722 | 729 | ||
723 | config FAULT_INJECTION | 730 | config FAULT_INJECTION |
724 | bool "Fault-injection framework" | 731 | bool "Fault-injection framework" |
725 | depends on DEBUG_KERNEL | 732 | depends on DEBUG_KERNEL |
726 | help | 733 | help |
727 | Provide fault-injection framework. | 734 | Provide fault-injection framework. |
728 | For more details, see Documentation/fault-injection/. | 735 | For more details, see Documentation/fault-injection/. |
729 | 736 | ||
730 | config FAILSLAB | 737 | config FAILSLAB |
731 | bool "Fault-injection capability for kmalloc" | 738 | bool "Fault-injection capability for kmalloc" |
732 | depends on FAULT_INJECTION | 739 | depends on FAULT_INJECTION |
733 | depends on SLAB || SLUB | 740 | depends on SLAB || SLUB |
734 | help | 741 | help |
735 | Provide fault-injection capability for kmalloc. | 742 | Provide fault-injection capability for kmalloc. |
736 | 743 | ||
737 | config FAIL_PAGE_ALLOC | 744 | config FAIL_PAGE_ALLOC |
738 | bool "Fault-injection capabilitiy for alloc_pages()" | 745 | bool "Fault-injection capabilitiy for alloc_pages()" |
739 | depends on FAULT_INJECTION | 746 | depends on FAULT_INJECTION |
740 | help | 747 | help |
741 | Provide fault-injection capability for alloc_pages(). | 748 | Provide fault-injection capability for alloc_pages(). |
742 | 749 | ||
743 | config FAIL_MAKE_REQUEST | 750 | config FAIL_MAKE_REQUEST |
744 | bool "Fault-injection capability for disk IO" | 751 | bool "Fault-injection capability for disk IO" |
745 | depends on FAULT_INJECTION && BLOCK | 752 | depends on FAULT_INJECTION && BLOCK |
746 | help | 753 | help |
747 | Provide fault-injection capability for disk IO. | 754 | Provide fault-injection capability for disk IO. |
748 | 755 | ||
749 | config FAIL_IO_TIMEOUT | 756 | config FAIL_IO_TIMEOUT |
750 | bool "Faul-injection capability for faking disk interrupts" | 757 | bool "Faul-injection capability for faking disk interrupts" |
751 | depends on FAULT_INJECTION && BLOCK | 758 | depends on FAULT_INJECTION && BLOCK |
752 | help | 759 | help |
753 | Provide fault-injection capability on end IO handling. This | 760 | Provide fault-injection capability on end IO handling. This |
754 | will make the block layer "forget" an interrupt as configured, | 761 | will make the block layer "forget" an interrupt as configured, |
755 | thus exercising the error handling. | 762 | thus exercising the error handling. |
756 | 763 | ||
757 | Only works with drivers that use the generic timeout handling, | 764 | Only works with drivers that use the generic timeout handling, |
758 | for others it wont do anything. | 765 | for others it wont do anything. |
759 | 766 | ||
760 | config FAULT_INJECTION_DEBUG_FS | 767 | config FAULT_INJECTION_DEBUG_FS |
761 | bool "Debugfs entries for fault-injection capabilities" | 768 | bool "Debugfs entries for fault-injection capabilities" |
762 | depends on FAULT_INJECTION && SYSFS && DEBUG_FS | 769 | depends on FAULT_INJECTION && SYSFS && DEBUG_FS |
763 | help | 770 | help |
764 | Enable configuration of fault-injection capabilities via debugfs. | 771 | Enable configuration of fault-injection capabilities via debugfs. |
765 | 772 | ||
766 | config FAULT_INJECTION_STACKTRACE_FILTER | 773 | config FAULT_INJECTION_STACKTRACE_FILTER |
767 | bool "stacktrace filter for fault-injection capabilities" | 774 | bool "stacktrace filter for fault-injection capabilities" |
768 | depends on FAULT_INJECTION_DEBUG_FS && STACKTRACE_SUPPORT | 775 | depends on FAULT_INJECTION_DEBUG_FS && STACKTRACE_SUPPORT |
769 | depends on !X86_64 | 776 | depends on !X86_64 |
770 | select STACKTRACE | 777 | select STACKTRACE |
771 | select FRAME_POINTER if !PPC | 778 | select FRAME_POINTER if !PPC |
772 | help | 779 | help |
773 | Provide stacktrace filter for fault-injection capabilities | 780 | Provide stacktrace filter for fault-injection capabilities |
774 | 781 | ||
775 | config LATENCYTOP | 782 | config LATENCYTOP |
776 | bool "Latency measuring infrastructure" | 783 | bool "Latency measuring infrastructure" |
777 | select FRAME_POINTER if !MIPS && !PPC | 784 | select FRAME_POINTER if !MIPS && !PPC |
778 | select KALLSYMS | 785 | select KALLSYMS |
779 | select KALLSYMS_ALL | 786 | select KALLSYMS_ALL |
780 | select STACKTRACE | 787 | select STACKTRACE |
781 | select SCHEDSTATS | 788 | select SCHEDSTATS |
782 | select SCHED_DEBUG | 789 | select SCHED_DEBUG |
783 | depends on HAVE_LATENCYTOP_SUPPORT | 790 | depends on HAVE_LATENCYTOP_SUPPORT |
784 | help | 791 | help |
785 | Enable this option if you want to use the LatencyTOP tool | 792 | Enable this option if you want to use the LatencyTOP tool |
786 | to find out which userspace is blocking on what kernel operations. | 793 | to find out which userspace is blocking on what kernel operations. |
787 | 794 | ||
788 | config SYSCTL_SYSCALL_CHECK | 795 | config SYSCTL_SYSCALL_CHECK |
789 | bool "Sysctl checks" | 796 | bool "Sysctl checks" |
790 | depends on SYSCTL_SYSCALL | 797 | depends on SYSCTL_SYSCALL |
791 | ---help--- | 798 | ---help--- |
792 | sys_sysctl uses binary paths that have been found challenging | 799 | sys_sysctl uses binary paths that have been found challenging |
793 | to properly maintain and use. This enables checks that help | 800 | to properly maintain and use. This enables checks that help |
794 | you to keep things correct. | 801 | you to keep things correct. |
795 | 802 | ||
796 | source kernel/trace/Kconfig | 803 | source kernel/trace/Kconfig |
797 | 804 | ||
798 | config PROVIDE_OHCI1394_DMA_INIT | 805 | config PROVIDE_OHCI1394_DMA_INIT |
799 | bool "Remote debugging over FireWire early on boot" | 806 | bool "Remote debugging over FireWire early on boot" |
800 | depends on PCI && X86 | 807 | depends on PCI && X86 |
801 | help | 808 | help |
802 | If you want to debug problems which hang or crash the kernel early | 809 | If you want to debug problems which hang or crash the kernel early |
803 | on boot and the crashing machine has a FireWire port, you can use | 810 | on boot and the crashing machine has a FireWire port, you can use |
804 | this feature to remotely access the memory of the crashed machine | 811 | this feature to remotely access the memory of the crashed machine |
805 | over FireWire. This employs remote DMA as part of the OHCI1394 | 812 | over FireWire. This employs remote DMA as part of the OHCI1394 |
806 | specification which is now the standard for FireWire controllers. | 813 | specification which is now the standard for FireWire controllers. |
807 | 814 | ||
808 | With remote DMA, you can monitor the printk buffer remotely using | 815 | With remote DMA, you can monitor the printk buffer remotely using |
809 | firescope and access all memory below 4GB using fireproxy from gdb. | 816 | firescope and access all memory below 4GB using fireproxy from gdb. |
810 | Even controlling a kernel debugger is possible using remote DMA. | 817 | Even controlling a kernel debugger is possible using remote DMA. |
811 | 818 | ||
812 | Usage: | 819 | Usage: |
813 | 820 | ||
814 | If ohci1394_dma=early is used as boot parameter, it will initialize | 821 | If ohci1394_dma=early is used as boot parameter, it will initialize |
815 | all OHCI1394 controllers which are found in the PCI config space. | 822 | all OHCI1394 controllers which are found in the PCI config space. |
816 | 823 | ||
817 | As all changes to the FireWire bus such as enabling and disabling | 824 | As all changes to the FireWire bus such as enabling and disabling |
818 | devices cause a bus reset and thereby disable remote DMA for all | 825 | devices cause a bus reset and thereby disable remote DMA for all |
819 | devices, be sure to have the cable plugged and FireWire enabled on | 826 | devices, be sure to have the cable plugged and FireWire enabled on |
820 | the debugging host before booting the debug target for debugging. | 827 | the debugging host before booting the debug target for debugging. |
821 | 828 | ||
822 | This code (~1k) is freed after boot. By then, the firewire stack | 829 | This code (~1k) is freed after boot. By then, the firewire stack |
823 | in charge of the OHCI-1394 controllers should be used instead. | 830 | in charge of the OHCI-1394 controllers should be used instead. |
824 | 831 | ||
825 | See Documentation/debugging-via-ohci1394.txt for more information. | 832 | See Documentation/debugging-via-ohci1394.txt for more information. |
826 | 833 | ||
827 | config FIREWIRE_OHCI_REMOTE_DMA | 834 | config FIREWIRE_OHCI_REMOTE_DMA |
828 | bool "Remote debugging over FireWire with firewire-ohci" | 835 | bool "Remote debugging over FireWire with firewire-ohci" |
829 | depends on FIREWIRE_OHCI | 836 | depends on FIREWIRE_OHCI |
830 | help | 837 | help |
831 | This option lets you use the FireWire bus for remote debugging | 838 | This option lets you use the FireWire bus for remote debugging |
832 | with help of the firewire-ohci driver. It enables unfiltered | 839 | with help of the firewire-ohci driver. It enables unfiltered |
833 | remote DMA in firewire-ohci. | 840 | remote DMA in firewire-ohci. |
834 | See Documentation/debugging-via-ohci1394.txt for more information. | 841 | See Documentation/debugging-via-ohci1394.txt for more information. |
835 | 842 | ||
836 | If unsure, say N. | 843 | If unsure, say N. |
837 | 844 | ||
838 | menuconfig BUILD_DOCSRC | 845 | menuconfig BUILD_DOCSRC |
839 | bool "Build targets in Documentation/ tree" | 846 | bool "Build targets in Documentation/ tree" |
840 | depends on HEADERS_CHECK | 847 | depends on HEADERS_CHECK |
841 | help | 848 | help |
842 | This option attempts to build objects from the source files in the | 849 | This option attempts to build objects from the source files in the |
843 | kernel Documentation/ tree. | 850 | kernel Documentation/ tree. |
844 | 851 | ||
845 | Say N if you are unsure. | 852 | Say N if you are unsure. |
846 | 853 | ||
847 | config DYNAMIC_PRINTK_DEBUG | 854 | config DYNAMIC_PRINTK_DEBUG |
848 | bool "Enable dynamic printk() call support" | 855 | bool "Enable dynamic printk() call support" |
849 | default n | 856 | default n |
850 | depends on PRINTK | 857 | depends on PRINTK |
851 | select PRINTK_DEBUG | 858 | select PRINTK_DEBUG |
852 | help | 859 | help |
853 | 860 | ||
854 | Compiles debug level messages into the kernel, which would not | 861 | Compiles debug level messages into the kernel, which would not |
855 | otherwise be available at runtime. These messages can then be | 862 | otherwise be available at runtime. These messages can then be |
856 | enabled/disabled on a per module basis. This mechanism implicitly | 863 | enabled/disabled on a per module basis. This mechanism implicitly |
857 | enables all pr_debug() and dev_dbg() calls. The impact of this | 864 | enables all pr_debug() and dev_dbg() calls. The impact of this |
858 | compile option is a larger kernel text size of about 2%. | 865 | compile option is a larger kernel text size of about 2%. |
859 | 866 | ||
860 | Usage: | 867 | Usage: |
861 | 868 | ||
862 | Dynamic debugging is controlled by the debugfs file, | 869 | Dynamic debugging is controlled by the debugfs file, |
863 | dynamic_printk/modules. This file contains a list of the modules that | 870 | dynamic_printk/modules. This file contains a list of the modules that |
864 | can be enabled. The format of the file is the module name, followed | 871 | can be enabled. The format of the file is the module name, followed |
865 | by a set of flags that can be enabled. The first flag is always the | 872 | by a set of flags that can be enabled. The first flag is always the |
866 | 'enabled' flag. For example: | 873 | 'enabled' flag. For example: |
867 | 874 | ||
868 | <module_name> <enabled=0/1> | 875 | <module_name> <enabled=0/1> |
869 | . | 876 | . |
870 | . | 877 | . |
871 | . | 878 | . |
872 | 879 | ||
873 | <module_name> : Name of the module in which the debug call resides | 880 | <module_name> : Name of the module in which the debug call resides |
874 | <enabled=0/1> : whether the messages are enabled or not | 881 | <enabled=0/1> : whether the messages are enabled or not |
875 | 882 | ||
876 | From a live system: | 883 | From a live system: |
877 | 884 | ||
878 | snd_hda_intel enabled=0 | 885 | snd_hda_intel enabled=0 |
879 | fixup enabled=0 | 886 | fixup enabled=0 |
880 | driver enabled=0 | 887 | driver enabled=0 |
881 | 888 | ||
882 | Enable a module: | 889 | Enable a module: |
883 | 890 | ||
884 | $echo "set enabled=1 <module_name>" > dynamic_printk/modules | 891 | $echo "set enabled=1 <module_name>" > dynamic_printk/modules |
885 | 892 | ||
886 | Disable a module: | 893 | Disable a module: |
887 | 894 | ||
888 | $echo "set enabled=0 <module_name>" > dynamic_printk/modules | 895 | $echo "set enabled=0 <module_name>" > dynamic_printk/modules |
889 | 896 | ||
890 | Enable all modules: | 897 | Enable all modules: |
891 | 898 | ||
892 | $echo "set enabled=1 all" > dynamic_printk/modules | 899 | $echo "set enabled=1 all" > dynamic_printk/modules |
893 | 900 | ||
894 | Disable all modules: | 901 | Disable all modules: |
895 | 902 | ||
896 | $echo "set enabled=0 all" > dynamic_printk/modules | 903 | $echo "set enabled=0 all" > dynamic_printk/modules |
897 | 904 | ||
898 | Finally, passing "dynamic_printk" at the command line enables | 905 | Finally, passing "dynamic_printk" at the command line enables |
899 | debugging for all modules. This mode can be turned off via the above | 906 | debugging for all modules. This mode can be turned off via the above |
900 | disable command. | 907 | disable command. |
901 | 908 | ||
902 | source "samples/Kconfig" | 909 | source "samples/Kconfig" |
903 | 910 | ||
904 | source "lib/Kconfig.kgdb" | 911 | source "lib/Kconfig.kgdb" |
905 | 912 |
mm/mmap.c
1 | /* | 1 | /* |
2 | * mm/mmap.c | 2 | * mm/mmap.c |
3 | * | 3 | * |
4 | * Written by obz. | 4 | * Written by obz. |
5 | * | 5 | * |
6 | * Address space accounting code <alan@lxorguk.ukuu.org.uk> | 6 | * Address space accounting code <alan@lxorguk.ukuu.org.uk> |
7 | */ | 7 | */ |
8 | 8 | ||
9 | #include <linux/slab.h> | 9 | #include <linux/slab.h> |
10 | #include <linux/backing-dev.h> | 10 | #include <linux/backing-dev.h> |
11 | #include <linux/mm.h> | 11 | #include <linux/mm.h> |
12 | #include <linux/shm.h> | 12 | #include <linux/shm.h> |
13 | #include <linux/mman.h> | 13 | #include <linux/mman.h> |
14 | #include <linux/pagemap.h> | 14 | #include <linux/pagemap.h> |
15 | #include <linux/swap.h> | 15 | #include <linux/swap.h> |
16 | #include <linux/syscalls.h> | 16 | #include <linux/syscalls.h> |
17 | #include <linux/capability.h> | 17 | #include <linux/capability.h> |
18 | #include <linux/init.h> | 18 | #include <linux/init.h> |
19 | #include <linux/file.h> | 19 | #include <linux/file.h> |
20 | #include <linux/fs.h> | 20 | #include <linux/fs.h> |
21 | #include <linux/personality.h> | 21 | #include <linux/personality.h> |
22 | #include <linux/security.h> | 22 | #include <linux/security.h> |
23 | #include <linux/hugetlb.h> | 23 | #include <linux/hugetlb.h> |
24 | #include <linux/profile.h> | 24 | #include <linux/profile.h> |
25 | #include <linux/module.h> | 25 | #include <linux/module.h> |
26 | #include <linux/mount.h> | 26 | #include <linux/mount.h> |
27 | #include <linux/mempolicy.h> | 27 | #include <linux/mempolicy.h> |
28 | #include <linux/rmap.h> | 28 | #include <linux/rmap.h> |
29 | #include <linux/mmu_notifier.h> | 29 | #include <linux/mmu_notifier.h> |
30 | 30 | ||
31 | #include <asm/uaccess.h> | 31 | #include <asm/uaccess.h> |
32 | #include <asm/cacheflush.h> | 32 | #include <asm/cacheflush.h> |
33 | #include <asm/tlb.h> | 33 | #include <asm/tlb.h> |
34 | #include <asm/mmu_context.h> | 34 | #include <asm/mmu_context.h> |
35 | 35 | ||
36 | #include "internal.h" | 36 | #include "internal.h" |
37 | 37 | ||
/*
 * Per-architecture hooks: fall back to no-ops for architectures that do
 * not provide their own versions.
 */
#ifndef arch_mmap_check
#define arch_mmap_check(addr, len, flags)	(0)
#endif

#ifndef arch_rebalance_pgtables
#define arch_rebalance_pgtables(addr, len)		(addr)
#endif

/* Forward declaration: defined later in this file. */
static void unmap_region(struct mm_struct *mm,
		struct vm_area_struct *vma, struct vm_area_struct *prev,
		unsigned long start, unsigned long end);

/*
 * WARNING: the debugging will use recursive algorithms so never enable this
 * unless you know what you are doing.
 */
#undef DEBUG_MM_RB
55 | 55 | ||
/* description of effects of mapping type and prot in current implementation.
 * this is due to the limited x86 page protection hardware.  The expected
 * behavior is in parens:
 *
 * map_type	prot
 *		PROT_NONE	PROT_READ	PROT_WRITE	PROT_EXEC
 * MAP_SHARED	r: (no) no	r: (yes) yes	r: (no) yes	r: (no) yes
 *		w: (no) no	w: (no) no	w: (yes) yes	w: (no) no
 *		x: (no) no	x: (no) yes	x: (no) yes	x: (yes) yes
 *
 * MAP_PRIVATE	r: (no) no	r: (yes) yes	r: (no) yes	r: (no) yes
 *		w: (no) no	w: (no) no	w: (copy) copy	w: (no) no
 *		x: (no) no	x: (no) yes	x: (no) yes	x: (yes) yes
 *
 */
/* Indexed by the four protection-relevant vma flag bits
 * (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED); see vm_get_page_prot() below. */
pgprot_t protection_map[16] = {
	__P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111,
	__S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111
};
75 | 75 | ||
76 | pgprot_t vm_get_page_prot(unsigned long vm_flags) | 76 | pgprot_t vm_get_page_prot(unsigned long vm_flags) |
77 | { | 77 | { |
78 | return __pgprot(pgprot_val(protection_map[vm_flags & | 78 | return __pgprot(pgprot_val(protection_map[vm_flags & |
79 | (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]) | | 79 | (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]) | |
80 | pgprot_val(arch_vm_get_page_prot(vm_flags))); | 80 | pgprot_val(arch_vm_get_page_prot(vm_flags))); |
81 | } | 81 | } |
82 | EXPORT_SYMBOL(vm_get_page_prot); | 82 | EXPORT_SYMBOL(vm_get_page_prot); |
83 | 83 | ||
/* Overcommit policy knobs; consumed by __vm_enough_memory() below and set
 * via the vm.overcommit_memory / vm.overcommit_ratio sysctls. */
int sysctl_overcommit_memory = OVERCOMMIT_GUESS;  /* heuristic overcommit */
int sysctl_overcommit_ratio = 50;	/* default is 50% */
/* Upper bound on the number of VMAs a single mm may hold. */
int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
/* System-wide committed address space, in pages; can go negative. */
atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0);
88 | 88 | ||
/*
 * Check that a process has enough memory to allocate a new virtual
 * mapping. 0 means there is enough memory for the allocation to
 * succeed and -ENOMEM implies there is not.
 *
 * We currently support three overcommit policies, which are set via the
 * vm.overcommit_memory sysctl.  See Documentation/vm/overcommit-accounting
 *
 * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
 * Additional code 2002 Jul 20 by Robert Love.
 *
 * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
 *
 * Note this is a helper function intended to be used by LSMs which
 * wish to use this logic.
 *
 * Side effect: @pages are charged to vm_committed_space up front and are
 * uncharged again at the error label before returning -ENOMEM.
 */
int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
{
	unsigned long free, allowed;

	vm_acct_memory(pages);

	/*
	 * Sometimes we want to use more memory than we have
	 */
	if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
		return 0;

	if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
		unsigned long n;

		/* Cheap estimate first: page cache plus free swap. */
		free = global_page_state(NR_FILE_PAGES);
		free += nr_swap_pages;

		/*
		 * Any slabs which are created with the
		 * SLAB_RECLAIM_ACCOUNT flag claim to have contents
		 * which are reclaimable, under pressure.  The dentry
		 * cache and most inode caches should fall into this
		 */
		free += global_page_state(NR_SLAB_RECLAIMABLE);

		/*
		 * Leave the last 3% for root
		 */
		if (!cap_sys_admin)
			free -= free / 32;

		if (free > pages)
			return 0;

		/*
		 * nr_free_pages() is very expensive on large systems,
		 * only call if we're about to fail.
		 */
		n = nr_free_pages();

		/*
		 * Leave reserved pages. The pages are not for anonymous pages.
		 */
		if (n <= totalreserve_pages)
			goto error;
		else
			n -= totalreserve_pages;

		/*
		 * Leave the last 3% for root
		 */
		if (!cap_sys_admin)
			n -= n / 32;
		free += n;

		if (free > pages)
			return 0;

		goto error;
	}

	/* Strict overcommit: the limit is ratio% of RAM (minus hugetlb,
	 * which is accounted separately) plus all of swap. */
	allowed = (totalram_pages - hugetlb_total_pages())
	       	* sysctl_overcommit_ratio / 100;
	/*
	 * Leave the last 3% for root
	 */
	if (!cap_sys_admin)
		allowed -= allowed / 32;
	allowed += total_swap_pages;

	/* Don't let a single process grow too big:
	   leave 3% of the size of this process for other processes */
	if (mm)
		allowed -= mm->total_vm / 32;

	/*
	 * cast `allowed' as a signed long because vm_committed_space
	 * sometimes has a negative value
	 */
	if (atomic_long_read(&vm_committed_space) < (long)allowed)
		return 0;
error:
	vm_unacct_memory(pages);

	return -ENOMEM;
}
192 | 192 | ||
/*
 * Detach @vma from @mapping's shared-mapping bookkeeping:
 *  - give back the i_writecount taken for a VM_DENYWRITE mapping,
 *  - drop the writable-mapping count for a VM_SHARED mapping,
 *  - unlink it from the nonlinear list or the i_mmap prio_tree.
 *
 * Requires inode->i_mapping->i_mmap_lock
 */
static void __remove_shared_vm_struct(struct vm_area_struct *vma,
		struct file *file, struct address_space *mapping)
{
	if (vma->vm_flags & VM_DENYWRITE)
		atomic_inc(&file->f_path.dentry->d_inode->i_writecount);
	if (vma->vm_flags & VM_SHARED)
		mapping->i_mmap_writable--;

	/* Hold the dcache-flush mmap lock across the tree/list update. */
	flush_dcache_mmap_lock(mapping);
	if (unlikely(vma->vm_flags & VM_NONLINEAR))
		list_del_init(&vma->shared.vm_set.list);
	else
		vma_prio_tree_remove(vma, &mapping->i_mmap);
	flush_dcache_mmap_unlock(mapping);
}
211 | 211 | ||
212 | /* | 212 | /* |
213 | * Unlink a file-based vm structure from its prio_tree, to hide | 213 | * Unlink a file-based vm structure from its prio_tree, to hide |
214 | * vma from rmap and vmtruncate before freeing its page tables. | 214 | * vma from rmap and vmtruncate before freeing its page tables. |
215 | */ | 215 | */ |
216 | void unlink_file_vma(struct vm_area_struct *vma) | 216 | void unlink_file_vma(struct vm_area_struct *vma) |
217 | { | 217 | { |
218 | struct file *file = vma->vm_file; | 218 | struct file *file = vma->vm_file; |
219 | 219 | ||
220 | if (file) { | 220 | if (file) { |
221 | struct address_space *mapping = file->f_mapping; | 221 | struct address_space *mapping = file->f_mapping; |
222 | spin_lock(&mapping->i_mmap_lock); | 222 | spin_lock(&mapping->i_mmap_lock); |
223 | __remove_shared_vm_struct(vma, file, mapping); | 223 | __remove_shared_vm_struct(vma, file, mapping); |
224 | spin_unlock(&mapping->i_mmap_lock); | 224 | spin_unlock(&mapping->i_mmap_lock); |
225 | } | 225 | } |
226 | } | 226 | } |
227 | 227 | ||
/*
 * Close a vm structure and free it, returning the next.
 *
 * May sleep (might_sleep() below), so callers must not hold spinlocks.
 * Teardown order: driver/fs ->close first, then drop the file reference,
 * then the NUMA policy, then the vma itself.
 */
static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
{
	struct vm_area_struct *next = vma->vm_next;

	might_sleep();
	if (vma->vm_ops && vma->vm_ops->close)
		vma->vm_ops->close(vma);
	if (vma->vm_file) {
		fput(vma->vm_file);
		/* drop the mm's exe-file reference for executable mappings */
		if (vma->vm_flags & VM_EXECUTABLE)
			removed_exe_file_vma(vma->vm_mm);
	}
	mpol_put(vma_policy(vma));	/* release mempolicy reference */
	kmem_cache_free(vm_area_cachep, vma);
	return next;
}
247 | 247 | ||
/*
 * brk(2): resize the process data segment.
 *
 * Always returns the mm's resulting brk value - on any failure the old
 * value is returned unchanged, so callers detect failure by comparing the
 * result against what they asked for.  Serialized under mm->mmap_sem held
 * for writing.
 */
asmlinkage unsigned long sys_brk(unsigned long brk)
{
	unsigned long rlim, retval;
	unsigned long newbrk, oldbrk;
	struct mm_struct *mm = current->mm;
	unsigned long min_brk;

	down_write(&mm->mmap_sem);

#ifdef CONFIG_COMPAT_BRK
	/* compat mode lets the brk go as low as end_code, not start_brk */
	min_brk = mm->end_code;
#else
	min_brk = mm->start_brk;
#endif
	if (brk < min_brk)
		goto out;

	/*
	 * Check against rlimit here. If this check is done later after the test
	 * of oldbrk with newbrk then it can escape the test and let the data
	 * segment grow beyond its set limit the in case where the limit is
	 * not page aligned -Ram Gupta
	 */
	rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur;
	if (rlim < RLIM_INFINITY && (brk - mm->start_brk) +
			(mm->end_data - mm->start_data) > rlim)
		goto out;

	newbrk = PAGE_ALIGN(brk);
	oldbrk = PAGE_ALIGN(mm->brk);
	if (oldbrk == newbrk)
		goto set_brk;	/* same page: just record the new value */

	/* Always allow shrinking brk. */
	if (brk <= mm->brk) {
		if (!do_munmap(mm, newbrk, oldbrk-newbrk))
			goto set_brk;
		goto out;
	}

	/* Check against existing mmap mappings. */
	if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE))
		goto out;

	/* Ok, looks good - let it rip. */
	if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk)
		goto out;
set_brk:
	mm->brk = brk;
out:
	retval = mm->brk;
	up_write(&mm->mmap_sem);
	return retval;
}
302 | 302 | ||
#ifdef DEBUG_MM_RB
/*
 * In-order walk of the mm's vma rbtree with consistency checks: vm_start
 * must never go backwards, must not fall inside the previous vma's range,
 * and each vma must have vm_start <= vm_end.  Returns the forward node
 * count; returns -1 if ordering was violated, or 0 if the forward and
 * backward walks disagree.  (Note the comma-operator printk idiom used to
 * flag errors while keeping the walk going.)
 */
static int browse_rb(struct rb_root *root)
{
	int i = 0, j;
	struct rb_node *nd, *pn = NULL;
	unsigned long prev = 0, pend = 0;

	for (nd = rb_first(root); nd; nd = rb_next(nd)) {
		struct vm_area_struct *vma;
		vma = rb_entry(nd, struct vm_area_struct, vm_rb);
		if (vma->vm_start < prev)
			printk("vm_start %lx prev %lx\n", vma->vm_start, prev), i = -1;
		if (vma->vm_start < pend)
			printk("vm_start %lx pend %lx\n", vma->vm_start, pend);
		if (vma->vm_start > vma->vm_end)
			printk("vm_end %lx < vm_start %lx\n", vma->vm_end, vma->vm_start);
		i++;
		pn = nd;
		prev = vma->vm_start;
		pend = vma->vm_end;
	}
	/* Walk back from the last node and compare the counts. */
	j = 0;
	for (nd = pn; nd; nd = rb_prev(nd)) {
		j++;
	}
	if (i != j)
		printk("backwards %d, forwards %d\n", j, i), i = 0;
	return i;
}

/*
 * Cross-check the mm's linear vma list and its rbtree against
 * mm->map_count; BUG() on any mismatch.
 */
void validate_mm(struct mm_struct *mm)
{
	int bug = 0;
	int i = 0;
	struct vm_area_struct *tmp = mm->mmap;
	while (tmp) {
		tmp = tmp->vm_next;
		i++;
	}
	if (i != mm->map_count)
		printk("map_count %d vm_next %d\n", mm->map_count, i), bug = 1;
	i = browse_rb(&mm->mm_rb);
	if (i != mm->map_count)
		printk("map_count %d rb %d\n", mm->map_count, i), bug = 1;
	BUG_ON(bug);
}
#else
#define validate_mm(mm) do { } while (0)
#endif
352 | 352 | ||
/*
 * Find where a new vma covering @addr would be inserted.
 *
 * Returns the first vma whose vm_end lies beyond @addr (NULL if none - the
 * returned vma may or may not actually contain @addr), and fills in:
 *   *pprev	- the closest preceding vma that ends at or before @addr,
 *		  or NULL if there is none,
 *   *rb_link	- the empty rbtree slot where the new node would be linked,
 *   *rb_parent	- the would-be parent of that slot.
 * The rb_link/rb_parent values are only meaningful for insertion when no
 * existing vma overlaps @addr.
 */
static struct vm_area_struct *
find_vma_prepare(struct mm_struct *mm, unsigned long addr,
		struct vm_area_struct **pprev, struct rb_node ***rb_link,
		struct rb_node ** rb_parent)
{
	struct vm_area_struct * vma;
	struct rb_node ** __rb_link, * __rb_parent, * rb_prev;

	__rb_link = &mm->mm_rb.rb_node;
	rb_prev = __rb_parent = NULL;
	vma = NULL;

	while (*__rb_link) {
		struct vm_area_struct *vma_tmp;

		__rb_parent = *__rb_link;
		vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb);

		if (vma_tmp->vm_end > addr) {
			vma = vma_tmp;
			/* stop early if this vma already covers addr */
			if (vma_tmp->vm_start <= addr)
				break;
			__rb_link = &__rb_parent->rb_left;
		} else {
			/* vma_tmp ends before addr: best predecessor so far */
			rb_prev = __rb_parent;
			__rb_link = &__rb_parent->rb_right;
		}
	}

	*pprev = NULL;
	if (rb_prev)
		*pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
	*rb_link = __rb_link;
	*rb_parent = __rb_parent;
	return vma;
}
389 | 389 | ||
390 | static inline void | 390 | static inline void |
391 | __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, | 391 | __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, |
392 | struct vm_area_struct *prev, struct rb_node *rb_parent) | 392 | struct vm_area_struct *prev, struct rb_node *rb_parent) |
393 | { | 393 | { |
394 | if (prev) { | 394 | if (prev) { |
395 | vma->vm_next = prev->vm_next; | 395 | vma->vm_next = prev->vm_next; |
396 | prev->vm_next = vma; | 396 | prev->vm_next = vma; |
397 | } else { | 397 | } else { |
398 | mm->mmap = vma; | 398 | mm->mmap = vma; |
399 | if (rb_parent) | 399 | if (rb_parent) |
400 | vma->vm_next = rb_entry(rb_parent, | 400 | vma->vm_next = rb_entry(rb_parent, |
401 | struct vm_area_struct, vm_rb); | 401 | struct vm_area_struct, vm_rb); |
402 | else | 402 | else |
403 | vma->vm_next = NULL; | 403 | vma->vm_next = NULL; |
404 | } | 404 | } |
405 | } | 405 | } |
406 | 406 | ||
/*
 * Insert @vma into the mm's rbtree at the slot found by find_vma_prepare():
 * link the node into place first, then rebalance/recolour the tree.
 */
void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
		struct rb_node **rb_link, struct rb_node *rb_parent)
{
	rb_link_node(&vma->vm_rb, rb_parent, rb_link);
	rb_insert_color(&vma->vm_rb, &mm->mm_rb);
}
413 | 413 | ||
/*
 * Attach a file-backed @vma to its address_space: take the writecount for
 * VM_DENYWRITE mappings, bump the writable-mapping count for VM_SHARED,
 * and insert into the nonlinear list or the i_mmap prio_tree.  No-op for
 * anonymous vmas.  Mirror of __remove_shared_vm_struct(); the caller is
 * expected to hold the mapping's i_mmap_lock (see vma_link()).
 */
static void __vma_link_file(struct vm_area_struct *vma)
{
	struct file *file;

	file = vma->vm_file;
	if (file) {
		struct address_space *mapping = file->f_mapping;

		if (vma->vm_flags & VM_DENYWRITE)
			atomic_dec(&file->f_path.dentry->d_inode->i_writecount);
		if (vma->vm_flags & VM_SHARED)
			mapping->i_mmap_writable++;

		/* Hold the dcache-flush mmap lock across the tree update. */
		flush_dcache_mmap_lock(mapping);
		if (unlikely(vma->vm_flags & VM_NONLINEAR))
			vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
		else
			vma_prio_tree_insert(vma, &mapping->i_mmap);
		flush_dcache_mmap_unlock(mapping);
	}
}
435 | 435 | ||
/*
 * Link @vma into the mm's list, rbtree and anon_vma.  Lock-free helper:
 * callers (vma_link(), __insert_vm_struct()) are responsible for any
 * locking and for updating mm->map_count and the file mapping.
 */
static void
__vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
	struct vm_area_struct *prev, struct rb_node **rb_link,
	struct rb_node *rb_parent)
{
	__vma_link_list(mm, vma, prev, rb_parent);
	__vma_link_rb(mm, vma, rb_link, rb_parent);
	__anon_vma_link(vma);
}
445 | 445 | ||
/*
 * Fully link @vma into @mm: list, rbtree, anon_vma and - for file
 * mappings - the address_space, with i_mmap_lock and the anon_vma lock
 * held across the update (i_mmap_lock taken first, released last).  Also
 * snapshots the mapping's truncate_count into the vma and bumps
 * mm->map_count.
 */
static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
			struct vm_area_struct *prev, struct rb_node **rb_link,
			struct rb_node *rb_parent)
{
	struct address_space *mapping = NULL;

	if (vma->vm_file)
		mapping = vma->vm_file->f_mapping;

	if (mapping) {
		spin_lock(&mapping->i_mmap_lock);
		vma->vm_truncate_count = mapping->truncate_count;
	}
	anon_vma_lock(vma);

	__vma_link(mm, vma, prev, rb_link, rb_parent);
	__vma_link_file(vma);

	anon_vma_unlock(vma);
	if (mapping)
		spin_unlock(&mapping->i_mmap_lock);

	mm->map_count++;
	validate_mm(mm);
}
471 | 471 | ||
/*
 * Helper for vma_adjust in the split_vma insert case:
 * insert vm structure into list and rbtree and anon_vma,
 * but it has already been inserted into prio_tree earlier.
 */
static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
{
	struct vm_area_struct *__vma, *prev;
	struct rb_node **rb_link, *rb_parent;

	__vma = find_vma_prepare(mm, vma->vm_start,&prev, &rb_link, &rb_parent);
	/* the slot must be free: any overlap here is a kernel bug */
	BUG_ON(__vma && __vma->vm_start < vma->vm_end);
	__vma_link(mm, vma, prev, rb_link, rb_parent);
	mm->map_count++;
}
487 | 487 | ||
/*
 * Remove @vma from the mm's linked list and rbtree.  @prev must be the vma
 * immediately preceding it on the list.  The mmap_cache find_vma() hint is
 * repointed at @prev if it referenced the removed vma.  File-mapping and
 * anon_vma linkage are not touched here.
 */
static inline void
__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
		struct vm_area_struct *prev)
{
	prev->vm_next = vma->vm_next;
	rb_erase(&vma->vm_rb, &mm->mm_rb);
	if (mm->mmap_cache == vma)
		mm->mmap_cache = prev;
}
497 | 497 | ||
498 | /* | 498 | /* |
499 | * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that | 499 | * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that |
500 | * is already present in an i_mmap tree without adjusting the tree. | 500 | * is already present in an i_mmap tree without adjusting the tree. |
501 | * The following helper function should be used when such adjustments | 501 | * The following helper function should be used when such adjustments |
502 | * are necessary. The "insert" vma (if any) is to be inserted | 502 | * are necessary. The "insert" vma (if any) is to be inserted |
503 | * before we drop the necessary locks. | 503 | * before we drop the necessary locks. |
504 | */ | 504 | */ |
505 | void vma_adjust(struct vm_area_struct *vma, unsigned long start, | 505 | void vma_adjust(struct vm_area_struct *vma, unsigned long start, |
506 | unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert) | 506 | unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert) |
507 | { | 507 | { |
508 | struct mm_struct *mm = vma->vm_mm; | 508 | struct mm_struct *mm = vma->vm_mm; |
509 | struct vm_area_struct *next = vma->vm_next; | 509 | struct vm_area_struct *next = vma->vm_next; |
510 | struct vm_area_struct *importer = NULL; | 510 | struct vm_area_struct *importer = NULL; |
511 | struct address_space *mapping = NULL; | 511 | struct address_space *mapping = NULL; |
512 | struct prio_tree_root *root = NULL; | 512 | struct prio_tree_root *root = NULL; |
513 | struct file *file = vma->vm_file; | 513 | struct file *file = vma->vm_file; |
514 | struct anon_vma *anon_vma = NULL; | 514 | struct anon_vma *anon_vma = NULL; |
515 | long adjust_next = 0; | 515 | long adjust_next = 0; |
516 | int remove_next = 0; | 516 | int remove_next = 0; |
517 | 517 | ||
518 | if (next && !insert) { | 518 | if (next && !insert) { |
519 | if (end >= next->vm_end) { | 519 | if (end >= next->vm_end) { |
520 | /* | 520 | /* |
521 | * vma expands, overlapping all the next, and | 521 | * vma expands, overlapping all the next, and |
522 | * perhaps the one after too (mprotect case 6). | 522 | * perhaps the one after too (mprotect case 6). |
523 | */ | 523 | */ |
524 | again: remove_next = 1 + (end > next->vm_end); | 524 | again: remove_next = 1 + (end > next->vm_end); |
525 | end = next->vm_end; | 525 | end = next->vm_end; |
526 | anon_vma = next->anon_vma; | 526 | anon_vma = next->anon_vma; |
527 | importer = vma; | 527 | importer = vma; |
528 | } else if (end > next->vm_start) { | 528 | } else if (end > next->vm_start) { |
529 | /* | 529 | /* |
530 | * vma expands, overlapping part of the next: | 530 | * vma expands, overlapping part of the next: |
531 | * mprotect case 5 shifting the boundary up. | 531 | * mprotect case 5 shifting the boundary up. |
532 | */ | 532 | */ |
533 | adjust_next = (end - next->vm_start) >> PAGE_SHIFT; | 533 | adjust_next = (end - next->vm_start) >> PAGE_SHIFT; |
534 | anon_vma = next->anon_vma; | 534 | anon_vma = next->anon_vma; |
535 | importer = vma; | 535 | importer = vma; |
536 | } else if (end < vma->vm_end) { | 536 | } else if (end < vma->vm_end) { |
537 | /* | 537 | /* |
538 | * vma shrinks, and !insert tells it's not | 538 | * vma shrinks, and !insert tells it's not |
539 | * split_vma inserting another: so it must be | 539 | * split_vma inserting another: so it must be |
540 | * mprotect case 4 shifting the boundary down. | 540 | * mprotect case 4 shifting the boundary down. |
541 | */ | 541 | */ |
542 | adjust_next = - ((vma->vm_end - end) >> PAGE_SHIFT); | 542 | adjust_next = - ((vma->vm_end - end) >> PAGE_SHIFT); |
543 | anon_vma = next->anon_vma; | 543 | anon_vma = next->anon_vma; |
544 | importer = next; | 544 | importer = next; |
545 | } | 545 | } |
546 | } | 546 | } |
547 | 547 | ||
548 | if (file) { | 548 | if (file) { |
549 | mapping = file->f_mapping; | 549 | mapping = file->f_mapping; |
550 | if (!(vma->vm_flags & VM_NONLINEAR)) | 550 | if (!(vma->vm_flags & VM_NONLINEAR)) |
551 | root = &mapping->i_mmap; | 551 | root = &mapping->i_mmap; |
552 | spin_lock(&mapping->i_mmap_lock); | 552 | spin_lock(&mapping->i_mmap_lock); |
553 | if (importer && | 553 | if (importer && |
554 | vma->vm_truncate_count != next->vm_truncate_count) { | 554 | vma->vm_truncate_count != next->vm_truncate_count) { |
555 | /* | 555 | /* |
556 | * unmap_mapping_range might be in progress: | 556 | * unmap_mapping_range might be in progress: |
557 | * ensure that the expanding vma is rescanned. | 557 | * ensure that the expanding vma is rescanned. |
558 | */ | 558 | */ |
559 | importer->vm_truncate_count = 0; | 559 | importer->vm_truncate_count = 0; |
560 | } | 560 | } |
561 | if (insert) { | 561 | if (insert) { |
562 | insert->vm_truncate_count = vma->vm_truncate_count; | 562 | insert->vm_truncate_count = vma->vm_truncate_count; |
563 | /* | 563 | /* |
564 | * Put into prio_tree now, so instantiated pages | 564 | * Put into prio_tree now, so instantiated pages |
565 | * are visible to arm/parisc __flush_dcache_page | 565 | * are visible to arm/parisc __flush_dcache_page |
566 | * throughout; but we cannot insert into address | 566 | * throughout; but we cannot insert into address |
567 | * space until vma start or end is updated. | 567 | * space until vma start or end is updated. |
568 | */ | 568 | */ |
569 | __vma_link_file(insert); | 569 | __vma_link_file(insert); |
570 | } | 570 | } |
571 | } | 571 | } |
572 | 572 | ||
573 | /* | 573 | /* |
574 | * When changing only vma->vm_end, we don't really need | 574 | * When changing only vma->vm_end, we don't really need |
575 | * anon_vma lock: but is that case worth optimizing out? | 575 | * anon_vma lock: but is that case worth optimizing out? |
576 | */ | 576 | */ |
577 | if (vma->anon_vma) | 577 | if (vma->anon_vma) |
578 | anon_vma = vma->anon_vma; | 578 | anon_vma = vma->anon_vma; |
579 | if (anon_vma) { | 579 | if (anon_vma) { |
580 | spin_lock(&anon_vma->lock); | 580 | spin_lock(&anon_vma->lock); |
581 | /* | 581 | /* |
582 | * Easily overlooked: when mprotect shifts the boundary, | 582 | * Easily overlooked: when mprotect shifts the boundary, |
583 | * make sure the expanding vma has anon_vma set if the | 583 | * make sure the expanding vma has anon_vma set if the |
584 | * shrinking vma had, to cover any anon pages imported. | 584 | * shrinking vma had, to cover any anon pages imported. |
585 | */ | 585 | */ |
586 | if (importer && !importer->anon_vma) { | 586 | if (importer && !importer->anon_vma) { |
587 | importer->anon_vma = anon_vma; | 587 | importer->anon_vma = anon_vma; |
588 | __anon_vma_link(importer); | 588 | __anon_vma_link(importer); |
589 | } | 589 | } |
590 | } | 590 | } |
591 | 591 | ||
592 | if (root) { | 592 | if (root) { |
593 | flush_dcache_mmap_lock(mapping); | 593 | flush_dcache_mmap_lock(mapping); |
594 | vma_prio_tree_remove(vma, root); | 594 | vma_prio_tree_remove(vma, root); |
595 | if (adjust_next) | 595 | if (adjust_next) |
596 | vma_prio_tree_remove(next, root); | 596 | vma_prio_tree_remove(next, root); |
597 | } | 597 | } |
598 | 598 | ||
599 | vma->vm_start = start; | 599 | vma->vm_start = start; |
600 | vma->vm_end = end; | 600 | vma->vm_end = end; |
601 | vma->vm_pgoff = pgoff; | 601 | vma->vm_pgoff = pgoff; |
602 | if (adjust_next) { | 602 | if (adjust_next) { |
603 | next->vm_start += adjust_next << PAGE_SHIFT; | 603 | next->vm_start += adjust_next << PAGE_SHIFT; |
604 | next->vm_pgoff += adjust_next; | 604 | next->vm_pgoff += adjust_next; |
605 | } | 605 | } |
606 | 606 | ||
607 | if (root) { | 607 | if (root) { |
608 | if (adjust_next) | 608 | if (adjust_next) |
609 | vma_prio_tree_insert(next, root); | 609 | vma_prio_tree_insert(next, root); |
610 | vma_prio_tree_insert(vma, root); | 610 | vma_prio_tree_insert(vma, root); |
611 | flush_dcache_mmap_unlock(mapping); | 611 | flush_dcache_mmap_unlock(mapping); |
612 | } | 612 | } |
613 | 613 | ||
614 | if (remove_next) { | 614 | if (remove_next) { |
615 | /* | 615 | /* |
616 | * vma_merge has merged next into vma, and needs | 616 | * vma_merge has merged next into vma, and needs |
617 | * us to remove next before dropping the locks. | 617 | * us to remove next before dropping the locks. |
618 | */ | 618 | */ |
619 | __vma_unlink(mm, next, vma); | 619 | __vma_unlink(mm, next, vma); |
620 | if (file) | 620 | if (file) |
621 | __remove_shared_vm_struct(next, file, mapping); | 621 | __remove_shared_vm_struct(next, file, mapping); |
622 | if (next->anon_vma) | 622 | if (next->anon_vma) |
623 | __anon_vma_merge(vma, next); | 623 | __anon_vma_merge(vma, next); |
624 | } else if (insert) { | 624 | } else if (insert) { |
625 | /* | 625 | /* |
626 | * split_vma has split insert from vma, and needs | 626 | * split_vma has split insert from vma, and needs |
627 | * us to insert it before dropping the locks | 627 | * us to insert it before dropping the locks |
628 | * (it may either follow vma or precede it). | 628 | * (it may either follow vma or precede it). |
629 | */ | 629 | */ |
630 | __insert_vm_struct(mm, insert); | 630 | __insert_vm_struct(mm, insert); |
631 | } | 631 | } |
632 | 632 | ||
633 | if (anon_vma) | 633 | if (anon_vma) |
634 | spin_unlock(&anon_vma->lock); | 634 | spin_unlock(&anon_vma->lock); |
635 | if (mapping) | 635 | if (mapping) |
636 | spin_unlock(&mapping->i_mmap_lock); | 636 | spin_unlock(&mapping->i_mmap_lock); |
637 | 637 | ||
638 | if (remove_next) { | 638 | if (remove_next) { |
639 | if (file) { | 639 | if (file) { |
640 | fput(file); | 640 | fput(file); |
641 | if (next->vm_flags & VM_EXECUTABLE) | 641 | if (next->vm_flags & VM_EXECUTABLE) |
642 | removed_exe_file_vma(mm); | 642 | removed_exe_file_vma(mm); |
643 | } | 643 | } |
644 | mm->map_count--; | 644 | mm->map_count--; |
645 | mpol_put(vma_policy(next)); | 645 | mpol_put(vma_policy(next)); |
646 | kmem_cache_free(vm_area_cachep, next); | 646 | kmem_cache_free(vm_area_cachep, next); |
647 | /* | 647 | /* |
648 | * In mprotect's case 6 (see comments on vma_merge), | 648 | * In mprotect's case 6 (see comments on vma_merge), |
649 | * we must remove another next too. It would clutter | 649 | * we must remove another next too. It would clutter |
650 | * up the code too much to do both in one go. | 650 | * up the code too much to do both in one go. |
651 | */ | 651 | */ |
652 | if (remove_next == 2) { | 652 | if (remove_next == 2) { |
653 | next = vma->vm_next; | 653 | next = vma->vm_next; |
654 | goto again; | 654 | goto again; |
655 | } | 655 | } |
656 | } | 656 | } |
657 | 657 | ||
658 | validate_mm(mm); | 658 | validate_mm(mm); |
659 | } | 659 | } |
660 | 660 | ||
661 | /* | 661 | /* |
662 | * If the vma has a ->close operation then the driver probably needs to release | 662 | * If the vma has a ->close operation then the driver probably needs to release |
663 | * per-vma resources, so we don't attempt to merge those. | 663 | * per-vma resources, so we don't attempt to merge those. |
664 | */ | 664 | */ |
665 | static inline int is_mergeable_vma(struct vm_area_struct *vma, | 665 | static inline int is_mergeable_vma(struct vm_area_struct *vma, |
666 | struct file *file, unsigned long vm_flags) | 666 | struct file *file, unsigned long vm_flags) |
667 | { | 667 | { |
668 | if (vma->vm_flags != vm_flags) | 668 | if (vma->vm_flags != vm_flags) |
669 | return 0; | 669 | return 0; |
670 | if (vma->vm_file != file) | 670 | if (vma->vm_file != file) |
671 | return 0; | 671 | return 0; |
672 | if (vma->vm_ops && vma->vm_ops->close) | 672 | if (vma->vm_ops && vma->vm_ops->close) |
673 | return 0; | 673 | return 0; |
674 | return 1; | 674 | return 1; |
675 | } | 675 | } |
676 | 676 | ||
/*
 * Two anon_vma pointers are compatible for merging when they do not
 * disagree: either side may be unset (NULL), or both must name the
 * same anon_vma.
 */
static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
			struct anon_vma *anon_vma2)
{
	if (anon_vma1 == anon_vma2)
		return 1;
	return !anon_vma1 || !anon_vma2;
}
682 | 682 | ||
683 | /* | 683 | /* |
684 | * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) | 684 | * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) |
685 | * in front of (at a lower virtual address and file offset than) the vma. | 685 | * in front of (at a lower virtual address and file offset than) the vma. |
686 | * | 686 | * |
687 | * We cannot merge two vmas if they have differently assigned (non-NULL) | 687 | * We cannot merge two vmas if they have differently assigned (non-NULL) |
688 | * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. | 688 | * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. |
689 | * | 689 | * |
690 | * We don't check here for the merged mmap wrapping around the end of pagecache | 690 | * We don't check here for the merged mmap wrapping around the end of pagecache |
691 | * indices (16TB on ia32) because do_mmap_pgoff() does not permit mmap's which | 691 | * indices (16TB on ia32) because do_mmap_pgoff() does not permit mmap's which |
692 | * wrap, nor mmaps which cover the final page at index -1UL. | 692 | * wrap, nor mmaps which cover the final page at index -1UL. |
693 | */ | 693 | */ |
694 | static int | 694 | static int |
695 | can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, | 695 | can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, |
696 | struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) | 696 | struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) |
697 | { | 697 | { |
698 | if (is_mergeable_vma(vma, file, vm_flags) && | 698 | if (is_mergeable_vma(vma, file, vm_flags) && |
699 | is_mergeable_anon_vma(anon_vma, vma->anon_vma)) { | 699 | is_mergeable_anon_vma(anon_vma, vma->anon_vma)) { |
700 | if (vma->vm_pgoff == vm_pgoff) | 700 | if (vma->vm_pgoff == vm_pgoff) |
701 | return 1; | 701 | return 1; |
702 | } | 702 | } |
703 | return 0; | 703 | return 0; |
704 | } | 704 | } |
705 | 705 | ||
706 | /* | 706 | /* |
707 | * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) | 707 | * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) |
708 | * beyond (at a higher virtual address and file offset than) the vma. | 708 | * beyond (at a higher virtual address and file offset than) the vma. |
709 | * | 709 | * |
710 | * We cannot merge two vmas if they have differently assigned (non-NULL) | 710 | * We cannot merge two vmas if they have differently assigned (non-NULL) |
711 | * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. | 711 | * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. |
712 | */ | 712 | */ |
713 | static int | 713 | static int |
714 | can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, | 714 | can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, |
715 | struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) | 715 | struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) |
716 | { | 716 | { |
717 | if (is_mergeable_vma(vma, file, vm_flags) && | 717 | if (is_mergeable_vma(vma, file, vm_flags) && |
718 | is_mergeable_anon_vma(anon_vma, vma->anon_vma)) { | 718 | is_mergeable_anon_vma(anon_vma, vma->anon_vma)) { |
719 | pgoff_t vm_pglen; | 719 | pgoff_t vm_pglen; |
720 | vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; | 720 | vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; |
721 | if (vma->vm_pgoff + vm_pglen == vm_pgoff) | 721 | if (vma->vm_pgoff + vm_pglen == vm_pgoff) |
722 | return 1; | 722 | return 1; |
723 | } | 723 | } |
724 | return 0; | 724 | return 0; |
725 | } | 725 | } |
726 | 726 | ||
727 | /* | 727 | /* |
728 | * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out | 728 | * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out |
729 | * whether that can be merged with its predecessor or its successor. | 729 | * whether that can be merged with its predecessor or its successor. |
730 | * Or both (it neatly fills a hole). | 730 | * Or both (it neatly fills a hole). |
731 | * | 731 | * |
732 | * In most cases - when called for mmap, brk or mremap - [addr,end) is | 732 | * In most cases - when called for mmap, brk or mremap - [addr,end) is |
733 | * certain not to be mapped by the time vma_merge is called; but when | 733 | * certain not to be mapped by the time vma_merge is called; but when |
734 | * called for mprotect, it is certain to be already mapped (either at | 734 | * called for mprotect, it is certain to be already mapped (either at |
735 | * an offset within prev, or at the start of next), and the flags of | 735 | * an offset within prev, or at the start of next), and the flags of |
736 | * this area are about to be changed to vm_flags - and the no-change | 736 | * this area are about to be changed to vm_flags - and the no-change |
737 | * case has already been eliminated. | 737 | * case has already been eliminated. |
738 | * | 738 | * |
739 | * The following mprotect cases have to be considered, where AAAA is | 739 | * The following mprotect cases have to be considered, where AAAA is |
740 | * the area passed down from mprotect_fixup, never extending beyond one | 740 | * the area passed down from mprotect_fixup, never extending beyond one |
741 | * vma, PPPPPP is the prev vma specified, and NNNNNN the next vma after: | 741 | * vma, PPPPPP is the prev vma specified, and NNNNNN the next vma after: |
742 | * | 742 | * |
743 | * AAAA AAAA AAAA AAAA | 743 | * AAAA AAAA AAAA AAAA |
744 | * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPNNNNNN PPPPNNNNXXXX | 744 | * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPNNNNNN PPPPNNNNXXXX |
745 | * cannot merge might become might become might become | 745 | * cannot merge might become might become might become |
746 | * PPNNNNNNNNNN PPPPPPPPPPNN PPPPPPPPPPPP 6 or | 746 | * PPNNNNNNNNNN PPPPPPPPPPNN PPPPPPPPPPPP 6 or |
747 | * mmap, brk or case 4 below case 5 below PPPPPPPPXXXX 7 or | 747 | * mmap, brk or case 4 below case 5 below PPPPPPPPXXXX 7 or |
748 | * mremap move: PPPPNNNNNNNN 8 | 748 | * mremap move: PPPPNNNNNNNN 8 |
749 | * AAAA | 749 | * AAAA |
750 | * PPPP NNNN PPPPPPPPPPPP PPPPPPPPNNNN PPPPNNNNNNNN | 750 | * PPPP NNNN PPPPPPPPPPPP PPPPPPPPNNNN PPPPNNNNNNNN |
751 | * might become case 1 below case 2 below case 3 below | 751 | * might become case 1 below case 2 below case 3 below |
752 | * | 752 | * |
753 | * Odd one out? Case 8, because it extends NNNN but needs flags of XXXX: | 753 | * Odd one out? Case 8, because it extends NNNN but needs flags of XXXX: |
754 | * mprotect_fixup updates vm_flags & vm_page_prot on successful return. | 754 | * mprotect_fixup updates vm_flags & vm_page_prot on successful return. |
755 | */ | 755 | */ |
756 | struct vm_area_struct *vma_merge(struct mm_struct *mm, | 756 | struct vm_area_struct *vma_merge(struct mm_struct *mm, |
757 | struct vm_area_struct *prev, unsigned long addr, | 757 | struct vm_area_struct *prev, unsigned long addr, |
758 | unsigned long end, unsigned long vm_flags, | 758 | unsigned long end, unsigned long vm_flags, |
759 | struct anon_vma *anon_vma, struct file *file, | 759 | struct anon_vma *anon_vma, struct file *file, |
760 | pgoff_t pgoff, struct mempolicy *policy) | 760 | pgoff_t pgoff, struct mempolicy *policy) |
761 | { | 761 | { |
762 | pgoff_t pglen = (end - addr) >> PAGE_SHIFT; | 762 | pgoff_t pglen = (end - addr) >> PAGE_SHIFT; |
763 | struct vm_area_struct *area, *next; | 763 | struct vm_area_struct *area, *next; |
764 | 764 | ||
765 | /* | 765 | /* |
766 | * We later require that vma->vm_flags == vm_flags, | 766 | * We later require that vma->vm_flags == vm_flags, |
767 | * so this tests vma->vm_flags & VM_SPECIAL, too. | 767 | * so this tests vma->vm_flags & VM_SPECIAL, too. |
768 | */ | 768 | */ |
769 | if (vm_flags & VM_SPECIAL) | 769 | if (vm_flags & VM_SPECIAL) |
770 | return NULL; | 770 | return NULL; |
771 | 771 | ||
772 | if (prev) | 772 | if (prev) |
773 | next = prev->vm_next; | 773 | next = prev->vm_next; |
774 | else | 774 | else |
775 | next = mm->mmap; | 775 | next = mm->mmap; |
776 | area = next; | 776 | area = next; |
777 | if (next && next->vm_end == end) /* cases 6, 7, 8 */ | 777 | if (next && next->vm_end == end) /* cases 6, 7, 8 */ |
778 | next = next->vm_next; | 778 | next = next->vm_next; |
779 | 779 | ||
780 | /* | 780 | /* |
781 | * Can it merge with the predecessor? | 781 | * Can it merge with the predecessor? |
782 | */ | 782 | */ |
783 | if (prev && prev->vm_end == addr && | 783 | if (prev && prev->vm_end == addr && |
784 | mpol_equal(vma_policy(prev), policy) && | 784 | mpol_equal(vma_policy(prev), policy) && |
785 | can_vma_merge_after(prev, vm_flags, | 785 | can_vma_merge_after(prev, vm_flags, |
786 | anon_vma, file, pgoff)) { | 786 | anon_vma, file, pgoff)) { |
787 | /* | 787 | /* |
788 | * OK, it can. Can we now merge in the successor as well? | 788 | * OK, it can. Can we now merge in the successor as well? |
789 | */ | 789 | */ |
790 | if (next && end == next->vm_start && | 790 | if (next && end == next->vm_start && |
791 | mpol_equal(policy, vma_policy(next)) && | 791 | mpol_equal(policy, vma_policy(next)) && |
792 | can_vma_merge_before(next, vm_flags, | 792 | can_vma_merge_before(next, vm_flags, |
793 | anon_vma, file, pgoff+pglen) && | 793 | anon_vma, file, pgoff+pglen) && |
794 | is_mergeable_anon_vma(prev->anon_vma, | 794 | is_mergeable_anon_vma(prev->anon_vma, |
795 | next->anon_vma)) { | 795 | next->anon_vma)) { |
796 | /* cases 1, 6 */ | 796 | /* cases 1, 6 */ |
797 | vma_adjust(prev, prev->vm_start, | 797 | vma_adjust(prev, prev->vm_start, |
798 | next->vm_end, prev->vm_pgoff, NULL); | 798 | next->vm_end, prev->vm_pgoff, NULL); |
799 | } else /* cases 2, 5, 7 */ | 799 | } else /* cases 2, 5, 7 */ |
800 | vma_adjust(prev, prev->vm_start, | 800 | vma_adjust(prev, prev->vm_start, |
801 | end, prev->vm_pgoff, NULL); | 801 | end, prev->vm_pgoff, NULL); |
802 | return prev; | 802 | return prev; |
803 | } | 803 | } |
804 | 804 | ||
805 | /* | 805 | /* |
806 | * Can this new request be merged in front of next? | 806 | * Can this new request be merged in front of next? |
807 | */ | 807 | */ |
808 | if (next && end == next->vm_start && | 808 | if (next && end == next->vm_start && |
809 | mpol_equal(policy, vma_policy(next)) && | 809 | mpol_equal(policy, vma_policy(next)) && |
810 | can_vma_merge_before(next, vm_flags, | 810 | can_vma_merge_before(next, vm_flags, |
811 | anon_vma, file, pgoff+pglen)) { | 811 | anon_vma, file, pgoff+pglen)) { |
812 | if (prev && addr < prev->vm_end) /* case 4 */ | 812 | if (prev && addr < prev->vm_end) /* case 4 */ |
813 | vma_adjust(prev, prev->vm_start, | 813 | vma_adjust(prev, prev->vm_start, |
814 | addr, prev->vm_pgoff, NULL); | 814 | addr, prev->vm_pgoff, NULL); |
815 | else /* cases 3, 8 */ | 815 | else /* cases 3, 8 */ |
816 | vma_adjust(area, addr, next->vm_end, | 816 | vma_adjust(area, addr, next->vm_end, |
817 | next->vm_pgoff - pglen, NULL); | 817 | next->vm_pgoff - pglen, NULL); |
818 | return area; | 818 | return area; |
819 | } | 819 | } |
820 | 820 | ||
821 | return NULL; | 821 | return NULL; |
822 | } | 822 | } |
823 | 823 | ||
824 | /* | 824 | /* |
825 | * find_mergeable_anon_vma is used by anon_vma_prepare, to check | 825 | * find_mergeable_anon_vma is used by anon_vma_prepare, to check |
826 | * neighbouring vmas for a suitable anon_vma, before it goes off | 826 | * neighbouring vmas for a suitable anon_vma, before it goes off |
827 | * to allocate a new anon_vma. It checks because a repetitive | 827 | * to allocate a new anon_vma. It checks because a repetitive |
828 | * sequence of mprotects and faults may otherwise lead to distinct | 828 | * sequence of mprotects and faults may otherwise lead to distinct |
829 | * anon_vmas being allocated, preventing vma merge in subsequent | 829 | * anon_vmas being allocated, preventing vma merge in subsequent |
830 | * mprotect. | 830 | * mprotect. |
831 | */ | 831 | */ |
832 | struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) | 832 | struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) |
833 | { | 833 | { |
834 | struct vm_area_struct *near; | 834 | struct vm_area_struct *near; |
835 | unsigned long vm_flags; | 835 | unsigned long vm_flags; |
836 | 836 | ||
837 | near = vma->vm_next; | 837 | near = vma->vm_next; |
838 | if (!near) | 838 | if (!near) |
839 | goto try_prev; | 839 | goto try_prev; |
840 | 840 | ||
841 | /* | 841 | /* |
842 | * Since only mprotect tries to remerge vmas, match flags | 842 | * Since only mprotect tries to remerge vmas, match flags |
843 | * which might be mprotected into each other later on. | 843 | * which might be mprotected into each other later on. |
844 | * Neither mlock nor madvise tries to remerge at present, | 844 | * Neither mlock nor madvise tries to remerge at present, |
845 | * so leave their flags as obstructing a merge. | 845 | * so leave their flags as obstructing a merge. |
846 | */ | 846 | */ |
847 | vm_flags = vma->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC); | 847 | vm_flags = vma->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC); |
848 | vm_flags |= near->vm_flags & (VM_READ|VM_WRITE|VM_EXEC); | 848 | vm_flags |= near->vm_flags & (VM_READ|VM_WRITE|VM_EXEC); |
849 | 849 | ||
850 | if (near->anon_vma && vma->vm_end == near->vm_start && | 850 | if (near->anon_vma && vma->vm_end == near->vm_start && |
851 | mpol_equal(vma_policy(vma), vma_policy(near)) && | 851 | mpol_equal(vma_policy(vma), vma_policy(near)) && |
852 | can_vma_merge_before(near, vm_flags, | 852 | can_vma_merge_before(near, vm_flags, |
853 | NULL, vma->vm_file, vma->vm_pgoff + | 853 | NULL, vma->vm_file, vma->vm_pgoff + |
854 | ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT))) | 854 | ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT))) |
855 | return near->anon_vma; | 855 | return near->anon_vma; |
856 | try_prev: | 856 | try_prev: |
857 | /* | 857 | /* |
858 | * It is potentially slow to have to call find_vma_prev here. | 858 | * It is potentially slow to have to call find_vma_prev here. |
859 | * But it's only on the first write fault on the vma, not | 859 | * But it's only on the first write fault on the vma, not |
860 | * every time, and we could devise a way to avoid it later | 860 | * every time, and we could devise a way to avoid it later |
861 | * (e.g. stash info in next's anon_vma_node when assigning | 861 | * (e.g. stash info in next's anon_vma_node when assigning |
862 | * an anon_vma, or when trying vma_merge). Another time. | 862 | * an anon_vma, or when trying vma_merge). Another time. |
863 | */ | 863 | */ |
864 | BUG_ON(find_vma_prev(vma->vm_mm, vma->vm_start, &near) != vma); | 864 | BUG_ON(find_vma_prev(vma->vm_mm, vma->vm_start, &near) != vma); |
865 | if (!near) | 865 | if (!near) |
866 | goto none; | 866 | goto none; |
867 | 867 | ||
868 | vm_flags = vma->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC); | 868 | vm_flags = vma->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC); |
869 | vm_flags |= near->vm_flags & (VM_READ|VM_WRITE|VM_EXEC); | 869 | vm_flags |= near->vm_flags & (VM_READ|VM_WRITE|VM_EXEC); |
870 | 870 | ||
871 | if (near->anon_vma && near->vm_end == vma->vm_start && | 871 | if (near->anon_vma && near->vm_end == vma->vm_start && |
872 | mpol_equal(vma_policy(near), vma_policy(vma)) && | 872 | mpol_equal(vma_policy(near), vma_policy(vma)) && |
873 | can_vma_merge_after(near, vm_flags, | 873 | can_vma_merge_after(near, vm_flags, |
874 | NULL, vma->vm_file, vma->vm_pgoff)) | 874 | NULL, vma->vm_file, vma->vm_pgoff)) |
875 | return near->anon_vma; | 875 | return near->anon_vma; |
876 | none: | 876 | none: |
877 | /* | 877 | /* |
878 | * There's no absolute need to look only at touching neighbours: | 878 | * There's no absolute need to look only at touching neighbours: |
879 | * we could search further afield for "compatible" anon_vmas. | 879 | * we could search further afield for "compatible" anon_vmas. |
880 | * But it would probably just be a waste of time searching, | 880 | * But it would probably just be a waste of time searching, |
881 | * or lead to too many vmas hanging off the same anon_vma. | 881 | * or lead to too many vmas hanging off the same anon_vma. |
882 | * We're trying to allow mprotect remerging later on, | 882 | * We're trying to allow mprotect remerging later on, |
883 | * not trying to minimize memory used for anon_vmas. | 883 | * not trying to minimize memory used for anon_vmas. |
884 | */ | 884 | */ |
885 | return NULL; | 885 | return NULL; |
886 | } | 886 | } |
887 | 887 | ||
#ifdef CONFIG_PROC_FS
/*
 * Account @pages (positive to add, negative to remove) against the
 * mm's per-category vm counters (shared/exec/stack/reserved), which
 * are reported through procfs.
 */
void vm_stat_account(struct mm_struct *mm, unsigned long flags,
						struct file *file, long pages)
{
	const unsigned long stack_flags
		= VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN);

	if (file) {
		mm->shared_vm += pages;
		/* Executable but not writable: count as text. */
		if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC)
			mm->exec_vm += pages;
	} else if (flags & stack_flags) {
		mm->stack_vm += pages;
	}
	if (flags & (VM_RESERVED|VM_IO))
		mm->reserved_vm += pages;
}
#endif /* CONFIG_PROC_FS */
905 | 905 | ||
906 | /* | 906 | /* |
907 | * The caller must hold down_write(current->mm->mmap_sem). | 907 | * The caller must hold down_write(current->mm->mmap_sem). |
908 | */ | 908 | */ |
909 | 909 | ||
910 | unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, | 910 | unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, |
911 | unsigned long len, unsigned long prot, | 911 | unsigned long len, unsigned long prot, |
912 | unsigned long flags, unsigned long pgoff) | 912 | unsigned long flags, unsigned long pgoff) |
913 | { | 913 | { |
914 | struct mm_struct * mm = current->mm; | 914 | struct mm_struct * mm = current->mm; |
915 | struct inode *inode; | 915 | struct inode *inode; |
916 | unsigned int vm_flags; | 916 | unsigned int vm_flags; |
917 | int error; | 917 | int error; |
918 | int accountable = 1; | 918 | int accountable = 1; |
919 | unsigned long reqprot = prot; | 919 | unsigned long reqprot = prot; |
920 | 920 | ||
921 | /* | 921 | /* |
922 | * Does the application expect PROT_READ to imply PROT_EXEC? | 922 | * Does the application expect PROT_READ to imply PROT_EXEC? |
923 | * | 923 | * |
924 | * (the exception is when the underlying filesystem is noexec | 924 | * (the exception is when the underlying filesystem is noexec |
925 | * mounted, in which case we dont add PROT_EXEC.) | 925 | * mounted, in which case we dont add PROT_EXEC.) |
926 | */ | 926 | */ |
927 | if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC)) | 927 | if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC)) |
928 | if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC))) | 928 | if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC))) |
929 | prot |= PROT_EXEC; | 929 | prot |= PROT_EXEC; |
930 | 930 | ||
931 | if (!len) | 931 | if (!len) |
932 | return -EINVAL; | 932 | return -EINVAL; |
933 | 933 | ||
934 | if (!(flags & MAP_FIXED)) | 934 | if (!(flags & MAP_FIXED)) |
935 | addr = round_hint_to_min(addr); | 935 | addr = round_hint_to_min(addr); |
936 | 936 | ||
937 | error = arch_mmap_check(addr, len, flags); | 937 | error = arch_mmap_check(addr, len, flags); |
938 | if (error) | 938 | if (error) |
939 | return error; | 939 | return error; |
940 | 940 | ||
941 | /* Careful about overflows.. */ | 941 | /* Careful about overflows.. */ |
942 | len = PAGE_ALIGN(len); | 942 | len = PAGE_ALIGN(len); |
943 | if (!len || len > TASK_SIZE) | 943 | if (!len || len > TASK_SIZE) |
944 | return -ENOMEM; | 944 | return -ENOMEM; |
945 | 945 | ||
946 | /* offset overflow? */ | 946 | /* offset overflow? */ |
947 | if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) | 947 | if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) |
948 | return -EOVERFLOW; | 948 | return -EOVERFLOW; |
949 | 949 | ||
950 | /* Too many mappings? */ | 950 | /* Too many mappings? */ |
951 | if (mm->map_count > sysctl_max_map_count) | 951 | if (mm->map_count > sysctl_max_map_count) |
952 | return -ENOMEM; | 952 | return -ENOMEM; |
953 | 953 | ||
954 | /* Obtain the address to map to. we verify (or select) it and ensure | 954 | /* Obtain the address to map to. we verify (or select) it and ensure |
955 | * that it represents a valid section of the address space. | 955 | * that it represents a valid section of the address space. |
956 | */ | 956 | */ |
957 | addr = get_unmapped_area(file, addr, len, pgoff, flags); | 957 | addr = get_unmapped_area(file, addr, len, pgoff, flags); |
958 | if (addr & ~PAGE_MASK) | 958 | if (addr & ~PAGE_MASK) |
959 | return addr; | 959 | return addr; |
960 | 960 | ||
961 | /* Do simple checking here so the lower-level routines won't have | 961 | /* Do simple checking here so the lower-level routines won't have |
962 | * to. we assume access permissions have been handled by the open | 962 | * to. we assume access permissions have been handled by the open |
963 | * of the memory object, so we don't do any here. | 963 | * of the memory object, so we don't do any here. |
964 | */ | 964 | */ |
965 | vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) | | 965 | vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) | |
966 | mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; | 966 | mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; |
967 | 967 | ||
968 | if (flags & MAP_LOCKED) { | 968 | if (flags & MAP_LOCKED) { |
969 | if (!can_do_mlock()) | 969 | if (!can_do_mlock()) |
970 | return -EPERM; | 970 | return -EPERM; |
971 | vm_flags |= VM_LOCKED; | 971 | vm_flags |= VM_LOCKED; |
972 | } | 972 | } |
973 | 973 | ||
974 | /* mlock MCL_FUTURE? */ | 974 | /* mlock MCL_FUTURE? */ |
975 | if (vm_flags & VM_LOCKED) { | 975 | if (vm_flags & VM_LOCKED) { |
976 | unsigned long locked, lock_limit; | 976 | unsigned long locked, lock_limit; |
977 | locked = len >> PAGE_SHIFT; | 977 | locked = len >> PAGE_SHIFT; |
978 | locked += mm->locked_vm; | 978 | locked += mm->locked_vm; |
979 | lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; | 979 | lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; |
980 | lock_limit >>= PAGE_SHIFT; | 980 | lock_limit >>= PAGE_SHIFT; |
981 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) | 981 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) |
982 | return -EAGAIN; | 982 | return -EAGAIN; |
983 | } | 983 | } |
984 | 984 | ||
985 | inode = file ? file->f_path.dentry->d_inode : NULL; | 985 | inode = file ? file->f_path.dentry->d_inode : NULL; |
986 | 986 | ||
987 | if (file) { | 987 | if (file) { |
988 | switch (flags & MAP_TYPE) { | 988 | switch (flags & MAP_TYPE) { |
989 | case MAP_SHARED: | 989 | case MAP_SHARED: |
990 | if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE)) | 990 | if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE)) |
991 | return -EACCES; | 991 | return -EACCES; |
992 | 992 | ||
993 | /* | 993 | /* |
994 | * Make sure we don't allow writing to an append-only | 994 | * Make sure we don't allow writing to an append-only |
995 | * file.. | 995 | * file.. |
996 | */ | 996 | */ |
997 | if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE)) | 997 | if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE)) |
998 | return -EACCES; | 998 | return -EACCES; |
999 | 999 | ||
1000 | /* | 1000 | /* |
1001 | * Make sure there are no mandatory locks on the file. | 1001 | * Make sure there are no mandatory locks on the file. |
1002 | */ | 1002 | */ |
1003 | if (locks_verify_locked(inode)) | 1003 | if (locks_verify_locked(inode)) |
1004 | return -EAGAIN; | 1004 | return -EAGAIN; |
1005 | 1005 | ||
1006 | vm_flags |= VM_SHARED | VM_MAYSHARE; | 1006 | vm_flags |= VM_SHARED | VM_MAYSHARE; |
1007 | if (!(file->f_mode & FMODE_WRITE)) | 1007 | if (!(file->f_mode & FMODE_WRITE)) |
1008 | vm_flags &= ~(VM_MAYWRITE | VM_SHARED); | 1008 | vm_flags &= ~(VM_MAYWRITE | VM_SHARED); |
1009 | 1009 | ||
1010 | /* fall through */ | 1010 | /* fall through */ |
1011 | case MAP_PRIVATE: | 1011 | case MAP_PRIVATE: |
1012 | if (!(file->f_mode & FMODE_READ)) | 1012 | if (!(file->f_mode & FMODE_READ)) |
1013 | return -EACCES; | 1013 | return -EACCES; |
1014 | if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) { | 1014 | if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) { |
1015 | if (vm_flags & VM_EXEC) | 1015 | if (vm_flags & VM_EXEC) |
1016 | return -EPERM; | 1016 | return -EPERM; |
1017 | vm_flags &= ~VM_MAYEXEC; | 1017 | vm_flags &= ~VM_MAYEXEC; |
1018 | } | 1018 | } |
1019 | if (is_file_hugepages(file)) | 1019 | if (is_file_hugepages(file)) |
1020 | accountable = 0; | 1020 | accountable = 0; |
1021 | 1021 | ||
1022 | if (!file->f_op || !file->f_op->mmap) | 1022 | if (!file->f_op || !file->f_op->mmap) |
1023 | return -ENODEV; | 1023 | return -ENODEV; |
1024 | break; | 1024 | break; |
1025 | 1025 | ||
1026 | default: | 1026 | default: |
1027 | return -EINVAL; | 1027 | return -EINVAL; |
1028 | } | 1028 | } |
1029 | } else { | 1029 | } else { |
1030 | switch (flags & MAP_TYPE) { | 1030 | switch (flags & MAP_TYPE) { |
1031 | case MAP_SHARED: | 1031 | case MAP_SHARED: |
1032 | /* | 1032 | /* |
1033 | * Ignore pgoff. | 1033 | * Ignore pgoff. |
1034 | */ | 1034 | */ |
1035 | pgoff = 0; | 1035 | pgoff = 0; |
1036 | vm_flags |= VM_SHARED | VM_MAYSHARE; | 1036 | vm_flags |= VM_SHARED | VM_MAYSHARE; |
1037 | break; | 1037 | break; |
1038 | case MAP_PRIVATE: | 1038 | case MAP_PRIVATE: |
1039 | /* | 1039 | /* |
1040 | * Set pgoff according to addr for anon_vma. | 1040 | * Set pgoff according to addr for anon_vma. |
1041 | */ | 1041 | */ |
1042 | pgoff = addr >> PAGE_SHIFT; | 1042 | pgoff = addr >> PAGE_SHIFT; |
1043 | break; | 1043 | break; |
1044 | default: | 1044 | default: |
1045 | return -EINVAL; | 1045 | return -EINVAL; |
1046 | } | 1046 | } |
1047 | } | 1047 | } |
1048 | 1048 | ||
1049 | error = security_file_mmap(file, reqprot, prot, flags, addr, 0); | 1049 | error = security_file_mmap(file, reqprot, prot, flags, addr, 0); |
1050 | if (error) | 1050 | if (error) |
1051 | return error; | 1051 | return error; |
1052 | 1052 | ||
1053 | return mmap_region(file, addr, len, flags, vm_flags, pgoff, | 1053 | return mmap_region(file, addr, len, flags, vm_flags, pgoff, |
1054 | accountable); | 1054 | accountable); |
1055 | } | 1055 | } |
1056 | EXPORT_SYMBOL(do_mmap_pgoff); | 1056 | EXPORT_SYMBOL(do_mmap_pgoff); |
1057 | 1057 | ||
1058 | /* | 1058 | /* |
1059 | * Some shared mappigns will want the pages marked read-only | 1059 | * Some shared mappigns will want the pages marked read-only |
1060 | * to track write events. If so, we'll downgrade vm_page_prot | 1060 | * to track write events. If so, we'll downgrade vm_page_prot |
1061 | * to the private version (using protection_map[] without the | 1061 | * to the private version (using protection_map[] without the |
1062 | * VM_SHARED bit). | 1062 | * VM_SHARED bit). |
1063 | */ | 1063 | */ |
1064 | int vma_wants_writenotify(struct vm_area_struct *vma) | 1064 | int vma_wants_writenotify(struct vm_area_struct *vma) |
1065 | { | 1065 | { |
1066 | unsigned int vm_flags = vma->vm_flags; | 1066 | unsigned int vm_flags = vma->vm_flags; |
1067 | 1067 | ||
1068 | /* If it was private or non-writable, the write bit is already clear */ | 1068 | /* If it was private or non-writable, the write bit is already clear */ |
1069 | if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED))) | 1069 | if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED))) |
1070 | return 0; | 1070 | return 0; |
1071 | 1071 | ||
1072 | /* The backer wishes to know when pages are first written to? */ | 1072 | /* The backer wishes to know when pages are first written to? */ |
1073 | if (vma->vm_ops && vma->vm_ops->page_mkwrite) | 1073 | if (vma->vm_ops && vma->vm_ops->page_mkwrite) |
1074 | return 1; | 1074 | return 1; |
1075 | 1075 | ||
1076 | /* The open routine did something to the protections already? */ | 1076 | /* The open routine did something to the protections already? */ |
1077 | if (pgprot_val(vma->vm_page_prot) != | 1077 | if (pgprot_val(vma->vm_page_prot) != |
1078 | pgprot_val(vm_get_page_prot(vm_flags))) | 1078 | pgprot_val(vm_get_page_prot(vm_flags))) |
1079 | return 0; | 1079 | return 0; |
1080 | 1080 | ||
1081 | /* Specialty mapping? */ | 1081 | /* Specialty mapping? */ |
1082 | if (vm_flags & (VM_PFNMAP|VM_INSERTPAGE)) | 1082 | if (vm_flags & (VM_PFNMAP|VM_INSERTPAGE)) |
1083 | return 0; | 1083 | return 0; |
1084 | 1084 | ||
1085 | /* Can the mapping track the dirty pages? */ | 1085 | /* Can the mapping track the dirty pages? */ |
1086 | return vma->vm_file && vma->vm_file->f_mapping && | 1086 | return vma->vm_file && vma->vm_file->f_mapping && |
1087 | mapping_cap_account_dirty(vma->vm_file->f_mapping); | 1087 | mapping_cap_account_dirty(vma->vm_file->f_mapping); |
1088 | } | 1088 | } |
1089 | 1089 | ||
/*
 * mmap_region - complete an mmap request by installing a VMA
 *
 * Called (via do_mmap_pgoff) once protections and flags have been
 * validated.  Clears any existing mappings overlapping [addr, addr+len),
 * charges the memory if required, then either merges the range into a
 * neighbouring VMA or allocates and links a new one, giving the backing
 * object (file, shmem, or anonymous) a chance to set itself up.
 *
 * Returns the (possibly driver-adjusted) start address on success, or a
 * negative errno on failure.
 */
unsigned long mmap_region(struct file *file, unsigned long addr,
			  unsigned long len, unsigned long flags,
			  unsigned int vm_flags, unsigned long pgoff,
			  int accountable)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma, *prev;
	int correct_wcount = 0;		/* nonzero while we hold a deny_write_access() */
	int error;
	struct rb_node **rb_link, *rb_parent;
	unsigned long charged = 0;	/* pages charged via security_vm_enough_memory() */
	struct inode *inode = file ? file->f_path.dentry->d_inode : NULL;

	/* Clear old maps: unmap anything overlapping the range, then rescan. */
	error = -ENOMEM;
munmap_back:
	vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
	if (vma && vma->vm_start < addr + len) {
		if (do_munmap(mm, addr, len))
			return -ENOMEM;
		goto munmap_back;
	}

	/* Check against address space limit. */
	if (!may_expand_vm(mm, len >> PAGE_SHIFT))
		return -ENOMEM;

	if (flags & MAP_NORESERVE)
		vm_flags |= VM_NORESERVE;

	/* Charge the mapping up front unless reservation was waived. */
	if (accountable && (!(flags & MAP_NORESERVE) ||
			    sysctl_overcommit_memory == OVERCOMMIT_NEVER)) {
		if (vm_flags & VM_SHARED) {
			/* Check memory availability in shmem_file_setup? */
			vm_flags |= VM_ACCOUNT;
		} else if (vm_flags & VM_WRITE) {
			/*
			 * Private writable mapping: check memory availability
			 */
			charged = len >> PAGE_SHIFT;
			if (security_vm_enough_memory(charged))
				return -ENOMEM;
			vm_flags |= VM_ACCOUNT;
		}
	}

	/*
	 * Can we just expand an old private anonymous mapping?
	 * The VM_SHARED test is necessary because shmem_zero_setup
	 * will create the file object for a shared anonymous map below.
	 */
	if (!file && !(vm_flags & VM_SHARED)) {
		vma = vma_merge(mm, prev, addr, addr + len, vm_flags,
				NULL, NULL, pgoff, NULL);
		if (vma)
			goto out;
	}

	/*
	 * Determine the object being mapped and call the appropriate
	 * specific mapper. the address has already been validated, but
	 * not unmapped, but the maps are removed from the list.
	 */
	vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
	if (!vma) {
		error = -ENOMEM;
		goto unacct_error;
	}

	vma->vm_mm = mm;
	vma->vm_start = addr;
	vma->vm_end = addr + len;
	vma->vm_flags = vm_flags;
	vma->vm_page_prot = vm_get_page_prot(vm_flags);
	vma->vm_pgoff = pgoff;

	if (file) {
		error = -EINVAL;
		/* File mappings may not auto-grow like a stack does. */
		if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
			goto free_vma;
		if (vm_flags & VM_DENYWRITE) {
			error = deny_write_access(file);
			if (error)
				goto free_vma;
			correct_wcount = 1;
		}
		vma->vm_file = file;
		get_file(file);
		error = file->f_op->mmap(file, vma);
		if (error)
			goto unmap_and_free_vma;
		if (vm_flags & VM_EXECUTABLE)
			added_exe_file_vma(mm);
	} else if (vm_flags & VM_SHARED) {
		/* Shared anonymous mapping: back it with a shmem object. */
		error = shmem_zero_setup(vma);
		if (error)
			goto free_vma;
	}

	/* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform
	 * shmem_zero_setup (perhaps called through /dev/zero's ->mmap)
	 * that memory reservation must be checked; but that reservation
	 * belongs to shared memory object, not to vma: so now clear it.
	 */
	if ((vm_flags & (VM_SHARED|VM_ACCOUNT)) == (VM_SHARED|VM_ACCOUNT))
		vma->vm_flags &= ~VM_ACCOUNT;

	/* Can addr have changed??
	 *
	 * Answer: Yes, several device drivers can do it in their
	 *         f_op->mmap method. -DaveM
	 */
	addr = vma->vm_start;
	pgoff = vma->vm_pgoff;
	vm_flags = vma->vm_flags;

	if (vma_wants_writenotify(vma))
		vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED);

	/* Try to merge with a neighbour; otherwise link the new VMA in. */
	if (file && vma_merge(mm, prev, addr, vma->vm_end,
			vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) {
		mpol_put(vma_policy(vma));
		kmem_cache_free(vm_area_cachep, vma);
		fput(file);
		if (vm_flags & VM_EXECUTABLE)
			removed_exe_file_vma(mm);
	} else {
		vma_link(mm, vma, prev, rb_link, rb_parent);
		file = vma->vm_file;
	}

	/* Once vma denies write, undo our temporary denial count */
	if (correct_wcount)
		atomic_inc(&inode->i_writecount);
out:
	mm->total_vm += len >> PAGE_SHIFT;
	vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
	if (vm_flags & VM_LOCKED) {
		/*
		 * makes pages present; downgrades, drops, reacquires mmap_sem
		 */
		long nr_pages = mlock_vma_pages_range(vma, addr, addr + len);
		if (nr_pages < 0)
			return nr_pages;	/* vma gone! */
		mm->locked_vm += (len >> PAGE_SHIFT) - nr_pages;
	} else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK))
		make_pages_present(addr, addr + len);
	return addr;

unmap_and_free_vma:
	if (correct_wcount)
		atomic_inc(&inode->i_writecount);
	vma->vm_file = NULL;
	fput(file);

	/* Undo any partial mapping done by a device driver. */
	unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
	/*
	 * NOTE(review): charged is cleared here, apparently on the
	 * assumption that unmap_region() drops the VM_ACCOUNT charge
	 * itself - confirm against the unmap_region() path.
	 */
	charged = 0;
free_vma:
	kmem_cache_free(vm_area_cachep, vma);
unacct_error:
	if (charged)
		vm_unacct_memory(charged);
	return error;
}
1255 | 1255 | ||
1256 | /* Get an address range which is currently unmapped. | 1256 | /* Get an address range which is currently unmapped. |
1257 | * For shmat() with addr=0. | 1257 | * For shmat() with addr=0. |
1258 | * | 1258 | * |
1259 | * Ugly calling convention alert: | 1259 | * Ugly calling convention alert: |
1260 | * Return value with the low bits set means error value, | 1260 | * Return value with the low bits set means error value, |
1261 | * ie | 1261 | * ie |
1262 | * if (ret & ~PAGE_MASK) | 1262 | * if (ret & ~PAGE_MASK) |
1263 | * error = ret; | 1263 | * error = ret; |
1264 | * | 1264 | * |
1265 | * This function "knows" that -ENOMEM has the bits set. | 1265 | * This function "knows" that -ENOMEM has the bits set. |
1266 | */ | 1266 | */ |
1267 | #ifndef HAVE_ARCH_UNMAPPED_AREA | 1267 | #ifndef HAVE_ARCH_UNMAPPED_AREA |
1268 | unsigned long | 1268 | unsigned long |
1269 | arch_get_unmapped_area(struct file *filp, unsigned long addr, | 1269 | arch_get_unmapped_area(struct file *filp, unsigned long addr, |
1270 | unsigned long len, unsigned long pgoff, unsigned long flags) | 1270 | unsigned long len, unsigned long pgoff, unsigned long flags) |
1271 | { | 1271 | { |
1272 | struct mm_struct *mm = current->mm; | 1272 | struct mm_struct *mm = current->mm; |
1273 | struct vm_area_struct *vma; | 1273 | struct vm_area_struct *vma; |
1274 | unsigned long start_addr; | 1274 | unsigned long start_addr; |
1275 | 1275 | ||
1276 | if (len > TASK_SIZE) | 1276 | if (len > TASK_SIZE) |
1277 | return -ENOMEM; | 1277 | return -ENOMEM; |
1278 | 1278 | ||
1279 | if (flags & MAP_FIXED) | 1279 | if (flags & MAP_FIXED) |
1280 | return addr; | 1280 | return addr; |
1281 | 1281 | ||
1282 | if (addr) { | 1282 | if (addr) { |
1283 | addr = PAGE_ALIGN(addr); | 1283 | addr = PAGE_ALIGN(addr); |
1284 | vma = find_vma(mm, addr); | 1284 | vma = find_vma(mm, addr); |
1285 | if (TASK_SIZE - len >= addr && | 1285 | if (TASK_SIZE - len >= addr && |
1286 | (!vma || addr + len <= vma->vm_start)) | 1286 | (!vma || addr + len <= vma->vm_start)) |
1287 | return addr; | 1287 | return addr; |
1288 | } | 1288 | } |
1289 | if (len > mm->cached_hole_size) { | 1289 | if (len > mm->cached_hole_size) { |
1290 | start_addr = addr = mm->free_area_cache; | 1290 | start_addr = addr = mm->free_area_cache; |
1291 | } else { | 1291 | } else { |
1292 | start_addr = addr = TASK_UNMAPPED_BASE; | 1292 | start_addr = addr = TASK_UNMAPPED_BASE; |
1293 | mm->cached_hole_size = 0; | 1293 | mm->cached_hole_size = 0; |
1294 | } | 1294 | } |
1295 | 1295 | ||
1296 | full_search: | 1296 | full_search: |
1297 | for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { | 1297 | for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { |
1298 | /* At this point: (!vma || addr < vma->vm_end). */ | 1298 | /* At this point: (!vma || addr < vma->vm_end). */ |
1299 | if (TASK_SIZE - len < addr) { | 1299 | if (TASK_SIZE - len < addr) { |
1300 | /* | 1300 | /* |
1301 | * Start a new search - just in case we missed | 1301 | * Start a new search - just in case we missed |
1302 | * some holes. | 1302 | * some holes. |
1303 | */ | 1303 | */ |
1304 | if (start_addr != TASK_UNMAPPED_BASE) { | 1304 | if (start_addr != TASK_UNMAPPED_BASE) { |
1305 | addr = TASK_UNMAPPED_BASE; | 1305 | addr = TASK_UNMAPPED_BASE; |
1306 | start_addr = addr; | 1306 | start_addr = addr; |
1307 | mm->cached_hole_size = 0; | 1307 | mm->cached_hole_size = 0; |
1308 | goto full_search; | 1308 | goto full_search; |
1309 | } | 1309 | } |
1310 | return -ENOMEM; | 1310 | return -ENOMEM; |
1311 | } | 1311 | } |
1312 | if (!vma || addr + len <= vma->vm_start) { | 1312 | if (!vma || addr + len <= vma->vm_start) { |
1313 | /* | 1313 | /* |
1314 | * Remember the place where we stopped the search: | 1314 | * Remember the place where we stopped the search: |
1315 | */ | 1315 | */ |
1316 | mm->free_area_cache = addr + len; | 1316 | mm->free_area_cache = addr + len; |
1317 | return addr; | 1317 | return addr; |
1318 | } | 1318 | } |
1319 | if (addr + mm->cached_hole_size < vma->vm_start) | 1319 | if (addr + mm->cached_hole_size < vma->vm_start) |
1320 | mm->cached_hole_size = vma->vm_start - addr; | 1320 | mm->cached_hole_size = vma->vm_start - addr; |
1321 | addr = vma->vm_end; | 1321 | addr = vma->vm_end; |
1322 | } | 1322 | } |
1323 | } | 1323 | } |
1324 | #endif | 1324 | #endif |
1325 | 1325 | ||
1326 | void arch_unmap_area(struct mm_struct *mm, unsigned long addr) | 1326 | void arch_unmap_area(struct mm_struct *mm, unsigned long addr) |
1327 | { | 1327 | { |
1328 | /* | 1328 | /* |
1329 | * Is this a new hole at the lowest possible address? | 1329 | * Is this a new hole at the lowest possible address? |
1330 | */ | 1330 | */ |
1331 | if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache) { | 1331 | if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache) { |
1332 | mm->free_area_cache = addr; | 1332 | mm->free_area_cache = addr; |
1333 | mm->cached_hole_size = ~0UL; | 1333 | mm->cached_hole_size = ~0UL; |
1334 | } | 1334 | } |
1335 | } | 1335 | } |
1336 | 1336 | ||
/*
 * This mmap-allocator allocates new areas top-down from below the
 * stack's low limit (the base):
 */
#ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
unsigned long
arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
			  const unsigned long len, const unsigned long pgoff,
			  const unsigned long flags)
{
	struct vm_area_struct *vma;
	struct mm_struct *mm = current->mm;
	unsigned long addr = addr0;

	/* requested length too big for entire address space */
	if (len > TASK_SIZE)
		return -ENOMEM;

	/* MAP_FIXED: hand the caller's address back verbatim. */
	if (flags & MAP_FIXED)
		return addr;

	/* requesting a specific address: honour it if the range is free */
	if (addr) {
		addr = PAGE_ALIGN(addr);
		vma = find_vma(mm, addr);
		if (TASK_SIZE - len >= addr &&
				(!vma || addr + len <= vma->vm_start))
			return addr;
	}

	/* check if free_area_cache is useful for us; if the largest hole
	 * already fits, restart the scan from the top (mmap_base) */
	if (len <= mm->cached_hole_size) {
		mm->cached_hole_size = 0;
		mm->free_area_cache = mm->mmap_base;
	}

	/* either no address requested or can't fit in requested address hole */
	addr = mm->free_area_cache;

	/* make sure it can fit in the remaining address space */
	if (addr > len) {
		vma = find_vma(mm, addr-len);
		if (!vma || addr <= vma->vm_start)
			/* remember the address as a hint for next time */
			return (mm->free_area_cache = addr-len);
	}

	/* not even [0, mmap_base) could hold the request */
	if (mm->mmap_base < len)
		goto bottomup;

	addr = mm->mmap_base-len;

	do {
		/*
		 * Lookup failure means no vma is above this address,
		 * else if new region fits below vma->vm_start,
		 * return with success:
		 */
		vma = find_vma(mm, addr);
		if (!vma || addr+len <= vma->vm_start)
			/* remember the address as a hint for next time */
			return (mm->free_area_cache = addr);

		/* remember the largest hole we saw so far */
		if (addr + mm->cached_hole_size < vma->vm_start)
			mm->cached_hole_size = vma->vm_start - addr;

		/* try just below the current vma->vm_start */
		addr = vma->vm_start-len;
	} while (len < vma->vm_start);
	/* loop continues only while vma->vm_start - len is still > 0,
	 * i.e. the next candidate address would be valid */

bottomup:
	/*
	 * A failed mmap() very likely causes application failure,
	 * so fall back to the bottom-up function here. This scenario
	 * can happen with large stack limits and large mmap()
	 * allocations.
	 */
	mm->cached_hole_size = ~0UL;
	mm->free_area_cache = TASK_UNMAPPED_BASE;
	addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
	/*
	 * Restore the topdown base:
	 */
	mm->free_area_cache = mm->mmap_base;
	mm->cached_hole_size = ~0UL;

	return addr;
}
#endif
1427 | 1427 | ||
1428 | void arch_unmap_area_topdown(struct mm_struct *mm, unsigned long addr) | 1428 | void arch_unmap_area_topdown(struct mm_struct *mm, unsigned long addr) |
1429 | { | 1429 | { |
1430 | /* | 1430 | /* |
1431 | * Is this a new hole at the highest possible address? | 1431 | * Is this a new hole at the highest possible address? |
1432 | */ | 1432 | */ |
1433 | if (addr > mm->free_area_cache) | 1433 | if (addr > mm->free_area_cache) |
1434 | mm->free_area_cache = addr; | 1434 | mm->free_area_cache = addr; |
1435 | 1435 | ||
1436 | /* dont allow allocations above current base */ | 1436 | /* dont allow allocations above current base */ |
1437 | if (mm->free_area_cache > mm->mmap_base) | 1437 | if (mm->free_area_cache > mm->mmap_base) |
1438 | mm->free_area_cache = mm->mmap_base; | 1438 | mm->free_area_cache = mm->mmap_base; |
1439 | } | 1439 | } |
1440 | 1440 | ||
1441 | unsigned long | 1441 | unsigned long |
1442 | get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, | 1442 | get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, |
1443 | unsigned long pgoff, unsigned long flags) | 1443 | unsigned long pgoff, unsigned long flags) |
1444 | { | 1444 | { |
1445 | unsigned long (*get_area)(struct file *, unsigned long, | 1445 | unsigned long (*get_area)(struct file *, unsigned long, |
1446 | unsigned long, unsigned long, unsigned long); | 1446 | unsigned long, unsigned long, unsigned long); |
1447 | 1447 | ||
1448 | get_area = current->mm->get_unmapped_area; | 1448 | get_area = current->mm->get_unmapped_area; |
1449 | if (file && file->f_op && file->f_op->get_unmapped_area) | 1449 | if (file && file->f_op && file->f_op->get_unmapped_area) |
1450 | get_area = file->f_op->get_unmapped_area; | 1450 | get_area = file->f_op->get_unmapped_area; |
1451 | addr = get_area(file, addr, len, pgoff, flags); | 1451 | addr = get_area(file, addr, len, pgoff, flags); |
1452 | if (IS_ERR_VALUE(addr)) | 1452 | if (IS_ERR_VALUE(addr)) |
1453 | return addr; | 1453 | return addr; |
1454 | 1454 | ||
1455 | if (addr > TASK_SIZE - len) | 1455 | if (addr > TASK_SIZE - len) |
1456 | return -ENOMEM; | 1456 | return -ENOMEM; |
1457 | if (addr & ~PAGE_MASK) | 1457 | if (addr & ~PAGE_MASK) |
1458 | return -EINVAL; | 1458 | return -EINVAL; |
1459 | 1459 | ||
1460 | return arch_rebalance_pgtables(addr, len); | 1460 | return arch_rebalance_pgtables(addr, len); |
1461 | } | 1461 | } |
1462 | 1462 | ||
1463 | EXPORT_SYMBOL(get_unmapped_area); | 1463 | EXPORT_SYMBOL(get_unmapped_area); |
1464 | 1464 | ||
1465 | /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ | 1465 | /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ |
1466 | struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) | 1466 | struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) |
1467 | { | 1467 | { |
1468 | struct vm_area_struct *vma = NULL; | 1468 | struct vm_area_struct *vma = NULL; |
1469 | 1469 | ||
1470 | if (mm) { | 1470 | if (mm) { |
1471 | /* Check the cache first. */ | 1471 | /* Check the cache first. */ |
1472 | /* (Cache hit rate is typically around 35%.) */ | 1472 | /* (Cache hit rate is typically around 35%.) */ |
1473 | vma = mm->mmap_cache; | 1473 | vma = mm->mmap_cache; |
1474 | if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) { | 1474 | if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) { |
1475 | struct rb_node * rb_node; | 1475 | struct rb_node * rb_node; |
1476 | 1476 | ||
1477 | rb_node = mm->mm_rb.rb_node; | 1477 | rb_node = mm->mm_rb.rb_node; |
1478 | vma = NULL; | 1478 | vma = NULL; |
1479 | 1479 | ||
1480 | while (rb_node) { | 1480 | while (rb_node) { |
1481 | struct vm_area_struct * vma_tmp; | 1481 | struct vm_area_struct * vma_tmp; |
1482 | 1482 | ||
1483 | vma_tmp = rb_entry(rb_node, | 1483 | vma_tmp = rb_entry(rb_node, |
1484 | struct vm_area_struct, vm_rb); | 1484 | struct vm_area_struct, vm_rb); |
1485 | 1485 | ||
1486 | if (vma_tmp->vm_end > addr) { | 1486 | if (vma_tmp->vm_end > addr) { |
1487 | vma = vma_tmp; | 1487 | vma = vma_tmp; |
1488 | if (vma_tmp->vm_start <= addr) | 1488 | if (vma_tmp->vm_start <= addr) |
1489 | break; | 1489 | break; |
1490 | rb_node = rb_node->rb_left; | 1490 | rb_node = rb_node->rb_left; |
1491 | } else | 1491 | } else |
1492 | rb_node = rb_node->rb_right; | 1492 | rb_node = rb_node->rb_right; |
1493 | } | 1493 | } |
1494 | if (vma) | 1494 | if (vma) |
1495 | mm->mmap_cache = vma; | 1495 | mm->mmap_cache = vma; |
1496 | } | 1496 | } |
1497 | } | 1497 | } |
1498 | return vma; | 1498 | return vma; |
1499 | } | 1499 | } |
1500 | 1500 | ||
1501 | EXPORT_SYMBOL(find_vma); | 1501 | EXPORT_SYMBOL(find_vma); |
1502 | 1502 | ||
1503 | /* Same as find_vma, but also return a pointer to the previous VMA in *pprev. */ | 1503 | /* Same as find_vma, but also return a pointer to the previous VMA in *pprev. */ |
1504 | struct vm_area_struct * | 1504 | struct vm_area_struct * |
1505 | find_vma_prev(struct mm_struct *mm, unsigned long addr, | 1505 | find_vma_prev(struct mm_struct *mm, unsigned long addr, |
1506 | struct vm_area_struct **pprev) | 1506 | struct vm_area_struct **pprev) |
1507 | { | 1507 | { |
1508 | struct vm_area_struct *vma = NULL, *prev = NULL; | 1508 | struct vm_area_struct *vma = NULL, *prev = NULL; |
1509 | struct rb_node *rb_node; | 1509 | struct rb_node *rb_node; |
1510 | if (!mm) | 1510 | if (!mm) |
1511 | goto out; | 1511 | goto out; |
1512 | 1512 | ||
1513 | /* Guard against addr being lower than the first VMA */ | 1513 | /* Guard against addr being lower than the first VMA */ |
1514 | vma = mm->mmap; | 1514 | vma = mm->mmap; |
1515 | 1515 | ||
1516 | /* Go through the RB tree quickly. */ | 1516 | /* Go through the RB tree quickly. */ |
1517 | rb_node = mm->mm_rb.rb_node; | 1517 | rb_node = mm->mm_rb.rb_node; |
1518 | 1518 | ||
1519 | while (rb_node) { | 1519 | while (rb_node) { |
1520 | struct vm_area_struct *vma_tmp; | 1520 | struct vm_area_struct *vma_tmp; |
1521 | vma_tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb); | 1521 | vma_tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb); |
1522 | 1522 | ||
1523 | if (addr < vma_tmp->vm_end) { | 1523 | if (addr < vma_tmp->vm_end) { |
1524 | rb_node = rb_node->rb_left; | 1524 | rb_node = rb_node->rb_left; |
1525 | } else { | 1525 | } else { |
1526 | prev = vma_tmp; | 1526 | prev = vma_tmp; |
1527 | if (!prev->vm_next || (addr < prev->vm_next->vm_end)) | 1527 | if (!prev->vm_next || (addr < prev->vm_next->vm_end)) |
1528 | break; | 1528 | break; |
1529 | rb_node = rb_node->rb_right; | 1529 | rb_node = rb_node->rb_right; |
1530 | } | 1530 | } |
1531 | } | 1531 | } |
1532 | 1532 | ||
1533 | out: | 1533 | out: |
1534 | *pprev = prev; | 1534 | *pprev = prev; |
1535 | return prev ? prev->vm_next : vma; | 1535 | return prev ? prev->vm_next : vma; |
1536 | } | 1536 | } |
1537 | 1537 | ||
/*
 * Verify that the stack growth is acceptable and
 * update accounting. This is shared with both the
 * grow-up and grow-down cases.
 *
 * @vma:  the stack VMA being extended
 * @size: total size of the VMA after growing (not just the increment)
 * @grow: number of pages being added
 *
 * Returns 0 and charges the growth against the mm on success, or a
 * negative errno with no accounting side effects on failure.
 */
static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, unsigned long grow)
{
	struct mm_struct *mm = vma->vm_mm;
	struct rlimit *rlim = current->signal->rlim;
	unsigned long new_start;

	/* address space limit tests */
	if (!may_expand_vm(mm, grow))
		return -ENOMEM;

	/* Stack limit test -- @size is the whole VMA, so this caps total stack */
	if (size > rlim[RLIMIT_STACK].rlim_cur)
		return -ENOMEM;

	/* mlock limit tests (a locked stack grows locked pages too) */
	if (vma->vm_flags & VM_LOCKED) {
		unsigned long locked;
		unsigned long limit;
		locked = mm->locked_vm + grow;
		limit = rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
		if (locked > limit && !capable(CAP_IPC_LOCK))
			return -ENOMEM;
	}

	/* Check to ensure the stack will not grow into a hugetlb-only region */
	new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start :
			vma->vm_end - size;
	if (is_hugepage_only_range(vma->vm_mm, new_start, size))
		return -EFAULT;

	/*
	 * Overcommit.. This must be the final test, as it will
	 * update security statistics.
	 */
	if (security_vm_enough_memory(grow))
		return -ENOMEM;

	/* Ok, everything looks good - let it rip */
	mm->total_vm += grow;
	if (vma->vm_flags & VM_LOCKED)
		mm->locked_vm += grow;
	vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow);
	return 0;
}
1587 | 1587 | ||
#if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64)
/*
 * PA-RISC uses this for its stack; IA64 for its Register Backing Store.
 * vma is the last one with address > vma->vm_end. Have to extend vma.
 *
 * Returns 0 on success, -EFAULT if the VMA is not allowed to grow up,
 * or -ENOMEM on allocation/limit failure or address wrap.
 */
#ifndef CONFIG_IA64
static
#endif
int expand_upwards(struct vm_area_struct *vma, unsigned long address)
{
	int error;

	if (!(vma->vm_flags & VM_GROWSUP))
		return -EFAULT;

	/*
	 * We must make sure the anon_vma is allocated
	 * so that the anon_vma locking is not a noop.
	 */
	if (unlikely(anon_vma_prepare(vma)))
		return -ENOMEM;
	anon_vma_lock(vma);

	/*
	 * vma->vm_start/vm_end cannot change under us because the caller
	 * is required to hold the mmap_sem in read mode. We need the
	 * anon_vma lock to serialize against concurrent expand_stacks.
	 * Also guard against wrapping around to address 0.
	 *
	 * NOTE(review): the +4 presumably keeps the faulting access itself
	 * inside the new end before page-aligning -- confirm against the
	 * callers; if the aligned address wrapped past 0, fail with -ENOMEM.
	 */
	if (address < PAGE_ALIGN(address+4))
		address = PAGE_ALIGN(address+4);
	else {
		anon_vma_unlock(vma);
		return -ENOMEM;
	}
	error = 0;

	/* Somebody else might have raced and expanded it already */
	if (address > vma->vm_end) {
		unsigned long size, grow;

		size = address - vma->vm_start;
		grow = (address - vma->vm_end) >> PAGE_SHIFT;

		/* acct_stack_growth() enforces limits and charges the pages */
		error = acct_stack_growth(vma, size, grow);
		if (!error)
			vma->vm_end = address;
	}
	anon_vma_unlock(vma);
	return error;
}
#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
1640 | 1640 | ||
/*
 * vma is the first one with address < vma->vm_start. Have to extend vma.
 *
 * Returns 0 on success or a negative errno; on success vm_start and
 * vm_pgoff are moved down so the VMA covers the page at @address.
 */
static int expand_downwards(struct vm_area_struct *vma,
				   unsigned long address)
{
	int error;

	/*
	 * We must make sure the anon_vma is allocated
	 * so that the anon_vma locking is not a noop.
	 */
	if (unlikely(anon_vma_prepare(vma)))
		return -ENOMEM;

	address &= PAGE_MASK;
	/*
	 * NOTE(review): the security hook is given the page-aligned target
	 * address -- presumably so an LSM can veto growth into protected
	 * low memory; confirm against security_file_mmap() semantics.
	 */
	error = security_file_mmap(NULL, 0, 0, 0, address, 1);
	if (error)
		return error;

	anon_vma_lock(vma);

	/*
	 * vma->vm_start/vm_end cannot change under us because the caller
	 * is required to hold the mmap_sem in read mode. We need the
	 * anon_vma lock to serialize against concurrent expand_stacks.
	 */

	/* Somebody else might have raced and expanded it already */
	if (address < vma->vm_start) {
		unsigned long size, grow;

		size = vma->vm_end - address;
		grow = (vma->vm_start - address) >> PAGE_SHIFT;

		error = acct_stack_growth(vma, size, grow);
		if (!error) {
			vma->vm_start = address;
			/* keep file offset consistent with the new start */
			vma->vm_pgoff -= grow;
		}
	}
	anon_vma_unlock(vma);
	return error;
}
1685 | 1685 | ||
/*
 * Grow a VMA downwards so it covers @address.  Thin, always-available
 * wrapper around expand_downwards() for callers that need downward
 * expansion regardless of CONFIG_STACK_GROWSUP.
 */
int expand_stack_downwards(struct vm_area_struct *vma, unsigned long address)
{
	return expand_downwards(vma, address);
}
1690 | 1690 | ||
#ifdef CONFIG_STACK_GROWSUP
/* On grow-up architectures the stack is extended upwards. */
int expand_stack(struct vm_area_struct *vma, unsigned long address)
{
	return expand_upwards(vma, address);
}

/*
 * Find the VMA covering @addr, growing the preceding stack VMA up to
 * cover it when necessary.  Returns NULL if @addr cannot be covered.
 */
struct vm_area_struct *
find_extend_vma(struct mm_struct *mm, unsigned long addr)
{
	struct vm_area_struct *vma, *prev;

	addr &= PAGE_MASK;
	vma = find_vma_prev(mm, addr, &prev);
	if (vma && (vma->vm_start <= addr))
		return vma;
	if (!prev || expand_stack(prev, addr))
		return NULL;
	if (prev->vm_flags & VM_LOCKED) {
		/* fault in and lock the newly exposed pages */
		if (mlock_vma_pages_range(prev, addr, prev->vm_end) < 0)
			return NULL;	/* vma gone! */
	}
	return prev;
}
#else
/* Common case: the stack is extended downwards. */
int expand_stack(struct vm_area_struct *vma, unsigned long address)
{
	return expand_downwards(vma, address);
}

/*
 * Find the VMA covering @addr, growing a VM_GROWSDOWN VMA down to
 * cover it when necessary.  Returns NULL if @addr cannot be covered.
 */
struct vm_area_struct *
find_extend_vma(struct mm_struct * mm, unsigned long addr)
{
	struct vm_area_struct * vma;
	unsigned long start;

	addr &= PAGE_MASK;
	vma = find_vma(mm,addr);
	if (!vma)
		return NULL;
	if (vma->vm_start <= addr)
		return vma;
	if (!(vma->vm_flags & VM_GROWSDOWN))
		return NULL;
	/* remember the old start: expand_stack() moves vm_start down */
	start = vma->vm_start;
	if (expand_stack(vma, addr))
		return NULL;
	if (vma->vm_flags & VM_LOCKED) {
		/* fault in and lock the range exposed below the old start */
		if (mlock_vma_pages_range(vma, addr, start) < 0)
			return NULL;	/* vma gone! */
	}
	return vma;
}
#endif
1744 | 1744 | ||
1745 | /* | 1745 | /* |
1746 | * Ok - we have the memory areas we should free on the vma list, | 1746 | * Ok - we have the memory areas we should free on the vma list, |
1747 | * so release them, and do the vma updates. | 1747 | * so release them, and do the vma updates. |
1748 | * | 1748 | * |
1749 | * Called with the mm semaphore held. | 1749 | * Called with the mm semaphore held. |
1750 | */ | 1750 | */ |
1751 | static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) | 1751 | static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) |
1752 | { | 1752 | { |
1753 | /* Update high watermark before we lower total_vm */ | 1753 | /* Update high watermark before we lower total_vm */ |
1754 | update_hiwater_vm(mm); | 1754 | update_hiwater_vm(mm); |
1755 | do { | 1755 | do { |
1756 | long nrpages = vma_pages(vma); | 1756 | long nrpages = vma_pages(vma); |
1757 | 1757 | ||
1758 | mm->total_vm -= nrpages; | 1758 | mm->total_vm -= nrpages; |
1759 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); | 1759 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); |
1760 | vma = remove_vma(vma); | 1760 | vma = remove_vma(vma); |
1761 | } while (vma); | 1761 | } while (vma); |
1762 | validate_mm(mm); | 1762 | validate_mm(mm); |
1763 | } | 1763 | } |
1764 | 1764 | ||
/*
 * Get rid of page table information in the indicated region.
 *
 * Tears down the user mappings in [start, end), returns the memory
 * accounting for VM_ACCOUNT areas, and frees any page tables that are
 * now unused between the neighbouring VMAs.
 *
 * Called with the mm semaphore held.
 */
static void unmap_region(struct mm_struct *mm,
		struct vm_area_struct *vma, struct vm_area_struct *prev,
		unsigned long start, unsigned long end)
{
	struct vm_area_struct *next = prev? prev->vm_next: mm->mmap;
	struct mmu_gather *tlb;
	unsigned long nr_accounted = 0;

	/* flush per-CPU LRU pagevecs before tearing pages out */
	lru_add_drain();
	tlb = tlb_gather_mmu(mm, 0);
	/* snapshot the RSS high watermark before it drops */
	update_hiwater_rss(mm);
	unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
	/* return the VM_ACCOUNT pages counted up by unmap_vmas() */
	vm_unacct_memory(nr_accounted);
	/* free page tables spanning from prev's end to next's start */
	free_pgtables(tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
				 next? next->vm_start: 0);
	tlb_finish_mmu(tlb, start, end);
}
1787 | 1787 | ||
/*
 * Create a list of vma's touched by the unmap, removing them from the mm's
 * vma list as we go..
 *
 * On return the detached VMAs form a NULL-terminated singly linked list
 * headed by @vma; each has been erased from the rbtree and subtracted
 * from map_count, and mm->mmap (or prev->vm_next) skips over them.
 */
static void
detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
	struct vm_area_struct *prev, unsigned long end)
{
	struct vm_area_struct **insertion_point;
	struct vm_area_struct *tail_vma = NULL;
	unsigned long addr;

	insertion_point = (prev ? &prev->vm_next : &mm->mmap);
	do {
		rb_erase(&vma->vm_rb, &mm->mm_rb);
		mm->map_count--;
		tail_vma = vma;
		vma = vma->vm_next;
	} while (vma && vma->vm_start < end);
	/* bridge the list over the detached run and terminate its tail */
	*insertion_point = vma;
	tail_vma->vm_next = NULL;
	/*
	 * Tell the free-area hint machinery where space opened up; which
	 * edge of the hole matters depends on the unmap_area strategy.
	 */
	if (mm->unmap_area == arch_unmap_area)
		addr = prev ? prev->vm_end : mm->mmap_base;
	else
		addr = vma ? vma->vm_start : mm->mmap_base;
	mm->unmap_area(mm, addr);
	mm->mmap_cache = NULL;		/* Kill the cache. */
}
1816 | 1816 | ||
/*
 * Split a vma into two pieces at address 'addr', a new vma is allocated
 * either for the first part or the tail.
 *
 * @new_below: non-zero means the new VMA covers [vm_start, addr) and the
 * original keeps the tail; zero means the new VMA covers [addr, vm_end).
 *
 * Returns 0 on success or a negative errno; on failure the original
 * VMA is left untouched.
 */
int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
	      unsigned long addr, int new_below)
{
	struct mempolicy *pol;
	struct vm_area_struct *new;

	/* hugetlb VMAs may only be split on huge-page boundaries */
	if (is_vm_hugetlb_page(vma) && (addr &
					~(huge_page_mask(hstate_vma(vma)))))
		return -EINVAL;

	if (mm->map_count >= sysctl_max_map_count)
		return -ENOMEM;

	new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
	if (!new)
		return -ENOMEM;

	/* most fields are the same, copy all, and then fixup */
	*new = *vma;

	if (new_below)
		new->vm_end = addr;
	else {
		new->vm_start = addr;
		new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
	}

	pol = mpol_dup(vma_policy(vma));
	if (IS_ERR(pol)) {
		kmem_cache_free(vm_area_cachep, new);
		return PTR_ERR(pol);
	}
	vma_set_policy(new, pol);

	/* take references for everything the copied VMA now shares */
	if (new->vm_file) {
		get_file(new->vm_file);
		if (vma->vm_flags & VM_EXECUTABLE)
			added_exe_file_vma(mm);
	}

	if (new->vm_ops && new->vm_ops->open)
		new->vm_ops->open(new);

	/* shrink the original and link the new VMA into the mm's lists */
	if (new_below)
		vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff +
			((addr - new->vm_start) >> PAGE_SHIFT), new);
	else
		vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);

	return 0;
}
1872 | 1872 | ||
/* Munmap is split into 2 main parts -- this part which finds
 * what needs doing, and the areas themselves, which do the
 * work. This now handles partial unmappings.
 * Jeremy Fitzhardinge <jeremy@goop.org>
 *
 * Returns 0 on success (including when nothing overlaps the range) or
 * a negative errno.  Caller must hold mm->mmap_sem for writing.
 */
int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
{
	unsigned long end;
	struct vm_area_struct *vma, *prev, *last;

	/* reject unaligned starts and ranges outside the user area */
	if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start)
		return -EINVAL;

	/* a zero-length (after page rounding) unmap is invalid */
	if ((len = PAGE_ALIGN(len)) == 0)
		return -EINVAL;

	/* Find the first overlapping VMA */
	vma = find_vma_prev(mm, start, &prev);
	if (!vma)
		return 0;
	/* we have start < vma->vm_end */

	/* if it doesn't overlap, we have nothing.. */
	end = start + len;
	if (vma->vm_start >= end)
		return 0;

	/*
	 * If we need to split any vma, do it now to save pain later.
	 *
	 * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially
	 * unmapped vm_area_struct will remain in use: so lower split_vma
	 * places tmp vma above, and higher split_vma places tmp vma below.
	 */
	if (start > vma->vm_start) {
		int error = split_vma(mm, vma, start, 0);
		if (error)
			return error;
		prev = vma;
	}

	/* Does it split the last one? */
	last = find_vma(mm, end);
	if (last && end > last->vm_start) {
		int error = split_vma(mm, last, end, 1);
		if (error)
			return error;
	}
	/* re-resolve: splitting may have changed the first VMA to unmap */
	vma = prev? prev->vm_next: mm->mmap;

	/*
	 * unlock any mlock()ed ranges before detaching vmas
	 */
	if (mm->locked_vm) {
		struct vm_area_struct *tmp = vma;
		while (tmp && tmp->vm_start < end) {
			if (tmp->vm_flags & VM_LOCKED) {
				mm->locked_vm -= vma_pages(tmp);
				munlock_vma_pages_all(tmp);
			}
			tmp = tmp->vm_next;
		}
	}

	/*
	 * Remove the vma's, and unmap the actual pages
	 */
	detach_vmas_to_be_unmapped(mm, vma, prev, end);
	unmap_region(mm, vma, prev, start, end);

	/* Fix up all other VM information */
	remove_vma_list(mm, vma);

	return 0;
}
1948 | 1948 | ||
1949 | EXPORT_SYMBOL(do_munmap); | 1949 | EXPORT_SYMBOL(do_munmap); |
1950 | 1950 | ||
1951 | asmlinkage long sys_munmap(unsigned long addr, size_t len) | 1951 | asmlinkage long sys_munmap(unsigned long addr, size_t len) |
1952 | { | 1952 | { |
1953 | int ret; | 1953 | int ret; |
1954 | struct mm_struct *mm = current->mm; | 1954 | struct mm_struct *mm = current->mm; |
1955 | 1955 | ||
1956 | profile_munmap(addr); | 1956 | profile_munmap(addr); |
1957 | 1957 | ||
1958 | down_write(&mm->mmap_sem); | 1958 | down_write(&mm->mmap_sem); |
1959 | ret = do_munmap(mm, addr, len); | 1959 | ret = do_munmap(mm, addr, len); |
1960 | up_write(&mm->mmap_sem); | 1960 | up_write(&mm->mmap_sem); |
1961 | return ret; | 1961 | return ret; |
1962 | } | 1962 | } |
1963 | 1963 | ||
/*
 * Debug aid (CONFIG_DEBUG_VM only): warn if the caller does not hold
 * mm->mmap_sem for writing.  Compiles to a no-op otherwise.
 */
static inline void verify_mm_writelocked(struct mm_struct *mm)
{
#ifdef CONFIG_DEBUG_VM
	/*
	 * If we can acquire the semaphore for reading, then nobody --
	 * including us -- holds it for writing: locking contract broken.
	 */
	if (unlikely(down_read_trylock(&mm->mmap_sem))) {
		WARN_ON(1);
		up_read(&mm->mmap_sem);
	}
#endif
}
1973 | 1973 | ||
/*
 * this is really a simplified "do_mmap".  it only handles
 * anonymous maps.  eventually we may be able to do some
 * brk-specific accounting here.
 *
 * Caller must hold mm->mmap_sem for writing (verified below).  On
 * success returns @addr; on failure returns a negative errno.  A zero
 * (page-aligned) length is a successful no-op.
 */
unsigned long do_brk(unsigned long addr, unsigned long len)
{
	struct mm_struct * mm = current->mm;
	struct vm_area_struct * vma, * prev;
	unsigned long flags;
	struct rb_node ** rb_link, * rb_parent;
	pgoff_t pgoff = addr >> PAGE_SHIFT;
	int error;

	len = PAGE_ALIGN(len);
	if (!len)
		return addr;

	/* Reject ranges beyond the user address space or that wrap around. */
	if ((addr + len) > TASK_SIZE || (addr + len) < addr)
		return -EINVAL;

	/* brk regions may not live in hugepage-only address ranges. */
	if (is_hugepage_only_range(mm, addr, len))
		return -EINVAL;

	/* Let the security module veto the mapping (NULL file, anon). */
	error = security_file_mmap(NULL, 0, 0, 0, addr, 1);
	if (error)
		return error;

	/* Anonymous read/write data mapping, charged against commit limit. */
	flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;

	error = arch_mmap_check(addr, len, flags);
	if (error)
		return error;

	/*
	 * mlock MCL_FUTURE?
	 *
	 * If future mappings are to be locked, check the new total of
	 * locked pages against RLIMIT_MEMLOCK (unless CAP_IPC_LOCK).
	 */
	if (mm->def_flags & VM_LOCKED) {
		unsigned long locked, lock_limit;
		locked = len >> PAGE_SHIFT;
		locked += mm->locked_vm;
		lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
		lock_limit >>= PAGE_SHIFT;
		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
			return -EAGAIN;
	}

	/*
	 * mm->mmap_sem is required to protect against another thread
	 * changing the mappings in case we sleep.
	 */
	verify_mm_writelocked(mm);

	/*
	 * Clear old maps.  this also does some error checking for us.
	 * do_munmap() may split VMAs and drop more than the exact range,
	 * so re-lookup until the target window is genuinely empty.
	 */
 munmap_back:
	vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
	if (vma && vma->vm_start < addr + len) {
		if (do_munmap(mm, addr, len))
			return -ENOMEM;
		goto munmap_back;
	}

	/* Check against address space limits *after* clearing old maps... */
	if (!may_expand_vm(mm, len >> PAGE_SHIFT))
		return -ENOMEM;

	if (mm->map_count > sysctl_max_map_count)
		return -ENOMEM;

	/* Charge the pages against the overcommit accounting. */
	if (security_vm_enough_memory(len >> PAGE_SHIFT))
		return -ENOMEM;

	/* Can we just expand an old private anonymous mapping? */
	vma = vma_merge(mm, prev, addr, addr + len, flags,
					NULL, NULL, pgoff, NULL);
	if (vma)
		goto out;

	/*
	 * create a vma struct for an anonymous mapping
	 */
	vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
	if (!vma) {
		/* Undo the security_vm_enough_memory() charge above. */
		vm_unacct_memory(len >> PAGE_SHIFT);
		return -ENOMEM;
	}

	vma->vm_mm = mm;
	vma->vm_start = addr;
	vma->vm_end = addr + len;
	vma->vm_pgoff = pgoff;
	vma->vm_flags = flags;
	vma->vm_page_prot = vm_get_page_prot(flags);
	vma_link(mm, vma, prev, rb_link, rb_parent);
out:
	mm->total_vm += len >> PAGE_SHIFT;
	if (flags & VM_LOCKED) {
		/* Fault the pages in now; count them as locked on success. */
		if (!mlock_vma_pages_range(vma, addr, addr + len))
			mm->locked_vm += (len >> PAGE_SHIFT);
	}
	return addr;
}

EXPORT_SYMBOL(do_brk);
2080 | 2080 | ||
/* Release all mmaps. */
/*
 * Called when the last user of @mm goes away.  Tears down every VMA:
 * munlocks locked pages, unmaps everything, frees page tables, then
 * walks the VMA list a second time to close and free the structures.
 * No locks are needed — nobody else can reference this mm any more.
 */
void exit_mmap(struct mm_struct *mm)
{
	struct mmu_gather *tlb;
	struct vm_area_struct *vma;
	unsigned long nr_accounted = 0;
	unsigned long end;

	/* mm's last user has gone, and its about to be pulled down */
	arch_exit_mmap(mm);
	mmu_notifier_release(mm);

	if (!mm->mmap)	/* Can happen if dup_mmap() received an OOM */
		return;

	/*
	 * Munlock any still-locked VMAs first, while the page tables are
	 * still intact, so the pages can be moved back onto the LRU.
	 */
	if (mm->locked_vm) {
		vma = mm->mmap;
		while (vma) {
			if (vma->vm_flags & VM_LOCKED)
				munlock_vma_pages_all(vma);
			vma = vma->vm_next;
		}
	}
	vma = mm->mmap;
	lru_add_drain();
	flush_cache_mm(mm);
	tlb = tlb_gather_mmu(mm, 1);
	/* update_hiwater_rss(mm) here? but nobody should be looking */
	/* Use -1 here to ensure all VMAs in the mm are unmapped */
	end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
	vm_unacct_memory(nr_accounted);
	free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0);
	tlb_finish_mmu(tlb, 0, end);

	/*
	 * Walk the list again, actually closing and freeing it,
	 * with preemption enabled, without holding any MM locks.
	 */
	while (vma)
		vma = remove_vma(vma);

	/* All page tables should be gone by now. */
	BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
}
2124 | 2124 | ||
/* Insert vm structure into process list sorted by address
 * and into the inode's i_mmap tree.  If vm_file is non-NULL
 * then i_mmap_lock is taken here.
 *
 * Returns 0 on success, -ENOMEM if the range overlaps an existing
 * VMA or the accounting charge fails.
 */
int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
{
	struct vm_area_struct * __vma, * prev;
	struct rb_node ** rb_link, * rb_parent;

	/*
	 * The vm_pgoff of a purely anonymous vma should be irrelevant
	 * until its first write fault, when page's anon_vma and index
	 * are set.  But now set the vm_pgoff it will almost certainly
	 * end up with (unless mremap moves it elsewhere before that
	 * first wfault), so /proc/pid/maps tells a consistent story.
	 *
	 * By setting it to reflect the virtual start address of the
	 * vma, merges and splits can happen in a seamless way, just
	 * using the existing file pgoff checks and manipulations.
	 * Similarly in do_mmap_pgoff and in do_brk.
	 */
	if (!vma->vm_file) {
		BUG_ON(vma->anon_vma);
		vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
	}
	/* Find the insertion point; refuse if the range is occupied. */
	__vma = find_vma_prepare(mm,vma->vm_start,&prev,&rb_link,&rb_parent);
	if (__vma && __vma->vm_start < vma->vm_end)
		return -ENOMEM;
	/* Accountable mappings must be charged before becoming visible. */
	if ((vma->vm_flags & VM_ACCOUNT) &&
	     security_vm_enough_memory_mm(mm, vma_pages(vma)))
		return -ENOMEM;
	vma_link(mm, vma, prev, rb_link, rb_parent);
	return 0;
}
2159 | 2159 | ||
/*
 * Copy the vma structure to a new location in the same mm,
 * prior to moving page table entries, to effect an mremap move.
 *
 * Either merges the region into a neighbouring VMA at the new address
 * (possibly updating *vmap if the source vma itself was absorbed), or
 * allocates and links a fresh copy.  Returns the VMA covering the new
 * range, or NULL on allocation failure.
 */
struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
	unsigned long addr, unsigned long len, pgoff_t pgoff)
{
	struct vm_area_struct *vma = *vmap;
	unsigned long vma_start = vma->vm_start;
	struct mm_struct *mm = vma->vm_mm;
	struct vm_area_struct *new_vma, *prev;
	struct rb_node **rb_link, *rb_parent;
	struct mempolicy *pol;

	/*
	 * If anonymous vma has not yet been faulted, update new pgoff
	 * to match new location, to increase its chance of merging.
	 */
	if (!vma->vm_file && !vma->anon_vma)
		pgoff = addr >> PAGE_SHIFT;

	find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
	new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
			vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
	if (new_vma) {
		/*
		 * Source vma may have been merged into new_vma
		 */
		if (vma_start >= new_vma->vm_start &&
		    vma_start < new_vma->vm_end)
			*vmap = new_vma;
	} else {
		new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
		if (new_vma) {
			*new_vma = *vma;
			/* The copy needs its own NUMA policy reference. */
			pol = mpol_dup(vma_policy(vma));
			if (IS_ERR(pol)) {
				kmem_cache_free(vm_area_cachep, new_vma);
				return NULL;
			}
			vma_set_policy(new_vma, pol);
			new_vma->vm_start = addr;
			new_vma->vm_end = addr + len;
			new_vma->vm_pgoff = pgoff;
			if (new_vma->vm_file) {
				/* Copied VMA holds its own file reference. */
				get_file(new_vma->vm_file);
				if (vma->vm_flags & VM_EXECUTABLE)
					added_exe_file_vma(mm);
			}
			if (new_vma->vm_ops && new_vma->vm_ops->open)
				new_vma->vm_ops->open(new_vma);
			vma_link(mm, new_vma, prev, rb_link, rb_parent);
		}
	}
	return new_vma;
}
2216 | 2216 | ||
2217 | /* | 2217 | /* |
2218 | * Return true if the calling process may expand its vm space by the passed | 2218 | * Return true if the calling process may expand its vm space by the passed |
2219 | * number of pages | 2219 | * number of pages |
2220 | */ | 2220 | */ |
2221 | int may_expand_vm(struct mm_struct *mm, unsigned long npages) | 2221 | int may_expand_vm(struct mm_struct *mm, unsigned long npages) |
2222 | { | 2222 | { |
2223 | unsigned long cur = mm->total_vm; /* pages */ | 2223 | unsigned long cur = mm->total_vm; /* pages */ |
2224 | unsigned long lim; | 2224 | unsigned long lim; |
2225 | 2225 | ||
2226 | lim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; | 2226 | lim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; |
2227 | 2227 | ||
2228 | if (cur + npages > lim) | 2228 | if (cur + npages > lim) |
2229 | return 0; | 2229 | return 0; |
2230 | return 1; | 2230 | return 1; |
2231 | } | 2231 | } |
2232 | 2232 | ||
2233 | 2233 | ||
2234 | static int special_mapping_fault(struct vm_area_struct *vma, | 2234 | static int special_mapping_fault(struct vm_area_struct *vma, |
2235 | struct vm_fault *vmf) | 2235 | struct vm_fault *vmf) |
2236 | { | 2236 | { |
2237 | pgoff_t pgoff; | 2237 | pgoff_t pgoff; |
2238 | struct page **pages; | 2238 | struct page **pages; |
2239 | 2239 | ||
2240 | /* | 2240 | /* |
2241 | * special mappings have no vm_file, and in that case, the mm | 2241 | * special mappings have no vm_file, and in that case, the mm |
2242 | * uses vm_pgoff internally. So we have to subtract it from here. | 2242 | * uses vm_pgoff internally. So we have to subtract it from here. |
2243 | * We are allowed to do this because we are the mm; do not copy | 2243 | * We are allowed to do this because we are the mm; do not copy |
2244 | * this code into drivers! | 2244 | * this code into drivers! |
2245 | */ | 2245 | */ |
2246 | pgoff = vmf->pgoff - vma->vm_pgoff; | 2246 | pgoff = vmf->pgoff - vma->vm_pgoff; |
2247 | 2247 | ||
2248 | for (pages = vma->vm_private_data; pgoff && *pages; ++pages) | 2248 | for (pages = vma->vm_private_data; pgoff && *pages; ++pages) |
2249 | pgoff--; | 2249 | pgoff--; |
2250 | 2250 | ||
2251 | if (*pages) { | 2251 | if (*pages) { |
2252 | struct page *page = *pages; | 2252 | struct page *page = *pages; |
2253 | get_page(page); | 2253 | get_page(page); |
2254 | vmf->page = page; | 2254 | vmf->page = page; |
2255 | return 0; | 2255 | return 0; |
2256 | } | 2256 | } |
2257 | 2257 | ||
2258 | return VM_FAULT_SIGBUS; | 2258 | return VM_FAULT_SIGBUS; |
2259 | } | 2259 | } |
2260 | 2260 | ||
/*
 * Having a close hook prevents vma merging regardless of flags.
 * The body is intentionally empty — only the hook's existence matters.
 */
static void special_mapping_close(struct vm_area_struct *vma)
{
}
2267 | 2267 | ||
/* VM operations shared by all special mappings installed below. */
static struct vm_operations_struct special_mapping_vmops = {
	.close = special_mapping_close,
	.fault = special_mapping_fault,
};
2272 | 2272 | ||
2273 | /* | 2273 | /* |
2274 | * Called with mm->mmap_sem held for writing. | 2274 | * Called with mm->mmap_sem held for writing. |
2275 | * Insert a new vma covering the given region, with the given flags. | 2275 | * Insert a new vma covering the given region, with the given flags. |
2276 | * Its pages are supplied by the given array of struct page *. | 2276 | * Its pages are supplied by the given array of struct page *. |
2277 | * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated. | 2277 | * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated. |
2278 | * The region past the last page supplied will always produce SIGBUS. | 2278 | * The region past the last page supplied will always produce SIGBUS. |
2279 | * The array pointer and the pages it points to are assumed to stay alive | 2279 | * The array pointer and the pages it points to are assumed to stay alive |
2280 | * for as long as this mapping might exist. | 2280 | * for as long as this mapping might exist. |
2281 | */ | 2281 | */ |
2282 | int install_special_mapping(struct mm_struct *mm, | 2282 | int install_special_mapping(struct mm_struct *mm, |
2283 | unsigned long addr, unsigned long len, | 2283 | unsigned long addr, unsigned long len, |
2284 | unsigned long vm_flags, struct page **pages) | 2284 | unsigned long vm_flags, struct page **pages) |
2285 | { | 2285 | { |
2286 | struct vm_area_struct *vma; | 2286 | struct vm_area_struct *vma; |
2287 | 2287 | ||
2288 | vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); | 2288 | vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); |
2289 | if (unlikely(vma == NULL)) | 2289 | if (unlikely(vma == NULL)) |
2290 | return -ENOMEM; | 2290 | return -ENOMEM; |
2291 | 2291 | ||
2292 | vma->vm_mm = mm; | 2292 | vma->vm_mm = mm; |
2293 | vma->vm_start = addr; | 2293 | vma->vm_start = addr; |
2294 | vma->vm_end = addr + len; | 2294 | vma->vm_end = addr + len; |
2295 | 2295 | ||
2296 | vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND; | 2296 | vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND; |
2297 | vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); | 2297 | vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); |
2298 | 2298 | ||
2299 | vma->vm_ops = &special_mapping_vmops; | 2299 | vma->vm_ops = &special_mapping_vmops; |
2300 | vma->vm_private_data = pages; | 2300 | vma->vm_private_data = pages; |
2301 | 2301 | ||
2302 | if (unlikely(insert_vm_struct(mm, vma))) { | 2302 | if (unlikely(insert_vm_struct(mm, vma))) { |
2303 | kmem_cache_free(vm_area_cachep, vma); | 2303 | kmem_cache_free(vm_area_cachep, vma); |
2304 | return -ENOMEM; | 2304 | return -ENOMEM; |
2305 | } | 2305 | } |
2306 | 2306 | ||
2307 | mm->total_vm += len >> PAGE_SHIFT; | 2307 | mm->total_vm += len >> PAGE_SHIFT; |
2308 | 2308 | ||
2309 | return 0; | 2309 | return 0; |
2310 | } | 2310 | } |
2311 | 2311 | ||
2312 | static DEFINE_MUTEX(mm_all_locks_mutex); | 2312 | static DEFINE_MUTEX(mm_all_locks_mutex); |
2313 | 2313 | ||
/*
 * Take anon_vma->lock on behalf of mm_take_all_locks(), at most once
 * per anon_vma.  The LSB of anon_vma->head.next is (ab)used as the
 * "already locked by us" marker.
 */
static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
{
	if (!test_bit(0, (unsigned long *) &anon_vma->head.next)) {
		/*
		 * The LSB of head.next can't change from under us
		 * because we hold the mm_all_locks_mutex.
		 */
		spin_lock_nest_lock(&anon_vma->lock, &mm->mmap_sem);
		/*
		 * We can safely modify head.next after taking the
		 * anon_vma->lock.  If some other vma in this mm shares
		 * the same anon_vma we won't take it again.
		 *
		 * No need of atomic instructions here, head.next
		 * can't change from under us thanks to the
		 * anon_vma->lock.
		 */
		if (__test_and_set_bit(0, (unsigned long *)
				       &anon_vma->head.next))
			BUG();
	}
}
2336 | 2336 | ||
/*
 * Take mapping->i_mmap_lock on behalf of mm_take_all_locks(), at most
 * once per address_space, using the AS_MM_ALL_LOCKS flag bit as the
 * "already locked by us" marker.
 */
static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
{
	if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
		/*
		 * AS_MM_ALL_LOCKS can't change from under us because
		 * we hold the mm_all_locks_mutex.
		 *
		 * Operations on ->flags have to be atomic because
		 * even if AS_MM_ALL_LOCKS is stable thanks to the
		 * mm_all_locks_mutex, there may be other cpus
		 * changing other bitflags in parallel to us.
		 */
		if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
			BUG();
		spin_lock_nest_lock(&mapping->i_mmap_lock, &mm->mmap_sem);
	}
}
2354 | 2354 | ||
/*
 * This operation locks against the VM for all pte/vma/mm related
 * operations that could ever happen on a certain mm.  This includes
 * vmtruncate, try_to_unmap, and all page faults.
 *
 * The caller must take the mmap_sem in write mode before calling
 * mm_take_all_locks().  The caller isn't allowed to release the
 * mmap_sem until mm_drop_all_locks() returns.
 *
 * mmap_sem in write mode is required in order to block all operations
 * that could modify pagetables and free pages without need of
 * altering the vma layout (for example populate_range() with
 * nonlinear vmas).  It's also needed in write mode to avoid new
 * anon_vmas to be associated with existing vmas.
 *
 * A single task can't take more than one mm_take_all_locks() in a row
 * or it would deadlock.
 *
 * The LSB in anon_vma->head.next and the AS_MM_ALL_LOCKS bitflag in
 * mapping->flags avoid to take the same lock twice, if more than one
 * vma in this mm is backed by the same anon_vma or address_space.
 *
 * We can take all the locks in random order because the VM code
 * taking i_mmap_lock or anon_vma->lock outside the mmap_sem never
 * takes more than one of them in a row.  Secondly we're protected
 * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex.
 *
 * mm_take_all_locks() and mm_drop_all_locks are expensive operations
 * that may have to take thousand of locks.
 *
 * mm_take_all_locks() can fail if it's interrupted by signals.
 *
 * Returns 0 on success, -EINTR if a pending signal aborted the walk
 * (in which case all locks taken so far have been dropped again).
 */
int mm_take_all_locks(struct mm_struct *mm)
{
	struct vm_area_struct *vma;
	int ret = -EINTR;

	/* Sanity check: caller must already write-hold mmap_sem. */
	BUG_ON(down_read_trylock(&mm->mmap_sem));

	mutex_lock(&mm_all_locks_mutex);

	/* Pass 1: lock every distinct file mapping's i_mmap_lock. */
	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		if (signal_pending(current))
			goto out_unlock;
		if (vma->vm_file && vma->vm_file->f_mapping)
			vm_lock_mapping(mm, vma->vm_file->f_mapping);
	}

	/* Pass 2: lock every distinct anon_vma. */
	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		if (signal_pending(current))
			goto out_unlock;
		if (vma->anon_vma)
			vm_lock_anon_vma(mm, vma->anon_vma);
	}

	ret = 0;

out_unlock:
	/* On failure, undo the partial locking before returning. */
	if (ret)
		mm_drop_all_locks(mm);

	return ret;
}
2418 | 2418 | ||
/*
 * Counterpart to vm_lock_anon_vma(): release anon_vma->lock if we
 * marked it as taken, clearing the LSB marker first so that no other
 * user of anon_vma->head can ever observe it set.
 */
static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
{
	if (test_bit(0, (unsigned long *) &anon_vma->head.next)) {
		/*
		 * The LSB of head.next can't change to 0 from under
		 * us because we hold the mm_all_locks_mutex.
		 *
		 * We must however clear the bitflag before unlocking
		 * the vma so the users using the anon_vma->head will
		 * never see our bitflag.
		 *
		 * No need of atomic instructions here, head.next
		 * can't change from under us until we release the
		 * anon_vma->lock.
		 */
		if (!__test_and_clear_bit(0, (unsigned long *)
					  &anon_vma->head.next))
			BUG();
		spin_unlock(&anon_vma->lock);
	}
}
2440 | 2440 | ||
2441 | static void vm_unlock_mapping(struct address_space *mapping) | 2441 | static void vm_unlock_mapping(struct address_space *mapping) |
2442 | { | 2442 | { |
2443 | if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { | 2443 | if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { |
2444 | /* | 2444 | /* |
2445 | * AS_MM_ALL_LOCKS can't change to 0 from under us | 2445 | * AS_MM_ALL_LOCKS can't change to 0 from under us |
2446 | * because we hold the mm_all_locks_mutex. | 2446 | * because we hold the mm_all_locks_mutex. |
2447 | */ | 2447 | */ |
2448 | spin_unlock(&mapping->i_mmap_lock); | 2448 | spin_unlock(&mapping->i_mmap_lock); |
2449 | if (!test_and_clear_bit(AS_MM_ALL_LOCKS, | 2449 | if (!test_and_clear_bit(AS_MM_ALL_LOCKS, |
2450 | &mapping->flags)) | 2450 | &mapping->flags)) |
2451 | BUG(); | 2451 | BUG(); |
2452 | } | 2452 | } |
2453 | } | 2453 | } |
2454 | 2454 | ||
2455 | /* | 2455 | /* |
2456 | * The mmap_sem cannot be released by the caller until | 2456 | * The mmap_sem cannot be released by the caller until |
2457 | * mm_drop_all_locks() returns. | 2457 | * mm_drop_all_locks() returns. |
2458 | */ | 2458 | */ |
2459 | void mm_drop_all_locks(struct mm_struct *mm) | 2459 | void mm_drop_all_locks(struct mm_struct *mm) |
2460 | { | 2460 | { |
2461 | struct vm_area_struct *vma; | 2461 | struct vm_area_struct *vma; |
2462 | 2462 | ||
2463 | BUG_ON(down_read_trylock(&mm->mmap_sem)); | 2463 | BUG_ON(down_read_trylock(&mm->mmap_sem)); |
2464 | BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); | 2464 | BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); |
2465 | 2465 | ||
2466 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | 2466 | for (vma = mm->mmap; vma; vma = vma->vm_next) { |
2467 | if (vma->anon_vma) | 2467 | if (vma->anon_vma) |
2468 | vm_unlock_anon_vma(vma->anon_vma); | 2468 | vm_unlock_anon_vma(vma->anon_vma); |
2469 | if (vma->vm_file && vma->vm_file->f_mapping) | 2469 | if (vma->vm_file && vma->vm_file->f_mapping) |
2470 | vm_unlock_mapping(vma->vm_file->f_mapping); | 2470 | vm_unlock_mapping(vma->vm_file->f_mapping); |
2471 | } | 2471 | } |
2472 | 2472 | ||
2473 | mutex_unlock(&mm_all_locks_mutex); | 2473 | mutex_unlock(&mm_all_locks_mutex); |
2474 | } | 2474 | } |
2475 | |||
2476 | /* | ||
2477 | * initialise the VMA slab | ||
2478 | */ | ||
2479 | void __init mmap_init(void) | ||
2480 | { | ||
2481 | vm_area_cachep = kmem_cache_create("vm_area_struct", | ||
2482 | sizeof(struct vm_area_struct), 0, | ||
2483 | SLAB_PANIC, NULL); | ||
2484 | } | ||
2475 | 2485 |
mm/nommu.c
1 | /* | 1 | /* |
2 | * linux/mm/nommu.c | 2 | * linux/mm/nommu.c |
3 | * | 3 | * |
4 | * Replacement code for mm functions to support CPU's that don't | 4 | * Replacement code for mm functions to support CPU's that don't |
5 | * have any form of memory management unit (thus no virtual memory). | 5 | * have any form of memory management unit (thus no virtual memory). |
6 | * | 6 | * |
7 | * See Documentation/nommu-mmap.txt | 7 | * See Documentation/nommu-mmap.txt |
8 | * | 8 | * |
9 | * Copyright (c) 2004-2005 David Howells <dhowells@redhat.com> | 9 | * Copyright (c) 2004-2008 David Howells <dhowells@redhat.com> |
10 | * Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com> | 10 | * Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com> |
11 | * Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org> | 11 | * Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org> |
12 | * Copyright (c) 2002 Greg Ungerer <gerg@snapgear.com> | 12 | * Copyright (c) 2002 Greg Ungerer <gerg@snapgear.com> |
13 | * Copyright (c) 2007 Paul Mundt <lethal@linux-sh.org> | 13 | * Copyright (c) 2007-2008 Paul Mundt <lethal@linux-sh.org> |
14 | */ | 14 | */ |
15 | 15 | ||
16 | #include <linux/module.h> | 16 | #include <linux/module.h> |
17 | #include <linux/mm.h> | 17 | #include <linux/mm.h> |
18 | #include <linux/mman.h> | 18 | #include <linux/mman.h> |
19 | #include <linux/swap.h> | 19 | #include <linux/swap.h> |
20 | #include <linux/file.h> | 20 | #include <linux/file.h> |
21 | #include <linux/highmem.h> | 21 | #include <linux/highmem.h> |
22 | #include <linux/pagemap.h> | 22 | #include <linux/pagemap.h> |
23 | #include <linux/slab.h> | 23 | #include <linux/slab.h> |
24 | #include <linux/vmalloc.h> | 24 | #include <linux/vmalloc.h> |
25 | #include <linux/tracehook.h> | 25 | #include <linux/tracehook.h> |
26 | #include <linux/blkdev.h> | 26 | #include <linux/blkdev.h> |
27 | #include <linux/backing-dev.h> | 27 | #include <linux/backing-dev.h> |
28 | #include <linux/mount.h> | 28 | #include <linux/mount.h> |
29 | #include <linux/personality.h> | 29 | #include <linux/personality.h> |
30 | #include <linux/security.h> | 30 | #include <linux/security.h> |
31 | #include <linux/syscalls.h> | 31 | #include <linux/syscalls.h> |
32 | 32 | ||
33 | #include <asm/uaccess.h> | 33 | #include <asm/uaccess.h> |
34 | #include <asm/tlb.h> | 34 | #include <asm/tlb.h> |
35 | #include <asm/tlbflush.h> | 35 | #include <asm/tlbflush.h> |
36 | #include "internal.h" | ||
36 | 37 | ||
38 | static inline __attribute__((format(printf, 1, 2))) | ||
39 | void no_printk(const char *fmt, ...) | ||
40 | { | ||
41 | } | ||
42 | |||
43 | #if 0 | ||
44 | #define kenter(FMT, ...) \ | ||
45 | printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__) | ||
46 | #define kleave(FMT, ...) \ | ||
47 | printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__) | ||
48 | #define kdebug(FMT, ...) \ | ||
49 | printk(KERN_DEBUG "xxx" FMT"yyy\n", ##__VA_ARGS__) | ||
50 | #else | ||
51 | #define kenter(FMT, ...) \ | ||
52 | no_printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__) | ||
53 | #define kleave(FMT, ...) \ | ||
54 | no_printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__) | ||
55 | #define kdebug(FMT, ...) \ | ||
56 | no_printk(KERN_DEBUG FMT"\n", ##__VA_ARGS__) | ||
57 | #endif | ||
58 | |||
37 | #include "internal.h" | 59 | #include "internal.h" |
38 | 60 | ||
39 | void *high_memory; | 61 | void *high_memory; |
40 | struct page *mem_map; | 62 | struct page *mem_map; |
41 | unsigned long max_mapnr; | 63 | unsigned long max_mapnr; |
42 | unsigned long num_physpages; | 64 | unsigned long num_physpages; |
43 | unsigned long askedalloc, realalloc; | ||
44 | atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0); | 65 | atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0); |
45 | int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ | 66 | int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ |
46 | int sysctl_overcommit_ratio = 50; /* default is 50% */ | 67 | int sysctl_overcommit_ratio = 50; /* default is 50% */ |
47 | int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; | 68 | int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; |
69 | int sysctl_nr_trim_pages = 1; /* page trimming behaviour */ | ||
48 | int heap_stack_gap = 0; | 70 | int heap_stack_gap = 0; |
49 | 71 | ||
72 | atomic_t mmap_pages_allocated; | ||
73 | |||
50 | EXPORT_SYMBOL(mem_map); | 74 | EXPORT_SYMBOL(mem_map); |
51 | EXPORT_SYMBOL(num_physpages); | 75 | EXPORT_SYMBOL(num_physpages); |
52 | 76 | ||
53 | /* list of shareable VMAs */ | 77 | /* list of mapped, potentially shareable regions */ |
54 | struct rb_root nommu_vma_tree = RB_ROOT; | 78 | static struct kmem_cache *vm_region_jar; |
55 | DECLARE_RWSEM(nommu_vma_sem); | 79 | struct rb_root nommu_region_tree = RB_ROOT; |
80 | DECLARE_RWSEM(nommu_region_sem); | ||
56 | 81 | ||
57 | struct vm_operations_struct generic_file_vm_ops = { | 82 | struct vm_operations_struct generic_file_vm_ops = { |
58 | }; | 83 | }; |
59 | 84 | ||
60 | /* | 85 | /* |
61 | * Handle all mappings that got truncated by a "truncate()" | 86 | * Handle all mappings that got truncated by a "truncate()" |
62 | * system call. | 87 | * system call. |
63 | * | 88 | * |
64 | * NOTE! We have to be ready to update the memory sharing | 89 | * NOTE! We have to be ready to update the memory sharing |
65 | * between the file and the memory map for a potential last | 90 | * between the file and the memory map for a potential last |
66 | * incomplete page. Ugly, but necessary. | 91 | * incomplete page. Ugly, but necessary. |
67 | */ | 92 | */ |
68 | int vmtruncate(struct inode *inode, loff_t offset) | 93 | int vmtruncate(struct inode *inode, loff_t offset) |
69 | { | 94 | { |
70 | struct address_space *mapping = inode->i_mapping; | 95 | struct address_space *mapping = inode->i_mapping; |
71 | unsigned long limit; | 96 | unsigned long limit; |
72 | 97 | ||
73 | if (inode->i_size < offset) | 98 | if (inode->i_size < offset) |
74 | goto do_expand; | 99 | goto do_expand; |
75 | i_size_write(inode, offset); | 100 | i_size_write(inode, offset); |
76 | 101 | ||
77 | truncate_inode_pages(mapping, offset); | 102 | truncate_inode_pages(mapping, offset); |
78 | goto out_truncate; | 103 | goto out_truncate; |
79 | 104 | ||
80 | do_expand: | 105 | do_expand: |
81 | limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; | 106 | limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; |
82 | if (limit != RLIM_INFINITY && offset > limit) | 107 | if (limit != RLIM_INFINITY && offset > limit) |
83 | goto out_sig; | 108 | goto out_sig; |
84 | if (offset > inode->i_sb->s_maxbytes) | 109 | if (offset > inode->i_sb->s_maxbytes) |
85 | goto out; | 110 | goto out; |
86 | i_size_write(inode, offset); | 111 | i_size_write(inode, offset); |
87 | 112 | ||
88 | out_truncate: | 113 | out_truncate: |
89 | if (inode->i_op->truncate) | 114 | if (inode->i_op->truncate) |
90 | inode->i_op->truncate(inode); | 115 | inode->i_op->truncate(inode); |
91 | return 0; | 116 | return 0; |
92 | out_sig: | 117 | out_sig: |
93 | send_sig(SIGXFSZ, current, 0); | 118 | send_sig(SIGXFSZ, current, 0); |
94 | out: | 119 | out: |
95 | return -EFBIG; | 120 | return -EFBIG; |
96 | } | 121 | } |
97 | 122 | ||
98 | EXPORT_SYMBOL(vmtruncate); | 123 | EXPORT_SYMBOL(vmtruncate); |
99 | 124 | ||
100 | /* | 125 | /* |
101 | * Return the total memory allocated for this pointer, not | 126 | * Return the total memory allocated for this pointer, not |
102 | * just what the caller asked for. | 127 | * just what the caller asked for. |
103 | * | 128 | * |
104 | * Doesn't have to be accurate, i.e. may have races. | 129 | * Doesn't have to be accurate, i.e. may have races. |
105 | */ | 130 | */ |
106 | unsigned int kobjsize(const void *objp) | 131 | unsigned int kobjsize(const void *objp) |
107 | { | 132 | { |
108 | struct page *page; | 133 | struct page *page; |
109 | 134 | ||
110 | /* | 135 | /* |
111 | * If the object we have should not have ksize performed on it, | 136 | * If the object we have should not have ksize performed on it, |
112 | * return size of 0 | 137 | * return size of 0 |
113 | */ | 138 | */ |
114 | if (!objp || !virt_addr_valid(objp)) | 139 | if (!objp || !virt_addr_valid(objp)) |
115 | return 0; | 140 | return 0; |
116 | 141 | ||
117 | page = virt_to_head_page(objp); | 142 | page = virt_to_head_page(objp); |
118 | 143 | ||
119 | /* | 144 | /* |
120 | * If the allocator sets PageSlab, we know the pointer came from | 145 | * If the allocator sets PageSlab, we know the pointer came from |
121 | * kmalloc(). | 146 | * kmalloc(). |
122 | */ | 147 | */ |
123 | if (PageSlab(page)) | 148 | if (PageSlab(page)) |
124 | return ksize(objp); | 149 | return ksize(objp); |
125 | 150 | ||
126 | /* | 151 | /* |
152 | * If it's not a compound page, see if we have a matching VMA | ||
153 | * region. This test is intentionally done in reverse order, | ||
154 | * so if there's no VMA, we still fall through and hand back | ||
155 | * PAGE_SIZE for 0-order pages. | ||
156 | */ | ||
157 | if (!PageCompound(page)) { | ||
158 | struct vm_area_struct *vma; | ||
159 | |||
160 | vma = find_vma(current->mm, (unsigned long)objp); | ||
161 | if (vma) | ||
162 | return vma->vm_end - vma->vm_start; | ||
163 | } | ||
164 | |||
165 | /* | ||
127 | * The ksize() function is only guaranteed to work for pointers | 166 | * The ksize() function is only guaranteed to work for pointers |
128 | * returned by kmalloc(). So handle arbitrary pointers here. | 167 | * returned by kmalloc(). So handle arbitrary pointers here. |
129 | */ | 168 | */ |
130 | return PAGE_SIZE << compound_order(page); | 169 | return PAGE_SIZE << compound_order(page); |
131 | } | 170 | } |
132 | 171 | ||
133 | int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | 172 | int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, |
134 | unsigned long start, int len, int flags, | 173 | unsigned long start, int len, int flags, |
135 | struct page **pages, struct vm_area_struct **vmas) | 174 | struct page **pages, struct vm_area_struct **vmas) |
136 | { | 175 | { |
137 | struct vm_area_struct *vma; | 176 | struct vm_area_struct *vma; |
138 | unsigned long vm_flags; | 177 | unsigned long vm_flags; |
139 | int i; | 178 | int i; |
140 | int write = !!(flags & GUP_FLAGS_WRITE); | 179 | int write = !!(flags & GUP_FLAGS_WRITE); |
141 | int force = !!(flags & GUP_FLAGS_FORCE); | 180 | int force = !!(flags & GUP_FLAGS_FORCE); |
142 | int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS); | 181 | int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS); |
143 | 182 | ||
144 | /* calculate required read or write permissions. | 183 | /* calculate required read or write permissions. |
145 | * - if 'force' is set, we only require the "MAY" flags. | 184 | * - if 'force' is set, we only require the "MAY" flags. |
146 | */ | 185 | */ |
147 | vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); | 186 | vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); |
148 | vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); | 187 | vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); |
149 | 188 | ||
150 | for (i = 0; i < len; i++) { | 189 | for (i = 0; i < len; i++) { |
151 | vma = find_vma(mm, start); | 190 | vma = find_vma(mm, start); |
152 | if (!vma) | 191 | if (!vma) |
153 | goto finish_or_fault; | 192 | goto finish_or_fault; |
154 | 193 | ||
155 | /* protect what we can, including chardevs */ | 194 | /* protect what we can, including chardevs */ |
156 | if (vma->vm_flags & (VM_IO | VM_PFNMAP) || | 195 | if (vma->vm_flags & (VM_IO | VM_PFNMAP) || |
157 | (!ignore && !(vm_flags & vma->vm_flags))) | 196 | (!ignore && !(vm_flags & vma->vm_flags))) |
158 | goto finish_or_fault; | 197 | goto finish_or_fault; |
159 | 198 | ||
160 | if (pages) { | 199 | if (pages) { |
161 | pages[i] = virt_to_page(start); | 200 | pages[i] = virt_to_page(start); |
162 | if (pages[i]) | 201 | if (pages[i]) |
163 | page_cache_get(pages[i]); | 202 | page_cache_get(pages[i]); |
164 | } | 203 | } |
165 | if (vmas) | 204 | if (vmas) |
166 | vmas[i] = vma; | 205 | vmas[i] = vma; |
167 | start += PAGE_SIZE; | 206 | start += PAGE_SIZE; |
168 | } | 207 | } |
169 | 208 | ||
170 | return i; | 209 | return i; |
171 | 210 | ||
172 | finish_or_fault: | 211 | finish_or_fault: |
173 | return i ? : -EFAULT; | 212 | return i ? : -EFAULT; |
174 | } | 213 | } |
175 | 214 | ||
176 | 215 | ||
177 | /* | 216 | /* |
178 | * get a list of pages in an address range belonging to the specified process | 217 | * get a list of pages in an address range belonging to the specified process |
179 | * and indicate the VMA that covers each page | 218 | * and indicate the VMA that covers each page |
180 | * - this is potentially dodgy as we may end incrementing the page count of a | 219 | * - this is potentially dodgy as we may end incrementing the page count of a |
181 | * slab page or a secondary page from a compound page | 220 | * slab page or a secondary page from a compound page |
182 | * - don't permit access to VMAs that don't support it, such as I/O mappings | 221 | * - don't permit access to VMAs that don't support it, such as I/O mappings |
183 | */ | 222 | */ |
184 | int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | 223 | int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, |
185 | unsigned long start, int len, int write, int force, | 224 | unsigned long start, int len, int write, int force, |
186 | struct page **pages, struct vm_area_struct **vmas) | 225 | struct page **pages, struct vm_area_struct **vmas) |
187 | { | 226 | { |
188 | int flags = 0; | 227 | int flags = 0; |
189 | 228 | ||
190 | if (write) | 229 | if (write) |
191 | flags |= GUP_FLAGS_WRITE; | 230 | flags |= GUP_FLAGS_WRITE; |
192 | if (force) | 231 | if (force) |
193 | flags |= GUP_FLAGS_FORCE; | 232 | flags |= GUP_FLAGS_FORCE; |
194 | 233 | ||
195 | return __get_user_pages(tsk, mm, | 234 | return __get_user_pages(tsk, mm, |
196 | start, len, flags, | 235 | start, len, flags, |
197 | pages, vmas); | 236 | pages, vmas); |
198 | } | 237 | } |
199 | EXPORT_SYMBOL(get_user_pages); | 238 | EXPORT_SYMBOL(get_user_pages); |
200 | 239 | ||
201 | DEFINE_RWLOCK(vmlist_lock); | 240 | DEFINE_RWLOCK(vmlist_lock); |
202 | struct vm_struct *vmlist; | 241 | struct vm_struct *vmlist; |
203 | 242 | ||
204 | void vfree(const void *addr) | 243 | void vfree(const void *addr) |
205 | { | 244 | { |
206 | kfree(addr); | 245 | kfree(addr); |
207 | } | 246 | } |
208 | EXPORT_SYMBOL(vfree); | 247 | EXPORT_SYMBOL(vfree); |
209 | 248 | ||
210 | void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) | 249 | void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) |
211 | { | 250 | { |
212 | /* | 251 | /* |
213 | * You can't specify __GFP_HIGHMEM with kmalloc() since kmalloc() | 252 | * You can't specify __GFP_HIGHMEM with kmalloc() since kmalloc() |
214 | * returns only a logical address. | 253 | * returns only a logical address. |
215 | */ | 254 | */ |
216 | return kmalloc(size, (gfp_mask | __GFP_COMP) & ~__GFP_HIGHMEM); | 255 | return kmalloc(size, (gfp_mask | __GFP_COMP) & ~__GFP_HIGHMEM); |
217 | } | 256 | } |
218 | EXPORT_SYMBOL(__vmalloc); | 257 | EXPORT_SYMBOL(__vmalloc); |
219 | 258 | ||
220 | void *vmalloc_user(unsigned long size) | 259 | void *vmalloc_user(unsigned long size) |
221 | { | 260 | { |
222 | void *ret; | 261 | void *ret; |
223 | 262 | ||
224 | ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, | 263 | ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, |
225 | PAGE_KERNEL); | 264 | PAGE_KERNEL); |
226 | if (ret) { | 265 | if (ret) { |
227 | struct vm_area_struct *vma; | 266 | struct vm_area_struct *vma; |
228 | 267 | ||
229 | down_write(¤t->mm->mmap_sem); | 268 | down_write(¤t->mm->mmap_sem); |
230 | vma = find_vma(current->mm, (unsigned long)ret); | 269 | vma = find_vma(current->mm, (unsigned long)ret); |
231 | if (vma) | 270 | if (vma) |
232 | vma->vm_flags |= VM_USERMAP; | 271 | vma->vm_flags |= VM_USERMAP; |
233 | up_write(¤t->mm->mmap_sem); | 272 | up_write(¤t->mm->mmap_sem); |
234 | } | 273 | } |
235 | 274 | ||
236 | return ret; | 275 | return ret; |
237 | } | 276 | } |
238 | EXPORT_SYMBOL(vmalloc_user); | 277 | EXPORT_SYMBOL(vmalloc_user); |
239 | 278 | ||
240 | struct page *vmalloc_to_page(const void *addr) | 279 | struct page *vmalloc_to_page(const void *addr) |
241 | { | 280 | { |
242 | return virt_to_page(addr); | 281 | return virt_to_page(addr); |
243 | } | 282 | } |
244 | EXPORT_SYMBOL(vmalloc_to_page); | 283 | EXPORT_SYMBOL(vmalloc_to_page); |
245 | 284 | ||
246 | unsigned long vmalloc_to_pfn(const void *addr) | 285 | unsigned long vmalloc_to_pfn(const void *addr) |
247 | { | 286 | { |
248 | return page_to_pfn(virt_to_page(addr)); | 287 | return page_to_pfn(virt_to_page(addr)); |
249 | } | 288 | } |
250 | EXPORT_SYMBOL(vmalloc_to_pfn); | 289 | EXPORT_SYMBOL(vmalloc_to_pfn); |
251 | 290 | ||
252 | long vread(char *buf, char *addr, unsigned long count) | 291 | long vread(char *buf, char *addr, unsigned long count) |
253 | { | 292 | { |
254 | memcpy(buf, addr, count); | 293 | memcpy(buf, addr, count); |
255 | return count; | 294 | return count; |
256 | } | 295 | } |
257 | 296 | ||
258 | long vwrite(char *buf, char *addr, unsigned long count) | 297 | long vwrite(char *buf, char *addr, unsigned long count) |
259 | { | 298 | { |
260 | /* Don't allow overflow */ | 299 | /* Don't allow overflow */ |
261 | if ((unsigned long) addr + count < count) | 300 | if ((unsigned long) addr + count < count) |
262 | count = -(unsigned long) addr; | 301 | count = -(unsigned long) addr; |
263 | 302 | ||
264 | memcpy(addr, buf, count); | 303 | memcpy(addr, buf, count); |
265 | return(count); | 304 | return(count); |
266 | } | 305 | } |
267 | 306 | ||
268 | /* | 307 | /* |
269 | * vmalloc - allocate virtually continguos memory | 308 | * vmalloc - allocate virtually continguos memory |
270 | * | 309 | * |
271 | * @size: allocation size | 310 | * @size: allocation size |
272 | * | 311 | * |
273 | * Allocate enough pages to cover @size from the page level | 312 | * Allocate enough pages to cover @size from the page level |
274 | * allocator and map them into continguos kernel virtual space. | 313 | * allocator and map them into continguos kernel virtual space. |
275 | * | 314 | * |
276 | * For tight control over page level allocator and protection flags | 315 | * For tight control over page level allocator and protection flags |
277 | * use __vmalloc() instead. | 316 | * use __vmalloc() instead. |
278 | */ | 317 | */ |
279 | void *vmalloc(unsigned long size) | 318 | void *vmalloc(unsigned long size) |
280 | { | 319 | { |
281 | return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL); | 320 | return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL); |
282 | } | 321 | } |
283 | EXPORT_SYMBOL(vmalloc); | 322 | EXPORT_SYMBOL(vmalloc); |
284 | 323 | ||
285 | void *vmalloc_node(unsigned long size, int node) | 324 | void *vmalloc_node(unsigned long size, int node) |
286 | { | 325 | { |
287 | return vmalloc(size); | 326 | return vmalloc(size); |
288 | } | 327 | } |
289 | EXPORT_SYMBOL(vmalloc_node); | 328 | EXPORT_SYMBOL(vmalloc_node); |
290 | 329 | ||
291 | #ifndef PAGE_KERNEL_EXEC | 330 | #ifndef PAGE_KERNEL_EXEC |
292 | # define PAGE_KERNEL_EXEC PAGE_KERNEL | 331 | # define PAGE_KERNEL_EXEC PAGE_KERNEL |
293 | #endif | 332 | #endif |
294 | 333 | ||
295 | /** | 334 | /** |
296 | * vmalloc_exec - allocate virtually contiguous, executable memory | 335 | * vmalloc_exec - allocate virtually contiguous, executable memory |
297 | * @size: allocation size | 336 | * @size: allocation size |
298 | * | 337 | * |
299 | * Kernel-internal function to allocate enough pages to cover @size | 338 | * Kernel-internal function to allocate enough pages to cover @size |
300 | * the page level allocator and map them into contiguous and | 339 | * the page level allocator and map them into contiguous and |
301 | * executable kernel virtual space. | 340 | * executable kernel virtual space. |
302 | * | 341 | * |
303 | * For tight control over page level allocator and protection flags | 342 | * For tight control over page level allocator and protection flags |
304 | * use __vmalloc() instead. | 343 | * use __vmalloc() instead. |
305 | */ | 344 | */ |
306 | 345 | ||
307 | void *vmalloc_exec(unsigned long size) | 346 | void *vmalloc_exec(unsigned long size) |
308 | { | 347 | { |
309 | return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC); | 348 | return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC); |
310 | } | 349 | } |
311 | 350 | ||
312 | /** | 351 | /** |
313 | * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) | 352 | * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) |
314 | * @size: allocation size | 353 | * @size: allocation size |
315 | * | 354 | * |
316 | * Allocate enough 32bit PA addressable pages to cover @size from the | 355 | * Allocate enough 32bit PA addressable pages to cover @size from the |
317 | * page level allocator and map them into continguos kernel virtual space. | 356 | * page level allocator and map them into continguos kernel virtual space. |
318 | */ | 357 | */ |
319 | void *vmalloc_32(unsigned long size) | 358 | void *vmalloc_32(unsigned long size) |
320 | { | 359 | { |
321 | return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL); | 360 | return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL); |
322 | } | 361 | } |
323 | EXPORT_SYMBOL(vmalloc_32); | 362 | EXPORT_SYMBOL(vmalloc_32); |
324 | 363 | ||
325 | /** | 364 | /** |
326 | * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory | 365 | * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory |
327 | * @size: allocation size | 366 | * @size: allocation size |
328 | * | 367 | * |
329 | * The resulting memory area is 32bit addressable and zeroed so it can be | 368 | * The resulting memory area is 32bit addressable and zeroed so it can be |
330 | * mapped to userspace without leaking data. | 369 | * mapped to userspace without leaking data. |
331 | * | 370 | * |
332 | * VM_USERMAP is set on the corresponding VMA so that subsequent calls to | 371 | * VM_USERMAP is set on the corresponding VMA so that subsequent calls to |
333 | * remap_vmalloc_range() are permissible. | 372 | * remap_vmalloc_range() are permissible. |
334 | */ | 373 | */ |
335 | void *vmalloc_32_user(unsigned long size) | 374 | void *vmalloc_32_user(unsigned long size) |
336 | { | 375 | { |
337 | /* | 376 | /* |
338 | * We'll have to sort out the ZONE_DMA bits for 64-bit, | 377 | * We'll have to sort out the ZONE_DMA bits for 64-bit, |
339 | * but for now this can simply use vmalloc_user() directly. | 378 | * but for now this can simply use vmalloc_user() directly. |
340 | */ | 379 | */ |
341 | return vmalloc_user(size); | 380 | return vmalloc_user(size); |
342 | } | 381 | } |
343 | EXPORT_SYMBOL(vmalloc_32_user); | 382 | EXPORT_SYMBOL(vmalloc_32_user); |
344 | 383 | ||
345 | void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_t prot) | 384 | void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_t prot) |
346 | { | 385 | { |
347 | BUG(); | 386 | BUG(); |
348 | return NULL; | 387 | return NULL; |
349 | } | 388 | } |
350 | EXPORT_SYMBOL(vmap); | 389 | EXPORT_SYMBOL(vmap); |
351 | 390 | ||
352 | void vunmap(const void *addr) | 391 | void vunmap(const void *addr) |
353 | { | 392 | { |
354 | BUG(); | 393 | BUG(); |
355 | } | 394 | } |
356 | EXPORT_SYMBOL(vunmap); | 395 | EXPORT_SYMBOL(vunmap); |
357 | 396 | ||
358 | /* | 397 | /* |
359 | * Implement a stub for vmalloc_sync_all() if the architecture chose not to | 398 | * Implement a stub for vmalloc_sync_all() if the architecture chose not to |
360 | * have one. | 399 | * have one. |
361 | */ | 400 | */ |
362 | void __attribute__((weak)) vmalloc_sync_all(void) | 401 | void __attribute__((weak)) vmalloc_sync_all(void) |
363 | { | 402 | { |
364 | } | 403 | } |
365 | 404 | ||
366 | int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, | 405 | int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, |
367 | struct page *page) | 406 | struct page *page) |
368 | { | 407 | { |
369 | return -EINVAL; | 408 | return -EINVAL; |
370 | } | 409 | } |
371 | EXPORT_SYMBOL(vm_insert_page); | 410 | EXPORT_SYMBOL(vm_insert_page); |
372 | 411 | ||
373 | /* | 412 | /* |
374 | * sys_brk() for the most part doesn't need the global kernel | 413 | * sys_brk() for the most part doesn't need the global kernel |
375 | * lock, except when an application is doing something nasty | 414 | * lock, except when an application is doing something nasty |
376 | * like trying to un-brk an area that has already been mapped | 415 | * like trying to un-brk an area that has already been mapped |
377 | * to a regular file. in this case, the unmapping will need | 416 | * to a regular file. in this case, the unmapping will need |
378 | * to invoke file system routines that need the global lock. | 417 | * to invoke file system routines that need the global lock. |
379 | */ | 418 | */ |
380 | asmlinkage unsigned long sys_brk(unsigned long brk) | 419 | asmlinkage unsigned long sys_brk(unsigned long brk) |
381 | { | 420 | { |
382 | struct mm_struct *mm = current->mm; | 421 | struct mm_struct *mm = current->mm; |
383 | 422 | ||
384 | if (brk < mm->start_brk || brk > mm->context.end_brk) | 423 | if (brk < mm->start_brk || brk > mm->context.end_brk) |
385 | return mm->brk; | 424 | return mm->brk; |
386 | 425 | ||
387 | if (mm->brk == brk) | 426 | if (mm->brk == brk) |
388 | return mm->brk; | 427 | return mm->brk; |
389 | 428 | ||
390 | /* | 429 | /* |
391 | * Always allow shrinking brk | 430 | * Always allow shrinking brk |
392 | */ | 431 | */ |
393 | if (brk <= mm->brk) { | 432 | if (brk <= mm->brk) { |
394 | mm->brk = brk; | 433 | mm->brk = brk; |
395 | return brk; | 434 | return brk; |
396 | } | 435 | } |
397 | 436 | ||
398 | /* | 437 | /* |
399 | * Ok, looks good - let it rip. | 438 | * Ok, looks good - let it rip. |
400 | */ | 439 | */ |
401 | return mm->brk = brk; | 440 | return mm->brk = brk; |
402 | } | 441 | } |
403 | 442 | ||
404 | #ifdef DEBUG | 443 | /* |
405 | static void show_process_blocks(void) | 444 | * initialise the VMA and region record slabs |
445 | */ | ||
446 | void __init mmap_init(void) | ||
406 | { | 447 | { |
407 | struct vm_list_struct *vml; | 448 | vm_region_jar = kmem_cache_create("vm_region_jar", |
408 | 449 | sizeof(struct vm_region), 0, | |
409 | printk("Process blocks %d:", current->pid); | 450 | SLAB_PANIC, NULL); |
410 | 451 | vm_area_cachep = kmem_cache_create("vm_area_struct", | |
411 | for (vml = ¤t->mm->context.vmlist; vml; vml = vml->next) { | 452 | sizeof(struct vm_area_struct), 0, |
412 | printk(" %p: %p", vml, vml->vma); | 453 | SLAB_PANIC, NULL); |
413 | if (vml->vma) | ||
414 | printk(" (%d @%lx #%d)", | ||
415 | kobjsize((void *) vml->vma->vm_start), | ||
416 | vml->vma->vm_start, | ||
417 | atomic_read(&vml->vma->vm_usage)); | ||
418 | printk(vml->next ? " ->" : ".\n"); | ||
419 | } | ||
420 | } | 454 | } |
421 | #endif /* DEBUG */ | ||
422 | 455 | ||
423 | /* | 456 | /* |
424 | * add a VMA into a process's mm_struct in the appropriate place in the list | 457 | * validate the region tree |
425 | * - should be called with mm->mmap_sem held writelocked | 458 | * - the caller must hold the region lock |
426 | */ | 459 | */ |
427 | static void add_vma_to_mm(struct mm_struct *mm, struct vm_list_struct *vml) | 460 | #ifdef CONFIG_DEBUG_NOMMU_REGIONS |
461 | static noinline void validate_nommu_regions(void) | ||
428 | { | 462 | { |
429 | struct vm_list_struct **ppv; | 463 | struct vm_region *region, *last; |
464 | struct rb_node *p, *lastp; | ||
430 | 465 | ||
431 | for (ppv = ¤t->mm->context.vmlist; *ppv; ppv = &(*ppv)->next) | 466 | lastp = rb_first(&nommu_region_tree); |
432 | if ((*ppv)->vma->vm_start > vml->vma->vm_start) | 467 | if (!lastp) |
433 | break; | 468 | return; |
434 | 469 | ||
435 | vml->next = *ppv; | 470 | last = rb_entry(lastp, struct vm_region, vm_rb); |
436 | *ppv = vml; | 471 | if (unlikely(last->vm_end <= last->vm_start)) |
472 | BUG(); | ||
473 | if (unlikely(last->vm_top < last->vm_end)) | ||
474 | BUG(); | ||
475 | |||
476 | while ((p = rb_next(lastp))) { | ||
477 | region = rb_entry(p, struct vm_region, vm_rb); | ||
478 | last = rb_entry(lastp, struct vm_region, vm_rb); | ||
479 | |||
480 | if (unlikely(region->vm_end <= region->vm_start)) | ||
481 | BUG(); | ||
482 | if (unlikely(region->vm_top < region->vm_end)) | ||
483 | BUG(); | ||
484 | if (unlikely(region->vm_start < last->vm_top)) | ||
485 | BUG(); | ||
486 | |||
487 | lastp = p; | ||
488 | } | ||
437 | } | 489 | } |
490 | #else | ||
491 | #define validate_nommu_regions() do {} while(0) | ||
492 | #endif | ||
438 | 493 | ||
439 | /* | 494 | /* |
440 | * look up the first VMA in which addr resides, NULL if none | 495 | * add a region into the global tree |
441 | * - should be called with mm->mmap_sem at least held readlocked | ||
442 | */ | 496 | */ |
443 | struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) | 497 | static void add_nommu_region(struct vm_region *region) |
444 | { | 498 | { |
445 | struct vm_list_struct *loop, *vml; | 499 | struct vm_region *pregion; |
500 | struct rb_node **p, *parent; | ||
446 | 501 | ||
447 | /* search the vm_start ordered list */ | 502 | validate_nommu_regions(); |
448 | vml = NULL; | 503 | |
449 | for (loop = mm->context.vmlist; loop; loop = loop->next) { | 504 | BUG_ON(region->vm_start & ~PAGE_MASK); |
450 | if (loop->vma->vm_start > addr) | 505 | |
451 | break; | 506 | parent = NULL; |
452 | vml = loop; | 507 | p = &nommu_region_tree.rb_node; |
508 | while (*p) { | ||
509 | parent = *p; | ||
510 | pregion = rb_entry(parent, struct vm_region, vm_rb); | ||
511 | if (region->vm_start < pregion->vm_start) | ||
512 | p = &(*p)->rb_left; | ||
513 | else if (region->vm_start > pregion->vm_start) | ||
514 | p = &(*p)->rb_right; | ||
515 | else if (pregion == region) | ||
516 | return; | ||
517 | else | ||
518 | BUG(); | ||
453 | } | 519 | } |
454 | 520 | ||
455 | if (vml && vml->vma->vm_end > addr) | 521 | rb_link_node(®ion->vm_rb, parent, p); |
456 | return vml->vma; | 522 | rb_insert_color(®ion->vm_rb, &nommu_region_tree); |
457 | 523 | ||
458 | return NULL; | 524 | validate_nommu_regions(); |
459 | } | 525 | } |
460 | EXPORT_SYMBOL(find_vma); | ||
461 | 526 | ||
462 | /* | 527 | /* |
463 | * find a VMA | 528 | * delete a region from the global tree |
464 | * - we don't extend stack VMAs under NOMMU conditions | ||
465 | */ | 529 | */ |
466 | struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr) | 530 | static void delete_nommu_region(struct vm_region *region) |
467 | { | 531 | { |
468 | return find_vma(mm, addr); | 532 | BUG_ON(!nommu_region_tree.rb_node); |
469 | } | ||
470 | 533 | ||
471 | int expand_stack(struct vm_area_struct *vma, unsigned long address) | 534 | validate_nommu_regions(); |
472 | { | 535 | rb_erase(®ion->vm_rb, &nommu_region_tree); |
473 | return -ENOMEM; | 536 | validate_nommu_regions(); |
474 | } | 537 | } |
475 | 538 | ||
476 | /* | 539 | /* |
477 | * look up the first VMA exactly that exactly matches addr | 540 | * free a contiguous series of pages |
478 | * - should be called with mm->mmap_sem at least held readlocked | ||
479 | */ | 541 | */ |
480 | static inline struct vm_area_struct *find_vma_exact(struct mm_struct *mm, | 542 | static void free_page_series(unsigned long from, unsigned long to) |
481 | unsigned long addr) | ||
482 | { | 543 | { |
483 | struct vm_list_struct *vml; | 544 | for (; from < to; from += PAGE_SIZE) { |
545 | struct page *page = virt_to_page(from); | ||
484 | 546 | ||
485 | /* search the vm_start ordered list */ | 547 | kdebug("- free %lx", from); |
486 | for (vml = mm->context.vmlist; vml; vml = vml->next) { | 548 | atomic_dec(&mmap_pages_allocated); |
487 | if (vml->vma->vm_start == addr) | 549 | if (page_count(page) != 1) |
488 | return vml->vma; | 550 | kdebug("free page %p [%d]", page, page_count(page)); |
489 | if (vml->vma->vm_start > addr) | 551 | put_page(page); |
490 | break; | ||
491 | } | 552 | } |
492 | |||
493 | return NULL; | ||
494 | } | 553 | } |
495 | 554 | ||
496 | /* | 555 | /* |
497 | * find a VMA in the global tree | 556 | * release a reference to a region |
557 | * - the caller must hold the region semaphore, which this releases | ||
558 | * - the region may not have been added to the tree yet, in which case vm_top | ||
559 | * will equal vm_start | ||
498 | */ | 560 | */ |
499 | static inline struct vm_area_struct *find_nommu_vma(unsigned long start) | 561 | static void __put_nommu_region(struct vm_region *region) |
562 | __releases(nommu_region_sem) | ||
500 | { | 563 | { |
501 | struct vm_area_struct *vma; | 564 | kenter("%p{%d}", region, atomic_read(®ion->vm_usage)); |
502 | struct rb_node *n = nommu_vma_tree.rb_node; | ||
503 | 565 | ||
504 | while (n) { | 566 | BUG_ON(!nommu_region_tree.rb_node); |
505 | vma = rb_entry(n, struct vm_area_struct, vm_rb); | ||
506 | 567 | ||
507 | if (start < vma->vm_start) | 568 | if (atomic_dec_and_test(®ion->vm_usage)) { |
508 | n = n->rb_left; | 569 | if (region->vm_top > region->vm_start) |
509 | else if (start > vma->vm_start) | 570 | delete_nommu_region(region); |
510 | n = n->rb_right; | 571 | up_write(&nommu_region_sem); |
511 | else | 572 | |
512 | return vma; | 573 | if (region->vm_file) |
574 | fput(region->vm_file); | ||
575 | |||
576 | /* IO memory and memory shared directly out of the pagecache | ||
577 | * from ramfs/tmpfs mustn't be released here */ | ||
578 | if (region->vm_flags & VM_MAPPED_COPY) { | ||
579 | kdebug("free series"); | ||
580 | free_page_series(region->vm_start, region->vm_top); | ||
581 | } | ||
582 | kmem_cache_free(vm_region_jar, region); | ||
583 | } else { | ||
584 | up_write(&nommu_region_sem); | ||
513 | } | 585 | } |
586 | } | ||
514 | 587 | ||
515 | return NULL; | 588 | /* |
589 | * release a reference to a region | ||
590 | */ | ||
591 | static void put_nommu_region(struct vm_region *region) | ||
592 | { | ||
593 | down_write(&nommu_region_sem); | ||
594 | __put_nommu_region(region); | ||
516 | } | 595 | } |
517 | 596 | ||
518 | /* | 597 | /* |
519 | * add a VMA in the global tree | 598 | * add a VMA into a process's mm_struct in the appropriate place in the list |
599 | * and tree and add to the address space's page tree also if not an anonymous | ||
600 | * page | ||
601 | * - should be called with mm->mmap_sem held writelocked | ||
520 | */ | 602 | */ |
521 | static void add_nommu_vma(struct vm_area_struct *vma) | 603 | static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) |
522 | { | 604 | { |
523 | struct vm_area_struct *pvma; | 605 | struct vm_area_struct *pvma, **pp; |
524 | struct address_space *mapping; | 606 | struct address_space *mapping; |
525 | struct rb_node **p = &nommu_vma_tree.rb_node; | 607 | struct rb_node **p, *parent; |
526 | struct rb_node *parent = NULL; | ||
527 | 608 | ||
609 | kenter(",%p", vma); | ||
610 | |||
611 | BUG_ON(!vma->vm_region); | ||
612 | |||
613 | mm->map_count++; | ||
614 | vma->vm_mm = mm; | ||
615 | |||
528 | /* add the VMA to the mapping */ | 616 | /* add the VMA to the mapping */ |
529 | if (vma->vm_file) { | 617 | if (vma->vm_file) { |
530 | mapping = vma->vm_file->f_mapping; | 618 | mapping = vma->vm_file->f_mapping; |
531 | 619 | ||
532 | flush_dcache_mmap_lock(mapping); | 620 | flush_dcache_mmap_lock(mapping); |
533 | vma_prio_tree_insert(vma, &mapping->i_mmap); | 621 | vma_prio_tree_insert(vma, &mapping->i_mmap); |
534 | flush_dcache_mmap_unlock(mapping); | 622 | flush_dcache_mmap_unlock(mapping); |
535 | } | 623 | } |
536 | 624 | ||
537 | /* add the VMA to the master list */ | 625 | /* add the VMA to the tree */ |
626 | parent = NULL; | ||
627 | p = &mm->mm_rb.rb_node; | ||
538 | while (*p) { | 628 | while (*p) { |
539 | parent = *p; | 629 | parent = *p; |
540 | pvma = rb_entry(parent, struct vm_area_struct, vm_rb); | 630 | pvma = rb_entry(parent, struct vm_area_struct, vm_rb); |
541 | 631 | ||
542 | if (vma->vm_start < pvma->vm_start) { | 632 | /* sort by: start addr, end addr, VMA struct addr in that order |
633 | * (the latter is necessary as we may get identical VMAs) */ | ||
634 | if (vma->vm_start < pvma->vm_start) | ||
543 | p = &(*p)->rb_left; | 635 | p = &(*p)->rb_left; |
544 | } | 636 | else if (vma->vm_start > pvma->vm_start) |
545 | else if (vma->vm_start > pvma->vm_start) { | ||
546 | p = &(*p)->rb_right; | 637 | p = &(*p)->rb_right; |
547 | } | 638 | else if (vma->vm_end < pvma->vm_end) |
548 | else { | 639 | p = &(*p)->rb_left; |
549 | /* mappings are at the same address - this can only | 640 | else if (vma->vm_end > pvma->vm_end) |
550 | * happen for shared-mem chardevs and shared file | 641 | p = &(*p)->rb_right; |
551 | * mappings backed by ramfs/tmpfs */ | 642 | else if (vma < pvma) |
552 | BUG_ON(!(pvma->vm_flags & VM_SHARED)); | 643 | p = &(*p)->rb_left; |
553 | 644 | else if (vma > pvma) | |
554 | if (vma < pvma) | 645 | p = &(*p)->rb_right; |
555 | p = &(*p)->rb_left; | 646 | else |
556 | else if (vma > pvma) | 647 | BUG(); |
557 | p = &(*p)->rb_right; | ||
558 | else | ||
559 | BUG(); | ||
560 | } | ||
561 | } | 648 | } |
562 | 649 | ||
563 | rb_link_node(&vma->vm_rb, parent, p); | 650 | rb_link_node(&vma->vm_rb, parent, p); |
564 | rb_insert_color(&vma->vm_rb, &nommu_vma_tree); | 651 | rb_insert_color(&vma->vm_rb, &mm->mm_rb); |
652 | |||
653 | /* add VMA to the VMA list also */ | ||
654 | for (pp = &mm->mmap; (pvma = *pp); pp = &(*pp)->vm_next) { | ||
655 | if (pvma->vm_start > vma->vm_start) | ||
656 | break; | ||
657 | if (pvma->vm_start < vma->vm_start) | ||
658 | continue; | ||
659 | if (pvma->vm_end < vma->vm_end) | ||
660 | break; | ||
661 | } | ||
662 | |||
663 | vma->vm_next = *pp; | ||
664 | *pp = vma; | ||
565 | } | 665 | } |
566 | 666 | ||
567 | /* | 667 | /* |
568 | * delete a VMA from the global list | 668 | * delete a VMA from its owning mm_struct and address space |
569 | */ | 669 | */ |
570 | static void delete_nommu_vma(struct vm_area_struct *vma) | 670 | static void delete_vma_from_mm(struct vm_area_struct *vma) |
571 | { | 671 | { |
672 | struct vm_area_struct **pp; | ||
572 | struct address_space *mapping; | 673 | struct address_space *mapping; |
674 | struct mm_struct *mm = vma->vm_mm; | ||
573 | 675 | ||
676 | kenter("%p", vma); | ||
677 | |||
678 | mm->map_count--; | ||
679 | if (mm->mmap_cache == vma) | ||
680 | mm->mmap_cache = NULL; | ||
681 | |||
574 | /* remove the VMA from the mapping */ | 682 | /* remove the VMA from the mapping */ |
575 | if (vma->vm_file) { | 683 | if (vma->vm_file) { |
576 | mapping = vma->vm_file->f_mapping; | 684 | mapping = vma->vm_file->f_mapping; |
577 | 685 | ||
578 | flush_dcache_mmap_lock(mapping); | 686 | flush_dcache_mmap_lock(mapping); |
579 | vma_prio_tree_remove(vma, &mapping->i_mmap); | 687 | vma_prio_tree_remove(vma, &mapping->i_mmap); |
580 | flush_dcache_mmap_unlock(mapping); | 688 | flush_dcache_mmap_unlock(mapping); |
581 | } | 689 | } |
582 | 690 | ||
583 | /* remove from the master list */ | 691 | /* remove from the MM's tree and list */ |
584 | rb_erase(&vma->vm_rb, &nommu_vma_tree); | 692 | rb_erase(&vma->vm_rb, &mm->mm_rb); |
693 | for (pp = &mm->mmap; *pp; pp = &(*pp)->vm_next) { | ||
694 | if (*pp == vma) { | ||
695 | *pp = vma->vm_next; | ||
696 | break; | ||
697 | } | ||
698 | } | ||
699 | |||
700 | vma->vm_mm = NULL; | ||
585 | } | 701 | } |
586 | 702 | ||
587 | /* | 703 | /* |
704 | * destroy a VMA record | ||
705 | */ | ||
706 | static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma) | ||
707 | { | ||
708 | kenter("%p", vma); | ||
709 | if (vma->vm_ops && vma->vm_ops->close) | ||
710 | vma->vm_ops->close(vma); | ||
711 | if (vma->vm_file) { | ||
712 | fput(vma->vm_file); | ||
713 | if (vma->vm_flags & VM_EXECUTABLE) | ||
714 | removed_exe_file_vma(mm); | ||
715 | } | ||
716 | put_nommu_region(vma->vm_region); | ||
717 | kmem_cache_free(vm_area_cachep, vma); | ||
718 | } | ||
719 | |||
720 | /* | ||
721 | * look up the first VMA in which addr resides, NULL if none | ||
722 | * - should be called with mm->mmap_sem at least held readlocked | ||
723 | */ | ||
724 | struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) | ||
725 | { | ||
726 | struct vm_area_struct *vma; | ||
727 | struct rb_node *n = mm->mm_rb.rb_node; | ||
728 | |||
729 | /* check the cache first */ | ||
730 | vma = mm->mmap_cache; | ||
731 | if (vma && vma->vm_start <= addr && vma->vm_end > addr) | ||
732 | return vma; | ||
733 | |||
734 | /* trawl the tree (there may be multiple mappings in which addr | ||
735 | * resides) */ | ||
736 | for (n = rb_first(&mm->mm_rb); n; n = rb_next(n)) { | ||
737 | vma = rb_entry(n, struct vm_area_struct, vm_rb); | ||
738 | if (vma->vm_start > addr) | ||
739 | return NULL; | ||
740 | if (vma->vm_end > addr) { | ||
741 | mm->mmap_cache = vma; | ||
742 | return vma; | ||
743 | } | ||
744 | } | ||
745 | |||
746 | return NULL; | ||
747 | } | ||
748 | EXPORT_SYMBOL(find_vma); | ||
749 | |||
750 | /* | ||
751 | * find a VMA | ||
752 | * - we don't extend stack VMAs under NOMMU conditions | ||
753 | */ | ||
754 | struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr) | ||
755 | { | ||
756 | return find_vma(mm, addr); | ||
757 | } | ||
758 | |||
759 | /* | ||
760 | * expand a stack to a given address | ||
761 | * - not supported under NOMMU conditions | ||
762 | */ | ||
763 | int expand_stack(struct vm_area_struct *vma, unsigned long address) | ||
764 | { | ||
765 | return -ENOMEM; | ||
766 | } | ||
767 | |||
768 | /* | ||
769 | * look up the first VMA exactly that exactly matches addr | ||
770 | * - should be called with mm->mmap_sem at least held readlocked | ||
771 | */ | ||
772 | static struct vm_area_struct *find_vma_exact(struct mm_struct *mm, | ||
773 | unsigned long addr, | ||
774 | unsigned long len) | ||
775 | { | ||
776 | struct vm_area_struct *vma; | ||
777 | struct rb_node *n = mm->mm_rb.rb_node; | ||
778 | unsigned long end = addr + len; | ||
779 | |||
780 | /* check the cache first */ | ||
781 | vma = mm->mmap_cache; | ||
782 | if (vma && vma->vm_start == addr && vma->vm_end == end) | ||
783 | return vma; | ||
784 | |||
785 | /* trawl the tree (there may be multiple mappings in which addr | ||
786 | * resides) */ | ||
787 | for (n = rb_first(&mm->mm_rb); n; n = rb_next(n)) { | ||
788 | vma = rb_entry(n, struct vm_area_struct, vm_rb); | ||
789 | if (vma->vm_start < addr) | ||
790 | continue; | ||
791 | if (vma->vm_start > addr) | ||
792 | return NULL; | ||
793 | if (vma->vm_end == end) { | ||
794 | mm->mmap_cache = vma; | ||
795 | return vma; | ||
796 | } | ||
797 | } | ||
798 | |||
799 | return NULL; | ||
800 | } | ||
801 | |||
802 | /* | ||
588 | * determine whether a mapping should be permitted and, if so, what sort of | 803 | * determine whether a mapping should be permitted and, if so, what sort of |
589 | * mapping we're capable of supporting | 804 | * mapping we're capable of supporting |
590 | */ | 805 | */ |
591 | static int validate_mmap_request(struct file *file, | 806 | static int validate_mmap_request(struct file *file, |
592 | unsigned long addr, | 807 | unsigned long addr, |
593 | unsigned long len, | 808 | unsigned long len, |
594 | unsigned long prot, | 809 | unsigned long prot, |
595 | unsigned long flags, | 810 | unsigned long flags, |
596 | unsigned long pgoff, | 811 | unsigned long pgoff, |
597 | unsigned long *_capabilities) | 812 | unsigned long *_capabilities) |
598 | { | 813 | { |
599 | unsigned long capabilities; | 814 | unsigned long capabilities, rlen; |
600 | unsigned long reqprot = prot; | 815 | unsigned long reqprot = prot; |
601 | int ret; | 816 | int ret; |
602 | 817 | ||
603 | /* do the simple checks first */ | 818 | /* do the simple checks first */ |
604 | if (flags & MAP_FIXED || addr) { | 819 | if (flags & MAP_FIXED || addr) { |
605 | printk(KERN_DEBUG | 820 | printk(KERN_DEBUG |
606 | "%d: Can't do fixed-address/overlay mmap of RAM\n", | 821 | "%d: Can't do fixed-address/overlay mmap of RAM\n", |
607 | current->pid); | 822 | current->pid); |
608 | return -EINVAL; | 823 | return -EINVAL; |
609 | } | 824 | } |
610 | 825 | ||
611 | if ((flags & MAP_TYPE) != MAP_PRIVATE && | 826 | if ((flags & MAP_TYPE) != MAP_PRIVATE && |
612 | (flags & MAP_TYPE) != MAP_SHARED) | 827 | (flags & MAP_TYPE) != MAP_SHARED) |
613 | return -EINVAL; | 828 | return -EINVAL; |
614 | 829 | ||
615 | if (!len) | 830 | if (!len) |
616 | return -EINVAL; | 831 | return -EINVAL; |
617 | 832 | ||
618 | /* Careful about overflows.. */ | 833 | /* Careful about overflows.. */ |
619 | len = PAGE_ALIGN(len); | 834 | rlen = PAGE_ALIGN(len); |
620 | if (!len || len > TASK_SIZE) | 835 | if (!rlen || rlen > TASK_SIZE) |
621 | return -ENOMEM; | 836 | return -ENOMEM; |
622 | 837 | ||
623 | /* offset overflow? */ | 838 | /* offset overflow? */ |
624 | if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) | 839 | if ((pgoff + (rlen >> PAGE_SHIFT)) < pgoff) |
625 | return -EOVERFLOW; | 840 | return -EOVERFLOW; |
626 | 841 | ||
627 | if (file) { | 842 | if (file) { |
628 | /* validate file mapping requests */ | 843 | /* validate file mapping requests */ |
629 | struct address_space *mapping; | 844 | struct address_space *mapping; |
630 | 845 | ||
631 | /* files must support mmap */ | 846 | /* files must support mmap */ |
632 | if (!file->f_op || !file->f_op->mmap) | 847 | if (!file->f_op || !file->f_op->mmap) |
633 | return -ENODEV; | 848 | return -ENODEV; |
634 | 849 | ||
635 | /* work out if what we've got could possibly be shared | 850 | /* work out if what we've got could possibly be shared |
636 | * - we support chardevs that provide their own "memory" | 851 | * - we support chardevs that provide their own "memory" |
637 | * - we support files/blockdevs that are memory backed | 852 | * - we support files/blockdevs that are memory backed |
638 | */ | 853 | */ |
639 | mapping = file->f_mapping; | 854 | mapping = file->f_mapping; |
640 | if (!mapping) | 855 | if (!mapping) |
641 | mapping = file->f_path.dentry->d_inode->i_mapping; | 856 | mapping = file->f_path.dentry->d_inode->i_mapping; |
642 | 857 | ||
643 | capabilities = 0; | 858 | capabilities = 0; |
644 | if (mapping && mapping->backing_dev_info) | 859 | if (mapping && mapping->backing_dev_info) |
645 | capabilities = mapping->backing_dev_info->capabilities; | 860 | capabilities = mapping->backing_dev_info->capabilities; |
646 | 861 | ||
647 | if (!capabilities) { | 862 | if (!capabilities) { |
648 | /* no explicit capabilities set, so assume some | 863 | /* no explicit capabilities set, so assume some |
649 | * defaults */ | 864 | * defaults */ |
650 | switch (file->f_path.dentry->d_inode->i_mode & S_IFMT) { | 865 | switch (file->f_path.dentry->d_inode->i_mode & S_IFMT) { |
651 | case S_IFREG: | 866 | case S_IFREG: |
652 | case S_IFBLK: | 867 | case S_IFBLK: |
653 | capabilities = BDI_CAP_MAP_COPY; | 868 | capabilities = BDI_CAP_MAP_COPY; |
654 | break; | 869 | break; |
655 | 870 | ||
656 | case S_IFCHR: | 871 | case S_IFCHR: |
657 | capabilities = | 872 | capabilities = |
658 | BDI_CAP_MAP_DIRECT | | 873 | BDI_CAP_MAP_DIRECT | |
659 | BDI_CAP_READ_MAP | | 874 | BDI_CAP_READ_MAP | |
660 | BDI_CAP_WRITE_MAP; | 875 | BDI_CAP_WRITE_MAP; |
661 | break; | 876 | break; |
662 | 877 | ||
663 | default: | 878 | default: |
664 | return -EINVAL; | 879 | return -EINVAL; |
665 | } | 880 | } |
666 | } | 881 | } |
667 | 882 | ||
668 | /* eliminate any capabilities that we can't support on this | 883 | /* eliminate any capabilities that we can't support on this |
669 | * device */ | 884 | * device */ |
670 | if (!file->f_op->get_unmapped_area) | 885 | if (!file->f_op->get_unmapped_area) |
671 | capabilities &= ~BDI_CAP_MAP_DIRECT; | 886 | capabilities &= ~BDI_CAP_MAP_DIRECT; |
672 | if (!file->f_op->read) | 887 | if (!file->f_op->read) |
673 | capabilities &= ~BDI_CAP_MAP_COPY; | 888 | capabilities &= ~BDI_CAP_MAP_COPY; |
674 | 889 | ||
675 | if (flags & MAP_SHARED) { | 890 | if (flags & MAP_SHARED) { |
676 | /* do checks for writing, appending and locking */ | 891 | /* do checks for writing, appending and locking */ |
677 | if ((prot & PROT_WRITE) && | 892 | if ((prot & PROT_WRITE) && |
678 | !(file->f_mode & FMODE_WRITE)) | 893 | !(file->f_mode & FMODE_WRITE)) |
679 | return -EACCES; | 894 | return -EACCES; |
680 | 895 | ||
681 | if (IS_APPEND(file->f_path.dentry->d_inode) && | 896 | if (IS_APPEND(file->f_path.dentry->d_inode) && |
682 | (file->f_mode & FMODE_WRITE)) | 897 | (file->f_mode & FMODE_WRITE)) |
683 | return -EACCES; | 898 | return -EACCES; |
684 | 899 | ||
685 | if (locks_verify_locked(file->f_path.dentry->d_inode)) | 900 | if (locks_verify_locked(file->f_path.dentry->d_inode)) |
686 | return -EAGAIN; | 901 | return -EAGAIN; |
687 | 902 | ||
688 | if (!(capabilities & BDI_CAP_MAP_DIRECT)) | 903 | if (!(capabilities & BDI_CAP_MAP_DIRECT)) |
689 | return -ENODEV; | 904 | return -ENODEV; |
690 | 905 | ||
691 | if (((prot & PROT_READ) && !(capabilities & BDI_CAP_READ_MAP)) || | 906 | if (((prot & PROT_READ) && !(capabilities & BDI_CAP_READ_MAP)) || |
692 | ((prot & PROT_WRITE) && !(capabilities & BDI_CAP_WRITE_MAP)) || | 907 | ((prot & PROT_WRITE) && !(capabilities & BDI_CAP_WRITE_MAP)) || |
693 | ((prot & PROT_EXEC) && !(capabilities & BDI_CAP_EXEC_MAP)) | 908 | ((prot & PROT_EXEC) && !(capabilities & BDI_CAP_EXEC_MAP)) |
694 | ) { | 909 | ) { |
695 | printk("MAP_SHARED not completely supported on !MMU\n"); | 910 | printk("MAP_SHARED not completely supported on !MMU\n"); |
696 | return -EINVAL; | 911 | return -EINVAL; |
697 | } | 912 | } |
698 | 913 | ||
699 | /* we mustn't privatise shared mappings */ | 914 | /* we mustn't privatise shared mappings */ |
700 | capabilities &= ~BDI_CAP_MAP_COPY; | 915 | capabilities &= ~BDI_CAP_MAP_COPY; |
701 | } | 916 | } |
702 | else { | 917 | else { |
703 | /* we're going to read the file into private memory we | 918 | /* we're going to read the file into private memory we |
704 | * allocate */ | 919 | * allocate */ |
705 | if (!(capabilities & BDI_CAP_MAP_COPY)) | 920 | if (!(capabilities & BDI_CAP_MAP_COPY)) |
706 | return -ENODEV; | 921 | return -ENODEV; |
707 | 922 | ||
708 | /* we don't permit a private writable mapping to be | 923 | /* we don't permit a private writable mapping to be |
709 | * shared with the backing device */ | 924 | * shared with the backing device */ |
710 | if (prot & PROT_WRITE) | 925 | if (prot & PROT_WRITE) |
711 | capabilities &= ~BDI_CAP_MAP_DIRECT; | 926 | capabilities &= ~BDI_CAP_MAP_DIRECT; |
712 | } | 927 | } |
713 | 928 | ||
714 | /* handle executable mappings and implied executable | 929 | /* handle executable mappings and implied executable |
715 | * mappings */ | 930 | * mappings */ |
716 | if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) { | 931 | if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) { |
717 | if (prot & PROT_EXEC) | 932 | if (prot & PROT_EXEC) |
718 | return -EPERM; | 933 | return -EPERM; |
719 | } | 934 | } |
720 | else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) { | 935 | else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) { |
721 | /* handle implication of PROT_EXEC by PROT_READ */ | 936 | /* handle implication of PROT_EXEC by PROT_READ */ |
722 | if (current->personality & READ_IMPLIES_EXEC) { | 937 | if (current->personality & READ_IMPLIES_EXEC) { |
723 | if (capabilities & BDI_CAP_EXEC_MAP) | 938 | if (capabilities & BDI_CAP_EXEC_MAP) |
724 | prot |= PROT_EXEC; | 939 | prot |= PROT_EXEC; |
725 | } | 940 | } |
726 | } | 941 | } |
727 | else if ((prot & PROT_READ) && | 942 | else if ((prot & PROT_READ) && |
728 | (prot & PROT_EXEC) && | 943 | (prot & PROT_EXEC) && |
729 | !(capabilities & BDI_CAP_EXEC_MAP) | 944 | !(capabilities & BDI_CAP_EXEC_MAP) |
730 | ) { | 945 | ) { |
731 | /* backing file is not executable, try to copy */ | 946 | /* backing file is not executable, try to copy */ |
732 | capabilities &= ~BDI_CAP_MAP_DIRECT; | 947 | capabilities &= ~BDI_CAP_MAP_DIRECT; |
733 | } | 948 | } |
734 | } | 949 | } |
735 | else { | 950 | else { |
736 | /* anonymous mappings are always memory backed and can be | 951 | /* anonymous mappings are always memory backed and can be |
737 | * privately mapped | 952 | * privately mapped |
738 | */ | 953 | */ |
739 | capabilities = BDI_CAP_MAP_COPY; | 954 | capabilities = BDI_CAP_MAP_COPY; |
740 | 955 | ||
741 | /* handle PROT_EXEC implication by PROT_READ */ | 956 | /* handle PROT_EXEC implication by PROT_READ */ |
742 | if ((prot & PROT_READ) && | 957 | if ((prot & PROT_READ) && |
743 | (current->personality & READ_IMPLIES_EXEC)) | 958 | (current->personality & READ_IMPLIES_EXEC)) |
744 | prot |= PROT_EXEC; | 959 | prot |= PROT_EXEC; |
745 | } | 960 | } |
746 | 961 | ||
747 | /* allow the security API to have its say */ | 962 | /* allow the security API to have its say */ |
748 | ret = security_file_mmap(file, reqprot, prot, flags, addr, 0); | 963 | ret = security_file_mmap(file, reqprot, prot, flags, addr, 0); |
749 | if (ret < 0) | 964 | if (ret < 0) |
750 | return ret; | 965 | return ret; |
751 | 966 | ||
752 | /* looks okay */ | 967 | /* looks okay */ |
753 | *_capabilities = capabilities; | 968 | *_capabilities = capabilities; |
754 | return 0; | 969 | return 0; |
755 | } | 970 | } |
756 | 971 | ||
757 | /* | 972 | /* |
758 | * we've determined that we can make the mapping, now translate what we | 973 | * we've determined that we can make the mapping, now translate what we |
759 | * now know into VMA flags | 974 | * now know into VMA flags |
760 | */ | 975 | */ |
761 | static unsigned long determine_vm_flags(struct file *file, | 976 | static unsigned long determine_vm_flags(struct file *file, |
762 | unsigned long prot, | 977 | unsigned long prot, |
763 | unsigned long flags, | 978 | unsigned long flags, |
764 | unsigned long capabilities) | 979 | unsigned long capabilities) |
765 | { | 980 | { |
766 | unsigned long vm_flags; | 981 | unsigned long vm_flags; |
767 | 982 | ||
768 | vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags); | 983 | vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags); |
769 | vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; | 984 | vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; |
770 | /* vm_flags |= mm->def_flags; */ | 985 | /* vm_flags |= mm->def_flags; */ |
771 | 986 | ||
772 | if (!(capabilities & BDI_CAP_MAP_DIRECT)) { | 987 | if (!(capabilities & BDI_CAP_MAP_DIRECT)) { |
773 | /* attempt to share read-only copies of mapped file chunks */ | 988 | /* attempt to share read-only copies of mapped file chunks */ |
774 | if (file && !(prot & PROT_WRITE)) | 989 | if (file && !(prot & PROT_WRITE)) |
775 | vm_flags |= VM_MAYSHARE; | 990 | vm_flags |= VM_MAYSHARE; |
776 | } | 991 | } |
777 | else { | 992 | else { |
778 | /* overlay a shareable mapping on the backing device or inode | 993 | /* overlay a shareable mapping on the backing device or inode |
779 | * if possible - used for chardevs, ramfs/tmpfs/shmfs and | 994 | * if possible - used for chardevs, ramfs/tmpfs/shmfs and |
780 | * romfs/cramfs */ | 995 | * romfs/cramfs */ |
781 | if (flags & MAP_SHARED) | 996 | if (flags & MAP_SHARED) |
782 | vm_flags |= VM_MAYSHARE | VM_SHARED; | 997 | vm_flags |= VM_MAYSHARE | VM_SHARED; |
783 | else if ((((vm_flags & capabilities) ^ vm_flags) & BDI_CAP_VMFLAGS) == 0) | 998 | else if ((((vm_flags & capabilities) ^ vm_flags) & BDI_CAP_VMFLAGS) == 0) |
784 | vm_flags |= VM_MAYSHARE; | 999 | vm_flags |= VM_MAYSHARE; |
785 | } | 1000 | } |
786 | 1001 | ||
787 | /* refuse to let anyone share private mappings with this process if | 1002 | /* refuse to let anyone share private mappings with this process if |
788 | * it's being traced - otherwise breakpoints set in it may interfere | 1003 | * it's being traced - otherwise breakpoints set in it may interfere |
789 | * with another untraced process | 1004 | * with another untraced process |
790 | */ | 1005 | */ |
791 | if ((flags & MAP_PRIVATE) && tracehook_expect_breakpoints(current)) | 1006 | if ((flags & MAP_PRIVATE) && tracehook_expect_breakpoints(current)) |
792 | vm_flags &= ~VM_MAYSHARE; | 1007 | vm_flags &= ~VM_MAYSHARE; |
793 | 1008 | ||
794 | return vm_flags; | 1009 | return vm_flags; |
795 | } | 1010 | } |
796 | 1011 | ||
797 | /* | 1012 | /* |
798 | * set up a shared mapping on a file | 1013 | * set up a shared mapping on a file (the driver or filesystem provides and |
1014 | * pins the storage) | ||
799 | */ | 1015 | */ |
800 | static int do_mmap_shared_file(struct vm_area_struct *vma, unsigned long len) | 1016 | static int do_mmap_shared_file(struct vm_area_struct *vma) |
801 | { | 1017 | { |
802 | int ret; | 1018 | int ret; |
803 | 1019 | ||
804 | ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); | 1020 | ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); |
1021 | if (ret == 0) { | ||
1022 | vma->vm_region->vm_top = vma->vm_region->vm_end; | ||
1023 | return ret; | ||
1024 | } | ||
805 | if (ret != -ENOSYS) | 1025 | if (ret != -ENOSYS) |
806 | return ret; | 1026 | return ret; |
807 | 1027 | ||
808 | /* getting an ENOSYS error indicates that direct mmap isn't | 1028 | /* getting an ENOSYS error indicates that direct mmap isn't |
809 | * possible (as opposed to tried but failed) so we'll fall | 1029 | * possible (as opposed to tried but failed) so we'll fall |
810 | * through to making a private copy of the data and mapping | 1030 | * through to making a private copy of the data and mapping |
811 | * that if we can */ | 1031 | * that if we can */ |
812 | return -ENODEV; | 1032 | return -ENODEV; |
813 | } | 1033 | } |
814 | 1034 | ||
815 | /* | 1035 | /* |
816 | * set up a private mapping or an anonymous shared mapping | 1036 | * set up a private mapping or an anonymous shared mapping |
817 | */ | 1037 | */ |
818 | static int do_mmap_private(struct vm_area_struct *vma, unsigned long len) | 1038 | static int do_mmap_private(struct vm_area_struct *vma, |
1039 | struct vm_region *region, | ||
1040 | unsigned long len) | ||
819 | { | 1041 | { |
1042 | struct page *pages; | ||
1043 | unsigned long total, point, n, rlen; | ||
820 | void *base; | 1044 | void *base; |
821 | int ret; | 1045 | int ret, order; |
822 | 1046 | ||
823 | /* invoke the file's mapping function so that it can keep track of | 1047 | /* invoke the file's mapping function so that it can keep track of |
824 | * shared mappings on devices or memory | 1048 | * shared mappings on devices or memory |
825 | * - VM_MAYSHARE will be set if it may attempt to share | 1049 | * - VM_MAYSHARE will be set if it may attempt to share |
826 | */ | 1050 | */ |
827 | if (vma->vm_file) { | 1051 | if (vma->vm_file) { |
828 | ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); | 1052 | ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); |
829 | if (ret != -ENOSYS) { | 1053 | if (ret == 0) { |
830 | /* shouldn't return success if we're not sharing */ | 1054 | /* shouldn't return success if we're not sharing */ |
831 | BUG_ON(ret == 0 && !(vma->vm_flags & VM_MAYSHARE)); | 1055 | BUG_ON(!(vma->vm_flags & VM_MAYSHARE)); |
832 | return ret; /* success or a real error */ | 1056 | vma->vm_region->vm_top = vma->vm_region->vm_end; |
1057 | return ret; | ||
833 | } | 1058 | } |
1059 | if (ret != -ENOSYS) | ||
1060 | return ret; | ||
834 | 1061 | ||
835 | /* getting an ENOSYS error indicates that direct mmap isn't | 1062 | /* getting an ENOSYS error indicates that direct mmap isn't |
836 | * possible (as opposed to tried but failed) so we'll try to | 1063 | * possible (as opposed to tried but failed) so we'll try to |
837 | * make a private copy of the data and map that instead */ | 1064 | * make a private copy of the data and map that instead */ |
838 | } | 1065 | } |
839 | 1066 | ||
1067 | rlen = PAGE_ALIGN(len); | ||
1068 | |||
840 | /* allocate some memory to hold the mapping | 1069 | /* allocate some memory to hold the mapping |
841 | * - note that this may not return a page-aligned address if the object | 1070 | * - note that this may not return a page-aligned address if the object |
842 | * we're allocating is smaller than a page | 1071 | * we're allocating is smaller than a page |
843 | */ | 1072 | */ |
844 | base = kmalloc(len, GFP_KERNEL|__GFP_COMP); | 1073 | order = get_order(rlen); |
845 | if (!base) | 1074 | kdebug("alloc order %d for %lx", order, len); |
1075 | |||
1076 | pages = alloc_pages(GFP_KERNEL, order); | ||
1077 | if (!pages) | ||
846 | goto enomem; | 1078 | goto enomem; |
847 | 1079 | ||
848 | vma->vm_start = (unsigned long) base; | 1080 | total = 1 << order; |
849 | vma->vm_end = vma->vm_start + len; | 1081 | atomic_add(total, &mmap_pages_allocated); |
850 | vma->vm_flags |= VM_MAPPED_COPY; | ||
851 | 1082 | ||
852 | #ifdef WARN_ON_SLACK | 1083 | point = rlen >> PAGE_SHIFT; |
853 | if (len + WARN_ON_SLACK <= kobjsize(result)) | ||
854 | printk("Allocation of %lu bytes from process %d has %lu bytes of slack\n", | ||
855 | len, current->pid, kobjsize(result) - len); | ||
856 | #endif | ||
857 | 1084 | ||
1085 | /* we allocated a power-of-2 sized page set, so we may want to trim off | ||
1086 | * the excess */ | ||
1087 | if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages) { | ||
1088 | while (total > point) { | ||
1089 | order = ilog2(total - point); | ||
1090 | n = 1 << order; | ||
1091 | kdebug("shave %lu/%lu @%lu", n, total - point, total); | ||
1092 | atomic_sub(n, &mmap_pages_allocated); | ||
1093 | total -= n; | ||
1094 | set_page_refcounted(pages + total); | ||
1095 | __free_pages(pages + total, order); | ||
1096 | } | ||
1097 | } | ||
1098 | |||
1099 | for (point = 1; point < total; point++) | ||
1100 | set_page_refcounted(&pages[point]); | ||
1101 | |||
1102 | base = page_address(pages); | ||
1103 | region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY; | ||
1104 | region->vm_start = (unsigned long) base; | ||
1105 | region->vm_end = region->vm_start + rlen; | ||
1106 | region->vm_top = region->vm_start + (total << PAGE_SHIFT); | ||
1107 | |||
1108 | vma->vm_start = region->vm_start; | ||
1109 | vma->vm_end = region->vm_start + len; | ||
1110 | |||
858 | if (vma->vm_file) { | 1111 | if (vma->vm_file) { |
859 | /* read the contents of a file into the copy */ | 1112 | /* read the contents of a file into the copy */ |
860 | mm_segment_t old_fs; | 1113 | mm_segment_t old_fs; |
861 | loff_t fpos; | 1114 | loff_t fpos; |
862 | 1115 | ||
863 | fpos = vma->vm_pgoff; | 1116 | fpos = vma->vm_pgoff; |
864 | fpos <<= PAGE_SHIFT; | 1117 | fpos <<= PAGE_SHIFT; |
865 | 1118 | ||
866 | old_fs = get_fs(); | 1119 | old_fs = get_fs(); |
867 | set_fs(KERNEL_DS); | 1120 | set_fs(KERNEL_DS); |
868 | ret = vma->vm_file->f_op->read(vma->vm_file, base, len, &fpos); | 1121 | ret = vma->vm_file->f_op->read(vma->vm_file, base, rlen, &fpos); |
869 | set_fs(old_fs); | 1122 | set_fs(old_fs); |
870 | 1123 | ||
871 | if (ret < 0) | 1124 | if (ret < 0) |
872 | goto error_free; | 1125 | goto error_free; |
873 | 1126 | ||
874 | /* clear the last little bit */ | 1127 | /* clear the last little bit */ |
875 | if (ret < len) | 1128 | if (ret < rlen) |
876 | memset(base + ret, 0, len - ret); | 1129 | memset(base + ret, 0, rlen - ret); |
877 | 1130 | ||
878 | } else { | 1131 | } else { |
879 | /* if it's an anonymous mapping, then just clear it */ | 1132 | /* if it's an anonymous mapping, then just clear it */ |
880 | memset(base, 0, len); | 1133 | memset(base, 0, rlen); |
881 | } | 1134 | } |
882 | 1135 | ||
883 | return 0; | 1136 | return 0; |
884 | 1137 | ||
885 | error_free: | 1138 | error_free: |
886 | kfree(base); | 1139 | free_page_series(region->vm_start, region->vm_end); |
887 | vma->vm_start = 0; | 1140 | region->vm_start = vma->vm_start = 0; |
1141 | region->vm_end = vma->vm_end = 0; | ||
1142 | region->vm_top = 0; | ||
888 | return ret; | 1143 | return ret; |
889 | 1144 | ||
890 | enomem: | 1145 | enomem: |
891 | printk("Allocation of length %lu from process %d failed\n", | 1146 | printk("Allocation of length %lu from process %d failed\n", |
892 | len, current->pid); | 1147 | len, current->pid); |
893 | show_free_areas(); | 1148 | show_free_areas(); |
894 | return -ENOMEM; | 1149 | return -ENOMEM; |
895 | } | 1150 | } |
896 | 1151 | ||
897 | /* | 1152 | /* |
898 | * handle mapping creation for uClinux | 1153 | * handle mapping creation for uClinux |
899 | */ | 1154 | */ |
900 | unsigned long do_mmap_pgoff(struct file *file, | 1155 | unsigned long do_mmap_pgoff(struct file *file, |
901 | unsigned long addr, | 1156 | unsigned long addr, |
902 | unsigned long len, | 1157 | unsigned long len, |
903 | unsigned long prot, | 1158 | unsigned long prot, |
904 | unsigned long flags, | 1159 | unsigned long flags, |
905 | unsigned long pgoff) | 1160 | unsigned long pgoff) |
906 | { | 1161 | { |
907 | struct vm_list_struct *vml = NULL; | 1162 | struct vm_area_struct *vma; |
908 | struct vm_area_struct *vma = NULL; | 1163 | struct vm_region *region; |
909 | struct rb_node *rb; | 1164 | struct rb_node *rb; |
910 | unsigned long capabilities, vm_flags; | 1165 | unsigned long capabilities, vm_flags, result; |
911 | void *result; | ||
912 | int ret; | 1166 | int ret; |
913 | 1167 | ||
1168 | kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff); | ||
1169 | |||
914 | if (!(flags & MAP_FIXED)) | 1170 | if (!(flags & MAP_FIXED)) |
915 | addr = round_hint_to_min(addr); | 1171 | addr = round_hint_to_min(addr); |
916 | 1172 | ||
917 | /* decide whether we should attempt the mapping, and if so what sort of | 1173 | /* decide whether we should attempt the mapping, and if so what sort of |
918 | * mapping */ | 1174 | * mapping */ |
919 | ret = validate_mmap_request(file, addr, len, prot, flags, pgoff, | 1175 | ret = validate_mmap_request(file, addr, len, prot, flags, pgoff, |
920 | &capabilities); | 1176 | &capabilities); |
921 | if (ret < 0) | 1177 | if (ret < 0) { |
1178 | kleave(" = %d [val]", ret); | ||
922 | return ret; | 1179 | return ret; |
1180 | } | ||
923 | 1181 | ||
924 | /* we've determined that we can make the mapping, now translate what we | 1182 | /* we've determined that we can make the mapping, now translate what we |
925 | * now know into VMA flags */ | 1183 | * now know into VMA flags */ |
926 | vm_flags = determine_vm_flags(file, prot, flags, capabilities); | 1184 | vm_flags = determine_vm_flags(file, prot, flags, capabilities); |
927 | 1185 | ||
928 | /* we're going to need to record the mapping if it works */ | 1186 | /* we're going to need to record the mapping */ |
929 | vml = kzalloc(sizeof(struct vm_list_struct), GFP_KERNEL); | 1187 | region = kmem_cache_zalloc(vm_region_jar, GFP_KERNEL); |
930 | if (!vml) | 1188 | if (!region) |
931 | goto error_getting_vml; | 1189 | goto error_getting_region; |
932 | 1190 | ||
933 | down_write(&nommu_vma_sem); | 1191 | vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); |
1192 | if (!vma) | ||
1193 | goto error_getting_vma; | ||
934 | 1194 | ||
935 | /* if we want to share, we need to check for VMAs created by other | 1195 | atomic_set(®ion->vm_usage, 1); |
1196 | region->vm_flags = vm_flags; | ||
1197 | region->vm_pgoff = pgoff; | ||
1198 | |||
1199 | INIT_LIST_HEAD(&vma->anon_vma_node); | ||
1200 | vma->vm_flags = vm_flags; | ||
1201 | vma->vm_pgoff = pgoff; | ||
1202 | |||
1203 | if (file) { | ||
1204 | region->vm_file = file; | ||
1205 | get_file(file); | ||
1206 | vma->vm_file = file; | ||
1207 | get_file(file); | ||
1208 | if (vm_flags & VM_EXECUTABLE) { | ||
1209 | added_exe_file_vma(current->mm); | ||
1210 | vma->vm_mm = current->mm; | ||
1211 | } | ||
1212 | } | ||
1213 | |||
1214 | down_write(&nommu_region_sem); | ||
1215 | |||
1216 | /* if we want to share, we need to check for regions created by other | ||
936 | * mmap() calls that overlap with our proposed mapping | 1217 | * mmap() calls that overlap with our proposed mapping |
937 | * - we can only share with an exact match on most regular files | 1218 | * - we can only share with a superset match on most regular files |
938 | * - shared mappings on character devices and memory backed files are | 1219 | * - shared mappings on character devices and memory backed files are |
939 | * permitted to overlap inexactly as far as we are concerned for in | 1220 | * permitted to overlap inexactly as far as we are concerned for in |
940 | * these cases, sharing is handled in the driver or filesystem rather | 1221 | * these cases, sharing is handled in the driver or filesystem rather |
941 | * than here | 1222 | * than here |
942 | */ | 1223 | */ |
943 | if (vm_flags & VM_MAYSHARE) { | 1224 | if (vm_flags & VM_MAYSHARE) { |
944 | unsigned long pglen = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; | 1225 | struct vm_region *pregion; |
945 | unsigned long vmpglen; | 1226 | unsigned long pglen, rpglen, pgend, rpgend, start; |
946 | 1227 | ||
947 | /* suppress VMA sharing for shared regions */ | 1228 | pglen = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; |
948 | if (vm_flags & VM_SHARED && | 1229 | pgend = pgoff + pglen; |
949 | capabilities & BDI_CAP_MAP_DIRECT) | ||
950 | goto dont_share_VMAs; | ||
951 | 1230 | ||
952 | for (rb = rb_first(&nommu_vma_tree); rb; rb = rb_next(rb)) { | 1231 | for (rb = rb_first(&nommu_region_tree); rb; rb = rb_next(rb)) { |
953 | vma = rb_entry(rb, struct vm_area_struct, vm_rb); | 1232 | pregion = rb_entry(rb, struct vm_region, vm_rb); |
954 | 1233 | ||
955 | if (!(vma->vm_flags & VM_MAYSHARE)) | 1234 | if (!(pregion->vm_flags & VM_MAYSHARE)) |
956 | continue; | 1235 | continue; |
957 | 1236 | ||
958 | /* search for overlapping mappings on the same file */ | 1237 | /* search for overlapping mappings on the same file */ |
959 | if (vma->vm_file->f_path.dentry->d_inode != file->f_path.dentry->d_inode) | 1238 | if (pregion->vm_file->f_path.dentry->d_inode != |
1239 | file->f_path.dentry->d_inode) | ||
960 | continue; | 1240 | continue; |
961 | 1241 | ||
962 | if (vma->vm_pgoff >= pgoff + pglen) | 1242 | if (pregion->vm_pgoff >= pgend) |
963 | continue; | 1243 | continue; |
964 | 1244 | ||
965 | vmpglen = vma->vm_end - vma->vm_start + PAGE_SIZE - 1; | 1245 | rpglen = pregion->vm_end - pregion->vm_start; |
966 | vmpglen >>= PAGE_SHIFT; | 1246 | rpglen = (rpglen + PAGE_SIZE - 1) >> PAGE_SHIFT; |
967 | if (pgoff >= vma->vm_pgoff + vmpglen) | 1247 | rpgend = pregion->vm_pgoff + rpglen; |
1248 | if (pgoff >= rpgend) | ||
968 | continue; | 1249 | continue; |
969 | 1250 | ||
970 | /* handle inexactly overlapping matches between mappings */ | 1251 | /* handle inexactly overlapping matches between |
971 | if (vma->vm_pgoff != pgoff || vmpglen != pglen) { | 1252 | * mappings */ |
1253 | if ((pregion->vm_pgoff != pgoff || rpglen != pglen) && | ||
1254 | !(pgoff >= pregion->vm_pgoff && pgend <= rpgend)) { | ||
1255 | /* new mapping is not a subset of the region */ | ||
972 | if (!(capabilities & BDI_CAP_MAP_DIRECT)) | 1256 | if (!(capabilities & BDI_CAP_MAP_DIRECT)) |
973 | goto sharing_violation; | 1257 | goto sharing_violation; |
974 | continue; | 1258 | continue; |
975 | } | 1259 | } |
976 | 1260 | ||
977 | /* we've found a VMA we can share */ | 1261 | /* we've found a region we can share */ |
978 | atomic_inc(&vma->vm_usage); | 1262 | atomic_inc(&pregion->vm_usage); |
1263 | vma->vm_region = pregion; | ||
1264 | start = pregion->vm_start; | ||
1265 | start += (pgoff - pregion->vm_pgoff) << PAGE_SHIFT; | ||
1266 | vma->vm_start = start; | ||
1267 | vma->vm_end = start + len; | ||
979 | 1268 | ||
980 | vml->vma = vma; | 1269 | if (pregion->vm_flags & VM_MAPPED_COPY) { |
981 | result = (void *) vma->vm_start; | 1270 | kdebug("share copy"); |
982 | goto shared; | 1271 | vma->vm_flags |= VM_MAPPED_COPY; |
1272 | } else { | ||
1273 | kdebug("share mmap"); | ||
1274 | ret = do_mmap_shared_file(vma); | ||
1275 | if (ret < 0) { | ||
1276 | vma->vm_region = NULL; | ||
1277 | vma->vm_start = 0; | ||
1278 | vma->vm_end = 0; | ||
1279 | atomic_dec(&pregion->vm_usage); | ||
1280 | pregion = NULL; | ||
1281 | goto error_just_free; | ||
1282 | } | ||
1283 | } | ||
1284 | fput(region->vm_file); | ||
1285 | kmem_cache_free(vm_region_jar, region); | ||
1286 | region = pregion; | ||
1287 | result = start; | ||
1288 | goto share; | ||
983 | } | 1289 | } |
984 | 1290 | ||
985 | dont_share_VMAs: | ||
986 | vma = NULL; | ||
987 | |||
988 | /* obtain the address at which to make a shared mapping | 1291 | /* obtain the address at which to make a shared mapping |
989 | * - this is the hook for quasi-memory character devices to | 1292 | * - this is the hook for quasi-memory character devices to |
990 | * tell us the location of a shared mapping | 1293 | * tell us the location of a shared mapping |
991 | */ | 1294 | */ |
992 | if (file && file->f_op->get_unmapped_area) { | 1295 | if (file && file->f_op->get_unmapped_area) { |
993 | addr = file->f_op->get_unmapped_area(file, addr, len, | 1296 | addr = file->f_op->get_unmapped_area(file, addr, len, |
994 | pgoff, flags); | 1297 | pgoff, flags); |
995 | if (IS_ERR((void *) addr)) { | 1298 | if (IS_ERR((void *) addr)) { |
996 | ret = addr; | 1299 | ret = addr; |
997 | if (ret != (unsigned long) -ENOSYS) | 1300 | if (ret != (unsigned long) -ENOSYS) |
998 | goto error; | 1301 | goto error_just_free; |
999 | 1302 | ||
1000 | /* the driver refused to tell us where to site | 1303 | /* the driver refused to tell us where to site |
1001 | * the mapping so we'll have to attempt to copy | 1304 | * the mapping so we'll have to attempt to copy |
1002 | * it */ | 1305 | * it */ |
1003 | ret = (unsigned long) -ENODEV; | 1306 | ret = (unsigned long) -ENODEV; |
1004 | if (!(capabilities & BDI_CAP_MAP_COPY)) | 1307 | if (!(capabilities & BDI_CAP_MAP_COPY)) |
1005 | goto error; | 1308 | goto error_just_free; |
1006 | 1309 | ||
1007 | capabilities &= ~BDI_CAP_MAP_DIRECT; | 1310 | capabilities &= ~BDI_CAP_MAP_DIRECT; |
1311 | } else { | ||
1312 | vma->vm_start = region->vm_start = addr; | ||
1313 | vma->vm_end = region->vm_end = addr + len; | ||
1008 | } | 1314 | } |
1009 | } | 1315 | } |
1010 | } | 1316 | } |
1011 | 1317 | ||
1012 | /* we're going to need a VMA struct as well */ | 1318 | vma->vm_region = region; |
1013 | vma = kzalloc(sizeof(struct vm_area_struct), GFP_KERNEL); | ||
1014 | if (!vma) | ||
1015 | goto error_getting_vma; | ||
1016 | 1319 | ||
1017 | INIT_LIST_HEAD(&vma->anon_vma_node); | ||
1018 | atomic_set(&vma->vm_usage, 1); | ||
1019 | if (file) { | ||
1020 | get_file(file); | ||
1021 | if (vm_flags & VM_EXECUTABLE) { | ||
1022 | added_exe_file_vma(current->mm); | ||
1023 | vma->vm_mm = current->mm; | ||
1024 | } | ||
1025 | } | ||
1026 | vma->vm_file = file; | ||
1027 | vma->vm_flags = vm_flags; | ||
1028 | vma->vm_start = addr; | ||
1029 | vma->vm_end = addr + len; | ||
1030 | vma->vm_pgoff = pgoff; | ||
1031 | |||
1032 | vml->vma = vma; | ||
1033 | |||
1034 | /* set up the mapping */ | 1320 | /* set up the mapping */ |
1035 | if (file && vma->vm_flags & VM_SHARED) | 1321 | if (file && vma->vm_flags & VM_SHARED) |
1036 | ret = do_mmap_shared_file(vma, len); | 1322 | ret = do_mmap_shared_file(vma); |
1037 | else | 1323 | else |
1038 | ret = do_mmap_private(vma, len); | 1324 | ret = do_mmap_private(vma, region, len); |
1039 | if (ret < 0) | 1325 | if (ret < 0) |
1040 | goto error; | 1326 | goto error_put_region; |
1041 | 1327 | ||
1328 | add_nommu_region(region); | ||
1329 | |||
1042 | /* okay... we have a mapping; now we have to register it */ | 1330 | /* okay... we have a mapping; now we have to register it */ |
1043 | result = (void *) vma->vm_start; | 1331 | result = vma->vm_start; |
1044 | 1332 | ||
1045 | if (vma->vm_flags & VM_MAPPED_COPY) { | ||
1046 | realalloc += kobjsize(result); | ||
1047 | askedalloc += len; | ||
1048 | } | ||
1049 | |||
1050 | realalloc += kobjsize(vma); | ||
1051 | askedalloc += sizeof(*vma); | ||
1052 | |||
1053 | current->mm->total_vm += len >> PAGE_SHIFT; | 1333 | current->mm->total_vm += len >> PAGE_SHIFT; |
1054 | 1334 | ||
1055 | add_nommu_vma(vma); | 1335 | share: |
1336 | add_vma_to_mm(current->mm, vma); | ||
1056 | 1337 | ||
1057 | shared: | 1338 | up_write(&nommu_region_sem); |
1058 | realalloc += kobjsize(vml); | ||
1059 | askedalloc += sizeof(*vml); | ||
1060 | 1339 | ||
1061 | add_vma_to_mm(current->mm, vml); | ||
1062 | |||
1063 | up_write(&nommu_vma_sem); | ||
1064 | |||
1065 | if (prot & PROT_EXEC) | 1340 | if (prot & PROT_EXEC) |
1066 | flush_icache_range((unsigned long) result, | 1341 | flush_icache_range(result, result + len); |
1067 | (unsigned long) result + len); | ||
1068 | 1342 | ||
1069 | #ifdef DEBUG | 1343 | kleave(" = %lx", result); |
1070 | printk("do_mmap:\n"); | 1344 | return result; |
1071 | show_process_blocks(); | ||
1072 | #endif | ||
1073 | 1345 | ||
1074 | return (unsigned long) result; | 1346 | error_put_region: |
1075 | 1347 | __put_nommu_region(region); | |
1076 | error: | ||
1077 | up_write(&nommu_vma_sem); | ||
1078 | kfree(vml); | ||
1079 | if (vma) { | 1348 | if (vma) { |
1080 | if (vma->vm_file) { | 1349 | if (vma->vm_file) { |
1081 | fput(vma->vm_file); | 1350 | fput(vma->vm_file); |
1082 | if (vma->vm_flags & VM_EXECUTABLE) | 1351 | if (vma->vm_flags & VM_EXECUTABLE) |
1083 | removed_exe_file_vma(vma->vm_mm); | 1352 | removed_exe_file_vma(vma->vm_mm); |
1084 | } | 1353 | } |
1085 | kfree(vma); | 1354 | kmem_cache_free(vm_area_cachep, vma); |
1086 | } | 1355 | } |
1356 | kleave(" = %d [pr]", ret); | ||
1087 | return ret; | 1357 | return ret; |
1088 | 1358 | ||
1089 | sharing_violation: | 1359 | error_just_free: |
1090 | up_write(&nommu_vma_sem); | 1360 | up_write(&nommu_region_sem); |
1091 | printk("Attempt to share mismatched mappings\n"); | 1361 | error: |
1092 | kfree(vml); | 1362 | fput(region->vm_file); |
1093 | return -EINVAL; | 1363 | kmem_cache_free(vm_region_jar, region); |
1364 | fput(vma->vm_file); | ||
1365 | if (vma->vm_flags & VM_EXECUTABLE) | ||
1366 | removed_exe_file_vma(vma->vm_mm); | ||
1367 | kmem_cache_free(vm_area_cachep, vma); | ||
1368 | kleave(" = %d", ret); | ||
1369 | return ret; | ||
1094 | 1370 | ||
1095 | error_getting_vma: | 1371 | sharing_violation: |
1096 | up_write(&nommu_vma_sem); | 1372 | up_write(&nommu_region_sem); |
1097 | kfree(vml); | 1373 | printk(KERN_WARNING "Attempt to share mismatched mappings\n"); |
1098 | printk("Allocation of vma for %lu byte allocation from process %d failed\n", | 1374 | ret = -EINVAL; |
1375 | goto error; | ||
1376 | |||
1377 | error_getting_vma: | ||
1378 | kmem_cache_free(vm_region_jar, region); | ||
1379 | printk(KERN_WARNING "Allocation of vma for %lu byte allocation" | ||
1380 | " from process %d failed\n", | ||
1099 | len, current->pid); | 1381 | len, current->pid); |
1100 | show_free_areas(); | 1382 | show_free_areas(); |
1101 | return -ENOMEM; | 1383 | return -ENOMEM; |
1102 | 1384 | ||
1103 | error_getting_vml: | 1385 | error_getting_region: |
1104 | printk("Allocation of vml for %lu byte allocation from process %d failed\n", | 1386 | printk(KERN_WARNING "Allocation of vm region for %lu byte allocation" |
1387 | " from process %d failed\n", | ||
1105 | len, current->pid); | 1388 | len, current->pid); |
1106 | show_free_areas(); | 1389 | show_free_areas(); |
1107 | return -ENOMEM; | 1390 | return -ENOMEM; |
1108 | } | 1391 | } |
1109 | EXPORT_SYMBOL(do_mmap_pgoff); | 1392 | EXPORT_SYMBOL(do_mmap_pgoff); |
1110 | 1393 | ||
1111 | /* | 1394 | /* |
1112 | * handle mapping disposal for uClinux | 1395 | * split a vma into two pieces at address 'addr', a new vma is allocated either |
1396 | * for the first part or the tail. | ||
1113 | */ | 1397 | */ |
1114 | static void put_vma(struct mm_struct *mm, struct vm_area_struct *vma) | 1398 | int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, |
1399 | unsigned long addr, int new_below) | ||
1115 | { | 1400 | { |
1116 | if (vma) { | 1401 | struct vm_area_struct *new; |
1117 | down_write(&nommu_vma_sem); | 1402 | struct vm_region *region; |
1403 | unsigned long npages; | ||
1118 | 1404 | ||
1119 | if (atomic_dec_and_test(&vma->vm_usage)) { | 1405 | kenter(""); |
1120 | delete_nommu_vma(vma); | ||
1121 | 1406 | ||
1122 | if (vma->vm_ops && vma->vm_ops->close) | 1407 | /* we're only permitted to split anonymous regions that have a single |
1123 | vma->vm_ops->close(vma); | 1408 | * owner */ |
1409 | if (vma->vm_file || | ||
1410 | atomic_read(&vma->vm_region->vm_usage) != 1) | ||
1411 | return -ENOMEM; | ||
1124 | 1412 | ||
1125 | /* IO memory and memory shared directly out of the pagecache from | 1413 | if (mm->map_count >= sysctl_max_map_count) |
1126 | * ramfs/tmpfs mustn't be released here */ | 1414 | return -ENOMEM; |
1127 | if (vma->vm_flags & VM_MAPPED_COPY) { | ||
1128 | realalloc -= kobjsize((void *) vma->vm_start); | ||
1129 | askedalloc -= vma->vm_end - vma->vm_start; | ||
1130 | kfree((void *) vma->vm_start); | ||
1131 | } | ||
1132 | 1415 | ||
1133 | realalloc -= kobjsize(vma); | 1416 | region = kmem_cache_alloc(vm_region_jar, GFP_KERNEL); |
1134 | askedalloc -= sizeof(*vma); | 1417 | if (!region) |
1418 | return -ENOMEM; | ||
1135 | 1419 | ||
1136 | if (vma->vm_file) { | 1420 | new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); |
1137 | fput(vma->vm_file); | 1421 | if (!new) { |
1138 | if (vma->vm_flags & VM_EXECUTABLE) | 1422 | kmem_cache_free(vm_region_jar, region); |
1139 | removed_exe_file_vma(mm); | 1423 | return -ENOMEM; |
1140 | } | 1424 | } |
1141 | kfree(vma); | ||
1142 | } | ||
1143 | 1425 | ||
1144 | up_write(&nommu_vma_sem); | 1426 | /* most fields are the same, copy all, and then fixup */ |
1427 | *new = *vma; | ||
1428 | *region = *vma->vm_region; | ||
1429 | new->vm_region = region; | ||
1430 | |||
1431 | npages = (addr - vma->vm_start) >> PAGE_SHIFT; | ||
1432 | |||
1433 | if (new_below) { | ||
1434 | region->vm_top = region->vm_end = new->vm_end = addr; | ||
1435 | } else { | ||
1436 | region->vm_start = new->vm_start = addr; | ||
1437 | region->vm_pgoff = new->vm_pgoff += npages; | ||
1145 | } | 1438 | } |
1439 | |||
1440 | if (new->vm_ops && new->vm_ops->open) | ||
1441 | new->vm_ops->open(new); | ||
1442 | |||
1443 | delete_vma_from_mm(vma); | ||
1444 | down_write(&nommu_region_sem); | ||
1445 | delete_nommu_region(vma->vm_region); | ||
1446 | if (new_below) { | ||
1447 | vma->vm_region->vm_start = vma->vm_start = addr; | ||
1448 | vma->vm_region->vm_pgoff = vma->vm_pgoff += npages; | ||
1449 | } else { | ||
1450 | vma->vm_region->vm_end = vma->vm_end = addr; | ||
1451 | vma->vm_region->vm_top = addr; | ||
1452 | } | ||
1453 | add_nommu_region(vma->vm_region); | ||
1454 | add_nommu_region(new->vm_region); | ||
1455 | up_write(&nommu_region_sem); | ||
1456 | add_vma_to_mm(mm, vma); | ||
1457 | add_vma_to_mm(mm, new); | ||
1458 | return 0; | ||
1146 | } | 1459 | } |
1147 | 1460 | ||
1148 | /* | 1461 | /* |
1149 | * release a mapping | 1462 | * shrink a VMA by removing the specified chunk from either the beginning or |
1150 | * - under NOMMU conditions the parameters must match exactly to the mapping to | 1463 | * the end |
1151 | * be removed | ||
1152 | */ | 1464 | */ |
1153 | int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len) | 1465 | static int shrink_vma(struct mm_struct *mm, |
1466 | struct vm_area_struct *vma, | ||
1467 | unsigned long from, unsigned long to) | ||
1154 | { | 1468 | { |
1155 | struct vm_list_struct *vml, **parent; | 1469 | struct vm_region *region; |
1156 | unsigned long end = addr + len; | ||
1157 | 1470 | ||
1158 | #ifdef DEBUG | 1471 | kenter(""); |
1159 | printk("do_munmap:\n"); | ||
1160 | #endif | ||
1161 | 1472 | ||
1162 | for (parent = &mm->context.vmlist; *parent; parent = &(*parent)->next) { | 1473 | /* adjust the VMA's pointers, which may reposition it in the MM's tree |
1163 | if ((*parent)->vma->vm_start > addr) | 1474 | * and list */ |
1164 | break; | 1475 | delete_vma_from_mm(vma); |
1165 | if ((*parent)->vma->vm_start == addr && | 1476 | if (from > vma->vm_start) |
1166 | ((len == 0) || ((*parent)->vma->vm_end == end))) | 1477 | vma->vm_end = from; |
1167 | goto found; | 1478 | else |
1479 | vma->vm_start = to; | ||
1480 | add_vma_to_mm(mm, vma); | ||
1481 | |||
1482 | /* cut the backing region down to size */ | ||
1483 | region = vma->vm_region; | ||
1484 | BUG_ON(atomic_read(®ion->vm_usage) != 1); | ||
1485 | |||
1486 | down_write(&nommu_region_sem); | ||
1487 | delete_nommu_region(region); | ||
1488 | if (from > region->vm_start) { | ||
1489 | to = region->vm_top; | ||
1490 | region->vm_top = region->vm_end = from; | ||
1491 | } else { | ||
1492 | region->vm_start = to; | ||
1168 | } | 1493 | } |
1494 | add_nommu_region(region); | ||
1495 | up_write(&nommu_region_sem); | ||
1169 | 1496 | ||
1170 | printk("munmap of non-mmaped memory by process %d (%s): %p\n", | 1497 | free_page_series(from, to); |
1171 | current->pid, current->comm, (void *) addr); | 1498 | return 0; |
1172 | return -EINVAL; | 1499 | } |
1173 | 1500 | ||
1174 | found: | 1501 | /* |
1175 | vml = *parent; | 1502 | * release a mapping |
1503 | * - under NOMMU conditions the chunk to be unmapped must be backed by a single | ||
1504 | * VMA, though it need not cover the whole VMA | ||
1505 | */ | ||
1506 | int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) | ||
1507 | { | ||
1508 | struct vm_area_struct *vma; | ||
1509 | struct rb_node *rb; | ||
1510 | unsigned long end = start + len; | ||
1511 | int ret; | ||
1176 | 1512 | ||
1177 | put_vma(mm, vml->vma); | 1513 | kenter(",%lx,%zx", start, len); |
1178 | 1514 | ||
1179 | *parent = vml->next; | 1515 | if (len == 0) |
1180 | realalloc -= kobjsize(vml); | 1516 | return -EINVAL; |
1181 | askedalloc -= sizeof(*vml); | ||
1182 | kfree(vml); | ||
1183 | 1517 | ||
1184 | update_hiwater_vm(mm); | 1518 | /* find the first potentially overlapping VMA */ |
1185 | mm->total_vm -= len >> PAGE_SHIFT; | 1519 | vma = find_vma(mm, start); |
1520 | if (!vma) { | ||
1521 | printk(KERN_WARNING | ||
1522 | "munmap of memory not mmapped by process %d (%s):" | ||
1523 | " 0x%lx-0x%lx\n", | ||
1524 | current->pid, current->comm, start, start + len - 1); | ||
1525 | return -EINVAL; | ||
1526 | } | ||
1186 | 1527 | ||
1187 | #ifdef DEBUG | 1528 | /* we're allowed to split an anonymous VMA but not a file-backed one */ |
1188 | show_process_blocks(); | 1529 | if (vma->vm_file) { |
1189 | #endif | 1530 | do { |
1531 | if (start > vma->vm_start) { | ||
1532 | kleave(" = -EINVAL [miss]"); | ||
1533 | return -EINVAL; | ||
1534 | } | ||
1535 | if (end == vma->vm_end) | ||
1536 | goto erase_whole_vma; | ||
1537 | rb = rb_next(&vma->vm_rb); | ||
1538 | vma = rb_entry(rb, struct vm_area_struct, vm_rb); | ||
1539 | } while (rb); | ||
1540 | kleave(" = -EINVAL [split file]"); | ||
1541 | return -EINVAL; | ||
1542 | } else { | ||
1543 | /* the chunk must be a subset of the VMA found */ | ||
1544 | if (start == vma->vm_start && end == vma->vm_end) | ||
1545 | goto erase_whole_vma; | ||
1546 | if (start < vma->vm_start || end > vma->vm_end) { | ||
1547 | kleave(" = -EINVAL [superset]"); | ||
1548 | return -EINVAL; | ||
1549 | } | ||
1550 | if (start & ~PAGE_MASK) { | ||
1551 | kleave(" = -EINVAL [unaligned start]"); | ||
1552 | return -EINVAL; | ||
1553 | } | ||
1554 | if (end != vma->vm_end && end & ~PAGE_MASK) { | ||
1555 | kleave(" = -EINVAL [unaligned split]"); | ||
1556 | return -EINVAL; | ||
1557 | } | ||
1558 | if (start != vma->vm_start && end != vma->vm_end) { | ||
1559 | ret = split_vma(mm, vma, start, 1); | ||
1560 | if (ret < 0) { | ||
1561 | kleave(" = %d [split]", ret); | ||
1562 | return ret; | ||
1563 | } | ||
1564 | } | ||
1565 | return shrink_vma(mm, vma, start, end); | ||
1566 | } | ||
1190 | 1567 | ||
1568 | erase_whole_vma: | ||
1569 | delete_vma_from_mm(vma); | ||
1570 | delete_vma(mm, vma); | ||
1571 | kleave(" = 0"); | ||
1191 | return 0; | 1572 | return 0; |
1192 | } | 1573 | } |
1193 | EXPORT_SYMBOL(do_munmap); | 1574 | EXPORT_SYMBOL(do_munmap); |
1194 | 1575 | ||
1195 | asmlinkage long sys_munmap(unsigned long addr, size_t len) | 1576 | asmlinkage long sys_munmap(unsigned long addr, size_t len) |
1196 | { | 1577 | { |
1197 | int ret; | 1578 | int ret; |
1198 | struct mm_struct *mm = current->mm; | 1579 | struct mm_struct *mm = current->mm; |
1199 | 1580 | ||
1200 | down_write(&mm->mmap_sem); | 1581 | down_write(&mm->mmap_sem); |
1201 | ret = do_munmap(mm, addr, len); | 1582 | ret = do_munmap(mm, addr, len); |
1202 | up_write(&mm->mmap_sem); | 1583 | up_write(&mm->mmap_sem); |
1203 | return ret; | 1584 | return ret; |
1204 | } | 1585 | } |
1205 | 1586 | ||
1206 | /* | 1587 | /* |
1207 | * Release all mappings | 1588 | * release all the mappings made in a process's VM space |
1208 | */ | 1589 | */ |
1209 | void exit_mmap(struct mm_struct * mm) | 1590 | void exit_mmap(struct mm_struct *mm) |
1210 | { | 1591 | { |
1211 | struct vm_list_struct *tmp; | 1592 | struct vm_area_struct *vma; |
1212 | 1593 | ||
1213 | if (mm) { | 1594 | if (!mm) |
1214 | #ifdef DEBUG | 1595 | return; |
1215 | printk("Exit_mmap:\n"); | ||
1216 | #endif | ||
1217 | 1596 | ||
1218 | mm->total_vm = 0; | 1597 | kenter(""); |
1219 | 1598 | ||
1220 | while ((tmp = mm->context.vmlist)) { | 1599 | mm->total_vm = 0; |
1221 | mm->context.vmlist = tmp->next; | ||
1222 | put_vma(mm, tmp->vma); | ||
1223 | 1600 | ||
1224 | realalloc -= kobjsize(tmp); | 1601 | while ((vma = mm->mmap)) { |
1225 | askedalloc -= sizeof(*tmp); | 1602 | mm->mmap = vma->vm_next; |
1226 | kfree(tmp); | 1603 | delete_vma_from_mm(vma); |
1227 | } | 1604 | delete_vma(mm, vma); |
1228 | |||
1229 | #ifdef DEBUG | ||
1230 | show_process_blocks(); | ||
1231 | #endif | ||
1232 | } | 1605 | } |
1606 | |||
1607 | kleave(""); | ||
1233 | } | 1608 | } |
1234 | 1609 | ||
1235 | unsigned long do_brk(unsigned long addr, unsigned long len) | 1610 | unsigned long do_brk(unsigned long addr, unsigned long len) |
1236 | { | 1611 | { |
1237 | return -ENOMEM; | 1612 | return -ENOMEM; |
1238 | } | 1613 | } |
1239 | 1614 | ||
1240 | /* | 1615 | /* |
1241 | * expand (or shrink) an existing mapping, potentially moving it at the same | 1616 | * expand (or shrink) an existing mapping, potentially moving it at the same |
1242 | * time (controlled by the MREMAP_MAYMOVE flag and available VM space) | 1617 | * time (controlled by the MREMAP_MAYMOVE flag and available VM space) |
1243 | * | 1618 | * |
1244 | * under NOMMU conditions, we only permit changing a mapping's size, and only | 1619 | * under NOMMU conditions, we only permit changing a mapping's size, and only |
 * as long as it stays within the region allocated by do_mmap_private() and the
 * block is not shareable
 *
 * MREMAP_FIXED is not supported under NOMMU conditions
 */
unsigned long do_mremap(unsigned long addr,
			unsigned long old_len, unsigned long new_len,
			unsigned long flags, unsigned long new_addr)
{
	struct vm_area_struct *vma;

	/* insanity checks first */
	if (old_len == 0 || new_len == 0)
		return (unsigned long) -EINVAL;

	/* the mapping being resized must be page-aligned */
	if (addr & ~PAGE_MASK)
		return -EINVAL;

	/* moving the mapping is not supported: a "fixed" new address must be
	 * the current address */
	if (flags & MREMAP_FIXED && new_addr != addr)
		return (unsigned long) -EINVAL;

	/* NOTE(review): presumably find_vma_exact() requires a VMA matching
	 * both addr and old_len - confirm against its definition */
	vma = find_vma_exact(current->mm, addr, old_len);
	if (!vma)
		return (unsigned long) -EINVAL;

	if (vma->vm_end != vma->vm_start + old_len)
		return (unsigned long) -EFAULT;

	/* shared mappings may not be resized in place */
	if (vma->vm_flags & VM_MAYSHARE)
		return (unsigned long) -EPERM;

	/* the new length must still fit within the backing region */
	if (new_len > vma->vm_region->vm_end - vma->vm_region->vm_start)
		return (unsigned long) -ENOMEM;

	/* all checks complete - do it */
	vma->vm_end = vma->vm_start + new_len;
	return vma->vm_start;
}
EXPORT_SYMBOL(do_mremap);
1285 | 1659 | ||
1286 | asmlinkage unsigned long sys_mremap(unsigned long addr, | 1660 | asmlinkage |
1287 | unsigned long old_len, unsigned long new_len, | 1661 | unsigned long sys_mremap(unsigned long addr, |
1288 | unsigned long flags, unsigned long new_addr) | 1662 | unsigned long old_len, unsigned long new_len, |
1663 | unsigned long flags, unsigned long new_addr) | ||
1289 | { | 1664 | { |
1290 | unsigned long ret; | 1665 | unsigned long ret; |
1291 | 1666 | ||
1292 | down_write(¤t->mm->mmap_sem); | 1667 | down_write(¤t->mm->mmap_sem); |
1293 | ret = do_mremap(addr, old_len, new_len, flags, new_addr); | 1668 | ret = do_mremap(addr, old_len, new_len, flags, new_addr); |
1294 | up_write(¤t->mm->mmap_sem); | 1669 | up_write(¤t->mm->mmap_sem); |
1295 | return ret; | 1670 | return ret; |
1296 | } | 1671 | } |
1297 | 1672 | ||
/*
 * Look up the page backing a user address.  Under NOMMU there are no page
 * tables to walk, so this always reports "no page" regardless of the VMA,
 * address or flags supplied.
 */
struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
			unsigned int foll_flags)
{
	return NULL;
}
1303 | 1678 | ||
/*
 * NOMMU version of remap_pfn_range(): with no page tables there is nothing
 * to remap, so the VMA is simply pointed straight at the physical address
 * encoded in its page offset.  The from/to/size/prot arguments are accepted
 * for API compatibility but not used here.
 */
int remap_pfn_range(struct vm_area_struct *vma, unsigned long from,
		unsigned long to, unsigned long size, pgprot_t prot)
{
	vma->vm_start = vma->vm_pgoff << PAGE_SHIFT;
	return 0;
}
EXPORT_SYMBOL(remap_pfn_range);
1311 | 1686 | ||
1312 | int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, | 1687 | int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, |
1313 | unsigned long pgoff) | 1688 | unsigned long pgoff) |
1314 | { | 1689 | { |
1315 | unsigned int size = vma->vm_end - vma->vm_start; | 1690 | unsigned int size = vma->vm_end - vma->vm_start; |
1316 | 1691 | ||
1317 | if (!(vma->vm_flags & VM_USERMAP)) | 1692 | if (!(vma->vm_flags & VM_USERMAP)) |
1318 | return -EINVAL; | 1693 | return -EINVAL; |
1319 | 1694 | ||
1320 | vma->vm_start = (unsigned long)(addr + (pgoff << PAGE_SHIFT)); | 1695 | vma->vm_start = (unsigned long)(addr + (pgoff << PAGE_SHIFT)); |
1321 | vma->vm_end = vma->vm_start + size; | 1696 | vma->vm_end = vma->vm_start + size; |
1322 | 1697 | ||
1323 | return 0; | 1698 | return 0; |
1324 | } | 1699 | } |
1325 | EXPORT_SYMBOL(remap_vmalloc_range); | 1700 | EXPORT_SYMBOL(remap_vmalloc_range); |
1326 | 1701 | ||
/* stubbed out under NOMMU: nothing to do here */
void swap_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
{
}
1330 | 1705 | ||
/*
 * Default unmapped-area allocator: the core cannot choose an address under
 * NOMMU, so always fail with -ENOMEM.  A file/device that can service the
 * mapping supplies its own helper via f_op->get_unmapped_area instead (see
 * get_unmapped_area() below).
 */
unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr,
	unsigned long len, unsigned long pgoff, unsigned long flags)
{
	return -ENOMEM;
}
1336 | 1711 | ||
/* stubbed out under NOMMU: no per-arch unmap bookkeeping is kept */
void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
{
}
1340 | 1715 | ||
/*
 * Stubbed out under NOMMU: there are no page tables from which mappings of
 * a file range could be torn down, so this does nothing.
 */
void unmap_mapping_range(struct address_space *mapping,
			 loff_t const holebegin, loff_t const holelen,
			 int even_cows)
{
}
EXPORT_SYMBOL(unmap_mapping_range);
1347 | 1722 | ||
1348 | /* | 1723 | /* |
1349 | * ask for an unmapped area at which to create a mapping on a file | 1724 | * ask for an unmapped area at which to create a mapping on a file |
1350 | */ | 1725 | */ |
1351 | unsigned long get_unmapped_area(struct file *file, unsigned long addr, | 1726 | unsigned long get_unmapped_area(struct file *file, unsigned long addr, |
1352 | unsigned long len, unsigned long pgoff, | 1727 | unsigned long len, unsigned long pgoff, |
1353 | unsigned long flags) | 1728 | unsigned long flags) |
1354 | { | 1729 | { |
1355 | unsigned long (*get_area)(struct file *, unsigned long, unsigned long, | 1730 | unsigned long (*get_area)(struct file *, unsigned long, unsigned long, |
1356 | unsigned long, unsigned long); | 1731 | unsigned long, unsigned long); |
1357 | 1732 | ||
1358 | get_area = current->mm->get_unmapped_area; | 1733 | get_area = current->mm->get_unmapped_area; |
1359 | if (file && file->f_op && file->f_op->get_unmapped_area) | 1734 | if (file && file->f_op && file->f_op->get_unmapped_area) |
1360 | get_area = file->f_op->get_unmapped_area; | 1735 | get_area = file->f_op->get_unmapped_area; |
1361 | 1736 | ||
1362 | if (!get_area) | 1737 | if (!get_area) |
1363 | return -ENOSYS; | 1738 | return -ENOSYS; |
1364 | 1739 | ||
1365 | return get_area(file, addr, len, pgoff, flags); | 1740 | return get_area(file, addr, len, pgoff, flags); |
1366 | } | 1741 | } |
1367 | EXPORT_SYMBOL(get_unmapped_area); | 1742 | EXPORT_SYMBOL(get_unmapped_area); |
1368 | 1743 | ||
1369 | /* | 1744 | /* |
1370 | * Check that a process has enough memory to allocate a new virtual | 1745 | * Check that a process has enough memory to allocate a new virtual |
1371 | * mapping. 0 means there is enough memory for the allocation to | 1746 | * mapping. 0 means there is enough memory for the allocation to |
1372 | * succeed and -ENOMEM implies there is not. | 1747 | * succeed and -ENOMEM implies there is not. |
1373 | * | 1748 | * |
1374 | * We currently support three overcommit policies, which are set via the | 1749 | * We currently support three overcommit policies, which are set via the |
1375 | * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting | 1750 | * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting |
1376 | * | 1751 | * |
1377 | * Strict overcommit modes added 2002 Feb 26 by Alan Cox. | 1752 | * Strict overcommit modes added 2002 Feb 26 by Alan Cox. |
1378 | * Additional code 2002 Jul 20 by Robert Love. | 1753 | * Additional code 2002 Jul 20 by Robert Love. |
1379 | * | 1754 | * |
1380 | * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise. | 1755 | * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise. |
1381 | * | 1756 | * |
1382 | * Note this is a helper function intended to be used by LSMs which | 1757 | * Note this is a helper function intended to be used by LSMs which |
1383 | * wish to use this logic. | 1758 | * wish to use this logic. |
1384 | */ | 1759 | */ |
1385 | int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) | 1760 | int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) |
1386 | { | 1761 | { |
1387 | unsigned long free, allowed; | 1762 | unsigned long free, allowed; |
1388 | 1763 | ||
1389 | vm_acct_memory(pages); | 1764 | vm_acct_memory(pages); |
1390 | 1765 | ||
1391 | /* | 1766 | /* |
1392 | * Sometimes we want to use more memory than we have | 1767 | * Sometimes we want to use more memory than we have |
1393 | */ | 1768 | */ |
1394 | if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS) | 1769 | if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS) |
1395 | return 0; | 1770 | return 0; |
1396 | 1771 | ||
1397 | if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { | 1772 | if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { |
1398 | unsigned long n; | 1773 | unsigned long n; |
1399 | 1774 | ||
1400 | free = global_page_state(NR_FILE_PAGES); | 1775 | free = global_page_state(NR_FILE_PAGES); |
1401 | free += nr_swap_pages; | 1776 | free += nr_swap_pages; |
1402 | 1777 | ||
1403 | /* | 1778 | /* |
1404 | * Any slabs which are created with the | 1779 | * Any slabs which are created with the |
1405 | * SLAB_RECLAIM_ACCOUNT flag claim to have contents | 1780 | * SLAB_RECLAIM_ACCOUNT flag claim to have contents |
1406 | * which are reclaimable, under pressure. The dentry | 1781 | * which are reclaimable, under pressure. The dentry |
1407 | * cache and most inode caches should fall into this | 1782 | * cache and most inode caches should fall into this |
1408 | */ | 1783 | */ |
1409 | free += global_page_state(NR_SLAB_RECLAIMABLE); | 1784 | free += global_page_state(NR_SLAB_RECLAIMABLE); |
1410 | 1785 | ||
1411 | /* | 1786 | /* |
1412 | * Leave the last 3% for root | 1787 | * Leave the last 3% for root |
1413 | */ | 1788 | */ |
1414 | if (!cap_sys_admin) | 1789 | if (!cap_sys_admin) |
1415 | free -= free / 32; | 1790 | free -= free / 32; |
1416 | 1791 | ||
1417 | if (free > pages) | 1792 | if (free > pages) |
1418 | return 0; | 1793 | return 0; |
1419 | 1794 | ||
1420 | /* | 1795 | /* |
1421 | * nr_free_pages() is very expensive on large systems, | 1796 | * nr_free_pages() is very expensive on large systems, |