Commit 97a41e26124330e41aa10ef88cd1711bc3d17460

Authored by Adrian Bunk
Committed by Linus Torvalds
1 parent b7b4d7a466

[PATCH] kernel/: small cleanups

This patch contains the following cleanups:
- make needlessly global functions static
- every file should include the headers containing the prototypes for
  it's global functions

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Acked-by: "Paul E. McKenney" <paulmck@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

Showing 4 changed files with 5 additions and 2 deletions Inline Diff

1 /* audit.c -- Auditing support 1 /* audit.c -- Auditing support
2 * Gateway between the kernel (e.g., selinux) and the user-space audit daemon. 2 * Gateway between the kernel (e.g., selinux) and the user-space audit daemon.
3 * System-call specific features have moved to auditsc.c 3 * System-call specific features have moved to auditsc.c
4 * 4 *
5 * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. 5 * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
6 * All Rights Reserved. 6 * All Rights Reserved.
7 * 7 *
8 * This program is free software; you can redistribute it and/or modify 8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by 9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or 10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version. 11 * (at your option) any later version.
12 * 12 *
13 * This program is distributed in the hope that it will be useful, 13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details. 16 * GNU General Public License for more details.
17 * 17 *
18 * You should have received a copy of the GNU General Public License 18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software 19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 * 21 *
22 * Written by Rickard E. (Rik) Faith <faith@redhat.com> 22 * Written by Rickard E. (Rik) Faith <faith@redhat.com>
23 * 23 *
24 * Goals: 1) Integrate fully with SELinux. 24 * Goals: 1) Integrate fully with SELinux.
25 * 2) Minimal run-time overhead: 25 * 2) Minimal run-time overhead:
26 * a) Minimal when syscall auditing is disabled (audit_enable=0). 26 * a) Minimal when syscall auditing is disabled (audit_enable=0).
27 * b) Small when syscall auditing is enabled and no audit record 27 * b) Small when syscall auditing is enabled and no audit record
28 * is generated (defer as much work as possible to record 28 * is generated (defer as much work as possible to record
29 * generation time): 29 * generation time):
30 * i) context is allocated, 30 * i) context is allocated,
31 * ii) names from getname are stored without a copy, and 31 * ii) names from getname are stored without a copy, and
32 * iii) inode information stored from path_lookup. 32 * iii) inode information stored from path_lookup.
33 * 3) Ability to disable syscall auditing at boot time (audit=0). 33 * 3) Ability to disable syscall auditing at boot time (audit=0).
34 * 4) Usable by other parts of the kernel (if audit_log* is called, 34 * 4) Usable by other parts of the kernel (if audit_log* is called,
35 * then a syscall record will be generated automatically for the 35 * then a syscall record will be generated automatically for the
36 * current syscall). 36 * current syscall).
37 * 5) Netlink interface to user-space. 37 * 5) Netlink interface to user-space.
38 * 6) Support low-overhead kernel-based filtering to minimize the 38 * 6) Support low-overhead kernel-based filtering to minimize the
39 * information that must be passed to user-space. 39 * information that must be passed to user-space.
40 * 40 *
41 * Example user-space utilities: http://people.redhat.com/sgrubb/audit/ 41 * Example user-space utilities: http://people.redhat.com/sgrubb/audit/
42 */ 42 */
43 43
44 #include <linux/init.h> 44 #include <linux/init.h>
45 #include <asm/atomic.h> 45 #include <asm/atomic.h>
46 #include <asm/types.h> 46 #include <asm/types.h>
47 #include <linux/mm.h> 47 #include <linux/mm.h>
48 #include <linux/module.h> 48 #include <linux/module.h>
49 #include <linux/err.h> 49 #include <linux/err.h>
50 #include <linux/kthread.h> 50 #include <linux/kthread.h>
51 51
52 #include <linux/audit.h> 52 #include <linux/audit.h>
53 53
54 #include <net/sock.h> 54 #include <net/sock.h>
55 #include <linux/skbuff.h> 55 #include <linux/skbuff.h>
56 #include <linux/netlink.h> 56 #include <linux/netlink.h>
57 57
58 /* No auditing will take place until audit_initialized != 0. 58 /* No auditing will take place until audit_initialized != 0.
59 * (Initialization happens after skb_init is called.) */ 59 * (Initialization happens after skb_init is called.) */
60 static int audit_initialized; 60 static int audit_initialized;
61 61
62 /* No syscall auditing will take place unless audit_enabled != 0. */ 62 /* No syscall auditing will take place unless audit_enabled != 0. */
63 int audit_enabled; 63 int audit_enabled;
64 64
65 /* Default state when kernel boots without any parameters. */ 65 /* Default state when kernel boots without any parameters. */
66 static int audit_default; 66 static int audit_default;
67 67
68 /* If auditing cannot proceed, audit_failure selects what happens. */ 68 /* If auditing cannot proceed, audit_failure selects what happens. */
69 static int audit_failure = AUDIT_FAIL_PRINTK; 69 static int audit_failure = AUDIT_FAIL_PRINTK;
70 70
71 /* If audit records are to be written to the netlink socket, audit_pid 71 /* If audit records are to be written to the netlink socket, audit_pid
72 * contains the (non-zero) pid. */ 72 * contains the (non-zero) pid. */
73 int audit_pid; 73 int audit_pid;
74 74
75 /* If audit_limit is non-zero, limit the rate of sending audit records 75 /* If audit_limit is non-zero, limit the rate of sending audit records
76 * to that number per second. This prevents DoS attacks, but results in 76 * to that number per second. This prevents DoS attacks, but results in
77 * audit records being dropped. */ 77 * audit records being dropped. */
78 static int audit_rate_limit; 78 static int audit_rate_limit;
79 79
80 /* Number of outstanding audit_buffers allowed. */ 80 /* Number of outstanding audit_buffers allowed. */
81 static int audit_backlog_limit = 64; 81 static int audit_backlog_limit = 64;
82 static int audit_backlog_wait_time = 60 * HZ; 82 static int audit_backlog_wait_time = 60 * HZ;
83 static int audit_backlog_wait_overflow = 0; 83 static int audit_backlog_wait_overflow = 0;
84 84
85 /* The identity of the user shutting down the audit system. */ 85 /* The identity of the user shutting down the audit system. */
86 uid_t audit_sig_uid = -1; 86 uid_t audit_sig_uid = -1;
87 pid_t audit_sig_pid = -1; 87 pid_t audit_sig_pid = -1;
88 88
89 /* Records can be lost in several ways: 89 /* Records can be lost in several ways:
90 0) [suppressed in audit_alloc] 90 0) [suppressed in audit_alloc]
91 1) out of memory in audit_log_start [kmalloc of struct audit_buffer] 91 1) out of memory in audit_log_start [kmalloc of struct audit_buffer]
92 2) out of memory in audit_log_move [alloc_skb] 92 2) out of memory in audit_log_move [alloc_skb]
93 3) suppressed due to audit_rate_limit 93 3) suppressed due to audit_rate_limit
94 4) suppressed due to audit_backlog_limit 94 4) suppressed due to audit_backlog_limit
95 */ 95 */
96 static atomic_t audit_lost = ATOMIC_INIT(0); 96 static atomic_t audit_lost = ATOMIC_INIT(0);
97 97
98 /* The netlink socket. */ 98 /* The netlink socket. */
99 static struct sock *audit_sock; 99 static struct sock *audit_sock;
100 100
101 /* The audit_freelist is a list of pre-allocated audit buffers (if more 101 /* The audit_freelist is a list of pre-allocated audit buffers (if more
102 * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of 102 * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of
103 * being placed on the freelist). */ 103 * being placed on the freelist). */
104 static DEFINE_SPINLOCK(audit_freelist_lock); 104 static DEFINE_SPINLOCK(audit_freelist_lock);
105 static int audit_freelist_count = 0; 105 static int audit_freelist_count = 0;
106 static LIST_HEAD(audit_freelist); 106 static LIST_HEAD(audit_freelist);
107 107
108 static struct sk_buff_head audit_skb_queue; 108 static struct sk_buff_head audit_skb_queue;
109 static struct task_struct *kauditd_task; 109 static struct task_struct *kauditd_task;
110 static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait); 110 static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait);
111 static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait); 111 static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait);
112 112
113 /* The netlink socket is only to be read by 1 CPU, which lets us assume 113 /* The netlink socket is only to be read by 1 CPU, which lets us assume
114 * that list additions and deletions never happen simultaneously in 114 * that list additions and deletions never happen simultaneously in
115 * auditsc.c */ 115 * auditsc.c */
116 DECLARE_MUTEX(audit_netlink_sem); 116 DECLARE_MUTEX(audit_netlink_sem);
117 117
118 /* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting 118 /* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting
119 * audit records. Since printk uses a 1024 byte buffer, this buffer 119 * audit records. Since printk uses a 1024 byte buffer, this buffer
120 * should be at least that large. */ 120 * should be at least that large. */
121 #define AUDIT_BUFSIZ 1024 121 #define AUDIT_BUFSIZ 1024
122 122
123 /* AUDIT_MAXFREE is the number of empty audit_buffers we keep on the 123 /* AUDIT_MAXFREE is the number of empty audit_buffers we keep on the
124 * audit_freelist. Doing so eliminates many kmalloc/kfree calls. */ 124 * audit_freelist. Doing so eliminates many kmalloc/kfree calls. */
125 #define AUDIT_MAXFREE (2*NR_CPUS) 125 #define AUDIT_MAXFREE (2*NR_CPUS)
126 126
127 /* The audit_buffer is used when formatting an audit record. The caller 127 /* The audit_buffer is used when formatting an audit record. The caller
128 * locks briefly to get the record off the freelist or to allocate the 128 * locks briefly to get the record off the freelist or to allocate the
129 * buffer, and locks briefly to send the buffer to the netlink layer or 129 * buffer, and locks briefly to send the buffer to the netlink layer or
130 * to place it on a transmit queue. Multiple audit_buffers can be in 130 * to place it on a transmit queue. Multiple audit_buffers can be in
131 * use simultaneously. */ 131 * use simultaneously. */
132 struct audit_buffer { 132 struct audit_buffer {
133 struct list_head list; 133 struct list_head list;
134 struct sk_buff *skb; /* formatted skb ready to send */ 134 struct sk_buff *skb; /* formatted skb ready to send */
135 struct audit_context *ctx; /* NULL or associated context */ 135 struct audit_context *ctx; /* NULL or associated context */
136 gfp_t gfp_mask; 136 gfp_t gfp_mask;
137 }; 137 };
138 138
139 static void audit_set_pid(struct audit_buffer *ab, pid_t pid) 139 static void audit_set_pid(struct audit_buffer *ab, pid_t pid)
140 { 140 {
141 struct nlmsghdr *nlh = (struct nlmsghdr *)ab->skb->data; 141 struct nlmsghdr *nlh = (struct nlmsghdr *)ab->skb->data;
142 nlh->nlmsg_pid = pid; 142 nlh->nlmsg_pid = pid;
143 } 143 }
144 144
145 static void audit_panic(const char *message) 145 static void audit_panic(const char *message)
146 { 146 {
147 switch (audit_failure) 147 switch (audit_failure)
148 { 148 {
149 case AUDIT_FAIL_SILENT: 149 case AUDIT_FAIL_SILENT:
150 break; 150 break;
151 case AUDIT_FAIL_PRINTK: 151 case AUDIT_FAIL_PRINTK:
152 printk(KERN_ERR "audit: %s\n", message); 152 printk(KERN_ERR "audit: %s\n", message);
153 break; 153 break;
154 case AUDIT_FAIL_PANIC: 154 case AUDIT_FAIL_PANIC:
155 panic("audit: %s\n", message); 155 panic("audit: %s\n", message);
156 break; 156 break;
157 } 157 }
158 } 158 }
159 159
160 static inline int audit_rate_check(void) 160 static inline int audit_rate_check(void)
161 { 161 {
162 static unsigned long last_check = 0; 162 static unsigned long last_check = 0;
163 static int messages = 0; 163 static int messages = 0;
164 static DEFINE_SPINLOCK(lock); 164 static DEFINE_SPINLOCK(lock);
165 unsigned long flags; 165 unsigned long flags;
166 unsigned long now; 166 unsigned long now;
167 unsigned long elapsed; 167 unsigned long elapsed;
168 int retval = 0; 168 int retval = 0;
169 169
170 if (!audit_rate_limit) return 1; 170 if (!audit_rate_limit) return 1;
171 171
172 spin_lock_irqsave(&lock, flags); 172 spin_lock_irqsave(&lock, flags);
173 if (++messages < audit_rate_limit) { 173 if (++messages < audit_rate_limit) {
174 retval = 1; 174 retval = 1;
175 } else { 175 } else {
176 now = jiffies; 176 now = jiffies;
177 elapsed = now - last_check; 177 elapsed = now - last_check;
178 if (elapsed > HZ) { 178 if (elapsed > HZ) {
179 last_check = now; 179 last_check = now;
180 messages = 0; 180 messages = 0;
181 retval = 1; 181 retval = 1;
182 } 182 }
183 } 183 }
184 spin_unlock_irqrestore(&lock, flags); 184 spin_unlock_irqrestore(&lock, flags);
185 185
186 return retval; 186 return retval;
187 } 187 }
188 188
189 /* Emit at least 1 message per second, even if audit_rate_check is 189 /* Emit at least 1 message per second, even if audit_rate_check is
190 * throttling. */ 190 * throttling. */
191 void audit_log_lost(const char *message) 191 void audit_log_lost(const char *message)
192 { 192 {
193 static unsigned long last_msg = 0; 193 static unsigned long last_msg = 0;
194 static DEFINE_SPINLOCK(lock); 194 static DEFINE_SPINLOCK(lock);
195 unsigned long flags; 195 unsigned long flags;
196 unsigned long now; 196 unsigned long now;
197 int print; 197 int print;
198 198
199 atomic_inc(&audit_lost); 199 atomic_inc(&audit_lost);
200 200
201 print = (audit_failure == AUDIT_FAIL_PANIC || !audit_rate_limit); 201 print = (audit_failure == AUDIT_FAIL_PANIC || !audit_rate_limit);
202 202
203 if (!print) { 203 if (!print) {
204 spin_lock_irqsave(&lock, flags); 204 spin_lock_irqsave(&lock, flags);
205 now = jiffies; 205 now = jiffies;
206 if (now - last_msg > HZ) { 206 if (now - last_msg > HZ) {
207 print = 1; 207 print = 1;
208 last_msg = now; 208 last_msg = now;
209 } 209 }
210 spin_unlock_irqrestore(&lock, flags); 210 spin_unlock_irqrestore(&lock, flags);
211 } 211 }
212 212
213 if (print) { 213 if (print) {
214 printk(KERN_WARNING 214 printk(KERN_WARNING
215 "audit: audit_lost=%d audit_rate_limit=%d audit_backlog_limit=%d\n", 215 "audit: audit_lost=%d audit_rate_limit=%d audit_backlog_limit=%d\n",
216 atomic_read(&audit_lost), 216 atomic_read(&audit_lost),
217 audit_rate_limit, 217 audit_rate_limit,
218 audit_backlog_limit); 218 audit_backlog_limit);
219 audit_panic(message); 219 audit_panic(message);
220 } 220 }
221 221
222 } 222 }
223 223
224 static int audit_set_rate_limit(int limit, uid_t loginuid) 224 static int audit_set_rate_limit(int limit, uid_t loginuid)
225 { 225 {
226 int old = audit_rate_limit; 226 int old = audit_rate_limit;
227 audit_rate_limit = limit; 227 audit_rate_limit = limit;
228 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, 228 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
229 "audit_rate_limit=%d old=%d by auid=%u", 229 "audit_rate_limit=%d old=%d by auid=%u",
230 audit_rate_limit, old, loginuid); 230 audit_rate_limit, old, loginuid);
231 return old; 231 return old;
232 } 232 }
233 233
234 static int audit_set_backlog_limit(int limit, uid_t loginuid) 234 static int audit_set_backlog_limit(int limit, uid_t loginuid)
235 { 235 {
236 int old = audit_backlog_limit; 236 int old = audit_backlog_limit;
237 audit_backlog_limit = limit; 237 audit_backlog_limit = limit;
238 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, 238 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
239 "audit_backlog_limit=%d old=%d by auid=%u", 239 "audit_backlog_limit=%d old=%d by auid=%u",
240 audit_backlog_limit, old, loginuid); 240 audit_backlog_limit, old, loginuid);
241 return old; 241 return old;
242 } 242 }
243 243
244 static int audit_set_enabled(int state, uid_t loginuid) 244 static int audit_set_enabled(int state, uid_t loginuid)
245 { 245 {
246 int old = audit_enabled; 246 int old = audit_enabled;
247 if (state != 0 && state != 1) 247 if (state != 0 && state != 1)
248 return -EINVAL; 248 return -EINVAL;
249 audit_enabled = state; 249 audit_enabled = state;
250 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, 250 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
251 "audit_enabled=%d old=%d by auid=%u", 251 "audit_enabled=%d old=%d by auid=%u",
252 audit_enabled, old, loginuid); 252 audit_enabled, old, loginuid);
253 return old; 253 return old;
254 } 254 }
255 255
256 static int audit_set_failure(int state, uid_t loginuid) 256 static int audit_set_failure(int state, uid_t loginuid)
257 { 257 {
258 int old = audit_failure; 258 int old = audit_failure;
259 if (state != AUDIT_FAIL_SILENT 259 if (state != AUDIT_FAIL_SILENT
260 && state != AUDIT_FAIL_PRINTK 260 && state != AUDIT_FAIL_PRINTK
261 && state != AUDIT_FAIL_PANIC) 261 && state != AUDIT_FAIL_PANIC)
262 return -EINVAL; 262 return -EINVAL;
263 audit_failure = state; 263 audit_failure = state;
264 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, 264 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
265 "audit_failure=%d old=%d by auid=%u", 265 "audit_failure=%d old=%d by auid=%u",
266 audit_failure, old, loginuid); 266 audit_failure, old, loginuid);
267 return old; 267 return old;
268 } 268 }
269 269
270 int kauditd_thread(void *dummy) 270 static int kauditd_thread(void *dummy)
271 { 271 {
272 struct sk_buff *skb; 272 struct sk_buff *skb;
273 273
274 while (1) { 274 while (1) {
275 skb = skb_dequeue(&audit_skb_queue); 275 skb = skb_dequeue(&audit_skb_queue);
276 wake_up(&audit_backlog_wait); 276 wake_up(&audit_backlog_wait);
277 if (skb) { 277 if (skb) {
278 if (audit_pid) { 278 if (audit_pid) {
279 int err = netlink_unicast(audit_sock, skb, audit_pid, 0); 279 int err = netlink_unicast(audit_sock, skb, audit_pid, 0);
280 if (err < 0) { 280 if (err < 0) {
281 BUG_ON(err != -ECONNREFUSED); /* Shoudn't happen */ 281 BUG_ON(err != -ECONNREFUSED); /* Shoudn't happen */
282 printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); 282 printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid);
283 audit_pid = 0; 283 audit_pid = 0;
284 } 284 }
285 } else { 285 } else {
286 printk(KERN_NOTICE "%s\n", skb->data + NLMSG_SPACE(0)); 286 printk(KERN_NOTICE "%s\n", skb->data + NLMSG_SPACE(0));
287 kfree_skb(skb); 287 kfree_skb(skb);
288 } 288 }
289 } else { 289 } else {
290 DECLARE_WAITQUEUE(wait, current); 290 DECLARE_WAITQUEUE(wait, current);
291 set_current_state(TASK_INTERRUPTIBLE); 291 set_current_state(TASK_INTERRUPTIBLE);
292 add_wait_queue(&kauditd_wait, &wait); 292 add_wait_queue(&kauditd_wait, &wait);
293 293
294 if (!skb_queue_len(&audit_skb_queue)) { 294 if (!skb_queue_len(&audit_skb_queue)) {
295 try_to_freeze(); 295 try_to_freeze();
296 schedule(); 296 schedule();
297 } 297 }
298 298
299 __set_current_state(TASK_RUNNING); 299 __set_current_state(TASK_RUNNING);
300 remove_wait_queue(&kauditd_wait, &wait); 300 remove_wait_queue(&kauditd_wait, &wait);
301 } 301 }
302 } 302 }
303 } 303 }
304 304
305 void audit_send_reply(int pid, int seq, int type, int done, int multi, 305 void audit_send_reply(int pid, int seq, int type, int done, int multi,
306 void *payload, int size) 306 void *payload, int size)
307 { 307 {
308 struct sk_buff *skb; 308 struct sk_buff *skb;
309 struct nlmsghdr *nlh; 309 struct nlmsghdr *nlh;
310 int len = NLMSG_SPACE(size); 310 int len = NLMSG_SPACE(size);
311 void *data; 311 void *data;
312 int flags = multi ? NLM_F_MULTI : 0; 312 int flags = multi ? NLM_F_MULTI : 0;
313 int t = done ? NLMSG_DONE : type; 313 int t = done ? NLMSG_DONE : type;
314 314
315 skb = alloc_skb(len, GFP_KERNEL); 315 skb = alloc_skb(len, GFP_KERNEL);
316 if (!skb) 316 if (!skb)
317 return; 317 return;
318 318
319 nlh = NLMSG_PUT(skb, pid, seq, t, size); 319 nlh = NLMSG_PUT(skb, pid, seq, t, size);
320 nlh->nlmsg_flags = flags; 320 nlh->nlmsg_flags = flags;
321 data = NLMSG_DATA(nlh); 321 data = NLMSG_DATA(nlh);
322 memcpy(data, payload, size); 322 memcpy(data, payload, size);
323 323
324 /* Ignore failure. It'll only happen if the sender goes away, 324 /* Ignore failure. It'll only happen if the sender goes away,
325 because our timeout is set to infinite. */ 325 because our timeout is set to infinite. */
326 netlink_unicast(audit_sock, skb, pid, 0); 326 netlink_unicast(audit_sock, skb, pid, 0);
327 return; 327 return;
328 328
329 nlmsg_failure: /* Used by NLMSG_PUT */ 329 nlmsg_failure: /* Used by NLMSG_PUT */
330 if (skb) 330 if (skb)
331 kfree_skb(skb); 331 kfree_skb(skb);
332 } 332 }
333 333
334 /* 334 /*
335 * Check for appropriate CAP_AUDIT_ capabilities on incoming audit 335 * Check for appropriate CAP_AUDIT_ capabilities on incoming audit
336 * control messages. 336 * control messages.
337 */ 337 */
338 static int audit_netlink_ok(kernel_cap_t eff_cap, u16 msg_type) 338 static int audit_netlink_ok(kernel_cap_t eff_cap, u16 msg_type)
339 { 339 {
340 int err = 0; 340 int err = 0;
341 341
342 switch (msg_type) { 342 switch (msg_type) {
343 case AUDIT_GET: 343 case AUDIT_GET:
344 case AUDIT_LIST: 344 case AUDIT_LIST:
345 case AUDIT_SET: 345 case AUDIT_SET:
346 case AUDIT_ADD: 346 case AUDIT_ADD:
347 case AUDIT_DEL: 347 case AUDIT_DEL:
348 case AUDIT_SIGNAL_INFO: 348 case AUDIT_SIGNAL_INFO:
349 if (!cap_raised(eff_cap, CAP_AUDIT_CONTROL)) 349 if (!cap_raised(eff_cap, CAP_AUDIT_CONTROL))
350 err = -EPERM; 350 err = -EPERM;
351 break; 351 break;
352 case AUDIT_USER: 352 case AUDIT_USER:
353 case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG: 353 case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG:
354 if (!cap_raised(eff_cap, CAP_AUDIT_WRITE)) 354 if (!cap_raised(eff_cap, CAP_AUDIT_WRITE))
355 err = -EPERM; 355 err = -EPERM;
356 break; 356 break;
357 default: /* bad msg */ 357 default: /* bad msg */
358 err = -EINVAL; 358 err = -EINVAL;
359 } 359 }
360 360
361 return err; 361 return err;
362 } 362 }
363 363
364 static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) 364 static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
365 { 365 {
366 u32 uid, pid, seq; 366 u32 uid, pid, seq;
367 void *data; 367 void *data;
368 struct audit_status *status_get, status_set; 368 struct audit_status *status_get, status_set;
369 int err; 369 int err;
370 struct audit_buffer *ab; 370 struct audit_buffer *ab;
371 u16 msg_type = nlh->nlmsg_type; 371 u16 msg_type = nlh->nlmsg_type;
372 uid_t loginuid; /* loginuid of sender */ 372 uid_t loginuid; /* loginuid of sender */
373 struct audit_sig_info sig_data; 373 struct audit_sig_info sig_data;
374 374
375 err = audit_netlink_ok(NETLINK_CB(skb).eff_cap, msg_type); 375 err = audit_netlink_ok(NETLINK_CB(skb).eff_cap, msg_type);
376 if (err) 376 if (err)
377 return err; 377 return err;
378 378
379 /* As soon as there's any sign of userspace auditd, start kauditd to talk to it */ 379 /* As soon as there's any sign of userspace auditd, start kauditd to talk to it */
380 if (!kauditd_task) 380 if (!kauditd_task)
381 kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd"); 381 kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd");
382 if (IS_ERR(kauditd_task)) { 382 if (IS_ERR(kauditd_task)) {
383 err = PTR_ERR(kauditd_task); 383 err = PTR_ERR(kauditd_task);
384 kauditd_task = NULL; 384 kauditd_task = NULL;
385 return err; 385 return err;
386 } 386 }
387 387
388 pid = NETLINK_CREDS(skb)->pid; 388 pid = NETLINK_CREDS(skb)->pid;
389 uid = NETLINK_CREDS(skb)->uid; 389 uid = NETLINK_CREDS(skb)->uid;
390 loginuid = NETLINK_CB(skb).loginuid; 390 loginuid = NETLINK_CB(skb).loginuid;
391 seq = nlh->nlmsg_seq; 391 seq = nlh->nlmsg_seq;
392 data = NLMSG_DATA(nlh); 392 data = NLMSG_DATA(nlh);
393 393
394 switch (msg_type) { 394 switch (msg_type) {
395 case AUDIT_GET: 395 case AUDIT_GET:
396 status_set.enabled = audit_enabled; 396 status_set.enabled = audit_enabled;
397 status_set.failure = audit_failure; 397 status_set.failure = audit_failure;
398 status_set.pid = audit_pid; 398 status_set.pid = audit_pid;
399 status_set.rate_limit = audit_rate_limit; 399 status_set.rate_limit = audit_rate_limit;
400 status_set.backlog_limit = audit_backlog_limit; 400 status_set.backlog_limit = audit_backlog_limit;
401 status_set.lost = atomic_read(&audit_lost); 401 status_set.lost = atomic_read(&audit_lost);
402 status_set.backlog = skb_queue_len(&audit_skb_queue); 402 status_set.backlog = skb_queue_len(&audit_skb_queue);
403 audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_GET, 0, 0, 403 audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_GET, 0, 0,
404 &status_set, sizeof(status_set)); 404 &status_set, sizeof(status_set));
405 break; 405 break;
406 case AUDIT_SET: 406 case AUDIT_SET:
407 if (nlh->nlmsg_len < sizeof(struct audit_status)) 407 if (nlh->nlmsg_len < sizeof(struct audit_status))
408 return -EINVAL; 408 return -EINVAL;
409 status_get = (struct audit_status *)data; 409 status_get = (struct audit_status *)data;
410 if (status_get->mask & AUDIT_STATUS_ENABLED) { 410 if (status_get->mask & AUDIT_STATUS_ENABLED) {
411 err = audit_set_enabled(status_get->enabled, loginuid); 411 err = audit_set_enabled(status_get->enabled, loginuid);
412 if (err < 0) return err; 412 if (err < 0) return err;
413 } 413 }
414 if (status_get->mask & AUDIT_STATUS_FAILURE) { 414 if (status_get->mask & AUDIT_STATUS_FAILURE) {
415 err = audit_set_failure(status_get->failure, loginuid); 415 err = audit_set_failure(status_get->failure, loginuid);
416 if (err < 0) return err; 416 if (err < 0) return err;
417 } 417 }
418 if (status_get->mask & AUDIT_STATUS_PID) { 418 if (status_get->mask & AUDIT_STATUS_PID) {
419 int old = audit_pid; 419 int old = audit_pid;
420 audit_pid = status_get->pid; 420 audit_pid = status_get->pid;
421 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, 421 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
422 "audit_pid=%d old=%d by auid=%u", 422 "audit_pid=%d old=%d by auid=%u",
423 audit_pid, old, loginuid); 423 audit_pid, old, loginuid);
424 } 424 }
425 if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) 425 if (status_get->mask & AUDIT_STATUS_RATE_LIMIT)
426 audit_set_rate_limit(status_get->rate_limit, loginuid); 426 audit_set_rate_limit(status_get->rate_limit, loginuid);
427 if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT) 427 if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT)
428 audit_set_backlog_limit(status_get->backlog_limit, 428 audit_set_backlog_limit(status_get->backlog_limit,
429 loginuid); 429 loginuid);
430 break; 430 break;
431 case AUDIT_USER: 431 case AUDIT_USER:
432 case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG: 432 case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG:
433 if (!audit_enabled && msg_type != AUDIT_USER_AVC) 433 if (!audit_enabled && msg_type != AUDIT_USER_AVC)
434 return 0; 434 return 0;
435 435
436 err = audit_filter_user(&NETLINK_CB(skb), msg_type); 436 err = audit_filter_user(&NETLINK_CB(skb), msg_type);
437 if (err == 1) { 437 if (err == 1) {
438 err = 0; 438 err = 0;
439 ab = audit_log_start(NULL, GFP_KERNEL, msg_type); 439 ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
440 if (ab) { 440 if (ab) {
441 audit_log_format(ab, 441 audit_log_format(ab,
442 "user pid=%d uid=%u auid=%u msg='%.1024s'", 442 "user pid=%d uid=%u auid=%u msg='%.1024s'",
443 pid, uid, loginuid, (char *)data); 443 pid, uid, loginuid, (char *)data);
444 audit_set_pid(ab, pid); 444 audit_set_pid(ab, pid);
445 audit_log_end(ab); 445 audit_log_end(ab);
446 } 446 }
447 } 447 }
448 break; 448 break;
449 case AUDIT_ADD: 449 case AUDIT_ADD:
450 case AUDIT_DEL: 450 case AUDIT_DEL:
451 if (nlh->nlmsg_len < sizeof(struct audit_rule)) 451 if (nlh->nlmsg_len < sizeof(struct audit_rule))
452 return -EINVAL; 452 return -EINVAL;
453 /* fallthrough */ 453 /* fallthrough */
454 case AUDIT_LIST: 454 case AUDIT_LIST:
455 err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid, 455 err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid,
456 uid, seq, data, loginuid); 456 uid, seq, data, loginuid);
457 break; 457 break;
458 case AUDIT_SIGNAL_INFO: 458 case AUDIT_SIGNAL_INFO:
459 sig_data.uid = audit_sig_uid; 459 sig_data.uid = audit_sig_uid;
460 sig_data.pid = audit_sig_pid; 460 sig_data.pid = audit_sig_pid;
461 audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO, 461 audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO,
462 0, 0, &sig_data, sizeof(sig_data)); 462 0, 0, &sig_data, sizeof(sig_data));
463 break; 463 break;
464 default: 464 default:
465 err = -EINVAL; 465 err = -EINVAL;
466 break; 466 break;
467 } 467 }
468 468
469 return err < 0 ? err : 0; 469 return err < 0 ? err : 0;
470 } 470 }
471 471
472 /* Get message from skb (based on rtnetlink_rcv_skb). Each message is 472 /* Get message from skb (based on rtnetlink_rcv_skb). Each message is
473 * processed by audit_receive_msg. Malformed skbs with wrong length are 473 * processed by audit_receive_msg. Malformed skbs with wrong length are
474 * discarded silently. */ 474 * discarded silently. */
475 static void audit_receive_skb(struct sk_buff *skb) 475 static void audit_receive_skb(struct sk_buff *skb)
476 { 476 {
477 int err; 477 int err;
478 struct nlmsghdr *nlh; 478 struct nlmsghdr *nlh;
479 u32 rlen; 479 u32 rlen;
480 480
481 while (skb->len >= NLMSG_SPACE(0)) { 481 while (skb->len >= NLMSG_SPACE(0)) {
482 nlh = (struct nlmsghdr *)skb->data; 482 nlh = (struct nlmsghdr *)skb->data;
483 if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) 483 if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len)
484 return; 484 return;
485 rlen = NLMSG_ALIGN(nlh->nlmsg_len); 485 rlen = NLMSG_ALIGN(nlh->nlmsg_len);
486 if (rlen > skb->len) 486 if (rlen > skb->len)
487 rlen = skb->len; 487 rlen = skb->len;
488 if ((err = audit_receive_msg(skb, nlh))) { 488 if ((err = audit_receive_msg(skb, nlh))) {
489 netlink_ack(skb, nlh, err); 489 netlink_ack(skb, nlh, err);
490 } else if (nlh->nlmsg_flags & NLM_F_ACK) 490 } else if (nlh->nlmsg_flags & NLM_F_ACK)
491 netlink_ack(skb, nlh, 0); 491 netlink_ack(skb, nlh, 0);
492 skb_pull(skb, rlen); 492 skb_pull(skb, rlen);
493 } 493 }
494 } 494 }
495 495
496 /* Receive messages from netlink socket. */ 496 /* Receive messages from netlink socket. */
497 static void audit_receive(struct sock *sk, int length) 497 static void audit_receive(struct sock *sk, int length)
498 { 498 {
499 struct sk_buff *skb; 499 struct sk_buff *skb;
500 unsigned int qlen; 500 unsigned int qlen;
501 501
502 down(&audit_netlink_sem); 502 down(&audit_netlink_sem);
503 503
504 for (qlen = skb_queue_len(&sk->sk_receive_queue); qlen; qlen--) { 504 for (qlen = skb_queue_len(&sk->sk_receive_queue); qlen; qlen--) {
505 skb = skb_dequeue(&sk->sk_receive_queue); 505 skb = skb_dequeue(&sk->sk_receive_queue);
506 audit_receive_skb(skb); 506 audit_receive_skb(skb);
507 kfree_skb(skb); 507 kfree_skb(skb);
508 } 508 }
509 up(&audit_netlink_sem); 509 up(&audit_netlink_sem);
510 } 510 }
511 511
512 512
513 /* Initialize audit support at boot time. */ 513 /* Initialize audit support at boot time. */
514 static int __init audit_init(void) 514 static int __init audit_init(void)
515 { 515 {
516 printk(KERN_INFO "audit: initializing netlink socket (%s)\n", 516 printk(KERN_INFO "audit: initializing netlink socket (%s)\n",
517 audit_default ? "enabled" : "disabled"); 517 audit_default ? "enabled" : "disabled");
518 audit_sock = netlink_kernel_create(NETLINK_AUDIT, 0, audit_receive, 518 audit_sock = netlink_kernel_create(NETLINK_AUDIT, 0, audit_receive,
519 THIS_MODULE); 519 THIS_MODULE);
520 if (!audit_sock) 520 if (!audit_sock)
521 audit_panic("cannot initialize netlink socket"); 521 audit_panic("cannot initialize netlink socket");
522 522
523 audit_sock->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; 523 audit_sock->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
524 skb_queue_head_init(&audit_skb_queue); 524 skb_queue_head_init(&audit_skb_queue);
525 audit_initialized = 1; 525 audit_initialized = 1;
526 audit_enabled = audit_default; 526 audit_enabled = audit_default;
527 audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized"); 527 audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized");
528 return 0; 528 return 0;
529 } 529 }
530 __initcall(audit_init); 530 __initcall(audit_init);
531 531
532 /* Process kernel command-line parameter at boot time. audit=0 or audit=1. */ 532 /* Process kernel command-line parameter at boot time. audit=0 or audit=1. */
533 static int __init audit_enable(char *str) 533 static int __init audit_enable(char *str)
534 { 534 {
535 audit_default = !!simple_strtol(str, NULL, 0); 535 audit_default = !!simple_strtol(str, NULL, 0);
536 printk(KERN_INFO "audit: %s%s\n", 536 printk(KERN_INFO "audit: %s%s\n",
537 audit_default ? "enabled" : "disabled", 537 audit_default ? "enabled" : "disabled",
538 audit_initialized ? "" : " (after initialization)"); 538 audit_initialized ? "" : " (after initialization)");
539 if (audit_initialized) 539 if (audit_initialized)
540 audit_enabled = audit_default; 540 audit_enabled = audit_default;
541 return 0; 541 return 0;
542 } 542 }
543 543
544 __setup("audit=", audit_enable); 544 __setup("audit=", audit_enable);
545 545
546 static void audit_buffer_free(struct audit_buffer *ab) 546 static void audit_buffer_free(struct audit_buffer *ab)
547 { 547 {
548 unsigned long flags; 548 unsigned long flags;
549 549
550 if (!ab) 550 if (!ab)
551 return; 551 return;
552 552
553 if (ab->skb) 553 if (ab->skb)
554 kfree_skb(ab->skb); 554 kfree_skb(ab->skb);
555 555
556 spin_lock_irqsave(&audit_freelist_lock, flags); 556 spin_lock_irqsave(&audit_freelist_lock, flags);
557 if (++audit_freelist_count > AUDIT_MAXFREE) 557 if (++audit_freelist_count > AUDIT_MAXFREE)
558 kfree(ab); 558 kfree(ab);
559 else 559 else
560 list_add(&ab->list, &audit_freelist); 560 list_add(&ab->list, &audit_freelist);
561 spin_unlock_irqrestore(&audit_freelist_lock, flags); 561 spin_unlock_irqrestore(&audit_freelist_lock, flags);
562 } 562 }
563 563
564 static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx, 564 static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx,
565 gfp_t gfp_mask, int type) 565 gfp_t gfp_mask, int type)
566 { 566 {
567 unsigned long flags; 567 unsigned long flags;
568 struct audit_buffer *ab = NULL; 568 struct audit_buffer *ab = NULL;
569 struct nlmsghdr *nlh; 569 struct nlmsghdr *nlh;
570 570
571 spin_lock_irqsave(&audit_freelist_lock, flags); 571 spin_lock_irqsave(&audit_freelist_lock, flags);
572 if (!list_empty(&audit_freelist)) { 572 if (!list_empty(&audit_freelist)) {
573 ab = list_entry(audit_freelist.next, 573 ab = list_entry(audit_freelist.next,
574 struct audit_buffer, list); 574 struct audit_buffer, list);
575 list_del(&ab->list); 575 list_del(&ab->list);
576 --audit_freelist_count; 576 --audit_freelist_count;
577 } 577 }
578 spin_unlock_irqrestore(&audit_freelist_lock, flags); 578 spin_unlock_irqrestore(&audit_freelist_lock, flags);
579 579
580 if (!ab) { 580 if (!ab) {
581 ab = kmalloc(sizeof(*ab), gfp_mask); 581 ab = kmalloc(sizeof(*ab), gfp_mask);
582 if (!ab) 582 if (!ab)
583 goto err; 583 goto err;
584 } 584 }
585 585
586 ab->skb = alloc_skb(AUDIT_BUFSIZ, gfp_mask); 586 ab->skb = alloc_skb(AUDIT_BUFSIZ, gfp_mask);
587 if (!ab->skb) 587 if (!ab->skb)
588 goto err; 588 goto err;
589 589
590 ab->ctx = ctx; 590 ab->ctx = ctx;
591 ab->gfp_mask = gfp_mask; 591 ab->gfp_mask = gfp_mask;
592 nlh = (struct nlmsghdr *)skb_put(ab->skb, NLMSG_SPACE(0)); 592 nlh = (struct nlmsghdr *)skb_put(ab->skb, NLMSG_SPACE(0));
593 nlh->nlmsg_type = type; 593 nlh->nlmsg_type = type;
594 nlh->nlmsg_flags = 0; 594 nlh->nlmsg_flags = 0;
595 nlh->nlmsg_pid = 0; 595 nlh->nlmsg_pid = 0;
596 nlh->nlmsg_seq = 0; 596 nlh->nlmsg_seq = 0;
597 return ab; 597 return ab;
598 err: 598 err:
599 audit_buffer_free(ab); 599 audit_buffer_free(ab);
600 return NULL; 600 return NULL;
601 } 601 }
602 602
603 /* Compute a serial number for the audit record. Audit records are 603 /* Compute a serial number for the audit record. Audit records are
604 * written to user-space as soon as they are generated, so a complete 604 * written to user-space as soon as they are generated, so a complete
605 * audit record may be written in several pieces. The timestamp of the 605 * audit record may be written in several pieces. The timestamp of the
606 * record and this serial number are used by the user-space tools to 606 * record and this serial number are used by the user-space tools to
607 * determine which pieces belong to the same audit record. The 607 * determine which pieces belong to the same audit record. The
608 * (timestamp,serial) tuple is unique for each syscall and is live from 608 * (timestamp,serial) tuple is unique for each syscall and is live from
609 * syscall entry to syscall exit. 609 * syscall entry to syscall exit.
610 * 610 *
611 * NOTE: Another possibility is to store the formatted records off the 611 * NOTE: Another possibility is to store the formatted records off the
612 * audit context (for those records that have a context), and emit them 612 * audit context (for those records that have a context), and emit them
613 * all at syscall exit. However, this could delay the reporting of 613 * all at syscall exit. However, this could delay the reporting of
614 * significant errors until syscall exit (or never, if the system 614 * significant errors until syscall exit (or never, if the system
615 * halts). */ 615 * halts). */
616 616
617 unsigned int audit_serial(void) 617 unsigned int audit_serial(void)
618 { 618 {
619 static spinlock_t serial_lock = SPIN_LOCK_UNLOCKED; 619 static spinlock_t serial_lock = SPIN_LOCK_UNLOCKED;
620 static unsigned int serial = 0; 620 static unsigned int serial = 0;
621 621
622 unsigned long flags; 622 unsigned long flags;
623 unsigned int ret; 623 unsigned int ret;
624 624
625 spin_lock_irqsave(&serial_lock, flags); 625 spin_lock_irqsave(&serial_lock, flags);
626 do { 626 do {
627 ret = ++serial; 627 ret = ++serial;
628 } while (unlikely(!ret)); 628 } while (unlikely(!ret));
629 spin_unlock_irqrestore(&serial_lock, flags); 629 spin_unlock_irqrestore(&serial_lock, flags);
630 630
631 return ret; 631 return ret;
632 } 632 }
633 633
634 static inline void audit_get_stamp(struct audit_context *ctx, 634 static inline void audit_get_stamp(struct audit_context *ctx,
635 struct timespec *t, unsigned int *serial) 635 struct timespec *t, unsigned int *serial)
636 { 636 {
637 if (ctx) 637 if (ctx)
638 auditsc_get_stamp(ctx, t, serial); 638 auditsc_get_stamp(ctx, t, serial);
639 else { 639 else {
640 *t = CURRENT_TIME; 640 *t = CURRENT_TIME;
641 *serial = audit_serial(); 641 *serial = audit_serial();
642 } 642 }
643 } 643 }
644 644
645 /* Obtain an audit buffer. This routine does locking to obtain the 645 /* Obtain an audit buffer. This routine does locking to obtain the
646 * audit buffer, but then no locking is required for calls to 646 * audit buffer, but then no locking is required for calls to
647 * audit_log_*format. If the tsk is a task that is currently in a 647 * audit_log_*format. If the tsk is a task that is currently in a
648 * syscall, then the syscall is marked as auditable and an audit record 648 * syscall, then the syscall is marked as auditable and an audit record
649 * will be written at syscall exit. If there is no associated task, tsk 649 * will be written at syscall exit. If there is no associated task, tsk
650 * should be NULL. */ 650 * should be NULL. */
651 651
652 struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, 652 struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
653 int type) 653 int type)
654 { 654 {
655 struct audit_buffer *ab = NULL; 655 struct audit_buffer *ab = NULL;
656 struct timespec t; 656 struct timespec t;
657 unsigned int serial; 657 unsigned int serial;
658 int reserve; 658 int reserve;
659 unsigned long timeout_start = jiffies; 659 unsigned long timeout_start = jiffies;
660 660
661 if (!audit_initialized) 661 if (!audit_initialized)
662 return NULL; 662 return NULL;
663 663
664 if (gfp_mask & __GFP_WAIT) 664 if (gfp_mask & __GFP_WAIT)
665 reserve = 0; 665 reserve = 0;
666 else 666 else
667 reserve = 5; /* Allow atomic callers to go up to five 667 reserve = 5; /* Allow atomic callers to go up to five
668 entries over the normal backlog limit */ 668 entries over the normal backlog limit */
669 669
670 while (audit_backlog_limit 670 while (audit_backlog_limit
671 && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) { 671 && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) {
672 if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time 672 if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time
673 && time_before(jiffies, timeout_start + audit_backlog_wait_time)) { 673 && time_before(jiffies, timeout_start + audit_backlog_wait_time)) {
674 674
675 /* Wait for auditd to drain the queue a little */ 675 /* Wait for auditd to drain the queue a little */
676 DECLARE_WAITQUEUE(wait, current); 676 DECLARE_WAITQUEUE(wait, current);
677 set_current_state(TASK_INTERRUPTIBLE); 677 set_current_state(TASK_INTERRUPTIBLE);
678 add_wait_queue(&audit_backlog_wait, &wait); 678 add_wait_queue(&audit_backlog_wait, &wait);
679 679
680 if (audit_backlog_limit && 680 if (audit_backlog_limit &&
681 skb_queue_len(&audit_skb_queue) > audit_backlog_limit) 681 skb_queue_len(&audit_skb_queue) > audit_backlog_limit)
682 schedule_timeout(timeout_start + audit_backlog_wait_time - jiffies); 682 schedule_timeout(timeout_start + audit_backlog_wait_time - jiffies);
683 683
684 __set_current_state(TASK_RUNNING); 684 __set_current_state(TASK_RUNNING);
685 remove_wait_queue(&audit_backlog_wait, &wait); 685 remove_wait_queue(&audit_backlog_wait, &wait);
686 continue; 686 continue;
687 } 687 }
688 if (audit_rate_check()) 688 if (audit_rate_check())
689 printk(KERN_WARNING 689 printk(KERN_WARNING
690 "audit: audit_backlog=%d > " 690 "audit: audit_backlog=%d > "
691 "audit_backlog_limit=%d\n", 691 "audit_backlog_limit=%d\n",
692 skb_queue_len(&audit_skb_queue), 692 skb_queue_len(&audit_skb_queue),
693 audit_backlog_limit); 693 audit_backlog_limit);
694 audit_log_lost("backlog limit exceeded"); 694 audit_log_lost("backlog limit exceeded");
695 audit_backlog_wait_time = audit_backlog_wait_overflow; 695 audit_backlog_wait_time = audit_backlog_wait_overflow;
696 wake_up(&audit_backlog_wait); 696 wake_up(&audit_backlog_wait);
697 return NULL; 697 return NULL;
698 } 698 }
699 699
700 ab = audit_buffer_alloc(ctx, gfp_mask, type); 700 ab = audit_buffer_alloc(ctx, gfp_mask, type);
701 if (!ab) { 701 if (!ab) {
702 audit_log_lost("out of memory in audit_log_start"); 702 audit_log_lost("out of memory in audit_log_start");
703 return NULL; 703 return NULL;
704 } 704 }
705 705
706 audit_get_stamp(ab->ctx, &t, &serial); 706 audit_get_stamp(ab->ctx, &t, &serial);
707 707
708 audit_log_format(ab, "audit(%lu.%03lu:%u): ", 708 audit_log_format(ab, "audit(%lu.%03lu:%u): ",
709 t.tv_sec, t.tv_nsec/1000000, serial); 709 t.tv_sec, t.tv_nsec/1000000, serial);
710 return ab; 710 return ab;
711 } 711 }
712 712
713 /** 713 /**
714 * audit_expand - expand skb in the audit buffer 714 * audit_expand - expand skb in the audit buffer
715 * @ab: audit_buffer 715 * @ab: audit_buffer
716 * 716 *
717 * Returns 0 (no space) on failed expansion, or available space if 717 * Returns 0 (no space) on failed expansion, or available space if
718 * successful. 718 * successful.
719 */ 719 */
720 static inline int audit_expand(struct audit_buffer *ab, int extra) 720 static inline int audit_expand(struct audit_buffer *ab, int extra)
721 { 721 {
722 struct sk_buff *skb = ab->skb; 722 struct sk_buff *skb = ab->skb;
723 int ret = pskb_expand_head(skb, skb_headroom(skb), extra, 723 int ret = pskb_expand_head(skb, skb_headroom(skb), extra,
724 ab->gfp_mask); 724 ab->gfp_mask);
725 if (ret < 0) { 725 if (ret < 0) {
726 audit_log_lost("out of memory in audit_expand"); 726 audit_log_lost("out of memory in audit_expand");
727 return 0; 727 return 0;
728 } 728 }
729 return skb_tailroom(skb); 729 return skb_tailroom(skb);
730 } 730 }
731 731
732 /* Format an audit message into the audit buffer. If there isn't enough 732 /* Format an audit message into the audit buffer. If there isn't enough
733 * room in the audit buffer, more room will be allocated and vsnprint 733 * room in the audit buffer, more room will be allocated and vsnprint
734 * will be called a second time. Currently, we assume that a printk 734 * will be called a second time. Currently, we assume that a printk
735 * can't format message larger than 1024 bytes, so we don't either. */ 735 * can't format message larger than 1024 bytes, so we don't either. */
736 static void audit_log_vformat(struct audit_buffer *ab, const char *fmt, 736 static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
737 va_list args) 737 va_list args)
738 { 738 {
739 int len, avail; 739 int len, avail;
740 struct sk_buff *skb; 740 struct sk_buff *skb;
741 va_list args2; 741 va_list args2;
742 742
743 if (!ab) 743 if (!ab)
744 return; 744 return;
745 745
746 BUG_ON(!ab->skb); 746 BUG_ON(!ab->skb);
747 skb = ab->skb; 747 skb = ab->skb;
748 avail = skb_tailroom(skb); 748 avail = skb_tailroom(skb);
749 if (avail == 0) { 749 if (avail == 0) {
750 avail = audit_expand(ab, AUDIT_BUFSIZ); 750 avail = audit_expand(ab, AUDIT_BUFSIZ);
751 if (!avail) 751 if (!avail)
752 goto out; 752 goto out;
753 } 753 }
754 va_copy(args2, args); 754 va_copy(args2, args);
755 len = vsnprintf(skb->tail, avail, fmt, args); 755 len = vsnprintf(skb->tail, avail, fmt, args);
756 if (len >= avail) { 756 if (len >= avail) {
757 /* The printk buffer is 1024 bytes long, so if we get 757 /* The printk buffer is 1024 bytes long, so if we get
758 * here and AUDIT_BUFSIZ is at least 1024, then we can 758 * here and AUDIT_BUFSIZ is at least 1024, then we can
759 * log everything that printk could have logged. */ 759 * log everything that printk could have logged. */
760 avail = audit_expand(ab, max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail)); 760 avail = audit_expand(ab, max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail));
761 if (!avail) 761 if (!avail)
762 goto out; 762 goto out;
763 len = vsnprintf(skb->tail, avail, fmt, args2); 763 len = vsnprintf(skb->tail, avail, fmt, args2);
764 } 764 }
765 if (len > 0) 765 if (len > 0)
766 skb_put(skb, len); 766 skb_put(skb, len);
767 out: 767 out:
768 return; 768 return;
769 } 769 }
770 770
771 /* Format a message into the audit buffer. All the work is done in 771 /* Format a message into the audit buffer. All the work is done in
772 * audit_log_vformat. */ 772 * audit_log_vformat. */
773 void audit_log_format(struct audit_buffer *ab, const char *fmt, ...) 773 void audit_log_format(struct audit_buffer *ab, const char *fmt, ...)
774 { 774 {
775 va_list args; 775 va_list args;
776 776
777 if (!ab) 777 if (!ab)
778 return; 778 return;
779 va_start(args, fmt); 779 va_start(args, fmt);
780 audit_log_vformat(ab, fmt, args); 780 audit_log_vformat(ab, fmt, args);
781 va_end(args); 781 va_end(args);
782 } 782 }
783 783
784 /* This function will take the passed buf and convert it into a string of 784 /* This function will take the passed buf and convert it into a string of
785 * ascii hex digits. The new string is placed onto the skb. */ 785 * ascii hex digits. The new string is placed onto the skb. */
786 void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf, 786 void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf,
787 size_t len) 787 size_t len)
788 { 788 {
789 int i, avail, new_len; 789 int i, avail, new_len;
790 unsigned char *ptr; 790 unsigned char *ptr;
791 struct sk_buff *skb; 791 struct sk_buff *skb;
792 static const unsigned char *hex = "0123456789ABCDEF"; 792 static const unsigned char *hex = "0123456789ABCDEF";
793 793
794 BUG_ON(!ab->skb); 794 BUG_ON(!ab->skb);
795 skb = ab->skb; 795 skb = ab->skb;
796 avail = skb_tailroom(skb); 796 avail = skb_tailroom(skb);
797 new_len = len<<1; 797 new_len = len<<1;
798 if (new_len >= avail) { 798 if (new_len >= avail) {
799 /* Round the buffer request up to the next multiple */ 799 /* Round the buffer request up to the next multiple */
800 new_len = AUDIT_BUFSIZ*(((new_len-avail)/AUDIT_BUFSIZ) + 1); 800 new_len = AUDIT_BUFSIZ*(((new_len-avail)/AUDIT_BUFSIZ) + 1);
801 avail = audit_expand(ab, new_len); 801 avail = audit_expand(ab, new_len);
802 if (!avail) 802 if (!avail)
803 return; 803 return;
804 } 804 }
805 805
806 ptr = skb->tail; 806 ptr = skb->tail;
807 for (i=0; i<len; i++) { 807 for (i=0; i<len; i++) {
808 *ptr++ = hex[(buf[i] & 0xF0)>>4]; /* Upper nibble */ 808 *ptr++ = hex[(buf[i] & 0xF0)>>4]; /* Upper nibble */
809 *ptr++ = hex[buf[i] & 0x0F]; /* Lower nibble */ 809 *ptr++ = hex[buf[i] & 0x0F]; /* Lower nibble */
810 } 810 }
811 *ptr = 0; 811 *ptr = 0;
812 skb_put(skb, len << 1); /* new string is twice the old string */ 812 skb_put(skb, len << 1); /* new string is twice the old string */
813 } 813 }
814 814
815 /* This code will escape a string that is passed to it if the string 815 /* This code will escape a string that is passed to it if the string
816 * contains a control character, unprintable character, double quote mark, 816 * contains a control character, unprintable character, double quote mark,
817 * or a space. Unescaped strings will start and end with a double quote mark. 817 * or a space. Unescaped strings will start and end with a double quote mark.
818 * Strings that are escaped are printed in hex (2 digits per char). */ 818 * Strings that are escaped are printed in hex (2 digits per char). */
819 void audit_log_untrustedstring(struct audit_buffer *ab, const char *string) 819 void audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
820 { 820 {
821 const unsigned char *p = string; 821 const unsigned char *p = string;
822 822
823 while (*p) { 823 while (*p) {
824 if (*p == '"' || *p < 0x21 || *p > 0x7f) { 824 if (*p == '"' || *p < 0x21 || *p > 0x7f) {
825 audit_log_hex(ab, string, strlen(string)); 825 audit_log_hex(ab, string, strlen(string));
826 return; 826 return;
827 } 827 }
828 p++; 828 p++;
829 } 829 }
830 audit_log_format(ab, "\"%s\"", string); 830 audit_log_format(ab, "\"%s\"", string);
831 } 831 }
832 832
833 /* This is a helper-function to print the escaped d_path */ 833 /* This is a helper-function to print the escaped d_path */
834 void audit_log_d_path(struct audit_buffer *ab, const char *prefix, 834 void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
835 struct dentry *dentry, struct vfsmount *vfsmnt) 835 struct dentry *dentry, struct vfsmount *vfsmnt)
836 { 836 {
837 char *p, *path; 837 char *p, *path;
838 838
839 if (prefix) 839 if (prefix)
840 audit_log_format(ab, " %s", prefix); 840 audit_log_format(ab, " %s", prefix);
841 841
842 /* We will allow 11 spaces for ' (deleted)' to be appended */ 842 /* We will allow 11 spaces for ' (deleted)' to be appended */
843 path = kmalloc(PATH_MAX+11, ab->gfp_mask); 843 path = kmalloc(PATH_MAX+11, ab->gfp_mask);
844 if (!path) { 844 if (!path) {
845 audit_log_format(ab, "<no memory>"); 845 audit_log_format(ab, "<no memory>");
846 return; 846 return;
847 } 847 }
848 p = d_path(dentry, vfsmnt, path, PATH_MAX+11); 848 p = d_path(dentry, vfsmnt, path, PATH_MAX+11);
849 if (IS_ERR(p)) { /* Should never happen since we send PATH_MAX */ 849 if (IS_ERR(p)) { /* Should never happen since we send PATH_MAX */
850 /* FIXME: can we save some information here? */ 850 /* FIXME: can we save some information here? */
851 audit_log_format(ab, "<too long>"); 851 audit_log_format(ab, "<too long>");
852 } else 852 } else
853 audit_log_untrustedstring(ab, p); 853 audit_log_untrustedstring(ab, p);
854 kfree(path); 854 kfree(path);
855 } 855 }
856 856
857 /* The netlink_* functions cannot be called inside an irq context, so 857 /* The netlink_* functions cannot be called inside an irq context, so
858 * the audit buffer is places on a queue and a tasklet is scheduled to 858 * the audit buffer is places on a queue and a tasklet is scheduled to
859 * remove them from the queue outside the irq context. May be called in 859 * remove them from the queue outside the irq context. May be called in
860 * any context. */ 860 * any context. */
861 void audit_log_end(struct audit_buffer *ab) 861 void audit_log_end(struct audit_buffer *ab)
862 { 862 {
863 if (!ab) 863 if (!ab)
864 return; 864 return;
865 if (!audit_rate_check()) { 865 if (!audit_rate_check()) {
866 audit_log_lost("rate limit exceeded"); 866 audit_log_lost("rate limit exceeded");
867 } else { 867 } else {
868 if (audit_pid) { 868 if (audit_pid) {
869 struct nlmsghdr *nlh = (struct nlmsghdr *)ab->skb->data; 869 struct nlmsghdr *nlh = (struct nlmsghdr *)ab->skb->data;
870 nlh->nlmsg_len = ab->skb->len - NLMSG_SPACE(0); 870 nlh->nlmsg_len = ab->skb->len - NLMSG_SPACE(0);
871 skb_queue_tail(&audit_skb_queue, ab->skb); 871 skb_queue_tail(&audit_skb_queue, ab->skb);
872 ab->skb = NULL; 872 ab->skb = NULL;
873 wake_up_interruptible(&kauditd_wait); 873 wake_up_interruptible(&kauditd_wait);
874 } else { 874 } else {
875 printk(KERN_NOTICE "%s\n", ab->skb->data + NLMSG_SPACE(0)); 875 printk(KERN_NOTICE "%s\n", ab->skb->data + NLMSG_SPACE(0));
876 } 876 }
877 } 877 }
878 audit_buffer_free(ab); 878 audit_buffer_free(ab);
879 } 879 }
880 880
881 /* Log an audit record. This is a convenience function that calls 881 /* Log an audit record. This is a convenience function that calls
882 * audit_log_start, audit_log_vformat, and audit_log_end. It may be 882 * audit_log_start, audit_log_vformat, and audit_log_end. It may be
883 * called in any context. */ 883 * called in any context. */
884 void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, 884 void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type,
885 const char *fmt, ...) 885 const char *fmt, ...)
886 { 886 {
887 struct audit_buffer *ab; 887 struct audit_buffer *ab;
888 va_list args; 888 va_list args;
889 889
890 ab = audit_log_start(ctx, gfp_mask, type); 890 ab = audit_log_start(ctx, gfp_mask, type);
891 if (ab) { 891 if (ab) {
892 va_start(args, fmt); 892 va_start(args, fmt);
893 audit_log_vformat(ab, fmt, args); 893 audit_log_vformat(ab, fmt, args);
894 va_end(args); 894 va_end(args);
895 audit_log_end(ab); 895 audit_log_end(ab);
896 } 896 }
897 } 897 }
898 898
1 /* 1 /*
2 * linux/kernel/irq/proc.c 2 * linux/kernel/irq/proc.c
3 * 3 *
4 * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar 4 * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar
5 * 5 *
6 * This file contains the /proc/irq/ handling code. 6 * This file contains the /proc/irq/ handling code.
7 */ 7 */
8 8
9 #include <linux/irq.h> 9 #include <linux/irq.h>
10 #include <linux/proc_fs.h> 10 #include <linux/proc_fs.h>
11 #include <linux/interrupt.h> 11 #include <linux/interrupt.h>
12 12
13 #include "internals.h"
14
13 static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS]; 15 static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS];
14 16
15 #ifdef CONFIG_SMP 17 #ifdef CONFIG_SMP
16 18
17 /* 19 /*
18 * The /proc/irq/<irq>/smp_affinity values: 20 * The /proc/irq/<irq>/smp_affinity values:
19 */ 21 */
20 static struct proc_dir_entry *smp_affinity_entry[NR_IRQS]; 22 static struct proc_dir_entry *smp_affinity_entry[NR_IRQS];
21 23
22 #ifdef CONFIG_GENERIC_PENDING_IRQ 24 #ifdef CONFIG_GENERIC_PENDING_IRQ
23 void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) 25 void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
24 { 26 {
25 /* 27 /*
26 * Save these away for later use. Re-progam when the 28 * Save these away for later use. Re-progam when the
27 * interrupt is pending 29 * interrupt is pending
28 */ 30 */
29 set_pending_irq(irq, mask_val); 31 set_pending_irq(irq, mask_val);
30 } 32 }
31 #else 33 #else
32 void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) 34 void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
33 { 35 {
34 irq_affinity[irq] = mask_val; 36 irq_affinity[irq] = mask_val;
35 irq_desc[irq].handler->set_affinity(irq, mask_val); 37 irq_desc[irq].handler->set_affinity(irq, mask_val);
36 } 38 }
37 #endif 39 #endif
38 40
39 static int irq_affinity_read_proc(char *page, char **start, off_t off, 41 static int irq_affinity_read_proc(char *page, char **start, off_t off,
40 int count, int *eof, void *data) 42 int count, int *eof, void *data)
41 { 43 {
42 int len = cpumask_scnprintf(page, count, irq_affinity[(long)data]); 44 int len = cpumask_scnprintf(page, count, irq_affinity[(long)data]);
43 45
44 if (count - len < 2) 46 if (count - len < 2)
45 return -EINVAL; 47 return -EINVAL;
46 len += sprintf(page + len, "\n"); 48 len += sprintf(page + len, "\n");
47 return len; 49 return len;
48 } 50 }
49 51
50 int no_irq_affinity; 52 int no_irq_affinity;
51 static int irq_affinity_write_proc(struct file *file, const char __user *buffer, 53 static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
52 unsigned long count, void *data) 54 unsigned long count, void *data)
53 { 55 {
54 unsigned int irq = (int)(long)data, full_count = count, err; 56 unsigned int irq = (int)(long)data, full_count = count, err;
55 cpumask_t new_value, tmp; 57 cpumask_t new_value, tmp;
56 58
57 if (!irq_desc[irq].handler->set_affinity || no_irq_affinity) 59 if (!irq_desc[irq].handler->set_affinity || no_irq_affinity)
58 return -EIO; 60 return -EIO;
59 61
60 err = cpumask_parse(buffer, count, new_value); 62 err = cpumask_parse(buffer, count, new_value);
61 if (err) 63 if (err)
62 return err; 64 return err;
63 65
64 /* 66 /*
65 * Do not allow disabling IRQs completely - it's too easy a 67 * Do not allow disabling IRQs completely - it's too easy a
66 * way to make the system unusable accidentally :-) At least 68 * way to make the system unusable accidentally :-) At least
67 * one online CPU still has to be targeted. 69 * one online CPU still has to be targeted.
68 */ 70 */
69 cpus_and(tmp, new_value, cpu_online_map); 71 cpus_and(tmp, new_value, cpu_online_map);
70 if (cpus_empty(tmp)) 72 if (cpus_empty(tmp))
71 /* Special case for empty set - allow the architecture 73 /* Special case for empty set - allow the architecture
72 code to set default SMP affinity. */ 74 code to set default SMP affinity. */
73 return select_smp_affinity(irq) ? -EINVAL : full_count; 75 return select_smp_affinity(irq) ? -EINVAL : full_count;
74 76
75 proc_set_irq_affinity(irq, new_value); 77 proc_set_irq_affinity(irq, new_value);
76 78
77 return full_count; 79 return full_count;
78 } 80 }
79 81
80 #endif 82 #endif
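The smp_affinity write handler above parses a hex CPU mask from user space and refuses any mask that would leave the IRQ with no online CPU to target. A self-contained user-space sketch of that validation step, with online_mask standing in for cpu_online_map (names illustrative):

        /* Sketch: parse a hex CPU mask and reject it unless it intersects
         * the set of online CPUs, mirroring irq_affinity_write_proc(). */
        #include <stdio.h>
        #include <stdlib.h>

        static int apply_affinity(const char *buf, unsigned long online_mask,
                                  unsigned long *affinity)
        {
                unsigned long new_value;
                char *end;

                new_value = strtoul(buf, &end, 16);
                if (end == buf)
                        return -1;              /* not a hex number */
                if ((new_value & online_mask) == 0)
                        return -1;              /* would leave the IRQ unservable */
                *affinity = new_value;
                return 0;
        }

        int main(void)
        {
                unsigned long affinity = 0;

                if (apply_affinity("3", 0xf, &affinity) == 0)
                        printf("affinity set to %#lx\n", affinity);
                return 0;
        }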
81 83
82 #define MAX_NAMELEN 128 84 #define MAX_NAMELEN 128
83 85
84 static int name_unique(unsigned int irq, struct irqaction *new_action) 86 static int name_unique(unsigned int irq, struct irqaction *new_action)
85 { 87 {
86 struct irq_desc *desc = irq_desc + irq; 88 struct irq_desc *desc = irq_desc + irq;
87 struct irqaction *action; 89 struct irqaction *action;
88 90
89 for (action = desc->action ; action; action = action->next) 91 for (action = desc->action ; action; action = action->next)
90 if ((action != new_action) && action->name && 92 if ((action != new_action) && action->name &&
91 !strcmp(new_action->name, action->name)) 93 !strcmp(new_action->name, action->name))
92 return 0; 94 return 0;
93 return 1; 95 return 1;
94 } 96 }
95 97
96 void register_handler_proc(unsigned int irq, struct irqaction *action) 98 void register_handler_proc(unsigned int irq, struct irqaction *action)
97 { 99 {
98 char name [MAX_NAMELEN]; 100 char name [MAX_NAMELEN];
99 101
100 if (!irq_dir[irq] || action->dir || !action->name || 102 if (!irq_dir[irq] || action->dir || !action->name ||
101 !name_unique(irq, action)) 103 !name_unique(irq, action))
102 return; 104 return;
103 105
104 memset(name, 0, MAX_NAMELEN); 106 memset(name, 0, MAX_NAMELEN);
105 snprintf(name, MAX_NAMELEN, "%s", action->name); 107 snprintf(name, MAX_NAMELEN, "%s", action->name);
106 108
107 /* create /proc/irq/1234/handler/ */ 109 /* create /proc/irq/1234/handler/ */
108 action->dir = proc_mkdir(name, irq_dir[irq]); 110 action->dir = proc_mkdir(name, irq_dir[irq]);
109 } 111 }
110 112
111 #undef MAX_NAMELEN 113 #undef MAX_NAMELEN
112 114
113 #define MAX_NAMELEN 10 115 #define MAX_NAMELEN 10
114 116
115 void register_irq_proc(unsigned int irq) 117 void register_irq_proc(unsigned int irq)
116 { 118 {
117 char name [MAX_NAMELEN]; 119 char name [MAX_NAMELEN];
118 120
119 if (!root_irq_dir || 121 if (!root_irq_dir ||
120 (irq_desc[irq].handler == &no_irq_type) || 122 (irq_desc[irq].handler == &no_irq_type) ||
121 irq_dir[irq]) 123 irq_dir[irq])
122 return; 124 return;
123 125
124 memset(name, 0, MAX_NAMELEN); 126 memset(name, 0, MAX_NAMELEN);
125 sprintf(name, "%d", irq); 127 sprintf(name, "%d", irq);
126 128
127 /* create /proc/irq/1234 */ 129 /* create /proc/irq/1234 */
128 irq_dir[irq] = proc_mkdir(name, root_irq_dir); 130 irq_dir[irq] = proc_mkdir(name, root_irq_dir);
129 131
130 #ifdef CONFIG_SMP 132 #ifdef CONFIG_SMP
131 { 133 {
132 struct proc_dir_entry *entry; 134 struct proc_dir_entry *entry;
133 135
134 /* create /proc/irq/<irq>/smp_affinity */ 136 /* create /proc/irq/<irq>/smp_affinity */
135 entry = create_proc_entry("smp_affinity", 0600, irq_dir[irq]); 137 entry = create_proc_entry("smp_affinity", 0600, irq_dir[irq]);
136 138
137 if (entry) { 139 if (entry) {
138 entry->nlink = 1; 140 entry->nlink = 1;
139 entry->data = (void *)(long)irq; 141 entry->data = (void *)(long)irq;
140 entry->read_proc = irq_affinity_read_proc; 142 entry->read_proc = irq_affinity_read_proc;
141 entry->write_proc = irq_affinity_write_proc; 143 entry->write_proc = irq_affinity_write_proc;
142 } 144 }
143 smp_affinity_entry[irq] = entry; 145 smp_affinity_entry[irq] = entry;
144 } 146 }
145 #endif 147 #endif
146 } 148 }
147 149
148 #undef MAX_NAMELEN 150 #undef MAX_NAMELEN
149 151
150 void unregister_handler_proc(unsigned int irq, struct irqaction *action) 152 void unregister_handler_proc(unsigned int irq, struct irqaction *action)
151 { 153 {
152 if (action->dir) 154 if (action->dir)
153 remove_proc_entry(action->dir->name, irq_dir[irq]); 155 remove_proc_entry(action->dir->name, irq_dir[irq]);
154 } 156 }
155 157
156 void init_irq_proc(void) 158 void init_irq_proc(void)
157 { 159 {
158 int i; 160 int i;
159 161
160 /* create /proc/irq */ 162 /* create /proc/irq */
161 root_irq_dir = proc_mkdir("irq", NULL); 163 root_irq_dir = proc_mkdir("irq", NULL);
162 if (!root_irq_dir) 164 if (!root_irq_dir)
163 return; 165 return;
164 166
165 /* 167 /*
166 * Create entries for all existing IRQs. 168 * Create entries for all existing IRQs.
167 */ 169 */
168 for (i = 0; i < NR_IRQS; i++) 170 for (i = 0; i < NR_IRQS; i++)
169 register_irq_proc(i); 171 register_irq_proc(i);
170 } 172 }
171 173
172 174
1 /* 1 /*
2 * Read-Copy Update /proc-based torture test facility 2 * Read-Copy Update /proc-based torture test facility
3 * 3 *
4 * This program is free software; you can redistribute it and/or modify 4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by 5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or 6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version. 7 * (at your option) any later version.
8 * 8 *
9 * This program is distributed in the hope that it will be useful, 9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public License 14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software 15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 * 17 *
18 * Copyright (C) IBM Corporation, 2005 18 * Copyright (C) IBM Corporation, 2005
19 * 19 *
20 * Authors: Paul E. McKenney <paulmck@us.ibm.com> 20 * Authors: Paul E. McKenney <paulmck@us.ibm.com>
21 * 21 *
22 * See also: Documentation/RCU/torture.txt 22 * See also: Documentation/RCU/torture.txt
23 */ 23 */
24 #include <linux/types.h> 24 #include <linux/types.h>
25 #include <linux/kernel.h> 25 #include <linux/kernel.h>
26 #include <linux/init.h> 26 #include <linux/init.h>
27 #include <linux/module.h> 27 #include <linux/module.h>
28 #include <linux/kthread.h> 28 #include <linux/kthread.h>
29 #include <linux/err.h> 29 #include <linux/err.h>
30 #include <linux/spinlock.h> 30 #include <linux/spinlock.h>
31 #include <linux/smp.h> 31 #include <linux/smp.h>
32 #include <linux/rcupdate.h> 32 #include <linux/rcupdate.h>
33 #include <linux/interrupt.h> 33 #include <linux/interrupt.h>
34 #include <linux/sched.h> 34 #include <linux/sched.h>
35 #include <asm/atomic.h> 35 #include <asm/atomic.h>
36 #include <linux/bitops.h> 36 #include <linux/bitops.h>
37 #include <linux/module.h> 37 #include <linux/module.h>
38 #include <linux/completion.h> 38 #include <linux/completion.h>
39 #include <linux/moduleparam.h> 39 #include <linux/moduleparam.h>
40 #include <linux/percpu.h> 40 #include <linux/percpu.h>
41 #include <linux/notifier.h> 41 #include <linux/notifier.h>
42 #include <linux/rcuref.h> 42 #include <linux/rcuref.h>
43 #include <linux/cpu.h> 43 #include <linux/cpu.h>
44 #include <linux/random.h> 44 #include <linux/random.h>
45 #include <linux/delay.h> 45 #include <linux/delay.h>
46 #include <linux/byteorder/swabb.h> 46 #include <linux/byteorder/swabb.h>
47 #include <linux/stat.h> 47 #include <linux/stat.h>
48 48
49 MODULE_LICENSE("GPL"); 49 MODULE_LICENSE("GPL");
50 50
51 static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ 51 static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */
52 static int stat_interval = 0; /* Interval between stats, in seconds. */ 52 static int stat_interval = 0; /* Interval between stats, in seconds. */
53 /* Defaults to "only at end of test". */ 53 /* Defaults to "only at end of test". */
54 static int verbose = 0; /* Print more debug info. */ 54 static int verbose = 0; /* Print more debug info. */
55 55
56 MODULE_PARM(nreaders, "i"); 56 MODULE_PARM(nreaders, "i");
57 MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); 57 MODULE_PARM_DESC(nreaders, "Number of RCU reader threads");
58 MODULE_PARM(stat_interval, "i"); 58 MODULE_PARM(stat_interval, "i");
59 MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); 59 MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s");
60 MODULE_PARM(verbose, "i"); 60 MODULE_PARM(verbose, "i");
61 MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); 61 MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s");
62 #define TORTURE_FLAG "rcutorture: " 62 #define TORTURE_FLAG "rcutorture: "
63 #define PRINTK_STRING(s) \ 63 #define PRINTK_STRING(s) \
64 do { printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) 64 do { printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0)
65 #define VERBOSE_PRINTK_STRING(s) \ 65 #define VERBOSE_PRINTK_STRING(s) \
66 do { if (verbose) printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) 66 do { if (verbose) printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0)
67 #define VERBOSE_PRINTK_ERRSTRING(s) \ 67 #define VERBOSE_PRINTK_ERRSTRING(s) \
68 do { if (verbose) printk(KERN_ALERT TORTURE_FLAG "!!! " s "\n"); } while (0) 68 do { if (verbose) printk(KERN_ALERT TORTURE_FLAG "!!! " s "\n"); } while (0)
69 69
70 static char printk_buf[4096]; 70 static char printk_buf[4096];
71 71
72 static int nrealreaders; 72 static int nrealreaders;
73 static struct task_struct *writer_task; 73 static struct task_struct *writer_task;
74 static struct task_struct **reader_tasks; 74 static struct task_struct **reader_tasks;
75 static struct task_struct *stats_task; 75 static struct task_struct *stats_task;
76 76
77 #define RCU_TORTURE_PIPE_LEN 10 77 #define RCU_TORTURE_PIPE_LEN 10
78 78
79 struct rcu_torture { 79 struct rcu_torture {
80 struct rcu_head rtort_rcu; 80 struct rcu_head rtort_rcu;
81 int rtort_pipe_count; 81 int rtort_pipe_count;
82 struct list_head rtort_free; 82 struct list_head rtort_free;
83 int rtort_mbtest; 83 int rtort_mbtest;
84 }; 84 };
85 85
86 static int fullstop = 0; /* stop generating callbacks at test end. */ 86 static int fullstop = 0; /* stop generating callbacks at test end. */
87 static LIST_HEAD(rcu_torture_freelist); 87 static LIST_HEAD(rcu_torture_freelist);
88 static struct rcu_torture *rcu_torture_current = NULL; 88 static struct rcu_torture *rcu_torture_current = NULL;
89 static long rcu_torture_current_version = 0; 89 static long rcu_torture_current_version = 0;
90 static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; 90 static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN];
91 static DEFINE_SPINLOCK(rcu_torture_lock); 91 static DEFINE_SPINLOCK(rcu_torture_lock);
92 static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = 92 static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) =
93 { 0 }; 93 { 0 };
94 static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) = 94 static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) =
95 { 0 }; 95 { 0 };
96 static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1]; 96 static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1];
97 atomic_t n_rcu_torture_alloc; 97 atomic_t n_rcu_torture_alloc;
98 atomic_t n_rcu_torture_alloc_fail; 98 atomic_t n_rcu_torture_alloc_fail;
99 atomic_t n_rcu_torture_free; 99 atomic_t n_rcu_torture_free;
100 atomic_t n_rcu_torture_mberror; 100 atomic_t n_rcu_torture_mberror;
101 atomic_t n_rcu_torture_error; 101 atomic_t n_rcu_torture_error;
102 102
103 /* 103 /*
104 * Allocate an element from the rcu_tortures pool. 104 * Allocate an element from the rcu_tortures pool.
105 */ 105 */
106 struct rcu_torture * 106 static struct rcu_torture *
107 rcu_torture_alloc(void) 107 rcu_torture_alloc(void)
108 { 108 {
109 struct list_head *p; 109 struct list_head *p;
110 110
111 spin_lock(&rcu_torture_lock); 111 spin_lock(&rcu_torture_lock);
112 if (list_empty(&rcu_torture_freelist)) { 112 if (list_empty(&rcu_torture_freelist)) {
113 atomic_inc(&n_rcu_torture_alloc_fail); 113 atomic_inc(&n_rcu_torture_alloc_fail);
114 spin_unlock(&rcu_torture_lock); 114 spin_unlock(&rcu_torture_lock);
115 return NULL; 115 return NULL;
116 } 116 }
117 atomic_inc(&n_rcu_torture_alloc); 117 atomic_inc(&n_rcu_torture_alloc);
118 p = rcu_torture_freelist.next; 118 p = rcu_torture_freelist.next;
119 list_del_init(p); 119 list_del_init(p);
120 spin_unlock(&rcu_torture_lock); 120 spin_unlock(&rcu_torture_lock);
121 return container_of(p, struct rcu_torture, rtort_free); 121 return container_of(p, struct rcu_torture, rtort_free);
122 } 122 }
123 123
124 /* 124 /*
125 * Free an element to the rcu_tortures pool. 125 * Free an element to the rcu_tortures pool.
126 */ 126 */
127 static void 127 static void
128 rcu_torture_free(struct rcu_torture *p) 128 rcu_torture_free(struct rcu_torture *p)
129 { 129 {
130 atomic_inc(&n_rcu_torture_free); 130 atomic_inc(&n_rcu_torture_free);
131 spin_lock(&rcu_torture_lock); 131 spin_lock(&rcu_torture_lock);
132 list_add_tail(&p->rtort_free, &rcu_torture_freelist); 132 list_add_tail(&p->rtort_free, &rcu_torture_freelist);
133 spin_unlock(&rcu_torture_lock); 133 spin_unlock(&rcu_torture_lock);
134 } 134 }
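rcu_torture_alloc() and rcu_torture_free() above manage a fixed pool by threading elements onto a free list through an embedded list node and using container_of() to recover the element on allocation. A self-contained user-space sketch of that pattern (spinlock omitted, names illustrative):

        /* Sketch: a free list threaded through an embedded node, with
         * container_of() turning the node back into its element. */
        #include <stddef.h>
        #include <stdio.h>

        #define container_of(ptr, type, member) \
                ((type *)((char *)(ptr) - offsetof(type, member)))

        struct node { struct node *next; };

        struct item {
                int payload;
                struct node free_node;          /* analogue of rtort_free */
        };

        static struct node *free_list;

        static void item_free(struct item *p)
        {
                p->free_node.next = free_list;
                free_list = &p->free_node;
        }

        static struct item *item_alloc(void)
        {
                struct node *n = free_list;

                if (!n)
                        return NULL;            /* pool exhausted */
                free_list = n->next;
                return container_of(n, struct item, free_node);
        }

        int main(void)
        {
                static struct item pool[4];
                int i;

                for (i = 0; i < 4; i++)
                        item_free(&pool[i]);    /* seed the free list */
                printf("got %p\n", (void *)item_alloc());
                return 0;
        }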
135 135
136 static void 136 static void
137 rcu_torture_cb(struct rcu_head *p) 137 rcu_torture_cb(struct rcu_head *p)
138 { 138 {
139 int i; 139 int i;
140 struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); 140 struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu);
141 141
142 if (fullstop) { 142 if (fullstop) {
143 /* Test is ending, just drop callbacks on the floor. */ 143 /* Test is ending, just drop callbacks on the floor. */
144 /* The next initialization will pick up the pieces. */ 144 /* The next initialization will pick up the pieces. */
145 return; 145 return;
146 } 146 }
147 i = rp->rtort_pipe_count; 147 i = rp->rtort_pipe_count;
148 if (i > RCU_TORTURE_PIPE_LEN) 148 if (i > RCU_TORTURE_PIPE_LEN)
149 i = RCU_TORTURE_PIPE_LEN; 149 i = RCU_TORTURE_PIPE_LEN;
150 atomic_inc(&rcu_torture_wcount[i]); 150 atomic_inc(&rcu_torture_wcount[i]);
151 if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { 151 if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
152 rp->rtort_mbtest = 0; 152 rp->rtort_mbtest = 0;
153 rcu_torture_free(rp); 153 rcu_torture_free(rp);
154 } else 154 } else
155 call_rcu(p, rcu_torture_cb); 155 call_rcu(p, rcu_torture_cb);
156 } 156 }
157 157
158 struct rcu_random_state { 158 struct rcu_random_state {
159 unsigned long rrs_state; 159 unsigned long rrs_state;
160 long rrs_count; 160 long rrs_count;
161 }; 161 };
162 162
163 #define RCU_RANDOM_MULT 39916801 /* prime */ 163 #define RCU_RANDOM_MULT 39916801 /* prime */
164 #define RCU_RANDOM_ADD 479001701 /* prime */ 164 #define RCU_RANDOM_ADD 479001701 /* prime */
165 #define RCU_RANDOM_REFRESH 10000 165 #define RCU_RANDOM_REFRESH 10000
166 166
167 #define DEFINE_RCU_RANDOM(name) struct rcu_random_state name = { 0, 0 } 167 #define DEFINE_RCU_RANDOM(name) struct rcu_random_state name = { 0, 0 }
168 168
169 /* 169 /*
170 * Crude but fast random-number generator. Uses a linear congruential 170 * Crude but fast random-number generator. Uses a linear congruential
171 * generator, with occasional help from get_random_bytes(). 171 * generator, with occasional help from get_random_bytes().
172 */ 172 */
173 static long 173 static long
174 rcu_random(struct rcu_random_state *rrsp) 174 rcu_random(struct rcu_random_state *rrsp)
175 { 175 {
176 long refresh; 176 long refresh;
177 177
178 if (--rrsp->rrs_count < 0) { 178 if (--rrsp->rrs_count < 0) {
179 get_random_bytes(&refresh, sizeof(refresh)); 179 get_random_bytes(&refresh, sizeof(refresh));
180 rrsp->rrs_state += refresh; 180 rrsp->rrs_state += refresh;
181 rrsp->rrs_count = RCU_RANDOM_REFRESH; 181 rrsp->rrs_count = RCU_RANDOM_REFRESH;
182 } 182 }
183 rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD; 183 rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD;
184 return swahw32(rrsp->rrs_state); 184 return swahw32(rrsp->rrs_state);
185 } 185 }
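rcu_random() above is a linear congruential generator that is periodically re-seeded from get_random_bytes(). A user-space sketch with the same multiplier and addend; the entropy refresh is stubbed with a constant and the swahw32() word swap is omitted:

        /* Sketch: LCG step with occasional re-seed, as used by the torture
         * threads to derive their udelay() jitter. */
        #include <stdio.h>

        #define RCU_RANDOM_MULT     39916801UL      /* prime */
        #define RCU_RANDOM_ADD      479001701UL     /* prime */
        #define RCU_RANDOM_REFRESH  10000

        struct rcu_random_state {
                unsigned long rrs_state;
                long rrs_count;
        };

        static unsigned long rcu_random(struct rcu_random_state *rrsp)
        {
                if (--rrsp->rrs_count < 0) {
                        rrsp->rrs_state += 12345;   /* stand-in for get_random_bytes() */
                        rrsp->rrs_count = RCU_RANDOM_REFRESH;
                }
                rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD;
                return rrsp->rrs_state;
        }

        int main(void)
        {
                struct rcu_random_state rand = { 0, 0 };
                int i;

                for (i = 0; i < 4; i++)
                        printf("%lu\n", rcu_random(&rand) & 0x3ff); /* as in the writer's udelay() */
                return 0;
        }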
186 186
187 /* 187 /*
188 * RCU torture writer kthread. Repeatedly substitutes a new structure 188 * RCU torture writer kthread. Repeatedly substitutes a new structure
189 * for that pointed to by rcu_torture_current, freeing the old structure 189 * for that pointed to by rcu_torture_current, freeing the old structure
190 * after a series of grace periods (the "pipeline"). 190 * after a series of grace periods (the "pipeline").
191 */ 191 */
192 static int 192 static int
193 rcu_torture_writer(void *arg) 193 rcu_torture_writer(void *arg)
194 { 194 {
195 int i; 195 int i;
196 long oldbatch = rcu_batches_completed(); 196 long oldbatch = rcu_batches_completed();
197 struct rcu_torture *rp; 197 struct rcu_torture *rp;
198 struct rcu_torture *old_rp; 198 struct rcu_torture *old_rp;
199 static DEFINE_RCU_RANDOM(rand); 199 static DEFINE_RCU_RANDOM(rand);
200 200
201 VERBOSE_PRINTK_STRING("rcu_torture_writer task started"); 201 VERBOSE_PRINTK_STRING("rcu_torture_writer task started");
202 set_user_nice(current, 19); 202 set_user_nice(current, 19);
203 203
204 do { 204 do {
205 schedule_timeout_uninterruptible(1); 205 schedule_timeout_uninterruptible(1);
206 if (rcu_batches_completed() == oldbatch) 206 if (rcu_batches_completed() == oldbatch)
207 continue; 207 continue;
208 if ((rp = rcu_torture_alloc()) == NULL) 208 if ((rp = rcu_torture_alloc()) == NULL)
209 continue; 209 continue;
210 rp->rtort_pipe_count = 0; 210 rp->rtort_pipe_count = 0;
211 udelay(rcu_random(&rand) & 0x3ff); 211 udelay(rcu_random(&rand) & 0x3ff);
212 old_rp = rcu_torture_current; 212 old_rp = rcu_torture_current;
213 rp->rtort_mbtest = 1; 213 rp->rtort_mbtest = 1;
214 rcu_assign_pointer(rcu_torture_current, rp); 214 rcu_assign_pointer(rcu_torture_current, rp);
215 smp_wmb(); 215 smp_wmb();
216 if (old_rp != NULL) { 216 if (old_rp != NULL) {
217 i = old_rp->rtort_pipe_count; 217 i = old_rp->rtort_pipe_count;
218 if (i > RCU_TORTURE_PIPE_LEN) 218 if (i > RCU_TORTURE_PIPE_LEN)
219 i = RCU_TORTURE_PIPE_LEN; 219 i = RCU_TORTURE_PIPE_LEN;
220 atomic_inc(&rcu_torture_wcount[i]); 220 atomic_inc(&rcu_torture_wcount[i]);
221 old_rp->rtort_pipe_count++; 221 old_rp->rtort_pipe_count++;
222 call_rcu(&old_rp->rtort_rcu, rcu_torture_cb); 222 call_rcu(&old_rp->rtort_rcu, rcu_torture_cb);
223 } 223 }
224 rcu_torture_current_version++; 224 rcu_torture_current_version++;
225 oldbatch = rcu_batches_completed(); 225 oldbatch = rcu_batches_completed();
226 } while (!kthread_should_stop() && !fullstop); 226 } while (!kthread_should_stop() && !fullstop);
227 VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); 227 VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping");
228 while (!kthread_should_stop()) 228 while (!kthread_should_stop())
229 schedule_timeout_uninterruptible(1); 229 schedule_timeout_uninterruptible(1);
230 return 0; 230 return 0;
231 } 231 }
232 232
233 /* 233 /*
234 * RCU torture reader kthread. Repeatedly dereferences rcu_torture_current, 234 * RCU torture reader kthread. Repeatedly dereferences rcu_torture_current,
235 * incrementing the corresponding element of the pipeline array. The 235 * incrementing the corresponding element of the pipeline array. The
236 * counter in the element should never be greater than 1, otherwise, the 236 * counter in the element should never be greater than 1, otherwise, the
237 * RCU implementation is broken. 237 * RCU implementation is broken.
238 */ 238 */
239 static int 239 static int
240 rcu_torture_reader(void *arg) 240 rcu_torture_reader(void *arg)
241 { 241 {
242 int completed; 242 int completed;
243 DEFINE_RCU_RANDOM(rand); 243 DEFINE_RCU_RANDOM(rand);
244 struct rcu_torture *p; 244 struct rcu_torture *p;
245 int pipe_count; 245 int pipe_count;
246 246
247 VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); 247 VERBOSE_PRINTK_STRING("rcu_torture_reader task started");
248 set_user_nice(current, 19); 248 set_user_nice(current, 19);
249 249
250 do { 250 do {
251 rcu_read_lock(); 251 rcu_read_lock();
252 completed = rcu_batches_completed(); 252 completed = rcu_batches_completed();
253 p = rcu_dereference(rcu_torture_current); 253 p = rcu_dereference(rcu_torture_current);
254 if (p == NULL) { 254 if (p == NULL) {
255 /* Wait for rcu_torture_writer to get underway */ 255 /* Wait for rcu_torture_writer to get underway */
256 rcu_read_unlock(); 256 rcu_read_unlock();
257 schedule_timeout_interruptible(HZ); 257 schedule_timeout_interruptible(HZ);
258 continue; 258 continue;
259 } 259 }
260 if (p->rtort_mbtest == 0) 260 if (p->rtort_mbtest == 0)
261 atomic_inc(&n_rcu_torture_mberror); 261 atomic_inc(&n_rcu_torture_mberror);
262 udelay(rcu_random(&rand) & 0x7f); 262 udelay(rcu_random(&rand) & 0x7f);
263 preempt_disable(); 263 preempt_disable();
264 pipe_count = p->rtort_pipe_count; 264 pipe_count = p->rtort_pipe_count;
265 if (pipe_count > RCU_TORTURE_PIPE_LEN) { 265 if (pipe_count > RCU_TORTURE_PIPE_LEN) {
266 /* Should not happen, but... */ 266 /* Should not happen, but... */
267 pipe_count = RCU_TORTURE_PIPE_LEN; 267 pipe_count = RCU_TORTURE_PIPE_LEN;
268 } 268 }
269 ++__get_cpu_var(rcu_torture_count)[pipe_count]; 269 ++__get_cpu_var(rcu_torture_count)[pipe_count];
270 completed = rcu_batches_completed() - completed; 270 completed = rcu_batches_completed() - completed;
271 if (completed > RCU_TORTURE_PIPE_LEN) { 271 if (completed > RCU_TORTURE_PIPE_LEN) {
272 /* Should not happen, but... */ 272 /* Should not happen, but... */
273 completed = RCU_TORTURE_PIPE_LEN; 273 completed = RCU_TORTURE_PIPE_LEN;
274 } 274 }
275 ++__get_cpu_var(rcu_torture_batch)[completed]; 275 ++__get_cpu_var(rcu_torture_batch)[completed];
276 preempt_enable(); 276 preempt_enable();
277 rcu_read_unlock(); 277 rcu_read_unlock();
278 schedule(); 278 schedule();
279 } while (!kthread_should_stop() && !fullstop); 279 } while (!kthread_should_stop() && !fullstop);
280 VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); 280 VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping");
281 while (!kthread_should_stop()) 281 while (!kthread_should_stop())
282 schedule_timeout_uninterruptible(1); 282 schedule_timeout_uninterruptible(1);
283 return 0; 283 return 0;
284 } 284 }
285 285
286 /* 286 /*
287 * Create an RCU-torture statistics message in the specified buffer. 287 * Create an RCU-torture statistics message in the specified buffer.
288 */ 288 */
289 static int 289 static int
290 rcu_torture_printk(char *page) 290 rcu_torture_printk(char *page)
291 { 291 {
292 int cnt = 0; 292 int cnt = 0;
293 int cpu; 293 int cpu;
294 int i; 294 int i;
295 long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; 295 long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
296 long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; 296 long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
297 297
298 for_each_cpu(cpu) { 298 for_each_cpu(cpu) {
299 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { 299 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
300 pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i]; 300 pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i];
301 batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i]; 301 batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i];
302 } 302 }
303 } 303 }
304 for (i = RCU_TORTURE_PIPE_LEN - 1; i >= 0; i--) { 304 for (i = RCU_TORTURE_PIPE_LEN - 1; i >= 0; i--) {
305 if (pipesummary[i] != 0) 305 if (pipesummary[i] != 0)
306 break; 306 break;
307 } 307 }
308 cnt += sprintf(&page[cnt], "rcutorture: "); 308 cnt += sprintf(&page[cnt], "rcutorture: ");
309 cnt += sprintf(&page[cnt], 309 cnt += sprintf(&page[cnt],
310 "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " 310 "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d "
311 "rtmbe: %d", 311 "rtmbe: %d",
312 rcu_torture_current, 312 rcu_torture_current,
313 rcu_torture_current_version, 313 rcu_torture_current_version,
314 list_empty(&rcu_torture_freelist), 314 list_empty(&rcu_torture_freelist),
315 atomic_read(&n_rcu_torture_alloc), 315 atomic_read(&n_rcu_torture_alloc),
316 atomic_read(&n_rcu_torture_alloc_fail), 316 atomic_read(&n_rcu_torture_alloc_fail),
317 atomic_read(&n_rcu_torture_free), 317 atomic_read(&n_rcu_torture_free),
318 atomic_read(&n_rcu_torture_mberror)); 318 atomic_read(&n_rcu_torture_mberror));
319 if (atomic_read(&n_rcu_torture_mberror) != 0) 319 if (atomic_read(&n_rcu_torture_mberror) != 0)
320 cnt += sprintf(&page[cnt], " !!!"); 320 cnt += sprintf(&page[cnt], " !!!");
321 cnt += sprintf(&page[cnt], "\nrcutorture: "); 321 cnt += sprintf(&page[cnt], "\nrcutorture: ");
322 if (i > 1) { 322 if (i > 1) {
323 cnt += sprintf(&page[cnt], "!!! "); 323 cnt += sprintf(&page[cnt], "!!! ");
324 atomic_inc(&n_rcu_torture_error); 324 atomic_inc(&n_rcu_torture_error);
325 } 325 }
326 cnt += sprintf(&page[cnt], "Reader Pipe: "); 326 cnt += sprintf(&page[cnt], "Reader Pipe: ");
327 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) 327 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
328 cnt += sprintf(&page[cnt], " %ld", pipesummary[i]); 328 cnt += sprintf(&page[cnt], " %ld", pipesummary[i]);
329 cnt += sprintf(&page[cnt], "\nrcutorture: "); 329 cnt += sprintf(&page[cnt], "\nrcutorture: ");
330 cnt += sprintf(&page[cnt], "Reader Batch: "); 330 cnt += sprintf(&page[cnt], "Reader Batch: ");
331 for (i = 0; i < RCU_TORTURE_PIPE_LEN; i++) 331 for (i = 0; i < RCU_TORTURE_PIPE_LEN; i++)
332 cnt += sprintf(&page[cnt], " %ld", batchsummary[i]); 332 cnt += sprintf(&page[cnt], " %ld", batchsummary[i]);
333 cnt += sprintf(&page[cnt], "\nrcutorture: "); 333 cnt += sprintf(&page[cnt], "\nrcutorture: ");
334 cnt += sprintf(&page[cnt], "Free-Block Circulation: "); 334 cnt += sprintf(&page[cnt], "Free-Block Circulation: ");
335 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { 335 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
336 cnt += sprintf(&page[cnt], " %d", 336 cnt += sprintf(&page[cnt], " %d",
337 atomic_read(&rcu_torture_wcount[i])); 337 atomic_read(&rcu_torture_wcount[i]));
338 } 338 }
339 cnt += sprintf(&page[cnt], "\n"); 339 cnt += sprintf(&page[cnt], "\n");
340 return cnt; 340 return cnt;
341 } 341 }
342 342
343 /* 343 /*
344 * Print torture statistics. Caller must ensure that there is only 344 * Print torture statistics. Caller must ensure that there is only
345 * one call to this function at a given time!!! This is normally 345 * one call to this function at a given time!!! This is normally
346 * accomplished by relying on the module system to only have one copy 346 * accomplished by relying on the module system to only have one copy
347 * of the module loaded, and then by giving the rcu_torture_stats 347 * of the module loaded, and then by giving the rcu_torture_stats
348 * kthread full control (or the init/cleanup functions when rcu_torture_stats 348 * kthread full control (or the init/cleanup functions when rcu_torture_stats
349 * thread is not running). 349 * thread is not running).
350 */ 350 */
351 static void 351 static void
352 rcu_torture_stats_print(void) 352 rcu_torture_stats_print(void)
353 { 353 {
354 int cnt; 354 int cnt;
355 355
356 cnt = rcu_torture_printk(printk_buf); 356 cnt = rcu_torture_printk(printk_buf);
357 printk(KERN_ALERT "%s", printk_buf); 357 printk(KERN_ALERT "%s", printk_buf);
358 } 358 }
359 359
360 /* 360 /*
361 * Periodically prints torture statistics, if periodic statistics printing 361 * Periodically prints torture statistics, if periodic statistics printing
362 * was specified via the stat_interval module parameter. 362 * was specified via the stat_interval module parameter.
363 * 363 *
364 * No need to worry about fullstop here, since this one doesn't reference 364 * No need to worry about fullstop here, since this one doesn't reference
365 * volatile state or register callbacks. 365 * volatile state or register callbacks.
366 */ 366 */
367 static int 367 static int
368 rcu_torture_stats(void *arg) 368 rcu_torture_stats(void *arg)
369 { 369 {
370 VERBOSE_PRINTK_STRING("rcu_torture_stats task started"); 370 VERBOSE_PRINTK_STRING("rcu_torture_stats task started");
371 do { 371 do {
372 schedule_timeout_interruptible(stat_interval * HZ); 372 schedule_timeout_interruptible(stat_interval * HZ);
373 rcu_torture_stats_print(); 373 rcu_torture_stats_print();
374 } while (!kthread_should_stop()); 374 } while (!kthread_should_stop());
375 VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping"); 375 VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping");
376 return 0; 376 return 0;
377 } 377 }
378 378
379 static void 379 static void
380 rcu_torture_cleanup(void) 380 rcu_torture_cleanup(void)
381 { 381 {
382 int i; 382 int i;
383 383
384 fullstop = 1; 384 fullstop = 1;
385 if (writer_task != NULL) { 385 if (writer_task != NULL) {
386 VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); 386 VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task");
387 kthread_stop(writer_task); 387 kthread_stop(writer_task);
388 } 388 }
389 writer_task = NULL; 389 writer_task = NULL;
390 390
391 if (reader_tasks != NULL) { 391 if (reader_tasks != NULL) {
392 for (i = 0; i < nrealreaders; i++) { 392 for (i = 0; i < nrealreaders; i++) {
393 if (reader_tasks[i] != NULL) { 393 if (reader_tasks[i] != NULL) {
394 VERBOSE_PRINTK_STRING( 394 VERBOSE_PRINTK_STRING(
395 "Stopping rcu_torture_reader task"); 395 "Stopping rcu_torture_reader task");
396 kthread_stop(reader_tasks[i]); 396 kthread_stop(reader_tasks[i]);
397 } 397 }
398 reader_tasks[i] = NULL; 398 reader_tasks[i] = NULL;
399 } 399 }
400 kfree(reader_tasks); 400 kfree(reader_tasks);
401 reader_tasks = NULL; 401 reader_tasks = NULL;
402 } 402 }
403 rcu_torture_current = NULL; 403 rcu_torture_current = NULL;
404 404
405 if (stats_task != NULL) { 405 if (stats_task != NULL) {
406 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task"); 406 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task");
407 kthread_stop(stats_task); 407 kthread_stop(stats_task);
408 } 408 }
409 stats_task = NULL; 409 stats_task = NULL;
410 410
411 /* Wait for all RCU callbacks to fire. */ 411 /* Wait for all RCU callbacks to fire. */
412 rcu_barrier(); 412 rcu_barrier();
413 413
414 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ 414 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
415 printk(KERN_ALERT TORTURE_FLAG 415 printk(KERN_ALERT TORTURE_FLAG
416 "--- End of test: %s\n", 416 "--- End of test: %s\n",
417 atomic_read(&n_rcu_torture_error) == 0 ? "SUCCESS" : "FAILURE"); 417 atomic_read(&n_rcu_torture_error) == 0 ? "SUCCESS" : "FAILURE");
418 } 418 }
419 419
420 static int 420 static int
421 rcu_torture_init(void) 421 rcu_torture_init(void)
422 { 422 {
423 int i; 423 int i;
424 int cpu; 424 int cpu;
425 int firsterr = 0; 425 int firsterr = 0;
426 426
427 /* Process args and tell the world that the torturer is on the job. */ 427 /* Process args and tell the world that the torturer is on the job. */
428 428
429 if (nreaders >= 0) 429 if (nreaders >= 0)
430 nrealreaders = nreaders; 430 nrealreaders = nreaders;
431 else 431 else
432 nrealreaders = 2 * num_online_cpus(); 432 nrealreaders = 2 * num_online_cpus();
433 printk(KERN_ALERT TORTURE_FLAG 433 printk(KERN_ALERT TORTURE_FLAG
434 "--- Start of test: nreaders=%d stat_interval=%d verbose=%d\n", 434 "--- Start of test: nreaders=%d stat_interval=%d verbose=%d\n",
435 nrealreaders, stat_interval, verbose); 435 nrealreaders, stat_interval, verbose);
436 fullstop = 0; 436 fullstop = 0;
437 437
438 /* Set up the freelist. */ 438 /* Set up the freelist. */
439 439
440 INIT_LIST_HEAD(&rcu_torture_freelist); 440 INIT_LIST_HEAD(&rcu_torture_freelist);
441 for (i = 0; i < sizeof(rcu_tortures) / sizeof(rcu_tortures[0]); i++) { 441 for (i = 0; i < sizeof(rcu_tortures) / sizeof(rcu_tortures[0]); i++) {
442 rcu_tortures[i].rtort_mbtest = 0; 442 rcu_tortures[i].rtort_mbtest = 0;
443 list_add_tail(&rcu_tortures[i].rtort_free, 443 list_add_tail(&rcu_tortures[i].rtort_free,
444 &rcu_torture_freelist); 444 &rcu_torture_freelist);
445 } 445 }
446 446
447 /* Initialize the statistics so that each run gets its own numbers. */ 447 /* Initialize the statistics so that each run gets its own numbers. */
448 448
449 rcu_torture_current = NULL; 449 rcu_torture_current = NULL;
450 rcu_torture_current_version = 0; 450 rcu_torture_current_version = 0;
451 atomic_set(&n_rcu_torture_alloc, 0); 451 atomic_set(&n_rcu_torture_alloc, 0);
452 atomic_set(&n_rcu_torture_alloc_fail, 0); 452 atomic_set(&n_rcu_torture_alloc_fail, 0);
453 atomic_set(&n_rcu_torture_free, 0); 453 atomic_set(&n_rcu_torture_free, 0);
454 atomic_set(&n_rcu_torture_mberror, 0); 454 atomic_set(&n_rcu_torture_mberror, 0);
455 atomic_set(&n_rcu_torture_error, 0); 455 atomic_set(&n_rcu_torture_error, 0);
456 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) 456 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
457 atomic_set(&rcu_torture_wcount[i], 0); 457 atomic_set(&rcu_torture_wcount[i], 0);
458 for_each_cpu(cpu) { 458 for_each_cpu(cpu) {
459 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { 459 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
460 per_cpu(rcu_torture_count, cpu)[i] = 0; 460 per_cpu(rcu_torture_count, cpu)[i] = 0;
461 per_cpu(rcu_torture_batch, cpu)[i] = 0; 461 per_cpu(rcu_torture_batch, cpu)[i] = 0;
462 } 462 }
463 } 463 }
464 464
465 /* Start up the kthreads. */ 465 /* Start up the kthreads. */
466 466
467 VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task"); 467 VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task");
468 writer_task = kthread_run(rcu_torture_writer, NULL, 468 writer_task = kthread_run(rcu_torture_writer, NULL,
469 "rcu_torture_writer"); 469 "rcu_torture_writer");
470 if (IS_ERR(writer_task)) { 470 if (IS_ERR(writer_task)) {
471 firsterr = PTR_ERR(writer_task); 471 firsterr = PTR_ERR(writer_task);
472 VERBOSE_PRINTK_ERRSTRING("Failed to create writer"); 472 VERBOSE_PRINTK_ERRSTRING("Failed to create writer");
473 writer_task = NULL; 473 writer_task = NULL;
474 goto unwind; 474 goto unwind;
475 } 475 }
476 reader_tasks = kmalloc(nrealreaders * sizeof(reader_tasks[0]), 476 reader_tasks = kmalloc(nrealreaders * sizeof(reader_tasks[0]),
477 GFP_KERNEL); 477 GFP_KERNEL);
478 if (reader_tasks == NULL) { 478 if (reader_tasks == NULL) {
479 VERBOSE_PRINTK_ERRSTRING("out of memory"); 479 VERBOSE_PRINTK_ERRSTRING("out of memory");
480 firsterr = -ENOMEM; 480 firsterr = -ENOMEM;
481 goto unwind; 481 goto unwind;
482 } 482 }
483 for (i = 0; i < nrealreaders; i++) { 483 for (i = 0; i < nrealreaders; i++) {
484 VERBOSE_PRINTK_STRING("Creating rcu_torture_reader task"); 484 VERBOSE_PRINTK_STRING("Creating rcu_torture_reader task");
485 reader_tasks[i] = kthread_run(rcu_torture_reader, NULL, 485 reader_tasks[i] = kthread_run(rcu_torture_reader, NULL,
486 "rcu_torture_reader"); 486 "rcu_torture_reader");
487 if (IS_ERR(reader_tasks[i])) { 487 if (IS_ERR(reader_tasks[i])) {
488 firsterr = PTR_ERR(reader_tasks[i]); 488 firsterr = PTR_ERR(reader_tasks[i]);
489 VERBOSE_PRINTK_ERRSTRING("Failed to create reader"); 489 VERBOSE_PRINTK_ERRSTRING("Failed to create reader");
490 reader_tasks[i] = NULL; 490 reader_tasks[i] = NULL;
491 goto unwind; 491 goto unwind;
492 } 492 }
493 } 493 }
494 if (stat_interval > 0) { 494 if (stat_interval > 0) {
495 VERBOSE_PRINTK_STRING("Creating rcu_torture_stats task"); 495 VERBOSE_PRINTK_STRING("Creating rcu_torture_stats task");
496 stats_task = kthread_run(rcu_torture_stats, NULL, 496 stats_task = kthread_run(rcu_torture_stats, NULL,
497 "rcu_torture_stats"); 497 "rcu_torture_stats");
498 if (IS_ERR(stats_task)) { 498 if (IS_ERR(stats_task)) {
499 firsterr = PTR_ERR(stats_task); 499 firsterr = PTR_ERR(stats_task);
500 VERBOSE_PRINTK_ERRSTRING("Failed to create stats"); 500 VERBOSE_PRINTK_ERRSTRING("Failed to create stats");
501 stats_task = NULL; 501 stats_task = NULL;
502 goto unwind; 502 goto unwind;
503 } 503 }
504 } 504 }
505 return 0; 505 return 0;
506 506
507 unwind: 507 unwind:
508 rcu_torture_cleanup(); 508 rcu_torture_cleanup();
509 return firsterr; 509 return firsterr;
510 } 510 }
511 511
512 module_init(rcu_torture_init); 512 module_init(rcu_torture_init);
513 module_exit(rcu_torture_cleanup); 513 module_exit(rcu_torture_cleanup);
514 514
1 /* 1 /*
2 * linux/kernel/timer.c 2 * linux/kernel/timer.c
3 * 3 *
4 * Kernel internal timers, kernel timekeeping, basic process system calls 4 * Kernel internal timers, kernel timekeeping, basic process system calls
5 * 5 *
6 * Copyright (C) 1991, 1992 Linus Torvalds 6 * Copyright (C) 1991, 1992 Linus Torvalds
7 * 7 *
8 * 1997-01-28 Modified by Finn Arne Gangstad to make timers scale better. 8 * 1997-01-28 Modified by Finn Arne Gangstad to make timers scale better.
9 * 9 *
10 * 1997-09-10 Updated NTP code according to technical memorandum Jan '96 10 * 1997-09-10 Updated NTP code according to technical memorandum Jan '96
11 * "A Kernel Model for Precision Timekeeping" by Dave Mills 11 * "A Kernel Model for Precision Timekeeping" by Dave Mills
12 * 1998-12-24 Fixed a xtime SMP race (we need the xtime_lock rw spinlock to 12 * 1998-12-24 Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
13 * serialize accesses to xtime/lost_ticks). 13 * serialize accesses to xtime/lost_ticks).
14 * Copyright (C) 1998 Andrea Arcangeli 14 * Copyright (C) 1998 Andrea Arcangeli
15 * 1999-03-10 Improved NTP compatibility by Ulrich Windl 15 * 1999-03-10 Improved NTP compatibility by Ulrich Windl
16 * 2002-05-31 Move sys_sysinfo here and make its locking sane, Robert Love 16 * 2002-05-31 Move sys_sysinfo here and make its locking sane, Robert Love
17 * 2000-10-05 Implemented scalable SMP per-CPU timer handling. 17 * 2000-10-05 Implemented scalable SMP per-CPU timer handling.
18 * Copyright (C) 2000, 2001, 2002 Ingo Molnar 18 * Copyright (C) 2000, 2001, 2002 Ingo Molnar
19 * Designed by David S. Miller, Alexey Kuznetsov and Ingo Molnar 19 * Designed by David S. Miller, Alexey Kuznetsov and Ingo Molnar
20 */ 20 */
21 21
22 #include <linux/kernel_stat.h> 22 #include <linux/kernel_stat.h>
23 #include <linux/module.h> 23 #include <linux/module.h>
24 #include <linux/interrupt.h> 24 #include <linux/interrupt.h>
25 #include <linux/percpu.h> 25 #include <linux/percpu.h>
26 #include <linux/init.h> 26 #include <linux/init.h>
27 #include <linux/mm.h> 27 #include <linux/mm.h>
28 #include <linux/swap.h> 28 #include <linux/swap.h>
29 #include <linux/notifier.h> 29 #include <linux/notifier.h>
30 #include <linux/thread_info.h> 30 #include <linux/thread_info.h>
31 #include <linux/time.h> 31 #include <linux/time.h>
32 #include <linux/jiffies.h> 32 #include <linux/jiffies.h>
33 #include <linux/posix-timers.h> 33 #include <linux/posix-timers.h>
34 #include <linux/cpu.h> 34 #include <linux/cpu.h>
35 #include <linux/syscalls.h> 35 #include <linux/syscalls.h>
36 #include <linux/delay.h>
36 37
37 #include <asm/uaccess.h> 38 #include <asm/uaccess.h>
38 #include <asm/unistd.h> 39 #include <asm/unistd.h>
39 #include <asm/div64.h> 40 #include <asm/div64.h>
40 #include <asm/timex.h> 41 #include <asm/timex.h>
41 #include <asm/io.h> 42 #include <asm/io.h>
42 43
43 #ifdef CONFIG_TIME_INTERPOLATION 44 #ifdef CONFIG_TIME_INTERPOLATION
44 static void time_interpolator_update(long delta_nsec); 45 static void time_interpolator_update(long delta_nsec);
45 #else 46 #else
46 #define time_interpolator_update(x) 47 #define time_interpolator_update(x)
47 #endif 48 #endif
48 49
49 u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; 50 u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;
50 51
51 EXPORT_SYMBOL(jiffies_64); 52 EXPORT_SYMBOL(jiffies_64);
52 53
53 /* 54 /*
54 * per-CPU timer vector definitions: 55 * per-CPU timer vector definitions:
55 */ 56 */
56 57
57 #define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6) 58 #define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6)
58 #define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8) 59 #define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8)
59 #define TVN_SIZE (1 << TVN_BITS) 60 #define TVN_SIZE (1 << TVN_BITS)
60 #define TVR_SIZE (1 << TVR_BITS) 61 #define TVR_SIZE (1 << TVR_BITS)
61 #define TVN_MASK (TVN_SIZE - 1) 62 #define TVN_MASK (TVN_SIZE - 1)
62 #define TVR_MASK (TVR_SIZE - 1) 63 #define TVR_MASK (TVR_SIZE - 1)
63 64
64 struct timer_base_s { 65 struct timer_base_s {
65 spinlock_t lock; 66 spinlock_t lock;
66 struct timer_list *running_timer; 67 struct timer_list *running_timer;
67 }; 68 };
68 69
69 typedef struct tvec_s { 70 typedef struct tvec_s {
70 struct list_head vec[TVN_SIZE]; 71 struct list_head vec[TVN_SIZE];
71 } tvec_t; 72 } tvec_t;
72 73
73 typedef struct tvec_root_s { 74 typedef struct tvec_root_s {
74 struct list_head vec[TVR_SIZE]; 75 struct list_head vec[TVR_SIZE];
75 } tvec_root_t; 76 } tvec_root_t;
76 77
77 struct tvec_t_base_s { 78 struct tvec_t_base_s {
78 struct timer_base_s t_base; 79 struct timer_base_s t_base;
79 unsigned long timer_jiffies; 80 unsigned long timer_jiffies;
80 tvec_root_t tv1; 81 tvec_root_t tv1;
81 tvec_t tv2; 82 tvec_t tv2;
82 tvec_t tv3; 83 tvec_t tv3;
83 tvec_t tv4; 84 tvec_t tv4;
84 tvec_t tv5; 85 tvec_t tv5;
85 } ____cacheline_aligned_in_smp; 86 } ____cacheline_aligned_in_smp;
86 87
87 typedef struct tvec_t_base_s tvec_base_t; 88 typedef struct tvec_t_base_s tvec_base_t;
88 static DEFINE_PER_CPU(tvec_base_t, tvec_bases); 89 static DEFINE_PER_CPU(tvec_base_t, tvec_bases);
89 90
90 static inline void set_running_timer(tvec_base_t *base, 91 static inline void set_running_timer(tvec_base_t *base,
91 struct timer_list *timer) 92 struct timer_list *timer)
92 { 93 {
93 #ifdef CONFIG_SMP 94 #ifdef CONFIG_SMP
94 base->t_base.running_timer = timer; 95 base->t_base.running_timer = timer;
95 #endif 96 #endif
96 } 97 }
97 98
98 static void internal_add_timer(tvec_base_t *base, struct timer_list *timer) 99 static void internal_add_timer(tvec_base_t *base, struct timer_list *timer)
99 { 100 {
100 unsigned long expires = timer->expires; 101 unsigned long expires = timer->expires;
101 unsigned long idx = expires - base->timer_jiffies; 102 unsigned long idx = expires - base->timer_jiffies;
102 struct list_head *vec; 103 struct list_head *vec;
103 104
104 if (idx < TVR_SIZE) { 105 if (idx < TVR_SIZE) {
105 int i = expires & TVR_MASK; 106 int i = expires & TVR_MASK;
106 vec = base->tv1.vec + i; 107 vec = base->tv1.vec + i;
107 } else if (idx < 1 << (TVR_BITS + TVN_BITS)) { 108 } else if (idx < 1 << (TVR_BITS + TVN_BITS)) {
108 int i = (expires >> TVR_BITS) & TVN_MASK; 109 int i = (expires >> TVR_BITS) & TVN_MASK;
109 vec = base->tv2.vec + i; 110 vec = base->tv2.vec + i;
110 } else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) { 111 } else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) {
111 int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK; 112 int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK;
112 vec = base->tv3.vec + i; 113 vec = base->tv3.vec + i;
113 } else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) { 114 } else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) {
114 int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK; 115 int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK;
115 vec = base->tv4.vec + i; 116 vec = base->tv4.vec + i;
116 } else if ((signed long) idx < 0) { 117 } else if ((signed long) idx < 0) {
117 /* 118 /*
118 * Can happen if you add a timer with expires == jiffies, 119 * Can happen if you add a timer with expires == jiffies,
119 * or you set a timer to go off in the past 120 * or you set a timer to go off in the past
120 */ 121 */
121 vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK); 122 vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK);
122 } else { 123 } else {
123 int i; 124 int i;
124 /* If the timeout is larger than 0xffffffff on 64-bit 125 /* If the timeout is larger than 0xffffffff on 64-bit
125 * architectures then we use the maximum timeout: 126 * architectures then we use the maximum timeout:
126 */ 127 */
127 if (idx > 0xffffffffUL) { 128 if (idx > 0xffffffffUL) {
128 idx = 0xffffffffUL; 129 idx = 0xffffffffUL;
129 expires = idx + base->timer_jiffies; 130 expires = idx + base->timer_jiffies;
130 } 131 }
131 i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK; 132 i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
132 vec = base->tv5.vec + i; 133 vec = base->tv5.vec + i;
133 } 134 }
134 /* 135 /*
135 * Timers are FIFO: 136 * Timers are FIFO:
136 */ 137 */
137 list_add_tail(&timer->entry, vec); 138 list_add_tail(&timer->entry, vec);
138 } 139 }
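internal_add_timer() above picks a wheel level by how far the expiry lies from base->timer_jiffies and indexes the slot with shifted bits of the expiry itself. A self-contained user-space sketch of that bucket selection for the !CONFIG_BASE_SMALL sizes, ignoring the already-expired and >0xffffffff special cases:

        /* Sketch: which timer-wheel level (tv1..tv5) and slot a given
         * expiry lands in, using the same bit arithmetic as above. */
        #include <stdio.h>

        #define TVN_BITS 6
        #define TVR_BITS 8
        #define TVN_SIZE (1 << TVN_BITS)
        #define TVR_SIZE (1 << TVR_BITS)
        #define TVN_MASK (TVN_SIZE - 1)
        #define TVR_MASK (TVR_SIZE - 1)

        static void pick_bucket(unsigned long expires, unsigned long timer_jiffies,
                                int *level, int *slot)
        {
                unsigned long idx = expires - timer_jiffies;

                if (idx < TVR_SIZE) {
                        *level = 1;
                        *slot = expires & TVR_MASK;
                } else if (idx < 1UL << (TVR_BITS + TVN_BITS)) {
                        *level = 2;
                        *slot = (expires >> TVR_BITS) & TVN_MASK;
                } else if (idx < 1UL << (TVR_BITS + 2 * TVN_BITS)) {
                        *level = 3;
                        *slot = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK;
                } else if (idx < 1UL << (TVR_BITS + 3 * TVN_BITS)) {
                        *level = 4;
                        *slot = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK;
                } else {
                        *level = 5;
                        *slot = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
                }
        }

        int main(void)
        {
                int level, slot;

                pick_bucket(1000300, 1000000, &level, &slot);   /* 300 ticks out */
                printf("level tv%d, slot %d\n", level, slot);
                return 0;
        }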
139 140
140 typedef struct timer_base_s timer_base_t; 141 typedef struct timer_base_s timer_base_t;
141 /* 142 /*
142 * Used by TIMER_INITIALIZER, we can't use per_cpu(tvec_bases) 143 * Used by TIMER_INITIALIZER, we can't use per_cpu(tvec_bases)
143 * at compile time, and we need timer->base to lock the timer. 144 * at compile time, and we need timer->base to lock the timer.
144 */ 145 */
145 timer_base_t __init_timer_base 146 timer_base_t __init_timer_base
146 ____cacheline_aligned_in_smp = { .lock = SPIN_LOCK_UNLOCKED }; 147 ____cacheline_aligned_in_smp = { .lock = SPIN_LOCK_UNLOCKED };
147 EXPORT_SYMBOL(__init_timer_base); 148 EXPORT_SYMBOL(__init_timer_base);
148 149
149 /*** 150 /***
150 * init_timer - initialize a timer. 151 * init_timer - initialize a timer.
151 * @timer: the timer to be initialized 152 * @timer: the timer to be initialized
152 * 153 *
153 * init_timer() must be done to a timer prior to calling *any* of the 154 * init_timer() must be done to a timer prior to calling *any* of the
154 * other timer functions. 155 * other timer functions.
155 */ 156 */
156 void fastcall init_timer(struct timer_list *timer) 157 void fastcall init_timer(struct timer_list *timer)
157 { 158 {
158 timer->entry.next = NULL; 159 timer->entry.next = NULL;
159 timer->base = &per_cpu(tvec_bases, raw_smp_processor_id()).t_base; 160 timer->base = &per_cpu(tvec_bases, raw_smp_processor_id()).t_base;
160 } 161 }
161 EXPORT_SYMBOL(init_timer); 162 EXPORT_SYMBOL(init_timer);
162 163
163 static inline void detach_timer(struct timer_list *timer, 164 static inline void detach_timer(struct timer_list *timer,
164 int clear_pending) 165 int clear_pending)
165 { 166 {
166 struct list_head *entry = &timer->entry; 167 struct list_head *entry = &timer->entry;
167 168
168 __list_del(entry->prev, entry->next); 169 __list_del(entry->prev, entry->next);
169 if (clear_pending) 170 if (clear_pending)
170 entry->next = NULL; 171 entry->next = NULL;
171 entry->prev = LIST_POISON2; 172 entry->prev = LIST_POISON2;
172 } 173 }
173 174
174 /* 175 /*
175 * We are using hashed locking: holding per_cpu(tvec_bases).t_base.lock 176 * We are using hashed locking: holding per_cpu(tvec_bases).t_base.lock
176 * means that all timers which are tied to this base via timer->base are 177 * means that all timers which are tied to this base via timer->base are
177 * locked, and the base itself is locked too. 178 * locked, and the base itself is locked too.
178 * 179 *
179 * So __run_timers/migrate_timers can safely modify all timers which could 180 * So __run_timers/migrate_timers can safely modify all timers which could
180 * be found on ->tvX lists. 181 * be found on ->tvX lists.
181 * 182 *
182 * When the timer's base is locked, and the timer removed from list, it is 183 * When the timer's base is locked, and the timer removed from list, it is
183 * possible to set timer->base = NULL and drop the lock: the timer remains 184 * possible to set timer->base = NULL and drop the lock: the timer remains
184 * locked. 185 * locked.
185 */ 186 */
186 static timer_base_t *lock_timer_base(struct timer_list *timer, 187 static timer_base_t *lock_timer_base(struct timer_list *timer,
187 unsigned long *flags) 188 unsigned long *flags)
188 { 189 {
189 timer_base_t *base; 190 timer_base_t *base;
190 191
191 for (;;) { 192 for (;;) {
192 base = timer->base; 193 base = timer->base;
193 if (likely(base != NULL)) { 194 if (likely(base != NULL)) {
194 spin_lock_irqsave(&base->lock, *flags); 195 spin_lock_irqsave(&base->lock, *flags);
195 if (likely(base == timer->base)) 196 if (likely(base == timer->base))
196 return base; 197 return base;
197 /* The timer has migrated to another CPU */ 198 /* The timer has migrated to another CPU */
198 spin_unlock_irqrestore(&base->lock, *flags); 199 spin_unlock_irqrestore(&base->lock, *flags);
199 } 200 }
200 cpu_relax(); 201 cpu_relax();
201 } 202 }
202 } 203 }
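lock_timer_base() above snapshots timer->base, takes that base's lock, and only trusts the result if the pointer is still the same once the lock is held; a NULL base means the timer is mid-migration, so the caller spins and retries. A user-space sketch of the same retry discipline using pthreads (plain pointer reads are kept to mirror the original; a production version would use proper atomics):

        /* Sketch: lock a structure reached through a pointer that another
         * thread may be rewriting, re-validating after the lock is taken. */
        #include <pthread.h>
        #include <stddef.h>

        struct base {
                pthread_mutex_t lock;
        };

        struct timer {
                struct base *base;      /* may be switched or set to NULL concurrently */
        };

        static struct base *lock_timer_base(struct timer *timer)
        {
                struct base *base;

                for (;;) {
                        base = timer->base;
                        if (base != NULL) {
                                pthread_mutex_lock(&base->lock);
                                if (base == timer->base)
                                        return base;    /* still ours: locked */
                                /* the timer migrated while we took the lock */
                                pthread_mutex_unlock(&base->lock);
                        }
                        /* NULL base: migration in progress, retry */
                }
        }

        int main(void)
        {
                struct base b = { PTHREAD_MUTEX_INITIALIZER };
                struct timer t = { &b };
                struct base *locked = lock_timer_base(&t);

                pthread_mutex_unlock(&locked->lock);
                return 0;
        }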
203 204
204 int __mod_timer(struct timer_list *timer, unsigned long expires) 205 int __mod_timer(struct timer_list *timer, unsigned long expires)
205 { 206 {
206 timer_base_t *base; 207 timer_base_t *base;
207 tvec_base_t *new_base; 208 tvec_base_t *new_base;
208 unsigned long flags; 209 unsigned long flags;
209 int ret = 0; 210 int ret = 0;
210 211
211 BUG_ON(!timer->function); 212 BUG_ON(!timer->function);
212 213
213 base = lock_timer_base(timer, &flags); 214 base = lock_timer_base(timer, &flags);
214 215
215 if (timer_pending(timer)) { 216 if (timer_pending(timer)) {
216 detach_timer(timer, 0); 217 detach_timer(timer, 0);
217 ret = 1; 218 ret = 1;
218 } 219 }
219 220
220 new_base = &__get_cpu_var(tvec_bases); 221 new_base = &__get_cpu_var(tvec_bases);
221 222
222 if (base != &new_base->t_base) { 223 if (base != &new_base->t_base) {
223 /* 224 /*
224 * We are trying to schedule the timer on the local CPU. 225 * We are trying to schedule the timer on the local CPU.
225 * However we can't change timer's base while it is running, 226 * However we can't change timer's base while it is running,
226 * otherwise del_timer_sync() can't detect that the timer's 227 * otherwise del_timer_sync() can't detect that the timer's
227 * handler has not yet finished. This also guarantees that 228 * handler has not yet finished. This also guarantees that
228 * the timer is serialized wrt itself. 229 * the timer is serialized wrt itself.
229 */ 230 */
230 if (unlikely(base->running_timer == timer)) { 231 if (unlikely(base->running_timer == timer)) {
231 /* The timer remains on a former base */ 232 /* The timer remains on a former base */
232 new_base = container_of(base, tvec_base_t, t_base); 233 new_base = container_of(base, tvec_base_t, t_base);
233 } else { 234 } else {
234 /* See the comment in lock_timer_base() */ 235 /* See the comment in lock_timer_base() */
235 timer->base = NULL; 236 timer->base = NULL;
236 spin_unlock(&base->lock); 237 spin_unlock(&base->lock);
237 spin_lock(&new_base->t_base.lock); 238 spin_lock(&new_base->t_base.lock);
238 timer->base = &new_base->t_base; 239 timer->base = &new_base->t_base;
239 } 240 }
240 } 241 }
241 242
242 timer->expires = expires; 243 timer->expires = expires;
243 internal_add_timer(new_base, timer); 244 internal_add_timer(new_base, timer);
244 spin_unlock_irqrestore(&new_base->t_base.lock, flags); 245 spin_unlock_irqrestore(&new_base->t_base.lock, flags);
245 246
246 return ret; 247 return ret;
247 } 248 }
248 249
249 EXPORT_SYMBOL(__mod_timer); 250 EXPORT_SYMBOL(__mod_timer);
250 251
251 /*** 252 /***
252 * add_timer_on - start a timer on a particular CPU 253 * add_timer_on - start a timer on a particular CPU
253 * @timer: the timer to be added 254 * @timer: the timer to be added
254 * @cpu: the CPU to start it on 255 * @cpu: the CPU to start it on
255 * 256 *
256 * This is not very scalable on SMP. Double adds are not possible. 257 * This is not very scalable on SMP. Double adds are not possible.
257 */ 258 */
258 void add_timer_on(struct timer_list *timer, int cpu) 259 void add_timer_on(struct timer_list *timer, int cpu)
259 { 260 {
260 tvec_base_t *base = &per_cpu(tvec_bases, cpu); 261 tvec_base_t *base = &per_cpu(tvec_bases, cpu);
261 unsigned long flags; 262 unsigned long flags;
262 263
263 BUG_ON(timer_pending(timer) || !timer->function); 264 BUG_ON(timer_pending(timer) || !timer->function);
264 spin_lock_irqsave(&base->t_base.lock, flags); 265 spin_lock_irqsave(&base->t_base.lock, flags);
265 timer->base = &base->t_base; 266 timer->base = &base->t_base;
266 internal_add_timer(base, timer); 267 internal_add_timer(base, timer);
267 spin_unlock_irqrestore(&base->t_base.lock, flags); 268 spin_unlock_irqrestore(&base->t_base.lock, flags);
268 } 269 }
269 270
270 271
271 /*** 272 /***
272 * mod_timer - modify a timer's timeout 273 * mod_timer - modify a timer's timeout
273 * @timer: the timer to be modified 274 * @timer: the timer to be modified
274 * 275 *
275 * mod_timer is a more efficient way to update the expires field of an 276 * mod_timer is a more efficient way to update the expires field of an
276 * active timer (if the timer is inactive it will be activated) 277 * active timer (if the timer is inactive it will be activated)
277 * 278 *
278 * mod_timer(timer, expires) is equivalent to: 279 * mod_timer(timer, expires) is equivalent to:
279 * 280 *
280 * del_timer(timer); timer->expires = expires; add_timer(timer); 281 * del_timer(timer); timer->expires = expires; add_timer(timer);
281 * 282 *
282 * Note that if there are multiple unserialized concurrent users of the 283 * Note that if there are multiple unserialized concurrent users of the
283 * same timer, then mod_timer() is the only safe way to modify the timeout, 284 * same timer, then mod_timer() is the only safe way to modify the timeout,
284 * since add_timer() cannot modify an already running timer. 285 * since add_timer() cannot modify an already running timer.
285 * 286 *
286 * The function returns whether it has modified a pending timer or not. 287 * The function returns whether it has modified a pending timer or not.
287 * (ie. mod_timer() of an inactive timer returns 0, mod_timer() of an 288 * (ie. mod_timer() of an inactive timer returns 0, mod_timer() of an
288 * active timer returns 1.) 289 * active timer returns 1.)
289 */ 290 */
290 int mod_timer(struct timer_list *timer, unsigned long expires) 291 int mod_timer(struct timer_list *timer, unsigned long expires)
291 { 292 {
292 BUG_ON(!timer->function); 293 BUG_ON(!timer->function);
293 294
294 /* 295 /*
295 * This is a common optimization triggered by the 296 * This is a common optimization triggered by the
296 * networking code - if the timer is re-modified 297 * networking code - if the timer is re-modified
297 * to be the same thing then just return: 298 * to be the same thing then just return:
298 */ 299 */
299 if (timer->expires == expires && timer_pending(timer)) 300 if (timer->expires == expires && timer_pending(timer))
300 return 1; 301 return 1;
301 302
302 return __mod_timer(timer, expires); 303 return __mod_timer(timer, expires);
303 } 304 }
304 305
305 EXPORT_SYMBOL(mod_timer); 306 EXPORT_SYMBOL(mod_timer);
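A hedged sketch of the pattern the comment above describes, restarting a (hypothetical) watchdog timer with mod_timer() rather than del_timer()/add_timer():

    /* Push the watchdog out by another five seconds.  Returns 1 if the
     * timer was still pending, 0 if it had expired and was re-activated,
     * exactly the return convention documented above. */
    static int kick_watchdog(struct timer_list *watchdog)
    {
            return mod_timer(watchdog, jiffies + 5 * HZ);
    }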
306 307
307 /*** 308 /***
308 * del_timer - deactivate a timer. 309 * del_timer - deactivate a timer.
309 * @timer: the timer to be deactivated 310 * @timer: the timer to be deactivated
310 * 311 *
311 * del_timer() deactivates a timer - this works on both active and inactive 312 * del_timer() deactivates a timer - this works on both active and inactive
312 * timers. 313 * timers.
313 * 314 *
314 * The function returns whether it has deactivated a pending timer or not. 315 * The function returns whether it has deactivated a pending timer or not.
315 * (ie. del_timer() of an inactive timer returns 0, del_timer() of an 316 * (ie. del_timer() of an inactive timer returns 0, del_timer() of an
316 * active timer returns 1.) 317 * active timer returns 1.)
317 */ 318 */
318 int del_timer(struct timer_list *timer) 319 int del_timer(struct timer_list *timer)
319 { 320 {
320 timer_base_t *base; 321 timer_base_t *base;
321 unsigned long flags; 322 unsigned long flags;
322 int ret = 0; 323 int ret = 0;
323 324
324 if (timer_pending(timer)) { 325 if (timer_pending(timer)) {
325 base = lock_timer_base(timer, &flags); 326 base = lock_timer_base(timer, &flags);
326 if (timer_pending(timer)) { 327 if (timer_pending(timer)) {
327 detach_timer(timer, 1); 328 detach_timer(timer, 1);
328 ret = 1; 329 ret = 1;
329 } 330 }
330 spin_unlock_irqrestore(&base->lock, flags); 331 spin_unlock_irqrestore(&base->lock, flags);
331 } 332 }
332 333
333 return ret; 334 return ret;
334 } 335 }
335 336
336 EXPORT_SYMBOL(del_timer); 337 EXPORT_SYMBOL(del_timer);
337 338
338 #ifdef CONFIG_SMP 339 #ifdef CONFIG_SMP
339 /* 340 /*
340 * This function tries to deactivate a timer. Upon successful (ret >= 0) 341 * This function tries to deactivate a timer. Upon successful (ret >= 0)
341 * exit the timer is not queued and the handler is not running on any CPU. 342 * exit the timer is not queued and the handler is not running on any CPU.
342 * 343 *
343 * It must not be called from interrupt contexts. 344 * It must not be called from interrupt contexts.
344 */ 345 */
345 int try_to_del_timer_sync(struct timer_list *timer) 346 int try_to_del_timer_sync(struct timer_list *timer)
346 { 347 {
347 timer_base_t *base; 348 timer_base_t *base;
348 unsigned long flags; 349 unsigned long flags;
349 int ret = -1; 350 int ret = -1;
350 351
351 base = lock_timer_base(timer, &flags); 352 base = lock_timer_base(timer, &flags);
352 353
353 if (base->running_timer == timer) 354 if (base->running_timer == timer)
354 goto out; 355 goto out;
355 356
356 ret = 0; 357 ret = 0;
357 if (timer_pending(timer)) { 358 if (timer_pending(timer)) {
358 detach_timer(timer, 1); 359 detach_timer(timer, 1);
359 ret = 1; 360 ret = 1;
360 } 361 }
361 out: 362 out:
362 spin_unlock_irqrestore(&base->lock, flags); 363 spin_unlock_irqrestore(&base->lock, flags);
363 364
364 return ret; 365 return ret;
365 } 366 }
366 367
367 /*** 368 /***
368 * del_timer_sync - deactivate a timer and wait for the handler to finish. 369 * del_timer_sync - deactivate a timer and wait for the handler to finish.
369 * @timer: the timer to be deactivated 370 * @timer: the timer to be deactivated
370 * 371 *
371 * This function only differs from del_timer() on SMP: besides deactivating 372 * This function only differs from del_timer() on SMP: besides deactivating
372 * the timer it also makes sure the handler has finished executing on other 373 * the timer it also makes sure the handler has finished executing on other
373 * CPUs. 374 * CPUs.
374 * 375 *
375 * Synchronization rules: callers must prevent restarting of the timer, 376 * Synchronization rules: callers must prevent restarting of the timer,
376 * otherwise this function is meaningless. It must not be called from 377 * otherwise this function is meaningless. It must not be called from
377 * interrupt contexts. The caller must not hold locks which would prevent 378 * interrupt contexts. The caller must not hold locks which would prevent
378 * completion of the timer's handler. The timer's handler must not call 379 * completion of the timer's handler. The timer's handler must not call
379 * add_timer_on(). Upon exit the timer is not queued and the handler is 380 * add_timer_on(). Upon exit the timer is not queued and the handler is
380 * not running on any CPU. 381 * not running on any CPU.
381 * 382 *
382 * The function returns whether it has deactivated a pending timer or not. 383 * The function returns whether it has deactivated a pending timer or not.
383 */ 384 */
384 int del_timer_sync(struct timer_list *timer) 385 int del_timer_sync(struct timer_list *timer)
385 { 386 {
386 for (;;) { 387 for (;;) {
387 int ret = try_to_del_timer_sync(timer); 388 int ret = try_to_del_timer_sync(timer);
388 if (ret >= 0) 389 if (ret >= 0)
389 return ret; 390 return ret;
390 } 391 }
391 } 392 }
392 393
393 EXPORT_SYMBOL(del_timer_sync); 394 EXPORT_SYMBOL(del_timer_sync);
394 #endif 395 #endif
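A teardown sketch matching the synchronization rules above; struct my_dev and its shutting_down flag are hypothetical, the point being that the caller must stop anything from re-arming the timer before waiting:

    struct my_dev {                         /* made-up example structure */
            struct timer_list timer;
            int shutting_down;
    };

    static void my_dev_shutdown(struct my_dev *dev)
    {
            dev->shutting_down = 1;         /* handler checks this and never re-arms */
            smp_wmb();                      /* flag visible before we wait */
            del_timer_sync(&dev->timer);    /* on return: not queued, not running anywhere */
    }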
395 396
396 static int cascade(tvec_base_t *base, tvec_t *tv, int index) 397 static int cascade(tvec_base_t *base, tvec_t *tv, int index)
397 { 398 {
398 /* cascade all the timers from tv up one level */ 399 /* cascade all the timers from tv up one level */
399 struct list_head *head, *curr; 400 struct list_head *head, *curr;
400 401
401 head = tv->vec + index; 402 head = tv->vec + index;
402 curr = head->next; 403 curr = head->next;
403 /* 404 /*
404 * We are removing _all_ timers from the list, so we don't have to 405 * We are removing _all_ timers from the list, so we don't have to
405 * detach them individually, just clear the list afterwards. 406 * detach them individually, just clear the list afterwards.
406 */ 407 */
407 while (curr != head) { 408 while (curr != head) {
408 struct timer_list *tmp; 409 struct timer_list *tmp;
409 410
410 tmp = list_entry(curr, struct timer_list, entry); 411 tmp = list_entry(curr, struct timer_list, entry);
411 BUG_ON(tmp->base != &base->t_base); 412 BUG_ON(tmp->base != &base->t_base);
412 curr = curr->next; 413 curr = curr->next;
413 internal_add_timer(base, tmp); 414 internal_add_timer(base, tmp);
414 } 415 }
415 INIT_LIST_HEAD(head); 416 INIT_LIST_HEAD(head);
416 417
417 return index; 418 return index;
418 } 419 }
419 420
420 /*** 421 /***
421 * __run_timers - run all expired timers (if any) on this CPU. 422 * __run_timers - run all expired timers (if any) on this CPU.
422 * @base: the timer vector to be processed. 423 * @base: the timer vector to be processed.
423 * 424 *
424 * This function cascades all vectors and runs all expired timers 425 * This function cascades all vectors and runs all expired timers
425 * found in them. 426 * found in them.
426 */ 427 */
427 #define INDEX(N) (base->timer_jiffies >> (TVR_BITS + N * TVN_BITS)) & TVN_MASK 428 #define INDEX(N) (base->timer_jiffies >> (TVR_BITS + N * TVN_BITS)) & TVN_MASK
428 429
429 static inline void __run_timers(tvec_base_t *base) 430 static inline void __run_timers(tvec_base_t *base)
430 { 431 {
431 struct timer_list *timer; 432 struct timer_list *timer;
432 433
433 spin_lock_irq(&base->t_base.lock); 434 spin_lock_irq(&base->t_base.lock);
434 while (time_after_eq(jiffies, base->timer_jiffies)) { 435 while (time_after_eq(jiffies, base->timer_jiffies)) {
435 struct list_head work_list = LIST_HEAD_INIT(work_list); 436 struct list_head work_list = LIST_HEAD_INIT(work_list);
436 struct list_head *head = &work_list; 437 struct list_head *head = &work_list;
437 int index = base->timer_jiffies & TVR_MASK; 438 int index = base->timer_jiffies & TVR_MASK;
438 439
439 /* 440 /*
440 * Cascade timers: 441 * Cascade timers:
441 */ 442 */
442 if (!index && 443 if (!index &&
443 (!cascade(base, &base->tv2, INDEX(0))) && 444 (!cascade(base, &base->tv2, INDEX(0))) &&
444 (!cascade(base, &base->tv3, INDEX(1))) && 445 (!cascade(base, &base->tv3, INDEX(1))) &&
445 !cascade(base, &base->tv4, INDEX(2))) 446 !cascade(base, &base->tv4, INDEX(2)))
446 cascade(base, &base->tv5, INDEX(3)); 447 cascade(base, &base->tv5, INDEX(3));
447 ++base->timer_jiffies; 448 ++base->timer_jiffies;
448 list_splice_init(base->tv1.vec + index, &work_list); 449 list_splice_init(base->tv1.vec + index, &work_list);
449 while (!list_empty(head)) { 450 while (!list_empty(head)) {
450 void (*fn)(unsigned long); 451 void (*fn)(unsigned long);
451 unsigned long data; 452 unsigned long data;
452 453
453 timer = list_entry(head->next,struct timer_list,entry); 454 timer = list_entry(head->next,struct timer_list,entry);
454 fn = timer->function; 455 fn = timer->function;
455 data = timer->data; 456 data = timer->data;
456 457
457 set_running_timer(base, timer); 458 set_running_timer(base, timer);
458 detach_timer(timer, 1); 459 detach_timer(timer, 1);
459 spin_unlock_irq(&base->t_base.lock); 460 spin_unlock_irq(&base->t_base.lock);
460 { 461 {
461 int preempt_count = preempt_count(); 462 int preempt_count = preempt_count();
462 fn(data); 463 fn(data);
463 if (preempt_count != preempt_count()) { 464 if (preempt_count != preempt_count()) {
464 printk(KERN_WARNING "huh, entered %p " 465 printk(KERN_WARNING "huh, entered %p "
465 "with preempt_count %08x, exited" 466 "with preempt_count %08x, exited"
466 " with %08x?\n", 467 " with %08x?\n",
467 fn, preempt_count, 468 fn, preempt_count,
468 preempt_count()); 469 preempt_count());
469 BUG(); 470 BUG();
470 } 471 }
471 } 472 }
472 spin_lock_irq(&base->t_base.lock); 473 spin_lock_irq(&base->t_base.lock);
473 } 474 }
474 } 475 }
475 set_running_timer(base, NULL); 476 set_running_timer(base, NULL);
476 spin_unlock_irq(&base->t_base.lock); 477 spin_unlock_irq(&base->t_base.lock);
477 } 478 }
478 479
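A worked example of the cascade trigger above, assuming the usual wheel geometry of this era (TVR_BITS = 8 and TVN_BITS = 6, i.e. 256 slots in tv1 and 64 in each of tv2-tv5); the jiffies value is made up:

    base->timer_jiffies = 0x00012300

    index    = 0x00012300 & TVR_MASK          = 0    -> tv1 has wrapped, cascade
    INDEX(0) = (0x00012300 >> 8) & TVN_MASK   = 35   -> slot 35 of tv2 is emptied
                                                        back into the wheel
    cascade() returns 35 (non-zero), so the && chain stops there and tv3,
    tv4 and tv5 are not touched; each level only cascades when all the
    levels below it have wrapped back to slot 0 as well.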
479 #ifdef CONFIG_NO_IDLE_HZ 480 #ifdef CONFIG_NO_IDLE_HZ
480 /* 481 /*
481 * Find out when the next timer event is due to happen. This 482 * Find out when the next timer event is due to happen. This
482 * is used on S/390 to stop all activity when a cpu is idle. 483 * is used on S/390 to stop all activity when a cpu is idle.
483 * This function needs to be called with interrupts disabled. 484 * This function needs to be called with interrupts disabled.
484 */ 485 */
485 unsigned long next_timer_interrupt(void) 486 unsigned long next_timer_interrupt(void)
486 { 487 {
487 tvec_base_t *base; 488 tvec_base_t *base;
488 struct list_head *list; 489 struct list_head *list;
489 struct timer_list *nte; 490 struct timer_list *nte;
490 unsigned long expires; 491 unsigned long expires;
491 tvec_t *varray[4]; 492 tvec_t *varray[4];
492 int i, j; 493 int i, j;
493 494
494 base = &__get_cpu_var(tvec_bases); 495 base = &__get_cpu_var(tvec_bases);
495 spin_lock(&base->t_base.lock); 496 spin_lock(&base->t_base.lock);
496 expires = base->timer_jiffies + (LONG_MAX >> 1); 497 expires = base->timer_jiffies + (LONG_MAX >> 1);
497 list = 0; 498 list = 0;
498 499
499 /* Look for timer events in tv1. */ 500 /* Look for timer events in tv1. */
500 j = base->timer_jiffies & TVR_MASK; 501 j = base->timer_jiffies & TVR_MASK;
501 do { 502 do {
502 list_for_each_entry(nte, base->tv1.vec + j, entry) { 503 list_for_each_entry(nte, base->tv1.vec + j, entry) {
503 expires = nte->expires; 504 expires = nte->expires;
504 if (j < (base->timer_jiffies & TVR_MASK)) 505 if (j < (base->timer_jiffies & TVR_MASK))
505 list = base->tv2.vec + (INDEX(0)); 506 list = base->tv2.vec + (INDEX(0));
506 goto found; 507 goto found;
507 } 508 }
508 j = (j + 1) & TVR_MASK; 509 j = (j + 1) & TVR_MASK;
509 } while (j != (base->timer_jiffies & TVR_MASK)); 510 } while (j != (base->timer_jiffies & TVR_MASK));
510 511
511 /* Check tv2-tv5. */ 512 /* Check tv2-tv5. */
512 varray[0] = &base->tv2; 513 varray[0] = &base->tv2;
513 varray[1] = &base->tv3; 514 varray[1] = &base->tv3;
514 varray[2] = &base->tv4; 515 varray[2] = &base->tv4;
515 varray[3] = &base->tv5; 516 varray[3] = &base->tv5;
516 for (i = 0; i < 4; i++) { 517 for (i = 0; i < 4; i++) {
517 j = INDEX(i); 518 j = INDEX(i);
518 do { 519 do {
519 if (list_empty(varray[i]->vec + j)) { 520 if (list_empty(varray[i]->vec + j)) {
520 j = (j + 1) & TVN_MASK; 521 j = (j + 1) & TVN_MASK;
521 continue; 522 continue;
522 } 523 }
523 list_for_each_entry(nte, varray[i]->vec + j, entry) 524 list_for_each_entry(nte, varray[i]->vec + j, entry)
524 if (time_before(nte->expires, expires)) 525 if (time_before(nte->expires, expires))
525 expires = nte->expires; 526 expires = nte->expires;
526 if (j < (INDEX(i)) && i < 3) 527 if (j < (INDEX(i)) && i < 3)
527 list = varray[i + 1]->vec + (INDEX(i + 1)); 528 list = varray[i + 1]->vec + (INDEX(i + 1));
528 goto found; 529 goto found;
529 } while (j != (INDEX(i))); 530 } while (j != (INDEX(i)));
530 } 531 }
531 found: 532 found:
532 if (list) { 533 if (list) {
533 /* 534 /*
534 * The search wrapped. We need to look at the next list 535 * The search wrapped. We need to look at the next list
535 * from the next tv element that would cascade into the tv element 536 * from the next tv element that would cascade into the tv element
536 * where we found the timer element. 537 * where we found the timer element.
537 */ 538 */
538 list_for_each_entry(nte, list, entry) { 539 list_for_each_entry(nte, list, entry) {
539 if (time_before(nte->expires, expires)) 540 if (time_before(nte->expires, expires))
540 expires = nte->expires; 541 expires = nte->expires;
541 } 542 }
542 } 543 }
543 spin_unlock(&base->t_base.lock); 544 spin_unlock(&base->t_base.lock);
544 return expires; 545 return expires;
545 } 546 }
546 #endif 547 #endif
547 548
548 /******************************************************************/ 549 /******************************************************************/
549 550
550 /* 551 /*
551 * Timekeeping variables 552 * Timekeeping variables
552 */ 553 */
553 unsigned long tick_usec = TICK_USEC; /* USER_HZ period (usec) */ 554 unsigned long tick_usec = TICK_USEC; /* USER_HZ period (usec) */
554 unsigned long tick_nsec = TICK_NSEC; /* ACTHZ period (nsec) */ 555 unsigned long tick_nsec = TICK_NSEC; /* ACTHZ period (nsec) */
555 556
556 /* 557 /*
557 * The current time 558 * The current time
558 * wall_to_monotonic is what we need to add to xtime (or xtime corrected 559 * wall_to_monotonic is what we need to add to xtime (or xtime corrected
559 * for sub jiffie times) to get to monotonic time. Monotonic is pegged 560 * for sub jiffie times) to get to monotonic time. Monotonic is pegged
560 * at zero at system boot time, so wall_to_monotonic will be negative, 561 * at zero at system boot time, so wall_to_monotonic will be negative,
561 * however, we will ALWAYS keep the tv_nsec part positive so we can use 562 * however, we will ALWAYS keep the tv_nsec part positive so we can use
562 * the usual normalization. 563 * the usual normalization.
563 */ 564 */
564 struct timespec xtime __attribute__ ((aligned (16))); 565 struct timespec xtime __attribute__ ((aligned (16)));
565 struct timespec wall_to_monotonic __attribute__ ((aligned (16))); 566 struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
566 567
567 EXPORT_SYMBOL(xtime); 568 EXPORT_SYMBOL(xtime);
568 569
569 /* Don't completely fail for HZ > 500. */ 570 /* Don't completely fail for HZ > 500. */
570 int tickadj = 500/HZ ? : 1; /* microsecs */ 571 int tickadj = 500/HZ ? : 1; /* microsecs */
571 572
572 573
573 /* 574 /*
574 * phase-lock loop variables 575 * phase-lock loop variables
575 */ 576 */
576 /* TIME_ERROR prevents overwriting the CMOS clock */ 577 /* TIME_ERROR prevents overwriting the CMOS clock */
577 int time_state = TIME_OK; /* clock synchronization status */ 578 int time_state = TIME_OK; /* clock synchronization status */
578 int time_status = STA_UNSYNC; /* clock status bits */ 579 int time_status = STA_UNSYNC; /* clock status bits */
579 long time_offset; /* time adjustment (us) */ 580 long time_offset; /* time adjustment (us) */
580 long time_constant = 2; /* pll time constant */ 581 long time_constant = 2; /* pll time constant */
581 long time_tolerance = MAXFREQ; /* frequency tolerance (ppm) */ 582 long time_tolerance = MAXFREQ; /* frequency tolerance (ppm) */
582 long time_precision = 1; /* clock precision (us) */ 583 long time_precision = 1; /* clock precision (us) */
583 long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */ 584 long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */
584 long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ 585 long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */
585 static long time_phase; /* phase offset (scaled us) */ 586 static long time_phase; /* phase offset (scaled us) */
586 long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC; 587 long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC;
587 /* frequency offset (scaled ppm)*/ 588 /* frequency offset (scaled ppm)*/
588 static long time_adj; /* tick adjust (scaled 1 / HZ) */ 589 static long time_adj; /* tick adjust (scaled 1 / HZ) */
589 long time_reftime; /* time at last adjustment (s) */ 590 long time_reftime; /* time at last adjustment (s) */
590 long time_adjust; 591 long time_adjust;
591 long time_next_adjust; 592 long time_next_adjust;
592 593
593 /* 594 /*
594 * this routine handles the overflow of the microsecond field 595 * this routine handles the overflow of the microsecond field
595 * 596 *
596 * The tricky bits of code to handle the accurate clock support 597 * The tricky bits of code to handle the accurate clock support
597 * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame. 598 * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
598 * They were originally developed for SUN and DEC kernels. 599 * They were originally developed for SUN and DEC kernels.
599 * All the kudos should go to Dave for this stuff. 600 * All the kudos should go to Dave for this stuff.
600 * 601 *
601 */ 602 */
602 static void second_overflow(void) 603 static void second_overflow(void)
603 { 604 {
604 long ltemp; 605 long ltemp;
605 606
606 /* Bump the maxerror field */ 607 /* Bump the maxerror field */
607 time_maxerror += time_tolerance >> SHIFT_USEC; 608 time_maxerror += time_tolerance >> SHIFT_USEC;
608 if (time_maxerror > NTP_PHASE_LIMIT) { 609 if (time_maxerror > NTP_PHASE_LIMIT) {
609 time_maxerror = NTP_PHASE_LIMIT; 610 time_maxerror = NTP_PHASE_LIMIT;
610 time_status |= STA_UNSYNC; 611 time_status |= STA_UNSYNC;
611 } 612 }
612 613
613 /* 614 /*
614 * Leap second processing. If in leap-insert state at the end of the 615 * Leap second processing. If in leap-insert state at the end of the
615 * day, the system clock is set back one second; if in leap-delete 616 * day, the system clock is set back one second; if in leap-delete
616 * state, the system clock is set ahead one second. The microtime() 617 * state, the system clock is set ahead one second. The microtime()
617 * routine or external clock driver will ensure that reported time is 618 * routine or external clock driver will ensure that reported time is
618 * always monotonic. The ugly divides should be replaced. 619 * always monotonic. The ugly divides should be replaced.
619 */ 620 */
620 switch (time_state) { 621 switch (time_state) {
621 case TIME_OK: 622 case TIME_OK:
622 if (time_status & STA_INS) 623 if (time_status & STA_INS)
623 time_state = TIME_INS; 624 time_state = TIME_INS;
624 else if (time_status & STA_DEL) 625 else if (time_status & STA_DEL)
625 time_state = TIME_DEL; 626 time_state = TIME_DEL;
626 break; 627 break;
627 case TIME_INS: 628 case TIME_INS:
628 if (xtime.tv_sec % 86400 == 0) { 629 if (xtime.tv_sec % 86400 == 0) {
629 xtime.tv_sec--; 630 xtime.tv_sec--;
630 wall_to_monotonic.tv_sec++; 631 wall_to_monotonic.tv_sec++;
631 /* 632 /*
632 * The timer interpolator will make time change 633 * The timer interpolator will make time change
633 * gradually instead of an immediate jump by one second 634 * gradually instead of an immediate jump by one second
634 */ 635 */
635 time_interpolator_update(-NSEC_PER_SEC); 636 time_interpolator_update(-NSEC_PER_SEC);
636 time_state = TIME_OOP; 637 time_state = TIME_OOP;
637 clock_was_set(); 638 clock_was_set();
638 printk(KERN_NOTICE "Clock: inserting leap second " 639 printk(KERN_NOTICE "Clock: inserting leap second "
639 "23:59:60 UTC\n"); 640 "23:59:60 UTC\n");
640 } 641 }
641 break; 642 break;
642 case TIME_DEL: 643 case TIME_DEL:
643 if ((xtime.tv_sec + 1) % 86400 == 0) { 644 if ((xtime.tv_sec + 1) % 86400 == 0) {
644 xtime.tv_sec++; 645 xtime.tv_sec++;
645 wall_to_monotonic.tv_sec--; 646 wall_to_monotonic.tv_sec--;
646 /* 647 /*
647 * Use of time interpolator for a gradual change of 648 * Use of time interpolator for a gradual change of
648 * time 649 * time
649 */ 650 */
650 time_interpolator_update(NSEC_PER_SEC); 651 time_interpolator_update(NSEC_PER_SEC);
651 time_state = TIME_WAIT; 652 time_state = TIME_WAIT;
652 clock_was_set(); 653 clock_was_set();
653 printk(KERN_NOTICE "Clock: deleting leap second " 654 printk(KERN_NOTICE "Clock: deleting leap second "
654 "23:59:59 UTC\n"); 655 "23:59:59 UTC\n");
655 } 656 }
656 break; 657 break;
657 case TIME_OOP: 658 case TIME_OOP:
658 time_state = TIME_WAIT; 659 time_state = TIME_WAIT;
659 break; 660 break;
660 case TIME_WAIT: 661 case TIME_WAIT:
661 if (!(time_status & (STA_INS | STA_DEL))) 662 if (!(time_status & (STA_INS | STA_DEL)))
662 time_state = TIME_OK; 663 time_state = TIME_OK;
663 } 664 }
664 665
665 /* 666 /*
666 * Compute the phase adjustment for the next second. In PLL mode, the 667 * Compute the phase adjustment for the next second. In PLL mode, the
667 * offset is reduced by a fixed factor times the time constant. In FLL 668 * offset is reduced by a fixed factor times the time constant. In FLL
668 * mode the offset is used directly. In either mode, the maximum phase 669 * mode the offset is used directly. In either mode, the maximum phase
669 * adjustment for each second is clamped so as to spread the adjustment 670 * adjustment for each second is clamped so as to spread the adjustment
670 * over not more than the number of seconds between updates. 671 * over not more than the number of seconds between updates.
671 */ 672 */
672 ltemp = time_offset; 673 ltemp = time_offset;
673 if (!(time_status & STA_FLL)) 674 if (!(time_status & STA_FLL))
674 ltemp = shift_right(ltemp, SHIFT_KG + time_constant); 675 ltemp = shift_right(ltemp, SHIFT_KG + time_constant);
675 ltemp = min(ltemp, (MAXPHASE / MINSEC) << SHIFT_UPDATE); 676 ltemp = min(ltemp, (MAXPHASE / MINSEC) << SHIFT_UPDATE);
676 ltemp = max(ltemp, -(MAXPHASE / MINSEC) << SHIFT_UPDATE); 677 ltemp = max(ltemp, -(MAXPHASE / MINSEC) << SHIFT_UPDATE);
677 time_offset -= ltemp; 678 time_offset -= ltemp;
678 time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); 679 time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
679 680
680 /* 681 /*
681 * Compute the frequency estimate and additional phase adjustment due 682 * Compute the frequency estimate and additional phase adjustment due
682 * to frequency error for the next second. When the PPS signal is 683 * to frequency error for the next second. When the PPS signal is
683 * engaged, gnaw on the watchdog counter and update the frequency 684 * engaged, gnaw on the watchdog counter and update the frequency
684 * computed by the pll and the PPS signal. 685 * computed by the pll and the PPS signal.
685 */ 686 */
686 pps_valid++; 687 pps_valid++;
687 if (pps_valid == PPS_VALID) { /* PPS signal lost */ 688 if (pps_valid == PPS_VALID) { /* PPS signal lost */
688 pps_jitter = MAXTIME; 689 pps_jitter = MAXTIME;
689 pps_stabil = MAXFREQ; 690 pps_stabil = MAXFREQ;
690 time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | 691 time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER |
691 STA_PPSWANDER | STA_PPSERROR); 692 STA_PPSWANDER | STA_PPSERROR);
692 } 693 }
693 ltemp = time_freq + pps_freq; 694 ltemp = time_freq + pps_freq;
694 time_adj += shift_right(ltemp,(SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE)); 695 time_adj += shift_right(ltemp,(SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE));
695 696
696 #if HZ == 100 697 #if HZ == 100
697 /* 698 /*
698 * Compensate for (HZ==100) != (1 << SHIFT_HZ). Add 25% and 3.125% to 699 * Compensate for (HZ==100) != (1 << SHIFT_HZ). Add 25% and 3.125% to
699 * get 128.125; => only 0.125% error (p. 14) 700 * get 128.125; => only 0.125% error (p. 14)
700 */ 701 */
701 time_adj += shift_right(time_adj, 2) + shift_right(time_adj, 5); 702 time_adj += shift_right(time_adj, 2) + shift_right(time_adj, 5);
702 #endif 703 #endif
703 #if HZ == 250 704 #if HZ == 250
704 /* 705 /*
705 * Compensate for (HZ==250) != (1 << SHIFT_HZ). Add 1.5625% and 706 * Compensate for (HZ==250) != (1 << SHIFT_HZ). Add 1.5625% and
706 * 0.78125% to get 255.85938; => only 0.05% error (p. 14) 707 * 0.78125% to get 255.85938; => only 0.05% error (p. 14)
707 */ 708 */
708 time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7); 709 time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7);
709 #endif 710 #endif
710 #if HZ == 1000 711 #if HZ == 1000
711 /* 712 /*
712 * Compensate for (HZ==1000) != (1 << SHIFT_HZ). Add 1.5625% and 713 * Compensate for (HZ==1000) != (1 << SHIFT_HZ). Add 1.5625% and
713 * 0.78125% to get 1023.4375; => only 0.05% error (p. 14) 714 * 0.78125% to get 1023.4375; => only 0.05% error (p. 14)
714 */ 715 */
715 time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7); 716 time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7);
716 #endif 717 #endif
717 } 718 }
718 719
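To make the HZ compensation above concrete for HZ = 100 (the other cases are analogous): time_adj was scaled as if the tick rate were a power of two, 1 << SHIFT_HZ = 128, so the two extra shift_right() terms stretch the real rate toward it:

    time_adj + shift_right(time_adj, 2) + shift_right(time_adj, 5)
        = time_adj * (1 + 1/4 + 1/32)
        = time_adj * 1.28125,   and 100 * 1.28125 = 128.125 ~ 128,

which is the 128.125 figure quoted in the comment above.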
719 /* in the NTP reference this is called "hardclock()" */ 720 /* in the NTP reference this is called "hardclock()" */
720 static void update_wall_time_one_tick(void) 721 static void update_wall_time_one_tick(void)
721 { 722 {
722 long time_adjust_step, delta_nsec; 723 long time_adjust_step, delta_nsec;
723 724
724 if ((time_adjust_step = time_adjust) != 0 ) { 725 if ((time_adjust_step = time_adjust) != 0 ) {
725 /* 726 /*
726 * We are doing an adjtime thing. Prepare time_adjust_step to 727 * We are doing an adjtime thing. Prepare time_adjust_step to
727 * be within bounds. Note that a positive time_adjust means we 728 * be within bounds. Note that a positive time_adjust means we
728 * want the clock to run faster. 729 * want the clock to run faster.
729 * 730 *
730 * Limit the amount of the step to be in the range 731 * Limit the amount of the step to be in the range
731 * -tickadj .. +tickadj 732 * -tickadj .. +tickadj
732 */ 733 */
733 time_adjust_step = min(time_adjust_step, (long)tickadj); 734 time_adjust_step = min(time_adjust_step, (long)tickadj);
734 time_adjust_step = max(time_adjust_step, (long)-tickadj); 735 time_adjust_step = max(time_adjust_step, (long)-tickadj);
735 736
736 /* Reduce by this step the amount of time left */ 737 /* Reduce by this step the amount of time left */
737 time_adjust -= time_adjust_step; 738 time_adjust -= time_adjust_step;
738 } 739 }
739 delta_nsec = tick_nsec + time_adjust_step * 1000; 740 delta_nsec = tick_nsec + time_adjust_step * 1000;
740 /* 741 /*
741 * Advance the phase; once it accumulates to one microsecond, 742 * Advance the phase; once it accumulates to one microsecond,
742 * fold it into the tick as well. 743 * fold it into the tick as well.
743 */ 744 */
744 time_phase += time_adj; 745 time_phase += time_adj;
745 if ((time_phase >= FINENSEC) || (time_phase <= -FINENSEC)) { 746 if ((time_phase >= FINENSEC) || (time_phase <= -FINENSEC)) {
746 long ltemp = shift_right(time_phase, (SHIFT_SCALE - 10)); 747 long ltemp = shift_right(time_phase, (SHIFT_SCALE - 10));
747 time_phase -= ltemp << (SHIFT_SCALE - 10); 748 time_phase -= ltemp << (SHIFT_SCALE - 10);
748 delta_nsec += ltemp; 749 delta_nsec += ltemp;
749 } 750 }
750 xtime.tv_nsec += delta_nsec; 751 xtime.tv_nsec += delta_nsec;
751 time_interpolator_update(delta_nsec); 752 time_interpolator_update(delta_nsec);
752 753
753 /* Changes by adjtime() do not take effect till next tick. */ 754 /* Changes by adjtime() do not take effect till next tick. */
754 if (time_next_adjust != 0) { 755 if (time_next_adjust != 0) {
755 time_adjust = time_next_adjust; 756 time_adjust = time_next_adjust;
756 time_next_adjust = 0; 757 time_next_adjust = 0;
757 } 758 }
758 } 759 }
759 760
760 /* 761 /*
761 * Using a loop looks inefficient, but "ticks" is 762 * Using a loop looks inefficient, but "ticks" is
762 * usually just one (we shouldn't be losing ticks, 763 * usually just one (we shouldn't be losing ticks,
763 * we're doing it this way mainly for interrupt 764 * we're doing it this way mainly for interrupt
764 * latency reasons, not because we think we'll 765 * latency reasons, not because we think we'll
765 * have lots of lost timer ticks) 766 * have lots of lost timer ticks)
766 */ 767 */
767 static void update_wall_time(unsigned long ticks) 768 static void update_wall_time(unsigned long ticks)
768 { 769 {
769 do { 770 do {
770 ticks--; 771 ticks--;
771 update_wall_time_one_tick(); 772 update_wall_time_one_tick();
772 if (xtime.tv_nsec >= 1000000000) { 773 if (xtime.tv_nsec >= 1000000000) {
773 xtime.tv_nsec -= 1000000000; 774 xtime.tv_nsec -= 1000000000;
774 xtime.tv_sec++; 775 xtime.tv_sec++;
775 second_overflow(); 776 second_overflow();
776 } 777 }
777 } while (ticks); 778 } while (ticks);
778 } 779 }
779 780
780 /* 781 /*
781 * Called from the timer interrupt handler to charge one tick to the current 782 * Called from the timer interrupt handler to charge one tick to the current
782 * process. user_tick is 1 if the tick is user time, 0 for system. 783 * process. user_tick is 1 if the tick is user time, 0 for system.
783 */ 784 */
784 void update_process_times(int user_tick) 785 void update_process_times(int user_tick)
785 { 786 {
786 struct task_struct *p = current; 787 struct task_struct *p = current;
787 int cpu = smp_processor_id(); 788 int cpu = smp_processor_id();
788 789
789 /* Note: this timer irq context must be accounted for as well. */ 790 /* Note: this timer irq context must be accounted for as well. */
790 if (user_tick) 791 if (user_tick)
791 account_user_time(p, jiffies_to_cputime(1)); 792 account_user_time(p, jiffies_to_cputime(1));
792 else 793 else
793 account_system_time(p, HARDIRQ_OFFSET, jiffies_to_cputime(1)); 794 account_system_time(p, HARDIRQ_OFFSET, jiffies_to_cputime(1));
794 run_local_timers(); 795 run_local_timers();
795 if (rcu_pending(cpu)) 796 if (rcu_pending(cpu))
796 rcu_check_callbacks(cpu, user_tick); 797 rcu_check_callbacks(cpu, user_tick);
797 scheduler_tick(); 798 scheduler_tick();
798 run_posix_cpu_timers(p); 799 run_posix_cpu_timers(p);
799 } 800 }
800 801
801 /* 802 /*
802 * Nr of active tasks - counted in fixed-point numbers 803 * Nr of active tasks - counted in fixed-point numbers
803 */ 804 */
804 static unsigned long count_active_tasks(void) 805 static unsigned long count_active_tasks(void)
805 { 806 {
806 return (nr_running() + nr_uninterruptible()) * FIXED_1; 807 return (nr_running() + nr_uninterruptible()) * FIXED_1;
807 } 808 }
808 809
809 /* 810 /*
810 * Hmm.. Changed this, as the GNU make sources (load.c) seem to 811 * Hmm.. Changed this, as the GNU make sources (load.c) seem to
811 * imply that avenrun[] is the standard name for this kind of thing. 812 * imply that avenrun[] is the standard name for this kind of thing.
812 * Nothing else seems to be standardized: the fractional size etc 813 * Nothing else seems to be standardized: the fractional size etc
813 * all seem to differ on different machines. 814 * all seem to differ on different machines.
814 * 815 *
815 * Requires xtime_lock to access. 816 * Requires xtime_lock to access.
816 */ 817 */
817 unsigned long avenrun[3]; 818 unsigned long avenrun[3];
818 819
819 EXPORT_SYMBOL(avenrun); 820 EXPORT_SYMBOL(avenrun);
820 821
821 /* 822 /*
822 * calc_load - given tick count, update the avenrun load estimates. 823 * calc_load - given tick count, update the avenrun load estimates.
823 * This is called while holding a write_lock on xtime_lock. 824 * This is called while holding a write_lock on xtime_lock.
824 */ 825 */
825 static inline void calc_load(unsigned long ticks) 826 static inline void calc_load(unsigned long ticks)
826 { 827 {
827 unsigned long active_tasks; /* fixed-point */ 828 unsigned long active_tasks; /* fixed-point */
828 static int count = LOAD_FREQ; 829 static int count = LOAD_FREQ;
829 830
830 count -= ticks; 831 count -= ticks;
831 if (count < 0) { 832 if (count < 0) {
832 count += LOAD_FREQ; 833 count += LOAD_FREQ;
833 active_tasks = count_active_tasks(); 834 active_tasks = count_active_tasks();
834 CALC_LOAD(avenrun[0], EXP_1, active_tasks); 835 CALC_LOAD(avenrun[0], EXP_1, active_tasks);
835 CALC_LOAD(avenrun[1], EXP_5, active_tasks); 836 CALC_LOAD(avenrun[1], EXP_5, active_tasks);
836 CALC_LOAD(avenrun[2], EXP_15, active_tasks); 837 CALC_LOAD(avenrun[2], EXP_15, active_tasks);
837 } 838 }
838 } 839 }
839 840
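As a hedged sketch of what CALC_LOAD() expands to (the macro itself lives in sched.h; FSHIFT = 11, FIXED_1 = 2048 and EXP_1 = 1884 are the customary values, assumed here for illustration), it is a fixed-point exponential moving average recomputed every LOAD_FREQ ticks:

    /* One avenrun[] update written out as plain C:
     *   load = load * e + active * (1 - e),   e = EXP_1/FIXED_1 ~ 0.92
     * with everything kept in 11-bit fixed point. */
    static unsigned long calc_load_step(unsigned long load,
                                        unsigned long exp,     /* e.g. EXP_1 = 1884 */
                                        unsigned long active)  /* tasks * FIXED_1 */
    {
            load *= exp;                          /* load * e */
            load += active * (FIXED_1 - exp);     /* + active * (1 - e) */
            return load >> FSHIFT;                /* renormalize */
    }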
840 /* jiffies at the most recent update of wall time */ 841 /* jiffies at the most recent update of wall time */
841 unsigned long wall_jiffies = INITIAL_JIFFIES; 842 unsigned long wall_jiffies = INITIAL_JIFFIES;
842 843
843 /* 844 /*
844 * This read-write spinlock protects us from races in SMP while 845 * This read-write spinlock protects us from races in SMP while
845 * playing with xtime and avenrun. 846 * playing with xtime and avenrun.
846 */ 847 */
847 #ifndef ARCH_HAVE_XTIME_LOCK 848 #ifndef ARCH_HAVE_XTIME_LOCK
848 seqlock_t xtime_lock __cacheline_aligned_in_smp = SEQLOCK_UNLOCKED; 849 seqlock_t xtime_lock __cacheline_aligned_in_smp = SEQLOCK_UNLOCKED;
849 850
850 EXPORT_SYMBOL(xtime_lock); 851 EXPORT_SYMBOL(xtime_lock);
851 #endif 852 #endif
852 853
853 /* 854 /*
854 * This function runs timers and the timer-tq in bottom half context. 855 * This function runs timers and the timer-tq in bottom half context.
855 */ 856 */
856 static void run_timer_softirq(struct softirq_action *h) 857 static void run_timer_softirq(struct softirq_action *h)
857 { 858 {
858 tvec_base_t *base = &__get_cpu_var(tvec_bases); 859 tvec_base_t *base = &__get_cpu_var(tvec_bases);
859 860
860 if (time_after_eq(jiffies, base->timer_jiffies)) 861 if (time_after_eq(jiffies, base->timer_jiffies))
861 __run_timers(base); 862 __run_timers(base);
862 } 863 }
863 864
864 /* 865 /*
865 * Called by the local, per-CPU timer interrupt on SMP. 866 * Called by the local, per-CPU timer interrupt on SMP.
866 */ 867 */
867 void run_local_timers(void) 868 void run_local_timers(void)
868 { 869 {
869 raise_softirq(TIMER_SOFTIRQ); 870 raise_softirq(TIMER_SOFTIRQ);
870 } 871 }
871 872
872 /* 873 /*
873 * Called by the timer interrupt. xtime_lock must already be taken 874 * Called by the timer interrupt. xtime_lock must already be taken
874 * by the timer IRQ! 875 * by the timer IRQ!
875 */ 876 */
876 static inline void update_times(void) 877 static inline void update_times(void)
877 { 878 {
878 unsigned long ticks; 879 unsigned long ticks;
879 880
880 ticks = jiffies - wall_jiffies; 881 ticks = jiffies - wall_jiffies;
881 if (ticks) { 882 if (ticks) {
882 wall_jiffies += ticks; 883 wall_jiffies += ticks;
883 update_wall_time(ticks); 884 update_wall_time(ticks);
884 } 885 }
885 calc_load(ticks); 886 calc_load(ticks);
886 } 887 }
887 888
888 /* 889 /*
889 * The 64-bit jiffies value is not atomic - you MUST NOT read it 890 * The 64-bit jiffies value is not atomic - you MUST NOT read it
890 * without sampling the sequence number in xtime_lock. 891 * without sampling the sequence number in xtime_lock.
891 * jiffies is defined in the linker script... 892 * jiffies is defined in the linker script...
892 */ 893 */
893 894
894 void do_timer(struct pt_regs *regs) 895 void do_timer(struct pt_regs *regs)
895 { 896 {
896 jiffies_64++; 897 jiffies_64++;
897 update_times(); 898 update_times();
898 softlockup_tick(regs); 899 softlockup_tick(regs);
899 } 900 }
900 901
901 #ifdef __ARCH_WANT_SYS_ALARM 902 #ifdef __ARCH_WANT_SYS_ALARM
902 903
903 /* 904 /*
904 * For backwards compatibility? This can be done in libc so Alpha 905 * For backwards compatibility? This can be done in libc so Alpha
905 * and all newer ports shouldn't need it. 906 * and all newer ports shouldn't need it.
906 */ 907 */
907 asmlinkage unsigned long sys_alarm(unsigned int seconds) 908 asmlinkage unsigned long sys_alarm(unsigned int seconds)
908 { 909 {
909 struct itimerval it_new, it_old; 910 struct itimerval it_new, it_old;
910 unsigned int oldalarm; 911 unsigned int oldalarm;
911 912
912 it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0; 913 it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0;
913 it_new.it_value.tv_sec = seconds; 914 it_new.it_value.tv_sec = seconds;
914 it_new.it_value.tv_usec = 0; 915 it_new.it_value.tv_usec = 0;
915 do_setitimer(ITIMER_REAL, &it_new, &it_old); 916 do_setitimer(ITIMER_REAL, &it_new, &it_old);
916 oldalarm = it_old.it_value.tv_sec; 917 oldalarm = it_old.it_value.tv_sec;
917 /* ehhh.. We can't return 0 if we have an alarm pending.. */ 918 /* ehhh.. We can't return 0 if we have an alarm pending.. */
918 /* And we'd better return too much than too little anyway */ 919 /* And we'd better return too much than too little anyway */
919 if ((!oldalarm && it_old.it_value.tv_usec) || it_old.it_value.tv_usec >= 500000) 920 if ((!oldalarm && it_old.it_value.tv_usec) || it_old.it_value.tv_usec >= 500000)
920 oldalarm++; 921 oldalarm++;
921 return oldalarm; 922 return oldalarm;
922 } 923 }
923 924
924 #endif 925 #endif
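A short worked example of the rounding above: with 0.4 s left on the previous alarm, the first test fires (oldalarm is 0 but tv_usec is non-zero) and the result is bumped to 1, so a pending alarm never reports 0; with 1.4 s left the result stays 1; with 1.6 s left the tv_usec >= 500000 test rounds it up to 2.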
925 926
926 #ifndef __alpha__ 927 #ifndef __alpha__
927 928
928 /* 929 /*
929 * The Alpha uses getxpid, getxuid, and getxgid instead. Maybe this 930 * The Alpha uses getxpid, getxuid, and getxgid instead. Maybe this
930 * should be moved into arch/i386 instead? 931 * should be moved into arch/i386 instead?
931 */ 932 */
932 933
933 /** 934 /**
934 * sys_getpid - return the thread group id of the current process 935 * sys_getpid - return the thread group id of the current process
935 * 936 *
936 * Note, despite the name, this returns the tgid not the pid. The tgid and 937 * Note, despite the name, this returns the tgid not the pid. The tgid and
937 * the pid are identical unless CLONE_THREAD was specified on clone() in 938 * the pid are identical unless CLONE_THREAD was specified on clone() in
938 * which case the tgid is the same in all threads of the same group. 939 * which case the tgid is the same in all threads of the same group.
939 * 940 *
940 * This is SMP safe as current->tgid does not change. 941 * This is SMP safe as current->tgid does not change.
941 */ 942 */
942 asmlinkage long sys_getpid(void) 943 asmlinkage long sys_getpid(void)
943 { 944 {
944 return current->tgid; 945 return current->tgid;
945 } 946 }
946 947
947 /* 948 /*
948 * Accessing ->group_leader->real_parent is not SMP-safe, it could 949 * Accessing ->group_leader->real_parent is not SMP-safe, it could
949 * change from under us. However, rather than getting any lock 950 * change from under us. However, rather than getting any lock
950 * we can use an optimistic algorithm: get the parent 951 * we can use an optimistic algorithm: get the parent
951 * pid, and go back and check that the parent is still 952 * pid, and go back and check that the parent is still
952 * the same. If it has changed (which is extremely unlikely 953 * the same. If it has changed (which is extremely unlikely
953 * indeed), we just try again.. 954 * indeed), we just try again..
954 * 955 *
955 * NOTE! This depends on the fact that even if we _do_ 956 * NOTE! This depends on the fact that even if we _do_
956 * get an old value of "parent", we can happily dereference 957 * get an old value of "parent", we can happily dereference
957 * the pointer (it was and remains a dereferenceable kernel pointer 958 * the pointer (it was and remains a dereferenceable kernel pointer
958 * no matter what): we just can't necessarily trust the result 959 * no matter what): we just can't necessarily trust the result
959 * until we know that the parent pointer is valid. 960 * until we know that the parent pointer is valid.
960 * 961 *
961 * NOTE2: ->group_leader never changes from under us. 962 * NOTE2: ->group_leader never changes from under us.
962 */ 963 */
963 asmlinkage long sys_getppid(void) 964 asmlinkage long sys_getppid(void)
964 { 965 {
965 int pid; 966 int pid;
966 struct task_struct *me = current; 967 struct task_struct *me = current;
967 struct task_struct *parent; 968 struct task_struct *parent;
968 969
969 parent = me->group_leader->real_parent; 970 parent = me->group_leader->real_parent;
970 for (;;) { 971 for (;;) {
971 pid = parent->tgid; 972 pid = parent->tgid;
972 #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) 973 #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
973 { 974 {
974 struct task_struct *old = parent; 975 struct task_struct *old = parent;
975 976
976 /* 977 /*
977 * Make sure we read the pid before re-reading the 978 * Make sure we read the pid before re-reading the
978 * parent pointer: 979 * parent pointer:
979 */ 980 */
980 smp_rmb(); 981 smp_rmb();
981 parent = me->group_leader->real_parent; 982 parent = me->group_leader->real_parent;
982 if (old != parent) 983 if (old != parent)
983 continue; 984 continue;
984 } 985 }
985 #endif 986 #endif
986 break; 987 break;
987 } 988 }
988 return pid; 989 return pid;
989 } 990 }
990 991
991 asmlinkage long sys_getuid(void) 992 asmlinkage long sys_getuid(void)
992 { 993 {
993 /* Only we change this so SMP safe */ 994 /* Only we change this so SMP safe */
994 return current->uid; 995 return current->uid;
995 } 996 }
996 997
997 asmlinkage long sys_geteuid(void) 998 asmlinkage long sys_geteuid(void)
998 { 999 {
999 /* Only we change this so SMP safe */ 1000 /* Only we change this so SMP safe */
1000 return current->euid; 1001 return current->euid;
1001 } 1002 }
1002 1003
1003 asmlinkage long sys_getgid(void) 1004 asmlinkage long sys_getgid(void)
1004 { 1005 {
1005 /* Only we change this so SMP safe */ 1006 /* Only we change this so SMP safe */
1006 return current->gid; 1007 return current->gid;
1007 } 1008 }
1008 1009
1009 asmlinkage long sys_getegid(void) 1010 asmlinkage long sys_getegid(void)
1010 { 1011 {
1011 /* Only we change this so SMP safe */ 1012 /* Only we change this so SMP safe */
1012 return current->egid; 1013 return current->egid;
1013 } 1014 }
1014 1015
1015 #endif 1016 #endif
1016 1017
1017 static void process_timeout(unsigned long __data) 1018 static void process_timeout(unsigned long __data)
1018 { 1019 {
1019 wake_up_process((task_t *)__data); 1020 wake_up_process((task_t *)__data);
1020 } 1021 }
1021 1022
1022 /** 1023 /**
1023 * schedule_timeout - sleep until timeout 1024 * schedule_timeout - sleep until timeout
1024 * @timeout: timeout value in jiffies 1025 * @timeout: timeout value in jiffies
1025 * 1026 *
1026 * Make the current task sleep until @timeout jiffies have 1027 * Make the current task sleep until @timeout jiffies have
1027 * elapsed. The routine will return immediately unless 1028 * elapsed. The routine will return immediately unless
1028 * the current task state has been set (see set_current_state()). 1029 * the current task state has been set (see set_current_state()).
1029 * 1030 *
1030 * You can set the task state as follows - 1031 * You can set the task state as follows -
1031 * 1032 *
1032 * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to 1033 * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to
1033 * pass before the routine returns. The routine will return 0 1034 * pass before the routine returns. The routine will return 0
1034 * 1035 *
1035 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is 1036 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
1036 * delivered to the current task. In this case the remaining time 1037 * delivered to the current task. In this case the remaining time
1037 * in jiffies will be returned, or 0 if the timer expired in time 1038 * in jiffies will be returned, or 0 if the timer expired in time
1038 * 1039 *
1039 * The current task state is guaranteed to be TASK_RUNNING when this 1040 * The current task state is guaranteed to be TASK_RUNNING when this
1040 * routine returns. 1041 * routine returns.
1041 * 1042 *
1042 * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule 1043 * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule
1043 * the CPU away without a bound on the timeout. In this case the return 1044 * the CPU away without a bound on the timeout. In this case the return
1044 * value will be %MAX_SCHEDULE_TIMEOUT. 1045 * value will be %MAX_SCHEDULE_TIMEOUT.
1045 * 1046 *
1046 * In all cases the return value is guaranteed to be non-negative. 1047 * In all cases the return value is guaranteed to be non-negative.
1047 */ 1048 */
1048 fastcall signed long __sched schedule_timeout(signed long timeout) 1049 fastcall signed long __sched schedule_timeout(signed long timeout)
1049 { 1050 {
1050 struct timer_list timer; 1051 struct timer_list timer;
1051 unsigned long expire; 1052 unsigned long expire;
1052 1053
1053 switch (timeout) 1054 switch (timeout)
1054 { 1055 {
1055 case MAX_SCHEDULE_TIMEOUT: 1056 case MAX_SCHEDULE_TIMEOUT:
1056 /* 1057 /*
1057 * These two special cases are useful for the caller's 1058 * These two special cases are useful for the caller's
1058 * convenience. Nothing more. We could take 1059 * convenience. Nothing more. We could take
1059 * MAX_SCHEDULE_TIMEOUT from one of the negative values, 1060 * MAX_SCHEDULE_TIMEOUT from one of the negative values,
1060 * but I'd like to return a valid offset (>=0) to allow 1061 * but I'd like to return a valid offset (>=0) to allow
1061 * the caller to do everything it wants with the retval. 1062 * the caller to do everything it wants with the retval.
1062 */ 1063 */
1063 schedule(); 1064 schedule();
1064 goto out; 1065 goto out;
1065 default: 1066 default:
1066 /* 1067 /*
1067 * Another bit of PARANOID. Note that the retval will be 1068 * Another bit of PARANOID. Note that the retval will be
1068 * 0 since no piece of kernel is supposed to do a check 1069 * 0 since no piece of kernel is supposed to do a check
1069 * for a negative retval of schedule_timeout() (since it 1070 * for a negative retval of schedule_timeout() (since it
1070 * should never happen anyway). You just have the printk() 1071 * should never happen anyway). You just have the printk()
1071 * that will tell you if something has gone wrong and where. 1072 * that will tell you if something has gone wrong and where.
1072 */ 1073 */
1073 if (timeout < 0) 1074 if (timeout < 0)
1074 { 1075 {
1075 printk(KERN_ERR "schedule_timeout: wrong timeout " 1076 printk(KERN_ERR "schedule_timeout: wrong timeout "
1076 "value %lx from %p\n", timeout, 1077 "value %lx from %p\n", timeout,
1077 __builtin_return_address(0)); 1078 __builtin_return_address(0));
1078 current->state = TASK_RUNNING; 1079 current->state = TASK_RUNNING;
1079 goto out; 1080 goto out;
1080 } 1081 }
1081 } 1082 }
1082 1083
1083 expire = timeout + jiffies; 1084 expire = timeout + jiffies;
1084 1085
1085 setup_timer(&timer, process_timeout, (unsigned long)current); 1086 setup_timer(&timer, process_timeout, (unsigned long)current);
1086 __mod_timer(&timer, expire); 1087 __mod_timer(&timer, expire);
1087 schedule(); 1088 schedule();
1088 del_singleshot_timer_sync(&timer); 1089 del_singleshot_timer_sync(&timer);
1089 1090
1090 timeout = expire - jiffies; 1091 timeout = expire - jiffies;
1091 1092
1092 out: 1093 out:
1093 return timeout < 0 ? 0 : timeout; 1094 return timeout < 0 ? 0 : timeout;
1094 } 1095 }
1095 EXPORT_SYMBOL(schedule_timeout); 1096 EXPORT_SYMBOL(schedule_timeout);
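A minimal usage sketch following the rules documented above (the task state must be set before calling; the half-second value is arbitrary):

    /* Sleep interruptibly for up to half a second.  Returns the jiffies
     * that were still left if a signal woke us early, 0 otherwise. */
    static signed long wait_a_bit(void)
    {
            set_current_state(TASK_INTERRUPTIBLE);
            return schedule_timeout(HZ / 2);
    }

The schedule_timeout_interruptible() wrapper just below packages exactly this pairing.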
1096 1097
1097 /* 1098 /*
1098 * We can use __set_current_state() here because schedule_timeout() calls 1099 * We can use __set_current_state() here because schedule_timeout() calls
1099 * schedule() unconditionally. 1100 * schedule() unconditionally.
1100 */ 1101 */
1101 signed long __sched schedule_timeout_interruptible(signed long timeout) 1102 signed long __sched schedule_timeout_interruptible(signed long timeout)
1102 { 1103 {
1103 __set_current_state(TASK_INTERRUPTIBLE); 1104 __set_current_state(TASK_INTERRUPTIBLE);
1104 return schedule_timeout(timeout); 1105 return schedule_timeout(timeout);
1105 } 1106 }
1106 EXPORT_SYMBOL(schedule_timeout_interruptible); 1107 EXPORT_SYMBOL(schedule_timeout_interruptible);
1107 1108
1108 signed long __sched schedule_timeout_uninterruptible(signed long timeout) 1109 signed long __sched schedule_timeout_uninterruptible(signed long timeout)
1109 { 1110 {
1110 __set_current_state(TASK_UNINTERRUPTIBLE); 1111 __set_current_state(TASK_UNINTERRUPTIBLE);
1111 return schedule_timeout(timeout); 1112 return schedule_timeout(timeout);
1112 } 1113 }
1113 EXPORT_SYMBOL(schedule_timeout_uninterruptible); 1114 EXPORT_SYMBOL(schedule_timeout_uninterruptible);
1114 1115
1115 /* Thread ID - the internal kernel "pid" */ 1116 /* Thread ID - the internal kernel "pid" */
1116 asmlinkage long sys_gettid(void) 1117 asmlinkage long sys_gettid(void)
1117 { 1118 {
1118 return current->pid; 1119 return current->pid;
1119 } 1120 }
1120 1121
1121 static long __sched nanosleep_restart(struct restart_block *restart) 1122 static long __sched nanosleep_restart(struct restart_block *restart)
1122 { 1123 {
1123 unsigned long expire = restart->arg0, now = jiffies; 1124 unsigned long expire = restart->arg0, now = jiffies;
1124 struct timespec __user *rmtp = (struct timespec __user *) restart->arg1; 1125 struct timespec __user *rmtp = (struct timespec __user *) restart->arg1;
1125 long ret; 1126 long ret;
1126 1127
1127 /* Did it expire while we handled signals? */ 1128 /* Did it expire while we handled signals? */
1128 if (!time_after(expire, now)) 1129 if (!time_after(expire, now))
1129 return 0; 1130 return 0;
1130 1131
1131 expire = schedule_timeout_interruptible(expire - now); 1132 expire = schedule_timeout_interruptible(expire - now);
1132 1133
1133 ret = 0; 1134 ret = 0;
1134 if (expire) { 1135 if (expire) {
1135 struct timespec t; 1136 struct timespec t;
1136 jiffies_to_timespec(expire, &t); 1137 jiffies_to_timespec(expire, &t);
1137 1138
1138 ret = -ERESTART_RESTARTBLOCK; 1139 ret = -ERESTART_RESTARTBLOCK;
1139 if (rmtp && copy_to_user(rmtp, &t, sizeof(t))) 1140 if (rmtp && copy_to_user(rmtp, &t, sizeof(t)))
1140 ret = -EFAULT; 1141 ret = -EFAULT;
1141 /* The 'restart' block is already filled in */ 1142 /* The 'restart' block is already filled in */
1142 } 1143 }
1143 return ret; 1144 return ret;
1144 } 1145 }
1145 1146
1146 asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp) 1147 asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp)
1147 { 1148 {
1148 struct timespec t; 1149 struct timespec t;
1149 unsigned long expire; 1150 unsigned long expire;
1150 long ret; 1151 long ret;
1151 1152
1152 if (copy_from_user(&t, rqtp, sizeof(t))) 1153 if (copy_from_user(&t, rqtp, sizeof(t)))
1153 return -EFAULT; 1154 return -EFAULT;
1154 1155
1155 if ((t.tv_nsec >= 1000000000L) || (t.tv_nsec < 0) || (t.tv_sec < 0)) 1156 if ((t.tv_nsec >= 1000000000L) || (t.tv_nsec < 0) || (t.tv_sec < 0))
1156 return -EINVAL; 1157 return -EINVAL;
1157 1158
1158 expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec); 1159 expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec);
1159 expire = schedule_timeout_interruptible(expire); 1160 expire = schedule_timeout_interruptible(expire);
1160 1161
1161 ret = 0; 1162 ret = 0;
1162 if (expire) { 1163 if (expire) {
1163 struct restart_block *restart; 1164 struct restart_block *restart;
1164 jiffies_to_timespec(expire, &t); 1165 jiffies_to_timespec(expire, &t);
1165 if (rmtp && copy_to_user(rmtp, &t, sizeof(t))) 1166 if (rmtp && copy_to_user(rmtp, &t, sizeof(t)))
1166 return -EFAULT; 1167 return -EFAULT;
1167 1168
1168 restart = &current_thread_info()->restart_block; 1169 restart = &current_thread_info()->restart_block;
1169 restart->fn = nanosleep_restart; 1170 restart->fn = nanosleep_restart;
1170 restart->arg0 = jiffies + expire; 1171 restart->arg0 = jiffies + expire;
1171 restart->arg1 = (unsigned long) rmtp; 1172 restart->arg1 = (unsigned long) rmtp;
1172 ret = -ERESTART_RESTARTBLOCK; 1173 ret = -ERESTART_RESTARTBLOCK;
1173 } 1174 }
1174 return ret; 1175 return ret;
1175 } 1176 }
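The extra jiffy added above (the "+ (t.tv_sec || t.tv_nsec)" term) guards against the partially elapsed current tick, so a non-zero request never sleeps short. From user space, the rmtp write-back is consumed with the classic retry loop; this is plain POSIX usage, shown only to illustrate what the kernel side above feeds it:

    #include <errno.h>
    #include <time.h>

    /* Keep sleeping until the full request has elapsed, resuming from
     * the remaining time whenever a signal interrupts the sleep. */
    static void sleep_fully(struct timespec req)
    {
            struct timespec rem;

            while (nanosleep(&req, &rem) == -1 && errno == EINTR)
                    req = rem;
    }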
1176 1177
1177 /* 1178 /*
1178 * sys_sysinfo - fill in sysinfo struct 1179 * sys_sysinfo - fill in sysinfo struct
1179 */ 1180 */
1180 asmlinkage long sys_sysinfo(struct sysinfo __user *info) 1181 asmlinkage long sys_sysinfo(struct sysinfo __user *info)
1181 { 1182 {
1182 struct sysinfo val; 1183 struct sysinfo val;
1183 unsigned long mem_total, sav_total; 1184 unsigned long mem_total, sav_total;
1184 unsigned int mem_unit, bitcount; 1185 unsigned int mem_unit, bitcount;
1185 unsigned long seq; 1186 unsigned long seq;
1186 1187
1187 memset((char *)&val, 0, sizeof(struct sysinfo)); 1188 memset((char *)&val, 0, sizeof(struct sysinfo));
1188 1189
1189 do { 1190 do {
1190 struct timespec tp; 1191 struct timespec tp;
1191 seq = read_seqbegin(&xtime_lock); 1192 seq = read_seqbegin(&xtime_lock);
1192 1193
1193 /* 1194 /*
1194 * This is annoying. The below is the same thing 1195 * This is annoying. The below is the same thing
1195 * posix_get_clock_monotonic() does, but it wants to 1196 * posix_get_clock_monotonic() does, but it wants to
1196 * take the lock itself, and here we want the lock to cover 1197 * take the lock itself, and here we want the lock to cover
1197 * the load averages too. 1198 * the load averages too.
1198 */ 1199 */
1199 1200
1200 getnstimeofday(&tp); 1201 getnstimeofday(&tp);
1201 tp.tv_sec += wall_to_monotonic.tv_sec; 1202 tp.tv_sec += wall_to_monotonic.tv_sec;
1202 tp.tv_nsec += wall_to_monotonic.tv_nsec; 1203 tp.tv_nsec += wall_to_monotonic.tv_nsec;
1203 if (tp.tv_nsec - NSEC_PER_SEC >= 0) { 1204 if (tp.tv_nsec - NSEC_PER_SEC >= 0) {
1204 tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC; 1205 tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC;
1205 tp.tv_sec++; 1206 tp.tv_sec++;
1206 } 1207 }
1207 val.uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); 1208 val.uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
1208 1209
1209 val.loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT); 1210 val.loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT);
1210 val.loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT); 1211 val.loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT);
1211 val.loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT); 1212 val.loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT);
1212 1213
1213 val.procs = nr_threads; 1214 val.procs = nr_threads;
1214 } while (read_seqretry(&xtime_lock, seq)); 1215 } while (read_seqretry(&xtime_lock, seq));
1215 1216
1216 si_meminfo(&val); 1217 si_meminfo(&val);
1217 si_swapinfo(&val); 1218 si_swapinfo(&val);
1218 1219
1219 /* 1220 /*
1220 * If the sum of all the available memory (i.e. ram + swap) 1221 * If the sum of all the available memory (i.e. ram + swap)
1221 * is less than can be stored in a 32 bit unsigned long then 1222 * is less than can be stored in a 32 bit unsigned long then
1222 * we can be binary compatible with 2.2.x kernels. If not, 1223 * we can be binary compatible with 2.2.x kernels. If not,
1223 * well, in that case 2.2.x was broken anyways... 1224 * well, in that case 2.2.x was broken anyways...
1224 * 1225 *
1225 * -Erik Andersen <andersee@debian.org> 1226 * -Erik Andersen <andersee@debian.org>
1226 */ 1227 */
1227 1228
1228 mem_total = val.totalram + val.totalswap; 1229 mem_total = val.totalram + val.totalswap;
1229 if (mem_total < val.totalram || mem_total < val.totalswap) 1230 if (mem_total < val.totalram || mem_total < val.totalswap)
1230 goto out; 1231 goto out;
1231 bitcount = 0; 1232 bitcount = 0;
1232 mem_unit = val.mem_unit; 1233 mem_unit = val.mem_unit;
1233 while (mem_unit > 1) { 1234 while (mem_unit > 1) {
1234 bitcount++; 1235 bitcount++;
1235 mem_unit >>= 1; 1236 mem_unit >>= 1;
1236 sav_total = mem_total; 1237 sav_total = mem_total;
1237 mem_total <<= 1; 1238 mem_total <<= 1;
1238 if (mem_total < sav_total) 1239 if (mem_total < sav_total)
1239 goto out; 1240 goto out;
1240 } 1241 }
1241 1242
1242 /* 1243 /*
1243 * If mem_total did not overflow, multiply all memory values by 1244 * If mem_total did not overflow, multiply all memory values by
1244 * val.mem_unit and set it to 1. This leaves things compatible 1245 * val.mem_unit and set it to 1. This leaves things compatible
1245 * with 2.2.x, and also retains compatibility with earlier 2.4.x 1246 * with 2.2.x, and also retains compatibility with earlier 2.4.x
1246 * kernels... 1247 * kernels...
1247 */ 1248 */
1248 1249
1249 val.mem_unit = 1; 1250 val.mem_unit = 1;
1250 val.totalram <<= bitcount; 1251 val.totalram <<= bitcount;
1251 val.freeram <<= bitcount; 1252 val.freeram <<= bitcount;
1252 val.sharedram <<= bitcount; 1253 val.sharedram <<= bitcount;
1253 val.bufferram <<= bitcount; 1254 val.bufferram <<= bitcount;
1254 val.totalswap <<= bitcount; 1255 val.totalswap <<= bitcount;
1255 val.freeswap <<= bitcount; 1256 val.freeswap <<= bitcount;
1256 val.totalhigh <<= bitcount; 1257 val.totalhigh <<= bitcount;
1257 val.freehigh <<= bitcount; 1258 val.freehigh <<= bitcount;
1258 1259
1259 out: 1260 out:
1260 if (copy_to_user(info, &val, sizeof(struct sysinfo))) 1261 if (copy_to_user(info, &val, sizeof(struct sysinfo)))
1261 return -EFAULT; 1262 return -EFAULT;
1262 1263
1263 return 0; 1264 return 0;
1264 } 1265 }
1265 1266
1266 static void __devinit init_timers_cpu(int cpu) 1267 static void __devinit init_timers_cpu(int cpu)
1267 { 1268 {
1268 int j; 1269 int j;
1269 tvec_base_t *base; 1270 tvec_base_t *base;
1270 1271
1271 base = &per_cpu(tvec_bases, cpu); 1272 base = &per_cpu(tvec_bases, cpu);
1272 spin_lock_init(&base->t_base.lock); 1273 spin_lock_init(&base->t_base.lock);
1273 for (j = 0; j < TVN_SIZE; j++) { 1274 for (j = 0; j < TVN_SIZE; j++) {
1274 INIT_LIST_HEAD(base->tv5.vec + j); 1275 INIT_LIST_HEAD(base->tv5.vec + j);
1275 INIT_LIST_HEAD(base->tv4.vec + j); 1276 INIT_LIST_HEAD(base->tv4.vec + j);
1276 INIT_LIST_HEAD(base->tv3.vec + j); 1277 INIT_LIST_HEAD(base->tv3.vec + j);
1277 INIT_LIST_HEAD(base->tv2.vec + j); 1278 INIT_LIST_HEAD(base->tv2.vec + j);
1278 } 1279 }
1279 for (j = 0; j < TVR_SIZE; j++) 1280 for (j = 0; j < TVR_SIZE; j++)
1280 INIT_LIST_HEAD(base->tv1.vec + j); 1281 INIT_LIST_HEAD(base->tv1.vec + j);
1281 1282
1282 base->timer_jiffies = jiffies; 1283 base->timer_jiffies = jiffies;
1283 } 1284 }
1284 1285
1285 #ifdef CONFIG_HOTPLUG_CPU 1286 #ifdef CONFIG_HOTPLUG_CPU
1286 static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head) 1287 static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head)
1287 { 1288 {
1288 struct timer_list *timer; 1289 struct timer_list *timer;
1289 1290
1290 while (!list_empty(head)) { 1291 while (!list_empty(head)) {
1291 timer = list_entry(head->next, struct timer_list, entry); 1292 timer = list_entry(head->next, struct timer_list, entry);
1292 detach_timer(timer, 0); 1293 detach_timer(timer, 0);
1293 timer->base = &new_base->t_base; 1294 timer->base = &new_base->t_base;
1294 internal_add_timer(new_base, timer); 1295 internal_add_timer(new_base, timer);
1295 } 1296 }
1296 } 1297 }
1297 1298
1298 static void __devinit migrate_timers(int cpu) 1299 static void __devinit migrate_timers(int cpu)
1299 { 1300 {
1300 tvec_base_t *old_base; 1301 tvec_base_t *old_base;
1301 tvec_base_t *new_base; 1302 tvec_base_t *new_base;
1302 int i; 1303 int i;
1303 1304
1304 BUG_ON(cpu_online(cpu)); 1305 BUG_ON(cpu_online(cpu));
1305 old_base = &per_cpu(tvec_bases, cpu); 1306 old_base = &per_cpu(tvec_bases, cpu);
1306 new_base = &get_cpu_var(tvec_bases); 1307 new_base = &get_cpu_var(tvec_bases);
1307 1308
1308 local_irq_disable(); 1309 local_irq_disable();
1309 spin_lock(&new_base->t_base.lock); 1310 spin_lock(&new_base->t_base.lock);
1310 spin_lock(&old_base->t_base.lock); 1311 spin_lock(&old_base->t_base.lock);
1311 1312
1312 if (old_base->t_base.running_timer) 1313 if (old_base->t_base.running_timer)
1313 BUG(); 1314 BUG();
1314 for (i = 0; i < TVR_SIZE; i++) 1315 for (i = 0; i < TVR_SIZE; i++)
1315 migrate_timer_list(new_base, old_base->tv1.vec + i); 1316 migrate_timer_list(new_base, old_base->tv1.vec + i);
1316 for (i = 0; i < TVN_SIZE; i++) { 1317 for (i = 0; i < TVN_SIZE; i++) {
1317 migrate_timer_list(new_base, old_base->tv2.vec + i); 1318 migrate_timer_list(new_base, old_base->tv2.vec + i);
1318 migrate_timer_list(new_base, old_base->tv3.vec + i); 1319 migrate_timer_list(new_base, old_base->tv3.vec + i);
1319 migrate_timer_list(new_base, old_base->tv4.vec + i); 1320 migrate_timer_list(new_base, old_base->tv4.vec + i);
1320 migrate_timer_list(new_base, old_base->tv5.vec + i); 1321 migrate_timer_list(new_base, old_base->tv5.vec + i);
1321 } 1322 }
1322 1323
1323 spin_unlock(&old_base->t_base.lock); 1324 spin_unlock(&old_base->t_base.lock);
1324 spin_unlock(&new_base->t_base.lock); 1325 spin_unlock(&new_base->t_base.lock);
1325 local_irq_enable(); 1326 local_irq_enable();
1326 put_cpu_var(tvec_bases); 1327 put_cpu_var(tvec_bases);
1327 } 1328 }
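The migration path above relies on a fixed lock order (new base, then old base, with interrupts off) so it cannot deadlock against the normal timer paths. A hedged userspace analogue of the same move-everything-under-both-locks pattern, using pthread mutexes and a toy singly linked list:

#include <pthread.h>
#include <stdio.h>

/* Hypothetical miniature of the hotplug migration: drain every node from
 * a dead CPU's list into the live one while holding both locks, always
 * taken in the same (new, then old) order. */
struct node { struct node *next; int payload; };

struct base {
	pthread_mutex_t lock;
	struct node *head;
};

static void migrate(struct base *new_base, struct base *old_base)
{
	pthread_mutex_lock(&new_base->lock);
	pthread_mutex_lock(&old_base->lock);

	while (old_base->head) {
		struct node *n = old_base->head;

		old_base->head = n->next;	/* detach from the old list */
		n->next = new_base->head;	/* re-add to the new list */
		new_base->head = n;
	}

	pthread_mutex_unlock(&old_base->lock);
	pthread_mutex_unlock(&new_base->lock);
}

int main(void)
{
	struct node a = { 0, 1 }, b = { &a, 2 };
	struct base old_b = { PTHREAD_MUTEX_INITIALIZER, &b };
	struct base new_b = { PTHREAD_MUTEX_INITIALIZER, 0 };

	migrate(&new_b, &old_b);
	for (struct node *n = new_b.head; n; n = n->next)
		printf("migrated payload %d\n", n->payload);
	return 0;
}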
1328 #endif /* CONFIG_HOTPLUG_CPU */ 1329 #endif /* CONFIG_HOTPLUG_CPU */
1329 1330
1330 static int __devinit timer_cpu_notify(struct notifier_block *self, 1331 static int __devinit timer_cpu_notify(struct notifier_block *self,
1331 unsigned long action, void *hcpu) 1332 unsigned long action, void *hcpu)
1332 { 1333 {
1333 long cpu = (long)hcpu; 1334 long cpu = (long)hcpu;
1334 switch(action) { 1335 switch(action) {
1335 case CPU_UP_PREPARE: 1336 case CPU_UP_PREPARE:
1336 init_timers_cpu(cpu); 1337 init_timers_cpu(cpu);
1337 break; 1338 break;
1338 #ifdef CONFIG_HOTPLUG_CPU 1339 #ifdef CONFIG_HOTPLUG_CPU
1339 case CPU_DEAD: 1340 case CPU_DEAD:
1340 migrate_timers(cpu); 1341 migrate_timers(cpu);
1341 break; 1342 break;
1342 #endif 1343 #endif
1343 default: 1344 default:
1344 break; 1345 break;
1345 } 1346 }
1346 return NOTIFY_OK; 1347 return NOTIFY_OK;
1347 } 1348 }
1348 1349
1349 static struct notifier_block __devinitdata timers_nb = { 1350 static struct notifier_block __devinitdata timers_nb = {
1350 .notifier_call = timer_cpu_notify, 1351 .notifier_call = timer_cpu_notify,
1351 }; 1352 };
1352 1353
1353 1354
1354 void __init init_timers(void) 1355 void __init init_timers(void)
1355 { 1356 {
1356 timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, 1357 timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE,
1357 (void *)(long)smp_processor_id()); 1358 (void *)(long)smp_processor_id());
1358 register_cpu_notifier(&timers_nb); 1359 register_cpu_notifier(&timers_nb);
1359 open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL); 1360 open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL);
1360 } 1361 }
1361 1362
1362 #ifdef CONFIG_TIME_INTERPOLATION 1363 #ifdef CONFIG_TIME_INTERPOLATION
1363 1364
1364 struct time_interpolator *time_interpolator; 1365 struct time_interpolator *time_interpolator;
1365 static struct time_interpolator *time_interpolator_list; 1366 static struct time_interpolator *time_interpolator_list;
1366 static DEFINE_SPINLOCK(time_interpolator_lock); 1367 static DEFINE_SPINLOCK(time_interpolator_lock);
1367 1368
1368 static inline u64 time_interpolator_get_cycles(unsigned int src) 1369 static inline u64 time_interpolator_get_cycles(unsigned int src)
1369 { 1370 {
1370 unsigned long (*x)(void); 1371 unsigned long (*x)(void);
1371 1372
1372 switch (src) 1373 switch (src)
1373 { 1374 {
1374 case TIME_SOURCE_FUNCTION: 1375 case TIME_SOURCE_FUNCTION:
1375 x = time_interpolator->addr; 1376 x = time_interpolator->addr;
1376 return x(); 1377 return x();
1377 1378
1378 case TIME_SOURCE_MMIO64 : 1379 case TIME_SOURCE_MMIO64 :
1379 return readq((void __iomem *) time_interpolator->addr); 1380 return readq((void __iomem *) time_interpolator->addr);
1380 1381
1381 case TIME_SOURCE_MMIO32 : 1382 case TIME_SOURCE_MMIO32 :
1382 return readl((void __iomem *) time_interpolator->addr); 1383 return readl((void __iomem *) time_interpolator->addr);
1383 1384
1384 default: return get_cycles(); 1385 default: return get_cycles();
1385 } 1386 }
1386 } 1387 }
1387 1388
1388 static inline u64 time_interpolator_get_counter(int writelock) 1389 static inline u64 time_interpolator_get_counter(int writelock)
1389 { 1390 {
1390 unsigned int src = time_interpolator->source; 1391 unsigned int src = time_interpolator->source;
1391 1392
1392 if (time_interpolator->jitter) 1393 if (time_interpolator->jitter)
1393 { 1394 {
1394 u64 lcycle; 1395 u64 lcycle;
1395 u64 now; 1396 u64 now;
1396 1397
1397 do { 1398 do {
1398 lcycle = time_interpolator->last_cycle; 1399 lcycle = time_interpolator->last_cycle;
1399 now = time_interpolator_get_cycles(src); 1400 now = time_interpolator_get_cycles(src);
1400 if (lcycle && time_after(lcycle, now)) 1401 if (lcycle && time_after(lcycle, now))
1401 return lcycle; 1402 return lcycle;
1402 1403
1403 /* When holding the xtime write lock, there's no need 1404 /* When holding the xtime write lock, there's no need
1404 * to add the overhead of the cmpxchg. Readers are 1405 * to add the overhead of the cmpxchg. Readers are
1405 * forced to retry until the write lock is released. 1406 * forced to retry until the write lock is released.
1406 */ 1407 */
1407 if (writelock) { 1408 if (writelock) {
1408 time_interpolator->last_cycle = now; 1409 time_interpolator->last_cycle = now;
1409 return now; 1410 return now;
1410 } 1411 }
1411 /* Keep track of the last timer value returned. The use of cmpxchg here 1412 /* Keep track of the last timer value returned. The use of cmpxchg here
1412 * will cause contention in an SMP environment. 1413 * will cause contention in an SMP environment.
1413 */ 1414 */
1414 } while (unlikely(cmpxchg(&time_interpolator->last_cycle, lcycle, now) != lcycle)); 1415 } while (unlikely(cmpxchg(&time_interpolator->last_cycle, lcycle, now) != lcycle));
1415 return now; 1416 return now;
1416 } 1417 }
1417 else 1418 else
1418 return time_interpolator_get_cycles(src); 1419 return time_interpolator_get_cycles(src);
1419 } 1420 }
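The cmpxchg loop above is a lock-free clamp: whoever reads a newer counter value publishes it, and a racer that loses the exchange simply returns the later value already published, so the counter never appears to run backwards. A compact userspace analogue using C11 atomics (illustrative only; it omits the kernel's wraparound-safe time_after() comparison):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic uint64_t last_cycle;

/* Return a value that never appears to move backwards, even when several
 * threads race: either the freshly read counter or the newer value some
 * other thread already published. */
static uint64_t monotonic_read(uint64_t (*read_counter)(void))
{
	uint64_t lcycle, now;

	do {
		lcycle = atomic_load(&last_cycle);
		now = read_counter();
		if (lcycle && now < lcycle)
			return lcycle;	/* someone saw a later value already */
	} while (!atomic_compare_exchange_weak(&last_cycle, &lcycle, now));

	return now;
}

static uint64_t fake_counter(void)
{
	static uint64_t t;
	return t += 3;		/* stand-in for a hardware cycle counter */
}

int main(void)
{
	printf("%llu\n", (unsigned long long)monotonic_read(fake_counter));
	printf("%llu\n", (unsigned long long)monotonic_read(fake_counter));
	return 0;
}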
1420 1421
1421 void time_interpolator_reset(void) 1422 void time_interpolator_reset(void)
1422 { 1423 {
1423 time_interpolator->offset = 0; 1424 time_interpolator->offset = 0;
1424 time_interpolator->last_counter = time_interpolator_get_counter(1); 1425 time_interpolator->last_counter = time_interpolator_get_counter(1);
1425 } 1426 }
1426 1427
1427 #define GET_TI_NSECS(count,i) (((((count) - i->last_counter) & (i)->mask) * (i)->nsec_per_cyc) >> (i)->shift) 1428 #define GET_TI_NSECS(count,i) (((((count) - i->last_counter) & (i)->mask) * (i)->nsec_per_cyc) >> (i)->shift)
1428 1429
1429 unsigned long time_interpolator_get_offset(void) 1430 unsigned long time_interpolator_get_offset(void)
1430 { 1431 {
1431 /* If we do not have a time interpolator set up then just return zero */ 1432 /* If we do not have a time interpolator set up then just return zero */
1432 if (!time_interpolator) 1433 if (!time_interpolator)
1433 return 0; 1434 return 0;
1434 1435
1435 return time_interpolator->offset + 1436 return time_interpolator->offset +
1436 GET_TI_NSECS(time_interpolator_get_counter(0), time_interpolator); 1437 GET_TI_NSECS(time_interpolator_get_counter(0), time_interpolator);
1437 } 1438 }
1438 1439
1439 #define INTERPOLATOR_ADJUST 65536 1440 #define INTERPOLATOR_ADJUST 65536
1440 #define INTERPOLATOR_MAX_SKIP (10*INTERPOLATOR_ADJUST) 1441 #define INTERPOLATOR_MAX_SKIP (10*INTERPOLATOR_ADJUST)
1441 1442
1442 static void time_interpolator_update(long delta_nsec) 1443 static void time_interpolator_update(long delta_nsec)
1443 { 1444 {
1444 u64 counter; 1445 u64 counter;
1445 unsigned long offset; 1446 unsigned long offset;
1446 1447
1447 /* If there is no time interpolator set up then do nothing */ 1448 /* If there is no time interpolator set up then do nothing */
1448 if (!time_interpolator) 1449 if (!time_interpolator)
1449 return; 1450 return;
1450 1451
1451 /* 1452 /*
1452 * The interpolator compensates for late ticks by accumulating the late 1453 * The interpolator compensates for late ticks by accumulating the late
1453 * time in time_interpolator->offset. A tick earlier than expected will 1454 * time in time_interpolator->offset. A tick earlier than expected will
1454 * lead to a reset of the offset and a corresponding jump of the clock 1455 * lead to a reset of the offset and a corresponding jump of the clock
1455 * forward. Again this only works if the interpolator clock is running 1456 * forward. Again this only works if the interpolator clock is running
1456 * slightly slower than the regular clock and the tuning logic ensures 1457 * slightly slower than the regular clock and the tuning logic ensures
1457 * that. 1458 * that.
1458 */ 1459 */
1459 1460
1460 counter = time_interpolator_get_counter(1); 1461 counter = time_interpolator_get_counter(1);
1461 offset = time_interpolator->offset + 1462 offset = time_interpolator->offset +
1462 GET_TI_NSECS(counter, time_interpolator); 1463 GET_TI_NSECS(counter, time_interpolator);
1463 1464
1464 if (delta_nsec < 0 || (unsigned long) delta_nsec < offset) 1465 if (delta_nsec < 0 || (unsigned long) delta_nsec < offset)
1465 time_interpolator->offset = offset - delta_nsec; 1466 time_interpolator->offset = offset - delta_nsec;
1466 else { 1467 else {
1467 time_interpolator->skips++; 1468 time_interpolator->skips++;
1468 time_interpolator->ns_skipped += delta_nsec - offset; 1469 time_interpolator->ns_skipped += delta_nsec - offset;
1469 time_interpolator->offset = 0; 1470 time_interpolator->offset = 0;
1470 } 1471 }
1471 time_interpolator->last_counter = counter; 1472 time_interpolator->last_counter = counter;
1472 1473
1473 /* Tuning logic for time interpolator invoked every minute or so. 1474 /* Tuning logic for time interpolator invoked every minute or so.
1474 * Decrease interpolator clock speed if no skips occurred and an offset is carried. 1475 * Decrease interpolator clock speed if no skips occurred and an offset is carried.
1475 * Increase interpolator clock speed if we skip too much time. 1476 * Increase interpolator clock speed if we skip too much time.
1476 */ 1477 */
1477 if (jiffies % INTERPOLATOR_ADJUST == 0) 1478 if (jiffies % INTERPOLATOR_ADJUST == 0)
1478 { 1479 {
1479 if (time_interpolator->skips == 0 && time_interpolator->offset > TICK_NSEC) 1480 if (time_interpolator->skips == 0 && time_interpolator->offset > TICK_NSEC)
1480 time_interpolator->nsec_per_cyc--; 1481 time_interpolator->nsec_per_cyc--;
1481 if (time_interpolator->ns_skipped > INTERPOLATOR_MAX_SKIP && time_interpolator->offset == 0) 1482 if (time_interpolator->ns_skipped > INTERPOLATOR_MAX_SKIP && time_interpolator->offset == 0)
1482 time_interpolator->nsec_per_cyc++; 1483 time_interpolator->nsec_per_cyc++;
1483 time_interpolator->skips = 0; 1484 time_interpolator->skips = 0;
1484 time_interpolator->ns_skipped = 0; 1485 time_interpolator->ns_skipped = 0;
1485 } 1486 }
1486 } 1487 }
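A short worked example of the bookkeeping above, with hypothetical numbers (1 ms tick): a late tick leaves a surplus carried in the offset, an early tick zeroes the offset and records the skipped nanoseconds for the tuning pass.

#include <stdio.h>

/* Hypothetical walk-through: a 1,000,000 ns tick, no carried offset. */
int main(void)
{
	unsigned long carried = 0, skips = 0, ns_skipped = 0;
	long delta_nsec = 1000000;
	unsigned long elapsed;		/* what GET_TI_NSECS() would report */

	/* Tick arrives late: the interpolator already counted 1,000,250 ns.
	 * The 250 ns surplus is carried so interpolated time stays smooth. */
	elapsed = carried + 1000250;
	if ((unsigned long)delta_nsec < elapsed)
		carried = elapsed - delta_nsec;
	printf("carried offset after late tick: %lu ns\n", carried);

	/* Next tick arrives early: only 999,500 ns (plus the carry) elapsed
	 * on the interpolator, so the clock jumps and a skip is recorded. */
	elapsed = carried + 999500;
	if ((unsigned long)delta_nsec < elapsed) {
		carried = elapsed - delta_nsec;
	} else {
		skips++;
		ns_skipped += delta_nsec - elapsed;
		carried = 0;
	}
	printf("skips=%lu, ns_skipped=%lu, carried=%lu\n",
	       skips, ns_skipped, carried);
	return 0;
}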
1487 1488
1488 static inline int 1489 static inline int
1489 is_better_time_interpolator(struct time_interpolator *new) 1490 is_better_time_interpolator(struct time_interpolator *new)
1490 { 1491 {
1491 if (!time_interpolator) 1492 if (!time_interpolator)
1492 return 1; 1493 return 1;
1493 return new->frequency > 2*time_interpolator->frequency || 1494 return new->frequency > 2*time_interpolator->frequency ||
1494 (unsigned long)new->drift < (unsigned long)time_interpolator->drift; 1495 (unsigned long)new->drift < (unsigned long)time_interpolator->drift;
1495 } 1496 }
1496 1497
1497 void 1498 void
1498 register_time_interpolator(struct time_interpolator *ti) 1499 register_time_interpolator(struct time_interpolator *ti)
1499 { 1500 {
1500 unsigned long flags; 1501 unsigned long flags;
1501 1502
1502 /* Sanity check */ 1503 /* Sanity check */
1503 if (ti->frequency == 0 || ti->mask == 0) 1504 if (ti->frequency == 0 || ti->mask == 0)
1504 BUG(); 1505 BUG();
1505 1506
1506 ti->nsec_per_cyc = ((u64)NSEC_PER_SEC << ti->shift) / ti->frequency; 1507 ti->nsec_per_cyc = ((u64)NSEC_PER_SEC << ti->shift) / ti->frequency;
1507 spin_lock(&time_interpolator_lock); 1508 spin_lock(&time_interpolator_lock);
1508 write_seqlock_irqsave(&xtime_lock, flags); 1509 write_seqlock_irqsave(&xtime_lock, flags);
1509 if (is_better_time_interpolator(ti)) { 1510 if (is_better_time_interpolator(ti)) {
1510 time_interpolator = ti; 1511 time_interpolator = ti;
1511 time_interpolator_reset(); 1512 time_interpolator_reset();
1512 } 1513 }
1513 write_sequnlock_irqrestore(&xtime_lock, flags); 1514 write_sequnlock_irqrestore(&xtime_lock, flags);
1514 1515
1515 ti->next = time_interpolator_list; 1516 ti->next = time_interpolator_list;
1516 time_interpolator_list = ti; 1517 time_interpolator_list = ti;
1517 spin_unlock(&time_interpolator_lock); 1518 spin_unlock(&time_interpolator_lock);
1518 } 1519 }
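nsec_per_cyc is a fixed-point scale factor: GET_TI_NSECS() converts a cycle delta to nanoseconds by multiplying by it and shifting right again. A small sketch with hypothetical hardware parameters (200 MHz counter, shift of 16):

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

int main(void)
{
	/* Hypothetical counter: 200 MHz, 16-bit fixed-point shift. */
	uint64_t frequency = 200000000;
	unsigned int shift = 16;
	uint64_t mask = 0xffffffffffffffffULL;

	/* Same computation as register_time_interpolator() above. */
	uint64_t nsec_per_cyc = (NSEC_PER_SEC << shift) / frequency;

	/* Convert a delta of 1000 cycles to nanoseconds, exactly as
	 * GET_TI_NSECS() does: mask the delta, multiply, shift back. */
	uint64_t last_counter = 123456, counter = 124456;
	uint64_t nsecs = (((counter - last_counter) & mask) * nsec_per_cyc) >> shift;

	printf("nsec_per_cyc (<<%u) = %llu\n", shift,
	       (unsigned long long)nsec_per_cyc);
	printf("1000 cycles at 200 MHz = %llu ns\n", (unsigned long long)nsecs);
	return 0;
}

With these numbers nsec_per_cyc comes out to 327680, and 1000 cycles convert to 5000 ns, i.e. 5 us at 200 MHz, as expected.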
1519 1520
1520 void 1521 void
1521 unregister_time_interpolator(struct time_interpolator *ti) 1522 unregister_time_interpolator(struct time_interpolator *ti)
1522 { 1523 {
1523 struct time_interpolator *curr, **prev; 1524 struct time_interpolator *curr, **prev;
1524 unsigned long flags; 1525 unsigned long flags;
1525 1526
1526 spin_lock(&time_interpolator_lock); 1527 spin_lock(&time_interpolator_lock);
1527 prev = &time_interpolator_list; 1528 prev = &time_interpolator_list;
1528 for (curr = *prev; curr; curr = curr->next) { 1529 for (curr = *prev; curr; curr = curr->next) {
1529 if (curr == ti) { 1530 if (curr == ti) {
1530 *prev = curr->next; 1531 *prev = curr->next;
1531 break; 1532 break;
1532 } 1533 }
1533 prev = &curr->next; 1534 prev = &curr->next;
1534 } 1535 }
1535 1536
1536 write_seqlock_irqsave(&xtime_lock, flags); 1537 write_seqlock_irqsave(&xtime_lock, flags);
1537 if (ti == time_interpolator) { 1538 if (ti == time_interpolator) {
1538 /* we lost the best time-interpolator: */ 1539 /* we lost the best time-interpolator: */
1539 time_interpolator = NULL; 1540 time_interpolator = NULL;
1540 /* find the next-best interpolator */ 1541 /* find the next-best interpolator */
1541 for (curr = time_interpolator_list; curr; curr = curr->next) 1542 for (curr = time_interpolator_list; curr; curr = curr->next)
1542 if (is_better_time_interpolator(curr)) 1543 if (is_better_time_interpolator(curr))
1543 time_interpolator = curr; 1544 time_interpolator = curr;
1544 time_interpolator_reset(); 1545 time_interpolator_reset();
1545 } 1546 }
1546 write_sequnlock_irqrestore(&xtime_lock, flags); 1547 write_sequnlock_irqrestore(&xtime_lock, flags);
1547 spin_unlock(&time_interpolator_lock); 1548 spin_unlock(&time_interpolator_lock);
1548 } 1549 }
1549 #endif /* CONFIG_TIME_INTERPOLATION */ 1550 #endif /* CONFIG_TIME_INTERPOLATION */
1550 1551
1551 /** 1552 /**
1552 * msleep - sleep safely even with waitqueue interruptions 1553 * msleep - sleep safely even with waitqueue interruptions
1553 * @msecs: Time in milliseconds to sleep for 1554 * @msecs: Time in milliseconds to sleep for
1554 */ 1555 */
1555 void msleep(unsigned int msecs) 1556 void msleep(unsigned int msecs)
1556 { 1557 {
1557 unsigned long timeout = msecs_to_jiffies(msecs) + 1; 1558 unsigned long timeout = msecs_to_jiffies(msecs) + 1;
1558 1559
1559 while (timeout) 1560 while (timeout)
1560 timeout = schedule_timeout_uninterruptible(timeout); 1561 timeout = schedule_timeout_uninterruptible(timeout);
1561 } 1562 }
1562 1563
1563 EXPORT_SYMBOL(msleep); 1564 EXPORT_SYMBOL(msleep);
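msleep() rounds the delay up (note the extra jiffy) and keeps sleeping across spurious wakeups, so it is the simple choice when a driver only needs a coarse, uninterruptible delay. A hypothetical 2.6-era module showing typical usage (not part of this patch):

#include <linux/module.h>
#include <linux/init.h>
#include <linux/delay.h>

/* Hypothetical demo: poll something slowly, sleeping at least 100 ms
 * between passes; msleep() never returns early. */
static int __init msleep_demo_init(void)
{
	int pass;

	for (pass = 0; pass < 5; pass++) {
		printk(KERN_INFO "msleep demo: pass %d\n", pass);
		msleep(100);
	}
	return 0;
}

static void __exit msleep_demo_exit(void)
{
}

module_init(msleep_demo_init);
module_exit(msleep_demo_exit);
MODULE_LICENSE("GPL");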
1564 1565
1565 /** 1566 /**
1566 * msleep_interruptible - sleep waiting for signals 1567 * msleep_interruptible - sleep waiting for signals
1567 * @msecs: Time in milliseconds to sleep for 1568 * @msecs: Time in milliseconds to sleep for
1568 */ 1569 */
1569 unsigned long msleep_interruptible(unsigned int msecs) 1570 unsigned long msleep_interruptible(unsigned int msecs)
1570 { 1571 {
1571 unsigned long timeout = msecs_to_jiffies(msecs) + 1; 1572 unsigned long timeout = msecs_to_jiffies(msecs) + 1;
1572 1573
1573 while (timeout && !signal_pending(current)) 1574 while (timeout && !signal_pending(current))
1574 timeout = schedule_timeout_interruptible(timeout); 1575 timeout = schedule_timeout_interruptible(timeout);
1575 return jiffies_to_msecs(timeout); 1576 return jiffies_to_msecs(timeout);
1576 } 1577 }
1577 1578
1578 EXPORT_SYMBOL(msleep_interruptible); 1579 EXPORT_SYMBOL(msleep_interruptible);
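msleep_interruptible(), by contrast, gives up as soon as a signal is pending and reports the unslept remainder in milliseconds. A hypothetical module demonstrating that behaviour (again, not part of this patch); a signal aimed at the inserting process ends the wait early:

#include <linux/module.h>
#include <linux/init.h>
#include <linux/delay.h>

static int __init msleep_intr_demo_init(void)
{
	unsigned long left;

	/* Sleep for up to two seconds; a signal delivered to the process
	 * loading the module (e.g. Ctrl-C against insmod) cuts the wait
	 * short and the unslept time comes back in milliseconds. */
	left = msleep_interruptible(2000);
	printk(KERN_INFO "msleep_interruptible: %lu ms left\n", left);
	return 0;
}

static void __exit msleep_intr_demo_exit(void)
{
}

module_init(msleep_intr_demo_init);
module_exit(msleep_intr_demo_exit);
MODULE_LICENSE("GPL");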
1579 1580