Commit 97a41e26124330e41aa10ef88cd1711bc3d17460
Committed by Linus Torvalds
1 parent b7b4d7a466
Exists in master and in 7 other branches
[PATCH] kernel/: small cleanups
This patch contains the following cleanups:

- make needlessly global functions static
- every file should include the headers containing the prototypes for its global functions

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Acked-by: "Paul E. McKenney" <paulmck@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Showing 4 changed files with 5 additions and 2 deletions
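For context, a minimal sketch of the cleanup pattern described in the commit message (illustrative only; foo.h, foo.c and the function names are hypothetical and not taken from this commit): a helper used by a single file is marked static, and the .c file includes its own header so the compiler checks the prototypes of the functions that remain global.

    /* foo.h -- public interface (hypothetical example, not part of this commit) */
    #ifndef FOO_H
    #define FOO_H
    int foo_process(int value);	/* prototype for the function other files call */
    #endif

    /* foo.c */
    #include "foo.h"		/* include our own header so the prototype above is checked */

    /* Only used inside this file, so it is static rather than needlessly global. */
    static int foo_scale(int value)
    {
    	return value * 2;
    }

    int foo_process(int value)	/* a mismatch with foo.h is now a compile error */
    {
    	return foo_scale(value) + 1;
    }

In the diff below, the same idea is applied to kauditd_thread() in kernel/audit.c, which becomes static (line 270).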
kernel/audit.c
1 | /* audit.c -- Auditing support | 1 | /* audit.c -- Auditing support |
2 | * Gateway between the kernel (e.g., selinux) and the user-space audit daemon. | 2 | * Gateway between the kernel (e.g., selinux) and the user-space audit daemon. |
3 | * System-call specific features have moved to auditsc.c | 3 | * System-call specific features have moved to auditsc.c |
4 | * | 4 | * |
5 | * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. | 5 | * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. |
6 | * All Rights Reserved. | 6 | * All Rights Reserved. |
7 | * | 7 | * |
8 | * This program is free software; you can redistribute it and/or modify | 8 | * This program is free software; you can redistribute it and/or modify |
9 | * it under the terms of the GNU General Public License as published by | 9 | * it under the terms of the GNU General Public License as published by |
10 | * the Free Software Foundation; either version 2 of the License, or | 10 | * the Free Software Foundation; either version 2 of the License, or |
11 | * (at your option) any later version. | 11 | * (at your option) any later version. |
12 | * | 12 | * |
13 | * This program is distributed in the hope that it will be useful, | 13 | * This program is distributed in the hope that it will be useful, |
14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
16 | * GNU General Public License for more details. | 16 | * GNU General Public License for more details. |
17 | * | 17 | * |
18 | * You should have received a copy of the GNU General Public License | 18 | * You should have received a copy of the GNU General Public License |
19 | * along with this program; if not, write to the Free Software | 19 | * along with this program; if not, write to the Free Software |
20 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | 20 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
21 | * | 21 | * |
22 | * Written by Rickard E. (Rik) Faith <faith@redhat.com> | 22 | * Written by Rickard E. (Rik) Faith <faith@redhat.com> |
23 | * | 23 | * |
24 | * Goals: 1) Integrate fully with SELinux. | 24 | * Goals: 1) Integrate fully with SELinux. |
25 | * 2) Minimal run-time overhead: | 25 | * 2) Minimal run-time overhead: |
26 | * a) Minimal when syscall auditing is disabled (audit_enable=0). | 26 | * a) Minimal when syscall auditing is disabled (audit_enable=0). |
27 | * b) Small when syscall auditing is enabled and no audit record | 27 | * b) Small when syscall auditing is enabled and no audit record |
28 | * is generated (defer as much work as possible to record | 28 | * is generated (defer as much work as possible to record |
29 | * generation time): | 29 | * generation time): |
30 | * i) context is allocated, | 30 | * i) context is allocated, |
31 | * ii) names from getname are stored without a copy, and | 31 | * ii) names from getname are stored without a copy, and |
32 | * iii) inode information stored from path_lookup. | 32 | * iii) inode information stored from path_lookup. |
33 | * 3) Ability to disable syscall auditing at boot time (audit=0). | 33 | * 3) Ability to disable syscall auditing at boot time (audit=0). |
34 | * 4) Usable by other parts of the kernel (if audit_log* is called, | 34 | * 4) Usable by other parts of the kernel (if audit_log* is called, |
35 | * then a syscall record will be generated automatically for the | 35 | * then a syscall record will be generated automatically for the |
36 | * current syscall). | 36 | * current syscall). |
37 | * 5) Netlink interface to user-space. | 37 | * 5) Netlink interface to user-space. |
38 | * 6) Support low-overhead kernel-based filtering to minimize the | 38 | * 6) Support low-overhead kernel-based filtering to minimize the |
39 | * information that must be passed to user-space. | 39 | * information that must be passed to user-space. |
40 | * | 40 | * |
41 | * Example user-space utilities: http://people.redhat.com/sgrubb/audit/ | 41 | * Example user-space utilities: http://people.redhat.com/sgrubb/audit/ |
42 | */ | 42 | */ |
43 | 43 | ||
44 | #include <linux/init.h> | 44 | #include <linux/init.h> |
45 | #include <asm/atomic.h> | 45 | #include <asm/atomic.h> |
46 | #include <asm/types.h> | 46 | #include <asm/types.h> |
47 | #include <linux/mm.h> | 47 | #include <linux/mm.h> |
48 | #include <linux/module.h> | 48 | #include <linux/module.h> |
49 | #include <linux/err.h> | 49 | #include <linux/err.h> |
50 | #include <linux/kthread.h> | 50 | #include <linux/kthread.h> |
51 | 51 | ||
52 | #include <linux/audit.h> | 52 | #include <linux/audit.h> |
53 | 53 | ||
54 | #include <net/sock.h> | 54 | #include <net/sock.h> |
55 | #include <linux/skbuff.h> | 55 | #include <linux/skbuff.h> |
56 | #include <linux/netlink.h> | 56 | #include <linux/netlink.h> |
57 | 57 | ||
58 | /* No auditing will take place until audit_initialized != 0. | 58 | /* No auditing will take place until audit_initialized != 0. |
59 | * (Initialization happens after skb_init is called.) */ | 59 | * (Initialization happens after skb_init is called.) */ |
60 | static int audit_initialized; | 60 | static int audit_initialized; |
61 | 61 | ||
62 | /* No syscall auditing will take place unless audit_enabled != 0. */ | 62 | /* No syscall auditing will take place unless audit_enabled != 0. */ |
63 | int audit_enabled; | 63 | int audit_enabled; |
64 | 64 | ||
65 | /* Default state when kernel boots without any parameters. */ | 65 | /* Default state when kernel boots without any parameters. */ |
66 | static int audit_default; | 66 | static int audit_default; |
67 | 67 | ||
68 | /* If auditing cannot proceed, audit_failure selects what happens. */ | 68 | /* If auditing cannot proceed, audit_failure selects what happens. */ |
69 | static int audit_failure = AUDIT_FAIL_PRINTK; | 69 | static int audit_failure = AUDIT_FAIL_PRINTK; |
70 | 70 | ||
71 | /* If audit records are to be written to the netlink socket, audit_pid | 71 | /* If audit records are to be written to the netlink socket, audit_pid |
72 | * contains the (non-zero) pid. */ | 72 | * contains the (non-zero) pid. */ |
73 | int audit_pid; | 73 | int audit_pid; |
74 | 74 | ||
75 | /* If audit_limit is non-zero, limit the rate of sending audit records | 75 | /* If audit_limit is non-zero, limit the rate of sending audit records |
76 | * to that number per second. This prevents DoS attacks, but results in | 76 | * to that number per second. This prevents DoS attacks, but results in |
77 | * audit records being dropped. */ | 77 | * audit records being dropped. */ |
78 | static int audit_rate_limit; | 78 | static int audit_rate_limit; |
79 | 79 | ||
80 | /* Number of outstanding audit_buffers allowed. */ | 80 | /* Number of outstanding audit_buffers allowed. */ |
81 | static int audit_backlog_limit = 64; | 81 | static int audit_backlog_limit = 64; |
82 | static int audit_backlog_wait_time = 60 * HZ; | 82 | static int audit_backlog_wait_time = 60 * HZ; |
83 | static int audit_backlog_wait_overflow = 0; | 83 | static int audit_backlog_wait_overflow = 0; |
84 | 84 | ||
85 | /* The identity of the user shutting down the audit system. */ | 85 | /* The identity of the user shutting down the audit system. */ |
86 | uid_t audit_sig_uid = -1; | 86 | uid_t audit_sig_uid = -1; |
87 | pid_t audit_sig_pid = -1; | 87 | pid_t audit_sig_pid = -1; |
88 | 88 | ||
89 | /* Records can be lost in several ways: | 89 | /* Records can be lost in several ways: |
90 | 0) [suppressed in audit_alloc] | 90 | 0) [suppressed in audit_alloc] |
91 | 1) out of memory in audit_log_start [kmalloc of struct audit_buffer] | 91 | 1) out of memory in audit_log_start [kmalloc of struct audit_buffer] |
92 | 2) out of memory in audit_log_move [alloc_skb] | 92 | 2) out of memory in audit_log_move [alloc_skb] |
93 | 3) suppressed due to audit_rate_limit | 93 | 3) suppressed due to audit_rate_limit |
94 | 4) suppressed due to audit_backlog_limit | 94 | 4) suppressed due to audit_backlog_limit |
95 | */ | 95 | */ |
96 | static atomic_t audit_lost = ATOMIC_INIT(0); | 96 | static atomic_t audit_lost = ATOMIC_INIT(0); |
97 | 97 | ||
98 | /* The netlink socket. */ | 98 | /* The netlink socket. */ |
99 | static struct sock *audit_sock; | 99 | static struct sock *audit_sock; |
100 | 100 | ||
101 | /* The audit_freelist is a list of pre-allocated audit buffers (if more | 101 | /* The audit_freelist is a list of pre-allocated audit buffers (if more |
102 | * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of | 102 | * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of |
103 | * being placed on the freelist). */ | 103 | * being placed on the freelist). */ |
104 | static DEFINE_SPINLOCK(audit_freelist_lock); | 104 | static DEFINE_SPINLOCK(audit_freelist_lock); |
105 | static int audit_freelist_count = 0; | 105 | static int audit_freelist_count = 0; |
106 | static LIST_HEAD(audit_freelist); | 106 | static LIST_HEAD(audit_freelist); |
107 | 107 | ||
108 | static struct sk_buff_head audit_skb_queue; | 108 | static struct sk_buff_head audit_skb_queue; |
109 | static struct task_struct *kauditd_task; | 109 | static struct task_struct *kauditd_task; |
110 | static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait); | 110 | static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait); |
111 | static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait); | 111 | static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait); |
112 | 112 | ||
113 | /* The netlink socket is only to be read by 1 CPU, which lets us assume | 113 | /* The netlink socket is only to be read by 1 CPU, which lets us assume |
114 | * that list additions and deletions never happen simultaneously in | 114 | * that list additions and deletions never happen simultaneously in |
115 | * auditsc.c */ | 115 | * auditsc.c */ |
116 | DECLARE_MUTEX(audit_netlink_sem); | 116 | DECLARE_MUTEX(audit_netlink_sem); |
117 | 117 | ||
118 | /* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting | 118 | /* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting |
119 | * audit records. Since printk uses a 1024 byte buffer, this buffer | 119 | * audit records. Since printk uses a 1024 byte buffer, this buffer |
120 | * should be at least that large. */ | 120 | * should be at least that large. */ |
121 | #define AUDIT_BUFSIZ 1024 | 121 | #define AUDIT_BUFSIZ 1024 |
122 | 122 | ||
123 | /* AUDIT_MAXFREE is the number of empty audit_buffers we keep on the | 123 | /* AUDIT_MAXFREE is the number of empty audit_buffers we keep on the |
124 | * audit_freelist. Doing so eliminates many kmalloc/kfree calls. */ | 124 | * audit_freelist. Doing so eliminates many kmalloc/kfree calls. */ |
125 | #define AUDIT_MAXFREE (2*NR_CPUS) | 125 | #define AUDIT_MAXFREE (2*NR_CPUS) |
126 | 126 | ||
127 | /* The audit_buffer is used when formatting an audit record. The caller | 127 | /* The audit_buffer is used when formatting an audit record. The caller |
128 | * locks briefly to get the record off the freelist or to allocate the | 128 | * locks briefly to get the record off the freelist or to allocate the |
129 | * buffer, and locks briefly to send the buffer to the netlink layer or | 129 | * buffer, and locks briefly to send the buffer to the netlink layer or |
130 | * to place it on a transmit queue. Multiple audit_buffers can be in | 130 | * to place it on a transmit queue. Multiple audit_buffers can be in |
131 | * use simultaneously. */ | 131 | * use simultaneously. */ |
132 | struct audit_buffer { | 132 | struct audit_buffer { |
133 | struct list_head list; | 133 | struct list_head list; |
134 | struct sk_buff *skb; /* formatted skb ready to send */ | 134 | struct sk_buff *skb; /* formatted skb ready to send */ |
135 | struct audit_context *ctx; /* NULL or associated context */ | 135 | struct audit_context *ctx; /* NULL or associated context */ |
136 | gfp_t gfp_mask; | 136 | gfp_t gfp_mask; |
137 | }; | 137 | }; |
138 | 138 | ||
139 | static void audit_set_pid(struct audit_buffer *ab, pid_t pid) | 139 | static void audit_set_pid(struct audit_buffer *ab, pid_t pid) |
140 | { | 140 | { |
141 | struct nlmsghdr *nlh = (struct nlmsghdr *)ab->skb->data; | 141 | struct nlmsghdr *nlh = (struct nlmsghdr *)ab->skb->data; |
142 | nlh->nlmsg_pid = pid; | 142 | nlh->nlmsg_pid = pid; |
143 | } | 143 | } |
144 | 144 | ||
145 | static void audit_panic(const char *message) | 145 | static void audit_panic(const char *message) |
146 | { | 146 | { |
147 | switch (audit_failure) | 147 | switch (audit_failure) |
148 | { | 148 | { |
149 | case AUDIT_FAIL_SILENT: | 149 | case AUDIT_FAIL_SILENT: |
150 | break; | 150 | break; |
151 | case AUDIT_FAIL_PRINTK: | 151 | case AUDIT_FAIL_PRINTK: |
152 | printk(KERN_ERR "audit: %s\n", message); | 152 | printk(KERN_ERR "audit: %s\n", message); |
153 | break; | 153 | break; |
154 | case AUDIT_FAIL_PANIC: | 154 | case AUDIT_FAIL_PANIC: |
155 | panic("audit: %s\n", message); | 155 | panic("audit: %s\n", message); |
156 | break; | 156 | break; |
157 | } | 157 | } |
158 | } | 158 | } |
159 | 159 | ||
160 | static inline int audit_rate_check(void) | 160 | static inline int audit_rate_check(void) |
161 | { | 161 | { |
162 | static unsigned long last_check = 0; | 162 | static unsigned long last_check = 0; |
163 | static int messages = 0; | 163 | static int messages = 0; |
164 | static DEFINE_SPINLOCK(lock); | 164 | static DEFINE_SPINLOCK(lock); |
165 | unsigned long flags; | 165 | unsigned long flags; |
166 | unsigned long now; | 166 | unsigned long now; |
167 | unsigned long elapsed; | 167 | unsigned long elapsed; |
168 | int retval = 0; | 168 | int retval = 0; |
169 | 169 | ||
170 | if (!audit_rate_limit) return 1; | 170 | if (!audit_rate_limit) return 1; |
171 | 171 | ||
172 | spin_lock_irqsave(&lock, flags); | 172 | spin_lock_irqsave(&lock, flags); |
173 | if (++messages < audit_rate_limit) { | 173 | if (++messages < audit_rate_limit) { |
174 | retval = 1; | 174 | retval = 1; |
175 | } else { | 175 | } else { |
176 | now = jiffies; | 176 | now = jiffies; |
177 | elapsed = now - last_check; | 177 | elapsed = now - last_check; |
178 | if (elapsed > HZ) { | 178 | if (elapsed > HZ) { |
179 | last_check = now; | 179 | last_check = now; |
180 | messages = 0; | 180 | messages = 0; |
181 | retval = 1; | 181 | retval = 1; |
182 | } | 182 | } |
183 | } | 183 | } |
184 | spin_unlock_irqrestore(&lock, flags); | 184 | spin_unlock_irqrestore(&lock, flags); |
185 | 185 | ||
186 | return retval; | 186 | return retval; |
187 | } | 187 | } |
188 | 188 | ||
189 | /* Emit at least 1 message per second, even if audit_rate_check is | 189 | /* Emit at least 1 message per second, even if audit_rate_check is |
190 | * throttling. */ | 190 | * throttling. */ |
191 | void audit_log_lost(const char *message) | 191 | void audit_log_lost(const char *message) |
192 | { | 192 | { |
193 | static unsigned long last_msg = 0; | 193 | static unsigned long last_msg = 0; |
194 | static DEFINE_SPINLOCK(lock); | 194 | static DEFINE_SPINLOCK(lock); |
195 | unsigned long flags; | 195 | unsigned long flags; |
196 | unsigned long now; | 196 | unsigned long now; |
197 | int print; | 197 | int print; |
198 | 198 | ||
199 | atomic_inc(&audit_lost); | 199 | atomic_inc(&audit_lost); |
200 | 200 | ||
201 | print = (audit_failure == AUDIT_FAIL_PANIC || !audit_rate_limit); | 201 | print = (audit_failure == AUDIT_FAIL_PANIC || !audit_rate_limit); |
202 | 202 | ||
203 | if (!print) { | 203 | if (!print) { |
204 | spin_lock_irqsave(&lock, flags); | 204 | spin_lock_irqsave(&lock, flags); |
205 | now = jiffies; | 205 | now = jiffies; |
206 | if (now - last_msg > HZ) { | 206 | if (now - last_msg > HZ) { |
207 | print = 1; | 207 | print = 1; |
208 | last_msg = now; | 208 | last_msg = now; |
209 | } | 209 | } |
210 | spin_unlock_irqrestore(&lock, flags); | 210 | spin_unlock_irqrestore(&lock, flags); |
211 | } | 211 | } |
212 | 212 | ||
213 | if (print) { | 213 | if (print) { |
214 | printk(KERN_WARNING | 214 | printk(KERN_WARNING |
215 | "audit: audit_lost=%d audit_rate_limit=%d audit_backlog_limit=%d\n", | 215 | "audit: audit_lost=%d audit_rate_limit=%d audit_backlog_limit=%d\n", |
216 | atomic_read(&audit_lost), | 216 | atomic_read(&audit_lost), |
217 | audit_rate_limit, | 217 | audit_rate_limit, |
218 | audit_backlog_limit); | 218 | audit_backlog_limit); |
219 | audit_panic(message); | 219 | audit_panic(message); |
220 | } | 220 | } |
221 | 221 | ||
222 | } | 222 | } |
223 | 223 | ||
224 | static int audit_set_rate_limit(int limit, uid_t loginuid) | 224 | static int audit_set_rate_limit(int limit, uid_t loginuid) |
225 | { | 225 | { |
226 | int old = audit_rate_limit; | 226 | int old = audit_rate_limit; |
227 | audit_rate_limit = limit; | 227 | audit_rate_limit = limit; |
228 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | 228 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, |
229 | "audit_rate_limit=%d old=%d by auid=%u", | 229 | "audit_rate_limit=%d old=%d by auid=%u", |
230 | audit_rate_limit, old, loginuid); | 230 | audit_rate_limit, old, loginuid); |
231 | return old; | 231 | return old; |
232 | } | 232 | } |
233 | 233 | ||
234 | static int audit_set_backlog_limit(int limit, uid_t loginuid) | 234 | static int audit_set_backlog_limit(int limit, uid_t loginuid) |
235 | { | 235 | { |
236 | int old = audit_backlog_limit; | 236 | int old = audit_backlog_limit; |
237 | audit_backlog_limit = limit; | 237 | audit_backlog_limit = limit; |
238 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | 238 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, |
239 | "audit_backlog_limit=%d old=%d by auid=%u", | 239 | "audit_backlog_limit=%d old=%d by auid=%u", |
240 | audit_backlog_limit, old, loginuid); | 240 | audit_backlog_limit, old, loginuid); |
241 | return old; | 241 | return old; |
242 | } | 242 | } |
243 | 243 | ||
244 | static int audit_set_enabled(int state, uid_t loginuid) | 244 | static int audit_set_enabled(int state, uid_t loginuid) |
245 | { | 245 | { |
246 | int old = audit_enabled; | 246 | int old = audit_enabled; |
247 | if (state != 0 && state != 1) | 247 | if (state != 0 && state != 1) |
248 | return -EINVAL; | 248 | return -EINVAL; |
249 | audit_enabled = state; | 249 | audit_enabled = state; |
250 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | 250 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, |
251 | "audit_enabled=%d old=%d by auid=%u", | 251 | "audit_enabled=%d old=%d by auid=%u", |
252 | audit_enabled, old, loginuid); | 252 | audit_enabled, old, loginuid); |
253 | return old; | 253 | return old; |
254 | } | 254 | } |
255 | 255 | ||
256 | static int audit_set_failure(int state, uid_t loginuid) | 256 | static int audit_set_failure(int state, uid_t loginuid) |
257 | { | 257 | { |
258 | int old = audit_failure; | 258 | int old = audit_failure; |
259 | if (state != AUDIT_FAIL_SILENT | 259 | if (state != AUDIT_FAIL_SILENT |
260 | && state != AUDIT_FAIL_PRINTK | 260 | && state != AUDIT_FAIL_PRINTK |
261 | && state != AUDIT_FAIL_PANIC) | 261 | && state != AUDIT_FAIL_PANIC) |
262 | return -EINVAL; | 262 | return -EINVAL; |
263 | audit_failure = state; | 263 | audit_failure = state; |
264 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | 264 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, |
265 | "audit_failure=%d old=%d by auid=%u", | 265 | "audit_failure=%d old=%d by auid=%u", |
266 | audit_failure, old, loginuid); | 266 | audit_failure, old, loginuid); |
267 | return old; | 267 | return old; |
268 | } | 268 | } |
269 | 269 | ||
270 | int kauditd_thread(void *dummy) | 270 | static int kauditd_thread(void *dummy) |
271 | { | 271 | { |
272 | struct sk_buff *skb; | 272 | struct sk_buff *skb; |
273 | 273 | ||
274 | while (1) { | 274 | while (1) { |
275 | skb = skb_dequeue(&audit_skb_queue); | 275 | skb = skb_dequeue(&audit_skb_queue); |
276 | wake_up(&audit_backlog_wait); | 276 | wake_up(&audit_backlog_wait); |
277 | if (skb) { | 277 | if (skb) { |
278 | if (audit_pid) { | 278 | if (audit_pid) { |
279 | int err = netlink_unicast(audit_sock, skb, audit_pid, 0); | 279 | int err = netlink_unicast(audit_sock, skb, audit_pid, 0); |
280 | if (err < 0) { | 280 | if (err < 0) { |
281 | BUG_ON(err != -ECONNREFUSED); /* Shoudn't happen */ | 281 | BUG_ON(err != -ECONNREFUSED); /* Shoudn't happen */ |
282 | printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); | 282 | printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); |
283 | audit_pid = 0; | 283 | audit_pid = 0; |
284 | } | 284 | } |
285 | } else { | 285 | } else { |
286 | printk(KERN_NOTICE "%s\n", skb->data + NLMSG_SPACE(0)); | 286 | printk(KERN_NOTICE "%s\n", skb->data + NLMSG_SPACE(0)); |
287 | kfree_skb(skb); | 287 | kfree_skb(skb); |
288 | } | 288 | } |
289 | } else { | 289 | } else { |
290 | DECLARE_WAITQUEUE(wait, current); | 290 | DECLARE_WAITQUEUE(wait, current); |
291 | set_current_state(TASK_INTERRUPTIBLE); | 291 | set_current_state(TASK_INTERRUPTIBLE); |
292 | add_wait_queue(&kauditd_wait, &wait); | 292 | add_wait_queue(&kauditd_wait, &wait); |
293 | 293 | ||
294 | if (!skb_queue_len(&audit_skb_queue)) { | 294 | if (!skb_queue_len(&audit_skb_queue)) { |
295 | try_to_freeze(); | 295 | try_to_freeze(); |
296 | schedule(); | 296 | schedule(); |
297 | } | 297 | } |
298 | 298 | ||
299 | __set_current_state(TASK_RUNNING); | 299 | __set_current_state(TASK_RUNNING); |
300 | remove_wait_queue(&kauditd_wait, &wait); | 300 | remove_wait_queue(&kauditd_wait, &wait); |
301 | } | 301 | } |
302 | } | 302 | } |
303 | } | 303 | } |
304 | 304 | ||
305 | void audit_send_reply(int pid, int seq, int type, int done, int multi, | 305 | void audit_send_reply(int pid, int seq, int type, int done, int multi, |
306 | void *payload, int size) | 306 | void *payload, int size) |
307 | { | 307 | { |
308 | struct sk_buff *skb; | 308 | struct sk_buff *skb; |
309 | struct nlmsghdr *nlh; | 309 | struct nlmsghdr *nlh; |
310 | int len = NLMSG_SPACE(size); | 310 | int len = NLMSG_SPACE(size); |
311 | void *data; | 311 | void *data; |
312 | int flags = multi ? NLM_F_MULTI : 0; | 312 | int flags = multi ? NLM_F_MULTI : 0; |
313 | int t = done ? NLMSG_DONE : type; | 313 | int t = done ? NLMSG_DONE : type; |
314 | 314 | ||
315 | skb = alloc_skb(len, GFP_KERNEL); | 315 | skb = alloc_skb(len, GFP_KERNEL); |
316 | if (!skb) | 316 | if (!skb) |
317 | return; | 317 | return; |
318 | 318 | ||
319 | nlh = NLMSG_PUT(skb, pid, seq, t, size); | 319 | nlh = NLMSG_PUT(skb, pid, seq, t, size); |
320 | nlh->nlmsg_flags = flags; | 320 | nlh->nlmsg_flags = flags; |
321 | data = NLMSG_DATA(nlh); | 321 | data = NLMSG_DATA(nlh); |
322 | memcpy(data, payload, size); | 322 | memcpy(data, payload, size); |
323 | 323 | ||
324 | /* Ignore failure. It'll only happen if the sender goes away, | 324 | /* Ignore failure. It'll only happen if the sender goes away, |
325 | because our timeout is set to infinite. */ | 325 | because our timeout is set to infinite. */ |
326 | netlink_unicast(audit_sock, skb, pid, 0); | 326 | netlink_unicast(audit_sock, skb, pid, 0); |
327 | return; | 327 | return; |
328 | 328 | ||
329 | nlmsg_failure: /* Used by NLMSG_PUT */ | 329 | nlmsg_failure: /* Used by NLMSG_PUT */ |
330 | if (skb) | 330 | if (skb) |
331 | kfree_skb(skb); | 331 | kfree_skb(skb); |
332 | } | 332 | } |
333 | 333 | ||
334 | /* | 334 | /* |
335 | * Check for appropriate CAP_AUDIT_ capabilities on incoming audit | 335 | * Check for appropriate CAP_AUDIT_ capabilities on incoming audit |
336 | * control messages. | 336 | * control messages. |
337 | */ | 337 | */ |
338 | static int audit_netlink_ok(kernel_cap_t eff_cap, u16 msg_type) | 338 | static int audit_netlink_ok(kernel_cap_t eff_cap, u16 msg_type) |
339 | { | 339 | { |
340 | int err = 0; | 340 | int err = 0; |
341 | 341 | ||
342 | switch (msg_type) { | 342 | switch (msg_type) { |
343 | case AUDIT_GET: | 343 | case AUDIT_GET: |
344 | case AUDIT_LIST: | 344 | case AUDIT_LIST: |
345 | case AUDIT_SET: | 345 | case AUDIT_SET: |
346 | case AUDIT_ADD: | 346 | case AUDIT_ADD: |
347 | case AUDIT_DEL: | 347 | case AUDIT_DEL: |
348 | case AUDIT_SIGNAL_INFO: | 348 | case AUDIT_SIGNAL_INFO: |
349 | if (!cap_raised(eff_cap, CAP_AUDIT_CONTROL)) | 349 | if (!cap_raised(eff_cap, CAP_AUDIT_CONTROL)) |
350 | err = -EPERM; | 350 | err = -EPERM; |
351 | break; | 351 | break; |
352 | case AUDIT_USER: | 352 | case AUDIT_USER: |
353 | case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG: | 353 | case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG: |
354 | if (!cap_raised(eff_cap, CAP_AUDIT_WRITE)) | 354 | if (!cap_raised(eff_cap, CAP_AUDIT_WRITE)) |
355 | err = -EPERM; | 355 | err = -EPERM; |
356 | break; | 356 | break; |
357 | default: /* bad msg */ | 357 | default: /* bad msg */ |
358 | err = -EINVAL; | 358 | err = -EINVAL; |
359 | } | 359 | } |
360 | 360 | ||
361 | return err; | 361 | return err; |
362 | } | 362 | } |
363 | 363 | ||
364 | static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | 364 | static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) |
365 | { | 365 | { |
366 | u32 uid, pid, seq; | 366 | u32 uid, pid, seq; |
367 | void *data; | 367 | void *data; |
368 | struct audit_status *status_get, status_set; | 368 | struct audit_status *status_get, status_set; |
369 | int err; | 369 | int err; |
370 | struct audit_buffer *ab; | 370 | struct audit_buffer *ab; |
371 | u16 msg_type = nlh->nlmsg_type; | 371 | u16 msg_type = nlh->nlmsg_type; |
372 | uid_t loginuid; /* loginuid of sender */ | 372 | uid_t loginuid; /* loginuid of sender */ |
373 | struct audit_sig_info sig_data; | 373 | struct audit_sig_info sig_data; |
374 | 374 | ||
375 | err = audit_netlink_ok(NETLINK_CB(skb).eff_cap, msg_type); | 375 | err = audit_netlink_ok(NETLINK_CB(skb).eff_cap, msg_type); |
376 | if (err) | 376 | if (err) |
377 | return err; | 377 | return err; |
378 | 378 | ||
379 | /* As soon as there's any sign of userspace auditd, start kauditd to talk to it */ | 379 | /* As soon as there's any sign of userspace auditd, start kauditd to talk to it */ |
380 | if (!kauditd_task) | 380 | if (!kauditd_task) |
381 | kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd"); | 381 | kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd"); |
382 | if (IS_ERR(kauditd_task)) { | 382 | if (IS_ERR(kauditd_task)) { |
383 | err = PTR_ERR(kauditd_task); | 383 | err = PTR_ERR(kauditd_task); |
384 | kauditd_task = NULL; | 384 | kauditd_task = NULL; |
385 | return err; | 385 | return err; |
386 | } | 386 | } |
387 | 387 | ||
388 | pid = NETLINK_CREDS(skb)->pid; | 388 | pid = NETLINK_CREDS(skb)->pid; |
389 | uid = NETLINK_CREDS(skb)->uid; | 389 | uid = NETLINK_CREDS(skb)->uid; |
390 | loginuid = NETLINK_CB(skb).loginuid; | 390 | loginuid = NETLINK_CB(skb).loginuid; |
391 | seq = nlh->nlmsg_seq; | 391 | seq = nlh->nlmsg_seq; |
392 | data = NLMSG_DATA(nlh); | 392 | data = NLMSG_DATA(nlh); |
393 | 393 | ||
394 | switch (msg_type) { | 394 | switch (msg_type) { |
395 | case AUDIT_GET: | 395 | case AUDIT_GET: |
396 | status_set.enabled = audit_enabled; | 396 | status_set.enabled = audit_enabled; |
397 | status_set.failure = audit_failure; | 397 | status_set.failure = audit_failure; |
398 | status_set.pid = audit_pid; | 398 | status_set.pid = audit_pid; |
399 | status_set.rate_limit = audit_rate_limit; | 399 | status_set.rate_limit = audit_rate_limit; |
400 | status_set.backlog_limit = audit_backlog_limit; | 400 | status_set.backlog_limit = audit_backlog_limit; |
401 | status_set.lost = atomic_read(&audit_lost); | 401 | status_set.lost = atomic_read(&audit_lost); |
402 | status_set.backlog = skb_queue_len(&audit_skb_queue); | 402 | status_set.backlog = skb_queue_len(&audit_skb_queue); |
403 | audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_GET, 0, 0, | 403 | audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_GET, 0, 0, |
404 | &status_set, sizeof(status_set)); | 404 | &status_set, sizeof(status_set)); |
405 | break; | 405 | break; |
406 | case AUDIT_SET: | 406 | case AUDIT_SET: |
407 | if (nlh->nlmsg_len < sizeof(struct audit_status)) | 407 | if (nlh->nlmsg_len < sizeof(struct audit_status)) |
408 | return -EINVAL; | 408 | return -EINVAL; |
409 | status_get = (struct audit_status *)data; | 409 | status_get = (struct audit_status *)data; |
410 | if (status_get->mask & AUDIT_STATUS_ENABLED) { | 410 | if (status_get->mask & AUDIT_STATUS_ENABLED) { |
411 | err = audit_set_enabled(status_get->enabled, loginuid); | 411 | err = audit_set_enabled(status_get->enabled, loginuid); |
412 | if (err < 0) return err; | 412 | if (err < 0) return err; |
413 | } | 413 | } |
414 | if (status_get->mask & AUDIT_STATUS_FAILURE) { | 414 | if (status_get->mask & AUDIT_STATUS_FAILURE) { |
415 | err = audit_set_failure(status_get->failure, loginuid); | 415 | err = audit_set_failure(status_get->failure, loginuid); |
416 | if (err < 0) return err; | 416 | if (err < 0) return err; |
417 | } | 417 | } |
418 | if (status_get->mask & AUDIT_STATUS_PID) { | 418 | if (status_get->mask & AUDIT_STATUS_PID) { |
419 | int old = audit_pid; | 419 | int old = audit_pid; |
420 | audit_pid = status_get->pid; | 420 | audit_pid = status_get->pid; |
421 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | 421 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, |
422 | "audit_pid=%d old=%d by auid=%u", | 422 | "audit_pid=%d old=%d by auid=%u", |
423 | audit_pid, old, loginuid); | 423 | audit_pid, old, loginuid); |
424 | } | 424 | } |
425 | if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) | 425 | if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) |
426 | audit_set_rate_limit(status_get->rate_limit, loginuid); | 426 | audit_set_rate_limit(status_get->rate_limit, loginuid); |
427 | if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT) | 427 | if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT) |
428 | audit_set_backlog_limit(status_get->backlog_limit, | 428 | audit_set_backlog_limit(status_get->backlog_limit, |
429 | loginuid); | 429 | loginuid); |
430 | break; | 430 | break; |
431 | case AUDIT_USER: | 431 | case AUDIT_USER: |
432 | case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG: | 432 | case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG: |
433 | if (!audit_enabled && msg_type != AUDIT_USER_AVC) | 433 | if (!audit_enabled && msg_type != AUDIT_USER_AVC) |
434 | return 0; | 434 | return 0; |
435 | 435 | ||
436 | err = audit_filter_user(&NETLINK_CB(skb), msg_type); | 436 | err = audit_filter_user(&NETLINK_CB(skb), msg_type); |
437 | if (err == 1) { | 437 | if (err == 1) { |
438 | err = 0; | 438 | err = 0; |
439 | ab = audit_log_start(NULL, GFP_KERNEL, msg_type); | 439 | ab = audit_log_start(NULL, GFP_KERNEL, msg_type); |
440 | if (ab) { | 440 | if (ab) { |
441 | audit_log_format(ab, | 441 | audit_log_format(ab, |
442 | "user pid=%d uid=%u auid=%u msg='%.1024s'", | 442 | "user pid=%d uid=%u auid=%u msg='%.1024s'", |
443 | pid, uid, loginuid, (char *)data); | 443 | pid, uid, loginuid, (char *)data); |
444 | audit_set_pid(ab, pid); | 444 | audit_set_pid(ab, pid); |
445 | audit_log_end(ab); | 445 | audit_log_end(ab); |
446 | } | 446 | } |
447 | } | 447 | } |
448 | break; | 448 | break; |
449 | case AUDIT_ADD: | 449 | case AUDIT_ADD: |
450 | case AUDIT_DEL: | 450 | case AUDIT_DEL: |
451 | if (nlh->nlmsg_len < sizeof(struct audit_rule)) | 451 | if (nlh->nlmsg_len < sizeof(struct audit_rule)) |
452 | return -EINVAL; | 452 | return -EINVAL; |
453 | /* fallthrough */ | 453 | /* fallthrough */ |
454 | case AUDIT_LIST: | 454 | case AUDIT_LIST: |
455 | err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid, | 455 | err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid, |
456 | uid, seq, data, loginuid); | 456 | uid, seq, data, loginuid); |
457 | break; | 457 | break; |
458 | case AUDIT_SIGNAL_INFO: | 458 | case AUDIT_SIGNAL_INFO: |
459 | sig_data.uid = audit_sig_uid; | 459 | sig_data.uid = audit_sig_uid; |
460 | sig_data.pid = audit_sig_pid; | 460 | sig_data.pid = audit_sig_pid; |
461 | audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO, | 461 | audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO, |
462 | 0, 0, &sig_data, sizeof(sig_data)); | 462 | 0, 0, &sig_data, sizeof(sig_data)); |
463 | break; | 463 | break; |
464 | default: | 464 | default: |
465 | err = -EINVAL; | 465 | err = -EINVAL; |
466 | break; | 466 | break; |
467 | } | 467 | } |
468 | 468 | ||
469 | return err < 0 ? err : 0; | 469 | return err < 0 ? err : 0; |
470 | } | 470 | } |
471 | 471 | ||
472 | /* Get message from skb (based on rtnetlink_rcv_skb). Each message is | 472 | /* Get message from skb (based on rtnetlink_rcv_skb). Each message is |
473 | * processed by audit_receive_msg. Malformed skbs with wrong length are | 473 | * processed by audit_receive_msg. Malformed skbs with wrong length are |
474 | * discarded silently. */ | 474 | * discarded silently. */ |
475 | static void audit_receive_skb(struct sk_buff *skb) | 475 | static void audit_receive_skb(struct sk_buff *skb) |
476 | { | 476 | { |
477 | int err; | 477 | int err; |
478 | struct nlmsghdr *nlh; | 478 | struct nlmsghdr *nlh; |
479 | u32 rlen; | 479 | u32 rlen; |
480 | 480 | ||
481 | while (skb->len >= NLMSG_SPACE(0)) { | 481 | while (skb->len >= NLMSG_SPACE(0)) { |
482 | nlh = (struct nlmsghdr *)skb->data; | 482 | nlh = (struct nlmsghdr *)skb->data; |
483 | if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) | 483 | if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) |
484 | return; | 484 | return; |
485 | rlen = NLMSG_ALIGN(nlh->nlmsg_len); | 485 | rlen = NLMSG_ALIGN(nlh->nlmsg_len); |
486 | if (rlen > skb->len) | 486 | if (rlen > skb->len) |
487 | rlen = skb->len; | 487 | rlen = skb->len; |
488 | if ((err = audit_receive_msg(skb, nlh))) { | 488 | if ((err = audit_receive_msg(skb, nlh))) { |
489 | netlink_ack(skb, nlh, err); | 489 | netlink_ack(skb, nlh, err); |
490 | } else if (nlh->nlmsg_flags & NLM_F_ACK) | 490 | } else if (nlh->nlmsg_flags & NLM_F_ACK) |
491 | netlink_ack(skb, nlh, 0); | 491 | netlink_ack(skb, nlh, 0); |
492 | skb_pull(skb, rlen); | 492 | skb_pull(skb, rlen); |
493 | } | 493 | } |
494 | } | 494 | } |
495 | 495 | ||
496 | /* Receive messages from netlink socket. */ | 496 | /* Receive messages from netlink socket. */ |
497 | static void audit_receive(struct sock *sk, int length) | 497 | static void audit_receive(struct sock *sk, int length) |
498 | { | 498 | { |
499 | struct sk_buff *skb; | 499 | struct sk_buff *skb; |
500 | unsigned int qlen; | 500 | unsigned int qlen; |
501 | 501 | ||
502 | down(&audit_netlink_sem); | 502 | down(&audit_netlink_sem); |
503 | 503 | ||
504 | for (qlen = skb_queue_len(&sk->sk_receive_queue); qlen; qlen--) { | 504 | for (qlen = skb_queue_len(&sk->sk_receive_queue); qlen; qlen--) { |
505 | skb = skb_dequeue(&sk->sk_receive_queue); | 505 | skb = skb_dequeue(&sk->sk_receive_queue); |
506 | audit_receive_skb(skb); | 506 | audit_receive_skb(skb); |
507 | kfree_skb(skb); | 507 | kfree_skb(skb); |
508 | } | 508 | } |
509 | up(&audit_netlink_sem); | 509 | up(&audit_netlink_sem); |
510 | } | 510 | } |
511 | 511 | ||
512 | 512 | ||
513 | /* Initialize audit support at boot time. */ | 513 | /* Initialize audit support at boot time. */ |
514 | static int __init audit_init(void) | 514 | static int __init audit_init(void) |
515 | { | 515 | { |
516 | printk(KERN_INFO "audit: initializing netlink socket (%s)\n", | 516 | printk(KERN_INFO "audit: initializing netlink socket (%s)\n", |
517 | audit_default ? "enabled" : "disabled"); | 517 | audit_default ? "enabled" : "disabled"); |
518 | audit_sock = netlink_kernel_create(NETLINK_AUDIT, 0, audit_receive, | 518 | audit_sock = netlink_kernel_create(NETLINK_AUDIT, 0, audit_receive, |
519 | THIS_MODULE); | 519 | THIS_MODULE); |
520 | if (!audit_sock) | 520 | if (!audit_sock) |
521 | audit_panic("cannot initialize netlink socket"); | 521 | audit_panic("cannot initialize netlink socket"); |
522 | 522 | ||
523 | audit_sock->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; | 523 | audit_sock->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; |
524 | skb_queue_head_init(&audit_skb_queue); | 524 | skb_queue_head_init(&audit_skb_queue); |
525 | audit_initialized = 1; | 525 | audit_initialized = 1; |
526 | audit_enabled = audit_default; | 526 | audit_enabled = audit_default; |
527 | audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized"); | 527 | audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized"); |
528 | return 0; | 528 | return 0; |
529 | } | 529 | } |
530 | __initcall(audit_init); | 530 | __initcall(audit_init); |
531 | 531 | ||
532 | /* Process kernel command-line parameter at boot time. audit=0 or audit=1. */ | 532 | /* Process kernel command-line parameter at boot time. audit=0 or audit=1. */ |
533 | static int __init audit_enable(char *str) | 533 | static int __init audit_enable(char *str) |
534 | { | 534 | { |
535 | audit_default = !!simple_strtol(str, NULL, 0); | 535 | audit_default = !!simple_strtol(str, NULL, 0); |
536 | printk(KERN_INFO "audit: %s%s\n", | 536 | printk(KERN_INFO "audit: %s%s\n", |
537 | audit_default ? "enabled" : "disabled", | 537 | audit_default ? "enabled" : "disabled", |
538 | audit_initialized ? "" : " (after initialization)"); | 538 | audit_initialized ? "" : " (after initialization)"); |
539 | if (audit_initialized) | 539 | if (audit_initialized) |
540 | audit_enabled = audit_default; | 540 | audit_enabled = audit_default; |
541 | return 0; | 541 | return 0; |
542 | } | 542 | } |
543 | 543 | ||
544 | __setup("audit=", audit_enable); | 544 | __setup("audit=", audit_enable); |
545 | 545 | ||
546 | static void audit_buffer_free(struct audit_buffer *ab) | 546 | static void audit_buffer_free(struct audit_buffer *ab) |
547 | { | 547 | { |
548 | unsigned long flags; | 548 | unsigned long flags; |
549 | 549 | ||
550 | if (!ab) | 550 | if (!ab) |
551 | return; | 551 | return; |
552 | 552 | ||
553 | if (ab->skb) | 553 | if (ab->skb) |
554 | kfree_skb(ab->skb); | 554 | kfree_skb(ab->skb); |
555 | 555 | ||
556 | spin_lock_irqsave(&audit_freelist_lock, flags); | 556 | spin_lock_irqsave(&audit_freelist_lock, flags); |
557 | if (++audit_freelist_count > AUDIT_MAXFREE) | 557 | if (++audit_freelist_count > AUDIT_MAXFREE) |
558 | kfree(ab); | 558 | kfree(ab); |
559 | else | 559 | else |
560 | list_add(&ab->list, &audit_freelist); | 560 | list_add(&ab->list, &audit_freelist); |
561 | spin_unlock_irqrestore(&audit_freelist_lock, flags); | 561 | spin_unlock_irqrestore(&audit_freelist_lock, flags); |
562 | } | 562 | } |
563 | 563 | ||
564 | static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx, | 564 | static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx, |
565 | gfp_t gfp_mask, int type) | 565 | gfp_t gfp_mask, int type) |
566 | { | 566 | { |
567 | unsigned long flags; | 567 | unsigned long flags; |
568 | struct audit_buffer *ab = NULL; | 568 | struct audit_buffer *ab = NULL; |
569 | struct nlmsghdr *nlh; | 569 | struct nlmsghdr *nlh; |
570 | 570 | ||
571 | spin_lock_irqsave(&audit_freelist_lock, flags); | 571 | spin_lock_irqsave(&audit_freelist_lock, flags); |
572 | if (!list_empty(&audit_freelist)) { | 572 | if (!list_empty(&audit_freelist)) { |
573 | ab = list_entry(audit_freelist.next, | 573 | ab = list_entry(audit_freelist.next, |
574 | struct audit_buffer, list); | 574 | struct audit_buffer, list); |
575 | list_del(&ab->list); | 575 | list_del(&ab->list); |
576 | --audit_freelist_count; | 576 | --audit_freelist_count; |
577 | } | 577 | } |
578 | spin_unlock_irqrestore(&audit_freelist_lock, flags); | 578 | spin_unlock_irqrestore(&audit_freelist_lock, flags); |
579 | 579 | ||
580 | if (!ab) { | 580 | if (!ab) { |
581 | ab = kmalloc(sizeof(*ab), gfp_mask); | 581 | ab = kmalloc(sizeof(*ab), gfp_mask); |
582 | if (!ab) | 582 | if (!ab) |
583 | goto err; | 583 | goto err; |
584 | } | 584 | } |
585 | 585 | ||
586 | ab->skb = alloc_skb(AUDIT_BUFSIZ, gfp_mask); | 586 | ab->skb = alloc_skb(AUDIT_BUFSIZ, gfp_mask); |
587 | if (!ab->skb) | 587 | if (!ab->skb) |
588 | goto err; | 588 | goto err; |
589 | 589 | ||
590 | ab->ctx = ctx; | 590 | ab->ctx = ctx; |
591 | ab->gfp_mask = gfp_mask; | 591 | ab->gfp_mask = gfp_mask; |
592 | nlh = (struct nlmsghdr *)skb_put(ab->skb, NLMSG_SPACE(0)); | 592 | nlh = (struct nlmsghdr *)skb_put(ab->skb, NLMSG_SPACE(0)); |
593 | nlh->nlmsg_type = type; | 593 | nlh->nlmsg_type = type; |
594 | nlh->nlmsg_flags = 0; | 594 | nlh->nlmsg_flags = 0; |
595 | nlh->nlmsg_pid = 0; | 595 | nlh->nlmsg_pid = 0; |
596 | nlh->nlmsg_seq = 0; | 596 | nlh->nlmsg_seq = 0; |
597 | return ab; | 597 | return ab; |
598 | err: | 598 | err: |
599 | audit_buffer_free(ab); | 599 | audit_buffer_free(ab); |
600 | return NULL; | 600 | return NULL; |
601 | } | 601 | } |
602 | 602 | ||
603 | /* Compute a serial number for the audit record. Audit records are | 603 | /* Compute a serial number for the audit record. Audit records are |
604 | * written to user-space as soon as they are generated, so a complete | 604 | * written to user-space as soon as they are generated, so a complete |
605 | * audit record may be written in several pieces. The timestamp of the | 605 | * audit record may be written in several pieces. The timestamp of the |
606 | * record and this serial number are used by the user-space tools to | 606 | * record and this serial number are used by the user-space tools to |
607 | * determine which pieces belong to the same audit record. The | 607 | * determine which pieces belong to the same audit record. The |
608 | * (timestamp,serial) tuple is unique for each syscall and is live from | 608 | * (timestamp,serial) tuple is unique for each syscall and is live from |
609 | * syscall entry to syscall exit. | 609 | * syscall entry to syscall exit. |
610 | * | 610 | * |
611 | * NOTE: Another possibility is to store the formatted records off the | 611 | * NOTE: Another possibility is to store the formatted records off the |
612 | * audit context (for those records that have a context), and emit them | 612 | * audit context (for those records that have a context), and emit them |
613 | * all at syscall exit. However, this could delay the reporting of | 613 | * all at syscall exit. However, this could delay the reporting of |
614 | * significant errors until syscall exit (or never, if the system | 614 | * significant errors until syscall exit (or never, if the system |
615 | * halts). */ | 615 | * halts). */ |
616 | 616 | ||
617 | unsigned int audit_serial(void) | 617 | unsigned int audit_serial(void) |
618 | { | 618 | { |
619 | static spinlock_t serial_lock = SPIN_LOCK_UNLOCKED; | 619 | static spinlock_t serial_lock = SPIN_LOCK_UNLOCKED; |
620 | static unsigned int serial = 0; | 620 | static unsigned int serial = 0; |
621 | 621 | ||
622 | unsigned long flags; | 622 | unsigned long flags; |
623 | unsigned int ret; | 623 | unsigned int ret; |
624 | 624 | ||
625 | spin_lock_irqsave(&serial_lock, flags); | 625 | spin_lock_irqsave(&serial_lock, flags); |
626 | do { | 626 | do { |
627 | ret = ++serial; | 627 | ret = ++serial; |
628 | } while (unlikely(!ret)); | 628 | } while (unlikely(!ret)); |
629 | spin_unlock_irqrestore(&serial_lock, flags); | 629 | spin_unlock_irqrestore(&serial_lock, flags); |
630 | 630 | ||
631 | return ret; | 631 | return ret; |
632 | } | 632 | } |
633 | 633 | ||
634 | static inline void audit_get_stamp(struct audit_context *ctx, | 634 | static inline void audit_get_stamp(struct audit_context *ctx, |
635 | struct timespec *t, unsigned int *serial) | 635 | struct timespec *t, unsigned int *serial) |
636 | { | 636 | { |
637 | if (ctx) | 637 | if (ctx) |
638 | auditsc_get_stamp(ctx, t, serial); | 638 | auditsc_get_stamp(ctx, t, serial); |
639 | else { | 639 | else { |
640 | *t = CURRENT_TIME; | 640 | *t = CURRENT_TIME; |
641 | *serial = audit_serial(); | 641 | *serial = audit_serial(); |
642 | } | 642 | } |
643 | } | 643 | } |
644 | 644 | ||
645 | /* Obtain an audit buffer. This routine does locking to obtain the | 645 | /* Obtain an audit buffer. This routine does locking to obtain the |
646 | * audit buffer, but then no locking is required for calls to | 646 | * audit buffer, but then no locking is required for calls to |
647 | * audit_log_*format. If the tsk is a task that is currently in a | 647 | * audit_log_*format. If the tsk is a task that is currently in a |
648 | * syscall, then the syscall is marked as auditable and an audit record | 648 | * syscall, then the syscall is marked as auditable and an audit record |
649 | * will be written at syscall exit. If there is no associated task, tsk | 649 | * will be written at syscall exit. If there is no associated task, tsk |
650 | * should be NULL. */ | 650 | * should be NULL. */ |
651 | 651 | ||
652 | struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, | 652 | struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, |
653 | int type) | 653 | int type) |
654 | { | 654 | { |
655 | struct audit_buffer *ab = NULL; | 655 | struct audit_buffer *ab = NULL; |
656 | struct timespec t; | 656 | struct timespec t; |
657 | unsigned int serial; | 657 | unsigned int serial; |
658 | int reserve; | 658 | int reserve; |
659 | unsigned long timeout_start = jiffies; | 659 | unsigned long timeout_start = jiffies; |
660 | 660 | ||
661 | if (!audit_initialized) | 661 | if (!audit_initialized) |
662 | return NULL; | 662 | return NULL; |
663 | 663 | ||
664 | if (gfp_mask & __GFP_WAIT) | 664 | if (gfp_mask & __GFP_WAIT) |
665 | reserve = 0; | 665 | reserve = 0; |
666 | else | 666 | else |
667 | reserve = 5; /* Allow atomic callers to go up to five | 667 | reserve = 5; /* Allow atomic callers to go up to five |
668 | entries over the normal backlog limit */ | 668 | entries over the normal backlog limit */ |
669 | 669 | ||
670 | while (audit_backlog_limit | 670 | while (audit_backlog_limit |
671 | && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) { | 671 | && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) { |
672 | if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time | 672 | if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time |
673 | && time_before(jiffies, timeout_start + audit_backlog_wait_time)) { | 673 | && time_before(jiffies, timeout_start + audit_backlog_wait_time)) { |
674 | 674 | ||
675 | /* Wait for auditd to drain the queue a little */ | 675 | /* Wait for auditd to drain the queue a little */ |
676 | DECLARE_WAITQUEUE(wait, current); | 676 | DECLARE_WAITQUEUE(wait, current); |
677 | set_current_state(TASK_INTERRUPTIBLE); | 677 | set_current_state(TASK_INTERRUPTIBLE); |
678 | add_wait_queue(&audit_backlog_wait, &wait); | 678 | add_wait_queue(&audit_backlog_wait, &wait); |
679 | 679 | ||
680 | if (audit_backlog_limit && | 680 | if (audit_backlog_limit && |
681 | skb_queue_len(&audit_skb_queue) > audit_backlog_limit) | 681 | skb_queue_len(&audit_skb_queue) > audit_backlog_limit) |
682 | schedule_timeout(timeout_start + audit_backlog_wait_time - jiffies); | 682 | schedule_timeout(timeout_start + audit_backlog_wait_time - jiffies); |
683 | 683 | ||
684 | __set_current_state(TASK_RUNNING); | 684 | __set_current_state(TASK_RUNNING); |
685 | remove_wait_queue(&audit_backlog_wait, &wait); | 685 | remove_wait_queue(&audit_backlog_wait, &wait); |
686 | continue; | 686 | continue; |
687 | } | 687 | } |
688 | if (audit_rate_check()) | 688 | if (audit_rate_check()) |
689 | printk(KERN_WARNING | 689 | printk(KERN_WARNING |
690 | "audit: audit_backlog=%d > " | 690 | "audit: audit_backlog=%d > " |
691 | "audit_backlog_limit=%d\n", | 691 | "audit_backlog_limit=%d\n", |
692 | skb_queue_len(&audit_skb_queue), | 692 | skb_queue_len(&audit_skb_queue), |
693 | audit_backlog_limit); | 693 | audit_backlog_limit); |
694 | audit_log_lost("backlog limit exceeded"); | 694 | audit_log_lost("backlog limit exceeded"); |
695 | audit_backlog_wait_time = audit_backlog_wait_overflow; | 695 | audit_backlog_wait_time = audit_backlog_wait_overflow; |
696 | wake_up(&audit_backlog_wait); | 696 | wake_up(&audit_backlog_wait); |
697 | return NULL; | 697 | return NULL; |
698 | } | 698 | } |
699 | 699 | ||
700 | ab = audit_buffer_alloc(ctx, gfp_mask, type); | 700 | ab = audit_buffer_alloc(ctx, gfp_mask, type); |
701 | if (!ab) { | 701 | if (!ab) { |
702 | audit_log_lost("out of memory in audit_log_start"); | 702 | audit_log_lost("out of memory in audit_log_start"); |
703 | return NULL; | 703 | return NULL; |
704 | } | 704 | } |
705 | 705 | ||
706 | audit_get_stamp(ab->ctx, &t, &serial); | 706 | audit_get_stamp(ab->ctx, &t, &serial); |
707 | 707 | ||
708 | audit_log_format(ab, "audit(%lu.%03lu:%u): ", | 708 | audit_log_format(ab, "audit(%lu.%03lu:%u): ", |
709 | t.tv_sec, t.tv_nsec/1000000, serial); | 709 | t.tv_sec, t.tv_nsec/1000000, serial); |
710 | return ab; | 710 | return ab; |
711 | } | 711 | } |
712 | 712 | ||
713 | /** | 713 | /** |
714 | * audit_expand - expand skb in the audit buffer | 714 | * audit_expand - expand skb in the audit buffer |
715 | * @ab: audit_buffer | 715 | * @ab: audit_buffer |
716 | * | 716 | * |
717 | * Returns 0 (no space) on failed expansion, or available space if | 717 | * Returns 0 (no space) on failed expansion, or available space if |
718 | * successful. | 718 | * successful. |
719 | */ | 719 | */ |
720 | static inline int audit_expand(struct audit_buffer *ab, int extra) | 720 | static inline int audit_expand(struct audit_buffer *ab, int extra) |
721 | { | 721 | { |
722 | struct sk_buff *skb = ab->skb; | 722 | struct sk_buff *skb = ab->skb; |
723 | int ret = pskb_expand_head(skb, skb_headroom(skb), extra, | 723 | int ret = pskb_expand_head(skb, skb_headroom(skb), extra, |
724 | ab->gfp_mask); | 724 | ab->gfp_mask); |
725 | if (ret < 0) { | 725 | if (ret < 0) { |
726 | audit_log_lost("out of memory in audit_expand"); | 726 | audit_log_lost("out of memory in audit_expand"); |
727 | return 0; | 727 | return 0; |
728 | } | 728 | } |
729 | return skb_tailroom(skb); | 729 | return skb_tailroom(skb); |
730 | } | 730 | } |
731 | 731 | ||
732 | /* Format an audit message into the audit buffer. If there isn't enough | 732 | /* Format an audit message into the audit buffer. If there isn't enough |
733 | * room in the audit buffer, more room will be allocated and vsnprint | 733 | * room in the audit buffer, more room will be allocated and vsnprint |
734 | * will be called a second time. Currently, we assume that a printk | 734 | * will be called a second time. Currently, we assume that a printk |
735 | * can't format message larger than 1024 bytes, so we don't either. */ | 735 | * can't format message larger than 1024 bytes, so we don't either. */ |
736 | static void audit_log_vformat(struct audit_buffer *ab, const char *fmt, | 736 | static void audit_log_vformat(struct audit_buffer *ab, const char *fmt, |
737 | va_list args) | 737 | va_list args) |
738 | { | 738 | { |
739 | int len, avail; | 739 | int len, avail; |
740 | struct sk_buff *skb; | 740 | struct sk_buff *skb; |
741 | va_list args2; | 741 | va_list args2; |
742 | 742 | ||
743 | if (!ab) | 743 | if (!ab) |
744 | return; | 744 | return; |
745 | 745 | ||
746 | BUG_ON(!ab->skb); | 746 | BUG_ON(!ab->skb); |
747 | skb = ab->skb; | 747 | skb = ab->skb; |
748 | avail = skb_tailroom(skb); | 748 | avail = skb_tailroom(skb); |
749 | if (avail == 0) { | 749 | if (avail == 0) { |
750 | avail = audit_expand(ab, AUDIT_BUFSIZ); | 750 | avail = audit_expand(ab, AUDIT_BUFSIZ); |
751 | if (!avail) | 751 | if (!avail) |
752 | goto out; | 752 | goto out; |
753 | } | 753 | } |
754 | va_copy(args2, args); | 754 | va_copy(args2, args); |
755 | len = vsnprintf(skb->tail, avail, fmt, args); | 755 | len = vsnprintf(skb->tail, avail, fmt, args); |
756 | if (len >= avail) { | 756 | if (len >= avail) { |
757 | /* The printk buffer is 1024 bytes long, so if we get | 757 | /* The printk buffer is 1024 bytes long, so if we get |
758 | * here and AUDIT_BUFSIZ is at least 1024, then we can | 758 | * here and AUDIT_BUFSIZ is at least 1024, then we can |
759 | * log everything that printk could have logged. */ | 759 | * log everything that printk could have logged. */ |
760 | avail = audit_expand(ab, max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail)); | 760 | avail = audit_expand(ab, max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail)); |
761 | if (!avail) | 761 | if (!avail) |
762 | goto out; | 762 | goto out; |
763 | len = vsnprintf(skb->tail, avail, fmt, args2); | 763 | len = vsnprintf(skb->tail, avail, fmt, args2); |
764 | } | 764 | } |
765 | if (len > 0) | 765 | if (len > 0) |
766 | skb_put(skb, len); | 766 | skb_put(skb, len); |
767 | out: | 767 | out: |
768 | return; | 768 | return; |
769 | } | 769 | } |
770 | 770 | ||
771 | /* Format a message into the audit buffer. All the work is done in | 771 | /* Format a message into the audit buffer. All the work is done in |
772 | * audit_log_vformat. */ | 772 | * audit_log_vformat. */ |
773 | void audit_log_format(struct audit_buffer *ab, const char *fmt, ...) | 773 | void audit_log_format(struct audit_buffer *ab, const char *fmt, ...) |
774 | { | 774 | { |
775 | va_list args; | 775 | va_list args; |
776 | 776 | ||
777 | if (!ab) | 777 | if (!ab) |
778 | return; | 778 | return; |
779 | va_start(args, fmt); | 779 | va_start(args, fmt); |
780 | audit_log_vformat(ab, fmt, args); | 780 | audit_log_vformat(ab, fmt, args); |
781 | va_end(args); | 781 | va_end(args); |
782 | } | 782 | } |
783 | 783 | ||
784 | /* This function will take the passed buf and convert it into a string of | 784 | /* This function will take the passed buf and convert it into a string of |
785 | * ascii hex digits. The new string is placed onto the skb. */ | 785 | * ascii hex digits. The new string is placed onto the skb. */ |
786 | void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf, | 786 | void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf, |
787 | size_t len) | 787 | size_t len) |
788 | { | 788 | { |
789 | int i, avail, new_len; | 789 | int i, avail, new_len; |
790 | unsigned char *ptr; | 790 | unsigned char *ptr; |
791 | struct sk_buff *skb; | 791 | struct sk_buff *skb; |
792 | static const unsigned char *hex = "0123456789ABCDEF"; | 792 | static const unsigned char *hex = "0123456789ABCDEF"; |
793 | 793 | ||
794 | BUG_ON(!ab->skb); | 794 | BUG_ON(!ab->skb); |
795 | skb = ab->skb; | 795 | skb = ab->skb; |
796 | avail = skb_tailroom(skb); | 796 | avail = skb_tailroom(skb); |
797 | new_len = len<<1; | 797 | new_len = len<<1; |
798 | if (new_len >= avail) { | 798 | if (new_len >= avail) { |
799 | /* Round the buffer request up to the next multiple */ | 799 | /* Round the buffer request up to the next multiple */ |
800 | new_len = AUDIT_BUFSIZ*(((new_len-avail)/AUDIT_BUFSIZ) + 1); | 800 | new_len = AUDIT_BUFSIZ*(((new_len-avail)/AUDIT_BUFSIZ) + 1); |
801 | avail = audit_expand(ab, new_len); | 801 | avail = audit_expand(ab, new_len); |
802 | if (!avail) | 802 | if (!avail) |
803 | return; | 803 | return; |
804 | } | 804 | } |
805 | 805 | ||
806 | ptr = skb->tail; | 806 | ptr = skb->tail; |
807 | for (i=0; i<len; i++) { | 807 | for (i=0; i<len; i++) { |
808 | *ptr++ = hex[(buf[i] & 0xF0)>>4]; /* Upper nibble */ | 808 | *ptr++ = hex[(buf[i] & 0xF0)>>4]; /* Upper nibble */ |
809 | *ptr++ = hex[buf[i] & 0x0F]; /* Lower nibble */ | 809 | *ptr++ = hex[buf[i] & 0x0F]; /* Lower nibble */ |
810 | } | 810 | } |
811 | *ptr = 0; | 811 | *ptr = 0; |
812 | skb_put(skb, len << 1); /* new string is twice the old string */ | 812 | skb_put(skb, len << 1); /* new string is twice the old string */ |
813 | } | 813 | } |
814 | 814 | ||
815 | /* This code will escape a string that is passed to it if the string | 815 | /* This code will escape a string that is passed to it if the string |
816 | * contains a control character, unprintable character, double quote mark, | 816 | * contains a control character, unprintable character, double quote mark, |
817 | * or a space. Unescaped strings will start and end with a double quote mark. | 817 | * or a space. Unescaped strings will start and end with a double quote mark. |
818 | * Strings that are escaped are printed in hex (2 digits per char). */ | 818 | * Strings that are escaped are printed in hex (2 digits per char). */ |
819 | void audit_log_untrustedstring(struct audit_buffer *ab, const char *string) | 819 | void audit_log_untrustedstring(struct audit_buffer *ab, const char *string) |
820 | { | 820 | { |
821 | const unsigned char *p = string; | 821 | const unsigned char *p = string; |
822 | 822 | ||
823 | while (*p) { | 823 | while (*p) { |
824 | if (*p == '"' || *p < 0x21 || *p > 0x7f) { | 824 | if (*p == '"' || *p < 0x21 || *p > 0x7f) { |
825 | audit_log_hex(ab, string, strlen(string)); | 825 | audit_log_hex(ab, string, strlen(string)); |
826 | return; | 826 | return; |
827 | } | 827 | } |
828 | p++; | 828 | p++; |
829 | } | 829 | } |
830 | audit_log_format(ab, "\"%s\"", string); | 830 | audit_log_format(ab, "\"%s\"", string); |
831 | } | 831 | } |
832 | 832 | ||
833 | /* This is a helper-function to print the escaped d_path */ | 833 | /* This is a helper-function to print the escaped d_path */ |
834 | void audit_log_d_path(struct audit_buffer *ab, const char *prefix, | 834 | void audit_log_d_path(struct audit_buffer *ab, const char *prefix, |
835 | struct dentry *dentry, struct vfsmount *vfsmnt) | 835 | struct dentry *dentry, struct vfsmount *vfsmnt) |
836 | { | 836 | { |
837 | char *p, *path; | 837 | char *p, *path; |
838 | 838 | ||
839 | if (prefix) | 839 | if (prefix) |
840 | audit_log_format(ab, " %s", prefix); | 840 | audit_log_format(ab, " %s", prefix); |
841 | 841 | ||
842 | /* We will allow 11 spaces for ' (deleted)' to be appended */ | 842 | /* We will allow 11 spaces for ' (deleted)' to be appended */ |
843 | path = kmalloc(PATH_MAX+11, ab->gfp_mask); | 843 | path = kmalloc(PATH_MAX+11, ab->gfp_mask); |
844 | if (!path) { | 844 | if (!path) { |
845 | audit_log_format(ab, "<no memory>"); | 845 | audit_log_format(ab, "<no memory>"); |
846 | return; | 846 | return; |
847 | } | 847 | } |
848 | p = d_path(dentry, vfsmnt, path, PATH_MAX+11); | 848 | p = d_path(dentry, vfsmnt, path, PATH_MAX+11); |
849 | if (IS_ERR(p)) { /* Should never happen since we send PATH_MAX */ | 849 | if (IS_ERR(p)) { /* Should never happen since we send PATH_MAX */ |
850 | /* FIXME: can we save some information here? */ | 850 | /* FIXME: can we save some information here? */ |
851 | audit_log_format(ab, "<too long>"); | 851 | audit_log_format(ab, "<too long>"); |
852 | } else | 852 | } else |
853 | audit_log_untrustedstring(ab, p); | 853 | audit_log_untrustedstring(ab, p); |
854 | kfree(path); | 854 | kfree(path); |
855 | } | 855 | } |
856 | 856 | ||
857 | /* The netlink_* functions cannot be called inside an irq context, so | 857 | /* The netlink_* functions cannot be called inside an irq context, so |
858 | * the audit buffer is placed on a queue and the kauditd thread is woken | 858 | * the audit buffer is placed on a queue and the kauditd thread is woken |
859 | * to remove it from the queue outside the irq context. May be called in | 859 | * to remove it from the queue outside the irq context. May be called in |
860 | * any context. */ | 860 | * any context. */ |
861 | void audit_log_end(struct audit_buffer *ab) | 861 | void audit_log_end(struct audit_buffer *ab) |
862 | { | 862 | { |
863 | if (!ab) | 863 | if (!ab) |
864 | return; | 864 | return; |
865 | if (!audit_rate_check()) { | 865 | if (!audit_rate_check()) { |
866 | audit_log_lost("rate limit exceeded"); | 866 | audit_log_lost("rate limit exceeded"); |
867 | } else { | 867 | } else { |
868 | if (audit_pid) { | 868 | if (audit_pid) { |
869 | struct nlmsghdr *nlh = (struct nlmsghdr *)ab->skb->data; | 869 | struct nlmsghdr *nlh = (struct nlmsghdr *)ab->skb->data; |
870 | nlh->nlmsg_len = ab->skb->len - NLMSG_SPACE(0); | 870 | nlh->nlmsg_len = ab->skb->len - NLMSG_SPACE(0); |
871 | skb_queue_tail(&audit_skb_queue, ab->skb); | 871 | skb_queue_tail(&audit_skb_queue, ab->skb); |
872 | ab->skb = NULL; | 872 | ab->skb = NULL; |
873 | wake_up_interruptible(&kauditd_wait); | 873 | wake_up_interruptible(&kauditd_wait); |
874 | } else { | 874 | } else { |
875 | printk(KERN_NOTICE "%s\n", ab->skb->data + NLMSG_SPACE(0)); | 875 | printk(KERN_NOTICE "%s\n", ab->skb->data + NLMSG_SPACE(0)); |
876 | } | 876 | } |
877 | } | 877 | } |
878 | audit_buffer_free(ab); | 878 | audit_buffer_free(ab); |
879 | } | 879 | } |
880 | 880 | ||
881 | /* Log an audit record. This is a convenience function that calls | 881 | /* Log an audit record. This is a convenience function that calls |
882 | * audit_log_start, audit_log_vformat, and audit_log_end. It may be | 882 | * audit_log_start, audit_log_vformat, and audit_log_end. It may be |
883 | * called in any context. */ | 883 | * called in any context. */ |
884 | void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, | 884 | void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, |
885 | const char *fmt, ...) | 885 | const char *fmt, ...) |
886 | { | 886 | { |
887 | struct audit_buffer *ab; | 887 | struct audit_buffer *ab; |
888 | va_list args; | 888 | va_list args; |
889 | 889 | ||
890 | ab = audit_log_start(ctx, gfp_mask, type); | 890 | ab = audit_log_start(ctx, gfp_mask, type); |
891 | if (ab) { | 891 | if (ab) { |
892 | va_start(args, fmt); | 892 | va_start(args, fmt); |
893 | audit_log_vformat(ab, fmt, args); | 893 | audit_log_vformat(ab, fmt, args); |
894 | va_end(args); | 894 | va_end(args); |
895 | audit_log_end(ab); | 895 | audit_log_end(ab); |
896 | } | 896 | } |
897 | } | 897 | } |
898 | 898 |
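Note on the audit helpers above: callers normally pair audit_log_start() with one or more audit_log_format()/audit_log_untrustedstring() calls and a final audit_log_end(), or use the all-in-one audit_log(). The following stand-alone sketch is illustrative only and is not part of this commit; the AUDIT_KERNEL message type and the field names are assumptions, and <linux/audit.h> is presumed to be included.

/* Hypothetical caller sketch (not part of this commit). */
static void example_audit_event(struct audit_context *ctx, const char *path)
{
	struct audit_buffer *ab;

	ab = audit_log_start(ctx, GFP_KERNEL, AUDIT_KERNEL);
	if (!ab)				/* auditing disabled or out of memory */
		return;
	audit_log_format(ab, "op=example res=%d", 1);
	audit_log_format(ab, " path=");
	audit_log_untrustedstring(ab, path);	/* hex-escapes untrusted bytes */
	audit_log_end(ab);			/* queues the skb for kauditd, or printk()s it */
}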
kernel/irq/proc.c
1 | /* | 1 | /* |
2 | * linux/kernel/irq/proc.c | 2 | * linux/kernel/irq/proc.c |
3 | * | 3 | * |
4 | * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar | 4 | * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar |
5 | * | 5 | * |
6 | * This file contains the /proc/irq/ handling code. | 6 | * This file contains the /proc/irq/ handling code. |
7 | */ | 7 | */ |
8 | 8 | ||
9 | #include <linux/irq.h> | 9 | #include <linux/irq.h> |
10 | #include <linux/proc_fs.h> | 10 | #include <linux/proc_fs.h> |
11 | #include <linux/interrupt.h> | 11 | #include <linux/interrupt.h> |
12 | 12 | ||
13 | #include "internals.h" | ||
14 | |||
13 | static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS]; | 15 | static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS]; |
14 | 16 | ||
15 | #ifdef CONFIG_SMP | 17 | #ifdef CONFIG_SMP |
16 | 18 | ||
17 | /* | 19 | /* |
18 | * The /proc/irq/<irq>/smp_affinity values: | 20 | * The /proc/irq/<irq>/smp_affinity values: |
19 | */ | 21 | */ |
20 | static struct proc_dir_entry *smp_affinity_entry[NR_IRQS]; | 22 | static struct proc_dir_entry *smp_affinity_entry[NR_IRQS]; |
21 | 23 | ||
22 | #ifdef CONFIG_GENERIC_PENDING_IRQ | 24 | #ifdef CONFIG_GENERIC_PENDING_IRQ |
23 | void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) | 25 | void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) |
24 | { | 26 | { |
25 | /* | 27 | /* |
26 | * Save these away for later use. Re-program when the | 28 | * Save these away for later use. Re-program when the |
27 | * interrupt is pending | 29 | * interrupt is pending |
28 | */ | 30 | */ |
29 | set_pending_irq(irq, mask_val); | 31 | set_pending_irq(irq, mask_val); |
30 | } | 32 | } |
31 | #else | 33 | #else |
32 | void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) | 34 | void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) |
33 | { | 35 | { |
34 | irq_affinity[irq] = mask_val; | 36 | irq_affinity[irq] = mask_val; |
35 | irq_desc[irq].handler->set_affinity(irq, mask_val); | 37 | irq_desc[irq].handler->set_affinity(irq, mask_val); |
36 | } | 38 | } |
37 | #endif | 39 | #endif |
38 | 40 | ||
39 | static int irq_affinity_read_proc(char *page, char **start, off_t off, | 41 | static int irq_affinity_read_proc(char *page, char **start, off_t off, |
40 | int count, int *eof, void *data) | 42 | int count, int *eof, void *data) |
41 | { | 43 | { |
42 | int len = cpumask_scnprintf(page, count, irq_affinity[(long)data]); | 44 | int len = cpumask_scnprintf(page, count, irq_affinity[(long)data]); |
43 | 45 | ||
44 | if (count - len < 2) | 46 | if (count - len < 2) |
45 | return -EINVAL; | 47 | return -EINVAL; |
46 | len += sprintf(page + len, "\n"); | 48 | len += sprintf(page + len, "\n"); |
47 | return len; | 49 | return len; |
48 | } | 50 | } |
49 | 51 | ||
50 | int no_irq_affinity; | 52 | int no_irq_affinity; |
51 | static int irq_affinity_write_proc(struct file *file, const char __user *buffer, | 53 | static int irq_affinity_write_proc(struct file *file, const char __user *buffer, |
52 | unsigned long count, void *data) | 54 | unsigned long count, void *data) |
53 | { | 55 | { |
54 | unsigned int irq = (int)(long)data, full_count = count, err; | 56 | unsigned int irq = (int)(long)data, full_count = count, err; |
55 | cpumask_t new_value, tmp; | 57 | cpumask_t new_value, tmp; |
56 | 58 | ||
57 | if (!irq_desc[irq].handler->set_affinity || no_irq_affinity) | 59 | if (!irq_desc[irq].handler->set_affinity || no_irq_affinity) |
58 | return -EIO; | 60 | return -EIO; |
59 | 61 | ||
60 | err = cpumask_parse(buffer, count, new_value); | 62 | err = cpumask_parse(buffer, count, new_value); |
61 | if (err) | 63 | if (err) |
62 | return err; | 64 | return err; |
63 | 65 | ||
64 | /* | 66 | /* |
65 | * Do not allow disabling IRQs completely - it's too easy a | 67 | * Do not allow disabling IRQs completely - it's too easy a |
66 | * way to make the system unusable accidentally :-) At least | 68 | * way to make the system unusable accidentally :-) At least |
67 | * one online CPU still has to be targeted. | 69 | * one online CPU still has to be targeted. |
68 | */ | 70 | */ |
69 | cpus_and(tmp, new_value, cpu_online_map); | 71 | cpus_and(tmp, new_value, cpu_online_map); |
70 | if (cpus_empty(tmp)) | 72 | if (cpus_empty(tmp)) |
71 | /* Special case for empty set - allow the architecture | 73 | /* Special case for empty set - allow the architecture |
72 | code to set default SMP affinity. */ | 74 | code to set default SMP affinity. */ |
73 | return select_smp_affinity(irq) ? -EINVAL : full_count; | 75 | return select_smp_affinity(irq) ? -EINVAL : full_count; |
74 | 76 | ||
75 | proc_set_irq_affinity(irq, new_value); | 77 | proc_set_irq_affinity(irq, new_value); |
76 | 78 | ||
77 | return full_count; | 79 | return full_count; |
78 | } | 80 | } |
79 | 81 | ||
80 | #endif | 82 | #endif |
81 | 83 | ||
82 | #define MAX_NAMELEN 128 | 84 | #define MAX_NAMELEN 128 |
83 | 85 | ||
84 | static int name_unique(unsigned int irq, struct irqaction *new_action) | 86 | static int name_unique(unsigned int irq, struct irqaction *new_action) |
85 | { | 87 | { |
86 | struct irq_desc *desc = irq_desc + irq; | 88 | struct irq_desc *desc = irq_desc + irq; |
87 | struct irqaction *action; | 89 | struct irqaction *action; |
88 | 90 | ||
89 | for (action = desc->action ; action; action = action->next) | 91 | for (action = desc->action ; action; action = action->next) |
90 | if ((action != new_action) && action->name && | 92 | if ((action != new_action) && action->name && |
91 | !strcmp(new_action->name, action->name)) | 93 | !strcmp(new_action->name, action->name)) |
92 | return 0; | 94 | return 0; |
93 | return 1; | 95 | return 1; |
94 | } | 96 | } |
95 | 97 | ||
96 | void register_handler_proc(unsigned int irq, struct irqaction *action) | 98 | void register_handler_proc(unsigned int irq, struct irqaction *action) |
97 | { | 99 | { |
98 | char name [MAX_NAMELEN]; | 100 | char name [MAX_NAMELEN]; |
99 | 101 | ||
100 | if (!irq_dir[irq] || action->dir || !action->name || | 102 | if (!irq_dir[irq] || action->dir || !action->name || |
101 | !name_unique(irq, action)) | 103 | !name_unique(irq, action)) |
102 | return; | 104 | return; |
103 | 105 | ||
104 | memset(name, 0, MAX_NAMELEN); | 106 | memset(name, 0, MAX_NAMELEN); |
105 | snprintf(name, MAX_NAMELEN, "%s", action->name); | 107 | snprintf(name, MAX_NAMELEN, "%s", action->name); |
106 | 108 | ||
107 | /* create /proc/irq/1234/handler/ */ | 109 | /* create /proc/irq/1234/handler/ */ |
108 | action->dir = proc_mkdir(name, irq_dir[irq]); | 110 | action->dir = proc_mkdir(name, irq_dir[irq]); |
109 | } | 111 | } |
110 | 112 | ||
111 | #undef MAX_NAMELEN | 113 | #undef MAX_NAMELEN |
112 | 114 | ||
113 | #define MAX_NAMELEN 10 | 115 | #define MAX_NAMELEN 10 |
114 | 116 | ||
115 | void register_irq_proc(unsigned int irq) | 117 | void register_irq_proc(unsigned int irq) |
116 | { | 118 | { |
117 | char name [MAX_NAMELEN]; | 119 | char name [MAX_NAMELEN]; |
118 | 120 | ||
119 | if (!root_irq_dir || | 121 | if (!root_irq_dir || |
120 | (irq_desc[irq].handler == &no_irq_type) || | 122 | (irq_desc[irq].handler == &no_irq_type) || |
121 | irq_dir[irq]) | 123 | irq_dir[irq]) |
122 | return; | 124 | return; |
123 | 125 | ||
124 | memset(name, 0, MAX_NAMELEN); | 126 | memset(name, 0, MAX_NAMELEN); |
125 | sprintf(name, "%d", irq); | 127 | sprintf(name, "%d", irq); |
126 | 128 | ||
127 | /* create /proc/irq/1234 */ | 129 | /* create /proc/irq/1234 */ |
128 | irq_dir[irq] = proc_mkdir(name, root_irq_dir); | 130 | irq_dir[irq] = proc_mkdir(name, root_irq_dir); |
129 | 131 | ||
130 | #ifdef CONFIG_SMP | 132 | #ifdef CONFIG_SMP |
131 | { | 133 | { |
132 | struct proc_dir_entry *entry; | 134 | struct proc_dir_entry *entry; |
133 | 135 | ||
134 | /* create /proc/irq/<irq>/smp_affinity */ | 136 | /* create /proc/irq/<irq>/smp_affinity */ |
135 | entry = create_proc_entry("smp_affinity", 0600, irq_dir[irq]); | 137 | entry = create_proc_entry("smp_affinity", 0600, irq_dir[irq]); |
136 | 138 | ||
137 | if (entry) { | 139 | if (entry) { |
138 | entry->nlink = 1; | 140 | entry->nlink = 1; |
139 | entry->data = (void *)(long)irq; | 141 | entry->data = (void *)(long)irq; |
140 | entry->read_proc = irq_affinity_read_proc; | 142 | entry->read_proc = irq_affinity_read_proc; |
141 | entry->write_proc = irq_affinity_write_proc; | 143 | entry->write_proc = irq_affinity_write_proc; |
142 | } | 144 | } |
143 | smp_affinity_entry[irq] = entry; | 145 | smp_affinity_entry[irq] = entry; |
144 | } | 146 | } |
145 | #endif | 147 | #endif |
146 | } | 148 | } |
147 | 149 | ||
148 | #undef MAX_NAMELEN | 150 | #undef MAX_NAMELEN |
149 | 151 | ||
150 | void unregister_handler_proc(unsigned int irq, struct irqaction *action) | 152 | void unregister_handler_proc(unsigned int irq, struct irqaction *action) |
151 | { | 153 | { |
152 | if (action->dir) | 154 | if (action->dir) |
153 | remove_proc_entry(action->dir->name, irq_dir[irq]); | 155 | remove_proc_entry(action->dir->name, irq_dir[irq]); |
154 | } | 156 | } |
155 | 157 | ||
156 | void init_irq_proc(void) | 158 | void init_irq_proc(void) |
157 | { | 159 | { |
158 | int i; | 160 | int i; |
159 | 161 | ||
160 | /* create /proc/irq */ | 162 | /* create /proc/irq */ |
161 | root_irq_dir = proc_mkdir("irq", NULL); | 163 | root_irq_dir = proc_mkdir("irq", NULL); |
162 | if (!root_irq_dir) | 164 | if (!root_irq_dir) |
163 | return; | 165 | return; |
164 | 166 | ||
165 | /* | 167 | /* |
166 | * Create entries for all existing IRQs. | 168 | * Create entries for all existing IRQs. |
167 | */ | 169 | */ |
168 | for (i = 0; i < NR_IRQS; i++) | 170 | for (i = 0; i < NR_IRQS; i++) |
169 | register_irq_proc(i); | 171 | register_irq_proc(i); |
170 | } | 172 | } |
171 | 173 | ||
172 | 174 |
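Note on the smp_affinity handlers above: irq_affinity_read_proc() prints the current cpumask, and irq_affinity_write_proc() parses a hex mask with cpumask_parse(), rejecting masks that leave no online CPU targeted. A minimal user-space sketch of driving such a file follows; the IRQ number and mask value are assumptions, not taken from this patch.

/* Hypothetical user-space sketch (not part of this commit). */
#include <stdio.h>

int main(void)
{
	/* Pin IRQ 19 to CPU 0; the write is handled by irq_affinity_write_proc(). */
	FILE *f = fopen("/proc/irq/19/smp_affinity", "w");

	if (!f)
		return 1;
	fprintf(f, "1\n");
	return fclose(f) ? 1 : 0;
}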
kernel/rcutorture.c
1 | /* | 1 | /* |
2 | * Read-Copy Update /proc-based torture test facility | 2 | * Read-Copy Update /proc-based torture test facility |
3 | * | 3 | * |
4 | * This program is free software; you can redistribute it and/or modify | 4 | * This program is free software; you can redistribute it and/or modify |
5 | * it under the terms of the GNU General Public License as published by | 5 | * it under the terms of the GNU General Public License as published by |
6 | * the Free Software Foundation; either version 2 of the License, or | 6 | * the Free Software Foundation; either version 2 of the License, or |
7 | * (at your option) any later version. | 7 | * (at your option) any later version. |
8 | * | 8 | * |
9 | * This program is distributed in the hope that it will be useful, | 9 | * This program is distributed in the hope that it will be useful, |
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
12 | * GNU General Public License for more details. | 12 | * GNU General Public License for more details. |
13 | * | 13 | * |
14 | * You should have received a copy of the GNU General Public License | 14 | * You should have received a copy of the GNU General Public License |
15 | * along with this program; if not, write to the Free Software | 15 | * along with this program; if not, write to the Free Software |
16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | 16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. |
17 | * | 17 | * |
18 | * Copyright (C) IBM Corporation, 2005 | 18 | * Copyright (C) IBM Corporation, 2005 |
19 | * | 19 | * |
20 | * Authors: Paul E. McKenney <paulmck@us.ibm.com> | 20 | * Authors: Paul E. McKenney <paulmck@us.ibm.com> |
21 | * | 21 | * |
22 | * See also: Documentation/RCU/torture.txt | 22 | * See also: Documentation/RCU/torture.txt |
23 | */ | 23 | */ |
24 | #include <linux/types.h> | 24 | #include <linux/types.h> |
25 | #include <linux/kernel.h> | 25 | #include <linux/kernel.h> |
26 | #include <linux/init.h> | 26 | #include <linux/init.h> |
27 | #include <linux/module.h> | 27 | #include <linux/module.h> |
28 | #include <linux/kthread.h> | 28 | #include <linux/kthread.h> |
29 | #include <linux/err.h> | 29 | #include <linux/err.h> |
30 | #include <linux/spinlock.h> | 30 | #include <linux/spinlock.h> |
31 | #include <linux/smp.h> | 31 | #include <linux/smp.h> |
32 | #include <linux/rcupdate.h> | 32 | #include <linux/rcupdate.h> |
33 | #include <linux/interrupt.h> | 33 | #include <linux/interrupt.h> |
34 | #include <linux/sched.h> | 34 | #include <linux/sched.h> |
35 | #include <asm/atomic.h> | 35 | #include <asm/atomic.h> |
36 | #include <linux/bitops.h> | 36 | #include <linux/bitops.h> |
37 | #include <linux/module.h> | 37 | #include <linux/module.h> |
38 | #include <linux/completion.h> | 38 | #include <linux/completion.h> |
39 | #include <linux/moduleparam.h> | 39 | #include <linux/moduleparam.h> |
40 | #include <linux/percpu.h> | 40 | #include <linux/percpu.h> |
41 | #include <linux/notifier.h> | 41 | #include <linux/notifier.h> |
42 | #include <linux/rcuref.h> | 42 | #include <linux/rcuref.h> |
43 | #include <linux/cpu.h> | 43 | #include <linux/cpu.h> |
44 | #include <linux/random.h> | 44 | #include <linux/random.h> |
45 | #include <linux/delay.h> | 45 | #include <linux/delay.h> |
46 | #include <linux/byteorder/swabb.h> | 46 | #include <linux/byteorder/swabb.h> |
47 | #include <linux/stat.h> | 47 | #include <linux/stat.h> |
48 | 48 | ||
49 | MODULE_LICENSE("GPL"); | 49 | MODULE_LICENSE("GPL"); |
50 | 50 | ||
51 | static int nreaders = -1; /* # reader threads, defaults to 4*ncpus */ | 51 | static int nreaders = -1; /* # reader threads, defaults to 4*ncpus */ |
52 | static int stat_interval = 0; /* Interval between stats, in seconds. */ | 52 | static int stat_interval = 0; /* Interval between stats, in seconds. */ |
53 | /* Defaults to "only at end of test". */ | 53 | /* Defaults to "only at end of test". */ |
54 | static int verbose = 0; /* Print more debug info. */ | 54 | static int verbose = 0; /* Print more debug info. */ |
55 | 55 | ||
56 | MODULE_PARM(nreaders, "i"); | 56 | MODULE_PARM(nreaders, "i"); |
57 | MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); | 57 | MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); |
58 | MODULE_PARM(stat_interval, "i"); | 58 | MODULE_PARM(stat_interval, "i"); |
59 | MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); | 59 | MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); |
60 | MODULE_PARM(verbose, "i"); | 60 | MODULE_PARM(verbose, "i"); |
61 | MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); | 61 | MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); |
62 | #define TORTURE_FLAG "rcutorture: " | 62 | #define TORTURE_FLAG "rcutorture: " |
63 | #define PRINTK_STRING(s) \ | 63 | #define PRINTK_STRING(s) \ |
64 | do { printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) | 64 | do { printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) |
65 | #define VERBOSE_PRINTK_STRING(s) \ | 65 | #define VERBOSE_PRINTK_STRING(s) \ |
66 | do { if (verbose) printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) | 66 | do { if (verbose) printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) |
67 | #define VERBOSE_PRINTK_ERRSTRING(s) \ | 67 | #define VERBOSE_PRINTK_ERRSTRING(s) \ |
68 | do { if (verbose) printk(KERN_ALERT TORTURE_FLAG "!!! " s "\n"); } while (0) | 68 | do { if (verbose) printk(KERN_ALERT TORTURE_FLAG "!!! " s "\n"); } while (0) |
69 | 69 | ||
70 | static char printk_buf[4096]; | 70 | static char printk_buf[4096]; |
71 | 71 | ||
72 | static int nrealreaders; | 72 | static int nrealreaders; |
73 | static struct task_struct *writer_task; | 73 | static struct task_struct *writer_task; |
74 | static struct task_struct **reader_tasks; | 74 | static struct task_struct **reader_tasks; |
75 | static struct task_struct *stats_task; | 75 | static struct task_struct *stats_task; |
76 | 76 | ||
77 | #define RCU_TORTURE_PIPE_LEN 10 | 77 | #define RCU_TORTURE_PIPE_LEN 10 |
78 | 78 | ||
79 | struct rcu_torture { | 79 | struct rcu_torture { |
80 | struct rcu_head rtort_rcu; | 80 | struct rcu_head rtort_rcu; |
81 | int rtort_pipe_count; | 81 | int rtort_pipe_count; |
82 | struct list_head rtort_free; | 82 | struct list_head rtort_free; |
83 | int rtort_mbtest; | 83 | int rtort_mbtest; |
84 | }; | 84 | }; |
85 | 85 | ||
86 | static int fullstop = 0; /* stop generating callbacks at test end. */ | 86 | static int fullstop = 0; /* stop generating callbacks at test end. */ |
87 | static LIST_HEAD(rcu_torture_freelist); | 87 | static LIST_HEAD(rcu_torture_freelist); |
88 | static struct rcu_torture *rcu_torture_current = NULL; | 88 | static struct rcu_torture *rcu_torture_current = NULL; |
89 | static long rcu_torture_current_version = 0; | 89 | static long rcu_torture_current_version = 0; |
90 | static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; | 90 | static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; |
91 | static DEFINE_SPINLOCK(rcu_torture_lock); | 91 | static DEFINE_SPINLOCK(rcu_torture_lock); |
92 | static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = | 92 | static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = |
93 | { 0 }; | 93 | { 0 }; |
94 | static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) = | 94 | static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) = |
95 | { 0 }; | 95 | { 0 }; |
96 | static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1]; | 96 | static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1]; |
97 | atomic_t n_rcu_torture_alloc; | 97 | atomic_t n_rcu_torture_alloc; |
98 | atomic_t n_rcu_torture_alloc_fail; | 98 | atomic_t n_rcu_torture_alloc_fail; |
99 | atomic_t n_rcu_torture_free; | 99 | atomic_t n_rcu_torture_free; |
100 | atomic_t n_rcu_torture_mberror; | 100 | atomic_t n_rcu_torture_mberror; |
101 | atomic_t n_rcu_torture_error; | 101 | atomic_t n_rcu_torture_error; |
102 | 102 | ||
103 | /* | 103 | /* |
104 | * Allocate an element from the rcu_tortures pool. | 104 | * Allocate an element from the rcu_tortures pool. |
105 | */ | 105 | */ |
106 | struct rcu_torture * | 106 | static struct rcu_torture * |
107 | rcu_torture_alloc(void) | 107 | rcu_torture_alloc(void) |
108 | { | 108 | { |
109 | struct list_head *p; | 109 | struct list_head *p; |
110 | 110 | ||
111 | spin_lock(&rcu_torture_lock); | 111 | spin_lock(&rcu_torture_lock); |
112 | if (list_empty(&rcu_torture_freelist)) { | 112 | if (list_empty(&rcu_torture_freelist)) { |
113 | atomic_inc(&n_rcu_torture_alloc_fail); | 113 | atomic_inc(&n_rcu_torture_alloc_fail); |
114 | spin_unlock(&rcu_torture_lock); | 114 | spin_unlock(&rcu_torture_lock); |
115 | return NULL; | 115 | return NULL; |
116 | } | 116 | } |
117 | atomic_inc(&n_rcu_torture_alloc); | 117 | atomic_inc(&n_rcu_torture_alloc); |
118 | p = rcu_torture_freelist.next; | 118 | p = rcu_torture_freelist.next; |
119 | list_del_init(p); | 119 | list_del_init(p); |
120 | spin_unlock(&rcu_torture_lock); | 120 | spin_unlock(&rcu_torture_lock); |
121 | return container_of(p, struct rcu_torture, rtort_free); | 121 | return container_of(p, struct rcu_torture, rtort_free); |
122 | } | 122 | } |
123 | 123 | ||
124 | /* | 124 | /* |
125 | * Free an element to the rcu_tortures pool. | 125 | * Free an element to the rcu_tortures pool. |
126 | */ | 126 | */ |
127 | static void | 127 | static void |
128 | rcu_torture_free(struct rcu_torture *p) | 128 | rcu_torture_free(struct rcu_torture *p) |
129 | { | 129 | { |
130 | atomic_inc(&n_rcu_torture_free); | 130 | atomic_inc(&n_rcu_torture_free); |
131 | spin_lock(&rcu_torture_lock); | 131 | spin_lock(&rcu_torture_lock); |
132 | list_add_tail(&p->rtort_free, &rcu_torture_freelist); | 132 | list_add_tail(&p->rtort_free, &rcu_torture_freelist); |
133 | spin_unlock(&rcu_torture_lock); | 133 | spin_unlock(&rcu_torture_lock); |
134 | } | 134 | } |
135 | 135 | ||
136 | static void | 136 | static void |
137 | rcu_torture_cb(struct rcu_head *p) | 137 | rcu_torture_cb(struct rcu_head *p) |
138 | { | 138 | { |
139 | int i; | 139 | int i; |
140 | struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); | 140 | struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); |
141 | 141 | ||
142 | if (fullstop) { | 142 | if (fullstop) { |
143 | /* Test is ending, just drop callbacks on the floor. */ | 143 | /* Test is ending, just drop callbacks on the floor. */ |
144 | /* The next initialization will pick up the pieces. */ | 144 | /* The next initialization will pick up the pieces. */ |
145 | return; | 145 | return; |
146 | } | 146 | } |
147 | i = rp->rtort_pipe_count; | 147 | i = rp->rtort_pipe_count; |
148 | if (i > RCU_TORTURE_PIPE_LEN) | 148 | if (i > RCU_TORTURE_PIPE_LEN) |
149 | i = RCU_TORTURE_PIPE_LEN; | 149 | i = RCU_TORTURE_PIPE_LEN; |
150 | atomic_inc(&rcu_torture_wcount[i]); | 150 | atomic_inc(&rcu_torture_wcount[i]); |
151 | if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { | 151 | if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { |
152 | rp->rtort_mbtest = 0; | 152 | rp->rtort_mbtest = 0; |
153 | rcu_torture_free(rp); | 153 | rcu_torture_free(rp); |
154 | } else | 154 | } else |
155 | call_rcu(p, rcu_torture_cb); | 155 | call_rcu(p, rcu_torture_cb); |
156 | } | 156 | } |
157 | 157 | ||
158 | struct rcu_random_state { | 158 | struct rcu_random_state { |
159 | unsigned long rrs_state; | 159 | unsigned long rrs_state; |
160 | unsigned long rrs_count; | 160 | unsigned long rrs_count; |
161 | }; | 161 | }; |
162 | 162 | ||
163 | #define RCU_RANDOM_MULT 39916801 /* prime */ | 163 | #define RCU_RANDOM_MULT 39916801 /* prime */ |
164 | #define RCU_RANDOM_ADD 479001701 /* prime */ | 164 | #define RCU_RANDOM_ADD 479001701 /* prime */ |
165 | #define RCU_RANDOM_REFRESH 10000 | 165 | #define RCU_RANDOM_REFRESH 10000 |
166 | 166 | ||
167 | #define DEFINE_RCU_RANDOM(name) struct rcu_random_state name = { 0, 0 } | 167 | #define DEFINE_RCU_RANDOM(name) struct rcu_random_state name = { 0, 0 } |
168 | 168 | ||
169 | /* | 169 | /* |
170 | * Crude but fast random-number generator. Uses a linear congruential | 170 | * Crude but fast random-number generator. Uses a linear congruential |
171 | * generator, with occasional help from get_random_bytes(). | 171 | * generator, with occasional help from get_random_bytes(). |
172 | */ | 172 | */ |
173 | static long | 173 | static long |
174 | rcu_random(struct rcu_random_state *rrsp) | 174 | rcu_random(struct rcu_random_state *rrsp) |
175 | { | 175 | { |
176 | long refresh; | 176 | long refresh; |
177 | 177 | ||
178 | if (--rrsp->rrs_count < 0) { | 178 | if (--rrsp->rrs_count < 0) { |
179 | get_random_bytes(&refresh, sizeof(refresh)); | 179 | get_random_bytes(&refresh, sizeof(refresh)); |
180 | rrsp->rrs_state += refresh; | 180 | rrsp->rrs_state += refresh; |
181 | rrsp->rrs_count = RCU_RANDOM_REFRESH; | 181 | rrsp->rrs_count = RCU_RANDOM_REFRESH; |
182 | } | 182 | } |
183 | rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD; | 183 | rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD; |
184 | return swahw32(rrsp->rrs_state); | 184 | return swahw32(rrsp->rrs_state); |
185 | } | 185 | } |
186 | 186 | ||
187 | /* | 187 | /* |
188 | * RCU torture writer kthread. Repeatedly substitutes a new structure | 188 | * RCU torture writer kthread. Repeatedly substitutes a new structure |
189 | * for that pointed to by rcu_torture_current, freeing the old structure | 189 | * for that pointed to by rcu_torture_current, freeing the old structure |
190 | * after a series of grace periods (the "pipeline"). | 190 | * after a series of grace periods (the "pipeline"). |
191 | */ | 191 | */ |
192 | static int | 192 | static int |
193 | rcu_torture_writer(void *arg) | 193 | rcu_torture_writer(void *arg) |
194 | { | 194 | { |
195 | int i; | 195 | int i; |
196 | long oldbatch = rcu_batches_completed(); | 196 | long oldbatch = rcu_batches_completed(); |
197 | struct rcu_torture *rp; | 197 | struct rcu_torture *rp; |
198 | struct rcu_torture *old_rp; | 198 | struct rcu_torture *old_rp; |
199 | static DEFINE_RCU_RANDOM(rand); | 199 | static DEFINE_RCU_RANDOM(rand); |
200 | 200 | ||
201 | VERBOSE_PRINTK_STRING("rcu_torture_writer task started"); | 201 | VERBOSE_PRINTK_STRING("rcu_torture_writer task started"); |
202 | set_user_nice(current, 19); | 202 | set_user_nice(current, 19); |
203 | 203 | ||
204 | do { | 204 | do { |
205 | schedule_timeout_uninterruptible(1); | 205 | schedule_timeout_uninterruptible(1); |
206 | if (rcu_batches_completed() == oldbatch) | 206 | if (rcu_batches_completed() == oldbatch) |
207 | continue; | 207 | continue; |
208 | if ((rp = rcu_torture_alloc()) == NULL) | 208 | if ((rp = rcu_torture_alloc()) == NULL) |
209 | continue; | 209 | continue; |
210 | rp->rtort_pipe_count = 0; | 210 | rp->rtort_pipe_count = 0; |
211 | udelay(rcu_random(&rand) & 0x3ff); | 211 | udelay(rcu_random(&rand) & 0x3ff); |
212 | old_rp = rcu_torture_current; | 212 | old_rp = rcu_torture_current; |
213 | rp->rtort_mbtest = 1; | 213 | rp->rtort_mbtest = 1; |
214 | rcu_assign_pointer(rcu_torture_current, rp); | 214 | rcu_assign_pointer(rcu_torture_current, rp); |
215 | smp_wmb(); | 215 | smp_wmb(); |
216 | if (old_rp != NULL) { | 216 | if (old_rp != NULL) { |
217 | i = old_rp->rtort_pipe_count; | 217 | i = old_rp->rtort_pipe_count; |
218 | if (i > RCU_TORTURE_PIPE_LEN) | 218 | if (i > RCU_TORTURE_PIPE_LEN) |
219 | i = RCU_TORTURE_PIPE_LEN; | 219 | i = RCU_TORTURE_PIPE_LEN; |
220 | atomic_inc(&rcu_torture_wcount[i]); | 220 | atomic_inc(&rcu_torture_wcount[i]); |
221 | old_rp->rtort_pipe_count++; | 221 | old_rp->rtort_pipe_count++; |
222 | call_rcu(&old_rp->rtort_rcu, rcu_torture_cb); | 222 | call_rcu(&old_rp->rtort_rcu, rcu_torture_cb); |
223 | } | 223 | } |
224 | rcu_torture_current_version++; | 224 | rcu_torture_current_version++; |
225 | oldbatch = rcu_batches_completed(); | 225 | oldbatch = rcu_batches_completed(); |
226 | } while (!kthread_should_stop() && !fullstop); | 226 | } while (!kthread_should_stop() && !fullstop); |
227 | VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); | 227 | VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); |
228 | while (!kthread_should_stop()) | 228 | while (!kthread_should_stop()) |
229 | schedule_timeout_uninterruptible(1); | 229 | schedule_timeout_uninterruptible(1); |
230 | return 0; | 230 | return 0; |
231 | } | 231 | } |
232 | 232 | ||
233 | /* | 233 | /* |
234 | * RCU torture reader kthread. Repeatedly dereferences rcu_torture_current, | 234 | * RCU torture reader kthread. Repeatedly dereferences rcu_torture_current, |
235 | * incrementing the corresponding element of the pipeline array. The | 235 | * incrementing the corresponding element of the pipeline array. The |
236 | * counter in the element should never be greater than 1, otherwise, the | 236 | * counter in the element should never be greater than 1, otherwise, the |
237 | * RCU implementation is broken. | 237 | * RCU implementation is broken. |
238 | */ | 238 | */ |
239 | static int | 239 | static int |
240 | rcu_torture_reader(void *arg) | 240 | rcu_torture_reader(void *arg) |
241 | { | 241 | { |
242 | int completed; | 242 | int completed; |
243 | DEFINE_RCU_RANDOM(rand); | 243 | DEFINE_RCU_RANDOM(rand); |
244 | struct rcu_torture *p; | 244 | struct rcu_torture *p; |
245 | int pipe_count; | 245 | int pipe_count; |
246 | 246 | ||
247 | VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); | 247 | VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); |
248 | set_user_nice(current, 19); | 248 | set_user_nice(current, 19); |
249 | 249 | ||
250 | do { | 250 | do { |
251 | rcu_read_lock(); | 251 | rcu_read_lock(); |
252 | completed = rcu_batches_completed(); | 252 | completed = rcu_batches_completed(); |
253 | p = rcu_dereference(rcu_torture_current); | 253 | p = rcu_dereference(rcu_torture_current); |
254 | if (p == NULL) { | 254 | if (p == NULL) { |
255 | /* Wait for rcu_torture_writer to get underway */ | 255 | /* Wait for rcu_torture_writer to get underway */ |
256 | rcu_read_unlock(); | 256 | rcu_read_unlock(); |
257 | schedule_timeout_interruptible(HZ); | 257 | schedule_timeout_interruptible(HZ); |
258 | continue; | 258 | continue; |
259 | } | 259 | } |
260 | if (p->rtort_mbtest == 0) | 260 | if (p->rtort_mbtest == 0) |
261 | atomic_inc(&n_rcu_torture_mberror); | 261 | atomic_inc(&n_rcu_torture_mberror); |
262 | udelay(rcu_random(&rand) & 0x7f); | 262 | udelay(rcu_random(&rand) & 0x7f); |
263 | preempt_disable(); | 263 | preempt_disable(); |
264 | pipe_count = p->rtort_pipe_count; | 264 | pipe_count = p->rtort_pipe_count; |
265 | if (pipe_count > RCU_TORTURE_PIPE_LEN) { | 265 | if (pipe_count > RCU_TORTURE_PIPE_LEN) { |
266 | /* Should not happen, but... */ | 266 | /* Should not happen, but... */ |
267 | pipe_count = RCU_TORTURE_PIPE_LEN; | 267 | pipe_count = RCU_TORTURE_PIPE_LEN; |
268 | } | 268 | } |
269 | ++__get_cpu_var(rcu_torture_count)[pipe_count]; | 269 | ++__get_cpu_var(rcu_torture_count)[pipe_count]; |
270 | completed = rcu_batches_completed() - completed; | 270 | completed = rcu_batches_completed() - completed; |
271 | if (completed > RCU_TORTURE_PIPE_LEN) { | 271 | if (completed > RCU_TORTURE_PIPE_LEN) { |
272 | /* Should not happen, but... */ | 272 | /* Should not happen, but... */ |
273 | completed = RCU_TORTURE_PIPE_LEN; | 273 | completed = RCU_TORTURE_PIPE_LEN; |
274 | } | 274 | } |
275 | ++__get_cpu_var(rcu_torture_batch)[completed]; | 275 | ++__get_cpu_var(rcu_torture_batch)[completed]; |
276 | preempt_enable(); | 276 | preempt_enable(); |
277 | rcu_read_unlock(); | 277 | rcu_read_unlock(); |
278 | schedule(); | 278 | schedule(); |
279 | } while (!kthread_should_stop() && !fullstop); | 279 | } while (!kthread_should_stop() && !fullstop); |
280 | VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); | 280 | VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); |
281 | while (!kthread_should_stop()) | 281 | while (!kthread_should_stop()) |
282 | schedule_timeout_uninterruptible(1); | 282 | schedule_timeout_uninterruptible(1); |
283 | return 0; | 283 | return 0; |
284 | } | 284 | } |
285 | 285 | ||
286 | /* | 286 | /* |
287 | * Create an RCU-torture statistics message in the specified buffer. | 287 | * Create an RCU-torture statistics message in the specified buffer. |
288 | */ | 288 | */ |
289 | static int | 289 | static int |
290 | rcu_torture_printk(char *page) | 290 | rcu_torture_printk(char *page) |
291 | { | 291 | { |
292 | int cnt = 0; | 292 | int cnt = 0; |
293 | int cpu; | 293 | int cpu; |
294 | int i; | 294 | int i; |
295 | long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; | 295 | long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; |
296 | long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; | 296 | long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; |
297 | 297 | ||
298 | for_each_cpu(cpu) { | 298 | for_each_cpu(cpu) { |
299 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { | 299 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { |
300 | pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i]; | 300 | pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i]; |
301 | batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i]; | 301 | batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i]; |
302 | } | 302 | } |
303 | } | 303 | } |
304 | for (i = RCU_TORTURE_PIPE_LEN - 1; i >= 0; i--) { | 304 | for (i = RCU_TORTURE_PIPE_LEN - 1; i >= 0; i--) { |
305 | if (pipesummary[i] != 0) | 305 | if (pipesummary[i] != 0) |
306 | break; | 306 | break; |
307 | } | 307 | } |
308 | cnt += sprintf(&page[cnt], "rcutorture: "); | 308 | cnt += sprintf(&page[cnt], "rcutorture: "); |
309 | cnt += sprintf(&page[cnt], | 309 | cnt += sprintf(&page[cnt], |
310 | "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " | 310 | "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " |
311 | "rtmbe: %d", | 311 | "rtmbe: %d", |
312 | rcu_torture_current, | 312 | rcu_torture_current, |
313 | rcu_torture_current_version, | 313 | rcu_torture_current_version, |
314 | list_empty(&rcu_torture_freelist), | 314 | list_empty(&rcu_torture_freelist), |
315 | atomic_read(&n_rcu_torture_alloc), | 315 | atomic_read(&n_rcu_torture_alloc), |
316 | atomic_read(&n_rcu_torture_alloc_fail), | 316 | atomic_read(&n_rcu_torture_alloc_fail), |
317 | atomic_read(&n_rcu_torture_free), | 317 | atomic_read(&n_rcu_torture_free), |
318 | atomic_read(&n_rcu_torture_mberror)); | 318 | atomic_read(&n_rcu_torture_mberror)); |
319 | if (atomic_read(&n_rcu_torture_mberror) != 0) | 319 | if (atomic_read(&n_rcu_torture_mberror) != 0) |
320 | cnt += sprintf(&page[cnt], " !!!"); | 320 | cnt += sprintf(&page[cnt], " !!!"); |
321 | cnt += sprintf(&page[cnt], "\nrcutorture: "); | 321 | cnt += sprintf(&page[cnt], "\nrcutorture: "); |
322 | if (i > 1) { | 322 | if (i > 1) { |
323 | cnt += sprintf(&page[cnt], "!!! "); | 323 | cnt += sprintf(&page[cnt], "!!! "); |
324 | atomic_inc(&n_rcu_torture_error); | 324 | atomic_inc(&n_rcu_torture_error); |
325 | } | 325 | } |
326 | cnt += sprintf(&page[cnt], "Reader Pipe: "); | 326 | cnt += sprintf(&page[cnt], "Reader Pipe: "); |
327 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) | 327 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) |
328 | cnt += sprintf(&page[cnt], " %ld", pipesummary[i]); | 328 | cnt += sprintf(&page[cnt], " %ld", pipesummary[i]); |
329 | cnt += sprintf(&page[cnt], "\nrcutorture: "); | 329 | cnt += sprintf(&page[cnt], "\nrcutorture: "); |
330 | cnt += sprintf(&page[cnt], "Reader Batch: "); | 330 | cnt += sprintf(&page[cnt], "Reader Batch: "); |
331 | for (i = 0; i < RCU_TORTURE_PIPE_LEN; i++) | 331 | for (i = 0; i < RCU_TORTURE_PIPE_LEN; i++) |
332 | cnt += sprintf(&page[cnt], " %ld", batchsummary[i]); | 332 | cnt += sprintf(&page[cnt], " %ld", batchsummary[i]); |
333 | cnt += sprintf(&page[cnt], "\nrcutorture: "); | 333 | cnt += sprintf(&page[cnt], "\nrcutorture: "); |
334 | cnt += sprintf(&page[cnt], "Free-Block Circulation: "); | 334 | cnt += sprintf(&page[cnt], "Free-Block Circulation: "); |
335 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { | 335 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { |
336 | cnt += sprintf(&page[cnt], " %d", | 336 | cnt += sprintf(&page[cnt], " %d", |
337 | atomic_read(&rcu_torture_wcount[i])); | 337 | atomic_read(&rcu_torture_wcount[i])); |
338 | } | 338 | } |
339 | cnt += sprintf(&page[cnt], "\n"); | 339 | cnt += sprintf(&page[cnt], "\n"); |
340 | return cnt; | 340 | return cnt; |
341 | } | 341 | } |
342 | 342 | ||
343 | /* | 343 | /* |
344 | * Print torture statistics. Caller must ensure that there is only | 344 | * Print torture statistics. Caller must ensure that there is only |
345 | * one call to this function at a given time!!! This is normally | 345 | * one call to this function at a given time!!! This is normally |
346 | * accomplished by relying on the module system to only have one copy | 346 | * accomplished by relying on the module system to only have one copy |
347 | * of the module loaded, and then by giving the rcu_torture_stats | 347 | * of the module loaded, and then by giving the rcu_torture_stats |
348 | * kthread full control (or the init/cleanup functions when the | 348 | * kthread full control (or the init/cleanup functions when the |
349 | * rcu_torture_stats thread is not running). | 349 | * rcu_torture_stats thread is not running). |
350 | */ | 350 | */ |
351 | static void | 351 | static void |
352 | rcu_torture_stats_print(void) | 352 | rcu_torture_stats_print(void) |
353 | { | 353 | { |
354 | int cnt; | 354 | int cnt; |
355 | 355 | ||
356 | cnt = rcu_torture_printk(printk_buf); | 356 | cnt = rcu_torture_printk(printk_buf); |
357 | printk(KERN_ALERT "%s", printk_buf); | 357 | printk(KERN_ALERT "%s", printk_buf); |
358 | } | 358 | } |
359 | 359 | ||
360 | /* | 360 | /* |
361 | * Periodically prints torture statistics, if periodic statistics printing | 361 | * Periodically prints torture statistics, if periodic statistics printing |
362 | * was specified via the stat_interval module parameter. | 362 | * was specified via the stat_interval module parameter. |
363 | * | 363 | * |
364 | * No need to worry about fullstop here, since this one doesn't reference | 364 | * No need to worry about fullstop here, since this one doesn't reference |
365 | * volatile state or register callbacks. | 365 | * volatile state or register callbacks. |
366 | */ | 366 | */ |
367 | static int | 367 | static int |
368 | rcu_torture_stats(void *arg) | 368 | rcu_torture_stats(void *arg) |
369 | { | 369 | { |
370 | VERBOSE_PRINTK_STRING("rcu_torture_stats task started"); | 370 | VERBOSE_PRINTK_STRING("rcu_torture_stats task started"); |
371 | do { | 371 | do { |
372 | schedule_timeout_interruptible(stat_interval * HZ); | 372 | schedule_timeout_interruptible(stat_interval * HZ); |
373 | rcu_torture_stats_print(); | 373 | rcu_torture_stats_print(); |
374 | } while (!kthread_should_stop()); | 374 | } while (!kthread_should_stop()); |
375 | VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping"); | 375 | VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping"); |
376 | return 0; | 376 | return 0; |
377 | } | 377 | } |
378 | 378 | ||
379 | static void | 379 | static void |
380 | rcu_torture_cleanup(void) | 380 | rcu_torture_cleanup(void) |
381 | { | 381 | { |
382 | int i; | 382 | int i; |
383 | 383 | ||
384 | fullstop = 1; | 384 | fullstop = 1; |
385 | if (writer_task != NULL) { | 385 | if (writer_task != NULL) { |
386 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); | 386 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); |
387 | kthread_stop(writer_task); | 387 | kthread_stop(writer_task); |
388 | } | 388 | } |
389 | writer_task = NULL; | 389 | writer_task = NULL; |
390 | 390 | ||
391 | if (reader_tasks != NULL) { | 391 | if (reader_tasks != NULL) { |
392 | for (i = 0; i < nrealreaders; i++) { | 392 | for (i = 0; i < nrealreaders; i++) { |
393 | if (reader_tasks[i] != NULL) { | 393 | if (reader_tasks[i] != NULL) { |
394 | VERBOSE_PRINTK_STRING( | 394 | VERBOSE_PRINTK_STRING( |
395 | "Stopping rcu_torture_reader task"); | 395 | "Stopping rcu_torture_reader task"); |
396 | kthread_stop(reader_tasks[i]); | 396 | kthread_stop(reader_tasks[i]); |
397 | } | 397 | } |
398 | reader_tasks[i] = NULL; | 398 | reader_tasks[i] = NULL; |
399 | } | 399 | } |
400 | kfree(reader_tasks); | 400 | kfree(reader_tasks); |
401 | reader_tasks = NULL; | 401 | reader_tasks = NULL; |
402 | } | 402 | } |
403 | rcu_torture_current = NULL; | 403 | rcu_torture_current = NULL; |
404 | 404 | ||
405 | if (stats_task != NULL) { | 405 | if (stats_task != NULL) { |
406 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task"); | 406 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task"); |
407 | kthread_stop(stats_task); | 407 | kthread_stop(stats_task); |
408 | } | 408 | } |
409 | stats_task = NULL; | 409 | stats_task = NULL; |
410 | 410 | ||
411 | /* Wait for all RCU callbacks to fire. */ | 411 | /* Wait for all RCU callbacks to fire. */ |
412 | rcu_barrier(); | 412 | rcu_barrier(); |
413 | 413 | ||
414 | rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ | 414 | rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ |
415 | printk(KERN_ALERT TORTURE_FLAG | 415 | printk(KERN_ALERT TORTURE_FLAG |
416 | "--- End of test: %s\n", | 416 | "--- End of test: %s\n", |
417 | atomic_read(&n_rcu_torture_error) == 0 ? "SUCCESS" : "FAILURE"); | 417 | atomic_read(&n_rcu_torture_error) == 0 ? "SUCCESS" : "FAILURE"); |
418 | } | 418 | } |
419 | 419 | ||
420 | static int | 420 | static int |
421 | rcu_torture_init(void) | 421 | rcu_torture_init(void) |
422 | { | 422 | { |
423 | int i; | 423 | int i; |
424 | int cpu; | 424 | int cpu; |
425 | int firsterr = 0; | 425 | int firsterr = 0; |
426 | 426 | ||
427 | /* Process args and tell the world that the torturer is on the job. */ | 427 | /* Process args and tell the world that the torturer is on the job. */ |
428 | 428 | ||
429 | if (nreaders >= 0) | 429 | if (nreaders >= 0) |
430 | nrealreaders = nreaders; | 430 | nrealreaders = nreaders; |
431 | else | 431 | else |
432 | nrealreaders = 2 * num_online_cpus(); | 432 | nrealreaders = 2 * num_online_cpus(); |
433 | printk(KERN_ALERT TORTURE_FLAG | 433 | printk(KERN_ALERT TORTURE_FLAG |
434 | "--- Start of test: nreaders=%d stat_interval=%d verbose=%d\n", | 434 | "--- Start of test: nreaders=%d stat_interval=%d verbose=%d\n", |
435 | nrealreaders, stat_interval, verbose); | 435 | nrealreaders, stat_interval, verbose); |
436 | fullstop = 0; | 436 | fullstop = 0; |
437 | 437 | ||
438 | /* Set up the freelist. */ | 438 | /* Set up the freelist. */ |
439 | 439 | ||
440 | INIT_LIST_HEAD(&rcu_torture_freelist); | 440 | INIT_LIST_HEAD(&rcu_torture_freelist); |
441 | for (i = 0; i < sizeof(rcu_tortures) / sizeof(rcu_tortures[0]); i++) { | 441 | for (i = 0; i < sizeof(rcu_tortures) / sizeof(rcu_tortures[0]); i++) { |
442 | rcu_tortures[i].rtort_mbtest = 0; | 442 | rcu_tortures[i].rtort_mbtest = 0; |
443 | list_add_tail(&rcu_tortures[i].rtort_free, | 443 | list_add_tail(&rcu_tortures[i].rtort_free, |
444 | &rcu_torture_freelist); | 444 | &rcu_torture_freelist); |
445 | } | 445 | } |
446 | 446 | ||
447 | /* Initialize the statistics so that each run gets its own numbers. */ | 447 | /* Initialize the statistics so that each run gets its own numbers. */ |
448 | 448 | ||
449 | rcu_torture_current = NULL; | 449 | rcu_torture_current = NULL; |
450 | rcu_torture_current_version = 0; | 450 | rcu_torture_current_version = 0; |
451 | atomic_set(&n_rcu_torture_alloc, 0); | 451 | atomic_set(&n_rcu_torture_alloc, 0); |
452 | atomic_set(&n_rcu_torture_alloc_fail, 0); | 452 | atomic_set(&n_rcu_torture_alloc_fail, 0); |
453 | atomic_set(&n_rcu_torture_free, 0); | 453 | atomic_set(&n_rcu_torture_free, 0); |
454 | atomic_set(&n_rcu_torture_mberror, 0); | 454 | atomic_set(&n_rcu_torture_mberror, 0); |
455 | atomic_set(&n_rcu_torture_error, 0); | 455 | atomic_set(&n_rcu_torture_error, 0); |
456 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) | 456 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) |
457 | atomic_set(&rcu_torture_wcount[i], 0); | 457 | atomic_set(&rcu_torture_wcount[i], 0); |
458 | for_each_cpu(cpu) { | 458 | for_each_cpu(cpu) { |
459 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { | 459 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { |
460 | per_cpu(rcu_torture_count, cpu)[i] = 0; | 460 | per_cpu(rcu_torture_count, cpu)[i] = 0; |
461 | per_cpu(rcu_torture_batch, cpu)[i] = 0; | 461 | per_cpu(rcu_torture_batch, cpu)[i] = 0; |
462 | } | 462 | } |
463 | } | 463 | } |
464 | 464 | ||
465 | /* Start up the kthreads. */ | 465 | /* Start up the kthreads. */ |
466 | 466 | ||
467 | VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task"); | 467 | VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task"); |
468 | writer_task = kthread_run(rcu_torture_writer, NULL, | 468 | writer_task = kthread_run(rcu_torture_writer, NULL, |
469 | "rcu_torture_writer"); | 469 | "rcu_torture_writer"); |
470 | if (IS_ERR(writer_task)) { | 470 | if (IS_ERR(writer_task)) { |
471 | firsterr = PTR_ERR(writer_task); | 471 | firsterr = PTR_ERR(writer_task); |
472 | VERBOSE_PRINTK_ERRSTRING("Failed to create writer"); | 472 | VERBOSE_PRINTK_ERRSTRING("Failed to create writer"); |
473 | writer_task = NULL; | 473 | writer_task = NULL; |
474 | goto unwind; | 474 | goto unwind; |
475 | } | 475 | } |
476 | reader_tasks = kmalloc(nrealreaders * sizeof(reader_tasks[0]), | 476 | reader_tasks = kmalloc(nrealreaders * sizeof(reader_tasks[0]), |
477 | GFP_KERNEL); | 477 | GFP_KERNEL); |
478 | if (reader_tasks == NULL) { | 478 | if (reader_tasks == NULL) { |
479 | VERBOSE_PRINTK_ERRSTRING("out of memory"); | 479 | VERBOSE_PRINTK_ERRSTRING("out of memory"); |
480 | firsterr = -ENOMEM; | 480 | firsterr = -ENOMEM; |
481 | goto unwind; | 481 | goto unwind; |
482 | } | 482 | } |
483 | for (i = 0; i < nrealreaders; i++) { | 483 | for (i = 0; i < nrealreaders; i++) { |
484 | VERBOSE_PRINTK_STRING("Creating rcu_torture_reader task"); | 484 | VERBOSE_PRINTK_STRING("Creating rcu_torture_reader task"); |
485 | reader_tasks[i] = kthread_run(rcu_torture_reader, NULL, | 485 | reader_tasks[i] = kthread_run(rcu_torture_reader, NULL, |
486 | "rcu_torture_reader"); | 486 | "rcu_torture_reader"); |
487 | if (IS_ERR(reader_tasks[i])) { | 487 | if (IS_ERR(reader_tasks[i])) { |
488 | firsterr = PTR_ERR(reader_tasks[i]); | 488 | firsterr = PTR_ERR(reader_tasks[i]); |
489 | VERBOSE_PRINTK_ERRSTRING("Failed to create reader"); | 489 | VERBOSE_PRINTK_ERRSTRING("Failed to create reader"); |
490 | reader_tasks[i] = NULL; | 490 | reader_tasks[i] = NULL; |
491 | goto unwind; | 491 | goto unwind; |
492 | } | 492 | } |
493 | } | 493 | } |
494 | if (stat_interval > 0) { | 494 | if (stat_interval > 0) { |
495 | VERBOSE_PRINTK_STRING("Creating rcu_torture_stats task"); | 495 | VERBOSE_PRINTK_STRING("Creating rcu_torture_stats task"); |
496 | stats_task = kthread_run(rcu_torture_stats, NULL, | 496 | stats_task = kthread_run(rcu_torture_stats, NULL, |
497 | "rcu_torture_stats"); | 497 | "rcu_torture_stats"); |
498 | if (IS_ERR(stats_task)) { | 498 | if (IS_ERR(stats_task)) { |
499 | firsterr = PTR_ERR(stats_task); | 499 | firsterr = PTR_ERR(stats_task); |
500 | VERBOSE_PRINTK_ERRSTRING("Failed to create stats"); | 500 | VERBOSE_PRINTK_ERRSTRING("Failed to create stats"); |
501 | stats_task = NULL; | 501 | stats_task = NULL; |
502 | goto unwind; | 502 | goto unwind; |
503 | } | 503 | } |
504 | } | 504 | } |
505 | return 0; | 505 | return 0; |
506 | 506 | ||
507 | unwind: | 507 | unwind: |
508 | rcu_torture_cleanup(); | 508 | rcu_torture_cleanup(); |
509 | return firsterr; | 509 | return firsterr; |
510 | } | 510 | } |
511 | 511 | ||
512 | module_init(rcu_torture_init); | 512 | module_init(rcu_torture_init); |
513 | module_exit(rcu_torture_cleanup); | 513 | module_exit(rcu_torture_cleanup); |
514 | 514 |
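Note on rcu_random() above: it is a linear congruential generator whose 32-bit result has its 16-bit halfwords swapped by swahw32(), and which reseeds from get_random_bytes() every RCU_RANDOM_REFRESH calls. The sketch below reproduces the same recurrence stand-alone, with the periodic reseed omitted; it is illustrative only and not part of this commit.

/* Hypothetical stand-alone sketch of the rcu_random() recurrence. */
#include <stdio.h>

#define RCU_RANDOM_MULT 39916801	/* prime */
#define RCU_RANDOM_ADD  479001701	/* prime */

static unsigned long rrs_state = 1;	/* fixed seed; the kernel version reseeds periodically */

static long example_rcu_random(void)
{
	rrs_state = rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD;
	/* swahw32(): swap the two 16-bit halfwords of the low 32 bits */
	return ((rrs_state & 0xffffUL) << 16) | ((rrs_state >> 16) & 0xffffUL);
}

int main(void)
{
	int i;

	for (i = 0; i < 4; i++)
		printf("%ld\n", example_rcu_random());
	return 0;
}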
kernel/timer.c
1 | /* | 1 | /* |
2 | * linux/kernel/timer.c | 2 | * linux/kernel/timer.c |
3 | * | 3 | * |
4 | * Kernel internal timers, kernel timekeeping, basic process system calls | 4 | * Kernel internal timers, kernel timekeeping, basic process system calls |
5 | * | 5 | * |
6 | * Copyright (C) 1991, 1992 Linus Torvalds | 6 | * Copyright (C) 1991, 1992 Linus Torvalds |
7 | * | 7 | * |
8 | * 1997-01-28 Modified by Finn Arne Gangstad to make timers scale better. | 8 | * 1997-01-28 Modified by Finn Arne Gangstad to make timers scale better. |
9 | * | 9 | * |
10 | * 1997-09-10 Updated NTP code according to technical memorandum Jan '96 | 10 | * 1997-09-10 Updated NTP code according to technical memorandum Jan '96 |
11 | * "A Kernel Model for Precision Timekeeping" by Dave Mills | 11 | * "A Kernel Model for Precision Timekeeping" by Dave Mills |
12 | * 1998-12-24 Fixed a xtime SMP race (we need the xtime_lock rw spinlock to | 12 | * 1998-12-24 Fixed a xtime SMP race (we need the xtime_lock rw spinlock to |
13 | * serialize accesses to xtime/lost_ticks). | 13 | * serialize accesses to xtime/lost_ticks). |
14 | * Copyright (C) 1998 Andrea Arcangeli | 14 | * Copyright (C) 1998 Andrea Arcangeli |
15 | * 1999-03-10 Improved NTP compatibility by Ulrich Windl | 15 | * 1999-03-10 Improved NTP compatibility by Ulrich Windl |
16 | * 2002-05-31 Move sys_sysinfo here and make its locking sane, Robert Love | 16 | * 2002-05-31 Move sys_sysinfo here and make its locking sane, Robert Love |
17 | * 2000-10-05 Implemented scalable SMP per-CPU timer handling. | 17 | * 2000-10-05 Implemented scalable SMP per-CPU timer handling. |
18 | * Copyright (C) 2000, 2001, 2002 Ingo Molnar | 18 | * Copyright (C) 2000, 2001, 2002 Ingo Molnar |
19 | * Designed by David S. Miller, Alexey Kuznetsov and Ingo Molnar | 19 | * Designed by David S. Miller, Alexey Kuznetsov and Ingo Molnar |
20 | */ | 20 | */ |
21 | 21 | ||
22 | #include <linux/kernel_stat.h> | 22 | #include <linux/kernel_stat.h> |
23 | #include <linux/module.h> | 23 | #include <linux/module.h> |
24 | #include <linux/interrupt.h> | 24 | #include <linux/interrupt.h> |
25 | #include <linux/percpu.h> | 25 | #include <linux/percpu.h> |
26 | #include <linux/init.h> | 26 | #include <linux/init.h> |
27 | #include <linux/mm.h> | 27 | #include <linux/mm.h> |
28 | #include <linux/swap.h> | 28 | #include <linux/swap.h> |
29 | #include <linux/notifier.h> | 29 | #include <linux/notifier.h> |
30 | #include <linux/thread_info.h> | 30 | #include <linux/thread_info.h> |
31 | #include <linux/time.h> | 31 | #include <linux/time.h> |
32 | #include <linux/jiffies.h> | 32 | #include <linux/jiffies.h> |
33 | #include <linux/posix-timers.h> | 33 | #include <linux/posix-timers.h> |
34 | #include <linux/cpu.h> | 34 | #include <linux/cpu.h> |
35 | #include <linux/syscalls.h> | 35 | #include <linux/syscalls.h> |
36 | #include <linux/delay.h> | ||
36 | 37 | ||
37 | #include <asm/uaccess.h> | 38 | #include <asm/uaccess.h> |
38 | #include <asm/unistd.h> | 39 | #include <asm/unistd.h> |
39 | #include <asm/div64.h> | 40 | #include <asm/div64.h> |
40 | #include <asm/timex.h> | 41 | #include <asm/timex.h> |
41 | #include <asm/io.h> | 42 | #include <asm/io.h> |
42 | 43 | ||
43 | #ifdef CONFIG_TIME_INTERPOLATION | 44 | #ifdef CONFIG_TIME_INTERPOLATION |
44 | static void time_interpolator_update(long delta_nsec); | 45 | static void time_interpolator_update(long delta_nsec); |
45 | #else | 46 | #else |
46 | #define time_interpolator_update(x) | 47 | #define time_interpolator_update(x) |
47 | #endif | 48 | #endif |
48 | 49 | ||
49 | u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; | 50 | u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; |
50 | 51 | ||
51 | EXPORT_SYMBOL(jiffies_64); | 52 | EXPORT_SYMBOL(jiffies_64); |
52 | 53 | ||
53 | /* | 54 | /* |
54 | * per-CPU timer vector definitions: | 55 | * per-CPU timer vector definitions: |
55 | */ | 56 | */ |
56 | 57 | ||
57 | #define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6) | 58 | #define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6) |
58 | #define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8) | 59 | #define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8) |
59 | #define TVN_SIZE (1 << TVN_BITS) | 60 | #define TVN_SIZE (1 << TVN_BITS) |
60 | #define TVR_SIZE (1 << TVR_BITS) | 61 | #define TVR_SIZE (1 << TVR_BITS) |
61 | #define TVN_MASK (TVN_SIZE - 1) | 62 | #define TVN_MASK (TVN_SIZE - 1) |
62 | #define TVR_MASK (TVR_SIZE - 1) | 63 | #define TVR_MASK (TVR_SIZE - 1) |
63 | 64 | ||
64 | struct timer_base_s { | 65 | struct timer_base_s { |
65 | spinlock_t lock; | 66 | spinlock_t lock; |
66 | struct timer_list *running_timer; | 67 | struct timer_list *running_timer; |
67 | }; | 68 | }; |
68 | 69 | ||
69 | typedef struct tvec_s { | 70 | typedef struct tvec_s { |
70 | struct list_head vec[TVN_SIZE]; | 71 | struct list_head vec[TVN_SIZE]; |
71 | } tvec_t; | 72 | } tvec_t; |
72 | 73 | ||
73 | typedef struct tvec_root_s { | 74 | typedef struct tvec_root_s { |
74 | struct list_head vec[TVR_SIZE]; | 75 | struct list_head vec[TVR_SIZE]; |
75 | } tvec_root_t; | 76 | } tvec_root_t; |
76 | 77 | ||
77 | struct tvec_t_base_s { | 78 | struct tvec_t_base_s { |
78 | struct timer_base_s t_base; | 79 | struct timer_base_s t_base; |
79 | unsigned long timer_jiffies; | 80 | unsigned long timer_jiffies; |
80 | tvec_root_t tv1; | 81 | tvec_root_t tv1; |
81 | tvec_t tv2; | 82 | tvec_t tv2; |
82 | tvec_t tv3; | 83 | tvec_t tv3; |
83 | tvec_t tv4; | 84 | tvec_t tv4; |
84 | tvec_t tv5; | 85 | tvec_t tv5; |
85 | } ____cacheline_aligned_in_smp; | 86 | } ____cacheline_aligned_in_smp; |
86 | 87 | ||
87 | typedef struct tvec_t_base_s tvec_base_t; | 88 | typedef struct tvec_t_base_s tvec_base_t; |
88 | static DEFINE_PER_CPU(tvec_base_t, tvec_bases); | 89 | static DEFINE_PER_CPU(tvec_base_t, tvec_bases); |
89 | 90 | ||
90 | static inline void set_running_timer(tvec_base_t *base, | 91 | static inline void set_running_timer(tvec_base_t *base, |
91 | struct timer_list *timer) | 92 | struct timer_list *timer) |
92 | { | 93 | { |
93 | #ifdef CONFIG_SMP | 94 | #ifdef CONFIG_SMP |
94 | base->t_base.running_timer = timer; | 95 | base->t_base.running_timer = timer; |
95 | #endif | 96 | #endif |
96 | } | 97 | } |
97 | 98 | ||
98 | static void internal_add_timer(tvec_base_t *base, struct timer_list *timer) | 99 | static void internal_add_timer(tvec_base_t *base, struct timer_list *timer) |
99 | { | 100 | { |
100 | unsigned long expires = timer->expires; | 101 | unsigned long expires = timer->expires; |
101 | unsigned long idx = expires - base->timer_jiffies; | 102 | unsigned long idx = expires - base->timer_jiffies; |
102 | struct list_head *vec; | 103 | struct list_head *vec; |
103 | 104 | ||
104 | if (idx < TVR_SIZE) { | 105 | if (idx < TVR_SIZE) { |
105 | int i = expires & TVR_MASK; | 106 | int i = expires & TVR_MASK; |
106 | vec = base->tv1.vec + i; | 107 | vec = base->tv1.vec + i; |
107 | } else if (idx < 1 << (TVR_BITS + TVN_BITS)) { | 108 | } else if (idx < 1 << (TVR_BITS + TVN_BITS)) { |
108 | int i = (expires >> TVR_BITS) & TVN_MASK; | 109 | int i = (expires >> TVR_BITS) & TVN_MASK; |
109 | vec = base->tv2.vec + i; | 110 | vec = base->tv2.vec + i; |
110 | } else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) { | 111 | } else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) { |
111 | int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK; | 112 | int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK; |
112 | vec = base->tv3.vec + i; | 113 | vec = base->tv3.vec + i; |
113 | } else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) { | 114 | } else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) { |
114 | int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK; | 115 | int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK; |
115 | vec = base->tv4.vec + i; | 116 | vec = base->tv4.vec + i; |
116 | } else if ((signed long) idx < 0) { | 117 | } else if ((signed long) idx < 0) { |
117 | /* | 118 | /* |
118 | * Can happen if you add a timer with expires == jiffies, | 119 | * Can happen if you add a timer with expires == jiffies, |
119 | * or you set a timer to go off in the past | 120 | * or you set a timer to go off in the past |
120 | */ | 121 | */ |
121 | vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK); | 122 | vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK); |
122 | } else { | 123 | } else { |
123 | int i; | 124 | int i; |
124 | /* If the timeout is larger than 0xffffffff on 64-bit | 125 | /* If the timeout is larger than 0xffffffff on 64-bit |
125 | * architectures then we use the maximum timeout: | 126 | * architectures then we use the maximum timeout: |
126 | */ | 127 | */ |
127 | if (idx > 0xffffffffUL) { | 128 | if (idx > 0xffffffffUL) { |
128 | idx = 0xffffffffUL; | 129 | idx = 0xffffffffUL; |
129 | expires = idx + base->timer_jiffies; | 130 | expires = idx + base->timer_jiffies; |
130 | } | 131 | } |
131 | i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK; | 132 | i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK; |
132 | vec = base->tv5.vec + i; | 133 | vec = base->tv5.vec + i; |
133 | } | 134 | } |
134 | /* | 135 | /* |
135 | * Timers are FIFO: | 136 | * Timers are FIFO: |
136 | */ | 137 | */ |
137 | list_add_tail(&timer->entry, vec); | 138 | list_add_tail(&timer->entry, vec); |
138 | } | 139 | } |
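/*
 * A minimal user-space sketch of the bucket arithmetic used by
 * internal_add_timer() above, assuming CONFIG_BASE_SMALL=0 (TVR_BITS=8,
 * TVN_BITS=6).  It is not taken from this diff; it only shows which wheel
 * level a given expiry delta would land in.
 */
#include <stdio.h>

#define TVN_BITS 6
#define TVR_BITS 8

static const char *wheel_level(unsigned long idx)
{
	if (idx < (1UL << TVR_BITS))
		return "tv1";			/* next 256 jiffies */
	if (idx < (1UL << (TVR_BITS + TVN_BITS)))
		return "tv2";
	if (idx < (1UL << (TVR_BITS + 2 * TVN_BITS)))
		return "tv3";
	if (idx < (1UL << (TVR_BITS + 3 * TVN_BITS)))
		return "tv4";
	return "tv5";				/* everything further out */
}

int main(void)
{
	unsigned long deltas[] = { 1, 200, 5000, 2000000, 400000000 };
	int i;

	for (i = 0; i < 5; i++)
		printf("delta %9lu jiffies -> %s\n", deltas[i], wheel_level(deltas[i]));
	return 0;
}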
139 | 140 | ||
140 | typedef struct timer_base_s timer_base_t; | 141 | typedef struct timer_base_s timer_base_t; |
141 | /* | 142 | /* |
142 | * Used by TIMER_INITIALIZER, we can't use per_cpu(tvec_bases) | 143 | * Used by TIMER_INITIALIZER, we can't use per_cpu(tvec_bases) |
143 | * at compile time, and we need timer->base to lock the timer. | 144 | * at compile time, and we need timer->base to lock the timer. |
144 | */ | 145 | */ |
145 | timer_base_t __init_timer_base | 146 | timer_base_t __init_timer_base |
146 | ____cacheline_aligned_in_smp = { .lock = SPIN_LOCK_UNLOCKED }; | 147 | ____cacheline_aligned_in_smp = { .lock = SPIN_LOCK_UNLOCKED }; |
147 | EXPORT_SYMBOL(__init_timer_base); | 148 | EXPORT_SYMBOL(__init_timer_base); |
148 | 149 | ||
149 | /*** | 150 | /*** |
150 | * init_timer - initialize a timer. | 151 | * init_timer - initialize a timer. |
151 | * @timer: the timer to be initialized | 152 | * @timer: the timer to be initialized |
152 | * | 153 | * |
153 | * init_timer() must be done to a timer prior to calling *any* of the | 154 | * init_timer() must be done to a timer prior to calling *any* of the |
154 | * other timer functions. | 155 | * other timer functions. |
155 | */ | 156 | */ |
156 | void fastcall init_timer(struct timer_list *timer) | 157 | void fastcall init_timer(struct timer_list *timer) |
157 | { | 158 | { |
158 | timer->entry.next = NULL; | 159 | timer->entry.next = NULL; |
159 | timer->base = &per_cpu(tvec_bases, raw_smp_processor_id()).t_base; | 160 | timer->base = &per_cpu(tvec_bases, raw_smp_processor_id()).t_base; |
160 | } | 161 | } |
161 | EXPORT_SYMBOL(init_timer); | 162 | EXPORT_SYMBOL(init_timer); |
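/*
 * A minimal usage sketch of the API documented above, in the style of a
 * driver of this era.  struct my_dev, my_timeout() and the one-second
 * expiry are hypothetical and not taken from this diff.
 */
struct my_dev {
	struct timer_list timer;
	unsigned long ticks;
};

static void my_timeout(unsigned long data)
{
	struct my_dev *dev = (struct my_dev *)data;

	dev->ticks++;			/* softirq context: must not sleep */
}

static void my_dev_start(struct my_dev *dev)
{
	init_timer(&dev->timer);	/* must come before any other timer call */
	dev->timer.function = my_timeout;
	dev->timer.data = (unsigned long)dev;
	dev->timer.expires = jiffies + HZ;	/* fire in about one second */
	add_timer(&dev->timer);
}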
162 | 163 | ||
163 | static inline void detach_timer(struct timer_list *timer, | 164 | static inline void detach_timer(struct timer_list *timer, |
164 | int clear_pending) | 165 | int clear_pending) |
165 | { | 166 | { |
166 | struct list_head *entry = &timer->entry; | 167 | struct list_head *entry = &timer->entry; |
167 | 168 | ||
168 | __list_del(entry->prev, entry->next); | 169 | __list_del(entry->prev, entry->next); |
169 | if (clear_pending) | 170 | if (clear_pending) |
170 | entry->next = NULL; | 171 | entry->next = NULL; |
171 | entry->prev = LIST_POISON2; | 172 | entry->prev = LIST_POISON2; |
172 | } | 173 | } |
173 | 174 | ||
174 | /* | 175 | /* |
175 | * We are using hashed locking: holding per_cpu(tvec_bases).t_base.lock | 176 | * We are using hashed locking: holding per_cpu(tvec_bases).t_base.lock |
176 | * means that all timers which are tied to this base via timer->base are | 177 | * means that all timers which are tied to this base via timer->base are |
177 | * locked, and the base itself is locked too. | 178 | * locked, and the base itself is locked too. |
178 | * | 179 | * |
179 | * So __run_timers/migrate_timers can safely modify all timers which could | 180 | * So __run_timers/migrate_timers can safely modify all timers which could |
180 | * be found on ->tvX lists. | 181 | * be found on ->tvX lists. |
181 | * | 182 | * |
182 | * When the timer's base is locked, and the timer removed from list, it is | 183 | * When the timer's base is locked, and the timer removed from list, it is |
183 | * possible to set timer->base = NULL and drop the lock: the timer remains | 184 | * possible to set timer->base = NULL and drop the lock: the timer remains |
184 | * locked. | 185 | * locked. |
185 | */ | 186 | */ |
186 | static timer_base_t *lock_timer_base(struct timer_list *timer, | 187 | static timer_base_t *lock_timer_base(struct timer_list *timer, |
187 | unsigned long *flags) | 188 | unsigned long *flags) |
188 | { | 189 | { |
189 | timer_base_t *base; | 190 | timer_base_t *base; |
190 | 191 | ||
191 | for (;;) { | 192 | for (;;) { |
192 | base = timer->base; | 193 | base = timer->base; |
193 | if (likely(base != NULL)) { | 194 | if (likely(base != NULL)) { |
194 | spin_lock_irqsave(&base->lock, *flags); | 195 | spin_lock_irqsave(&base->lock, *flags); |
195 | if (likely(base == timer->base)) | 196 | if (likely(base == timer->base)) |
196 | return base; | 197 | return base; |
197 | /* The timer has migrated to another CPU */ | 198 | /* The timer has migrated to another CPU */ |
198 | spin_unlock_irqrestore(&base->lock, *flags); | 199 | spin_unlock_irqrestore(&base->lock, *flags); |
199 | } | 200 | } |
200 | cpu_relax(); | 201 | cpu_relax(); |
201 | } | 202 | } |
202 | } | 203 | } |
203 | 204 | ||
204 | int __mod_timer(struct timer_list *timer, unsigned long expires) | 205 | int __mod_timer(struct timer_list *timer, unsigned long expires) |
205 | { | 206 | { |
206 | timer_base_t *base; | 207 | timer_base_t *base; |
207 | tvec_base_t *new_base; | 208 | tvec_base_t *new_base; |
208 | unsigned long flags; | 209 | unsigned long flags; |
209 | int ret = 0; | 210 | int ret = 0; |
210 | 211 | ||
211 | BUG_ON(!timer->function); | 212 | BUG_ON(!timer->function); |
212 | 213 | ||
213 | base = lock_timer_base(timer, &flags); | 214 | base = lock_timer_base(timer, &flags); |
214 | 215 | ||
215 | if (timer_pending(timer)) { | 216 | if (timer_pending(timer)) { |
216 | detach_timer(timer, 0); | 217 | detach_timer(timer, 0); |
217 | ret = 1; | 218 | ret = 1; |
218 | } | 219 | } |
219 | 220 | ||
220 | new_base = &__get_cpu_var(tvec_bases); | 221 | new_base = &__get_cpu_var(tvec_bases); |
221 | 222 | ||
222 | if (base != &new_base->t_base) { | 223 | if (base != &new_base->t_base) { |
223 | /* | 224 | /* |
224 | * We are trying to schedule the timer on the local CPU. | 225 | * We are trying to schedule the timer on the local CPU. |
225 | * However we can't change timer's base while it is running, | 226 | * However we can't change timer's base while it is running, |
226 | * otherwise del_timer_sync() can't detect that the timer's | 227 | * otherwise del_timer_sync() can't detect that the timer's |
227 | * handler has not yet finished. This also guarantees that | 228 | * handler has not yet finished. This also guarantees that |
228 | * the timer is serialized wrt itself. | 229 | * the timer is serialized wrt itself. |
229 | */ | 230 | */ |
230 | if (unlikely(base->running_timer == timer)) { | 231 | if (unlikely(base->running_timer == timer)) { |
231 | /* The timer remains on a former base */ | 232 | /* The timer remains on a former base */ |
232 | new_base = container_of(base, tvec_base_t, t_base); | 233 | new_base = container_of(base, tvec_base_t, t_base); |
233 | } else { | 234 | } else { |
234 | /* See the comment in lock_timer_base() */ | 235 | /* See the comment in lock_timer_base() */ |
235 | timer->base = NULL; | 236 | timer->base = NULL; |
236 | spin_unlock(&base->lock); | 237 | spin_unlock(&base->lock); |
237 | spin_lock(&new_base->t_base.lock); | 238 | spin_lock(&new_base->t_base.lock); |
238 | timer->base = &new_base->t_base; | 239 | timer->base = &new_base->t_base; |
239 | } | 240 | } |
240 | } | 241 | } |
241 | 242 | ||
242 | timer->expires = expires; | 243 | timer->expires = expires; |
243 | internal_add_timer(new_base, timer); | 244 | internal_add_timer(new_base, timer); |
244 | spin_unlock_irqrestore(&new_base->t_base.lock, flags); | 245 | spin_unlock_irqrestore(&new_base->t_base.lock, flags); |
245 | 246 | ||
246 | return ret; | 247 | return ret; |
247 | } | 248 | } |
248 | 249 | ||
249 | EXPORT_SYMBOL(__mod_timer); | 250 | EXPORT_SYMBOL(__mod_timer); |
250 | 251 | ||
251 | /*** | 252 | /*** |
252 | * add_timer_on - start a timer on a particular CPU | 253 | * add_timer_on - start a timer on a particular CPU |
253 | * @timer: the timer to be added | 254 | * @timer: the timer to be added |
254 | * @cpu: the CPU to start it on | 255 | * @cpu: the CPU to start it on |
255 | * | 256 | * |
256 | * This is not very scalable on SMP. Double adds are not possible. | 257 | * This is not very scalable on SMP. Double adds are not possible. |
257 | */ | 258 | */ |
258 | void add_timer_on(struct timer_list *timer, int cpu) | 259 | void add_timer_on(struct timer_list *timer, int cpu) |
259 | { | 260 | { |
260 | tvec_base_t *base = &per_cpu(tvec_bases, cpu); | 261 | tvec_base_t *base = &per_cpu(tvec_bases, cpu); |
261 | unsigned long flags; | 262 | unsigned long flags; |
262 | 263 | ||
263 | BUG_ON(timer_pending(timer) || !timer->function); | 264 | BUG_ON(timer_pending(timer) || !timer->function); |
264 | spin_lock_irqsave(&base->t_base.lock, flags); | 265 | spin_lock_irqsave(&base->t_base.lock, flags); |
265 | timer->base = &base->t_base; | 266 | timer->base = &base->t_base; |
266 | internal_add_timer(base, timer); | 267 | internal_add_timer(base, timer); |
267 | spin_unlock_irqrestore(&base->t_base.lock, flags); | 268 | spin_unlock_irqrestore(&base->t_base.lock, flags); |
268 | } | 269 | } |
269 | 270 | ||
270 | 271 | ||
271 | /*** | 272 | /*** |
272 | * mod_timer - modify a timer's timeout | 273 | * mod_timer - modify a timer's timeout |
273 | * @timer: the timer to be modified | 274 | * @timer: the timer to be modified |
274 | * | 275 | * |
275 | * mod_timer is a more efficient way to update the expires field of an | 276 | * mod_timer is a more efficient way to update the expires field of an |
276 | * active timer (if the timer is inactive it will be activated). | 277 | * active timer (if the timer is inactive it will be activated). |
277 | * | 278 | * |
278 | * mod_timer(timer, expires) is equivalent to: | 279 | * mod_timer(timer, expires) is equivalent to: |
279 | * | 280 | * |
280 | * del_timer(timer); timer->expires = expires; add_timer(timer); | 281 | * del_timer(timer); timer->expires = expires; add_timer(timer); |
281 | * | 282 | * |
282 | * Note that if there are multiple unserialized concurrent users of the | 283 | * Note that if there are multiple unserialized concurrent users of the |
283 | * same timer, then mod_timer() is the only safe way to modify the timeout, | 284 | * same timer, then mod_timer() is the only safe way to modify the timeout, |
284 | * since add_timer() cannot modify an already running timer. | 285 | * since add_timer() cannot modify an already running timer. |
285 | * | 286 | * |
286 | * The function returns whether it has modified a pending timer or not. | 287 | * The function returns whether it has modified a pending timer or not. |
287 | * (ie. mod_timer() of an inactive timer returns 0, mod_timer() of an | 288 | * (ie. mod_timer() of an inactive timer returns 0, mod_timer() of an |
288 | * active timer returns 1.) | 289 | * active timer returns 1.) |
289 | */ | 290 | */ |
290 | int mod_timer(struct timer_list *timer, unsigned long expires) | 291 | int mod_timer(struct timer_list *timer, unsigned long expires) |
291 | { | 292 | { |
292 | BUG_ON(!timer->function); | 293 | BUG_ON(!timer->function); |
293 | 294 | ||
294 | /* | 295 | /* |
295 | * This is a common optimization triggered by the | 296 | * This is a common optimization triggered by the |
296 | * networking code - if the timer is re-modified | 297 | * networking code - if the timer is re-modified |
297 | * to be the same thing then just return: | 298 | * to be the same thing then just return: |
298 | */ | 299 | */ |
299 | if (timer->expires == expires && timer_pending(timer)) | 300 | if (timer->expires == expires && timer_pending(timer)) |
300 | return 1; | 301 | return 1; |
301 | 302 | ||
302 | return __mod_timer(timer, expires); | 303 | return __mod_timer(timer, expires); |
303 | } | 304 | } |
304 | 305 | ||
305 | EXPORT_SYMBOL(mod_timer); | 306 | EXPORT_SYMBOL(mod_timer); |
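/*
 * A sketch of the common "re-arm on activity" pattern that the comment above
 * describes; wd_timer and WD_TIMEOUT are hypothetical names, not from this
 * diff.  mod_timer() is safe whether or not the timer is currently pending.
 */
#define WD_TIMEOUT	(5 * HZ)

static struct timer_list wd_timer;

static void wd_kick(void)
{
	/* equivalent to del_timer(); expires = ...; add_timer(), but atomic */
	mod_timer(&wd_timer, jiffies + WD_TIMEOUT);
}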
306 | 307 | ||
307 | /*** | 308 | /*** |
308 | * del_timer - deactivate a timer. | 309 | * del_timer - deactivate a timer. |
309 | * @timer: the timer to be deactivated | 310 | * @timer: the timer to be deactivated |
310 | * | 311 | * |
311 | * del_timer() deactivates a timer - this works on both active and inactive | 312 | * del_timer() deactivates a timer - this works on both active and inactive |
312 | * timers. | 313 | * timers. |
313 | * | 314 | * |
314 | * The function returns whether it has deactivated a pending timer or not. | 315 | * The function returns whether it has deactivated a pending timer or not. |
315 | * (ie. del_timer() of an inactive timer returns 0, del_timer() of an | 316 | * (ie. del_timer() of an inactive timer returns 0, del_timer() of an |
316 | * active timer returns 1.) | 317 | * active timer returns 1.) |
317 | */ | 318 | */ |
318 | int del_timer(struct timer_list *timer) | 319 | int del_timer(struct timer_list *timer) |
319 | { | 320 | { |
320 | timer_base_t *base; | 321 | timer_base_t *base; |
321 | unsigned long flags; | 322 | unsigned long flags; |
322 | int ret = 0; | 323 | int ret = 0; |
323 | 324 | ||
324 | if (timer_pending(timer)) { | 325 | if (timer_pending(timer)) { |
325 | base = lock_timer_base(timer, &flags); | 326 | base = lock_timer_base(timer, &flags); |
326 | if (timer_pending(timer)) { | 327 | if (timer_pending(timer)) { |
327 | detach_timer(timer, 1); | 328 | detach_timer(timer, 1); |
328 | ret = 1; | 329 | ret = 1; |
329 | } | 330 | } |
330 | spin_unlock_irqrestore(&base->lock, flags); | 331 | spin_unlock_irqrestore(&base->lock, flags); |
331 | } | 332 | } |
332 | 333 | ||
333 | return ret; | 334 | return ret; |
334 | } | 335 | } |
335 | 336 | ||
336 | EXPORT_SYMBOL(del_timer); | 337 | EXPORT_SYMBOL(del_timer); |
337 | 338 | ||
338 | #ifdef CONFIG_SMP | 339 | #ifdef CONFIG_SMP |
339 | /* | 340 | /* |
340 | * This function tries to deactivate a timer. Upon successful (ret >= 0) | 341 | * This function tries to deactivate a timer. Upon successful (ret >= 0) |
341 | * exit the timer is not queued and the handler is not running on any CPU. | 342 | * exit the timer is not queued and the handler is not running on any CPU. |
342 | * | 343 | * |
343 | * It must not be called from interrupt contexts. | 344 | * It must not be called from interrupt contexts. |
344 | */ | 345 | */ |
345 | int try_to_del_timer_sync(struct timer_list *timer) | 346 | int try_to_del_timer_sync(struct timer_list *timer) |
346 | { | 347 | { |
347 | timer_base_t *base; | 348 | timer_base_t *base; |
348 | unsigned long flags; | 349 | unsigned long flags; |
349 | int ret = -1; | 350 | int ret = -1; |
350 | 351 | ||
351 | base = lock_timer_base(timer, &flags); | 352 | base = lock_timer_base(timer, &flags); |
352 | 353 | ||
353 | if (base->running_timer == timer) | 354 | if (base->running_timer == timer) |
354 | goto out; | 355 | goto out; |
355 | 356 | ||
356 | ret = 0; | 357 | ret = 0; |
357 | if (timer_pending(timer)) { | 358 | if (timer_pending(timer)) { |
358 | detach_timer(timer, 1); | 359 | detach_timer(timer, 1); |
359 | ret = 1; | 360 | ret = 1; |
360 | } | 361 | } |
361 | out: | 362 | out: |
362 | spin_unlock_irqrestore(&base->lock, flags); | 363 | spin_unlock_irqrestore(&base->lock, flags); |
363 | 364 | ||
364 | return ret; | 365 | return ret; |
365 | } | 366 | } |
366 | 367 | ||
367 | /*** | 368 | /*** |
368 | * del_timer_sync - deactivate a timer and wait for the handler to finish. | 369 | * del_timer_sync - deactivate a timer and wait for the handler to finish. |
369 | * @timer: the timer to be deactivated | 370 | * @timer: the timer to be deactivated |
370 | * | 371 | * |
371 | * This function only differs from del_timer() on SMP: besides deactivating | 372 | * This function only differs from del_timer() on SMP: besides deactivating |
372 | * the timer it also makes sure the handler has finished executing on other | 373 | * the timer it also makes sure the handler has finished executing on other |
373 | * CPUs. | 374 | * CPUs. |
374 | * | 375 | * |
375 | * Synchronization rules: callers must prevent restarting of the timer, | 376 | * Synchronization rules: callers must prevent restarting of the timer, |
376 | * otherwise this function is meaningless. It must not be called from | 377 | * otherwise this function is meaningless. It must not be called from |
377 | * interrupt contexts. The caller must not hold locks which would prevent | 378 | * interrupt contexts. The caller must not hold locks which would prevent |
378 | * completion of the timer's handler. The timer's handler must not call | 379 | * completion of the timer's handler. The timer's handler must not call |
379 | * add_timer_on(). Upon exit the timer is not queued and the handler is | 380 | * add_timer_on(). Upon exit the timer is not queued and the handler is |
380 | * not running on any CPU. | 381 | * not running on any CPU. |
381 | * | 382 | * |
382 | * The function returns whether it has deactivated a pending timer or not. | 383 | * The function returns whether it has deactivated a pending timer or not. |
383 | */ | 384 | */ |
384 | int del_timer_sync(struct timer_list *timer) | 385 | int del_timer_sync(struct timer_list *timer) |
385 | { | 386 | { |
386 | for (;;) { | 387 | for (;;) { |
387 | int ret = try_to_del_timer_sync(timer); | 388 | int ret = try_to_del_timer_sync(timer); |
388 | if (ret >= 0) | 389 | if (ret >= 0) |
389 | return ret; | 390 | return ret; |
390 | } | 391 | } |
391 | } | 392 | } |
392 | 393 | ||
393 | EXPORT_SYMBOL(del_timer_sync); | 394 | EXPORT_SYMBOL(del_timer_sync); |
394 | #endif | 395 | #endif |
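/*
 * A tear-down sketch following the del_timer_sync() rules above: stop
 * re-arming first, then wait for any running handler, and only then free
 * what the handler touches.  The shutting_down flag and buf member extend
 * the hypothetical my_dev from the earlier sketch.
 */
static void my_dev_stop(struct my_dev *dev)
{
	dev->shutting_down = 1;		/* handler sees this and stops re-arming */
	del_timer_sync(&dev->timer);	/* waits out a handler running elsewhere */
	kfree(dev->buf);		/* safe now: the handler can no longer run */
}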
395 | 396 | ||
396 | static int cascade(tvec_base_t *base, tvec_t *tv, int index) | 397 | static int cascade(tvec_base_t *base, tvec_t *tv, int index) |
397 | { | 398 | { |
398 | /* cascade all the timers from tv up one level */ | 399 | /* cascade all the timers from tv up one level */ |
399 | struct list_head *head, *curr; | 400 | struct list_head *head, *curr; |
400 | 401 | ||
401 | head = tv->vec + index; | 402 | head = tv->vec + index; |
402 | curr = head->next; | 403 | curr = head->next; |
403 | /* | 404 | /* |
404 | * We are removing _all_ timers from the list, so we don't have to | 405 | * We are removing _all_ timers from the list, so we don't have to |
405 | * detach them individually, just clear the list afterwards. | 406 | * detach them individually, just clear the list afterwards. |
406 | */ | 407 | */ |
407 | while (curr != head) { | 408 | while (curr != head) { |
408 | struct timer_list *tmp; | 409 | struct timer_list *tmp; |
409 | 410 | ||
410 | tmp = list_entry(curr, struct timer_list, entry); | 411 | tmp = list_entry(curr, struct timer_list, entry); |
411 | BUG_ON(tmp->base != &base->t_base); | 412 | BUG_ON(tmp->base != &base->t_base); |
412 | curr = curr->next; | 413 | curr = curr->next; |
413 | internal_add_timer(base, tmp); | 414 | internal_add_timer(base, tmp); |
414 | } | 415 | } |
415 | INIT_LIST_HEAD(head); | 416 | INIT_LIST_HEAD(head); |
416 | 417 | ||
417 | return index; | 418 | return index; |
418 | } | 419 | } |
419 | 420 | ||
420 | /*** | 421 | /*** |
421 | * __run_timers - run all expired timers (if any) on this CPU. | 422 | * __run_timers - run all expired timers (if any) on this CPU. |
422 | * @base: the timer vector to be processed. | 423 | * @base: the timer vector to be processed. |
423 | * | 424 | * |
424 | * This function cascades all vectors and runs all the timers that | 425 | * This function cascades all vectors and runs all the timers that |
425 | * have expired. | 426 | * have expired. |
426 | */ | 427 | */ |
427 | #define INDEX(N) (base->timer_jiffies >> (TVR_BITS + N * TVN_BITS)) & TVN_MASK | 428 | #define INDEX(N) (base->timer_jiffies >> (TVR_BITS + N * TVN_BITS)) & TVN_MASK |
428 | 429 | ||
429 | static inline void __run_timers(tvec_base_t *base) | 430 | static inline void __run_timers(tvec_base_t *base) |
430 | { | 431 | { |
431 | struct timer_list *timer; | 432 | struct timer_list *timer; |
432 | 433 | ||
433 | spin_lock_irq(&base->t_base.lock); | 434 | spin_lock_irq(&base->t_base.lock); |
434 | while (time_after_eq(jiffies, base->timer_jiffies)) { | 435 | while (time_after_eq(jiffies, base->timer_jiffies)) { |
435 | struct list_head work_list = LIST_HEAD_INIT(work_list); | 436 | struct list_head work_list = LIST_HEAD_INIT(work_list); |
436 | struct list_head *head = &work_list; | 437 | struct list_head *head = &work_list; |
437 | int index = base->timer_jiffies & TVR_MASK; | 438 | int index = base->timer_jiffies & TVR_MASK; |
438 | 439 | ||
439 | /* | 440 | /* |
440 | * Cascade timers: | 441 | * Cascade timers: |
441 | */ | 442 | */ |
442 | if (!index && | 443 | if (!index && |
443 | (!cascade(base, &base->tv2, INDEX(0))) && | 444 | (!cascade(base, &base->tv2, INDEX(0))) && |
444 | (!cascade(base, &base->tv3, INDEX(1))) && | 445 | (!cascade(base, &base->tv3, INDEX(1))) && |
445 | !cascade(base, &base->tv4, INDEX(2))) | 446 | !cascade(base, &base->tv4, INDEX(2))) |
446 | cascade(base, &base->tv5, INDEX(3)); | 447 | cascade(base, &base->tv5, INDEX(3)); |
447 | ++base->timer_jiffies; | 448 | ++base->timer_jiffies; |
448 | list_splice_init(base->tv1.vec + index, &work_list); | 449 | list_splice_init(base->tv1.vec + index, &work_list); |
449 | while (!list_empty(head)) { | 450 | while (!list_empty(head)) { |
450 | void (*fn)(unsigned long); | 451 | void (*fn)(unsigned long); |
451 | unsigned long data; | 452 | unsigned long data; |
452 | 453 | ||
453 | timer = list_entry(head->next,struct timer_list,entry); | 454 | timer = list_entry(head->next,struct timer_list,entry); |
454 | fn = timer->function; | 455 | fn = timer->function; |
455 | data = timer->data; | 456 | data = timer->data; |
456 | 457 | ||
457 | set_running_timer(base, timer); | 458 | set_running_timer(base, timer); |
458 | detach_timer(timer, 1); | 459 | detach_timer(timer, 1); |
459 | spin_unlock_irq(&base->t_base.lock); | 460 | spin_unlock_irq(&base->t_base.lock); |
460 | { | 461 | { |
461 | int preempt_count = preempt_count(); | 462 | int preempt_count = preempt_count(); |
462 | fn(data); | 463 | fn(data); |
463 | if (preempt_count != preempt_count()) { | 464 | if (preempt_count != preempt_count()) { |
464 | printk(KERN_WARNING "huh, entered %p " | 465 | printk(KERN_WARNING "huh, entered %p " |
465 | "with preempt_count %08x, exited" | 466 | "with preempt_count %08x, exited" |
466 | " with %08x?\n", | 467 | " with %08x?\n", |
467 | fn, preempt_count, | 468 | fn, preempt_count, |
468 | preempt_count()); | 469 | preempt_count()); |
469 | BUG(); | 470 | BUG(); |
470 | } | 471 | } |
471 | } | 472 | } |
472 | spin_lock_irq(&base->t_base.lock); | 473 | spin_lock_irq(&base->t_base.lock); |
473 | } | 474 | } |
474 | } | 475 | } |
475 | set_running_timer(base, NULL); | 476 | set_running_timer(base, NULL); |
476 | spin_unlock_irq(&base->t_base.lock); | 477 | spin_unlock_irq(&base->t_base.lock); |
477 | } | 478 | } |
478 | 479 | ||
479 | #ifdef CONFIG_NO_IDLE_HZ | 480 | #ifdef CONFIG_NO_IDLE_HZ |
480 | /* | 481 | /* |
481 | * Find out when the next timer event is due to happen. This | 482 | * Find out when the next timer event is due to happen. This |
482 | * is used on S/390 to stop all activity when a CPU is idle. | 483 | * is used on S/390 to stop all activity when a CPU is idle. |
483 | * This function needs to be called with interrupts disabled. | 484 | * This function needs to be called with interrupts disabled. |
484 | */ | 485 | */ |
485 | unsigned long next_timer_interrupt(void) | 486 | unsigned long next_timer_interrupt(void) |
486 | { | 487 | { |
487 | tvec_base_t *base; | 488 | tvec_base_t *base; |
488 | struct list_head *list; | 489 | struct list_head *list; |
489 | struct timer_list *nte; | 490 | struct timer_list *nte; |
490 | unsigned long expires; | 491 | unsigned long expires; |
491 | tvec_t *varray[4]; | 492 | tvec_t *varray[4]; |
492 | int i, j; | 493 | int i, j; |
493 | 494 | ||
494 | base = &__get_cpu_var(tvec_bases); | 495 | base = &__get_cpu_var(tvec_bases); |
495 | spin_lock(&base->t_base.lock); | 496 | spin_lock(&base->t_base.lock); |
496 | expires = base->timer_jiffies + (LONG_MAX >> 1); | 497 | expires = base->timer_jiffies + (LONG_MAX >> 1); |
497 | list = 0; | 498 | list = 0; |
498 | 499 | ||
499 | /* Look for timer events in tv1. */ | 500 | /* Look for timer events in tv1. */ |
500 | j = base->timer_jiffies & TVR_MASK; | 501 | j = base->timer_jiffies & TVR_MASK; |
501 | do { | 502 | do { |
502 | list_for_each_entry(nte, base->tv1.vec + j, entry) { | 503 | list_for_each_entry(nte, base->tv1.vec + j, entry) { |
503 | expires = nte->expires; | 504 | expires = nte->expires; |
504 | if (j < (base->timer_jiffies & TVR_MASK)) | 505 | if (j < (base->timer_jiffies & TVR_MASK)) |
505 | list = base->tv2.vec + (INDEX(0)); | 506 | list = base->tv2.vec + (INDEX(0)); |
506 | goto found; | 507 | goto found; |
507 | } | 508 | } |
508 | j = (j + 1) & TVR_MASK; | 509 | j = (j + 1) & TVR_MASK; |
509 | } while (j != (base->timer_jiffies & TVR_MASK)); | 510 | } while (j != (base->timer_jiffies & TVR_MASK)); |
510 | 511 | ||
511 | /* Check tv2-tv5. */ | 512 | /* Check tv2-tv5. */ |
512 | varray[0] = &base->tv2; | 513 | varray[0] = &base->tv2; |
513 | varray[1] = &base->tv3; | 514 | varray[1] = &base->tv3; |
514 | varray[2] = &base->tv4; | 515 | varray[2] = &base->tv4; |
515 | varray[3] = &base->tv5; | 516 | varray[3] = &base->tv5; |
516 | for (i = 0; i < 4; i++) { | 517 | for (i = 0; i < 4; i++) { |
517 | j = INDEX(i); | 518 | j = INDEX(i); |
518 | do { | 519 | do { |
519 | if (list_empty(varray[i]->vec + j)) { | 520 | if (list_empty(varray[i]->vec + j)) { |
520 | j = (j + 1) & TVN_MASK; | 521 | j = (j + 1) & TVN_MASK; |
521 | continue; | 522 | continue; |
522 | } | 523 | } |
523 | list_for_each_entry(nte, varray[i]->vec + j, entry) | 524 | list_for_each_entry(nte, varray[i]->vec + j, entry) |
524 | if (time_before(nte->expires, expires)) | 525 | if (time_before(nte->expires, expires)) |
525 | expires = nte->expires; | 526 | expires = nte->expires; |
526 | if (j < (INDEX(i)) && i < 3) | 527 | if (j < (INDEX(i)) && i < 3) |
527 | list = varray[i + 1]->vec + (INDEX(i + 1)); | 528 | list = varray[i + 1]->vec + (INDEX(i + 1)); |
528 | goto found; | 529 | goto found; |
529 | } while (j != (INDEX(i))); | 530 | } while (j != (INDEX(i))); |
530 | } | 531 | } |
531 | found: | 532 | found: |
532 | if (list) { | 533 | if (list) { |
533 | /* | 534 | /* |
534 | * The search wrapped. We need to look at the next list | 535 | * The search wrapped. We need to look at the next list |
535 | * from next tv element that would cascade into tv element | 536 | * from next tv element that would cascade into tv element |
536 | * where we found the timer element. | 537 | * where we found the timer element. |
537 | */ | 538 | */ |
538 | list_for_each_entry(nte, list, entry) { | 539 | list_for_each_entry(nte, list, entry) { |
539 | if (time_before(nte->expires, expires)) | 540 | if (time_before(nte->expires, expires)) |
540 | expires = nte->expires; | 541 | expires = nte->expires; |
541 | } | 542 | } |
542 | } | 543 | } |
543 | spin_unlock(&base->t_base.lock); | 544 | spin_unlock(&base->t_base.lock); |
544 | return expires; | 545 | return expires; |
545 | } | 546 | } |
546 | #endif | 547 | #endif |
547 | 548 | ||
548 | /******************************************************************/ | 549 | /******************************************************************/ |
549 | 550 | ||
550 | /* | 551 | /* |
551 | * Timekeeping variables | 552 | * Timekeeping variables |
552 | */ | 553 | */ |
553 | unsigned long tick_usec = TICK_USEC; /* USER_HZ period (usec) */ | 554 | unsigned long tick_usec = TICK_USEC; /* USER_HZ period (usec) */ |
554 | unsigned long tick_nsec = TICK_NSEC; /* ACTHZ period (nsec) */ | 555 | unsigned long tick_nsec = TICK_NSEC; /* ACTHZ period (nsec) */ |
555 | 556 | ||
556 | /* | 557 | /* |
557 | * The current time | 558 | * The current time |
558 | * wall_to_monotonic is what we need to add to xtime (or xtime corrected | 559 | * wall_to_monotonic is what we need to add to xtime (or xtime corrected |
559 | * for sub jiffie times) to get to monotonic time. Monotonic is pegged | 560 | * for sub jiffie times) to get to monotonic time. Monotonic is pegged |
560 | * at zero at system boot time, so wall_to_monotonic will be negative, | 561 | * at zero at system boot time, so wall_to_monotonic will be negative, |
561 | * however, we will ALWAYS keep the tv_nsec part positive so we can use | 562 | * however, we will ALWAYS keep the tv_nsec part positive so we can use |
562 | * the usual normalization. | 563 | * the usual normalization. |
563 | */ | 564 | */ |
564 | struct timespec xtime __attribute__ ((aligned (16))); | 565 | struct timespec xtime __attribute__ ((aligned (16))); |
565 | struct timespec wall_to_monotonic __attribute__ ((aligned (16))); | 566 | struct timespec wall_to_monotonic __attribute__ ((aligned (16))); |
566 | 567 | ||
567 | EXPORT_SYMBOL(xtime); | 568 | EXPORT_SYMBOL(xtime); |
568 | 569 | ||
569 | /* Don't completely fail for HZ > 500. */ | 570 | /* Don't completely fail for HZ > 500. */ |
570 | int tickadj = 500/HZ ? : 1; /* microsecs */ | 571 | int tickadj = 500/HZ ? : 1; /* microsecs */ |
571 | 572 | ||
572 | 573 | ||
573 | /* | 574 | /* |
574 | * phase-lock loop variables | 575 | * phase-lock loop variables |
575 | */ | 576 | */ |
576 | /* TIME_ERROR prevents overwriting the CMOS clock */ | 577 | /* TIME_ERROR prevents overwriting the CMOS clock */ |
577 | int time_state = TIME_OK; /* clock synchronization status */ | 578 | int time_state = TIME_OK; /* clock synchronization status */ |
578 | int time_status = STA_UNSYNC; /* clock status bits */ | 579 | int time_status = STA_UNSYNC; /* clock status bits */ |
579 | long time_offset; /* time adjustment (us) */ | 580 | long time_offset; /* time adjustment (us) */ |
580 | long time_constant = 2; /* pll time constant */ | 581 | long time_constant = 2; /* pll time constant */ |
581 | long time_tolerance = MAXFREQ; /* frequency tolerance (ppm) */ | 582 | long time_tolerance = MAXFREQ; /* frequency tolerance (ppm) */ |
582 | long time_precision = 1; /* clock precision (us) */ | 583 | long time_precision = 1; /* clock precision (us) */ |
583 | long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */ | 584 | long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */ |
584 | long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ | 585 | long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ |
585 | static long time_phase; /* phase offset (scaled us) */ | 586 | static long time_phase; /* phase offset (scaled us) */ |
586 | long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC; | 587 | long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC; |
587 | /* frequency offset (scaled ppm)*/ | 588 | /* frequency offset (scaled ppm)*/ |
588 | static long time_adj; /* tick adjust (scaled 1 / HZ) */ | 589 | static long time_adj; /* tick adjust (scaled 1 / HZ) */ |
589 | long time_reftime; /* time at last adjustment (s) */ | 590 | long time_reftime; /* time at last adjustment (s) */ |
590 | long time_adjust; | 591 | long time_adjust; |
591 | long time_next_adjust; | 592 | long time_next_adjust; |
592 | 593 | ||
593 | /* | 594 | /* |
594 | * this routine handles the overflow of the microsecond field | 595 | * this routine handles the overflow of the microsecond field |
595 | * | 596 | * |
596 | * The tricky bits of code to handle the accurate clock support | 597 | * The tricky bits of code to handle the accurate clock support |
597 | * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame. | 598 | * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame. |
598 | * They were originally developed for SUN and DEC kernels. | 599 | * They were originally developed for SUN and DEC kernels. |
599 | * All the kudos should go to Dave for this stuff. | 600 | * All the kudos should go to Dave for this stuff. |
600 | * | 601 | * |
601 | */ | 602 | */ |
602 | static void second_overflow(void) | 603 | static void second_overflow(void) |
603 | { | 604 | { |
604 | long ltemp; | 605 | long ltemp; |
605 | 606 | ||
606 | /* Bump the maxerror field */ | 607 | /* Bump the maxerror field */ |
607 | time_maxerror += time_tolerance >> SHIFT_USEC; | 608 | time_maxerror += time_tolerance >> SHIFT_USEC; |
608 | if (time_maxerror > NTP_PHASE_LIMIT) { | 609 | if (time_maxerror > NTP_PHASE_LIMIT) { |
609 | time_maxerror = NTP_PHASE_LIMIT; | 610 | time_maxerror = NTP_PHASE_LIMIT; |
610 | time_status |= STA_UNSYNC; | 611 | time_status |= STA_UNSYNC; |
611 | } | 612 | } |
612 | 613 | ||
613 | /* | 614 | /* |
614 | * Leap second processing. If in leap-insert state at the end of the | 615 | * Leap second processing. If in leap-insert state at the end of the |
615 | * day, the system clock is set back one second; if in leap-delete | 616 | * day, the system clock is set back one second; if in leap-delete |
616 | * state, the system clock is set ahead one second. The microtime() | 617 | * state, the system clock is set ahead one second. The microtime() |
617 | * routine or external clock driver will ensure that reported time is | 618 | * routine or external clock driver will ensure that reported time is |
618 | * always monotonic. The ugly divides should be replaced. | 619 | * always monotonic. The ugly divides should be replaced. |
619 | */ | 620 | */ |
620 | switch (time_state) { | 621 | switch (time_state) { |
621 | case TIME_OK: | 622 | case TIME_OK: |
622 | if (time_status & STA_INS) | 623 | if (time_status & STA_INS) |
623 | time_state = TIME_INS; | 624 | time_state = TIME_INS; |
624 | else if (time_status & STA_DEL) | 625 | else if (time_status & STA_DEL) |
625 | time_state = TIME_DEL; | 626 | time_state = TIME_DEL; |
626 | break; | 627 | break; |
627 | case TIME_INS: | 628 | case TIME_INS: |
628 | if (xtime.tv_sec % 86400 == 0) { | 629 | if (xtime.tv_sec % 86400 == 0) { |
629 | xtime.tv_sec--; | 630 | xtime.tv_sec--; |
630 | wall_to_monotonic.tv_sec++; | 631 | wall_to_monotonic.tv_sec++; |
631 | /* | 632 | /* |
632 | * The timer interpolator will make time change | 633 | * The timer interpolator will make time change |
633 | * gradually instead of an immediate jump by one second | 634 | * gradually instead of an immediate jump by one second |
634 | */ | 635 | */ |
635 | time_interpolator_update(-NSEC_PER_SEC); | 636 | time_interpolator_update(-NSEC_PER_SEC); |
636 | time_state = TIME_OOP; | 637 | time_state = TIME_OOP; |
637 | clock_was_set(); | 638 | clock_was_set(); |
638 | printk(KERN_NOTICE "Clock: inserting leap second " | 639 | printk(KERN_NOTICE "Clock: inserting leap second " |
639 | "23:59:60 UTC\n"); | 640 | "23:59:60 UTC\n"); |
640 | } | 641 | } |
641 | break; | 642 | break; |
642 | case TIME_DEL: | 643 | case TIME_DEL: |
643 | if ((xtime.tv_sec + 1) % 86400 == 0) { | 644 | if ((xtime.tv_sec + 1) % 86400 == 0) { |
644 | xtime.tv_sec++; | 645 | xtime.tv_sec++; |
645 | wall_to_monotonic.tv_sec--; | 646 | wall_to_monotonic.tv_sec--; |
646 | /* | 647 | /* |
647 | * Use of time interpolator for a gradual change of | 648 | * Use of time interpolator for a gradual change of |
648 | * time | 649 | * time |
649 | */ | 650 | */ |
650 | time_interpolator_update(NSEC_PER_SEC); | 651 | time_interpolator_update(NSEC_PER_SEC); |
651 | time_state = TIME_WAIT; | 652 | time_state = TIME_WAIT; |
652 | clock_was_set(); | 653 | clock_was_set(); |
653 | printk(KERN_NOTICE "Clock: deleting leap second " | 654 | printk(KERN_NOTICE "Clock: deleting leap second " |
654 | "23:59:59 UTC\n"); | 655 | "23:59:59 UTC\n"); |
655 | } | 656 | } |
656 | break; | 657 | break; |
657 | case TIME_OOP: | 658 | case TIME_OOP: |
658 | time_state = TIME_WAIT; | 659 | time_state = TIME_WAIT; |
659 | break; | 660 | break; |
660 | case TIME_WAIT: | 661 | case TIME_WAIT: |
661 | if (!(time_status & (STA_INS | STA_DEL))) | 662 | if (!(time_status & (STA_INS | STA_DEL))) |
662 | time_state = TIME_OK; | 663 | time_state = TIME_OK; |
663 | } | 664 | } |
664 | 665 | ||
665 | /* | 666 | /* |
666 | * Compute the phase adjustment for the next second. In PLL mode, the | 667 | * Compute the phase adjustment for the next second. In PLL mode, the |
667 | * offset is reduced by a fixed factor times the time constant. In FLL | 668 | * offset is reduced by a fixed factor times the time constant. In FLL |
668 | * mode the offset is used directly. In either mode, the maximum phase | 669 | * mode the offset is used directly. In either mode, the maximum phase |
669 | * adjustment for each second is clamped so as to spread the adjustment | 670 | * adjustment for each second is clamped so as to spread the adjustment |
670 | * over not more than the number of seconds between updates. | 671 | * over not more than the number of seconds between updates. |
671 | */ | 672 | */ |
672 | ltemp = time_offset; | 673 | ltemp = time_offset; |
673 | if (!(time_status & STA_FLL)) | 674 | if (!(time_status & STA_FLL)) |
674 | ltemp = shift_right(ltemp, SHIFT_KG + time_constant); | 675 | ltemp = shift_right(ltemp, SHIFT_KG + time_constant); |
675 | ltemp = min(ltemp, (MAXPHASE / MINSEC) << SHIFT_UPDATE); | 676 | ltemp = min(ltemp, (MAXPHASE / MINSEC) << SHIFT_UPDATE); |
676 | ltemp = max(ltemp, -(MAXPHASE / MINSEC) << SHIFT_UPDATE); | 677 | ltemp = max(ltemp, -(MAXPHASE / MINSEC) << SHIFT_UPDATE); |
677 | time_offset -= ltemp; | 678 | time_offset -= ltemp; |
678 | time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); | 679 | time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); |
679 | 680 | ||
680 | /* | 681 | /* |
681 | * Compute the frequency estimate and additional phase adjustment due | 682 | * Compute the frequency estimate and additional phase adjustment due |
682 | * to frequency error for the next second. When the PPS signal is | 683 | * to frequency error for the next second. When the PPS signal is |
683 | * engaged, gnaw on the watchdog counter and update the frequency | 684 | * engaged, gnaw on the watchdog counter and update the frequency |
684 | * computed by the pll and the PPS signal. | 685 | * computed by the pll and the PPS signal. |
685 | */ | 686 | */ |
686 | pps_valid++; | 687 | pps_valid++; |
687 | if (pps_valid == PPS_VALID) { /* PPS signal lost */ | 688 | if (pps_valid == PPS_VALID) { /* PPS signal lost */ |
688 | pps_jitter = MAXTIME; | 689 | pps_jitter = MAXTIME; |
689 | pps_stabil = MAXFREQ; | 690 | pps_stabil = MAXFREQ; |
690 | time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | | 691 | time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | |
691 | STA_PPSWANDER | STA_PPSERROR); | 692 | STA_PPSWANDER | STA_PPSERROR); |
692 | } | 693 | } |
693 | ltemp = time_freq + pps_freq; | 694 | ltemp = time_freq + pps_freq; |
694 | time_adj += shift_right(ltemp,(SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE)); | 695 | time_adj += shift_right(ltemp,(SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE)); |
695 | 696 | ||
696 | #if HZ == 100 | 697 | #if HZ == 100 |
697 | /* | 698 | /* |
698 | * Compensate for (HZ==100) != (1 << SHIFT_HZ). Add 25% and 3.125% to | 699 | * Compensate for (HZ==100) != (1 << SHIFT_HZ). Add 25% and 3.125% to |
699 | * get 128.125; => only 0.125% error (p. 14) | 700 | * get 128.125; => only 0.125% error (p. 14) |
700 | */ | 701 | */ |
701 | time_adj += shift_right(time_adj, 2) + shift_right(time_adj, 5); | 702 | time_adj += shift_right(time_adj, 2) + shift_right(time_adj, 5); |
702 | #endif | 703 | #endif |
703 | #if HZ == 250 | 704 | #if HZ == 250 |
704 | /* | 705 | /* |
705 | * Compensate for (HZ==250) != (1 << SHIFT_HZ). Add 1.5625% and | 706 | * Compensate for (HZ==250) != (1 << SHIFT_HZ). Add 1.5625% and |
706 | * 0.78125% to get 255.85938; => only 0.05% error (p. 14) | 707 | * 0.78125% to get 255.85938; => only 0.05% error (p. 14) |
707 | */ | 708 | */ |
708 | time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7); | 709 | time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7); |
709 | #endif | 710 | #endif |
710 | #if HZ == 1000 | 711 | #if HZ == 1000 |
711 | /* | 712 | /* |
712 | * Compensate for (HZ==1000) != (1 << SHIFT_HZ). Add 1.5625% and | 713 | * Compensate for (HZ==1000) != (1 << SHIFT_HZ). Add 1.5625% and |
713 | * 0.78125% to get 1023.4375; => only 0.05% error (p. 14) | 714 | * 0.78125% to get 1023.4375; => only 0.05% error (p. 14) |
714 | */ | 715 | */ |
715 | time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7); | 716 | time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7); |
716 | #endif | 717 | #endif |
717 | } | 718 | } |
718 | 719 | ||
719 | /* in the NTP reference this is called "hardclock()" */ | 720 | /* in the NTP reference this is called "hardclock()" */ |
720 | static void update_wall_time_one_tick(void) | 721 | static void update_wall_time_one_tick(void) |
721 | { | 722 | { |
722 | long time_adjust_step, delta_nsec; | 723 | long time_adjust_step, delta_nsec; |
723 | 724 | ||
724 | if ((time_adjust_step = time_adjust) != 0 ) { | 725 | if ((time_adjust_step = time_adjust) != 0 ) { |
725 | /* | 726 | /* |
726 | * We are doing an adjtime thing. Prepare time_adjust_step to | 727 | * We are doing an adjtime thing. Prepare time_adjust_step to |
727 | * be within bounds. Note that a positive time_adjust means we | 728 | * be within bounds. Note that a positive time_adjust means we |
728 | * want the clock to run faster. | 729 | * want the clock to run faster. |
729 | * | 730 | * |
730 | * Limit the amount of the step to be in the range | 731 | * Limit the amount of the step to be in the range |
731 | * -tickadj .. +tickadj | 732 | * -tickadj .. +tickadj |
732 | */ | 733 | */ |
733 | time_adjust_step = min(time_adjust_step, (long)tickadj); | 734 | time_adjust_step = min(time_adjust_step, (long)tickadj); |
734 | time_adjust_step = max(time_adjust_step, (long)-tickadj); | 735 | time_adjust_step = max(time_adjust_step, (long)-tickadj); |
735 | 736 | ||
736 | /* Reduce by this step the amount of time left */ | 737 | /* Reduce by this step the amount of time left */ |
737 | time_adjust -= time_adjust_step; | 738 | time_adjust -= time_adjust_step; |
738 | } | 739 | } |
739 | delta_nsec = tick_nsec + time_adjust_step * 1000; | 740 | delta_nsec = tick_nsec + time_adjust_step * 1000; |
740 | /* | 741 | /* |
741 | * Advance the phase, once it gets to one microsecond, then | 742 | * Advance the phase, once it gets to one microsecond, then |
742 | * advance the tick more. | 743 | * advance the tick more. |
743 | */ | 744 | */ |
744 | time_phase += time_adj; | 745 | time_phase += time_adj; |
745 | if ((time_phase >= FINENSEC) || (time_phase <= -FINENSEC)) { | 746 | if ((time_phase >= FINENSEC) || (time_phase <= -FINENSEC)) { |
746 | long ltemp = shift_right(time_phase, (SHIFT_SCALE - 10)); | 747 | long ltemp = shift_right(time_phase, (SHIFT_SCALE - 10)); |
747 | time_phase -= ltemp << (SHIFT_SCALE - 10); | 748 | time_phase -= ltemp << (SHIFT_SCALE - 10); |
748 | delta_nsec += ltemp; | 749 | delta_nsec += ltemp; |
749 | } | 750 | } |
750 | xtime.tv_nsec += delta_nsec; | 751 | xtime.tv_nsec += delta_nsec; |
751 | time_interpolator_update(delta_nsec); | 752 | time_interpolator_update(delta_nsec); |
752 | 753 | ||
753 | /* Changes by adjtime() do not take effect till next tick. */ | 754 | /* Changes by adjtime() do not take effect till next tick. */ |
754 | if (time_next_adjust != 0) { | 755 | if (time_next_adjust != 0) { |
755 | time_adjust = time_next_adjust; | 756 | time_adjust = time_next_adjust; |
756 | time_next_adjust = 0; | 757 | time_next_adjust = 0; |
757 | } | 758 | } |
758 | } | 759 | } |
759 | 760 | ||
760 | /* | 761 | /* |
761 | * Using a loop looks inefficient, but "ticks" is | 762 | * Using a loop looks inefficient, but "ticks" is |
762 | * usually just one (we shouldn't be losing ticks, | 763 | * usually just one (we shouldn't be losing ticks, |
763 | * we're doing it this way mainly for interrupt | 764 | * we're doing it this way mainly for interrupt |
764 | * latency reasons, not because we think we'll | 765 | * latency reasons, not because we think we'll |
765 | * have lots of lost timer ticks). | 766 | * have lots of lost timer ticks). |
766 | */ | 767 | */ |
767 | static void update_wall_time(unsigned long ticks) | 768 | static void update_wall_time(unsigned long ticks) |
768 | { | 769 | { |
769 | do { | 770 | do { |
770 | ticks--; | 771 | ticks--; |
771 | update_wall_time_one_tick(); | 772 | update_wall_time_one_tick(); |
772 | if (xtime.tv_nsec >= 1000000000) { | 773 | if (xtime.tv_nsec >= 1000000000) { |
773 | xtime.tv_nsec -= 1000000000; | 774 | xtime.tv_nsec -= 1000000000; |
774 | xtime.tv_sec++; | 775 | xtime.tv_sec++; |
775 | second_overflow(); | 776 | second_overflow(); |
776 | } | 777 | } |
777 | } while (ticks); | 778 | } while (ticks); |
778 | } | 779 | } |
779 | 780 | ||
780 | /* | 781 | /* |
781 | * Called from the timer interrupt handler to charge one tick to the current | 782 | * Called from the timer interrupt handler to charge one tick to the current |
782 | * process. user_tick is 1 if the tick is user time, 0 for system. | 783 | * process. user_tick is 1 if the tick is user time, 0 for system. |
783 | */ | 784 | */ |
784 | void update_process_times(int user_tick) | 785 | void update_process_times(int user_tick) |
785 | { | 786 | { |
786 | struct task_struct *p = current; | 787 | struct task_struct *p = current; |
787 | int cpu = smp_processor_id(); | 788 | int cpu = smp_processor_id(); |
788 | 789 | ||
789 | /* Note: this timer irq context must be accounted for as well. */ | 790 | /* Note: this timer irq context must be accounted for as well. */ |
790 | if (user_tick) | 791 | if (user_tick) |
791 | account_user_time(p, jiffies_to_cputime(1)); | 792 | account_user_time(p, jiffies_to_cputime(1)); |
792 | else | 793 | else |
793 | account_system_time(p, HARDIRQ_OFFSET, jiffies_to_cputime(1)); | 794 | account_system_time(p, HARDIRQ_OFFSET, jiffies_to_cputime(1)); |
794 | run_local_timers(); | 795 | run_local_timers(); |
795 | if (rcu_pending(cpu)) | 796 | if (rcu_pending(cpu)) |
796 | rcu_check_callbacks(cpu, user_tick); | 797 | rcu_check_callbacks(cpu, user_tick); |
797 | scheduler_tick(); | 798 | scheduler_tick(); |
798 | run_posix_cpu_timers(p); | 799 | run_posix_cpu_timers(p); |
799 | } | 800 | } |
800 | 801 | ||
801 | /* | 802 | /* |
802 | * Nr of active tasks - counted in fixed-point numbers | 803 | * Nr of active tasks - counted in fixed-point numbers |
803 | */ | 804 | */ |
804 | static unsigned long count_active_tasks(void) | 805 | static unsigned long count_active_tasks(void) |
805 | { | 806 | { |
806 | return (nr_running() + nr_uninterruptible()) * FIXED_1; | 807 | return (nr_running() + nr_uninterruptible()) * FIXED_1; |
807 | } | 808 | } |
808 | 809 | ||
809 | /* | 810 | /* |
810 | * Hmm.. Changed this, as the GNU make sources (load.c) seem to | 811 | * Hmm.. Changed this, as the GNU make sources (load.c) seem to |
811 | * imply that avenrun[] is the standard name for this kind of thing. | 812 | * imply that avenrun[] is the standard name for this kind of thing. |
812 | * Nothing else seems to be standardized: the fractional size etc | 813 | * Nothing else seems to be standardized: the fractional size etc |
813 | * all seem to differ on different machines. | 814 | * all seem to differ on different machines. |
814 | * | 815 | * |
815 | * Requires xtime_lock to access. | 816 | * Requires xtime_lock to access. |
816 | */ | 817 | */ |
817 | unsigned long avenrun[3]; | 818 | unsigned long avenrun[3]; |
818 | 819 | ||
819 | EXPORT_SYMBOL(avenrun); | 820 | EXPORT_SYMBOL(avenrun); |
820 | 821 | ||
821 | /* | 822 | /* |
822 | * calc_load - given tick count, update the avenrun load estimates. | 823 | * calc_load - given tick count, update the avenrun load estimates. |
823 | * This is called while holding a write_lock on xtime_lock. | 824 | * This is called while holding a write_lock on xtime_lock. |
824 | */ | 825 | */ |
825 | static inline void calc_load(unsigned long ticks) | 826 | static inline void calc_load(unsigned long ticks) |
826 | { | 827 | { |
827 | unsigned long active_tasks; /* fixed-point */ | 828 | unsigned long active_tasks; /* fixed-point */ |
828 | static int count = LOAD_FREQ; | 829 | static int count = LOAD_FREQ; |
829 | 830 | ||
830 | count -= ticks; | 831 | count -= ticks; |
831 | if (count < 0) { | 832 | if (count < 0) { |
832 | count += LOAD_FREQ; | 833 | count += LOAD_FREQ; |
833 | active_tasks = count_active_tasks(); | 834 | active_tasks = count_active_tasks(); |
834 | CALC_LOAD(avenrun[0], EXP_1, active_tasks); | 835 | CALC_LOAD(avenrun[0], EXP_1, active_tasks); |
835 | CALC_LOAD(avenrun[1], EXP_5, active_tasks); | 836 | CALC_LOAD(avenrun[1], EXP_5, active_tasks); |
836 | CALC_LOAD(avenrun[2], EXP_15, active_tasks); | 837 | CALC_LOAD(avenrun[2], EXP_15, active_tasks); |
837 | } | 838 | } |
838 | } | 839 | } |
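/*
 * A user-space sketch of the fixed-point averaging that calc_load() drives.
 * FSHIFT, FIXED_1 and EXP_1 are assumed to match <linux/sched.h> of this era;
 * they are not part of this diff.
 */
#include <stdio.h>

#define FSHIFT	11			/* bits of fixed-point precision */
#define FIXED_1	(1 << FSHIFT)		/* 1.0 in fixed point */
#define EXP_1	1884			/* 1/exp(5sec/1min), fixed point */

#define CALC_LOAD(load, exp, n) \
	load *= (exp); \
	load += (n) * (FIXED_1 - (exp)); \
	load >>= FSHIFT;

int main(void)
{
	unsigned long avenrun_1 = 0;		/* 1-minute average, fixed point */
	unsigned long active = 3 * FIXED_1;	/* pretend 3 tasks stay runnable */
	int i;

	/* calc_load() runs every LOAD_FREQ (5 s); simulate one minute of it */
	for (i = 0; i < 12; i++) {
		CALC_LOAD(avenrun_1, EXP_1, active);
		printf("after %2d updates: load = %lu.%02lu\n", i + 1,
		       avenrun_1 >> FSHIFT,
		       (avenrun_1 & (FIXED_1 - 1)) * 100 / FIXED_1);
	}
	return 0;
}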
839 | 840 | ||
840 | /* jiffies at the most recent update of wall time */ | 841 | /* jiffies at the most recent update of wall time */ |
841 | unsigned long wall_jiffies = INITIAL_JIFFIES; | 842 | unsigned long wall_jiffies = INITIAL_JIFFIES; |
842 | 843 | ||
843 | /* | 844 | /* |
844 | * This read-write spinlock protects us from races in SMP while | 845 | * This read-write spinlock protects us from races in SMP while |
845 | * playing with xtime and avenrun. | 846 | * playing with xtime and avenrun. |
846 | */ | 847 | */ |
847 | #ifndef ARCH_HAVE_XTIME_LOCK | 848 | #ifndef ARCH_HAVE_XTIME_LOCK |
848 | seqlock_t xtime_lock __cacheline_aligned_in_smp = SEQLOCK_UNLOCKED; | 849 | seqlock_t xtime_lock __cacheline_aligned_in_smp = SEQLOCK_UNLOCKED; |
849 | 850 | ||
850 | EXPORT_SYMBOL(xtime_lock); | 851 | EXPORT_SYMBOL(xtime_lock); |
851 | #endif | 852 | #endif |
852 | 853 | ||
853 | /* | 854 | /* |
854 | * This function runs timers and the timer-tq in bottom half context. | 855 | * This function runs timers and the timer-tq in bottom half context. |
855 | */ | 856 | */ |
856 | static void run_timer_softirq(struct softirq_action *h) | 857 | static void run_timer_softirq(struct softirq_action *h) |
857 | { | 858 | { |
858 | tvec_base_t *base = &__get_cpu_var(tvec_bases); | 859 | tvec_base_t *base = &__get_cpu_var(tvec_bases); |
859 | 860 | ||
860 | if (time_after_eq(jiffies, base->timer_jiffies)) | 861 | if (time_after_eq(jiffies, base->timer_jiffies)) |
861 | __run_timers(base); | 862 | __run_timers(base); |
862 | } | 863 | } |
863 | 864 | ||
864 | /* | 865 | /* |
865 | * Called by the local, per-CPU timer interrupt on SMP. | 866 | * Called by the local, per-CPU timer interrupt on SMP. |
866 | */ | 867 | */ |
867 | void run_local_timers(void) | 868 | void run_local_timers(void) |
868 | { | 869 | { |
869 | raise_softirq(TIMER_SOFTIRQ); | 870 | raise_softirq(TIMER_SOFTIRQ); |
870 | } | 871 | } |
871 | 872 | ||
872 | /* | 873 | /* |
873 | * Called by the timer interrupt. xtime_lock must already be taken | 874 | * Called by the timer interrupt. xtime_lock must already be taken |
874 | * by the timer IRQ! | 875 | * by the timer IRQ! |
875 | */ | 876 | */ |
876 | static inline void update_times(void) | 877 | static inline void update_times(void) |
877 | { | 878 | { |
878 | unsigned long ticks; | 879 | unsigned long ticks; |
879 | 880 | ||
880 | ticks = jiffies - wall_jiffies; | 881 | ticks = jiffies - wall_jiffies; |
881 | if (ticks) { | 882 | if (ticks) { |
882 | wall_jiffies += ticks; | 883 | wall_jiffies += ticks; |
883 | update_wall_time(ticks); | 884 | update_wall_time(ticks); |
884 | } | 885 | } |
885 | calc_load(ticks); | 886 | calc_load(ticks); |
886 | } | 887 | } |
887 | 888 | ||
888 | /* | 889 | /* |
889 | * The 64-bit jiffies value is not atomic - you MUST NOT read it | 890 | * The 64-bit jiffies value is not atomic - you MUST NOT read it |
890 | * without sampling the sequence number in xtime_lock. | 891 | * without sampling the sequence number in xtime_lock. |
891 | * jiffies is defined in the linker script... | 892 | * jiffies is defined in the linker script... |
892 | */ | 893 | */ |
893 | 894 | ||
894 | void do_timer(struct pt_regs *regs) | 895 | void do_timer(struct pt_regs *regs) |
895 | { | 896 | { |
896 | jiffies_64++; | 897 | jiffies_64++; |
897 | update_times(); | 898 | update_times(); |
898 | softlockup_tick(regs); | 899 | softlockup_tick(regs); |
899 | } | 900 | } |
900 | 901 | ||
901 | #ifdef __ARCH_WANT_SYS_ALARM | 902 | #ifdef __ARCH_WANT_SYS_ALARM |
902 | 903 | ||
903 | /* | 904 | /* |
904 | * For backwards compatibility? This can be done in libc so Alpha | 905 | * For backwards compatibility? This can be done in libc so Alpha |
905 | * and all newer ports shouldn't need it. | 906 | * and all newer ports shouldn't need it. |
906 | */ | 907 | */ |
907 | asmlinkage unsigned long sys_alarm(unsigned int seconds) | 908 | asmlinkage unsigned long sys_alarm(unsigned int seconds) |
908 | { | 909 | { |
909 | struct itimerval it_new, it_old; | 910 | struct itimerval it_new, it_old; |
910 | unsigned int oldalarm; | 911 | unsigned int oldalarm; |
911 | 912 | ||
912 | it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0; | 913 | it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0; |
913 | it_new.it_value.tv_sec = seconds; | 914 | it_new.it_value.tv_sec = seconds; |
914 | it_new.it_value.tv_usec = 0; | 915 | it_new.it_value.tv_usec = 0; |
915 | do_setitimer(ITIMER_REAL, &it_new, &it_old); | 916 | do_setitimer(ITIMER_REAL, &it_new, &it_old); |
916 | oldalarm = it_old.it_value.tv_sec; | 917 | oldalarm = it_old.it_value.tv_sec; |
917 | /* ehhh.. We can't return 0 if we have an alarm pending.. */ | 918 | /* ehhh.. We can't return 0 if we have an alarm pending.. */ |
918 | /* And we'd better return too much than too little anyway */ | 919 | /* And we'd better return too much than too little anyway */ |
919 | if ((!oldalarm && it_old.it_value.tv_usec) || it_old.it_value.tv_usec >= 500000) | 920 | if ((!oldalarm && it_old.it_value.tv_usec) || it_old.it_value.tv_usec >= 500000) |
920 | oldalarm++; | 921 | oldalarm++; |
921 | return oldalarm; | 922 | return oldalarm; |
922 | } | 923 | } |
923 | 924 | ||
924 | #endif | 925 | #endif |
925 | 926 | ||
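
As the comment above suggests, the same behaviour can live in libc. A rough user-space sketch of alarm() on top of setitimer() (illustrative only; it rounds any leftover microseconds up, whereas the kernel version above rounds half-up):

	#include <sys/time.h>

	unsigned int alarm_via_setitimer(unsigned int seconds)
	{
		struct itimerval it_new = { { 0, 0 }, { seconds, 0 } }, it_old;

		setitimer(ITIMER_REAL, &it_new, &it_old);
		if (it_old.it_value.tv_usec)
			it_old.it_value.tv_sec++;	/* never report less time than remains */
		return it_old.it_value.tv_sec;
	}
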
926 | #ifndef __alpha__ | 927 | #ifndef __alpha__ |
927 | 928 | ||
928 | /* | 929 | /* |
929 | * The Alpha uses getxpid, getxuid, and getxgid instead. Maybe this | 930 | * The Alpha uses getxpid, getxuid, and getxgid instead. Maybe this |
930 | * should be moved into arch/i386 instead? | 931 | * should be moved into arch/i386 instead? |
931 | */ | 932 | */ |
932 | 933 | ||
933 | /** | 934 | /** |
934 | * sys_getpid - return the thread group id of the current process | 935 | * sys_getpid - return the thread group id of the current process |
935 | * | 936 | * |
936 | * Note, despite the name, this returns the tgid not the pid. The tgid and | 937 | * Note, despite the name, this returns the tgid not the pid. The tgid and |
937 | * the pid are identical unless CLONE_THREAD was specified on clone() in | 938 | * the pid are identical unless CLONE_THREAD was specified on clone() in |
938 | * which case the tgid is the same in all threads of the same group. | 939 | * which case the tgid is the same in all threads of the same group. |
939 | * | 940 | * |
940 | * This is SMP safe as current->tgid does not change. | 941 | * This is SMP safe as current->tgid does not change. |
941 | */ | 942 | */ |
942 | asmlinkage long sys_getpid(void) | 943 | asmlinkage long sys_getpid(void) |
943 | { | 944 | { |
944 | return current->tgid; | 945 | return current->tgid; |
945 | } | 946 | } |
946 | 947 | ||
947 | /* | 948 | /* |
948 | * Accessing ->group_leader->real_parent is not SMP-safe; it could | 949 | * Accessing ->group_leader->real_parent is not SMP-safe; it could |
949 | * change from under us. However, rather than getting any lock | 950 | * change from under us. However, rather than getting any lock |
950 | * we can use an optimistic algorithm: get the parent | 951 | * we can use an optimistic algorithm: get the parent |
951 | * pid, and go back and check that the parent is still | 952 | * pid, and go back and check that the parent is still |
952 | * the same. If it has changed (which is extremely unlikely | 953 | * the same. If it has changed (which is extremely unlikely |
953 | * indeed), we just try again.. | 954 | * indeed), we just try again.. |
954 | * | 955 | * |
955 | * NOTE! This depends on the fact that even if we _do_ | 956 | * NOTE! This depends on the fact that even if we _do_ |
956 | * get an old value of "parent", we can happily dereference | 957 | * get an old value of "parent", we can happily dereference |
957 | * the pointer (it was and remains a dereferenceable kernel pointer | 958 | * the pointer (it was and remains a dereferenceable kernel pointer |
958 | * no matter what): we just can't necessarily trust the result | 959 | * no matter what): we just can't necessarily trust the result |
959 | * until we know that the parent pointer is valid. | 960 | * until we know that the parent pointer is valid. |
960 | * | 961 | * |
961 | * NOTE2: ->group_leader never changes from under us. | 962 | * NOTE2: ->group_leader never changes from under us. |
962 | */ | 963 | */ |
963 | asmlinkage long sys_getppid(void) | 964 | asmlinkage long sys_getppid(void) |
964 | { | 965 | { |
965 | int pid; | 966 | int pid; |
966 | struct task_struct *me = current; | 967 | struct task_struct *me = current; |
967 | struct task_struct *parent; | 968 | struct task_struct *parent; |
968 | 969 | ||
969 | parent = me->group_leader->real_parent; | 970 | parent = me->group_leader->real_parent; |
970 | for (;;) { | 971 | for (;;) { |
971 | pid = parent->tgid; | 972 | pid = parent->tgid; |
972 | #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) | 973 | #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) |
973 | { | 974 | { |
974 | struct task_struct *old = parent; | 975 | struct task_struct *old = parent; |
975 | 976 | ||
976 | /* | 977 | /* |
977 | * Make sure we read the pid before re-reading the | 978 | * Make sure we read the pid before re-reading the |
978 | * parent pointer: | 979 | * parent pointer: |
979 | */ | 980 | */ |
980 | smp_rmb(); | 981 | smp_rmb(); |
981 | parent = me->group_leader->real_parent; | 982 | parent = me->group_leader->real_parent; |
982 | if (old != parent) | 983 | if (old != parent) |
983 | continue; | 984 | continue; |
984 | } | 985 | } |
985 | #endif | 986 | #endif |
986 | break; | 987 | break; |
987 | } | 988 | } |
988 | return pid; | 989 | return pid; |
989 | } | 990 | } |
990 | 991 | ||
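
The lock-free read above is an instance of a general optimistic pattern: read a value through a pointer, then re-check that the pointer itself did not change, retrying if it did. A generic sketch with hypothetical names:

	struct foo {
		int field;
	};

	static int optimistic_read(struct foo **slot)
	{
		struct foo *p = *slot;
		int val;

		for (;;) {
			struct foo *old = p;

			val = p->field;
			smp_rmb();	/* order the field read before the pointer re-read */
			p = *slot;
			if (old == p)
				break;	/* pointer was stable, so val can be trusted */
		}
		return val;
	}
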
991 | asmlinkage long sys_getuid(void) | 992 | asmlinkage long sys_getuid(void) |
992 | { | 993 | { |
993 | /* Only we change this so SMP safe */ | 994 | /* Only we change this so SMP safe */ |
994 | return current->uid; | 995 | return current->uid; |
995 | } | 996 | } |
996 | 997 | ||
997 | asmlinkage long sys_geteuid(void) | 998 | asmlinkage long sys_geteuid(void) |
998 | { | 999 | { |
999 | /* Only we change this so SMP safe */ | 1000 | /* Only we change this so SMP safe */ |
1000 | return current->euid; | 1001 | return current->euid; |
1001 | } | 1002 | } |
1002 | 1003 | ||
1003 | asmlinkage long sys_getgid(void) | 1004 | asmlinkage long sys_getgid(void) |
1004 | { | 1005 | { |
1005 | /* Only we change this so SMP safe */ | 1006 | /* Only we change this so SMP safe */ |
1006 | return current->gid; | 1007 | return current->gid; |
1007 | } | 1008 | } |
1008 | 1009 | ||
1009 | asmlinkage long sys_getegid(void) | 1010 | asmlinkage long sys_getegid(void) |
1010 | { | 1011 | { |
1011 | /* Only we change this so SMP safe */ | 1012 | /* Only we change this so SMP safe */ |
1012 | return current->egid; | 1013 | return current->egid; |
1013 | } | 1014 | } |
1014 | 1015 | ||
1015 | #endif | 1016 | #endif |
1016 | 1017 | ||
1017 | static void process_timeout(unsigned long __data) | 1018 | static void process_timeout(unsigned long __data) |
1018 | { | 1019 | { |
1019 | wake_up_process((task_t *)__data); | 1020 | wake_up_process((task_t *)__data); |
1020 | } | 1021 | } |
1021 | 1022 | ||
1022 | /** | 1023 | /** |
1023 | * schedule_timeout - sleep until timeout | 1024 | * schedule_timeout - sleep until timeout |
1024 | * @timeout: timeout value in jiffies | 1025 | * @timeout: timeout value in jiffies |
1025 | * | 1026 | * |
1026 | * Make the current task sleep until @timeout jiffies have | 1027 | * Make the current task sleep until @timeout jiffies have |
1027 | * elapsed. The routine will return immediately unless | 1028 | * elapsed. The routine will return immediately unless |
1028 | * the current task state has been set (see set_current_state()). | 1029 | * the current task state has been set (see set_current_state()). |
1029 | * | 1030 | * |
1030 | * You can set the task state as follows - | 1031 | * You can set the task state as follows - |
1031 | * | 1032 | * |
1032 | * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to | 1033 | * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to |
1033 | * pass before the routine returns. The routine will return 0. | 1034 | * pass before the routine returns. The routine will return 0. |
1034 | * | 1035 | * |
1035 | * %TASK_INTERRUPTIBLE - the routine may return early if a signal is | 1036 | * %TASK_INTERRUPTIBLE - the routine may return early if a signal is |
1036 | * delivered to the current task. In this case the remaining time | 1037 | * delivered to the current task. In this case the remaining time |
1037 | * in jiffies will be returned, or 0 if the timer expired in time | 1038 | * in jiffies will be returned, or 0 if the timer expired in time |
1038 | * | 1039 | * |
1039 | * The current task state is guaranteed to be TASK_RUNNING when this | 1040 | * The current task state is guaranteed to be TASK_RUNNING when this |
1040 | * routine returns. | 1041 | * routine returns. |
1041 | * | 1042 | * |
1042 | * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule | 1043 | * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule |
1043 | * the CPU away without a bound on the timeout. In this case the return | 1044 | * the CPU away without a bound on the timeout. In this case the return |
1044 | * value will be %MAX_SCHEDULE_TIMEOUT. | 1045 | * value will be %MAX_SCHEDULE_TIMEOUT. |
1045 | * | 1046 | * |
1046 | * In all cases the return value is guaranteed to be non-negative. | 1047 | * In all cases the return value is guaranteed to be non-negative. |
1047 | */ | 1048 | */ |
1048 | fastcall signed long __sched schedule_timeout(signed long timeout) | 1049 | fastcall signed long __sched schedule_timeout(signed long timeout) |
1049 | { | 1050 | { |
1050 | struct timer_list timer; | 1051 | struct timer_list timer; |
1051 | unsigned long expire; | 1052 | unsigned long expire; |
1052 | 1053 | ||
1053 | switch (timeout) | 1054 | switch (timeout) |
1054 | { | 1055 | { |
1055 | case MAX_SCHEDULE_TIMEOUT: | 1056 | case MAX_SCHEDULE_TIMEOUT: |
1056 | /* | 1057 | /* |
1057 | * These two special cases are useful to be comfortable | 1058 | * These two special cases are useful to be comfortable |
1058 | * in the caller. Nothing more. We could take | 1059 | * in the caller. Nothing more. We could take |
1059 | * MAX_SCHEDULE_TIMEOUT from one of the negative values | 1060 | * MAX_SCHEDULE_TIMEOUT from one of the negative values |
1060 | * but I'd like to return a valid offset (>=0) to allow | 1061 | * but I'd like to return a valid offset (>=0) to allow |
1061 | * the caller to do everything it wants with the retval. | 1062 | * the caller to do everything it wants with the retval. |
1062 | */ | 1063 | */ |
1063 | schedule(); | 1064 | schedule(); |
1064 | goto out; | 1065 | goto out; |
1065 | default: | 1066 | default: |
1066 | /* | 1067 | /* |
1067 | * Another bit of PARANOID. Note that the retval will be | 1068 | * Another bit of PARANOID. Note that the retval will be |
1068 | * 0 since no piece of kernel is supposed to do a check | 1069 | * 0 since no piece of kernel is supposed to do a check |
1069 | * for a negative retval of schedule_timeout() (since it | 1070 | * for a negative retval of schedule_timeout() (since it |
1070 | * should never happen anyway). You just have the printk() | 1071 | * should never happen anyway). You just have the printk() |
1071 | * that will tell you if something has gone wrong and where. | 1072 | * that will tell you if something has gone wrong and where. |
1072 | */ | 1073 | */ |
1073 | if (timeout < 0) | 1074 | if (timeout < 0) |
1074 | { | 1075 | { |
1075 | printk(KERN_ERR "schedule_timeout: wrong timeout " | 1076 | printk(KERN_ERR "schedule_timeout: wrong timeout " |
1076 | "value %lx from %p\n", timeout, | 1077 | "value %lx from %p\n", timeout, |
1077 | __builtin_return_address(0)); | 1078 | __builtin_return_address(0)); |
1078 | current->state = TASK_RUNNING; | 1079 | current->state = TASK_RUNNING; |
1079 | goto out; | 1080 | goto out; |
1080 | } | 1081 | } |
1081 | } | 1082 | } |
1082 | 1083 | ||
1083 | expire = timeout + jiffies; | 1084 | expire = timeout + jiffies; |
1084 | 1085 | ||
1085 | setup_timer(&timer, process_timeout, (unsigned long)current); | 1086 | setup_timer(&timer, process_timeout, (unsigned long)current); |
1086 | __mod_timer(&timer, expire); | 1087 | __mod_timer(&timer, expire); |
1087 | schedule(); | 1088 | schedule(); |
1088 | del_singleshot_timer_sync(&timer); | 1089 | del_singleshot_timer_sync(&timer); |
1089 | 1090 | ||
1090 | timeout = expire - jiffies; | 1091 | timeout = expire - jiffies; |
1091 | 1092 | ||
1092 | out: | 1093 | out: |
1093 | return timeout < 0 ? 0 : timeout; | 1094 | return timeout < 0 ? 0 : timeout; |
1094 | } | 1095 | } |
1095 | EXPORT_SYMBOL(schedule_timeout); | 1096 | EXPORT_SYMBOL(schedule_timeout); |
1096 | 1097 | ||
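
A typical caller follows the contract spelled out in the kernel-doc above: set the task state first, then let schedule_timeout() arm the one-shot timer and sleep. A hedged usage sketch:

	signed long remaining;

	set_current_state(TASK_INTERRUPTIBLE);
	remaining = schedule_timeout(msecs_to_jiffies(100));
	if (remaining)
		printk(KERN_DEBUG "woken early, %ld jiffies left\n", remaining);
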
1097 | /* | 1098 | /* |
1098 | * We can use __set_current_state() here because schedule_timeout() calls | 1099 | * We can use __set_current_state() here because schedule_timeout() calls |
1099 | * schedule() unconditionally. | 1100 | * schedule() unconditionally. |
1100 | */ | 1101 | */ |
1101 | signed long __sched schedule_timeout_interruptible(signed long timeout) | 1102 | signed long __sched schedule_timeout_interruptible(signed long timeout) |
1102 | { | 1103 | { |
1103 | __set_current_state(TASK_INTERRUPTIBLE); | 1104 | __set_current_state(TASK_INTERRUPTIBLE); |
1104 | return schedule_timeout(timeout); | 1105 | return schedule_timeout(timeout); |
1105 | } | 1106 | } |
1106 | EXPORT_SYMBOL(schedule_timeout_interruptible); | 1107 | EXPORT_SYMBOL(schedule_timeout_interruptible); |
1107 | 1108 | ||
1108 | signed long __sched schedule_timeout_uninterruptible(signed long timeout) | 1109 | signed long __sched schedule_timeout_uninterruptible(signed long timeout) |
1109 | { | 1110 | { |
1110 | __set_current_state(TASK_UNINTERRUPTIBLE); | 1111 | __set_current_state(TASK_UNINTERRUPTIBLE); |
1111 | return schedule_timeout(timeout); | 1112 | return schedule_timeout(timeout); |
1112 | } | 1113 | } |
1113 | EXPORT_SYMBOL(schedule_timeout_uninterruptible); | 1114 | EXPORT_SYMBOL(schedule_timeout_uninterruptible); |
1114 | 1115 | ||
1115 | /* Thread ID - the internal kernel "pid" */ | 1116 | /* Thread ID - the internal kernel "pid" */ |
1116 | asmlinkage long sys_gettid(void) | 1117 | asmlinkage long sys_gettid(void) |
1117 | { | 1118 | { |
1118 | return current->pid; | 1119 | return current->pid; |
1119 | } | 1120 | } |
1120 | 1121 | ||
1121 | static long __sched nanosleep_restart(struct restart_block *restart) | 1122 | static long __sched nanosleep_restart(struct restart_block *restart) |
1122 | { | 1123 | { |
1123 | unsigned long expire = restart->arg0, now = jiffies; | 1124 | unsigned long expire = restart->arg0, now = jiffies; |
1124 | struct timespec __user *rmtp = (struct timespec __user *) restart->arg1; | 1125 | struct timespec __user *rmtp = (struct timespec __user *) restart->arg1; |
1125 | long ret; | 1126 | long ret; |
1126 | 1127 | ||
1127 | /* Did it expire while we handled signals? */ | 1128 | /* Did it expire while we handled signals? */ |
1128 | if (!time_after(expire, now)) | 1129 | if (!time_after(expire, now)) |
1129 | return 0; | 1130 | return 0; |
1130 | 1131 | ||
1131 | expire = schedule_timeout_interruptible(expire - now); | 1132 | expire = schedule_timeout_interruptible(expire - now); |
1132 | 1133 | ||
1133 | ret = 0; | 1134 | ret = 0; |
1134 | if (expire) { | 1135 | if (expire) { |
1135 | struct timespec t; | 1136 | struct timespec t; |
1136 | jiffies_to_timespec(expire, &t); | 1137 | jiffies_to_timespec(expire, &t); |
1137 | 1138 | ||
1138 | ret = -ERESTART_RESTARTBLOCK; | 1139 | ret = -ERESTART_RESTARTBLOCK; |
1139 | if (rmtp && copy_to_user(rmtp, &t, sizeof(t))) | 1140 | if (rmtp && copy_to_user(rmtp, &t, sizeof(t))) |
1140 | ret = -EFAULT; | 1141 | ret = -EFAULT; |
1141 | /* The 'restart' block is already filled in */ | 1142 | /* The 'restart' block is already filled in */ |
1142 | } | 1143 | } |
1143 | return ret; | 1144 | return ret; |
1144 | } | 1145 | } |
1145 | 1146 | ||
1146 | asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp) | 1147 | asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp) |
1147 | { | 1148 | { |
1148 | struct timespec t; | 1149 | struct timespec t; |
1149 | unsigned long expire; | 1150 | unsigned long expire; |
1150 | long ret; | 1151 | long ret; |
1151 | 1152 | ||
1152 | if (copy_from_user(&t, rqtp, sizeof(t))) | 1153 | if (copy_from_user(&t, rqtp, sizeof(t))) |
1153 | return -EFAULT; | 1154 | return -EFAULT; |
1154 | 1155 | ||
1155 | if ((t.tv_nsec >= 1000000000L) || (t.tv_nsec < 0) || (t.tv_sec < 0)) | 1156 | if ((t.tv_nsec >= 1000000000L) || (t.tv_nsec < 0) || (t.tv_sec < 0)) |
1156 | return -EINVAL; | 1157 | return -EINVAL; |
1157 | 1158 | ||
1158 | expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec); | 1159 | expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec); |
1159 | expire = schedule_timeout_interruptible(expire); | 1160 | expire = schedule_timeout_interruptible(expire); |
1160 | 1161 | ||
1161 | ret = 0; | 1162 | ret = 0; |
1162 | if (expire) { | 1163 | if (expire) { |
1163 | struct restart_block *restart; | 1164 | struct restart_block *restart; |
1164 | jiffies_to_timespec(expire, &t); | 1165 | jiffies_to_timespec(expire, &t); |
1165 | if (rmtp && copy_to_user(rmtp, &t, sizeof(t))) | 1166 | if (rmtp && copy_to_user(rmtp, &t, sizeof(t))) |
1166 | return -EFAULT; | 1167 | return -EFAULT; |
1167 | 1168 | ||
1168 | restart = &current_thread_info()->restart_block; | 1169 | restart = &current_thread_info()->restart_block; |
1169 | restart->fn = nanosleep_restart; | 1170 | restart->fn = nanosleep_restart; |
1170 | restart->arg0 = jiffies + expire; | 1171 | restart->arg0 = jiffies + expire; |
1171 | restart->arg1 = (unsigned long) rmtp; | 1172 | restart->arg1 = (unsigned long) rmtp; |
1172 | ret = -ERESTART_RESTARTBLOCK; | 1173 | ret = -ERESTART_RESTARTBLOCK; |
1173 | } | 1174 | } |
1174 | return ret; | 1175 | return ret; |
1175 | } | 1176 | } |
1176 | 1177 | ||
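
One detail worth calling out in sys_nanosleep() above: the "+ (t.tv_sec || t.tv_nsec)" term adds one extra jiffy to any non-zero request, so the sleep always covers at least the asked-for interval even when the call lands near the end of the current tick.
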
1177 | /* | 1178 | /* |
1178 | * sys_sysinfo - fill in sysinfo struct | 1179 | * sys_sysinfo - fill in sysinfo struct |
1179 | */ | 1180 | */ |
1180 | asmlinkage long sys_sysinfo(struct sysinfo __user *info) | 1181 | asmlinkage long sys_sysinfo(struct sysinfo __user *info) |
1181 | { | 1182 | { |
1182 | struct sysinfo val; | 1183 | struct sysinfo val; |
1183 | unsigned long mem_total, sav_total; | 1184 | unsigned long mem_total, sav_total; |
1184 | unsigned int mem_unit, bitcount; | 1185 | unsigned int mem_unit, bitcount; |
1185 | unsigned long seq; | 1186 | unsigned long seq; |
1186 | 1187 | ||
1187 | memset((char *)&val, 0, sizeof(struct sysinfo)); | 1188 | memset((char *)&val, 0, sizeof(struct sysinfo)); |
1188 | 1189 | ||
1189 | do { | 1190 | do { |
1190 | struct timespec tp; | 1191 | struct timespec tp; |
1191 | seq = read_seqbegin(&xtime_lock); | 1192 | seq = read_seqbegin(&xtime_lock); |
1192 | 1193 | ||
1193 | /* | 1194 | /* |
1194 | * This is annoying. The below is the same thing | 1195 | * This is annoying. The below is the same thing |
1195 | * posix_get_clock_monotonic() does, but it wants to | 1196 | * posix_get_clock_monotonic() does, but it wants to |
1196 | * take the lock which we want to cover the loads stuff | 1197 | * take the lock which we want to cover the loads stuff |
1197 | * too. | 1198 | * too. |
1198 | */ | 1199 | */ |
1199 | 1200 | ||
1200 | getnstimeofday(&tp); | 1201 | getnstimeofday(&tp); |
1201 | tp.tv_sec += wall_to_monotonic.tv_sec; | 1202 | tp.tv_sec += wall_to_monotonic.tv_sec; |
1202 | tp.tv_nsec += wall_to_monotonic.tv_nsec; | 1203 | tp.tv_nsec += wall_to_monotonic.tv_nsec; |
1203 | if (tp.tv_nsec - NSEC_PER_SEC >= 0) { | 1204 | if (tp.tv_nsec - NSEC_PER_SEC >= 0) { |
1204 | tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC; | 1205 | tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC; |
1205 | tp.tv_sec++; | 1206 | tp.tv_sec++; |
1206 | } | 1207 | } |
1207 | val.uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); | 1208 | val.uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); |
1208 | 1209 | ||
1209 | val.loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT); | 1210 | val.loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT); |
1210 | val.loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT); | 1211 | val.loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT); |
1211 | val.loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT); | 1212 | val.loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT); |
1212 | 1213 | ||
1213 | val.procs = nr_threads; | 1214 | val.procs = nr_threads; |
1214 | } while (read_seqretry(&xtime_lock, seq)); | 1215 | } while (read_seqretry(&xtime_lock, seq)); |
1215 | 1216 | ||
1216 | si_meminfo(&val); | 1217 | si_meminfo(&val); |
1217 | si_swapinfo(&val); | 1218 | si_swapinfo(&val); |
1218 | 1219 | ||
1219 | /* | 1220 | /* |
1220 | * If the sum of all the available memory (i.e. ram + swap) | 1221 | * If the sum of all the available memory (i.e. ram + swap) |
1221 | * is less than can be stored in a 32 bit unsigned long then | 1222 | * is less than can be stored in a 32 bit unsigned long then |
1222 | * we can be binary compatible with 2.2.x kernels. If not, | 1223 | * we can be binary compatible with 2.2.x kernels. If not, |
1223 | * well, in that case 2.2.x was broken anyways... | 1224 | * well, in that case 2.2.x was broken anyways... |
1224 | * | 1225 | * |
1225 | * -Erik Andersen <andersee@debian.org> | 1226 | * -Erik Andersen <andersee@debian.org> |
1226 | */ | 1227 | */ |
1227 | 1228 | ||
1228 | mem_total = val.totalram + val.totalswap; | 1229 | mem_total = val.totalram + val.totalswap; |
1229 | if (mem_total < val.totalram || mem_total < val.totalswap) | 1230 | if (mem_total < val.totalram || mem_total < val.totalswap) |
1230 | goto out; | 1231 | goto out; |
1231 | bitcount = 0; | 1232 | bitcount = 0; |
1232 | mem_unit = val.mem_unit; | 1233 | mem_unit = val.mem_unit; |
1233 | while (mem_unit > 1) { | 1234 | while (mem_unit > 1) { |
1234 | bitcount++; | 1235 | bitcount++; |
1235 | mem_unit >>= 1; | 1236 | mem_unit >>= 1; |
1236 | sav_total = mem_total; | 1237 | sav_total = mem_total; |
1237 | mem_total <<= 1; | 1238 | mem_total <<= 1; |
1238 | if (mem_total < sav_total) | 1239 | if (mem_total < sav_total) |
1239 | goto out; | 1240 | goto out; |
1240 | } | 1241 | } |
1241 | 1242 | ||
1242 | /* | 1243 | /* |
1243 | * If mem_total did not overflow, multiply all memory values by | 1244 | * If mem_total did not overflow, multiply all memory values by |
1244 | * val.mem_unit and set it to 1. This leaves things compatible | 1245 | * val.mem_unit and set it to 1. This leaves things compatible |
1245 | * with 2.2.x, and also retains compatibility with earlier 2.4.x | 1246 | * with 2.2.x, and also retains compatibility with earlier 2.4.x |
1246 | * kernels... | 1247 | * kernels... |
1247 | */ | 1248 | */ |
1248 | 1249 | ||
1249 | val.mem_unit = 1; | 1250 | val.mem_unit = 1; |
1250 | val.totalram <<= bitcount; | 1251 | val.totalram <<= bitcount; |
1251 | val.freeram <<= bitcount; | 1252 | val.freeram <<= bitcount; |
1252 | val.sharedram <<= bitcount; | 1253 | val.sharedram <<= bitcount; |
1253 | val.bufferram <<= bitcount; | 1254 | val.bufferram <<= bitcount; |
1254 | val.totalswap <<= bitcount; | 1255 | val.totalswap <<= bitcount; |
1255 | val.freeswap <<= bitcount; | 1256 | val.freeswap <<= bitcount; |
1256 | val.totalhigh <<= bitcount; | 1257 | val.totalhigh <<= bitcount; |
1257 | val.freehigh <<= bitcount; | 1258 | val.freehigh <<= bitcount; |
1258 | 1259 | ||
1259 | out: | 1260 | out: |
1260 | if (copy_to_user(info, &val, sizeof(struct sysinfo))) | 1261 | if (copy_to_user(info, &val, sizeof(struct sysinfo))) |
1261 | return -EFAULT; | 1262 | return -EFAULT; |
1262 | 1263 | ||
1263 | return 0; | 1264 | return 0; |
1264 | } | 1265 | } |
1265 | 1266 | ||
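
To make the mem_unit normalisation above concrete with illustrative numbers: if si_meminfo() reports val.mem_unit as 4096 and val.totalram as 262144 pages, the loop halves mem_unit twelve times, so bitcount ends up as 12 and the reported totalram becomes 262144 << 12 = 1073741824 bytes with mem_unit forced to 1. If doubling mem_total would ever overflow the unsigned long, the function instead bails out to the copy_to_user() with the values left in their original units.
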
1266 | static void __devinit init_timers_cpu(int cpu) | 1267 | static void __devinit init_timers_cpu(int cpu) |
1267 | { | 1268 | { |
1268 | int j; | 1269 | int j; |
1269 | tvec_base_t *base; | 1270 | tvec_base_t *base; |
1270 | 1271 | ||
1271 | base = &per_cpu(tvec_bases, cpu); | 1272 | base = &per_cpu(tvec_bases, cpu); |
1272 | spin_lock_init(&base->t_base.lock); | 1273 | spin_lock_init(&base->t_base.lock); |
1273 | for (j = 0; j < TVN_SIZE; j++) { | 1274 | for (j = 0; j < TVN_SIZE; j++) { |
1274 | INIT_LIST_HEAD(base->tv5.vec + j); | 1275 | INIT_LIST_HEAD(base->tv5.vec + j); |
1275 | INIT_LIST_HEAD(base->tv4.vec + j); | 1276 | INIT_LIST_HEAD(base->tv4.vec + j); |
1276 | INIT_LIST_HEAD(base->tv3.vec + j); | 1277 | INIT_LIST_HEAD(base->tv3.vec + j); |
1277 | INIT_LIST_HEAD(base->tv2.vec + j); | 1278 | INIT_LIST_HEAD(base->tv2.vec + j); |
1278 | } | 1279 | } |
1279 | for (j = 0; j < TVR_SIZE; j++) | 1280 | for (j = 0; j < TVR_SIZE; j++) |
1280 | INIT_LIST_HEAD(base->tv1.vec + j); | 1281 | INIT_LIST_HEAD(base->tv1.vec + j); |
1281 | 1282 | ||
1282 | base->timer_jiffies = jiffies; | 1283 | base->timer_jiffies = jiffies; |
1283 | } | 1284 | } |
1284 | 1285 | ||
1285 | #ifdef CONFIG_HOTPLUG_CPU | 1286 | #ifdef CONFIG_HOTPLUG_CPU |
1286 | static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head) | 1287 | static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head) |
1287 | { | 1288 | { |
1288 | struct timer_list *timer; | 1289 | struct timer_list *timer; |
1289 | 1290 | ||
1290 | while (!list_empty(head)) { | 1291 | while (!list_empty(head)) { |
1291 | timer = list_entry(head->next, struct timer_list, entry); | 1292 | timer = list_entry(head->next, struct timer_list, entry); |
1292 | detach_timer(timer, 0); | 1293 | detach_timer(timer, 0); |
1293 | timer->base = &new_base->t_base; | 1294 | timer->base = &new_base->t_base; |
1294 | internal_add_timer(new_base, timer); | 1295 | internal_add_timer(new_base, timer); |
1295 | } | 1296 | } |
1296 | } | 1297 | } |
1297 | 1298 | ||
1298 | static void __devinit migrate_timers(int cpu) | 1299 | static void __devinit migrate_timers(int cpu) |
1299 | { | 1300 | { |
1300 | tvec_base_t *old_base; | 1301 | tvec_base_t *old_base; |
1301 | tvec_base_t *new_base; | 1302 | tvec_base_t *new_base; |
1302 | int i; | 1303 | int i; |
1303 | 1304 | ||
1304 | BUG_ON(cpu_online(cpu)); | 1305 | BUG_ON(cpu_online(cpu)); |
1305 | old_base = &per_cpu(tvec_bases, cpu); | 1306 | old_base = &per_cpu(tvec_bases, cpu); |
1306 | new_base = &get_cpu_var(tvec_bases); | 1307 | new_base = &get_cpu_var(tvec_bases); |
1307 | 1308 | ||
1308 | local_irq_disable(); | 1309 | local_irq_disable(); |
1309 | spin_lock(&new_base->t_base.lock); | 1310 | spin_lock(&new_base->t_base.lock); |
1310 | spin_lock(&old_base->t_base.lock); | 1311 | spin_lock(&old_base->t_base.lock); |
1311 | 1312 | ||
1312 | if (old_base->t_base.running_timer) | 1313 | if (old_base->t_base.running_timer) |
1313 | BUG(); | 1314 | BUG(); |
1314 | for (i = 0; i < TVR_SIZE; i++) | 1315 | for (i = 0; i < TVR_SIZE; i++) |
1315 | migrate_timer_list(new_base, old_base->tv1.vec + i); | 1316 | migrate_timer_list(new_base, old_base->tv1.vec + i); |
1316 | for (i = 0; i < TVN_SIZE; i++) { | 1317 | for (i = 0; i < TVN_SIZE; i++) { |
1317 | migrate_timer_list(new_base, old_base->tv2.vec + i); | 1318 | migrate_timer_list(new_base, old_base->tv2.vec + i); |
1318 | migrate_timer_list(new_base, old_base->tv3.vec + i); | 1319 | migrate_timer_list(new_base, old_base->tv3.vec + i); |
1319 | migrate_timer_list(new_base, old_base->tv4.vec + i); | 1320 | migrate_timer_list(new_base, old_base->tv4.vec + i); |
1320 | migrate_timer_list(new_base, old_base->tv5.vec + i); | 1321 | migrate_timer_list(new_base, old_base->tv5.vec + i); |
1321 | } | 1322 | } |
1322 | 1323 | ||
1323 | spin_unlock(&old_base->t_base.lock); | 1324 | spin_unlock(&old_base->t_base.lock); |
1324 | spin_unlock(&new_base->t_base.lock); | 1325 | spin_unlock(&new_base->t_base.lock); |
1325 | local_irq_enable(); | 1326 | local_irq_enable(); |
1326 | put_cpu_var(tvec_bases); | 1327 | put_cpu_var(tvec_bases); |
1327 | } | 1328 | } |
1328 | #endif /* CONFIG_HOTPLUG_CPU */ | 1329 | #endif /* CONFIG_HOTPLUG_CPU */ |
1329 | 1330 | ||
1330 | static int __devinit timer_cpu_notify(struct notifier_block *self, | 1331 | static int __devinit timer_cpu_notify(struct notifier_block *self, |
1331 | unsigned long action, void *hcpu) | 1332 | unsigned long action, void *hcpu) |
1332 | { | 1333 | { |
1333 | long cpu = (long)hcpu; | 1334 | long cpu = (long)hcpu; |
1334 | switch(action) { | 1335 | switch(action) { |
1335 | case CPU_UP_PREPARE: | 1336 | case CPU_UP_PREPARE: |
1336 | init_timers_cpu(cpu); | 1337 | init_timers_cpu(cpu); |
1337 | break; | 1338 | break; |
1338 | #ifdef CONFIG_HOTPLUG_CPU | 1339 | #ifdef CONFIG_HOTPLUG_CPU |
1339 | case CPU_DEAD: | 1340 | case CPU_DEAD: |
1340 | migrate_timers(cpu); | 1341 | migrate_timers(cpu); |
1341 | break; | 1342 | break; |
1342 | #endif | 1343 | #endif |
1343 | default: | 1344 | default: |
1344 | break; | 1345 | break; |
1345 | } | 1346 | } |
1346 | return NOTIFY_OK; | 1347 | return NOTIFY_OK; |
1347 | } | 1348 | } |
1348 | 1349 | ||
1349 | static struct notifier_block __devinitdata timers_nb = { | 1350 | static struct notifier_block __devinitdata timers_nb = { |
1350 | .notifier_call = timer_cpu_notify, | 1351 | .notifier_call = timer_cpu_notify, |
1351 | }; | 1352 | }; |
1352 | 1353 | ||
1353 | 1354 | ||
1354 | void __init init_timers(void) | 1355 | void __init init_timers(void) |
1355 | { | 1356 | { |
1356 | timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, | 1357 | timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, |
1357 | (void *)(long)smp_processor_id()); | 1358 | (void *)(long)smp_processor_id()); |
1358 | register_cpu_notifier(&timers_nb); | 1359 | register_cpu_notifier(&timers_nb); |
1359 | open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL); | 1360 | open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL); |
1360 | } | 1361 | } |
1361 | 1362 | ||
1362 | #ifdef CONFIG_TIME_INTERPOLATION | 1363 | #ifdef CONFIG_TIME_INTERPOLATION |
1363 | 1364 | ||
1364 | struct time_interpolator *time_interpolator; | 1365 | struct time_interpolator *time_interpolator; |
1365 | static struct time_interpolator *time_interpolator_list; | 1366 | static struct time_interpolator *time_interpolator_list; |
1366 | static DEFINE_SPINLOCK(time_interpolator_lock); | 1367 | static DEFINE_SPINLOCK(time_interpolator_lock); |
1367 | 1368 | ||
1368 | static inline u64 time_interpolator_get_cycles(unsigned int src) | 1369 | static inline u64 time_interpolator_get_cycles(unsigned int src) |
1369 | { | 1370 | { |
1370 | unsigned long (*x)(void); | 1371 | unsigned long (*x)(void); |
1371 | 1372 | ||
1372 | switch (src) | 1373 | switch (src) |
1373 | { | 1374 | { |
1374 | case TIME_SOURCE_FUNCTION: | 1375 | case TIME_SOURCE_FUNCTION: |
1375 | x = time_interpolator->addr; | 1376 | x = time_interpolator->addr; |
1376 | return x(); | 1377 | return x(); |
1377 | 1378 | ||
1378 | case TIME_SOURCE_MMIO64 : | 1379 | case TIME_SOURCE_MMIO64 : |
1379 | return readq((void __iomem *) time_interpolator->addr); | 1380 | return readq((void __iomem *) time_interpolator->addr); |
1380 | 1381 | ||
1381 | case TIME_SOURCE_MMIO32 : | 1382 | case TIME_SOURCE_MMIO32 : |
1382 | return readl((void __iomem *) time_interpolator->addr); | 1383 | return readl((void __iomem *) time_interpolator->addr); |
1383 | 1384 | ||
1384 | default: return get_cycles(); | 1385 | default: return get_cycles(); |
1385 | } | 1386 | } |
1386 | } | 1387 | } |
1387 | 1388 | ||
1388 | static inline u64 time_interpolator_get_counter(int writelock) | 1389 | static inline u64 time_interpolator_get_counter(int writelock) |
1389 | { | 1390 | { |
1390 | unsigned int src = time_interpolator->source; | 1391 | unsigned int src = time_interpolator->source; |
1391 | 1392 | ||
1392 | if (time_interpolator->jitter) | 1393 | if (time_interpolator->jitter) |
1393 | { | 1394 | { |
1394 | u64 lcycle; | 1395 | u64 lcycle; |
1395 | u64 now; | 1396 | u64 now; |
1396 | 1397 | ||
1397 | do { | 1398 | do { |
1398 | lcycle = time_interpolator->last_cycle; | 1399 | lcycle = time_interpolator->last_cycle; |
1399 | now = time_interpolator_get_cycles(src); | 1400 | now = time_interpolator_get_cycles(src); |
1400 | if (lcycle && time_after(lcycle, now)) | 1401 | if (lcycle && time_after(lcycle, now)) |
1401 | return lcycle; | 1402 | return lcycle; |
1402 | 1403 | ||
1403 | /* When holding the xtime write lock, there's no need | 1404 | /* When holding the xtime write lock, there's no need |
1404 | * to add the overhead of the cmpxchg. Readers are | 1405 | * to add the overhead of the cmpxchg. Readers are |
1405 | * forced to retry until the write lock is released. | 1406 | * forced to retry until the write lock is released. |
1406 | */ | 1407 | */ |
1407 | if (writelock) { | 1408 | if (writelock) { |
1408 | time_interpolator->last_cycle = now; | 1409 | time_interpolator->last_cycle = now; |
1409 | return now; | 1410 | return now; |
1410 | } | 1411 | } |
1411 | /* Keep track of the last timer value returned. The use of cmpxchg here | 1412 | /* Keep track of the last timer value returned. The use of cmpxchg here |
1412 | * will cause contention in an SMP environment. | 1413 | * will cause contention in an SMP environment. |
1413 | */ | 1414 | */ |
1414 | } while (unlikely(cmpxchg(&time_interpolator->last_cycle, lcycle, now) != lcycle)); | 1415 | } while (unlikely(cmpxchg(&time_interpolator->last_cycle, lcycle, now) != lcycle)); |
1415 | return now; | 1416 | return now; |
1416 | } | 1417 | } |
1417 | else | 1418 | else |
1418 | return time_interpolator_get_cycles(src); | 1419 | return time_interpolator_get_cycles(src); |
1419 | } | 1420 | } |
1420 | 1421 | ||
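
Stripped of the interpolator specifics, the cmpxchg loop above is a generic "never go backwards" sample of a shared counter; a sketch with a hypothetical helper name:

	static u64 monotonic_sample(u64 *last, u64 now)
	{
		u64 prev;

		for (;;) {
			prev = *last;
			if (now <= prev)
				return prev;	/* someone already published a later value */
			if (cmpxchg(last, prev, now) == prev)
				return now;	/* we advanced the shared value ourselves */
		}
	}
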
1421 | void time_interpolator_reset(void) | 1422 | void time_interpolator_reset(void) |
1422 | { | 1423 | { |
1423 | time_interpolator->offset = 0; | 1424 | time_interpolator->offset = 0; |
1424 | time_interpolator->last_counter = time_interpolator_get_counter(1); | 1425 | time_interpolator->last_counter = time_interpolator_get_counter(1); |
1425 | } | 1426 | } |
1426 | 1427 | ||
1427 | #define GET_TI_NSECS(count,i) (((((count) - i->last_counter) & (i)->mask) * (i)->nsec_per_cyc) >> (i)->shift) | 1428 | #define GET_TI_NSECS(count,i) (((((count) - i->last_counter) & (i)->mask) * (i)->nsec_per_cyc) >> (i)->shift) |
1428 | 1429 | ||
1429 | unsigned long time_interpolator_get_offset(void) | 1430 | unsigned long time_interpolator_get_offset(void) |
1430 | { | 1431 | { |
1431 | /* If we do not have a time interpolator set up then just return zero */ | 1432 | /* If we do not have a time interpolator set up then just return zero */ |
1432 | if (!time_interpolator) | 1433 | if (!time_interpolator) |
1433 | return 0; | 1434 | return 0; |
1434 | 1435 | ||
1435 | return time_interpolator->offset + | 1436 | return time_interpolator->offset + |
1436 | GET_TI_NSECS(time_interpolator_get_counter(0), time_interpolator); | 1437 | GET_TI_NSECS(time_interpolator_get_counter(0), time_interpolator); |
1437 | } | 1438 | } |
1438 | 1439 | ||
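
GET_TI_NSECS() above is a fixed-point conversion: the elapsed cycle count since last_counter (masked to the width of the hardware counter) is multiplied by nsec_per_cyc, which register_time_interpolator() below precomputes as (NSEC_PER_SEC << shift) / frequency, and the result is shifted back down, yielding nanoseconds.
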
1439 | #define INTERPOLATOR_ADJUST 65536 | 1440 | #define INTERPOLATOR_ADJUST 65536 |
1440 | #define INTERPOLATOR_MAX_SKIP 10*INTERPOLATOR_ADJUST | 1441 | #define INTERPOLATOR_MAX_SKIP 10*INTERPOLATOR_ADJUST |
1441 | 1442 | ||
1442 | static void time_interpolator_update(long delta_nsec) | 1443 | static void time_interpolator_update(long delta_nsec) |
1443 | { | 1444 | { |
1444 | u64 counter; | 1445 | u64 counter; |
1445 | unsigned long offset; | 1446 | unsigned long offset; |
1446 | 1447 | ||
1447 | /* If there is no time interpolator set up then do nothing */ | 1448 | /* If there is no time interpolator set up then do nothing */ |
1448 | if (!time_interpolator) | 1449 | if (!time_interpolator) |
1449 | return; | 1450 | return; |
1450 | 1451 | ||
1451 | /* | 1452 | /* |
1452 | * The interpolator compensates for late ticks by accumulating the late | 1453 | * The interpolator compensates for late ticks by accumulating the late |
1453 | * time in time_interpolator->offset. A tick earlier than expected will | 1454 | * time in time_interpolator->offset. A tick earlier than expected will |
1454 | * lead to a reset of the offset and a corresponding jump of the clock | 1455 | * lead to a reset of the offset and a corresponding jump of the clock |
1455 | * forward. Again this only works if the interpolator clock is running | 1456 | * forward. Again this only works if the interpolator clock is running |
1456 | * slightly slower than the regular clock and the tuning logic ensures | 1457 | * slightly slower than the regular clock and the tuning logic ensures |
1457 | * that. | 1458 | * that. |
1458 | */ | 1459 | */ |
1459 | 1460 | ||
1460 | counter = time_interpolator_get_counter(1); | 1461 | counter = time_interpolator_get_counter(1); |
1461 | offset = time_interpolator->offset + | 1462 | offset = time_interpolator->offset + |
1462 | GET_TI_NSECS(counter, time_interpolator); | 1463 | GET_TI_NSECS(counter, time_interpolator); |
1463 | 1464 | ||
1464 | if (delta_nsec < 0 || (unsigned long) delta_nsec < offset) | 1465 | if (delta_nsec < 0 || (unsigned long) delta_nsec < offset) |
1465 | time_interpolator->offset = offset - delta_nsec; | 1466 | time_interpolator->offset = offset - delta_nsec; |
1466 | else { | 1467 | else { |
1467 | time_interpolator->skips++; | 1468 | time_interpolator->skips++; |
1468 | time_interpolator->ns_skipped += delta_nsec - offset; | 1469 | time_interpolator->ns_skipped += delta_nsec - offset; |
1469 | time_interpolator->offset = 0; | 1470 | time_interpolator->offset = 0; |
1470 | } | 1471 | } |
1471 | time_interpolator->last_counter = counter; | 1472 | time_interpolator->last_counter = counter; |
1472 | 1473 | ||
1473 | /* Tuning logic for time interpolator invoked every minute or so. | 1474 | /* Tuning logic for time interpolator invoked every minute or so. |
1474 | * Decrease interpolator clock speed if no skips occurred and an offset is carried. | 1475 | * Decrease interpolator clock speed if no skips occurred and an offset is carried. |
1475 | * Increase interpolator clock speed if we skip too much time. | 1476 | * Increase interpolator clock speed if we skip too much time. |
1476 | */ | 1477 | */ |
1477 | if (jiffies % INTERPOLATOR_ADJUST == 0) | 1478 | if (jiffies % INTERPOLATOR_ADJUST == 0) |
1478 | { | 1479 | { |
1479 | if (time_interpolator->skips == 0 && time_interpolator->offset > TICK_NSEC) | 1480 | if (time_interpolator->skips == 0 && time_interpolator->offset > TICK_NSEC) |
1480 | time_interpolator->nsec_per_cyc--; | 1481 | time_interpolator->nsec_per_cyc--; |
1481 | if (time_interpolator->ns_skipped > INTERPOLATOR_MAX_SKIP && time_interpolator->offset == 0) | 1482 | if (time_interpolator->ns_skipped > INTERPOLATOR_MAX_SKIP && time_interpolator->offset == 0) |
1482 | time_interpolator->nsec_per_cyc++; | 1483 | time_interpolator->nsec_per_cyc++; |
1483 | time_interpolator->skips = 0; | 1484 | time_interpolator->skips = 0; |
1484 | time_interpolator->ns_skipped = 0; | 1485 | time_interpolator->ns_skipped = 0; |
1485 | } | 1486 | } |
1486 | } | 1487 | } |
1487 | 1488 | ||
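
As a quick sanity check on the comment above: with HZ=1000 the jiffies % INTERPOLATOR_ADJUST == 0 test fires every 65536 jiffies, i.e. roughly every 65 seconds, matching the promised "every minute or so"; at HZ=250 the interval would stretch to about four and a half minutes.
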
1488 | static inline int | 1489 | static inline int |
1489 | is_better_time_interpolator(struct time_interpolator *new) | 1490 | is_better_time_interpolator(struct time_interpolator *new) |
1490 | { | 1491 | { |
1491 | if (!time_interpolator) | 1492 | if (!time_interpolator) |
1492 | return 1; | 1493 | return 1; |
1493 | return new->frequency > 2*time_interpolator->frequency || | 1494 | return new->frequency > 2*time_interpolator->frequency || |
1494 | (unsigned long)new->drift < (unsigned long)time_interpolator->drift; | 1495 | (unsigned long)new->drift < (unsigned long)time_interpolator->drift; |
1495 | } | 1496 | } |
1496 | 1497 | ||
1497 | void | 1498 | void |
1498 | register_time_interpolator(struct time_interpolator *ti) | 1499 | register_time_interpolator(struct time_interpolator *ti) |
1499 | { | 1500 | { |
1500 | unsigned long flags; | 1501 | unsigned long flags; |
1501 | 1502 | ||
1502 | /* Sanity check */ | 1503 | /* Sanity check */ |
1503 | if (ti->frequency == 0 || ti->mask == 0) | 1504 | if (ti->frequency == 0 || ti->mask == 0) |
1504 | BUG(); | 1505 | BUG(); |
1505 | 1506 | ||
1506 | ti->nsec_per_cyc = ((u64)NSEC_PER_SEC << ti->shift) / ti->frequency; | 1507 | ti->nsec_per_cyc = ((u64)NSEC_PER_SEC << ti->shift) / ti->frequency; |
1507 | spin_lock(&time_interpolator_lock); | 1508 | spin_lock(&time_interpolator_lock); |
1508 | write_seqlock_irqsave(&xtime_lock, flags); | 1509 | write_seqlock_irqsave(&xtime_lock, flags); |
1509 | if (is_better_time_interpolator(ti)) { | 1510 | if (is_better_time_interpolator(ti)) { |
1510 | time_interpolator = ti; | 1511 | time_interpolator = ti; |
1511 | time_interpolator_reset(); | 1512 | time_interpolator_reset(); |
1512 | } | 1513 | } |
1513 | write_sequnlock_irqrestore(&xtime_lock, flags); | 1514 | write_sequnlock_irqrestore(&xtime_lock, flags); |
1514 | 1515 | ||
1515 | ti->next = time_interpolator_list; | 1516 | ti->next = time_interpolator_list; |
1516 | time_interpolator_list = ti; | 1517 | time_interpolator_list = ti; |
1517 | spin_unlock(&time_interpolator_lock); | 1518 | spin_unlock(&time_interpolator_lock); |
1518 | } | 1519 | } |
1519 | 1520 | ||
1520 | void | 1521 | void |
1521 | unregister_time_interpolator(struct time_interpolator *ti) | 1522 | unregister_time_interpolator(struct time_interpolator *ti) |
1522 | { | 1523 | { |
1523 | struct time_interpolator *curr, **prev; | 1524 | struct time_interpolator *curr, **prev; |
1524 | unsigned long flags; | 1525 | unsigned long flags; |
1525 | 1526 | ||
1526 | spin_lock(&time_interpolator_lock); | 1527 | spin_lock(&time_interpolator_lock); |
1527 | prev = &time_interpolator_list; | 1528 | prev = &time_interpolator_list; |
1528 | for (curr = *prev; curr; curr = curr->next) { | 1529 | for (curr = *prev; curr; curr = curr->next) { |
1529 | if (curr == ti) { | 1530 | if (curr == ti) { |
1530 | *prev = curr->next; | 1531 | *prev = curr->next; |
1531 | break; | 1532 | break; |
1532 | } | 1533 | } |
1533 | prev = &curr->next; | 1534 | prev = &curr->next; |
1534 | } | 1535 | } |
1535 | 1536 | ||
1536 | write_seqlock_irqsave(&xtime_lock, flags); | 1537 | write_seqlock_irqsave(&xtime_lock, flags); |
1537 | if (ti == time_interpolator) { | 1538 | if (ti == time_interpolator) { |
1538 | /* we lost the best time-interpolator: */ | 1539 | /* we lost the best time-interpolator: */ |
1539 | time_interpolator = NULL; | 1540 | time_interpolator = NULL; |
1540 | /* find the next-best interpolator */ | 1541 | /* find the next-best interpolator */ |
1541 | for (curr = time_interpolator_list; curr; curr = curr->next) | 1542 | for (curr = time_interpolator_list; curr; curr = curr->next) |
1542 | if (is_better_time_interpolator(curr)) | 1543 | if (is_better_time_interpolator(curr)) |
1543 | time_interpolator = curr; | 1544 | time_interpolator = curr; |
1544 | time_interpolator_reset(); | 1545 | time_interpolator_reset(); |
1545 | } | 1546 | } |
1546 | write_sequnlock_irqrestore(&xtime_lock, flags); | 1547 | write_sequnlock_irqrestore(&xtime_lock, flags); |
1547 | spin_unlock(&time_interpolator_lock); | 1548 | spin_unlock(&time_interpolator_lock); |
1548 | } | 1549 | } |
1549 | #endif /* CONFIG_TIME_INTERPOLATION */ | 1550 | #endif /* CONFIG_TIME_INTERPOLATION */ |
1550 | 1551 | ||
1551 | /** | 1552 | /** |
1552 | * msleep - sleep safely even with waitqueue interruptions | 1553 | * msleep - sleep safely even with waitqueue interruptions |
1553 | * @msecs: Time in milliseconds to sleep for | 1554 | * @msecs: Time in milliseconds to sleep for |
1554 | */ | 1555 | */ |
1555 | void msleep(unsigned int msecs) | 1556 | void msleep(unsigned int msecs) |
1556 | { | 1557 | { |
1557 | unsigned long timeout = msecs_to_jiffies(msecs) + 1; | 1558 | unsigned long timeout = msecs_to_jiffies(msecs) + 1; |
1558 | 1559 | ||
1559 | while (timeout) | 1560 | while (timeout) |
1560 | timeout = schedule_timeout_uninterruptible(timeout); | 1561 | timeout = schedule_timeout_uninterruptible(timeout); |
1561 | } | 1562 | } |
1562 | 1563 | ||
1563 | EXPORT_SYMBOL(msleep); | 1564 | EXPORT_SYMBOL(msleep); |
1564 | 1565 | ||
1565 | /** | 1566 | /** |
1566 | * msleep_interruptible - sleep waiting for signals | 1567 | * msleep_interruptible - sleep waiting for signals |
1567 | * @msecs: Time in milliseconds to sleep for | 1568 | * @msecs: Time in milliseconds to sleep for |
1568 | */ | 1569 | */ |
1569 | unsigned long msleep_interruptible(unsigned int msecs) | 1570 | unsigned long msleep_interruptible(unsigned int msecs) |
1570 | { | 1571 | { |
1571 | unsigned long timeout = msecs_to_jiffies(msecs) + 1; | 1572 | unsigned long timeout = msecs_to_jiffies(msecs) + 1; |
1572 | 1573 | ||
1573 | while (timeout && !signal_pending(current)) | 1574 | while (timeout && !signal_pending(current)) |
1574 | timeout = schedule_timeout_interruptible(timeout); | 1575 | timeout = schedule_timeout_interruptible(timeout); |
1575 | return jiffies_to_msecs(timeout); | 1576 | return jiffies_to_msecs(timeout); |
1576 | } | 1577 | } |
1577 | 1578 | ||
1578 | EXPORT_SYMBOL(msleep_interruptible); | 1579 | EXPORT_SYMBOL(msleep_interruptible); |
1579 | 1580 |
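
Finally, a caller-side sketch for the two helpers above (the function name is hypothetical): use msleep() when the delay must not be cut short, and msleep_interruptible() when a pending signal should win:

	static int settle_hardware(void)
	{
		/* back out early if the calling task is signalled during the wait */
		if (msleep_interruptible(50))
			return -EINTR;
		msleep(10);	/* short uninterruptible tail delay */
		return 0;
	}
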