Commit 97a41e26124330e41aa10ef88cd1711bc3d17460
Committed by Linus Torvalds
1 parent b7b4d7a466
Exists in master and in 7 other branches
[PATCH] kernel/: small cleanups
This patch contains the following cleanups:

- make needlessly global functions static
- every file should include the headers containing the prototypes for its global functions

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Acked-by: "Paul E. McKenney" <paulmck@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Showing 4 changed files with 5 additions and 2 deletions
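For context, a minimal sketch of the cleanup pattern described in the commit message (illustrative only; foo.h, foo.c and the function names are hypothetical and not taken from this commit): a helper used by a single file is marked static, and the .c file includes its own header so the compiler checks the prototypes of the functions that remain global.

    /* foo.h -- public interface (hypothetical example, not part of this commit) */
    #ifndef FOO_H
    #define FOO_H
    int foo_process(int value);	/* prototype for the function other files call */
    #endif

    /* foo.c */
    #include "foo.h"		/* include our own header so the prototype above is checked */

    /* Only used inside this file, so it is static rather than needlessly global. */
    static int foo_scale(int value)
    {
    	return value * 2;
    }

    int foo_process(int value)	/* a mismatch with foo.h is now a compile error */
    {
    	return foo_scale(value) + 1;
    }

In the diff below, the same idea is applied to kauditd_thread() in kernel/audit.c, which becomes static (line 270).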
kernel/audit.c
1 | /* audit.c -- Auditing support | 1 | /* audit.c -- Auditing support |
2 | * Gateway between the kernel (e.g., selinux) and the user-space audit daemon. | 2 | * Gateway between the kernel (e.g., selinux) and the user-space audit daemon. |
3 | * System-call specific features have moved to auditsc.c | 3 | * System-call specific features have moved to auditsc.c |
4 | * | 4 | * |
5 | * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. | 5 | * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. |
6 | * All Rights Reserved. | 6 | * All Rights Reserved. |
7 | * | 7 | * |
8 | * This program is free software; you can redistribute it and/or modify | 8 | * This program is free software; you can redistribute it and/or modify |
9 | * it under the terms of the GNU General Public License as published by | 9 | * it under the terms of the GNU General Public License as published by |
10 | * the Free Software Foundation; either version 2 of the License, or | 10 | * the Free Software Foundation; either version 2 of the License, or |
11 | * (at your option) any later version. | 11 | * (at your option) any later version. |
12 | * | 12 | * |
13 | * This program is distributed in the hope that it will be useful, | 13 | * This program is distributed in the hope that it will be useful, |
14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
16 | * GNU General Public License for more details. | 16 | * GNU General Public License for more details. |
17 | * | 17 | * |
18 | * You should have received a copy of the GNU General Public License | 18 | * You should have received a copy of the GNU General Public License |
19 | * along with this program; if not, write to the Free Software | 19 | * along with this program; if not, write to the Free Software |
20 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | 20 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
21 | * | 21 | * |
22 | * Written by Rickard E. (Rik) Faith <faith@redhat.com> | 22 | * Written by Rickard E. (Rik) Faith <faith@redhat.com> |
23 | * | 23 | * |
24 | * Goals: 1) Integrate fully with SELinux. | 24 | * Goals: 1) Integrate fully with SELinux. |
25 | * 2) Minimal run-time overhead: | 25 | * 2) Minimal run-time overhead: |
26 | * a) Minimal when syscall auditing is disabled (audit_enable=0). | 26 | * a) Minimal when syscall auditing is disabled (audit_enable=0). |
27 | * b) Small when syscall auditing is enabled and no audit record | 27 | * b) Small when syscall auditing is enabled and no audit record |
28 | * is generated (defer as much work as possible to record | 28 | * is generated (defer as much work as possible to record |
29 | * generation time): | 29 | * generation time): |
30 | * i) context is allocated, | 30 | * i) context is allocated, |
31 | * ii) names from getname are stored without a copy, and | 31 | * ii) names from getname are stored without a copy, and |
32 | * iii) inode information stored from path_lookup. | 32 | * iii) inode information stored from path_lookup. |
33 | * 3) Ability to disable syscall auditing at boot time (audit=0). | 33 | * 3) Ability to disable syscall auditing at boot time (audit=0). |
34 | * 4) Usable by other parts of the kernel (if audit_log* is called, | 34 | * 4) Usable by other parts of the kernel (if audit_log* is called, |
35 | * then a syscall record will be generated automatically for the | 35 | * then a syscall record will be generated automatically for the |
36 | * current syscall). | 36 | * current syscall). |
37 | * 5) Netlink interface to user-space. | 37 | * 5) Netlink interface to user-space. |
38 | * 6) Support low-overhead kernel-based filtering to minimize the | 38 | * 6) Support low-overhead kernel-based filtering to minimize the |
39 | * information that must be passed to user-space. | 39 | * information that must be passed to user-space. |
40 | * | 40 | * |
41 | * Example user-space utilities: http://people.redhat.com/sgrubb/audit/ | 41 | * Example user-space utilities: http://people.redhat.com/sgrubb/audit/ |
42 | */ | 42 | */ |
43 | 43 | ||
44 | #include <linux/init.h> | 44 | #include <linux/init.h> |
45 | #include <asm/atomic.h> | 45 | #include <asm/atomic.h> |
46 | #include <asm/types.h> | 46 | #include <asm/types.h> |
47 | #include <linux/mm.h> | 47 | #include <linux/mm.h> |
48 | #include <linux/module.h> | 48 | #include <linux/module.h> |
49 | #include <linux/err.h> | 49 | #include <linux/err.h> |
50 | #include <linux/kthread.h> | 50 | #include <linux/kthread.h> |
51 | 51 | ||
52 | #include <linux/audit.h> | 52 | #include <linux/audit.h> |
53 | 53 | ||
54 | #include <net/sock.h> | 54 | #include <net/sock.h> |
55 | #include <linux/skbuff.h> | 55 | #include <linux/skbuff.h> |
56 | #include <linux/netlink.h> | 56 | #include <linux/netlink.h> |
57 | 57 | ||
58 | /* No auditing will take place until audit_initialized != 0. | 58 | /* No auditing will take place until audit_initialized != 0. |
59 | * (Initialization happens after skb_init is called.) */ | 59 | * (Initialization happens after skb_init is called.) */ |
60 | static int audit_initialized; | 60 | static int audit_initialized; |
61 | 61 | ||
62 | /* No syscall auditing will take place unless audit_enabled != 0. */ | 62 | /* No syscall auditing will take place unless audit_enabled != 0. */ |
63 | int audit_enabled; | 63 | int audit_enabled; |
64 | 64 | ||
65 | /* Default state when kernel boots without any parameters. */ | 65 | /* Default state when kernel boots without any parameters. */ |
66 | static int audit_default; | 66 | static int audit_default; |
67 | 67 | ||
68 | /* If auditing cannot proceed, audit_failure selects what happens. */ | 68 | /* If auditing cannot proceed, audit_failure selects what happens. */ |
69 | static int audit_failure = AUDIT_FAIL_PRINTK; | 69 | static int audit_failure = AUDIT_FAIL_PRINTK; |
70 | 70 | ||
71 | /* If audit records are to be written to the netlink socket, audit_pid | 71 | /* If audit records are to be written to the netlink socket, audit_pid |
72 | * contains the (non-zero) pid. */ | 72 | * contains the (non-zero) pid. */ |
73 | int audit_pid; | 73 | int audit_pid; |
74 | 74 | ||
75 | /* If audit_limit is non-zero, limit the rate of sending audit records | 75 | /* If audit_limit is non-zero, limit the rate of sending audit records |
76 | * to that number per second. This prevents DoS attacks, but results in | 76 | * to that number per second. This prevents DoS attacks, but results in |
77 | * audit records being dropped. */ | 77 | * audit records being dropped. */ |
78 | static int audit_rate_limit; | 78 | static int audit_rate_limit; |
79 | 79 | ||
80 | /* Number of outstanding audit_buffers allowed. */ | 80 | /* Number of outstanding audit_buffers allowed. */ |
81 | static int audit_backlog_limit = 64; | 81 | static int audit_backlog_limit = 64; |
82 | static int audit_backlog_wait_time = 60 * HZ; | 82 | static int audit_backlog_wait_time = 60 * HZ; |
83 | static int audit_backlog_wait_overflow = 0; | 83 | static int audit_backlog_wait_overflow = 0; |
84 | 84 | ||
85 | /* The identity of the user shutting down the audit system. */ | 85 | /* The identity of the user shutting down the audit system. */ |
86 | uid_t audit_sig_uid = -1; | 86 | uid_t audit_sig_uid = -1; |
87 | pid_t audit_sig_pid = -1; | 87 | pid_t audit_sig_pid = -1; |
88 | 88 | ||
89 | /* Records can be lost in several ways: | 89 | /* Records can be lost in several ways: |
90 | 0) [suppressed in audit_alloc] | 90 | 0) [suppressed in audit_alloc] |
91 | 1) out of memory in audit_log_start [kmalloc of struct audit_buffer] | 91 | 1) out of memory in audit_log_start [kmalloc of struct audit_buffer] |
92 | 2) out of memory in audit_log_move [alloc_skb] | 92 | 2) out of memory in audit_log_move [alloc_skb] |
93 | 3) suppressed due to audit_rate_limit | 93 | 3) suppressed due to audit_rate_limit |
94 | 4) suppressed due to audit_backlog_limit | 94 | 4) suppressed due to audit_backlog_limit |
95 | */ | 95 | */ |
96 | static atomic_t audit_lost = ATOMIC_INIT(0); | 96 | static atomic_t audit_lost = ATOMIC_INIT(0); |
97 | 97 | ||
98 | /* The netlink socket. */ | 98 | /* The netlink socket. */ |
99 | static struct sock *audit_sock; | 99 | static struct sock *audit_sock; |
100 | 100 | ||
101 | /* The audit_freelist is a list of pre-allocated audit buffers (if more | 101 | /* The audit_freelist is a list of pre-allocated audit buffers (if more |
102 | * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of | 102 | * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of |
103 | * being placed on the freelist). */ | 103 | * being placed on the freelist). */ |
104 | static DEFINE_SPINLOCK(audit_freelist_lock); | 104 | static DEFINE_SPINLOCK(audit_freelist_lock); |
105 | static int audit_freelist_count = 0; | 105 | static int audit_freelist_count = 0; |
106 | static LIST_HEAD(audit_freelist); | 106 | static LIST_HEAD(audit_freelist); |
107 | 107 | ||
108 | static struct sk_buff_head audit_skb_queue; | 108 | static struct sk_buff_head audit_skb_queue; |
109 | static struct task_struct *kauditd_task; | 109 | static struct task_struct *kauditd_task; |
110 | static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait); | 110 | static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait); |
111 | static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait); | 111 | static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait); |
112 | 112 | ||
113 | /* The netlink socket is only to be read by 1 CPU, which lets us assume | 113 | /* The netlink socket is only to be read by 1 CPU, which lets us assume |
114 | * that list additions and deletions never happen simultaneously in | 114 | * that list additions and deletions never happen simultaneously in |
115 | * auditsc.c */ | 115 | * auditsc.c */ |
116 | DECLARE_MUTEX(audit_netlink_sem); | 116 | DECLARE_MUTEX(audit_netlink_sem); |
117 | 117 | ||
118 | /* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting | 118 | /* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting |
119 | * audit records. Since printk uses a 1024 byte buffer, this buffer | 119 | * audit records. Since printk uses a 1024 byte buffer, this buffer |
120 | * should be at least that large. */ | 120 | * should be at least that large. */ |
121 | #define AUDIT_BUFSIZ 1024 | 121 | #define AUDIT_BUFSIZ 1024 |
122 | 122 | ||
123 | /* AUDIT_MAXFREE is the number of empty audit_buffers we keep on the | 123 | /* AUDIT_MAXFREE is the number of empty audit_buffers we keep on the |
124 | * audit_freelist. Doing so eliminates many kmalloc/kfree calls. */ | 124 | * audit_freelist. Doing so eliminates many kmalloc/kfree calls. */ |
125 | #define AUDIT_MAXFREE (2*NR_CPUS) | 125 | #define AUDIT_MAXFREE (2*NR_CPUS) |
126 | 126 | ||
127 | /* The audit_buffer is used when formatting an audit record. The caller | 127 | /* The audit_buffer is used when formatting an audit record. The caller |
128 | * locks briefly to get the record off the freelist or to allocate the | 128 | * locks briefly to get the record off the freelist or to allocate the |
129 | * buffer, and locks briefly to send the buffer to the netlink layer or | 129 | * buffer, and locks briefly to send the buffer to the netlink layer or |
130 | * to place it on a transmit queue. Multiple audit_buffers can be in | 130 | * to place it on a transmit queue. Multiple audit_buffers can be in |
131 | * use simultaneously. */ | 131 | * use simultaneously. */ |
132 | struct audit_buffer { | 132 | struct audit_buffer { |
133 | struct list_head list; | 133 | struct list_head list; |
134 | struct sk_buff *skb; /* formatted skb ready to send */ | 134 | struct sk_buff *skb; /* formatted skb ready to send */ |
135 | struct audit_context *ctx; /* NULL or associated context */ | 135 | struct audit_context *ctx; /* NULL or associated context */ |
136 | gfp_t gfp_mask; | 136 | gfp_t gfp_mask; |
137 | }; | 137 | }; |
138 | 138 | ||
139 | static void audit_set_pid(struct audit_buffer *ab, pid_t pid) | 139 | static void audit_set_pid(struct audit_buffer *ab, pid_t pid) |
140 | { | 140 | { |
141 | struct nlmsghdr *nlh = (struct nlmsghdr *)ab->skb->data; | 141 | struct nlmsghdr *nlh = (struct nlmsghdr *)ab->skb->data; |
142 | nlh->nlmsg_pid = pid; | 142 | nlh->nlmsg_pid = pid; |
143 | } | 143 | } |
144 | 144 | ||
145 | static void audit_panic(const char *message) | 145 | static void audit_panic(const char *message) |
146 | { | 146 | { |
147 | switch (audit_failure) | 147 | switch (audit_failure) |
148 | { | 148 | { |
149 | case AUDIT_FAIL_SILENT: | 149 | case AUDIT_FAIL_SILENT: |
150 | break; | 150 | break; |
151 | case AUDIT_FAIL_PRINTK: | 151 | case AUDIT_FAIL_PRINTK: |
152 | printk(KERN_ERR "audit: %s\n", message); | 152 | printk(KERN_ERR "audit: %s\n", message); |
153 | break; | 153 | break; |
154 | case AUDIT_FAIL_PANIC: | 154 | case AUDIT_FAIL_PANIC: |
155 | panic("audit: %s\n", message); | 155 | panic("audit: %s\n", message); |
156 | break; | 156 | break; |
157 | } | 157 | } |
158 | } | 158 | } |
159 | 159 | ||
160 | static inline int audit_rate_check(void) | 160 | static inline int audit_rate_check(void) |
161 | { | 161 | { |
162 | static unsigned long last_check = 0; | 162 | static unsigned long last_check = 0; |
163 | static int messages = 0; | 163 | static int messages = 0; |
164 | static DEFINE_SPINLOCK(lock); | 164 | static DEFINE_SPINLOCK(lock); |
165 | unsigned long flags; | 165 | unsigned long flags; |
166 | unsigned long now; | 166 | unsigned long now; |
167 | unsigned long elapsed; | 167 | unsigned long elapsed; |
168 | int retval = 0; | 168 | int retval = 0; |
169 | 169 | ||
170 | if (!audit_rate_limit) return 1; | 170 | if (!audit_rate_limit) return 1; |
171 | 171 | ||
172 | spin_lock_irqsave(&lock, flags); | 172 | spin_lock_irqsave(&lock, flags); |
173 | if (++messages < audit_rate_limit) { | 173 | if (++messages < audit_rate_limit) { |
174 | retval = 1; | 174 | retval = 1; |
175 | } else { | 175 | } else { |
176 | now = jiffies; | 176 | now = jiffies; |
177 | elapsed = now - last_check; | 177 | elapsed = now - last_check; |
178 | if (elapsed > HZ) { | 178 | if (elapsed > HZ) { |
179 | last_check = now; | 179 | last_check = now; |
180 | messages = 0; | 180 | messages = 0; |
181 | retval = 1; | 181 | retval = 1; |
182 | } | 182 | } |
183 | } | 183 | } |
184 | spin_unlock_irqrestore(&lock, flags); | 184 | spin_unlock_irqrestore(&lock, flags); |
185 | 185 | ||
186 | return retval; | 186 | return retval; |
187 | } | 187 | } |
188 | 188 | ||
189 | /* Emit at least 1 message per second, even if audit_rate_check is | 189 | /* Emit at least 1 message per second, even if audit_rate_check is |
190 | * throttling. */ | 190 | * throttling. */ |
191 | void audit_log_lost(const char *message) | 191 | void audit_log_lost(const char *message) |
192 | { | 192 | { |
193 | static unsigned long last_msg = 0; | 193 | static unsigned long last_msg = 0; |
194 | static DEFINE_SPINLOCK(lock); | 194 | static DEFINE_SPINLOCK(lock); |
195 | unsigned long flags; | 195 | unsigned long flags; |
196 | unsigned long now; | 196 | unsigned long now; |
197 | int print; | 197 | int print; |
198 | 198 | ||
199 | atomic_inc(&audit_lost); | 199 | atomic_inc(&audit_lost); |
200 | 200 | ||
201 | print = (audit_failure == AUDIT_FAIL_PANIC || !audit_rate_limit); | 201 | print = (audit_failure == AUDIT_FAIL_PANIC || !audit_rate_limit); |
202 | 202 | ||
203 | if (!print) { | 203 | if (!print) { |
204 | spin_lock_irqsave(&lock, flags); | 204 | spin_lock_irqsave(&lock, flags); |
205 | now = jiffies; | 205 | now = jiffies; |
206 | if (now - last_msg > HZ) { | 206 | if (now - last_msg > HZ) { |
207 | print = 1; | 207 | print = 1; |
208 | last_msg = now; | 208 | last_msg = now; |
209 | } | 209 | } |
210 | spin_unlock_irqrestore(&lock, flags); | 210 | spin_unlock_irqrestore(&lock, flags); |
211 | } | 211 | } |
212 | 212 | ||
213 | if (print) { | 213 | if (print) { |
214 | printk(KERN_WARNING | 214 | printk(KERN_WARNING |
215 | "audit: audit_lost=%d audit_rate_limit=%d audit_backlog_limit=%d\n", | 215 | "audit: audit_lost=%d audit_rate_limit=%d audit_backlog_limit=%d\n", |
216 | atomic_read(&audit_lost), | 216 | atomic_read(&audit_lost), |
217 | audit_rate_limit, | 217 | audit_rate_limit, |
218 | audit_backlog_limit); | 218 | audit_backlog_limit); |
219 | audit_panic(message); | 219 | audit_panic(message); |
220 | } | 220 | } |
221 | 221 | ||
222 | } | 222 | } |
223 | 223 | ||
224 | static int audit_set_rate_limit(int limit, uid_t loginuid) | 224 | static int audit_set_rate_limit(int limit, uid_t loginuid) |
225 | { | 225 | { |
226 | int old = audit_rate_limit; | 226 | int old = audit_rate_limit; |
227 | audit_rate_limit = limit; | 227 | audit_rate_limit = limit; |
228 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | 228 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, |
229 | "audit_rate_limit=%d old=%d by auid=%u", | 229 | "audit_rate_limit=%d old=%d by auid=%u", |
230 | audit_rate_limit, old, loginuid); | 230 | audit_rate_limit, old, loginuid); |
231 | return old; | 231 | return old; |
232 | } | 232 | } |
233 | 233 | ||
234 | static int audit_set_backlog_limit(int limit, uid_t loginuid) | 234 | static int audit_set_backlog_limit(int limit, uid_t loginuid) |
235 | { | 235 | { |
236 | int old = audit_backlog_limit; | 236 | int old = audit_backlog_limit; |
237 | audit_backlog_limit = limit; | 237 | audit_backlog_limit = limit; |
238 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | 238 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, |
239 | "audit_backlog_limit=%d old=%d by auid=%u", | 239 | "audit_backlog_limit=%d old=%d by auid=%u", |
240 | audit_backlog_limit, old, loginuid); | 240 | audit_backlog_limit, old, loginuid); |
241 | return old; | 241 | return old; |
242 | } | 242 | } |
243 | 243 | ||
244 | static int audit_set_enabled(int state, uid_t loginuid) | 244 | static int audit_set_enabled(int state, uid_t loginuid) |
245 | { | 245 | { |
246 | int old = audit_enabled; | 246 | int old = audit_enabled; |
247 | if (state != 0 && state != 1) | 247 | if (state != 0 && state != 1) |
248 | return -EINVAL; | 248 | return -EINVAL; |
249 | audit_enabled = state; | 249 | audit_enabled = state; |
250 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | 250 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, |
251 | "audit_enabled=%d old=%d by auid=%u", | 251 | "audit_enabled=%d old=%d by auid=%u", |
252 | audit_enabled, old, loginuid); | 252 | audit_enabled, old, loginuid); |
253 | return old; | 253 | return old; |
254 | } | 254 | } |
255 | 255 | ||
256 | static int audit_set_failure(int state, uid_t loginuid) | 256 | static int audit_set_failure(int state, uid_t loginuid) |
257 | { | 257 | { |
258 | int old = audit_failure; | 258 | int old = audit_failure; |
259 | if (state != AUDIT_FAIL_SILENT | 259 | if (state != AUDIT_FAIL_SILENT |
260 | && state != AUDIT_FAIL_PRINTK | 260 | && state != AUDIT_FAIL_PRINTK |
261 | && state != AUDIT_FAIL_PANIC) | 261 | && state != AUDIT_FAIL_PANIC) |
262 | return -EINVAL; | 262 | return -EINVAL; |
263 | audit_failure = state; | 263 | audit_failure = state; |
264 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | 264 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, |
265 | "audit_failure=%d old=%d by auid=%u", | 265 | "audit_failure=%d old=%d by auid=%u", |
266 | audit_failure, old, loginuid); | 266 | audit_failure, old, loginuid); |
267 | return old; | 267 | return old; |
268 | } | 268 | } |
269 | 269 | ||
270 | int kauditd_thread(void *dummy) | 270 | static int kauditd_thread(void *dummy) |
271 | { | 271 | { |
272 | struct sk_buff *skb; | 272 | struct sk_buff *skb; |
273 | 273 | ||
274 | while (1) { | 274 | while (1) { |
275 | skb = skb_dequeue(&audit_skb_queue); | 275 | skb = skb_dequeue(&audit_skb_queue); |
276 | wake_up(&audit_backlog_wait); | 276 | wake_up(&audit_backlog_wait); |
277 | if (skb) { | 277 | if (skb) { |
278 | if (audit_pid) { | 278 | if (audit_pid) { |
279 | int err = netlink_unicast(audit_sock, skb, audit_pid, 0); | 279 | int err = netlink_unicast(audit_sock, skb, audit_pid, 0); |
280 | if (err < 0) { | 280 | if (err < 0) { |
281 | BUG_ON(err != -ECONNREFUSED); /* Shoudn't happen */ | 281 | BUG_ON(err != -ECONNREFUSED); /* Shoudn't happen */ |
282 | printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); | 282 | printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); |
283 | audit_pid = 0; | 283 | audit_pid = 0; |
284 | } | 284 | } |
285 | } else { | 285 | } else { |
286 | printk(KERN_NOTICE "%s\n", skb->data + NLMSG_SPACE(0)); | 286 | printk(KERN_NOTICE "%s\n", skb->data + NLMSG_SPACE(0)); |
287 | kfree_skb(skb); | 287 | kfree_skb(skb); |
288 | } | 288 | } |
289 | } else { | 289 | } else { |
290 | DECLARE_WAITQUEUE(wait, current); | 290 | DECLARE_WAITQUEUE(wait, current); |
291 | set_current_state(TASK_INTERRUPTIBLE); | 291 | set_current_state(TASK_INTERRUPTIBLE); |
292 | add_wait_queue(&kauditd_wait, &wait); | 292 | add_wait_queue(&kauditd_wait, &wait); |
293 | 293 | ||
294 | if (!skb_queue_len(&audit_skb_queue)) { | 294 | if (!skb_queue_len(&audit_skb_queue)) { |
295 | try_to_freeze(); | 295 | try_to_freeze(); |
296 | schedule(); | 296 | schedule(); |
297 | } | 297 | } |
298 | 298 | ||
299 | __set_current_state(TASK_RUNNING); | 299 | __set_current_state(TASK_RUNNING); |
300 | remove_wait_queue(&kauditd_wait, &wait); | 300 | remove_wait_queue(&kauditd_wait, &wait); |
301 | } | 301 | } |
302 | } | 302 | } |
303 | } | 303 | } |
304 | 304 | ||
305 | void audit_send_reply(int pid, int seq, int type, int done, int multi, | 305 | void audit_send_reply(int pid, int seq, int type, int done, int multi, |
306 | void *payload, int size) | 306 | void *payload, int size) |
307 | { | 307 | { |
308 | struct sk_buff *skb; | 308 | struct sk_buff *skb; |
309 | struct nlmsghdr *nlh; | 309 | struct nlmsghdr *nlh; |
310 | int len = NLMSG_SPACE(size); | 310 | int len = NLMSG_SPACE(size); |
311 | void *data; | 311 | void *data; |
312 | int flags = multi ? NLM_F_MULTI : 0; | 312 | int flags = multi ? NLM_F_MULTI : 0; |
313 | int t = done ? NLMSG_DONE : type; | 313 | int t = done ? NLMSG_DONE : type; |
314 | 314 | ||
315 | skb = alloc_skb(len, GFP_KERNEL); | 315 | skb = alloc_skb(len, GFP_KERNEL); |
316 | if (!skb) | 316 | if (!skb) |
317 | return; | 317 | return; |
318 | 318 | ||
319 | nlh = NLMSG_PUT(skb, pid, seq, t, size); | 319 | nlh = NLMSG_PUT(skb, pid, seq, t, size); |
320 | nlh->nlmsg_flags = flags; | 320 | nlh->nlmsg_flags = flags; |
321 | data = NLMSG_DATA(nlh); | 321 | data = NLMSG_DATA(nlh); |
322 | memcpy(data, payload, size); | 322 | memcpy(data, payload, size); |
323 | 323 | ||
324 | /* Ignore failure. It'll only happen if the sender goes away, | 324 | /* Ignore failure. It'll only happen if the sender goes away, |
325 | because our timeout is set to infinite. */ | 325 | because our timeout is set to infinite. */ |
326 | netlink_unicast(audit_sock, skb, pid, 0); | 326 | netlink_unicast(audit_sock, skb, pid, 0); |
327 | return; | 327 | return; |
328 | 328 | ||
329 | nlmsg_failure: /* Used by NLMSG_PUT */ | 329 | nlmsg_failure: /* Used by NLMSG_PUT */ |
330 | if (skb) | 330 | if (skb) |
331 | kfree_skb(skb); | 331 | kfree_skb(skb); |
332 | } | 332 | } |
333 | 333 | ||
334 | /* | 334 | /* |
335 | * Check for appropriate CAP_AUDIT_ capabilities on incoming audit | 335 | * Check for appropriate CAP_AUDIT_ capabilities on incoming audit |
336 | * control messages. | 336 | * control messages. |
337 | */ | 337 | */ |
338 | static int audit_netlink_ok(kernel_cap_t eff_cap, u16 msg_type) | 338 | static int audit_netlink_ok(kernel_cap_t eff_cap, u16 msg_type) |
339 | { | 339 | { |
340 | int err = 0; | 340 | int err = 0; |
341 | 341 | ||
342 | switch (msg_type) { | 342 | switch (msg_type) { |
343 | case AUDIT_GET: | 343 | case AUDIT_GET: |
344 | case AUDIT_LIST: | 344 | case AUDIT_LIST: |
345 | case AUDIT_SET: | 345 | case AUDIT_SET: |
346 | case AUDIT_ADD: | 346 | case AUDIT_ADD: |
347 | case AUDIT_DEL: | 347 | case AUDIT_DEL: |
348 | case AUDIT_SIGNAL_INFO: | 348 | case AUDIT_SIGNAL_INFO: |
349 | if (!cap_raised(eff_cap, CAP_AUDIT_CONTROL)) | 349 | if (!cap_raised(eff_cap, CAP_AUDIT_CONTROL)) |
350 | err = -EPERM; | 350 | err = -EPERM; |
351 | break; | 351 | break; |
352 | case AUDIT_USER: | 352 | case AUDIT_USER: |
353 | case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG: | 353 | case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG: |
354 | if (!cap_raised(eff_cap, CAP_AUDIT_WRITE)) | 354 | if (!cap_raised(eff_cap, CAP_AUDIT_WRITE)) |
355 | err = -EPERM; | 355 | err = -EPERM; |
356 | break; | 356 | break; |
357 | default: /* bad msg */ | 357 | default: /* bad msg */ |
358 | err = -EINVAL; | 358 | err = -EINVAL; |
359 | } | 359 | } |
360 | 360 | ||
361 | return err; | 361 | return err; |
362 | } | 362 | } |
363 | 363 | ||
364 | static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | 364 | static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) |
365 | { | 365 | { |
366 | u32 uid, pid, seq; | 366 | u32 uid, pid, seq; |
367 | void *data; | 367 | void *data; |
368 | struct audit_status *status_get, status_set; | 368 | struct audit_status *status_get, status_set; |
369 | int err; | 369 | int err; |
370 | struct audit_buffer *ab; | 370 | struct audit_buffer *ab; |
371 | u16 msg_type = nlh->nlmsg_type; | 371 | u16 msg_type = nlh->nlmsg_type; |
372 | uid_t loginuid; /* loginuid of sender */ | 372 | uid_t loginuid; /* loginuid of sender */ |
373 | struct audit_sig_info sig_data; | 373 | struct audit_sig_info sig_data; |
374 | 374 | ||
375 | err = audit_netlink_ok(NETLINK_CB(skb).eff_cap, msg_type); | 375 | err = audit_netlink_ok(NETLINK_CB(skb).eff_cap, msg_type); |
376 | if (err) | 376 | if (err) |
377 | return err; | 377 | return err; |
378 | 378 | ||
379 | /* As soon as there's any sign of userspace auditd, start kauditd to talk to it */ | 379 | /* As soon as there's any sign of userspace auditd, start kauditd to talk to it */ |
380 | if (!kauditd_task) | 380 | if (!kauditd_task) |
381 | kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd"); | 381 | kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd"); |
382 | if (IS_ERR(kauditd_task)) { | 382 | if (IS_ERR(kauditd_task)) { |
383 | err = PTR_ERR(kauditd_task); | 383 | err = PTR_ERR(kauditd_task); |
384 | kauditd_task = NULL; | 384 | kauditd_task = NULL; |
385 | return err; | 385 | return err; |
386 | } | 386 | } |
387 | 387 | ||
388 | pid = NETLINK_CREDS(skb)->pid; | 388 | pid = NETLINK_CREDS(skb)->pid; |
389 | uid = NETLINK_CREDS(skb)->uid; | 389 | uid = NETLINK_CREDS(skb)->uid; |
390 | loginuid = NETLINK_CB(skb).loginuid; | 390 | loginuid = NETLINK_CB(skb).loginuid; |
391 | seq = nlh->nlmsg_seq; | 391 | seq = nlh->nlmsg_seq; |
392 | data = NLMSG_DATA(nlh); | 392 | data = NLMSG_DATA(nlh); |
393 | 393 | ||
394 | switch (msg_type) { | 394 | switch (msg_type) { |
395 | case AUDIT_GET: | 395 | case AUDIT_GET: |
396 | status_set.enabled = audit_enabled; | 396 | status_set.enabled = audit_enabled; |
397 | status_set.failure = audit_failure; | 397 | status_set.failure = audit_failure; |
398 | status_set.pid = audit_pid; | 398 | status_set.pid = audit_pid; |
399 | status_set.rate_limit = audit_rate_limit; | 399 | status_set.rate_limit = audit_rate_limit; |
400 | status_set.backlog_limit = audit_backlog_limit; | 400 | status_set.backlog_limit = audit_backlog_limit; |
401 | status_set.lost = atomic_read(&audit_lost); | 401 | status_set.lost = atomic_read(&audit_lost); |
402 | status_set.backlog = skb_queue_len(&audit_skb_queue); | 402 | status_set.backlog = skb_queue_len(&audit_skb_queue); |
403 | audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_GET, 0, 0, | 403 | audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_GET, 0, 0, |
404 | &status_set, sizeof(status_set)); | 404 | &status_set, sizeof(status_set)); |
405 | break; | 405 | break; |
406 | case AUDIT_SET: | 406 | case AUDIT_SET: |
407 | if (nlh->nlmsg_len < sizeof(struct audit_status)) | 407 | if (nlh->nlmsg_len < sizeof(struct audit_status)) |
408 | return -EINVAL; | 408 | return -EINVAL; |
409 | status_get = (struct audit_status *)data; | 409 | status_get = (struct audit_status *)data; |
410 | if (status_get->mask & AUDIT_STATUS_ENABLED) { | 410 | if (status_get->mask & AUDIT_STATUS_ENABLED) { |
411 | err = audit_set_enabled(status_get->enabled, loginuid); | 411 | err = audit_set_enabled(status_get->enabled, loginuid); |
412 | if (err < 0) return err; | 412 | if (err < 0) return err; |
413 | } | 413 | } |
414 | if (status_get->mask & AUDIT_STATUS_FAILURE) { | 414 | if (status_get->mask & AUDIT_STATUS_FAILURE) { |
415 | err = audit_set_failure(status_get->failure, loginuid); | 415 | err = audit_set_failure(status_get->failure, loginuid); |
416 | if (err < 0) return err; | 416 | if (err < 0) return err; |
417 | } | 417 | } |
418 | if (status_get->mask & AUDIT_STATUS_PID) { | 418 | if (status_get->mask & AUDIT_STATUS_PID) { |
419 | int old = audit_pid; | 419 | int old = audit_pid; |
420 | audit_pid = status_get->pid; | 420 | audit_pid = status_get->pid; |
421 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | 421 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, |
422 | "audit_pid=%d old=%d by auid=%u", | 422 | "audit_pid=%d old=%d by auid=%u", |
423 | audit_pid, old, loginuid); | 423 | audit_pid, old, loginuid); |
424 | } | 424 | } |
425 | if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) | 425 | if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) |
426 | audit_set_rate_limit(status_get->rate_limit, loginuid); | 426 | audit_set_rate_limit(status_get->rate_limit, loginuid); |
427 | if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT) | 427 | if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT) |
428 | audit_set_backlog_limit(status_get->backlog_limit, | 428 | audit_set_backlog_limit(status_get->backlog_limit, |
429 | loginuid); | 429 | loginuid); |
430 | break; | 430 | break; |
431 | case AUDIT_USER: | 431 | case AUDIT_USER: |
432 | case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG: | 432 | case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG: |
433 | if (!audit_enabled && msg_type != AUDIT_USER_AVC) | 433 | if (!audit_enabled && msg_type != AUDIT_USER_AVC) |
434 | return 0; | 434 | return 0; |
435 | 435 | ||
436 | err = audit_filter_user(&NETLINK_CB(skb), msg_type); | 436 | err = audit_filter_user(&NETLINK_CB(skb), msg_type); |
437 | if (err == 1) { | 437 | if (err == 1) { |
438 | err = 0; | 438 | err = 0; |
439 | ab = audit_log_start(NULL, GFP_KERNEL, msg_type); | 439 | ab = audit_log_start(NULL, GFP_KERNEL, msg_type); |
440 | if (ab) { | 440 | if (ab) { |
441 | audit_log_format(ab, | 441 | audit_log_format(ab, |
442 | "user pid=%d uid=%u auid=%u msg='%.1024s'", | 442 | "user pid=%d uid=%u auid=%u msg='%.1024s'", |
443 | pid, uid, loginuid, (char *)data); | 443 | pid, uid, loginuid, (char *)data); |
444 | audit_set_pid(ab, pid); | 444 | audit_set_pid(ab, pid); |
445 | audit_log_end(ab); | 445 | audit_log_end(ab); |
446 | } | 446 | } |
447 | } | 447 | } |
448 | break; | 448 | break; |
449 | case AUDIT_ADD: | 449 | case AUDIT_ADD: |
450 | case AUDIT_DEL: | 450 | case AUDIT_DEL: |
451 | if (nlh->nlmsg_len < sizeof(struct audit_rule)) | 451 | if (nlh->nlmsg_len < sizeof(struct audit_rule)) |
452 | return -EINVAL; | 452 | return -EINVAL; |
453 | /* fallthrough */ | 453 | /* fallthrough */ |
454 | case AUDIT_LIST: | 454 | case AUDIT_LIST: |
455 | err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid, | 455 | err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid, |
456 | uid, seq, data, loginuid); | 456 | uid, seq, data, loginuid); |
457 | break; | 457 | break; |
458 | case AUDIT_SIGNAL_INFO: | 458 | case AUDIT_SIGNAL_INFO: |
459 | sig_data.uid = audit_sig_uid; | 459 | sig_data.uid = audit_sig_uid; |
460 | sig_data.pid = audit_sig_pid; | 460 | sig_data.pid = audit_sig_pid; |
461 | audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO, | 461 | audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO, |
462 | 0, 0, &sig_data, sizeof(sig_data)); | 462 | 0, 0, &sig_data, sizeof(sig_data)); |
463 | break; | 463 | break; |
464 | default: | 464 | default: |
465 | err = -EINVAL; | 465 | err = -EINVAL; |
466 | break; | 466 | break; |
467 | } | 467 | } |
468 | 468 | ||
469 | return err < 0 ? err : 0; | 469 | return err < 0 ? err : 0; |
470 | } | 470 | } |
471 | 471 | ||
472 | /* Get message from skb (based on rtnetlink_rcv_skb). Each message is | 472 | /* Get message from skb (based on rtnetlink_rcv_skb). Each message is |
473 | * processed by audit_receive_msg. Malformed skbs with wrong length are | 473 | * processed by audit_receive_msg. Malformed skbs with wrong length are |
474 | * discarded silently. */ | 474 | * discarded silently. */ |
475 | static void audit_receive_skb(struct sk_buff *skb) | 475 | static void audit_receive_skb(struct sk_buff *skb) |
476 | { | 476 | { |
477 | int err; | 477 | int err; |
478 | struct nlmsghdr *nlh; | 478 | struct nlmsghdr *nlh; |
479 | u32 rlen; | 479 | u32 rlen; |
480 | 480 | ||
481 | while (skb->len >= NLMSG_SPACE(0)) { | 481 | while (skb->len >= NLMSG_SPACE(0)) { |
482 | nlh = (struct nlmsghdr *)skb->data; | 482 | nlh = (struct nlmsghdr *)skb->data; |
483 | if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) | 483 | if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) |
484 | return; | 484 | return; |
485 | rlen = NLMSG_ALIGN(nlh->nlmsg_len); | 485 | rlen = NLMSG_ALIGN(nlh->nlmsg_len); |
486 | if (rlen > skb->len) | 486 | if (rlen > skb->len) |
487 | rlen = skb->len; | 487 | rlen = skb->len; |
488 | if ((err = audit_receive_msg(skb, nlh))) { | 488 | if ((err = audit_receive_msg(skb, nlh))) { |
489 | netlink_ack(skb, nlh, err); | 489 | netlink_ack(skb, nlh, err); |
490 | } else if (nlh->nlmsg_flags & NLM_F_ACK) | 490 | } else if (nlh->nlmsg_flags & NLM_F_ACK) |
491 | netlink_ack(skb, nlh, 0); | 491 | netlink_ack(skb, nlh, 0); |
492 | skb_pull(skb, rlen); | 492 | skb_pull(skb, rlen); |
493 | } | 493 | } |
494 | } | 494 | } |
495 | 495 | ||
496 | /* Receive messages from netlink socket. */ | 496 | /* Receive messages from netlink socket. */ |
497 | static void audit_receive(struct sock *sk, int length) | 497 | static void audit_receive(struct sock *sk, int length) |
498 | { | 498 | { |
499 | struct sk_buff *skb; | 499 | struct sk_buff *skb; |
500 | unsigned int qlen; | 500 | unsigned int qlen; |
501 | 501 | ||
502 | down(&audit_netlink_sem); | 502 | down(&audit_netlink_sem); |
503 | 503 | ||
504 | for (qlen = skb_queue_len(&sk->sk_receive_queue); qlen; qlen--) { | 504 | for (qlen = skb_queue_len(&sk->sk_receive_queue); qlen; qlen--) { |
505 | skb = skb_dequeue(&sk->sk_receive_queue); | 505 | skb = skb_dequeue(&sk->sk_receive_queue); |
506 | audit_receive_skb(skb); | 506 | audit_receive_skb(skb); |
507 | kfree_skb(skb); | 507 | kfree_skb(skb); |
508 | } | 508 | } |
509 | up(&audit_netlink_sem); | 509 | up(&audit_netlink_sem); |
510 | } | 510 | } |
511 | 511 | ||
512 | 512 | ||
513 | /* Initialize audit support at boot time. */ | 513 | /* Initialize audit support at boot time. */ |
514 | static int __init audit_init(void) | 514 | static int __init audit_init(void) |
515 | { | 515 | { |
516 | printk(KERN_INFO "audit: initializing netlink socket (%s)\n", | 516 | printk(KERN_INFO "audit: initializing netlink socket (%s)\n", |
517 | audit_default ? "enabled" : "disabled"); | 517 | audit_default ? "enabled" : "disabled"); |
518 | audit_sock = netlink_kernel_create(NETLINK_AUDIT, 0, audit_receive, | 518 | audit_sock = netlink_kernel_create(NETLINK_AUDIT, 0, audit_receive, |
519 | THIS_MODULE); | 519 | THIS_MODULE); |
520 | if (!audit_sock) | 520 | if (!audit_sock) |
521 | audit_panic("cannot initialize netlink socket"); | 521 | audit_panic("cannot initialize netlink socket"); |
522 | 522 | ||
523 | audit_sock->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; | 523 | audit_sock->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; |
524 | skb_queue_head_init(&audit_skb_queue); | 524 | skb_queue_head_init(&audit_skb_queue); |
525 | audit_initialized = 1; | 525 | audit_initialized = 1; |
526 | audit_enabled = audit_default; | 526 | audit_enabled = audit_default; |
527 | audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized"); | 527 | audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized"); |
528 | return 0; | 528 | return 0; |
529 | } | 529 | } |
530 | __initcall(audit_init); | 530 | __initcall(audit_init); |
531 | 531 | ||
532 | /* Process kernel command-line parameter at boot time. audit=0 or audit=1. */ | 532 | /* Process kernel command-line parameter at boot time. audit=0 or audit=1. */ |
533 | static int __init audit_enable(char *str) | 533 | static int __init audit_enable(char *str) |
534 | { | 534 | { |
535 | audit_default = !!simple_strtol(str, NULL, 0); | 535 | audit_default = !!simple_strtol(str, NULL, 0); |
536 | printk(KERN_INFO "audit: %s%s\n", | 536 | printk(KERN_INFO "audit: %s%s\n", |
537 | audit_default ? "enabled" : "disabled", | 537 | audit_default ? "enabled" : "disabled", |
538 | audit_initialized ? "" : " (after initialization)"); | 538 | audit_initialized ? "" : " (after initialization)"); |
539 | if (audit_initialized) | 539 | if (audit_initialized) |
540 | audit_enabled = audit_default; | 540 | audit_enabled = audit_default; |
541 | return 0; | 541 | return 0; |
542 | } | 542 | } |
543 | 543 | ||
544 | __setup("audit=", audit_enable); | 544 | __setup("audit=", audit_enable); |
545 | 545 | ||
546 | static void audit_buffer_free(struct audit_buffer *ab) | 546 | static void audit_buffer_free(struct audit_buffer *ab) |
547 | { | 547 | { |
548 | unsigned long flags; | 548 | unsigned long flags; |
549 | 549 | ||
550 | if (!ab) | 550 | if (!ab) |
551 | return; | 551 | return; |
552 | 552 | ||
553 | if (ab->skb) | 553 | if (ab->skb) |
554 | kfree_skb(ab->skb); | 554 | kfree_skb(ab->skb); |
555 | 555 | ||
556 | spin_lock_irqsave(&audit_freelist_lock, flags); | 556 | spin_lock_irqsave(&audit_freelist_lock, flags); |
557 | if (++audit_freelist_count > AUDIT_MAXFREE) | 557 | if (++audit_freelist_count > AUDIT_MAXFREE) |
558 | kfree(ab); | 558 | kfree(ab); |
559 | else | 559 | else |
560 | list_add(&ab->list, &audit_freelist); | 560 | list_add(&ab->list, &audit_freelist); |
561 | spin_unlock_irqrestore(&audit_freelist_lock, flags); | 561 | spin_unlock_irqrestore(&audit_freelist_lock, flags); |
562 | } | 562 | } |
563 | 563 | ||
564 | static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx, | 564 | static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx, |
565 | gfp_t gfp_mask, int type) | 565 | gfp_t gfp_mask, int type) |
566 | { | 566 | { |
567 | unsigned long flags; | 567 | unsigned long flags; |
568 | struct audit_buffer *ab = NULL; | 568 | struct audit_buffer *ab = NULL; |
569 | struct nlmsghdr *nlh; | 569 | struct nlmsghdr *nlh; |
570 | 570 | ||
571 | spin_lock_irqsave(&audit_freelist_lock, flags); | 571 | spin_lock_irqsave(&audit_freelist_lock, flags); |
572 | if (!list_empty(&audit_freelist)) { | 572 | if (!list_empty(&audit_freelist)) { |
573 | ab = list_entry(audit_freelist.next, | 573 | ab = list_entry(audit_freelist.next, |
574 | struct audit_buffer, list); | 574 | struct audit_buffer, list); |
575 | list_del(&ab->list); | 575 | list_del(&ab->list); |
576 | --audit_freelist_count; | 576 | --audit_freelist_count; |
577 | } | 577 | } |
578 | spin_unlock_irqrestore(&audit_freelist_lock, flags); | 578 | spin_unlock_irqrestore(&audit_freelist_lock, flags); |
579 | 579 | ||
580 | if (!ab) { | 580 | if (!ab) { |
581 | ab = kmalloc(sizeof(*ab), gfp_mask); | 581 | ab = kmalloc(sizeof(*ab), gfp_mask); |
582 | if (!ab) | 582 | if (!ab) |
583 | goto err; | 583 | goto err; |
584 | } | 584 | } |
585 | 585 | ||
586 | ab->skb = alloc_skb(AUDIT_BUFSIZ, gfp_mask); | 586 | ab->skb = alloc_skb(AUDIT_BUFSIZ, gfp_mask); |
587 | if (!ab->skb) | 587 | if (!ab->skb) |
588 | goto err; | 588 | goto err; |
589 | 589 | ||
590 | ab->ctx = ctx; | 590 | ab->ctx = ctx; |
591 | ab->gfp_mask = gfp_mask; | 591 | ab->gfp_mask = gfp_mask; |
592 | nlh = (struct nlmsghdr *)skb_put(ab->skb, NLMSG_SPACE(0)); | 592 | nlh = (struct nlmsghdr *)skb_put(ab->skb, NLMSG_SPACE(0)); |
593 | nlh->nlmsg_type = type; | 593 | nlh->nlmsg_type = type; |
594 | nlh->nlmsg_flags = 0; | 594 | nlh->nlmsg_flags = 0; |
595 | nlh->nlmsg_pid = 0; | 595 | nlh->nlmsg_pid = 0; |
596 | nlh->nlmsg_seq = 0; | 596 | nlh->nlmsg_seq = 0; |
597 | return ab; | 597 | return ab; |
598 | err: | 598 | err: |
599 | audit_buffer_free(ab); | 599 | audit_buffer_free(ab); |
600 | return NULL; | 600 | return NULL; |
601 | } | 601 | } |
602 | 602 | ||
603 | /* Compute a serial number for the audit record. Audit records are | 603 | /* Compute a serial number for the audit record. Audit records are |
604 | * written to user-space as soon as they are generated, so a complete | 604 | * written to user-space as soon as they are generated, so a complete |
605 | * audit record may be written in several pieces. The timestamp of the | 605 | * audit record may be written in several pieces. The timestamp of the |
606 | * record and this serial number are used by the user-space tools to | 606 | * record and this serial number are used by the user-space tools to |
607 | * determine which pieces belong to the same audit record. The | 607 | * determine which pieces belong to the same audit record. The |
608 | * (timestamp,serial) tuple is unique for each syscall and is live from | 608 | * (timestamp,serial) tuple is unique for each syscall and is live from |
609 | * syscall entry to syscall exit. | 609 | * syscall entry to syscall exit. |
610 | * | 610 | * |
611 | * NOTE: Another possibility is to store the formatted records off the | 611 | * NOTE: Another possibility is to store the formatted records off the |
612 | * audit context (for those records that have a context), and emit them | 612 | * audit context (for those records that have a context), and emit them |
613 | * all at syscall exit. However, this could delay the reporting of | 613 | * all at syscall exit. However, this could delay the reporting of |
614 | * significant errors until syscall exit (or never, if the system | 614 | * significant errors until syscall exit (or never, if the system |
615 | * halts). */ | 615 | * halts). */ |
616 | 616 | ||
617 | unsigned int audit_serial(void) | 617 | unsigned int audit_serial(void) |
618 | { | 618 | { |
619 | static spinlock_t serial_lock = SPIN_LOCK_UNLOCKED; | 619 | static spinlock_t serial_lock = SPIN_LOCK_UNLOCKED; |
620 | static unsigned int serial = 0; | 620 | static unsigned int serial = 0; |
621 | 621 | ||
622 | unsigned long flags; | 622 | unsigned long flags; |
623 | unsigned int ret; | 623 | unsigned int ret; |
624 | 624 | ||
625 | spin_lock_irqsave(&serial_lock, flags); | 625 | spin_lock_irqsave(&serial_lock, flags); |
626 | do { | 626 | do { |
627 | ret = ++serial; | 627 | ret = ++serial; |
628 | } while (unlikely(!ret)); | 628 | } while (unlikely(!ret)); |
629 | spin_unlock_irqrestore(&serial_lock, flags); | 629 | spin_unlock_irqrestore(&serial_lock, flags); |
630 | 630 | ||
631 | return ret; | 631 | return ret; |
632 | } | 632 | } |
633 | 633 | ||
634 | static inline void audit_get_stamp(struct audit_context *ctx, | 634 | static inline void audit_get_stamp(struct audit_context *ctx, |
635 | struct timespec *t, unsigned int *serial) | 635 | struct timespec *t, unsigned int *serial) |
636 | { | 636 | { |
637 | if (ctx) | 637 | if (ctx) |
638 | auditsc_get_stamp(ctx, t, serial); | 638 | auditsc_get_stamp(ctx, t, serial); |
639 | else { | 639 | else { |
640 | *t = CURRENT_TIME; | 640 | *t = CURRENT_TIME; |
641 | *serial = audit_serial(); | 641 | *serial = audit_serial(); |
642 | } | 642 | } |
643 | } | 643 | } |
644 | 644 | ||
645 | /* Obtain an audit buffer. This routine does locking to obtain the | 645 | /* Obtain an audit buffer. This routine does locking to obtain the |
646 | * audit buffer, but then no locking is required for calls to | 646 | * audit buffer, but then no locking is required for calls to |
647 | * audit_log_*format. If the tsk is a task that is currently in a | 647 | * audit_log_*format. If the tsk is a task that is currently in a |
648 | * syscall, then the syscall is marked as auditable and an audit record | 648 | * syscall, then the syscall is marked as auditable and an audit record |
649 | * will be written at syscall exit. If there is no associated task, tsk | 649 | * will be written at syscall exit. If there is no associated task, tsk |
650 | * should be NULL. */ | 650 | * should be NULL. */ |
651 | 651 | ||
652 | struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, | 652 | struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, |
653 | int type) | 653 | int type) |
654 | { | 654 | { |
655 | struct audit_buffer *ab = NULL; | 655 | struct audit_buffer *ab = NULL; |
656 | struct timespec t; | 656 | struct timespec t; |
657 | unsigned int serial; | 657 | unsigned int serial; |
658 | int reserve; | 658 | int reserve; |
659 | unsigned long timeout_start = jiffies; | 659 | unsigned long timeout_start = jiffies; |
660 | 660 | ||
661 | if (!audit_initialized) | 661 | if (!audit_initialized) |
662 | return NULL; | 662 | return NULL; |
663 | 663 | ||
664 | if (gfp_mask & __GFP_WAIT) | 664 | if (gfp_mask & __GFP_WAIT) |
665 | reserve = 0; | 665 | reserve = 0; |
666 | else | 666 | else |
667 | reserve = 5; /* Allow atomic callers to go up to five | 667 | reserve = 5; /* Allow atomic callers to go up to five |
668 | entries over the normal backlog limit */ | 668 | entries over the normal backlog limit */ |
669 | 669 | ||
670 | while (audit_backlog_limit | 670 | while (audit_backlog_limit |
671 | && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) { | 671 | && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) { |
672 | if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time | 672 | if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time |
673 | && time_before(jiffies, timeout_start + audit_backlog_wait_time)) { | 673 | && time_before(jiffies, timeout_start + audit_backlog_wait_time)) { |
674 | 674 | ||
675 | /* Wait for auditd to drain the queue a little */ | 675 | /* Wait for auditd to drain the queue a little */ |
676 | DECLARE_WAITQUEUE(wait, current); | 676 | DECLARE_WAITQUEUE(wait, current); |
677 | set_current_state(TASK_INTERRUPTIBLE); | 677 | set_current_state(TASK_INTERRUPTIBLE); |
678 | add_wait_queue(&audit_backlog_wait, &wait); | 678 | add_wait_queue(&audit_backlog_wait, &wait); |
679 | 679 | ||
680 | if (audit_backlog_limit && | 680 | if (audit_backlog_limit && |
681 | skb_queue_len(&audit_skb_queue) > audit_backlog_limit) | 681 | skb_queue_len(&audit_skb_queue) > audit_backlog_limit) |
682 | schedule_timeout(timeout_start + audit_backlog_wait_time - jiffies); | 682 | schedule_timeout(timeout_start + audit_backlog_wait_time - jiffies); |
683 | 683 | ||
684 | __set_current_state(TASK_RUNNING); | 684 | __set_current_state(TASK_RUNNING); |
685 | remove_wait_queue(&audit_backlog_wait, &wait); | 685 | remove_wait_queue(&audit_backlog_wait, &wait); |
686 | continue; | 686 | continue; |
687 | } | 687 | } |
688 | if (audit_rate_check()) | 688 | if (audit_rate_check()) |
689 | printk(KERN_WARNING | 689 | printk(KERN_WARNING |
690 | "audit: audit_backlog=%d > " | 690 | "audit: audit_backlog=%d > " |
691 | "audit_backlog_limit=%d\n", | 691 | "audit_backlog_limit=%d\n", |
692 | skb_queue_len(&audit_skb_queue), | 692 | skb_queue_len(&audit_skb_queue), |
693 | audit_backlog_limit); | 693 | audit_backlog_limit); |
694 | audit_log_lost("backlog limit exceeded"); | 694 | audit_log_lost("backlog limit exceeded"); |
695 | audit_backlog_wait_time = audit_backlog_wait_overflow; | 695 | audit_backlog_wait_time = audit_backlog_wait_overflow; |
696 | wake_up(&audit_backlog_wait); | 696 | wake_up(&audit_backlog_wait); |
697 | return NULL; | 697 | return NULL; |
698 | } | 698 | } |
699 | 699 | ||
700 | ab = audit_buffer_alloc(ctx, gfp_mask, type); | 700 | ab = audit_buffer_alloc(ctx, gfp_mask, type); |
701 | if (!ab) { | 701 | if (!ab) { |
702 | audit_log_lost("out of memory in audit_log_start"); | 702 | audit_log_lost("out of memory in audit_log_start"); |
703 | return NULL; | 703 | return NULL; |
704 | } | 704 | } |
705 | 705 | ||
706 | audit_get_stamp(ab->ctx, &t, &serial); | 706 | audit_get_stamp(ab->ctx, &t, &serial); |
707 | 707 | ||
708 | audit_log_format(ab, "audit(%lu.%03lu:%u): ", | 708 | audit_log_format(ab, "audit(%lu.%03lu:%u): ", |
709 | t.tv_sec, t.tv_nsec/1000000, serial); | 709 | t.tv_sec, t.tv_nsec/1000000, serial); |
710 | return ab; | 710 | return ab; |
711 | } | 711 | } |
712 | 712 | ||
713 | /** | 713 | /** |
714 | * audit_expand - expand skb in the audit buffer | 714 | * audit_expand - expand skb in the audit buffer |
715 | * @ab: audit_buffer | 715 | * @ab: audit_buffer |
716 | * | 716 | * |
717 | * Returns 0 (no space) on failed expansion, or available space if | 717 | * Returns 0 (no space) on failed expansion, or available space if |
718 | * successful. | 718 | * successful. |
719 | */ | 719 | */ |
720 | static inline int audit_expand(struct audit_buffer *ab, int extra) | 720 | static inline int audit_expand(struct audit_buffer *ab, int extra) |
721 | { | 721 | { |
722 | struct sk_buff *skb = ab->skb; | 722 | struct sk_buff *skb = ab->skb; |
723 | int ret = pskb_expand_head(skb, skb_headroom(skb), extra, | 723 | int ret = pskb_expand_head(skb, skb_headroom(skb), extra, |
724 | ab->gfp_mask); | 724 | ab->gfp_mask); |
725 | if (ret < 0) { | 725 | if (ret < 0) { |
726 | audit_log_lost("out of memory in audit_expand"); | 726 | audit_log_lost("out of memory in audit_expand"); |
727 | return 0; | 727 | return 0; |
728 | } | 728 | } |
729 | return skb_tailroom(skb); | 729 | return skb_tailroom(skb); |
730 | } | 730 | } |
731 | 731 | ||
732 | /* Format an audit message into the audit buffer. If there isn't enough | 732 | /* Format an audit message into the audit buffer. If there isn't enough |
733 | * room in the audit buffer, more room will be allocated and vsnprint | 733 | * room in the audit buffer, more room will be allocated and vsnprint |
734 | * will be called a second time. Currently, we assume that a printk | 734 | * will be called a second time. Currently, we assume that a printk |
735 | * can't format message larger than 1024 bytes, so we don't either. */ | 735 | * can't format message larger than 1024 bytes, so we don't either. */ |
736 | static void audit_log_vformat(struct audit_buffer *ab, const char *fmt, | 736 | static void audit_log_vformat(struct audit_buffer *ab, const char *fmt, |
737 | va_list args) | 737 | va_list args) |
738 | { | 738 | { |
739 | int len, avail; | 739 | int len, avail; |
740 | struct sk_buff *skb; | 740 | struct sk_buff *skb; |
741 | va_list args2; | 741 | va_list args2; |
742 | 742 | ||
743 | if (!ab) | 743 | if (!ab) |
744 | return; | 744 | return; |
745 | 745 | ||
746 | BUG_ON(!ab->skb); | 746 | BUG_ON(!ab->skb); |
747 | skb = ab->skb; | 747 | skb = ab->skb; |
748 | avail = skb_tailroom(skb); | 748 | avail = skb_tailroom(skb); |
749 | if (avail == 0) { | 749 | if (avail == 0) { |
750 | avail = audit_expand(ab, AUDIT_BUFSIZ); | 750 | avail = audit_expand(ab, AUDIT_BUFSIZ); |
751 | if (!avail) | 751 | if (!avail) |
752 | goto out; | 752 | goto out; |
753 | } | 753 | } |
754 | va_copy(args2, args); | 754 | va_copy(args2, args); |
755 | len = vsnprintf(skb->tail, avail, fmt, args); | 755 | len = vsnprintf(skb->tail, avail, fmt, args); |
756 | if (len >= avail) { | 756 | if (len >= avail) { |
757 | /* The printk buffer is 1024 bytes long, so if we get | 757 | /* The printk buffer is 1024 bytes long, so if we get |
758 | * here and AUDIT_BUFSIZ is at least 1024, then we can | 758 | * here and AUDIT_BUFSIZ is at least 1024, then we can |
759 | * log everything that printk could have logged. */ | 759 | * log everything that printk could have logged. */ |
760 | avail = audit_expand(ab, max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail)); | 760 | avail = audit_expand(ab, max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail)); |
761 | if (!avail) | 761 | if (!avail) |
762 | goto out; | 762 | goto out; |
763 | len = vsnprintf(skb->tail, avail, fmt, args2); | 763 | len = vsnprintf(skb->tail, avail, fmt, args2); |
764 | } | 764 | } |
765 | if (len > 0) | 765 | if (len > 0) |
766 | skb_put(skb, len); | 766 | skb_put(skb, len); |
767 | out: | 767 | out: |
768 | return; | 768 | return; |
769 | } | 769 | } |
770 | 770 | ||
771 | /* Format a message into the audit buffer. All the work is done in | 771 | /* Format a message into the audit buffer. All the work is done in |
772 | * audit_log_vformat. */ | 772 | * audit_log_vformat. */ |
773 | void audit_log_format(struct audit_buffer *ab, const char *fmt, ...) | 773 | void audit_log_format(struct audit_buffer *ab, const char *fmt, ...) |
774 | { | 774 | { |
775 | va_list args; | 775 | va_list args; |
776 | 776 | ||
777 | if (!ab) | 777 | if (!ab) |
778 | return; | 778 | return; |
779 | va_start(args, fmt); | 779 | va_start(args, fmt); |
780 | audit_log_vformat(ab, fmt, args); | 780 | audit_log_vformat(ab, fmt, args); |
781 | va_end(args); | 781 | va_end(args); |
782 | } | 782 | } |
783 | 783 | ||
784 | /* This function will take the passed buf and convert it into a string of | 784 | /* This function will take the passed buf and convert it into a string of |
785 | * ascii hex digits. The new string is placed onto the skb. */ | 785 | * ascii hex digits. The new string is placed onto the skb. */ |
786 | void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf, | 786 | void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf, |
787 | size_t len) | 787 | size_t len) |
788 | { | 788 | { |
789 | int i, avail, new_len; | 789 | int i, avail, new_len; |
790 | unsigned char *ptr; | 790 | unsigned char *ptr; |
791 | struct sk_buff *skb; | 791 | struct sk_buff *skb; |
792 | static const unsigned char *hex = "0123456789ABCDEF"; | 792 | static const unsigned char *hex = "0123456789ABCDEF"; |
793 | 793 | ||
794 | BUG_ON(!ab->skb); | 794 | BUG_ON(!ab->skb); |
795 | skb = ab->skb; | 795 | skb = ab->skb; |
796 | avail = skb_tailroom(skb); | 796 | avail = skb_tailroom(skb); |
797 | new_len = len<<1; | 797 | new_len = len<<1; |
798 | if (new_len >= avail) { | 798 | if (new_len >= avail) { |
799 | /* Round the buffer request up to the next multiple */ | 799 | /* Round the buffer request up to the next multiple */ |
800 | new_len = AUDIT_BUFSIZ*(((new_len-avail)/AUDIT_BUFSIZ) + 1); | 800 | new_len = AUDIT_BUFSIZ*(((new_len-avail)/AUDIT_BUFSIZ) + 1); |
801 | avail = audit_expand(ab, new_len); | 801 | avail = audit_expand(ab, new_len); |
802 | if (!avail) | 802 | if (!avail) |
803 | return; | 803 | return; |
804 | } | 804 | } |
805 | 805 | ||
806 | ptr = skb->tail; | 806 | ptr = skb->tail; |
807 | for (i=0; i<len; i++) { | 807 | for (i=0; i<len; i++) { |
808 | *ptr++ = hex[(buf[i] & 0xF0)>>4]; /* Upper nibble */ | 808 | *ptr++ = hex[(buf[i] & 0xF0)>>4]; /* Upper nibble */ |
809 | *ptr++ = hex[buf[i] & 0x0F]; /* Lower nibble */ | 809 | *ptr++ = hex[buf[i] & 0x0F]; /* Lower nibble */ |
810 | } | 810 | } |
811 | *ptr = 0; | 811 | *ptr = 0; |
812 | skb_put(skb, len << 1); /* new string is twice the old string */ | 812 | skb_put(skb, len << 1); /* new string is twice the old string */ |
813 | } | 813 | } |
814 | 814 | ||
815 | /* This code will escape a string that is passed to it if the string | 815 | /* This code will escape a string that is passed to it if the string |
816 | * contains a control character, unprintable character, double quote mark, | 816 | * contains a control character, unprintable character, double quote mark, |
817 | * or a space. Unescaped strings will start and end with a double quote mark. | 817 | * or a space. Unescaped strings will start and end with a double quote mark. |
818 | * Strings that are escaped are printed in hex (2 digits per char). */ | 818 | * Strings that are escaped are printed in hex (2 digits per char). */ |
819 | void audit_log_untrustedstring(struct audit_buffer *ab, const char *string) | 819 | void audit_log_untrustedstring(struct audit_buffer *ab, const char *string) |
820 | { | 820 | { |
821 | const unsigned char *p = string; | 821 | const unsigned char *p = string; |
822 | 822 | ||
823 | while (*p) { | 823 | while (*p) { |
824 | if (*p == '"' || *p < 0x21 || *p > 0x7f) { | 824 | if (*p == '"' || *p < 0x21 || *p > 0x7f) { |
825 | audit_log_hex(ab, string, strlen(string)); | 825 | audit_log_hex(ab, string, strlen(string)); |
826 | return; | 826 | return; |
827 | } | 827 | } |
828 | p++; | 828 | p++; |
829 | } | 829 | } |
830 | audit_log_format(ab, "\"%s\"", string); | 830 | audit_log_format(ab, "\"%s\"", string); |
831 | } | 831 | } |
832 | 832 | ||
833 | /* This is a helper-function to print the escaped d_path */ | 833 | /* This is a helper-function to print the escaped d_path */ |
834 | void audit_log_d_path(struct audit_buffer *ab, const char *prefix, | 834 | void audit_log_d_path(struct audit_buffer *ab, const char *prefix, |
835 | struct dentry *dentry, struct vfsmount *vfsmnt) | 835 | struct dentry *dentry, struct vfsmount *vfsmnt) |
836 | { | 836 | { |
837 | char *p, *path; | 837 | char *p, *path; |
838 | 838 | ||
839 | if (prefix) | 839 | if (prefix) |
840 | audit_log_format(ab, " %s", prefix); | 840 | audit_log_format(ab, " %s", prefix); |
841 | 841 | ||
842 | /* We will allow 11 spaces for ' (deleted)' to be appended */ | 842 | /* We will allow 11 spaces for ' (deleted)' to be appended */ |
843 | path = kmalloc(PATH_MAX+11, ab->gfp_mask); | 843 | path = kmalloc(PATH_MAX+11, ab->gfp_mask); |
844 | if (!path) { | 844 | if (!path) { |
845 | audit_log_format(ab, "<no memory>"); | 845 | audit_log_format(ab, "<no memory>"); |
846 | return; | 846 | return; |
847 | } | 847 | } |
848 | p = d_path(dentry, vfsmnt, path, PATH_MAX+11); | 848 | p = d_path(dentry, vfsmnt, path, PATH_MAX+11); |
849 | if (IS_ERR(p)) { /* Should never happen since we send PATH_MAX */ | 849 | if (IS_ERR(p)) { /* Should never happen since we send PATH_MAX */ |
850 | /* FIXME: can we save some information here? */ | 850 | /* FIXME: can we save some information here? */ |
851 | audit_log_format(ab, "<too long>"); | 851 | audit_log_format(ab, "<too long>"); |
852 | } else | 852 | } else |
853 | audit_log_untrustedstring(ab, p); | 853 | audit_log_untrustedstring(ab, p); |
854 | kfree(path); | 854 | kfree(path); |
855 | } | 855 | } |
856 | 856 | ||
857 | /* The netlink_* functions cannot be called inside an irq context, so | 857 | /* The netlink_* functions cannot be called inside an irq context, so |
858 | * the audit buffer is placed on a queue and the kauditd thread is woken | 858 | * the audit buffer is placed on a queue and the kauditd thread is woken |
859 | * to remove it from the queue outside the irq context. May be called in | 859 | * to remove it from the queue outside the irq context. May be called in |
860 | * any context. */ | 860 | * any context. */ |
861 | void audit_log_end(struct audit_buffer *ab) | 861 | void audit_log_end(struct audit_buffer *ab) |
862 | { | 862 | { |
863 | if (!ab) | 863 | if (!ab) |
864 | return; | 864 | return; |
865 | if (!audit_rate_check()) { | 865 | if (!audit_rate_check()) { |
866 | audit_log_lost("rate limit exceeded"); | 866 | audit_log_lost("rate limit exceeded"); |
867 | } else { | 867 | } else { |
868 | if (audit_pid) { | 868 | if (audit_pid) { |
869 | struct nlmsghdr *nlh = (struct nlmsghdr *)ab->skb->data; | 869 | struct nlmsghdr *nlh = (struct nlmsghdr *)ab->skb->data; |
870 | nlh->nlmsg_len = ab->skb->len - NLMSG_SPACE(0); | 870 | nlh->nlmsg_len = ab->skb->len - NLMSG_SPACE(0); |
871 | skb_queue_tail(&audit_skb_queue, ab->skb); | 871 | skb_queue_tail(&audit_skb_queue, ab->skb); |
872 | ab->skb = NULL; | 872 | ab->skb = NULL; |
873 | wake_up_interruptible(&kauditd_wait); | 873 | wake_up_interruptible(&kauditd_wait); |
874 | } else { | 874 | } else { |
875 | printk(KERN_NOTICE "%s\n", ab->skb->data + NLMSG_SPACE(0)); | 875 | printk(KERN_NOTICE "%s\n", ab->skb->data + NLMSG_SPACE(0)); |
876 | } | 876 | } |
877 | } | 877 | } |
878 | audit_buffer_free(ab); | 878 | audit_buffer_free(ab); |
879 | } | 879 | } |
880 | 880 | ||
881 | /* Log an audit record. This is a convenience function that calls | 881 | /* Log an audit record. This is a convenience function that calls |
882 | * audit_log_start, audit_log_vformat, and audit_log_end. It may be | 882 | * audit_log_start, audit_log_vformat, and audit_log_end. It may be |
883 | * called in any context. */ | 883 | * called in any context. */ |
884 | void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, | 884 | void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, |
885 | const char *fmt, ...) | 885 | const char *fmt, ...) |
886 | { | 886 | { |
887 | struct audit_buffer *ab; | 887 | struct audit_buffer *ab; |
888 | va_list args; | 888 | va_list args; |
889 | 889 | ||
890 | ab = audit_log_start(ctx, gfp_mask, type); | 890 | ab = audit_log_start(ctx, gfp_mask, type); |
891 | if (ab) { | 891 | if (ab) { |
892 | va_start(args, fmt); | 892 | va_start(args, fmt); |
893 | audit_log_vformat(ab, fmt, args); | 893 | audit_log_vformat(ab, fmt, args); |
894 | va_end(args); | 894 | va_end(args); |
895 | audit_log_end(ab); | 895 | audit_log_end(ab); |
896 | } | 896 | } |
897 | } | 897 | } |
898 | 898 |
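Note on the audit helpers above: callers normally pair audit_log_start() with one or more audit_log_format()/audit_log_untrustedstring() calls and a final audit_log_end(), or use the all-in-one audit_log(). The following stand-alone sketch is illustrative only and is not part of this commit; the AUDIT_KERNEL message type and the field names are assumptions, and <linux/audit.h> is presumed to be included.

/* Hypothetical caller sketch (not part of this commit). */
static void example_audit_event(struct audit_context *ctx, const char *path)
{
	struct audit_buffer *ab;

	ab = audit_log_start(ctx, GFP_KERNEL, AUDIT_KERNEL);
	if (!ab)				/* auditing disabled or out of memory */
		return;
	audit_log_format(ab, "op=example res=%d", 1);
	audit_log_format(ab, " path=");
	audit_log_untrustedstring(ab, path);	/* hex-escapes untrusted bytes */
	audit_log_end(ab);			/* queues the skb for kauditd, or printk()s it */
}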
kernel/irq/proc.c
1 | /* | 1 | /* |
2 | * linux/kernel/irq/proc.c | 2 | * linux/kernel/irq/proc.c |
3 | * | 3 | * |
4 | * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar | 4 | * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar |
5 | * | 5 | * |
6 | * This file contains the /proc/irq/ handling code. | 6 | * This file contains the /proc/irq/ handling code. |
7 | */ | 7 | */ |
8 | 8 | ||
9 | #include <linux/irq.h> | 9 | #include <linux/irq.h> |
10 | #include <linux/proc_fs.h> | 10 | #include <linux/proc_fs.h> |
11 | #include <linux/interrupt.h> | 11 | #include <linux/interrupt.h> |
12 | 12 | ||
13 | #include "internals.h" | ||
14 | |||
13 | static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS]; | 15 | static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS]; |
14 | 16 | ||
15 | #ifdef CONFIG_SMP | 17 | #ifdef CONFIG_SMP |
16 | 18 | ||
17 | /* | 19 | /* |
18 | * The /proc/irq/<irq>/smp_affinity values: | 20 | * The /proc/irq/<irq>/smp_affinity values: |
19 | */ | 21 | */ |
20 | static struct proc_dir_entry *smp_affinity_entry[NR_IRQS]; | 22 | static struct proc_dir_entry *smp_affinity_entry[NR_IRQS]; |
21 | 23 | ||
22 | #ifdef CONFIG_GENERIC_PENDING_IRQ | 24 | #ifdef CONFIG_GENERIC_PENDING_IRQ |
23 | void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) | 25 | void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) |
24 | { | 26 | { |
25 | /* | 27 | /* |
26 | * Save these away for later use. Re-program when the | 28 | * Save these away for later use. Re-program when the |
27 | * interrupt is pending | 29 | * interrupt is pending |
28 | */ | 30 | */ |
29 | set_pending_irq(irq, mask_val); | 31 | set_pending_irq(irq, mask_val); |
30 | } | 32 | } |
31 | #else | 33 | #else |
32 | void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) | 34 | void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) |
33 | { | 35 | { |
34 | irq_affinity[irq] = mask_val; | 36 | irq_affinity[irq] = mask_val; |
35 | irq_desc[irq].handler->set_affinity(irq, mask_val); | 37 | irq_desc[irq].handler->set_affinity(irq, mask_val); |
36 | } | 38 | } |
37 | #endif | 39 | #endif |
38 | 40 | ||
39 | static int irq_affinity_read_proc(char *page, char **start, off_t off, | 41 | static int irq_affinity_read_proc(char *page, char **start, off_t off, |
40 | int count, int *eof, void *data) | 42 | int count, int *eof, void *data) |
41 | { | 43 | { |
42 | int len = cpumask_scnprintf(page, count, irq_affinity[(long)data]); | 44 | int len = cpumask_scnprintf(page, count, irq_affinity[(long)data]); |
43 | 45 | ||
44 | if (count - len < 2) | 46 | if (count - len < 2) |
45 | return -EINVAL; | 47 | return -EINVAL; |
46 | len += sprintf(page + len, "\n"); | 48 | len += sprintf(page + len, "\n"); |
47 | return len; | 49 | return len; |
48 | } | 50 | } |
49 | 51 | ||
50 | int no_irq_affinity; | 52 | int no_irq_affinity; |
51 | static int irq_affinity_write_proc(struct file *file, const char __user *buffer, | 53 | static int irq_affinity_write_proc(struct file *file, const char __user *buffer, |
52 | unsigned long count, void *data) | 54 | unsigned long count, void *data) |
53 | { | 55 | { |
54 | unsigned int irq = (int)(long)data, full_count = count, err; | 56 | unsigned int irq = (int)(long)data, full_count = count, err; |
55 | cpumask_t new_value, tmp; | 57 | cpumask_t new_value, tmp; |
56 | 58 | ||
57 | if (!irq_desc[irq].handler->set_affinity || no_irq_affinity) | 59 | if (!irq_desc[irq].handler->set_affinity || no_irq_affinity) |
58 | return -EIO; | 60 | return -EIO; |
59 | 61 | ||
60 | err = cpumask_parse(buffer, count, new_value); | 62 | err = cpumask_parse(buffer, count, new_value); |
61 | if (err) | 63 | if (err) |
62 | return err; | 64 | return err; |
63 | 65 | ||
64 | /* | 66 | /* |
65 | * Do not allow disabling IRQs completely - it's too easy a | 67 | * Do not allow disabling IRQs completely - it's too easy a |
66 | * way to make the system unusable accidentally :-) At least | 68 | * way to make the system unusable accidentally :-) At least |
67 | * one online CPU still has to be targeted. | 69 | * one online CPU still has to be targeted. |
68 | */ | 70 | */ |
69 | cpus_and(tmp, new_value, cpu_online_map); | 71 | cpus_and(tmp, new_value, cpu_online_map); |
70 | if (cpus_empty(tmp)) | 72 | if (cpus_empty(tmp)) |
71 | /* Special case for empty set - allow the architecture | 73 | /* Special case for empty set - allow the architecture |
72 | code to set default SMP affinity. */ | 74 | code to set default SMP affinity. */ |
73 | return select_smp_affinity(irq) ? -EINVAL : full_count; | 75 | return select_smp_affinity(irq) ? -EINVAL : full_count; |
74 | 76 | ||
75 | proc_set_irq_affinity(irq, new_value); | 77 | proc_set_irq_affinity(irq, new_value); |
76 | 78 | ||
77 | return full_count; | 79 | return full_count; |
78 | } | 80 | } |
79 | 81 | ||
80 | #endif | 82 | #endif |
81 | 83 | ||
82 | #define MAX_NAMELEN 128 | 84 | #define MAX_NAMELEN 128 |
83 | 85 | ||
84 | static int name_unique(unsigned int irq, struct irqaction *new_action) | 86 | static int name_unique(unsigned int irq, struct irqaction *new_action) |
85 | { | 87 | { |
86 | struct irq_desc *desc = irq_desc + irq; | 88 | struct irq_desc *desc = irq_desc + irq; |
87 | struct irqaction *action; | 89 | struct irqaction *action; |
88 | 90 | ||
89 | for (action = desc->action ; action; action = action->next) | 91 | for (action = desc->action ; action; action = action->next) |
90 | if ((action != new_action) && action->name && | 92 | if ((action != new_action) && action->name && |
91 | !strcmp(new_action->name, action->name)) | 93 | !strcmp(new_action->name, action->name)) |
92 | return 0; | 94 | return 0; |
93 | return 1; | 95 | return 1; |
94 | } | 96 | } |
95 | 97 | ||
96 | void register_handler_proc(unsigned int irq, struct irqaction *action) | 98 | void register_handler_proc(unsigned int irq, struct irqaction *action) |
97 | { | 99 | { |
98 | char name [MAX_NAMELEN]; | 100 | char name [MAX_NAMELEN]; |
99 | 101 | ||
100 | if (!irq_dir[irq] || action->dir || !action->name || | 102 | if (!irq_dir[irq] || action->dir || !action->name || |
101 | !name_unique(irq, action)) | 103 | !name_unique(irq, action)) |
102 | return; | 104 | return; |
103 | 105 | ||
104 | memset(name, 0, MAX_NAMELEN); | 106 | memset(name, 0, MAX_NAMELEN); |
105 | snprintf(name, MAX_NAMELEN, "%s", action->name); | 107 | snprintf(name, MAX_NAMELEN, "%s", action->name); |
106 | 108 | ||
107 | /* create /proc/irq/1234/handler/ */ | 109 | /* create /proc/irq/1234/handler/ */ |
108 | action->dir = proc_mkdir(name, irq_dir[irq]); | 110 | action->dir = proc_mkdir(name, irq_dir[irq]); |
109 | } | 111 | } |
110 | 112 | ||
111 | #undef MAX_NAMELEN | 113 | #undef MAX_NAMELEN |
112 | 114 | ||
113 | #define MAX_NAMELEN 10 | 115 | #define MAX_NAMELEN 10 |
114 | 116 | ||
115 | void register_irq_proc(unsigned int irq) | 117 | void register_irq_proc(unsigned int irq) |
116 | { | 118 | { |
117 | char name [MAX_NAMELEN]; | 119 | char name [MAX_NAMELEN]; |
118 | 120 | ||
119 | if (!root_irq_dir || | 121 | if (!root_irq_dir || |
120 | (irq_desc[irq].handler == &no_irq_type) || | 122 | (irq_desc[irq].handler == &no_irq_type) || |
121 | irq_dir[irq]) | 123 | irq_dir[irq]) |
122 | return; | 124 | return; |
123 | 125 | ||
124 | memset(name, 0, MAX_NAMELEN); | 126 | memset(name, 0, MAX_NAMELEN); |
125 | sprintf(name, "%d", irq); | 127 | sprintf(name, "%d", irq); |
126 | 128 | ||
127 | /* create /proc/irq/1234 */ | 129 | /* create /proc/irq/1234 */ |
128 | irq_dir[irq] = proc_mkdir(name, root_irq_dir); | 130 | irq_dir[irq] = proc_mkdir(name, root_irq_dir); |
129 | 131 | ||
130 | #ifdef CONFIG_SMP | 132 | #ifdef CONFIG_SMP |
131 | { | 133 | { |
132 | struct proc_dir_entry *entry; | 134 | struct proc_dir_entry *entry; |
133 | 135 | ||
134 | /* create /proc/irq/<irq>/smp_affinity */ | 136 | /* create /proc/irq/<irq>/smp_affinity */ |
135 | entry = create_proc_entry("smp_affinity", 0600, irq_dir[irq]); | 137 | entry = create_proc_entry("smp_affinity", 0600, irq_dir[irq]); |
136 | 138 | ||
137 | if (entry) { | 139 | if (entry) { |
138 | entry->nlink = 1; | 140 | entry->nlink = 1; |
139 | entry->data = (void *)(long)irq; | 141 | entry->data = (void *)(long)irq; |
140 | entry->read_proc = irq_affinity_read_proc; | 142 | entry->read_proc = irq_affinity_read_proc; |
141 | entry->write_proc = irq_affinity_write_proc; | 143 | entry->write_proc = irq_affinity_write_proc; |
142 | } | 144 | } |
143 | smp_affinity_entry[irq] = entry; | 145 | smp_affinity_entry[irq] = entry; |
144 | } | 146 | } |
145 | #endif | 147 | #endif |
146 | } | 148 | } |
147 | 149 | ||
148 | #undef MAX_NAMELEN | 150 | #undef MAX_NAMELEN |
149 | 151 | ||
150 | void unregister_handler_proc(unsigned int irq, struct irqaction *action) | 152 | void unregister_handler_proc(unsigned int irq, struct irqaction *action) |
151 | { | 153 | { |
152 | if (action->dir) | 154 | if (action->dir) |
153 | remove_proc_entry(action->dir->name, irq_dir[irq]); | 155 | remove_proc_entry(action->dir->name, irq_dir[irq]); |
154 | } | 156 | } |
155 | 157 | ||
156 | void init_irq_proc(void) | 158 | void init_irq_proc(void) |
157 | { | 159 | { |
158 | int i; | 160 | int i; |
159 | 161 | ||
160 | /* create /proc/irq */ | 162 | /* create /proc/irq */ |
161 | root_irq_dir = proc_mkdir("irq", NULL); | 163 | root_irq_dir = proc_mkdir("irq", NULL); |
162 | if (!root_irq_dir) | 164 | if (!root_irq_dir) |
163 | return; | 165 | return; |
164 | 166 | ||
165 | /* | 167 | /* |
166 | * Create entries for all existing IRQs. | 168 | * Create entries for all existing IRQs. |
167 | */ | 169 | */ |
168 | for (i = 0; i < NR_IRQS; i++) | 170 | for (i = 0; i < NR_IRQS; i++) |
169 | register_irq_proc(i); | 171 | register_irq_proc(i); |
170 | } | 172 | } |
171 | 173 | ||
172 | 174 |
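Note on the smp_affinity handlers above: irq_affinity_read_proc() prints the current cpumask, and irq_affinity_write_proc() parses a hex mask with cpumask_parse(), rejecting masks that leave no online CPU targeted. A minimal user-space sketch of driving such a file follows; the IRQ number and mask value are assumptions, not taken from this patch.

/* Hypothetical user-space sketch (not part of this commit). */
#include <stdio.h>

int main(void)
{
	/* Pin IRQ 19 to CPU 0; the write is handled by irq_affinity_write_proc(). */
	FILE *f = fopen("/proc/irq/19/smp_affinity", "w");

	if (!f)
		return 1;
	fprintf(f, "1\n");
	return fclose(f) ? 1 : 0;
}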
kernel/rcutorture.c
1 | /* | 1 | /* |
2 | * Read-Copy Update /proc-based torture test facility | 2 | * Read-Copy Update /proc-based torture test facility |
3 | * | 3 | * |
4 | * This program is free software; you can redistribute it and/or modify | 4 | * This program is free software; you can redistribute it and/or modify |
5 | * it under the terms of the GNU General Public License as published by | 5 | * it under the terms of the GNU General Public License as published by |
6 | * the Free Software Foundation; either version 2 of the License, or | 6 | * the Free Software Foundation; either version 2 of the License, or |
7 | * (at your option) any later version. | 7 | * (at your option) any later version. |
8 | * | 8 | * |
9 | * This program is distributed in the hope that it will be useful, | 9 | * This program is distributed in the hope that it will be useful, |
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
12 | * GNU General Public License for more details. | 12 | * GNU General Public License for more details. |
13 | * | 13 | * |
14 | * You should have received a copy of the GNU General Public License | 14 | * You should have received a copy of the GNU General Public License |
15 | * along with this program; if not, write to the Free Software | 15 | * along with this program; if not, write to the Free Software |
16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | 16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. |
17 | * | 17 | * |
18 | * Copyright (C) IBM Corporation, 2005 | 18 | * Copyright (C) IBM Corporation, 2005 |
19 | * | 19 | * |
20 | * Authors: Paul E. McKenney <paulmck@us.ibm.com> | 20 | * Authors: Paul E. McKenney <paulmck@us.ibm.com> |
21 | * | 21 | * |
22 | * See also: Documentation/RCU/torture.txt | 22 | * See also: Documentation/RCU/torture.txt |
23 | */ | 23 | */ |
24 | #include <linux/types.h> | 24 | #include <linux/types.h> |
25 | #include <linux/kernel.h> | 25 | #include <linux/kernel.h> |
26 | #include <linux/init.h> | 26 | #include <linux/init.h> |
27 | #include <linux/module.h> | 27 | #include <linux/module.h> |
28 | #include <linux/kthread.h> | 28 | #include <linux/kthread.h> |
29 | #include <linux/err.h> | 29 | #include <linux/err.h> |
30 | #include <linux/spinlock.h> | 30 | #include <linux/spinlock.h> |
31 | #include <linux/smp.h> | 31 | #include <linux/smp.h> |
32 | #include <linux/rcupdate.h> | 32 | #include <linux/rcupdate.h> |
33 | #include <linux/interrupt.h> | 33 | #include <linux/interrupt.h> |
34 | #include <linux/sched.h> | 34 | #include <linux/sched.h> |
35 | #include <asm/atomic.h> | 35 | #include <asm/atomic.h> |
36 | #include <linux/bitops.h> | 36 | #include <linux/bitops.h> |
37 | #include <linux/module.h> | 37 | #include <linux/module.h> |
38 | #include <linux/completion.h> | 38 | #include <linux/completion.h> |
39 | #include <linux/moduleparam.h> | 39 | #include <linux/moduleparam.h> |
40 | #include <linux/percpu.h> | 40 | #include <linux/percpu.h> |
41 | #include <linux/notifier.h> | 41 | #include <linux/notifier.h> |
42 | #include <linux/rcuref.h> | 42 | #include <linux/rcuref.h> |
43 | #include <linux/cpu.h> | 43 | #include <linux/cpu.h> |
44 | #include <linux/random.h> | 44 | #include <linux/random.h> |
45 | #include <linux/delay.h> | 45 | #include <linux/delay.h> |
46 | #include <linux/byteorder/swabb.h> | 46 | #include <linux/byteorder/swabb.h> |
47 | #include <linux/stat.h> | 47 | #include <linux/stat.h> |
48 | 48 | ||
49 | MODULE_LICENSE("GPL"); | 49 | MODULE_LICENSE("GPL"); |
50 | 50 | ||
51 | static int nreaders = -1; /* # reader threads, defaults to 4*ncpus */ | 51 | static int nreaders = -1; /* # reader threads, defaults to 4*ncpus */ |
52 | static int stat_interval = 0; /* Interval between stats, in seconds. */ | 52 | static int stat_interval = 0; /* Interval between stats, in seconds. */ |
53 | /* Defaults to "only at end of test". */ | 53 | /* Defaults to "only at end of test". */ |
54 | static int verbose = 0; /* Print more debug info. */ | 54 | static int verbose = 0; /* Print more debug info. */ |
55 | 55 | ||
56 | MODULE_PARM(nreaders, "i"); | 56 | MODULE_PARM(nreaders, "i"); |
57 | MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); | 57 | MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); |
58 | MODULE_PARM(stat_interval, "i"); | 58 | MODULE_PARM(stat_interval, "i"); |
59 | MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); | 59 | MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); |
60 | MODULE_PARM(verbose, "i"); | 60 | MODULE_PARM(verbose, "i"); |
61 | MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); | 61 | MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); |
62 | #define TORTURE_FLAG "rcutorture: " | 62 | #define TORTURE_FLAG "rcutorture: " |
63 | #define PRINTK_STRING(s) \ | 63 | #define PRINTK_STRING(s) \ |
64 | do { printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) | 64 | do { printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) |
65 | #define VERBOSE_PRINTK_STRING(s) \ | 65 | #define VERBOSE_PRINTK_STRING(s) \ |
66 | do { if (verbose) printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) | 66 | do { if (verbose) printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) |
67 | #define VERBOSE_PRINTK_ERRSTRING(s) \ | 67 | #define VERBOSE_PRINTK_ERRSTRING(s) \ |
68 | do { if (verbose) printk(KERN_ALERT TORTURE_FLAG "!!! " s "\n"); } while (0) | 68 | do { if (verbose) printk(KERN_ALERT TORTURE_FLAG "!!! " s "\n"); } while (0) |
69 | 69 | ||
70 | static char printk_buf[4096]; | 70 | static char printk_buf[4096]; |
71 | 71 | ||
72 | static int nrealreaders; | 72 | static int nrealreaders; |
73 | static struct task_struct *writer_task; | 73 | static struct task_struct *writer_task; |
74 | static struct task_struct **reader_tasks; | 74 | static struct task_struct **reader_tasks; |
75 | static struct task_struct *stats_task; | 75 | static struct task_struct *stats_task; |
76 | 76 | ||
77 | #define RCU_TORTURE_PIPE_LEN 10 | 77 | #define RCU_TORTURE_PIPE_LEN 10 |
78 | 78 | ||
79 | struct rcu_torture { | 79 | struct rcu_torture { |
80 | struct rcu_head rtort_rcu; | 80 | struct rcu_head rtort_rcu; |
81 | int rtort_pipe_count; | 81 | int rtort_pipe_count; |
82 | struct list_head rtort_free; | 82 | struct list_head rtort_free; |
83 | int rtort_mbtest; | 83 | int rtort_mbtest; |
84 | }; | 84 | }; |
85 | 85 | ||
86 | static int fullstop = 0; /* stop generating callbacks at test end. */ | 86 | static int fullstop = 0; /* stop generating callbacks at test end. */ |
87 | static LIST_HEAD(rcu_torture_freelist); | 87 | static LIST_HEAD(rcu_torture_freelist); |
88 | static struct rcu_torture *rcu_torture_current = NULL; | 88 | static struct rcu_torture *rcu_torture_current = NULL; |
89 | static long rcu_torture_current_version = 0; | 89 | static long rcu_torture_current_version = 0; |
90 | static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; | 90 | static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; |
91 | static DEFINE_SPINLOCK(rcu_torture_lock); | 91 | static DEFINE_SPINLOCK(rcu_torture_lock); |
92 | static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = | 92 | static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = |
93 | { 0 }; | 93 | { 0 }; |
94 | static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) = | 94 | static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) = |
95 | { 0 }; | 95 | { 0 }; |
96 | static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1]; | 96 | static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1]; |
97 | atomic_t n_rcu_torture_alloc; | 97 | atomic_t n_rcu_torture_alloc; |
98 | atomic_t n_rcu_torture_alloc_fail; | 98 | atomic_t n_rcu_torture_alloc_fail; |
99 | atomic_t n_rcu_torture_free; | 99 | atomic_t n_rcu_torture_free; |
100 | atomic_t n_rcu_torture_mberror; | 100 | atomic_t n_rcu_torture_mberror; |
101 | atomic_t n_rcu_torture_error; | 101 | atomic_t n_rcu_torture_error; |
102 | 102 | ||
103 | /* | 103 | /* |
104 | * Allocate an element from the rcu_tortures pool. | 104 | * Allocate an element from the rcu_tortures pool. |
105 | */ | 105 | */ |
106 | struct rcu_torture * | 106 | static struct rcu_torture * |
107 | rcu_torture_alloc(void) | 107 | rcu_torture_alloc(void) |
108 | { | 108 | { |
109 | struct list_head *p; | 109 | struct list_head *p; |
110 | 110 | ||
111 | spin_lock(&rcu_torture_lock); | 111 | spin_lock(&rcu_torture_lock); |
112 | if (list_empty(&rcu_torture_freelist)) { | 112 | if (list_empty(&rcu_torture_freelist)) { |
113 | atomic_inc(&n_rcu_torture_alloc_fail); | 113 | atomic_inc(&n_rcu_torture_alloc_fail); |
114 | spin_unlock(&rcu_torture_lock); | 114 | spin_unlock(&rcu_torture_lock); |
115 | return NULL; | 115 | return NULL; |
116 | } | 116 | } |
117 | atomic_inc(&n_rcu_torture_alloc); | 117 | atomic_inc(&n_rcu_torture_alloc); |
118 | p = rcu_torture_freelist.next; | 118 | p = rcu_torture_freelist.next; |
119 | list_del_init(p); | 119 | list_del_init(p); |
120 | spin_unlock(&rcu_torture_lock); | 120 | spin_unlock(&rcu_torture_lock); |
121 | return container_of(p, struct rcu_torture, rtort_free); | 121 | return container_of(p, struct rcu_torture, rtort_free); |
122 | } | 122 | } |
123 | 123 | ||
124 | /* | 124 | /* |
125 | * Free an element to the rcu_tortures pool. | 125 | * Free an element to the rcu_tortures pool. |
126 | */ | 126 | */ |
127 | static void | 127 | static void |
128 | rcu_torture_free(struct rcu_torture *p) | 128 | rcu_torture_free(struct rcu_torture *p) |
129 | { | 129 | { |
130 | atomic_inc(&n_rcu_torture_free); | 130 | atomic_inc(&n_rcu_torture_free); |
131 | spin_lock(&rcu_torture_lock); | 131 | spin_lock(&rcu_torture_lock); |
132 | list_add_tail(&p->rtort_free, &rcu_torture_freelist); | 132 | list_add_tail(&p->rtort_free, &rcu_torture_freelist); |
133 | spin_unlock(&rcu_torture_lock); | 133 | spin_unlock(&rcu_torture_lock); |
134 | } | 134 | } |
135 | 135 | ||
136 | static void | 136 | static void |
137 | rcu_torture_cb(struct rcu_head *p) | 137 | rcu_torture_cb(struct rcu_head *p) |
138 | { | 138 | { |
139 | int i; | 139 | int i; |
140 | struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); | 140 | struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); |
141 | 141 | ||
142 | if (fullstop) { | 142 | if (fullstop) { |
143 | /* Test is ending, just drop callbacks on the floor. */ | 143 | /* Test is ending, just drop callbacks on the floor. */ |
144 | /* The next initialization will pick up the pieces. */ | 144 | /* The next initialization will pick up the pieces. */ |
145 | return; | 145 | return; |
146 | } | 146 | } |
147 | i = rp->rtort_pipe_count; | 147 | i = rp->rtort_pipe_count; |
148 | if (i > RCU_TORTURE_PIPE_LEN) | 148 | if (i > RCU_TORTURE_PIPE_LEN) |
149 | i = RCU_TORTURE_PIPE_LEN; | 149 | i = RCU_TORTURE_PIPE_LEN; |
150 | atomic_inc(&rcu_torture_wcount[i]); | 150 | atomic_inc(&rcu_torture_wcount[i]); |
151 | if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { | 151 | if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { |
152 | rp->rtort_mbtest = 0; | 152 | rp->rtort_mbtest = 0; |
153 | rcu_torture_free(rp); | 153 | rcu_torture_free(rp); |
154 | } else | 154 | } else |
155 | call_rcu(p, rcu_torture_cb); | 155 | call_rcu(p, rcu_torture_cb); |
156 | } | 156 | } |
157 | 157 | ||
158 | struct rcu_random_state { | 158 | struct rcu_random_state { |
159 | unsigned long rrs_state; | 159 | unsigned long rrs_state; |
160 | unsigned long rrs_count; | 160 | unsigned long rrs_count; |
161 | }; | 161 | }; |
162 | 162 | ||
163 | #define RCU_RANDOM_MULT 39916801 /* prime */ | 163 | #define RCU_RANDOM_MULT 39916801 /* prime */ |
164 | #define RCU_RANDOM_ADD 479001701 /* prime */ | 164 | #define RCU_RANDOM_ADD 479001701 /* prime */ |
165 | #define RCU_RANDOM_REFRESH 10000 | 165 | #define RCU_RANDOM_REFRESH 10000 |
166 | 166 | ||
167 | #define DEFINE_RCU_RANDOM(name) struct rcu_random_state name = { 0, 0 } | 167 | #define DEFINE_RCU_RANDOM(name) struct rcu_random_state name = { 0, 0 } |
168 | 168 | ||
169 | /* | 169 | /* |
170 | * Crude but fast random-number generator. Uses a linear congruential | 170 | * Crude but fast random-number generator. Uses a linear congruential |
171 | * generator, with occasional help from get_random_bytes(). | 171 | * generator, with occasional help from get_random_bytes(). |
172 | */ | 172 | */ |
173 | static long | 173 | static long |
174 | rcu_random(struct rcu_random_state *rrsp) | 174 | rcu_random(struct rcu_random_state *rrsp) |
175 | { | 175 | { |
176 | long refresh; | 176 | long refresh; |
177 | 177 | ||
178 | if (--rrsp->rrs_count < 0) { | 178 | if (--rrsp->rrs_count < 0) { |
179 | get_random_bytes(&refresh, sizeof(refresh)); | 179 | get_random_bytes(&refresh, sizeof(refresh)); |
180 | rrsp->rrs_state += refresh; | 180 | rrsp->rrs_state += refresh; |
181 | rrsp->rrs_count = RCU_RANDOM_REFRESH; | 181 | rrsp->rrs_count = RCU_RANDOM_REFRESH; |
182 | } | 182 | } |
183 | rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD; | 183 | rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD; |
184 | return swahw32(rrsp->rrs_state); | 184 | return swahw32(rrsp->rrs_state); |
185 | } | 185 | } |
186 | 186 | ||
187 | /* | 187 | /* |
188 | * RCU torture writer kthread. Repeatedly substitutes a new structure | 188 | * RCU torture writer kthread. Repeatedly substitutes a new structure |
189 | * for that pointed to by rcu_torture_current, freeing the old structure | 189 | * for that pointed to by rcu_torture_current, freeing the old structure |
190 | * after a series of grace periods (the "pipeline"). | 190 | * after a series of grace periods (the "pipeline"). |
191 | */ | 191 | */ |
192 | static int | 192 | static int |
193 | rcu_torture_writer(void *arg) | 193 | rcu_torture_writer(void *arg) |
194 | { | 194 | { |
195 | int i; | 195 | int i; |
196 | long oldbatch = rcu_batches_completed(); | 196 | long oldbatch = rcu_batches_completed(); |
197 | struct rcu_torture *rp; | 197 | struct rcu_torture *rp; |
198 | struct rcu_torture *old_rp; | 198 | struct rcu_torture *old_rp; |
199 | static DEFINE_RCU_RANDOM(rand); | 199 | static DEFINE_RCU_RANDOM(rand); |
200 | 200 | ||
201 | VERBOSE_PRINTK_STRING("rcu_torture_writer task started"); | 201 | VERBOSE_PRINTK_STRING("rcu_torture_writer task started"); |
202 | set_user_nice(current, 19); | 202 | set_user_nice(current, 19); |
203 | 203 | ||
204 | do { | 204 | do { |
205 | schedule_timeout_uninterruptible(1); | 205 | schedule_timeout_uninterruptible(1); |
206 | if (rcu_batches_completed() == oldbatch) | 206 | if (rcu_batches_completed() == oldbatch) |
207 | continue; | 207 | continue; |
208 | if ((rp = rcu_torture_alloc()) == NULL) | 208 | if ((rp = rcu_torture_alloc()) == NULL) |
209 | continue; | 209 | continue; |
210 | rp->rtort_pipe_count = 0; | 210 | rp->rtort_pipe_count = 0; |
211 | udelay(rcu_random(&rand) & 0x3ff); | 211 | udelay(rcu_random(&rand) & 0x3ff); |
212 | old_rp = rcu_torture_current; | 212 | old_rp = rcu_torture_current; |
213 | rp->rtort_mbtest = 1; | 213 | rp->rtort_mbtest = 1; |
214 | rcu_assign_pointer(rcu_torture_current, rp); | 214 | rcu_assign_pointer(rcu_torture_current, rp); |
215 | smp_wmb(); | 215 | smp_wmb(); |
216 | if (old_rp != NULL) { | 216 | if (old_rp != NULL) { |
217 | i = old_rp->rtort_pipe_count; | 217 | i = old_rp->rtort_pipe_count; |
218 | if (i > RCU_TORTURE_PIPE_LEN) | 218 | if (i > RCU_TORTURE_PIPE_LEN) |
219 | i = RCU_TORTURE_PIPE_LEN; | 219 | i = RCU_TORTURE_PIPE_LEN; |
220 | atomic_inc(&rcu_torture_wcount[i]); | 220 | atomic_inc(&rcu_torture_wcount[i]); |
221 | old_rp->rtort_pipe_count++; | 221 | old_rp->rtort_pipe_count++; |
222 | call_rcu(&old_rp->rtort_rcu, rcu_torture_cb); | 222 | call_rcu(&old_rp->rtort_rcu, rcu_torture_cb); |
223 | } | 223 | } |
224 | rcu_torture_current_version++; | 224 | rcu_torture_current_version++; |
225 | oldbatch = rcu_batches_completed(); | 225 | oldbatch = rcu_batches_completed(); |
226 | } while (!kthread_should_stop() && !fullstop); | 226 | } while (!kthread_should_stop() && !fullstop); |
227 | VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); | 227 | VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); |
228 | while (!kthread_should_stop()) | 228 | while (!kthread_should_stop()) |
229 | schedule_timeout_uninterruptible(1); | 229 | schedule_timeout_uninterruptible(1); |
230 | return 0; | 230 | return 0; |
231 | } | 231 | } |
232 | 232 | ||
233 | /* | 233 | /* |
234 | * RCU torture reader kthread. Repeatedly dereferences rcu_torture_current, | 234 | * RCU torture reader kthread. Repeatedly dereferences rcu_torture_current, |
235 | * incrementing the corresponding element of the pipeline array. The | 235 | * incrementing the corresponding element of the pipeline array. The |
236 | * counter in the element should never be greater than 1, otherwise, the | 236 | * counter in the element should never be greater than 1, otherwise, the |
237 | * RCU implementation is broken. | 237 | * RCU implementation is broken. |
238 | */ | 238 | */ |
239 | static int | 239 | static int |
240 | rcu_torture_reader(void *arg) | 240 | rcu_torture_reader(void *arg) |
241 | { | 241 | { |
242 | int completed; | 242 | int completed; |
243 | DEFINE_RCU_RANDOM(rand); | 243 | DEFINE_RCU_RANDOM(rand); |
244 | struct rcu_torture *p; | 244 | struct rcu_torture *p; |
245 | int pipe_count; | 245 | int pipe_count; |
246 | 246 | ||
247 | VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); | 247 | VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); |
248 | set_user_nice(current, 19); | 248 | set_user_nice(current, 19); |
249 | 249 | ||
250 | do { | 250 | do { |
251 | rcu_read_lock(); | 251 | rcu_read_lock(); |
252 | completed = rcu_batches_completed(); | 252 | completed = rcu_batches_completed(); |
253 | p = rcu_dereference(rcu_torture_current); | 253 | p = rcu_dereference(rcu_torture_current); |
254 | if (p == NULL) { | 254 | if (p == NULL) { |
255 | /* Wait for rcu_torture_writer to get underway */ | 255 | /* Wait for rcu_torture_writer to get underway */ |
256 | rcu_read_unlock(); | 256 | rcu_read_unlock(); |
257 | schedule_timeout_interruptible(HZ); | 257 | schedule_timeout_interruptible(HZ); |
258 | continue; | 258 | continue; |
259 | } | 259 | } |
260 | if (p->rtort_mbtest == 0) | 260 | if (p->rtort_mbtest == 0) |
261 | atomic_inc(&n_rcu_torture_mberror); | 261 | atomic_inc(&n_rcu_torture_mberror); |
262 | udelay(rcu_random(&rand) & 0x7f); | 262 | udelay(rcu_random(&rand) & 0x7f); |
263 | preempt_disable(); | 263 | preempt_disable(); |
264 | pipe_count = p->rtort_pipe_count; | 264 | pipe_count = p->rtort_pipe_count; |
265 | if (pipe_count > RCU_TORTURE_PIPE_LEN) { | 265 | if (pipe_count > RCU_TORTURE_PIPE_LEN) { |
266 | /* Should not happen, but... */ | 266 | /* Should not happen, but... */ |
267 | pipe_count = RCU_TORTURE_PIPE_LEN; | 267 | pipe_count = RCU_TORTURE_PIPE_LEN; |
268 | } | 268 | } |
269 | ++__get_cpu_var(rcu_torture_count)[pipe_count]; | 269 | ++__get_cpu_var(rcu_torture_count)[pipe_count]; |
270 | completed = rcu_batches_completed() - completed; | 270 | completed = rcu_batches_completed() - completed; |
271 | if (completed > RCU_TORTURE_PIPE_LEN) { | 271 | if (completed > RCU_TORTURE_PIPE_LEN) { |
272 | /* Should not happen, but... */ | 272 | /* Should not happen, but... */ |
273 | completed = RCU_TORTURE_PIPE_LEN; | 273 | completed = RCU_TORTURE_PIPE_LEN; |
274 | } | 274 | } |
275 | ++__get_cpu_var(rcu_torture_batch)[completed]; | 275 | ++__get_cpu_var(rcu_torture_batch)[completed]; |
276 | preempt_enable(); | 276 | preempt_enable(); |
277 | rcu_read_unlock(); | 277 | rcu_read_unlock(); |
278 | schedule(); | 278 | schedule(); |
279 | } while (!kthread_should_stop() && !fullstop); | 279 | } while (!kthread_should_stop() && !fullstop); |
280 | VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); | 280 | VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); |
281 | while (!kthread_should_stop()) | 281 | while (!kthread_should_stop()) |
282 | schedule_timeout_uninterruptible(1); | 282 | schedule_timeout_uninterruptible(1); |
283 | return 0; | 283 | return 0; |
284 | } | 284 | } |
285 | 285 | ||
286 | /* | 286 | /* |
287 | * Create an RCU-torture statistics message in the specified buffer. | 287 | * Create an RCU-torture statistics message in the specified buffer. |
288 | */ | 288 | */ |
289 | static int | 289 | static int |
290 | rcu_torture_printk(char *page) | 290 | rcu_torture_printk(char *page) |
291 | { | 291 | { |
292 | int cnt = 0; | 292 | int cnt = 0; |
293 | int cpu; | 293 | int cpu; |
294 | int i; | 294 | int i; |
295 | long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; | 295 | long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; |
296 | long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; | 296 | long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; |
297 | 297 | ||
298 | for_each_cpu(cpu) { | 298 | for_each_cpu(cpu) { |
299 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { | 299 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { |
300 | pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i]; | 300 | pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i]; |
301 | batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i]; | 301 | batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i]; |
302 | } | 302 | } |
303 | } | 303 | } |
304 | for (i = RCU_TORTURE_PIPE_LEN - 1; i >= 0; i--) { | 304 | for (i = RCU_TORTURE_PIPE_LEN - 1; i >= 0; i--) { |
305 | if (pipesummary[i] != 0) | 305 | if (pipesummary[i] != 0) |
306 | break; | 306 | break; |
307 | } | 307 | } |
308 | cnt += sprintf(&page[cnt], "rcutorture: "); | 308 | cnt += sprintf(&page[cnt], "rcutorture: "); |
309 | cnt += sprintf(&page[cnt], | 309 | cnt += sprintf(&page[cnt], |
310 | "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " | 310 | "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " |
311 | "rtmbe: %d", | 311 | "rtmbe: %d", |
312 | rcu_torture_current, | 312 | rcu_torture_current, |
313 | rcu_torture_current_version, | 313 | rcu_torture_current_version, |
314 | list_empty(&rcu_torture_freelist), | 314 | list_empty(&rcu_torture_freelist), |
315 | atomic_read(&n_rcu_torture_alloc), | 315 | atomic_read(&n_rcu_torture_alloc), |
316 | atomic_read(&n_rcu_torture_alloc_fail), | 316 | atomic_read(&n_rcu_torture_alloc_fail), |
317 | atomic_read(&n_rcu_torture_free), | 317 | atomic_read(&n_rcu_torture_free), |
318 | atomic_read(&n_rcu_torture_mberror)); | 318 | atomic_read(&n_rcu_torture_mberror)); |
319 | if (atomic_read(&n_rcu_torture_mberror) != 0) | 319 | if (atomic_read(&n_rcu_torture_mberror) != 0) |
320 | cnt += sprintf(&page[cnt], " !!!"); | 320 | cnt += sprintf(&page[cnt], " !!!"); |
321 | cnt += sprintf(&page[cnt], "\nrcutorture: "); | 321 | cnt += sprintf(&page[cnt], "\nrcutorture: "); |
322 | if (i > 1) { | 322 | if (i > 1) { |
323 | cnt += sprintf(&page[cnt], "!!! "); | 323 | cnt += sprintf(&page[cnt], "!!! "); |
324 | atomic_inc(&n_rcu_torture_error); | 324 | atomic_inc(&n_rcu_torture_error); |
325 | } | 325 | } |
326 | cnt += sprintf(&page[cnt], "Reader Pipe: "); | 326 | cnt += sprintf(&page[cnt], "Reader Pipe: "); |
327 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) | 327 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) |
328 | cnt += sprintf(&page[cnt], " %ld", pipesummary[i]); | 328 | cnt += sprintf(&page[cnt], " %ld", pipesummary[i]); |
329 | cnt += sprintf(&page[cnt], "\nrcutorture: "); | 329 | cnt += sprintf(&page[cnt], "\nrcutorture: "); |
330 | cnt += sprintf(&page[cnt], "Reader Batch: "); | 330 | cnt += sprintf(&page[cnt], "Reader Batch: "); |
331 | for (i = 0; i < RCU_TORTURE_PIPE_LEN; i++) | 331 | for (i = 0; i < RCU_TORTURE_PIPE_LEN; i++) |
332 | cnt += sprintf(&page[cnt], " %ld", batchsummary[i]); | 332 | cnt += sprintf(&page[cnt], " %ld", batchsummary[i]); |
333 | cnt += sprintf(&page[cnt], "\nrcutorture: "); | 333 | cnt += sprintf(&page[cnt], "\nrcutorture: "); |
334 | cnt += sprintf(&page[cnt], "Free-Block Circulation: "); | 334 | cnt += sprintf(&page[cnt], "Free-Block Circulation: "); |
335 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { | 335 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { |
336 | cnt += sprintf(&page[cnt], " %d", | 336 | cnt += sprintf(&page[cnt], " %d", |
337 | atomic_read(&rcu_torture_wcount[i])); | 337 | atomic_read(&rcu_torture_wcount[i])); |
338 | } | 338 | } |
339 | cnt += sprintf(&page[cnt], "\n"); | 339 | cnt += sprintf(&page[cnt], "\n"); |
340 | return cnt; | 340 | return cnt; |
341 | } | 341 | } |
342 | 342 | ||
343 | /* | 343 | /* |
344 | * Print torture statistics. Caller must ensure that there is only | 344 | * Print torture statistics. Caller must ensure that there is only |
345 | * one call to this function at a given time!!! This is normally | 345 | * one call to this function at a given time!!! This is normally |
346 | * accomplished by relying on the module system to only have one copy | 346 | * accomplished by relying on the module system to only have one copy |
347 | * of the module loaded, and then by giving the rcu_torture_stats | 347 | * of the module loaded, and then by giving the rcu_torture_stats |
348 | * kthread full control (or the init/cleanup functions when the | 348 | * kthread full control (or the init/cleanup functions when the |
349 | * rcu_torture_stats thread is not running). | 349 | * rcu_torture_stats thread is not running). |
350 | */ | 350 | */ |
351 | static void | 351 | static void |
352 | rcu_torture_stats_print(void) | 352 | rcu_torture_stats_print(void) |
353 | { | 353 | { |
354 | int cnt; | 354 | int cnt; |
355 | 355 | ||
356 | cnt = rcu_torture_printk(printk_buf); | 356 | cnt = rcu_torture_printk(printk_buf); |
357 | printk(KERN_ALERT "%s", printk_buf); | 357 | printk(KERN_ALERT "%s", printk_buf); |
358 | } | 358 | } |
359 | 359 | ||
360 | /* | 360 | /* |
361 | * Periodically prints torture statistics, if periodic statistics printing | 361 | * Periodically prints torture statistics, if periodic statistics printing |
362 | * was specified via the stat_interval module parameter. | 362 | * was specified via the stat_interval module parameter. |
363 | * | 363 | * |
364 | * No need to worry about fullstop here, since this one doesn't reference | 364 | * No need to worry about fullstop here, since this one doesn't reference |
365 | * volatile state or register callbacks. | 365 | * volatile state or register callbacks. |
366 | */ | 366 | */ |
367 | static int | 367 | static int |
368 | rcu_torture_stats(void *arg) | 368 | rcu_torture_stats(void *arg) |
369 | { | 369 | { |
370 | VERBOSE_PRINTK_STRING("rcu_torture_stats task started"); | 370 | VERBOSE_PRINTK_STRING("rcu_torture_stats task started"); |
371 | do { | 371 | do { |
372 | schedule_timeout_interruptible(stat_interval * HZ); | 372 | schedule_timeout_interruptible(stat_interval * HZ); |
373 | rcu_torture_stats_print(); | 373 | rcu_torture_stats_print(); |
374 | } while (!kthread_should_stop()); | 374 | } while (!kthread_should_stop()); |
375 | VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping"); | 375 | VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping"); |
376 | return 0; | 376 | return 0; |
377 | } | 377 | } |
378 | 378 | ||
379 | static void | 379 | static void |
380 | rcu_torture_cleanup(void) | 380 | rcu_torture_cleanup(void) |
381 | { | 381 | { |
382 | int i; | 382 | int i; |
383 | 383 | ||
384 | fullstop = 1; | 384 | fullstop = 1; |
385 | if (writer_task != NULL) { | 385 | if (writer_task != NULL) { |
386 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); | 386 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); |
387 | kthread_stop(writer_task); | 387 | kthread_stop(writer_task); |
388 | } | 388 | } |
389 | writer_task = NULL; | 389 | writer_task = NULL; |
390 | 390 | ||
391 | if (reader_tasks != NULL) { | 391 | if (reader_tasks != NULL) { |
392 | for (i = 0; i < nrealreaders; i++) { | 392 | for (i = 0; i < nrealreaders; i++) { |
393 | if (reader_tasks[i] != NULL) { | 393 | if (reader_tasks[i] != NULL) { |
394 | VERBOSE_PRINTK_STRING( | 394 | VERBOSE_PRINTK_STRING( |
395 | "Stopping rcu_torture_reader task"); | 395 | "Stopping rcu_torture_reader task"); |
396 | kthread_stop(reader_tasks[i]); | 396 | kthread_stop(reader_tasks[i]); |
397 | } | 397 | } |
398 | reader_tasks[i] = NULL; | 398 | reader_tasks[i] = NULL; |
399 | } | 399 | } |
400 | kfree(reader_tasks); | 400 | kfree(reader_tasks); |
401 | reader_tasks = NULL; | 401 | reader_tasks = NULL; |
402 | } | 402 | } |
403 | rcu_torture_current = NULL; | 403 | rcu_torture_current = NULL; |
404 | 404 | ||
405 | if (stats_task != NULL) { | 405 | if (stats_task != NULL) { |
406 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task"); | 406 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task"); |
407 | kthread_stop(stats_task); | 407 | kthread_stop(stats_task); |
408 | } | 408 | } |
409 | stats_task = NULL; | 409 | stats_task = NULL; |
410 | 410 | ||
411 | /* Wait for all RCU callbacks to fire. */ | 411 | /* Wait for all RCU callbacks to fire. */ |
412 | rcu_barrier(); | 412 | rcu_barrier(); |
413 | 413 | ||
414 | rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ | 414 | rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ |
415 | printk(KERN_ALERT TORTURE_FLAG | 415 | printk(KERN_ALERT TORTURE_FLAG |
416 | "--- End of test: %s\n", | 416 | "--- End of test: %s\n", |
417 | atomic_read(&n_rcu_torture_error) == 0 ? "SUCCESS" : "FAILURE"); | 417 | atomic_read(&n_rcu_torture_error) == 0 ? "SUCCESS" : "FAILURE"); |
418 | } | 418 | } |
419 | 419 | ||
420 | static int | 420 | static int |
421 | rcu_torture_init(void) | 421 | rcu_torture_init(void) |
422 | { | 422 | { |
423 | int i; | 423 | int i; |
424 | int cpu; | 424 | int cpu; |
425 | int firsterr = 0; | 425 | int firsterr = 0; |
426 | 426 | ||
427 | /* Process args and tell the world that the torturer is on the job. */ | 427 | /* Process args and tell the world that the torturer is on the job. */ |
428 | 428 | ||
429 | if (nreaders >= 0) | 429 | if (nreaders >= 0) |
430 | nrealreaders = nreaders; | 430 | nrealreaders = nreaders; |
431 | else | 431 | else |
432 | nrealreaders = 2 * num_online_cpus(); | 432 | nrealreaders = 2 * num_online_cpus(); |
433 | printk(KERN_ALERT TORTURE_FLAG | 433 | printk(KERN_ALERT TORTURE_FLAG |
434 | "--- Start of test: nreaders=%d stat_interval=%d verbose=%d\n", | 434 | "--- Start of test: nreaders=%d stat_interval=%d verbose=%d\n", |
435 | nrealreaders, stat_interval, verbose); | 435 | nrealreaders, stat_interval, verbose); |
436 | fullstop = 0; | 436 | fullstop = 0; |
437 | 437 | ||
438 | /* Set up the freelist. */ | 438 | /* Set up the freelist. */ |
439 | 439 | ||
440 | INIT_LIST_HEAD(&rcu_torture_freelist); | 440 | INIT_LIST_HEAD(&rcu_torture_freelist); |
441 | for (i = 0; i < sizeof(rcu_tortures) / sizeof(rcu_tortures[0]); i++) { | 441 | for (i = 0; i < sizeof(rcu_tortures) / sizeof(rcu_tortures[0]); i++) { |
442 | rcu_tortures[i].rtort_mbtest = 0; | 442 | rcu_tortures[i].rtort_mbtest = 0; |
443 | list_add_tail(&rcu_tortures[i].rtort_free, | 443 | list_add_tail(&rcu_tortures[i].rtort_free, |
444 | &rcu_torture_freelist); | 444 | &rcu_torture_freelist); |
445 | } | 445 | } |
446 | 446 | ||
447 | /* Initialize the statistics so that each run gets its own numbers. */ | 447 | /* Initialize the statistics so that each run gets its own numbers. */ |
448 | 448 | ||
449 | rcu_torture_current = NULL; | 449 | rcu_torture_current = NULL; |
450 | rcu_torture_current_version = 0; | 450 | rcu_torture_current_version = 0; |
451 | atomic_set(&n_rcu_torture_alloc, 0); | 451 | atomic_set(&n_rcu_torture_alloc, 0); |
452 | atomic_set(&n_rcu_torture_alloc_fail, 0); | 452 | atomic_set(&n_rcu_torture_alloc_fail, 0); |
453 | atomic_set(&n_rcu_torture_free, 0); | 453 | atomic_set(&n_rcu_torture_free, 0); |
454 | atomic_set(&n_rcu_torture_mberror, 0); | 454 | atomic_set(&n_rcu_torture_mberror, 0); |
455 | atomic_set(&n_rcu_torture_error, 0); | 455 | atomic_set(&n_rcu_torture_error, 0); |
456 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) | 456 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) |
457 | atomic_set(&rcu_torture_wcount[i], 0); | 457 | atomic_set(&rcu_torture_wcount[i], 0); |
458 | for_each_cpu(cpu) { | 458 | for_each_cpu(cpu) { |
459 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { | 459 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { |
460 | per_cpu(rcu_torture_count, cpu)[i] = 0; | 460 | per_cpu(rcu_torture_count, cpu)[i] = 0; |
461 | per_cpu(rcu_torture_batch, cpu)[i] = 0; | 461 | per_cpu(rcu_torture_batch, cpu)[i] = 0; |
462 | } | 462 | } |
463 | } | 463 | } |
464 | 464 | ||
465 | /* Start up the kthreads. */ | 465 | /* Start up the kthreads. */ |
466 | 466 | ||
467 | VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task"); | 467 | VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task"); |
468 | writer_task = kthread_run(rcu_torture_writer, NULL, | 468 | writer_task = kthread_run(rcu_torture_writer, NULL, |
469 | "rcu_torture_writer"); | 469 | "rcu_torture_writer"); |
470 | if (IS_ERR(writer_task)) { | 470 | if (IS_ERR(writer_task)) { |
471 | firsterr = PTR_ERR(writer_task); | 471 | firsterr = PTR_ERR(writer_task); |
472 | VERBOSE_PRINTK_ERRSTRING("Failed to create writer"); | 472 | VERBOSE_PRINTK_ERRSTRING("Failed to create writer"); |
473 | writer_task = NULL; | 473 | writer_task = NULL; |
474 | goto unwind; | 474 | goto unwind; |
475 | } | 475 | } |
476 | reader_tasks = kmalloc(nrealreaders * sizeof(reader_tasks[0]), | 476 | reader_tasks = kmalloc(nrealreaders * sizeof(reader_tasks[0]), |
477 | GFP_KERNEL); | 477 | GFP_KERNEL); |
478 | if (reader_tasks == NULL) { | 478 | if (reader_tasks == NULL) { |
479 | VERBOSE_PRINTK_ERRSTRING("out of memory"); | 479 | VERBOSE_PRINTK_ERRSTRING("out of memory"); |
480 | firsterr = -ENOMEM; | 480 | firsterr = -ENOMEM; |
481 | goto unwind; | 481 | goto unwind; |
482 | } | 482 | } |
483 | for (i = 0; i < nrealreaders; i++) { | 483 | for (i = 0; i < nrealreaders; i++) { |
484 | VERBOSE_PRINTK_STRING("Creating rcu_torture_reader task"); | 484 | VERBOSE_PRINTK_STRING("Creating rcu_torture_reader task"); |
485 | reader_tasks[i] = kthread_run(rcu_torture_reader, NULL, | 485 | reader_tasks[i] = kthread_run(rcu_torture_reader, NULL, |
486 | "rcu_torture_reader"); | 486 | "rcu_torture_reader"); |
487 | if (IS_ERR(reader_tasks[i])) { | 487 | if (IS_ERR(reader_tasks[i])) { |
488 | firsterr = PTR_ERR(reader_tasks[i]); | 488 | firsterr = PTR_ERR(reader_tasks[i]); |
489 | VERBOSE_PRINTK_ERRSTRING("Failed to create reader"); | 489 | VERBOSE_PRINTK_ERRSTRING("Failed to create reader"); |
490 | reader_tasks[i] = NULL; | 490 | reader_tasks[i] = NULL; |
491 | goto unwind; | 491 | goto unwind; |
492 | } | 492 | } |
493 | } | 493 | } |
494 | if (stat_interval > 0) { | 494 | if (stat_interval > 0) { |
495 | VERBOSE_PRINTK_STRING("Creating rcu_torture_stats task"); | 495 | VERBOSE_PRINTK_STRING("Creating rcu_torture_stats task"); |
496 | stats_task = kthread_run(rcu_torture_stats, NULL, | 496 | stats_task = kthread_run(rcu_torture_stats, NULL, |
497 | "rcu_torture_stats"); | 497 | "rcu_torture_stats"); |
498 | if (IS_ERR(stats_task)) { | 498 | if (IS_ERR(stats_task)) { |
499 | firsterr = PTR_ERR(stats_task); | 499 | firsterr = PTR_ERR(stats_task); |
500 | VERBOSE_PRINTK_ERRSTRING("Failed to create stats"); | 500 | VERBOSE_PRINTK_ERRSTRING("Failed to create stats"); |
501 | stats_task = NULL; | 501 | stats_task = NULL; |
502 | goto unwind; | 502 | goto unwind; |
503 | } | 503 | } |
504 | } | 504 | } |
505 | return 0; | 505 | return 0; |
506 | 506 | ||
507 | unwind: | 507 | unwind: |
508 | rcu_torture_cleanup(); | 508 | rcu_torture_cleanup(); |
509 | return firsterr; | 509 | return firsterr; |
510 | } | 510 | } |
511 | 511 | ||
512 | module_init(rcu_torture_init); | 512 | module_init(rcu_torture_init); |
513 | module_exit(rcu_torture_cleanup); | 513 | module_exit(rcu_torture_cleanup); |
514 | 514 |
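Note on rcu_random() above: it is a linear congruential generator whose 32-bit result has its 16-bit halfwords swapped by swahw32(), and which reseeds from get_random_bytes() every RCU_RANDOM_REFRESH calls. The sketch below reproduces the same recurrence stand-alone, with the periodic reseed omitted; it is illustrative only and not part of this commit.

/* Hypothetical stand-alone sketch of the rcu_random() recurrence. */
#include <stdio.h>

#define RCU_RANDOM_MULT 39916801	/* prime */
#define RCU_RANDOM_ADD  479001701	/* prime */

static unsigned long rrs_state = 1;	/* fixed seed; the kernel version reseeds periodically */

static long example_rcu_random(void)
{
	rrs_state = rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD;
	/* swahw32(): swap the two 16-bit halfwords of the low 32 bits */
	return ((rrs_state & 0xffffUL) << 16) | ((rrs_state >> 16) & 0xffffUL);
}

int main(void)
{
	int i;

	for (i = 0; i < 4; i++)
		printf("%ld\n", example_rcu_random());
	return 0;
}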
kernel/timer.c
1 | /* | 1 | /* |
2 | * linux/kernel/timer.c | 2 | * linux/kernel/timer.c |
3 | * | 3 | * |
4 | * Kernel internal timers, kernel timekeeping, basic process system calls | 4 | * Kernel internal timers, kernel timekeeping, basic process system calls |
5 | * | 5 | * |
6 | * Copyright (C) 1991, 1992 Linus Torvalds | 6 | * Copyright (C) 1991, 1992 Linus Torvalds |
7 | * | 7 | * |
8 | * 1997-01-28 Modified by Finn Arne Gangstad to make timers scale better. | 8 | * 1997-01-28 Modified by Finn Arne Gangstad to make timers scale better. |
9 | * | 9 | * |
10 | * 1997-09-10 Updated NTP code according to technical memorandum Jan '96 | 10 | * 1997-09-10 Updated NTP code according to technical memorandum Jan '96 |
11 | * "A Kernel Model for Precision Timekeeping" by Dave Mills | 11 | * "A Kernel Model for Precision Timekeeping" by Dave Mills |
12 | * 1998-12-24 Fixed a xtime SMP race (we need the xtime_lock rw spinlock to | 12 | * 1998-12-24 Fixed a xtime SMP race (we need the xtime_lock rw spinlock to |
13 | * serialize accesses to xtime/lost_ticks). | 13 | * serialize accesses to xtime/lost_ticks). |
14 | * Copyright (C) 1998 Andrea Arcangeli | 14 | * Copyright (C) 1998 Andrea Arcangeli |
15 | * 1999-03-10 Improved NTP compatibility by Ulrich Windl | 15 | * 1999-03-10 Improved NTP compatibility by Ulrich Windl |
16 | * 2002-05-31 Move sys_sysinfo here and make its locking sane, Robert Love | 16 | * 2002-05-31 Move sys_sysinfo here and make its locking sane, Robert Love |
17 | * 2000-10-05 Implemented scalable SMP per-CPU timer handling. | 17 | * 2000-10-05 Implemented scalable SMP per-CPU timer handling. |
18 | * Copyright (C) 2000, 2001, 2002 Ingo Molnar | 18 | * Copyright (C) 2000, 2001, 2002 Ingo Molnar |
19 | * Designed by David S. Miller, Alexey Kuznetsov and Ingo Molnar | 19 | * Designed by David S. Miller, Alexey Kuznetsov and Ingo Molnar |
20 | */ | 20 | */ |
21 | 21 | ||
22 | #include <linux/kernel_stat.h> | 22 | #include <linux/kernel_stat.h> |
23 | #include <linux/module.h> | 23 | #include <linux/module.h> |
24 | #include <linux/interrupt.h> | 24 | #include <linux/interrupt.h> |
25 | #include <linux/percpu.h> | 25 | #include <linux/percpu.h> |
26 | #include <linux/init.h> | 26 | #include <linux/init.h> |
27 | #include <linux/mm.h> | 27 | #include <linux/mm.h> |
28 | #include <linux/swap.h> | 28 | #include <linux/swap.h> |
29 | #include <linux/notifier.h> | 29 | #include <linux/notifier.h> |
30 | #include <linux/thread_info.h> | 30 | #include <linux/thread_info.h> |
31 | #include <linux/time.h> | 31 | #include <linux/time.h> |
32 | #include <linux/jiffies.h> | 32 | #include <linux/jiffies.h> |
33 | #include <linux/posix-timers.h> | 33 | #include <linux/posix-timers.h> |
34 | #include <linux/cpu.h> | 34 | #include <linux/cpu.h> |
35 | #include <linux/syscalls.h> | 35 | #include <linux/syscalls.h> |
36 | #include <linux/delay.h> | ||
36 | 37 | ||
37 | #include <asm/uaccess.h> | 38 | #include <asm/uaccess.h> |
38 | #include <asm/unistd.h> | 39 | #include <asm/unistd.h> |
39 | #include <asm/div64.h> | 40 | #include <asm/div64.h> |
40 | #include <asm/timex.h> | 41 | #include <asm/timex.h> |
41 | #include <asm/io.h> | 42 | #include <asm/io.h> |
42 | 43 | ||
43 | #ifdef CONFIG_TIME_INTERPOLATION | 44 | #ifdef CONFIG_TIME_INTERPOLATION |
44 | static void time_interpolator_update(long delta_nsec); | 45 | static void time_interpolator_update(long delta_nsec); |
45 | #else | 46 | #else |
46 | #define time_interpolator_update(x) | 47 | #define time_interpolator_update(x) |
47 | #endif | 48 | #endif |
48 | 49 | ||
49 | u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; | 50 | u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; |
50 | 51 | ||
51 | EXPORT_SYMBOL(jiffies_64); | 52 | EXPORT_SYMBOL(jiffies_64); |
52 | 53 | ||
53 | /* | 54 | /* |
54 | * per-CPU timer vector definitions: | 55 | * per-CPU timer vector definitions: |
55 | */ | 56 | */ |
56 | 57 | ||
57 | #define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6) | 58 | #define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6) |
58 | #define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8) | 59 | #define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8) |
59 | #define TVN_SIZE (1 << TVN_BITS) | 60 | #define TVN_SIZE (1 << TVN_BITS) |
60 | #define TVR_SIZE (1 << TVR_BITS) | 61 | #define TVR_SIZE (1 << TVR_BITS) |
61 | #define TVN_MASK (TVN_SIZE - 1) | 62 | #define TVN_MASK (TVN_SIZE - 1) |
62 | #define TVR_MASK (TVR_SIZE - 1) | 63 | #define TVR_MASK (TVR_SIZE - 1) |
63 | 64 | ||
64 | struct timer_base_s { | 65 | struct timer_base_s { |
65 | spinlock_t lock; | 66 | spinlock_t lock; |
66 | struct timer_list *running_timer; | 67 | struct timer_list *running_timer; |
67 | }; | 68 | }; |
68 | 69 | ||
69 | typedef struct tvec_s { | 70 | typedef struct tvec_s { |
70 | struct list_head vec[TVN_SIZE]; | 71 | struct list_head vec[TVN_SIZE]; |
71 | } tvec_t; | 72 | } tvec_t; |
72 | 73 | ||
73 | typedef struct tvec_root_s { | 74 | typedef struct tvec_root_s { |
74 | struct list_head vec[TVR_SIZE]; | 75 | struct list_head vec[TVR_SIZE]; |
75 | } tvec_root_t; | 76 | } tvec_root_t; |
76 | 77 | ||
77 | struct tvec_t_base_s { | 78 | struct tvec_t_base_s { |
78 | struct timer_base_s t_base; | 79 | struct timer_base_s t_base; |
79 | unsigned long timer_jiffies; | 80 | unsigned long timer_jiffies; |
80 | tvec_root_t tv1; | 81 | tvec_root_t tv1; |
81 | tvec_t tv2; | 82 | tvec_t tv2; |
82 | tvec_t tv3; | 83 | tvec_t tv3; |
83 | tvec_t tv4; | 84 | tvec_t tv4; |
84 | tvec_t tv5; | 85 | tvec_t tv5; |
85 | } ____cacheline_aligned_in_smp; | 86 | } ____cacheline_aligned_in_smp; |
86 | 87 | ||
87 | typedef struct tvec_t_base_s tvec_base_t; | 88 | typedef struct tvec_t_base_s tvec_base_t; |
88 | static DEFINE_PER_CPU(tvec_base_t, tvec_bases); | 89 | static DEFINE_PER_CPU(tvec_base_t, tvec_bases); |
89 | 90 | ||
90 | static inline void set_running_timer(tvec_base_t *base, | 91 | static inline void set_running_timer(tvec_base_t *base, |
91 | struct timer_list *timer) | 92 | struct timer_list *timer) |
92 | { | 93 | { |
93 | #ifdef CONFIG_SMP | 94 | #ifdef CONFIG_SMP |
94 | base->t_base.running_timer = timer; | 95 | base->t_base.running_timer = timer; |
95 | #endif | 96 | #endif |
96 | } | 97 | } |
97 | 98 | ||
98 | static void internal_add_timer(tvec_base_t *base, struct timer_list *timer) | 99 | static void internal_add_timer(tvec_base_t *base, struct timer_list *timer) |
99 | { | 100 | { |
100 | unsigned long expires = timer->expires; | 101 | unsigned long expires = timer->expires; |
101 | unsigned long idx = expires - base->timer_jiffies; | 102 | unsigned long idx = expires - base->timer_jiffies; |
102 | struct list_head *vec; | 103 | struct list_head *vec; |
103 | 104 | ||
104 | if (idx < TVR_SIZE) { | 105 | if (idx < TVR_SIZE) { |
105 | int i = expires & TVR_MASK; | 106 | int i = expires & TVR_MASK; |
106 | vec = base->tv1.vec + i; | 107 | vec = base->tv1.vec + i; |
107 | } else if (idx < 1 << (TVR_BITS + TVN_BITS)) { | 108 | } else if (idx < 1 << (TVR_BITS + TVN_BITS)) { |
108 | int i = (expires >> TVR_BITS) & TVN_MASK; | 109 | int i = (expires >> TVR_BITS) & TVN_MASK; |
109 | vec = base->tv2.vec + i; | 110 | vec = base->tv2.vec + i; |
110 | } else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) { | 111 | } else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) { |
111 | int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK; | 112 | int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK; |
112 | vec = base->tv3.vec + i; | 113 | vec = base->tv3.vec + i; |
113 | } else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) { | 114 | } else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) { |
114 | int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK; | 115 | int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK; |
115 | vec = base->tv4.vec + i; | 116 | vec = base->tv4.vec + i; |
116 | } else if ((signed long) idx < 0) { | 117 | } else if ((signed long) idx < 0) { |
117 | /* | 118 | /* |
118 | * Can happen if you add a timer with expires == jiffies, | 119 | * Can happen if you add a timer with expires == jiffies, |
119 | * or you set a timer to go off in the past | 120 | * or you set a timer to go off in the past |
120 | */ | 121 | */ |
121 | vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK); | 122 | vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK); |
122 | } else { | 123 | } else { |
123 | int i; | 124 | int i; |
124 | /* If the timeout is larger than 0xffffffff on 64-bit | 125 | /* If the timeout is larger than 0xffffffff on 64-bit |
125 | * architectures then we use the maximum timeout: | 126 | * architectures then we use the maximum timeout: |
126 | */ | 127 | */ |
127 | if (idx > 0xffffffffUL) { | 128 | if (idx > 0xffffffffUL) { |
128 | idx = 0xffffffffUL; | 129 | idx = 0xffffffffUL; |
129 | expires = idx + base->timer_jiffies; | 130 | expires = idx + base->timer_jiffies; |
130 | } | 131 | } |
131 | i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK; | 132 | i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK; |
132 | vec = base->tv5.vec + i; | 133 | vec = base->tv5.vec + i; |
133 | } | 134 | } |
134 | /* | 135 | /* |
135 | * Timers are FIFO: | 136 | * Timers are FIFO: |
136 | */ | 137 | */ |
137 | list_add_tail(&timer->entry, vec); | 138 | list_add_tail(&timer->entry, vec); |
138 | } | 139 | } |
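/*
 * A minimal user-space sketch of the bucket arithmetic used by
 * internal_add_timer() above, assuming CONFIG_BASE_SMALL=0 (TVR_BITS=8,
 * TVN_BITS=6).  It is not taken from this diff; it only shows which wheel
 * level a given expiry delta would land in.
 */
#include <stdio.h>

#define TVN_BITS 6
#define TVR_BITS 8

static const char *wheel_level(unsigned long idx)
{
	if (idx < (1UL << TVR_BITS))
		return "tv1";			/* next 256 jiffies */
	if (idx < (1UL << (TVR_BITS + TVN_BITS)))
		return "tv2";
	if (idx < (1UL << (TVR_BITS + 2 * TVN_BITS)))
		return "tv3";
	if (idx < (1UL << (TVR_BITS + 3 * TVN_BITS)))
		return "tv4";
	return "tv5";				/* everything further out */
}

int main(void)
{
	unsigned long deltas[] = { 1, 200, 5000, 2000000, 400000000 };
	int i;

	for (i = 0; i < 5; i++)
		printf("delta %9lu jiffies -> %s\n", deltas[i], wheel_level(deltas[i]));
	return 0;
}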
139 | 140 | ||
140 | typedef struct timer_base_s timer_base_t; | 141 | typedef struct timer_base_s timer_base_t; |
141 | /* | 142 | /* |
142 | * Used by TIMER_INITIALIZER, we can't use per_cpu(tvec_bases) | 143 | * Used by TIMER_INITIALIZER, we can't use per_cpu(tvec_bases) |
143 | * at compile time, and we need timer->base to lock the timer. | 144 | * at compile time, and we need timer->base to lock the timer. |
144 | */ | 145 | */ |
145 | timer_base_t __init_timer_base | 146 | timer_base_t __init_timer_base |
146 | ____cacheline_aligned_in_smp = { .lock = SPIN_LOCK_UNLOCKED }; | 147 | ____cacheline_aligned_in_smp = { .lock = SPIN_LOCK_UNLOCKED }; |
147 | EXPORT_SYMBOL(__init_timer_base); | 148 | EXPORT_SYMBOL(__init_timer_base); |
148 | 149 | ||
149 | /*** | 150 | /*** |
150 | * init_timer - initialize a timer. | 151 | * init_timer - initialize a timer. |
151 | * @timer: the timer to be initialized | 152 | * @timer: the timer to be initialized |
152 | * | 153 | * |
153 | * init_timer() must be done to a timer prior to calling *any* of the | 154 | * init_timer() must be done to a timer prior to calling *any* of the |
154 | * other timer functions. | 155 | * other timer functions. |
155 | */ | 156 | */ |
156 | void fastcall init_timer(struct timer_list *timer) | 157 | void fastcall init_timer(struct timer_list *timer) |
157 | { | 158 | { |
158 | timer->entry.next = NULL; | 159 | timer->entry.next = NULL; |
159 | timer->base = &per_cpu(tvec_bases, raw_smp_processor_id()).t_base; | 160 | timer->base = &per_cpu(tvec_bases, raw_smp_processor_id()).t_base; |
160 | } | 161 | } |
161 | EXPORT_SYMBOL(init_timer); | 162 | EXPORT_SYMBOL(init_timer); |
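/*
 * A minimal usage sketch of the API documented above, in the style of a
 * driver of this era.  struct my_dev, my_timeout() and the one-second
 * expiry are hypothetical and not taken from this diff.
 */
struct my_dev {
	struct timer_list timer;
	unsigned long ticks;
};

static void my_timeout(unsigned long data)
{
	struct my_dev *dev = (struct my_dev *)data;

	dev->ticks++;			/* softirq context: must not sleep */
}

static void my_dev_start(struct my_dev *dev)
{
	init_timer(&dev->timer);	/* must come before any other timer call */
	dev->timer.function = my_timeout;
	dev->timer.data = (unsigned long)dev;
	dev->timer.expires = jiffies + HZ;	/* fire in about one second */
	add_timer(&dev->timer);
}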
162 | 163 | ||
163 | static inline void detach_timer(struct timer_list *timer, | 164 | static inline void detach_timer(struct timer_list *timer, |
164 | int clear_pending) | 165 | int clear_pending) |
165 | { | 166 | { |
166 | struct list_head *entry = &timer->entry; | 167 | struct list_head *entry = &timer->entry; |
167 | 168 | ||
168 | __list_del(entry->prev, entry->next); | 169 | __list_del(entry->prev, entry->next); |
169 | if (clear_pending) | 170 | if (clear_pending) |
170 | entry->next = NULL; | 171 | entry->next = NULL; |
171 | entry->prev = LIST_POISON2; | 172 | entry->prev = LIST_POISON2; |
172 | } | 173 | } |
173 | 174 | ||
174 | /* | 175 | /* |
175 | * We are using hashed locking: holding per_cpu(tvec_bases).t_base.lock | 176 | * We are using hashed locking: holding per_cpu(tvec_bases).t_base.lock |
176 | * means that all timers which are tied to this base via timer->base are | 177 | * means that all timers which are tied to this base via timer->base are |
177 | * locked, and the base itself is locked too. | 178 | * locked, and the base itself is locked too. |
178 | * | 179 | * |
179 | * So __run_timers/migrate_timers can safely modify all timers which could | 180 | * So __run_timers/migrate_timers can safely modify all timers which could |
180 | * be found on ->tvX lists. | 181 | * be found on ->tvX lists. |
181 | * | 182 | * |
182 | * When the timer's base is locked, and the timer removed from list, it is | 183 | * When the timer's base is locked, and the timer removed from list, it is |
183 | * possible to set timer->base = NULL and drop the lock: the timer remains | 184 | * possible to set timer->base = NULL and drop the lock: the timer remains |
184 | * locked. | 185 | * locked. |
185 | */ | 186 | */ |
186 | static timer_base_t *lock_timer_base(struct timer_list *timer, | 187 | static timer_base_t *lock_timer_base(struct timer_list *timer, |
187 | unsigned long *flags) | 188 | unsigned long *flags) |
188 | { | 189 | { |
189 | timer_base_t *base; | 190 | timer_base_t *base; |
190 | 191 | ||
191 | for (;;) { | 192 | for (;;) { |
192 | base = timer->base; | 193 | base = timer->base; |
193 | if (likely(base != NULL)) { | 194 | if (likely(base != NULL)) { |
194 | spin_lock_irqsave(&base->lock, *flags); | 195 | spin_lock_irqsave(&base->lock, *flags); |
195 | if (likely(base == timer->base)) | 196 | if (likely(base == timer->base)) |
196 | return base; | 197 | return base; |
197 | /* The timer has migrated to another CPU */ | 198 | /* The timer has migrated to another CPU */ |
198 | spin_unlock_irqrestore(&base->lock, *flags); | 199 | spin_unlock_irqrestore(&base->lock, *flags); |
199 | } | 200 | } |
200 | cpu_relax(); | 201 | cpu_relax(); |
201 | } | 202 | } |
202 | } | 203 | } |
203 | 204 | ||
204 | int __mod_timer(struct timer_list *timer, unsigned long expires) | 205 | int __mod_timer(struct timer_list *timer, unsigned long expires) |
205 | { | 206 | { |
206 | timer_base_t *base; | 207 | timer_base_t *base; |
207 | tvec_base_t *new_base; | 208 | tvec_base_t *new_base; |
208 | unsigned long flags; | 209 | unsigned long flags; |
209 | int ret = 0; | 210 | int ret = 0; |
210 | 211 | ||
211 | BUG_ON(!timer->function); | 212 | BUG_ON(!timer->function); |
212 | 213 | ||
213 | base = lock_timer_base(timer, &flags); | 214 | base = lock_timer_base(timer, &flags); |
214 | 215 | ||
215 | if (timer_pending(timer)) { | 216 | if (timer_pending(timer)) { |
216 | detach_timer(timer, 0); | 217 | detach_timer(timer, 0); |
217 | ret = 1; | 218 | ret = 1; |
218 | } | 219 | } |
219 | 220 | ||
220 | new_base = &__get_cpu_var(tvec_bases); | 221 | new_base = &__get_cpu_var(tvec_bases); |
221 | 222 | ||
222 | if (base != &new_base->t_base) { | 223 | if (base != &new_base->t_base) { |
223 | /* | 224 | /* |
224 | * We are trying to schedule the timer on the local CPU. | 225 | * We are trying to schedule the timer on the local CPU. |
225 | * However we can't change timer's base while it is running, | 226 | * However we can't change timer's base while it is running, |
226 | * otherwise del_timer_sync() can't detect that the timer's | 227 | * otherwise del_timer_sync() can't detect that the timer's |
227 | * handler has not yet finished. This also guarantees that | 228 | * handler has not yet finished. This also guarantees that |
228 | * the timer is serialized wrt itself. | 229 | * the timer is serialized wrt itself. |
229 | */ | 230 | */ |
230 | if (unlikely(base->running_timer == timer)) { | 231 | if (unlikely(base->running_timer == timer)) { |
231 | /* The timer remains on a former base */ | 232 | /* The timer remains on a former base */ |
232 | new_base = container_of(base, tvec_base_t, t_base); | 233 | new_base = container_of(base, tvec_base_t, t_base); |
233 | } else { | 234 | } else { |
234 | /* See the comment in lock_timer_base() */ | 235 | /* See the comment in lock_timer_base() */ |
235 | timer->base = NULL; | 236 | timer->base = NULL; |
236 | spin_unlock(&base->lock); | 237 | spin_unlock(&base->lock); |
237 | spin_lock(&new_base->t_base.lock); | 238 | spin_lock(&new_base->t_base.lock); |
238 | timer->base = &new_base->t_base; | 239 | timer->base = &new_base->t_base; |
239 | } | 240 | } |
240 | } | 241 | } |
241 | 242 | ||
242 | timer->expires = expires; | 243 | timer->expires = expires; |
243 | internal_add_timer(new_base, timer); | 244 | internal_add_timer(new_base, timer); |
244 | spin_unlock_irqrestore(&new_base->t_base.lock, flags); | 245 | spin_unlock_irqrestore(&new_base->t_base.lock, flags); |
245 | 246 | ||
246 | return ret; | 247 | return ret; |
247 | } | 248 | } |
248 | 249 | ||
249 | EXPORT_SYMBOL(__mod_timer); | 250 | EXPORT_SYMBOL(__mod_timer); |
250 | 251 | ||
251 | /*** | 252 | /*** |
252 | * add_timer_on - start a timer on a particular CPU | 253 | * add_timer_on - start a timer on a particular CPU |
253 | * @timer: the timer to be added | 254 | * @timer: the timer to be added |
254 | * @cpu: the CPU to start it on | 255 | * @cpu: the CPU to start it on |
255 | * | 256 | * |
256 | * This is not very scalable on SMP. Double adds are not possible. | 257 | * This is not very scalable on SMP. Double adds are not possible. |
257 | */ | 258 | */ |
258 | void add_timer_on(struct timer_list *timer, int cpu) | 259 | void add_timer_on(struct timer_list *timer, int cpu) |
259 | { | 260 | { |
260 | tvec_base_t *base = &per_cpu(tvec_bases, cpu); | 261 | tvec_base_t *base = &per_cpu(tvec_bases, cpu); |
261 | unsigned long flags; | 262 | unsigned long flags; |
262 | 263 | ||
263 | BUG_ON(timer_pending(timer) || !timer->function); | 264 | BUG_ON(timer_pending(timer) || !timer->function); |
264 | spin_lock_irqsave(&base->t_base.lock, flags); | 265 | spin_lock_irqsave(&base->t_base.lock, flags); |
265 | timer->base = &base->t_base; | 266 | timer->base = &base->t_base; |
266 | internal_add_timer(base, timer); | 267 | internal_add_timer(base, timer); |
267 | spin_unlock_irqrestore(&base->t_base.lock, flags); | 268 | spin_unlock_irqrestore(&base->t_base.lock, flags); |
268 | } | 269 | } |
269 | 270 | ||
270 | 271 | ||
271 | /*** | 272 | /*** |
272 | * mod_timer - modify a timer's timeout | 273 | * mod_timer - modify a timer's timeout |
273 | * @timer: the timer to be modified | 274 | * @timer: the timer to be modified |
274 | * | 275 | * |
275 | * mod_timer is a more efficient way to update the expires field of an | 276 | * mod_timer is a more efficient way to update the expires field of an |
276 | * active timer (if the timer is inactive it will be activated). | 277 | * active timer (if the timer is inactive it will be activated). |
277 | * | 278 | * |
278 | * mod_timer(timer, expires) is equivalent to: | 279 | * mod_timer(timer, expires) is equivalent to: |
279 | * | 280 | * |
280 | * del_timer(timer); timer->expires = expires; add_timer(timer); | 281 | * del_timer(timer); timer->expires = expires; add_timer(timer); |
281 | * | 282 | * |
282 | * Note that if there are multiple unserialized concurrent users of the | 283 | * Note that if there are multiple unserialized concurrent users of the |
283 | * same timer, then mod_timer() is the only safe way to modify the timeout, | 284 | * same timer, then mod_timer() is the only safe way to modify the timeout, |
284 | * since add_timer() cannot modify an already running timer. | 285 | * since add_timer() cannot modify an already running timer. |
285 | * | 286 | * |
286 | * The function returns whether it has modified a pending timer or not. | 287 | * The function returns whether it has modified a pending timer or not. |
287 | * (ie. mod_timer() of an inactive timer returns 0, mod_timer() of an | 288 | * (ie. mod_timer() of an inactive timer returns 0, mod_timer() of an |
288 | * active timer returns 1.) | 289 | * active timer returns 1.) |
289 | */ | 290 | */ |
290 | int mod_timer(struct timer_list *timer, unsigned long expires) | 291 | int mod_timer(struct timer_list *timer, unsigned long expires) |
291 | { | 292 | { |
292 | BUG_ON(!timer->function); | 293 | BUG_ON(!timer->function); |
293 | 294 | ||
294 | /* | 295 | /* |
295 | * This is a common optimization triggered by the | 296 | * This is a common optimization triggered by the |
296 | * networking code - if the timer is re-modified | 297 | * networking code - if the timer is re-modified |
297 | * to be the same thing then just return: | 298 | * to be the same thing then just return: |
298 | */ | 299 | */ |
299 | if (timer->expires == expires && timer_pending(timer)) | 300 | if (timer->expires == expires && timer_pending(timer)) |
300 | return 1; | 301 | return 1; |
301 | 302 | ||
302 | return __mod_timer(timer, expires); | 303 | return __mod_timer(timer, expires); |
303 | } | 304 | } |
304 | 305 | ||
305 | EXPORT_SYMBOL(mod_timer); | 306 | EXPORT_SYMBOL(mod_timer); |
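/*
 * A sketch of the common "re-arm on activity" pattern that the comment above
 * describes; wd_timer and WD_TIMEOUT are hypothetical names, not from this
 * diff.  mod_timer() is safe whether or not the timer is currently pending.
 */
#define WD_TIMEOUT	(5 * HZ)

static struct timer_list wd_timer;

static void wd_kick(void)
{
	/* equivalent to del_timer(); expires = ...; add_timer(), but atomic */
	mod_timer(&wd_timer, jiffies + WD_TIMEOUT);
}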
306 | 307 | ||
307 | /*** | 308 | /*** |
308 | * del_timer - deactivate a timer. | 309 | * del_timer - deactivate a timer. |
309 | * @timer: the timer to be deactivated | 310 | * @timer: the timer to be deactivated |
310 | * | 311 | * |
311 | * del_timer() deactivates a timer - this works on both active and inactive | 312 | * del_timer() deactivates a timer - this works on both active and inactive |
312 | * timers. | 313 | * timers. |
313 | * | 314 | * |
314 | * The function returns whether it has deactivated a pending timer or not. | 315 | * The function returns whether it has deactivated a pending timer or not. |
315 | * (ie. del_timer() of an inactive timer returns 0, del_timer() of an | 316 | * (ie. del_timer() of an inactive timer returns 0, del_timer() of an |
316 | * active timer returns 1.) | 317 | * active timer returns 1.) |
317 | */ | 318 | */ |
318 | int del_timer(struct timer_list *timer) | 319 | int del_timer(struct timer_list *timer) |
319 | { | 320 | { |
320 | timer_base_t *base; | 321 | timer_base_t *base; |
321 | unsigned long flags; | 322 | unsigned long flags; |
322 | int ret = 0; | 323 | int ret = 0; |
323 | 324 | ||
324 | if (timer_pending(timer)) { | 325 | if (timer_pending(timer)) { |
325 | base = lock_timer_base(timer, &flags); | 326 | base = lock_timer_base(timer, &flags); |
326 | if (timer_pending(timer)) { | 327 | if (timer_pending(timer)) { |
327 | detach_timer(timer, 1); | 328 | detach_timer(timer, 1); |
328 | ret = 1; | 329 | ret = 1; |
329 | } | 330 | } |
330 | spin_unlock_irqrestore(&base->lock, flags); | 331 | spin_unlock_irqrestore(&base->lock, flags); |
331 | } | 332 | } |
332 | 333 | ||
333 | return ret; | 334 | return ret; |
334 | } | 335 | } |
335 | 336 | ||
336 | EXPORT_SYMBOL(del_timer); | 337 | EXPORT_SYMBOL(del_timer); |
337 | 338 | ||
338 | #ifdef CONFIG_SMP | 339 | #ifdef CONFIG_SMP |
339 | /* | 340 | /* |
340 | * This function tries to deactivate a timer. Upon successful (ret >= 0) | 341 | * This function tries to deactivate a timer. Upon successful (ret >= 0) |
341 | * exit the timer is not queued and the handler is not running on any CPU. | 342 | * exit the timer is not queued and the handler is not running on any CPU. |
342 | * | 343 | * |
343 | * It must not be called from interrupt contexts. | 344 | * It must not be called from interrupt contexts. |
344 | */ | 345 | */ |
345 | int try_to_del_timer_sync(struct timer_list *timer) | 346 | int try_to_del_timer_sync(struct timer_list *timer) |
346 | { | 347 | { |
347 | timer_base_t *base; | 348 | timer_base_t *base; |
348 | unsigned long flags; | 349 | unsigned long flags; |
349 | int ret = -1; | 350 | int ret = -1; |
350 | 351 | ||
351 | base = lock_timer_base(timer, &flags); | 352 | base = lock_timer_base(timer, &flags); |
352 | 353 | ||
353 | if (base->running_timer == timer) | 354 | if (base->running_timer == timer) |
354 | goto out; | 355 | goto out; |
355 | 356 | ||
356 | ret = 0; | 357 | ret = 0; |
357 | if (timer_pending(timer)) { | 358 | if (timer_pending(timer)) { |
358 | detach_timer(timer, 1); | 359 | detach_timer(timer, 1); |
359 | ret = 1; | 360 | ret = 1; |
360 | } | 361 | } |
361 | out: | 362 | out: |
362 | spin_unlock_irqrestore(&base->lock, flags); | 363 | spin_unlock_irqrestore(&base->lock, flags); |
363 | 364 | ||
364 | return ret; | 365 | return ret; |
365 | } | 366 | } |
366 | 367 | ||
367 | /*** | 368 | /*** |
368 | * del_timer_sync - deactivate a timer and wait for the handler to finish. | 369 | * del_timer_sync - deactivate a timer and wait for the handler to finish. |
369 | * @timer: the timer to be deactivated | 370 | * @timer: the timer to be deactivated |
370 | * | 371 | * |
371 | * This function only differs from del_timer() on SMP: besides deactivating | 372 | * This function only differs from del_timer() on SMP: besides deactivating |
372 | * the timer it also makes sure the handler has finished executing on other | 373 | * the timer it also makes sure the handler has finished executing on other |
373 | * CPUs. | 374 | * CPUs. |
374 | * | 375 | * |
375 | * Synchronization rules: callers must prevent restarting of the timer, | 376 | * Synchronization rules: callers must prevent restarting of the timer, |
376 | * otherwise this function is meaningless. It must not be called from | 377 | * otherwise this function is meaningless. It must not be called from |
377 | * interrupt contexts. The caller must not hold locks which would prevent | 378 | * interrupt contexts. The caller must not hold locks which would prevent |
378 | * completion of the timer's handler. The timer's handler must not call | 379 | * completion of the timer's handler. The timer's handler must not call |
379 | * add_timer_on(). Upon exit the timer is not queued and the handler is | 380 | * add_timer_on(). Upon exit the timer is not queued and the handler is |
380 | * not running on any CPU. | 381 | * not running on any CPU. |
381 | * | 382 | * |
382 | * The function returns whether it has deactivated a pending timer or not. | 383 | * The function returns whether it has deactivated a pending timer or not. |
383 | */ | 384 | */ |
384 | int del_timer_sync(struct timer_list *timer) | 385 | int del_timer_sync(struct timer_list *timer) |
385 | { | 386 | { |
386 | for (;;) { | 387 | for (;;) { |
387 | int ret = try_to_del_timer_sync(timer); | 388 | int ret = try_to_del_timer_sync(timer); |
388 | if (ret >= 0) | 389 | if (ret >= 0) |
389 | return ret; | 390 | return ret; |
390 | } | 391 | } |
391 | } | 392 | } |
392 | 393 | ||
393 | EXPORT_SYMBOL(del_timer_sync); | 394 | EXPORT_SYMBOL(del_timer_sync); |
394 | #endif | 395 | #endif |
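/*
 * A tear-down sketch following the del_timer_sync() rules above: stop
 * re-arming first, then wait for any running handler, and only then free
 * what the handler touches.  The shutting_down flag and buf member extend
 * the hypothetical my_dev from the earlier sketch.
 */
static void my_dev_stop(struct my_dev *dev)
{
	dev->shutting_down = 1;		/* handler sees this and stops re-arming */
	del_timer_sync(&dev->timer);	/* waits out a handler running elsewhere */
	kfree(dev->buf);		/* safe now: the handler can no longer run */
}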
395 | 396 | ||
396 | static int cascade(tvec_base_t *base, tvec_t *tv, int index) | 397 | static int cascade(tvec_base_t *base, tvec_t *tv, int index) |
397 | { | 398 | { |
398 | /* cascade all the timers from tv up one level */ | 399 | /* cascade all the timers from tv up one level */ |
399 | struct list_head *head, *curr; | 400 | struct list_head *head, *curr; |
400 | 401 | ||
401 | head = tv->vec + index; | 402 | head = tv->vec + index; |
402 | curr = head->next; | 403 | curr = head->next; |
403 | /* | 404 | /* |
404 | * We are removing _all_ timers from the list, so we don't have to | 405 | * We are removing _all_ timers from the list, so we don't have to |
405 | * detach them individually, just clear the list afterwards. | 406 | * detach them individually, just clear the list afterwards. |
406 | */ | 407 | */ |
407 | while (curr != head) { | 408 | while (curr != head) { |
408 | struct timer_list *tmp; | 409 | struct timer_list *tmp; |
409 | 410 | ||
410 | tmp = list_entry(curr, struct timer_list, entry); | 411 | tmp = list_entry(curr, struct timer_list, entry); |
411 | BUG_ON(tmp->base != &base->t_base); | 412 | BUG_ON(tmp->base != &base->t_base); |
412 | curr = curr->next; | 413 | curr = curr->next; |
413 | internal_add_timer(base, tmp); | 414 | internal_add_timer(base, tmp); |
414 | } | 415 | } |
415 | INIT_LIST_HEAD(head); | 416 | INIT_LIST_HEAD(head); |
416 | 417 | ||
417 | return index; | 418 | return index; |
418 | } | 419 | } |
419 | 420 | ||
420 | /*** | 421 | /*** |
421 | * __run_timers - run all expired timers (if any) on this CPU. | 422 | * __run_timers - run all expired timers (if any) on this CPU. |
422 | * @base: the timer vector to be processed. | 423 | * @base: the timer vector to be processed. |
423 | * | 424 | * |
424 | * This function cascades all vectors and runs all the timers that | 425 | * This function cascades all vectors and runs all the timers that |
425 | * have expired. | 426 | * have expired. |
426 | */ | 427 | */ |
427 | #define INDEX(N) (base->timer_jiffies >> (TVR_BITS + N * TVN_BITS)) & TVN_MASK | 428 | #define INDEX(N) (base->timer_jiffies >> (TVR_BITS + N * TVN_BITS)) & TVN_MASK |
428 | 429 | ||
429 | static inline void __run_timers(tvec_base_t *base) | 430 | static inline void __run_timers(tvec_base_t *base) |
430 | { | 431 | { |
431 | struct timer_list *timer; | 432 | struct timer_list *timer; |
432 | 433 | ||
433 | spin_lock_irq(&base->t_base.lock); | 434 | spin_lock_irq(&base->t_base.lock); |
434 | while (time_after_eq(jiffies, base->timer_jiffies)) { | 435 | while (time_after_eq(jiffies, base->timer_jiffies)) { |
435 | struct list_head work_list = LIST_HEAD_INIT(work_list); | 436 | struct list_head work_list = LIST_HEAD_INIT(work_list); |
436 | struct list_head *head = &work_list; | 437 | struct list_head *head = &work_list; |
437 | int index = base->timer_jiffies & TVR_MASK; | 438 | int index = base->timer_jiffies & TVR_MASK; |
438 | 439 | ||
439 | /* | 440 | /* |
440 | * Cascade timers: | 441 | * Cascade timers: |
441 | */ | 442 | */ |
442 | if (!index && | 443 | if (!index && |
443 | (!cascade(base, &base->tv2, INDEX(0))) && | 444 | (!cascade(base, &base->tv2, INDEX(0))) && |
444 | (!cascade(base, &base->tv3, INDEX(1))) && | 445 | (!cascade(base, &base->tv3, INDEX(1))) && |
445 | !cascade(base, &base->tv4, INDEX(2))) | 446 | !cascade(base, &base->tv4, INDEX(2))) |
446 | cascade(base, &base->tv5, INDEX(3)); | 447 | cascade(base, &base->tv5, INDEX(3)); |
447 | ++base->timer_jiffies; | 448 | ++base->timer_jiffies; |
448 | list_splice_init(base->tv1.vec + index, &work_list); | 449 | list_splice_init(base->tv1.vec + index, &work_list); |
449 | while (!list_empty(head)) { | 450 | while (!list_empty(head)) { |
450 | void (*fn)(unsigned long); | 451 | void (*fn)(unsigned long); |
451 | unsigned long data; | 452 | unsigned long data; |
452 | 453 | ||
453 | timer = list_entry(head->next,struct timer_list,entry); | 454 | timer = list_entry(head->next,struct timer_list,entry); |
454 | fn = timer->function; | 455 | fn = timer->function; |
455 | data = timer->data; | 456 | data = timer->data; |
456 | 457 | ||
457 | set_running_timer(base, timer); | 458 | set_running_timer(base, timer); |
458 | detach_timer(timer, 1); | 459 | detach_timer(timer, 1); |
459 | spin_unlock_irq(&base->t_base.lock); | 460 | spin_unlock_irq(&base->t_base.lock); |
460 | { | 461 | { |
461 | int preempt_count = preempt_count(); | 462 | int preempt_count = preempt_count(); |
462 | fn(data); | 463 | fn(data); |
463 | if (preempt_count != preempt_count()) { | 464 | if (preempt_count != preempt_count()) { |
464 | printk(KERN_WARNING "huh, entered %p " | 465 | printk(KERN_WARNING "huh, entered %p " |
465 | "with preempt_count %08x, exited" | 466 | "with preempt_count %08x, exited" |
466 | " with %08x?\n", | 467 | " with %08x?\n", |
467 | fn, preempt_count, | 468 | fn, preempt_count, |
468 | preempt_count()); | 469 | preempt_count()); |
469 | BUG(); | 470 | BUG(); |
470 | } | 471 | } |
471 | } | 472 | } |
472 | spin_lock_irq(&base->t_base.lock); | 473 | spin_lock_irq(&base->t_base.lock); |
473 | } | 474 | } |
474 | } | 475 | } |
475 | set_running_timer(base, NULL); | 476 | set_running_timer(base, NULL); |
476 | spin_unlock_irq(&base->t_base.lock); | 477 | spin_unlock_irq(&base->t_base.lock); |
477 | } | 478 | } |
478 | 479 | ||
479 | #ifdef CONFIG_NO_IDLE_HZ | 480 | #ifdef CONFIG_NO_IDLE_HZ |
480 | /* | 481 | /* |
481 | * Find out when the next timer event is due to happen. This | 482 | * Find out when the next timer event is due to happen. This |
482 | * is used on S/390 to stop all activity when a CPU is idle. | 483 | * is used on S/390 to stop all activity when a CPU is idle. |
483 | * This function needs to be called with interrupts disabled. | 484 | * This function needs to be called with interrupts disabled. |
484 | */ | 485 | */ |
485 | unsigned long next_timer_interrupt(void) | 486 | unsigned long next_timer_interrupt(void) |
486 | { | 487 | { |
487 | tvec_base_t *base; | 488 | tvec_base_t *base; |
488 | struct list_head *list; | 489 | struct list_head *list; |
489 | struct timer_list *nte; | 490 | struct timer_list *nte; |
490 | unsigned long expires; | 491 | unsigned long expires; |
491 | tvec_t *varray[4]; | 492 | tvec_t *varray[4]; |
492 | int i, j; | 493 | int i, j; |
493 | 494 | ||
494 | base = &__get_cpu_var(tvec_bases); | 495 | base = &__get_cpu_var(tvec_bases); |
495 | spin_lock(&base->t_base.lock); | 496 | spin_lock(&base->t_base.lock); |
496 | expires = base->timer_jiffies + (LONG_MAX >> 1); | 497 | expires = base->timer_jiffies + (LONG_MAX >> 1); |
497 | list = 0; | 498 | list = 0; |
498 | 499 | ||
499 | /* Look for timer events in tv1. */ | 500 | /* Look for timer events in tv1. */ |
500 | j = base->timer_jiffies & TVR_MASK; | 501 | j = base->timer_jiffies & TVR_MASK; |
501 | do { | 502 | do { |
502 | list_for_each_entry(nte, base->tv1.vec + j, entry) { | 503 | list_for_each_entry(nte, base->tv1.vec + j, entry) { |
503 | expires = nte->expires; | 504 | expires = nte->expires; |
504 | if (j < (base->timer_jiffies & TVR_MASK)) | 505 | if (j < (base->timer_jiffies & TVR_MASK)) |
505 | list = base->tv2.vec + (INDEX(0)); | 506 | list = base->tv2.vec + (INDEX(0)); |
506 | goto found; | 507 | goto found; |
507 | } | 508 | } |
508 | j = (j + 1) & TVR_MASK; | 509 | j = (j + 1) & TVR_MASK; |
509 | } while (j != (base->timer_jiffies & TVR_MASK)); | 510 | } while (j != (base->timer_jiffies & TVR_MASK)); |
510 | 511 | ||
511 | /* Check tv2-tv5. */ | 512 | /* Check tv2-tv5. */ |
512 | varray[0] = &base->tv2; | 513 | varray[0] = &base->tv2; |
513 | varray[1] = &base->tv3; | 514 | varray[1] = &base->tv3; |
514 | varray[2] = &base->tv4; | 515 | varray[2] = &base->tv4; |
515 | varray[3] = &base->tv5; | 516 | varray[3] = &base->tv5; |
516 | for (i = 0; i < 4; i++) { | 517 | for (i = 0; i < 4; i++) { |
517 | j = INDEX(i); | 518 | j = INDEX(i); |
518 | do { | 519 | do { |
519 | if (list_empty(varray[i]->vec + j)) { | 520 | if (list_empty(varray[i]->vec + j)) { |
520 | j = (j + 1) & TVN_MASK; | 521 | j = (j + 1) & TVN_MASK; |
521 | continue; | 522 | continue; |
522 | } | 523 | } |
523 | list_for_each_entry(nte, varray[i]->vec + j, entry) | 524 | list_for_each_entry(nte, varray[i]->vec + j, entry) |
524 | if (time_before(nte->expires, expires)) | 525 | if (time_before(nte->expires, expires)) |
525 | expires = nte->expires; | 526 | expires = nte->expires; |
526 | if (j < (INDEX(i)) && i < 3) | 527 | if (j < (INDEX(i)) && i < 3) |
527 | list = varray[i + 1]->vec + (INDEX(i + 1)); | 528 | list = varray[i + 1]->vec + (INDEX(i + 1)); |
528 | goto found; | 529 | goto found; |
529 | } while (j != (INDEX(i))); | 530 | } while (j != (INDEX(i))); |
530 | } | 531 | } |
531 | found: | 532 | found: |
532 | if (list) { | 533 | if (list) { |
533 | /* | 534 | /* |
534 | * The search wrapped. We need to look at the next list | 535 | * The search wrapped. We need to look at the next list |
535 | * from next tv element that would cascade into tv element | 536 | * from next tv element that would cascade into tv element |
536 | * where we found the timer element. | 537 | * where we found the timer element. |
537 | */ | 538 | */ |
538 | list_for_each_entry(nte, list, entry) { | 539 | list_for_each_entry(nte, list, entry) { |
539 | if (time_before(nte->expires, expires)) | 540 | if (time_before(nte->expires, expires)) |
540 | expires = nte->expires; | 541 | expires = nte->expires; |
541 | } | 542 | } |
542 | } | 543 | } |
543 | spin_unlock(&base->t_base.lock); | 544 | spin_unlock(&base->t_base.lock); |
544 | return expires; | 545 | return expires; |
545 | } | 546 | } |
546 | #endif | 547 | #endif |
547 | 548 | ||
548 | /******************************************************************/ | 549 | /******************************************************************/ |
549 | 550 | ||
550 | /* | 551 | /* |
551 | * Timekeeping variables | 552 | * Timekeeping variables |
552 | */ | 553 | */ |
553 | unsigned long tick_usec = TICK_USEC; /* USER_HZ period (usec) */ | 554 | unsigned long tick_usec = TICK_USEC; /* USER_HZ period (usec) */ |
554 | unsigned long tick_nsec = TICK_NSEC; /* ACTHZ period (nsec) */ | 555 | unsigned long tick_nsec = TICK_NSEC; /* ACTHZ period (nsec) */ |
555 | 556 | ||
556 | /* | 557 | /* |
557 | * The current time | 558 | * The current time |
558 | * wall_to_monotonic is what we need to add to xtime (or xtime corrected | 559 | * wall_to_monotonic is what we need to add to xtime (or xtime corrected |
559 | * for sub jiffie times) to get to monotonic time. Monotonic is pegged | 560 | * for sub jiffie times) to get to monotonic time. Monotonic is pegged |
560 | * at zero at system boot time, so wall_to_monotonic will be negative, | 561 | * at zero at system boot time, so wall_to_monotonic will be negative, |
561 | * however, we will ALWAYS keep the tv_nsec part positive so we can use | 562 | * however, we will ALWAYS keep the tv_nsec part positive so we can use |
562 | * the usual normalization. | 563 | * the usual normalization. |
563 | */ | 564 | */ |
564 | struct timespec xtime __attribute__ ((aligned (16))); | 565 | struct timespec xtime __attribute__ ((aligned (16))); |
565 | struct timespec wall_to_monotonic __attribute__ ((aligned (16))); | 566 | struct timespec wall_to_monotonic __attribute__ ((aligned (16))); |
566 | 567 | ||
567 | EXPORT_SYMBOL(xtime); | 568 | EXPORT_SYMBOL(xtime); |
568 | 569 | ||
569 | /* Don't completely fail for HZ > 500. */ | 570 | /* Don't completely fail for HZ > 500. */ |
570 | int tickadj = 500/HZ ? : 1; /* microsecs */ | 571 | int tickadj = 500/HZ ? : 1; /* microsecs */ |
571 | 572 | ||
572 | 573 | ||
573 | /* | 574 | /* |
574 | * phase-lock loop variables | 575 | * phase-lock loop variables |
575 | */ | 576 | */ |
576 | /* TIME_ERROR prevents overwriting the CMOS clock */ | 577 | /* TIME_ERROR prevents overwriting the CMOS clock */ |
577 | int time_state = TIME_OK; /* clock synchronization status */ | 578 | int time_state = TIME_OK; /* clock synchronization status */ |
578 | int time_status = STA_UNSYNC; /* clock status bits */ | 579 | int time_status = STA_UNSYNC; /* clock status bits */ |
579 | long time_offset; /* time adjustment (us) */ | 580 | long time_offset; /* time adjustment (us) */ |
580 | long time_constant = 2; /* pll time constant */ | 581 | long time_constant = 2; /* pll time constant */ |
581 | long time_tolerance = MAXFREQ; /* frequency tolerance (ppm) */ | 582 | long time_tolerance = MAXFREQ; /* frequency tolerance (ppm) */ |
582 | long time_precision = 1; /* clock precision (us) */ | 583 | long time_precision = 1; /* clock precision (us) */ |
583 | long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */ | 584 | long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */ |
584 | long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ | 585 | long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ |
585 | static long time_phase; /* phase offset (scaled us) */ | 586 | static long time_phase; /* phase offset (scaled us) */ |
586 | long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC; | 587 | long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC; |
587 | /* frequency offset (scaled ppm)*/ | 588 | /* frequency offset (scaled ppm)*/ |
588 | static long time_adj; /* tick adjust (scaled 1 / HZ) */ | 589 | static long time_adj; /* tick adjust (scaled 1 / HZ) */ |
589 | long time_reftime; /* time at last adjustment (s) */ | 590 | long time_reftime; /* time at last adjustment (s) */ |
590 | long time_adjust; | 591 | long time_adjust; |
591 | long time_next_adjust; | 592 | long time_next_adjust; |
592 | 593 | ||
593 | /* | 594 | /* |
594 | * this routine handles the overflow of the microsecond field | 595 | * this routine handles the overflow of the microsecond field |
595 | * | 596 | * |
596 | * The tricky bits of code to handle the accurate clock support | 597 | * The tricky bits of code to handle the accurate clock support |
597 | * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame. | 598 | * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame. |
598 | * They were originally developed for SUN and DEC kernels. | 599 | * They were originally developed for SUN and DEC kernels. |
599 | * All the kudos should go to Dave for this stuff. | 600 | * All the kudos should go to Dave for this stuff. |
600 | * | 601 | * |
601 | */ | 602 | */ |
602 | static void second_overflow(void) | 603 | static void second_overflow(void) |
603 | { | 604 | { |
604 | long ltemp; | 605 | long ltemp; |
605 | 606 | ||
606 | /* Bump the maxerror field */ | 607 | /* Bump the maxerror field */ |
607 | time_maxerror += time_tolerance >> SHIFT_USEC; | 608 | time_maxerror += time_tolerance >> SHIFT_USEC; |
608 | if (time_maxerror > NTP_PHASE_LIMIT) { | 609 | if (time_maxerror > NTP_PHASE_LIMIT) { |
609 | time_maxerror = NTP_PHASE_LIMIT; | 610 | time_maxerror = NTP_PHASE_LIMIT; |
610 | time_status |= STA_UNSYNC; | 611 | time_status |= STA_UNSYNC; |
611 | } | 612 | } |
612 | 613 | ||
613 | /* | 614 | /* |
614 | * Leap second processing. If in leap-insert state at the end of the | 615 | * Leap second processing. If in leap-insert state at the end of the |
615 | * day, the system clock is set back one second; if in leap-delete | 616 | * day, the system clock is set back one second; if in leap-delete |
616 | * state, the system clock is set ahead one second. The microtime() | 617 | * state, the system clock is set ahead one second. The microtime() |
617 | * routine or external clock driver will ensure that reported time is | 618 | * routine or external clock driver will ensure that reported time is |
618 | * always monotonic. The ugly divides should be replaced. | 619 | * always monotonic. The ugly divides should be replaced. |
619 | */ | 620 | */ |
620 | switch (time_state) { | 621 | switch (time_state) { |
621 | case TIME_OK: | 622 | case TIME_OK: |
622 | if (time_status & STA_INS) | 623 | if (time_status & STA_INS) |
623 | time_state = TIME_INS; | 624 | time_state = TIME_INS; |
624 | else if (time_status & STA_DEL) | 625 | else if (time_status & STA_DEL) |
625 | time_state = TIME_DEL; | 626 | time_state = TIME_DEL; |
626 | break; | 627 | break; |
627 | case TIME_INS: | 628 | case TIME_INS: |
628 | if (xtime.tv_sec % 86400 == 0) { | 629 | if (xtime.tv_sec % 86400 == 0) { |
629 | xtime.tv_sec--; | 630 | xtime.tv_sec--; |
630 | wall_to_monotonic.tv_sec++; | 631 | wall_to_monotonic.tv_sec++; |
631 | /* | 632 | /* |
632 | * The timer interpolator will make time change | 633 | * The timer interpolator will make time change |
633 | * gradually instead of an immediate jump by one second | 634 | * gradually instead of an immediate jump by one second |
634 | */ | 635 | */ |
635 | time_interpolator_update(-NSEC_PER_SEC); | 636 | time_interpolator_update(-NSEC_PER_SEC); |
636 | time_state = TIME_OOP; | 637 | time_state = TIME_OOP; |
637 | clock_was_set(); | 638 | clock_was_set(); |
638 | printk(KERN_NOTICE "Clock: inserting leap second " | 639 | printk(KERN_NOTICE "Clock: inserting leap second " |
639 | "23:59:60 UTC\n"); | 640 | "23:59:60 UTC\n"); |
640 | } | 641 | } |
641 | break; | 642 | break; |
642 | case TIME_DEL: | 643 | case TIME_DEL: |
643 | if ((xtime.tv_sec + 1) % 86400 == 0) { | 644 | if ((xtime.tv_sec + 1) % 86400 == 0) { |
644 | xtime.tv_sec++; | 645 | xtime.tv_sec++; |
645 | wall_to_monotonic.tv_sec--; | 646 | wall_to_monotonic.tv_sec--; |
646 | /* | 647 | /* |
647 | * Use of time interpolator for a gradual change of | 648 | * Use of time interpolator for a gradual change of |
648 | * time | 649 | * time |
649 | */ | 650 | */ |
650 | time_interpolator_update(NSEC_PER_SEC); | 651 | time_interpolator_update(NSEC_PER_SEC); |
651 | time_state = TIME_WAIT; | 652 | time_state = TIME_WAIT; |
652 | clock_was_set(); | 653 | clock_was_set(); |
653 | printk(KERN_NOTICE "Clock: deleting leap second " | 654 | printk(KERN_NOTICE "Clock: deleting leap second " |
654 | "23:59:59 UTC\n"); | 655 | "23:59:59 UTC\n"); |
655 | } | 656 | } |
656 | break; | 657 | break; |
657 | case TIME_OOP: | 658 | case TIME_OOP: |
658 | time_state = TIME_WAIT; | 659 | time_state = TIME_WAIT; |
659 | break; | 660 | break; |
660 | case TIME_WAIT: | 661 | case TIME_WAIT: |
661 | if (!(time_status & (STA_INS | STA_DEL))) | 662 | if (!(time_status & (STA_INS | STA_DEL))) |
662 | time_state = TIME_OK; | 663 | time_state = TIME_OK; |
663 | } | 664 | } |
664 | 665 | ||
665 | /* | 666 | /* |
666 | * Compute the phase adjustment for the next second. In PLL mode, the | 667 | * Compute the phase adjustment for the next second. In PLL mode, the |
667 | * offset is reduced by a fixed factor times the time constant. In FLL | 668 | * offset is reduced by a fixed factor times the time constant. In FLL |
668 | * mode the offset is used directly. In either mode, the maximum phase | 669 | * mode the offset is used directly. In either mode, the maximum phase |
669 | * adjustment for each second is clamped so as to spread the adjustment | 670 | * adjustment for each second is clamped so as to spread the adjustment |
670 | * over not more than the number of seconds between updates. | 671 | * over not more than the number of seconds between updates. |
671 | */ | 672 | */ |
672 | ltemp = time_offset; | 673 | ltemp = time_offset; |
673 | if (!(time_status & STA_FLL)) | 674 | if (!(time_status & STA_FLL)) |
674 | ltemp = shift_right(ltemp, SHIFT_KG + time_constant); | 675 | ltemp = shift_right(ltemp, SHIFT_KG + time_constant); |
675 | ltemp = min(ltemp, (MAXPHASE / MINSEC) << SHIFT_UPDATE); | 676 | ltemp = min(ltemp, (MAXPHASE / MINSEC) << SHIFT_UPDATE); |
676 | ltemp = max(ltemp, -(MAXPHASE / MINSEC) << SHIFT_UPDATE); | 677 | ltemp = max(ltemp, -(MAXPHASE / MINSEC) << SHIFT_UPDATE); |
677 | time_offset -= ltemp; | 678 | time_offset -= ltemp; |
678 | time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); | 679 | time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); |
679 | 680 | ||
680 | /* | 681 | /* |
681 | * Compute the frequency estimate and additional phase adjustment due | 682 | * Compute the frequency estimate and additional phase adjustment due |
682 | * to frequency error for the next second. When the PPS signal is | 683 | * to frequency error for the next second. When the PPS signal is |
683 | * engaged, gnaw on the watchdog counter and update the frequency | 684 | * engaged, gnaw on the watchdog counter and update the frequency |
684 | * computed by the pll and the PPS signal. | 685 | * computed by the pll and the PPS signal. |
685 | */ | 686 | */ |
686 | pps_valid++; | 687 | pps_valid++; |
687 | if (pps_valid == PPS_VALID) { /* PPS signal lost */ | 688 | if (pps_valid == PPS_VALID) { /* PPS signal lost */ |
688 | pps_jitter = MAXTIME; | 689 | pps_jitter = MAXTIME; |
689 | pps_stabil = MAXFREQ; | 690 | pps_stabil = MAXFREQ; |
690 | time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | | 691 | time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | |
691 | STA_PPSWANDER | STA_PPSERROR); | 692 | STA_PPSWANDER | STA_PPSERROR); |
692 | } | 693 | } |
693 | ltemp = time_freq + pps_freq; | 694 | ltemp = time_freq + pps_freq; |
694 | time_adj += shift_right(ltemp,(SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE)); | 695 | time_adj += shift_right(ltemp,(SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE)); |
695 | 696 | ||
696 | #if HZ == 100 | 697 | #if HZ == 100 |
697 | /* | 698 | /* |
698 | * Compensate for (HZ==100) != (1 << SHIFT_HZ). Add 25% and 3.125% to | 699 | * Compensate for (HZ==100) != (1 << SHIFT_HZ). Add 25% and 3.125% to |
699 | * get 128.125; => only 0.125% error (p. 14) | 700 | * get 128.125; => only 0.125% error (p. 14) |
700 | */ | 701 | */ |
701 | time_adj += shift_right(time_adj, 2) + shift_right(time_adj, 5); | 702 | time_adj += shift_right(time_adj, 2) + shift_right(time_adj, 5); |
702 | #endif | 703 | #endif |
703 | #if HZ == 250 | 704 | #if HZ == 250 |
704 | /* | 705 | /* |
705 | * Compensate for (HZ==250) != (1 << SHIFT_HZ). Add 1.5625% and | 706 | * Compensate for (HZ==250) != (1 << SHIFT_HZ). Add 1.5625% and |
706 | * 0.78125% to get 255.85938; => only 0.05% error (p. 14) | 707 | * 0.78125% to get 255.85938; => only 0.05% error (p. 14) |
707 | */ | 708 | */ |
708 | time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7); | 709 | time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7); |
709 | #endif | 710 | #endif |
710 | #if HZ == 1000 | 711 | #if HZ == 1000 |
711 | /* | 712 | /* |
712 | * Compensate for (HZ==1000) != (1 << SHIFT_HZ). Add 1.5625% and | 713 | * Compensate for (HZ==1000) != (1 << SHIFT_HZ). Add 1.5625% and |
713 | * 0.78125% to get 1023.4375; => only 0.05% error (p. 14) | 714 | * 0.78125% to get 1023.4375; => only 0.05% error (p. 14) |
714 | */ | 715 | */ |
715 | time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7); | 716 | time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7); |
716 | #endif | 717 | #endif |
717 | } | 718 | } |
718 | 719 | ||
719 | /* in the NTP reference this is called "hardclock()" */ | 720 | /* in the NTP reference this is called "hardclock()" */ |
720 | static void update_wall_time_one_tick(void) | 721 | static void update_wall_time_one_tick(void) |
721 | { | 722 | { |
722 | long time_adjust_step, delta_nsec; | 723 | long time_adjust_step, delta_nsec; |
723 | 724 | ||
724 | if ((time_adjust_step = time_adjust) != 0 ) { | 725 | if ((time_adjust_step = time_adjust) != 0 ) { |
725 | /* | 726 | /* |
726 | * We are doing an adjtime thing. Prepare time_adjust_step to | 727 | * We are doing an adjtime thing. Prepare time_adjust_step to |
727 | * be within bounds. Note that a positive time_adjust means we | 728 | * be within bounds. Note that a positive time_adjust means we |
728 | * want the clock to run faster. | 729 | * want the clock to run faster. |
729 | * | 730 | * |
730 | * Limit the amount of the step to be in the range | 731 | * Limit the amount of the step to be in the range |
731 | * -tickadj .. +tickadj | 732 | * -tickadj .. +tickadj |
732 | */ | 733 | */ |
733 | time_adjust_step = min(time_adjust_step, (long)tickadj); | 734 | time_adjust_step = min(time_adjust_step, (long)tickadj); |
734 | time_adjust_step = max(time_adjust_step, (long)-tickadj); | 735 | time_adjust_step = max(time_adjust_step, (long)-tickadj); |
735 | 736 | ||
736 | /* Reduce by this step the amount of time left */ | 737 | /* Reduce by this step the amount of time left */ |
737 | time_adjust -= time_adjust_step; | 738 | time_adjust -= time_adjust_step; |
738 | } | 739 | } |
739 | delta_nsec = tick_nsec + time_adjust_step * 1000; | 740 | delta_nsec = tick_nsec + time_adjust_step * 1000; |
740 | /* | 741 | /* |
741 | * Advance the phase, once it gets to one microsecond, then | 742 | * Advance the phase, once it gets to one microsecond, then |
742 | * advance the tick more. | 743 | * advance the tick more. |
743 | */ | 744 | */ |
744 | time_phase += time_adj; | 745 | time_phase += time_adj; |
745 | if ((time_phase >= FINENSEC) || (time_phase <= -FINENSEC)) { | 746 | if ((time_phase >= FINENSEC) || (time_phase <= -FINENSEC)) { |
746 | long ltemp = shift_right(time_phase, (SHIFT_SCALE - 10)); | 747 | long ltemp = shift_right(time_phase, (SHIFT_SCALE - 10)); |
747 | time_phase -= ltemp << (SHIFT_SCALE - 10); | 748 | time_phase -= ltemp << (SHIFT_SCALE - 10); |
748 | delta_nsec += ltemp; | 749 | delta_nsec += ltemp; |
749 | } | 750 | } |
750 | xtime.tv_nsec += delta_nsec; | 751 | xtime.tv_nsec += delta_nsec; |
751 | time_interpolator_update(delta_nsec); | 752 | time_interpolator_update(delta_nsec); |
752 | 753 | ||
753 | /* Changes by adjtime() do not take effect till next tick. */ | 754 | /* Changes by adjtime() do not take effect till next tick. */ |
754 | if (time_next_adjust != 0) { | 755 | if (time_next_adjust != 0) { |
755 | time_adjust = time_next_adjust; | 756 | time_adjust = time_next_adjust; |
756 | time_next_adjust = 0; | 757 | time_next_adjust = 0; |
757 | } | 758 | } |
758 | } | 759 | } |
759 | 760 | ||
760 | /* | 761 | /* |
761 | * Using a loop looks inefficient, but "ticks" is | 762 | * Using a loop looks inefficient, but "ticks" is |
762 | * usually just one (we shouldn't be losing ticks, | 763 | * usually just one (we shouldn't be losing ticks, |
763 | * we're doing it this way mainly for interrupt | 764 | * we're doing it this way mainly for interrupt |
764 | * latency reasons, not because we think we'll | 765 | * latency reasons, not because we think we'll |
765 | * have lots of lost timer ticks). | 766 | * have lots of lost timer ticks). |
766 | */ | 767 | */ |
767 | static void update_wall_time(unsigned long ticks) | 768 | static void update_wall_time(unsigned long ticks) |
768 | { | 769 | { |
769 | do { | 770 | do { |
770 | ticks--; | 771 | ticks--; |
771 | update_wall_time_one_tick(); | 772 | update_wall_time_one_tick(); |
772 | if (xtime.tv_nsec >= 1000000000) { | 773 | if (xtime.tv_nsec >= 1000000000) { |
773 | xtime.tv_nsec -= 1000000000; | 774 | xtime.tv_nsec -= 1000000000; |
774 | xtime.tv_sec++; | 775 | xtime.tv_sec++; |
775 | second_overflow(); | 776 | second_overflow(); |
776 | } | 777 | } |
777 | } while (ticks); | 778 | } while (ticks); |
778 | } | 779 | } |
779 | 780 | ||
780 | /* | 781 | /* |
781 | * Called from the timer interrupt handler to charge one tick to the current | 782 | * Called from the timer interrupt handler to charge one tick to the current |
782 | * process. user_tick is 1 if the tick is user time, 0 for system. | 783 | * process. user_tick is 1 if the tick is user time, 0 for system. |
783 | */ | 784 | */ |
784 | void update_process_times(int user_tick) | 785 | void update_process_times(int user_tick) |
785 | { | 786 | { |
786 | struct task_struct *p = current; | 787 | struct task_struct *p = current; |
787 | int cpu = smp_processor_id(); | 788 | int cpu = smp_processor_id(); |
788 | 789 | ||
789 | /* Note: this timer irq context must be accounted for as well. */ | 790 | /* Note: this timer irq context must be accounted for as well. */ |
790 | if (user_tick) | 791 | if (user_tick) |
791 | account_user_time(p, jiffies_to_cputime(1)); | 792 | account_user_time(p, jiffies_to_cputime(1)); |
792 | else | 793 | else |
793 | account_system_time(p, HARDIRQ_OFFSET, jiffies_to_cputime(1)); | 794 | account_system_time(p, HARDIRQ_OFFSET, jiffies_to_cputime(1)); |
794 | run_local_timers(); | 795 | run_local_timers(); |
795 | if (rcu_pending(cpu)) | 796 | if (rcu_pending(cpu)) |
796 | rcu_check_callbacks(cpu, user_tick); | 797 | rcu_check_callbacks(cpu, user_tick); |
797 | scheduler_tick(); | 798 | scheduler_tick(); |
798 | run_posix_cpu_timers(p); | 799 | run_posix_cpu_timers(p); |
799 | } | 800 | } |
800 | 801 | ||
801 | /* | 802 | /* |
802 | * Nr of active tasks - counted in fixed-point numbers | 803 | * Nr of active tasks - counted in fixed-point numbers |
803 | */ | 804 | */ |
804 | static unsigned long count_active_tasks(void) | 805 | static unsigned long count_active_tasks(void) |
805 | { | 806 | { |
806 | return (nr_running() + nr_uninterruptible()) * FIXED_1; | 807 | return (nr_running() + nr_uninterruptible()) * FIXED_1; |
807 | } | 808 | } |
808 | 809 | ||
809 | /* | 810 | /* |
810 | * Hmm.. Changed this, as the GNU make sources (load.c) seem to | 811 | * Hmm.. Changed this, as the GNU make sources (load.c) seem to |
811 | * imply that avenrun[] is the standard name for this kind of thing. | 812 | * imply that avenrun[] is the standard name for this kind of thing. |
812 | * Nothing else seems to be standardized: the fractional size etc | 813 | * Nothing else seems to be standardized: the fractional size etc |
813 | * all seem to differ on different machines. | 814 | * all seem to differ on different machines. |
814 | * | 815 | * |
815 | * Requires xtime_lock to access. | 816 | * Requires xtime_lock to access. |
816 | */ | 817 | */ |
817 | unsigned long avenrun[3]; | 818 | unsigned long avenrun[3]; |
818 | 819 | ||
819 | EXPORT_SYMBOL(avenrun); | 820 | EXPORT_SYMBOL(avenrun); |
820 | 821 | ||
821 | /* | 822 | /* |
822 | * calc_load - given tick count, update the avenrun load estimates. | 823 | * calc_load - given tick count, update the avenrun load estimates. |
823 | * This is called while holding a write_lock on xtime_lock. | 824 | * This is called while holding a write_lock on xtime_lock. |
824 | */ | 825 | */ |
825 | static inline void calc_load(unsigned long ticks) | 826 | static inline void calc_load(unsigned long ticks) |
826 | { | 827 | { |
827 | unsigned long active_tasks; /* fixed-point */ | 828 | unsigned long active_tasks; /* fixed-point */ |
828 | static int count = LOAD_FREQ; | 829 | static int count = LOAD_FREQ; |
829 | 830 | ||
830 | count -= ticks; | 831 | count -= ticks; |
831 | if (count < 0) { | 832 | if (count < 0) { |
832 | count += LOAD_FREQ; | 833 | count += LOAD_FREQ; |
833 | active_tasks = count_active_tasks(); | 834 | active_tasks = count_active_tasks(); |
834 | CALC_LOAD(avenrun[0], EXP_1, active_tasks); | 835 | CALC_LOAD(avenrun[0], EXP_1, active_tasks); |
835 | CALC_LOAD(avenrun[1], EXP_5, active_tasks); | 836 | CALC_LOAD(avenrun[1], EXP_5, active_tasks); |
836 | CALC_LOAD(avenrun[2], EXP_15, active_tasks); | 837 | CALC_LOAD(avenrun[2], EXP_15, active_tasks); |
837 | } | 838 | } |
838 | } | 839 | } |
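/*
 * A user-space sketch of the fixed-point averaging that calc_load() drives.
 * FSHIFT, FIXED_1 and EXP_1 are assumed to match <linux/sched.h> of this era;
 * they are not part of this diff.
 */
#include <stdio.h>

#define FSHIFT	11			/* bits of fixed-point precision */
#define FIXED_1	(1 << FSHIFT)		/* 1.0 in fixed point */
#define EXP_1	1884			/* 1/exp(5sec/1min), fixed point */

#define CALC_LOAD(load, exp, n) \
	load *= (exp); \
	load += (n) * (FIXED_1 - (exp)); \
	load >>= FSHIFT;

int main(void)
{
	unsigned long avenrun_1 = 0;		/* 1-minute average, fixed point */
	unsigned long active = 3 * FIXED_1;	/* pretend 3 tasks stay runnable */
	int i;

	/* calc_load() runs every LOAD_FREQ (5 s); simulate one minute of it */
	for (i = 0; i < 12; i++) {
		CALC_LOAD(avenrun_1, EXP_1, active);
		printf("after %2d updates: load = %lu.%02lu\n", i + 1,
		       avenrun_1 >> FSHIFT,
		       (avenrun_1 & (FIXED_1 - 1)) * 100 / FIXED_1);
	}
	return 0;
}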
839 | 840 | ||
840 | /* jiffies at the most recent update of wall time */ | 841 | /* jiffies at the most recent update of wall time */ |
841 | unsigned long wall_jiffies = INITIAL_JIFFIES; | 842 | unsigned long wall_jiffies = INITIAL_JIFFIES; |
842 | 843 | ||
843 | /* | 844 | /* |
844 | * This read-write spinlock protects us from races in SMP while | 845 | * This read-write spinlock protects us from races in SMP while |
845 | * playing with xtime and avenrun. | 846 | * playing with xtime and avenrun. |
846 | */ | 847 | */ |
847 | #ifndef ARCH_HAVE_XTIME_LOCK | 848 | #ifndef ARCH_HAVE_XTIME_LOCK |
848 | seqlock_t xtime_lock __cacheline_aligned_in_smp = SEQLOCK_UNLOCKED; | 849 | seqlock_t xtime_lock __cacheline_aligned_in_smp = SEQLOCK_UNLOCKED; |
849 | 850 | ||
850 | EXPORT_SYMBOL(xtime_lock); | 851 | EXPORT_SYMBOL(xtime_lock); |
851 | #endif | 852 | #endif |
852 | 853 | ||
853 | /* | 854 | /* |
854 | * This function runs timers and the timer-tq in bottom half context. | 855 | * This function runs timers and the timer-tq in bottom half context. |
855 | */ | 856 | */ |
856 | static void run_timer_softirq(struct softirq_action *h) | 857 | static void run_timer_softirq(struct softirq_action *h) |
857 | { | 858 | { |
858 | tvec_base_t *base = &__get_cpu_var(tvec_bases); | 859 | tvec_base_t *base = &__get_cpu_var(tvec_bases); |
859 | 860 | ||
860 | if (time_after_eq(jiffies, base->timer_jiffies)) | 861 | if (time_after_eq(jiffies, base->timer_jiffies)) |
861 | __run_timers(base); | 862 | __run_timers(base); |
862 | } | 863 | } |
863 | 864 | ||
864 | /* | 865 | /* |
865 | * Called by the local, per-CPU timer interrupt on SMP. | 866 | * Called by the local, per-CPU timer interrupt on SMP. |
866 | */ | 867 | */ |
867 | void run_local_timers(void) | 868 | void run_local_timers(void) |
868 | { | 869 | { |
869 | raise_softirq(TIMER_SOFTIRQ); | 870 | raise_softirq(TIMER_SOFTIRQ); |
870 | } | 871 | } |
871 | 872 | ||
872 | /* | 873 | /* |
873 | * Called by the timer interrupt. xtime_lock must already be taken | 874 | * Called by the timer interrupt. xtime_lock must already be taken |
874 | * by the timer IRQ! | 875 | * by the timer IRQ! |
875 | */ | 876 | */ |
876 | static inline void update_times(void) | 877 | static inline void update_times(void) |
877 | { | 878 | { |
878 | unsigned long ticks; | 879 | unsigned long ticks; |
879 | 880 | ||
880 | ticks = jiffies - wall_jiffies; | 881 | ticks = jiffies - wall_jiffies; |
881 | if (ticks) { | 882 | if (ticks) { |
882 | wall_jiffies += ticks; | 883 | wall_jiffies += ticks; |
883 | update_wall_time(ticks); | 884 | update_wall_time(ticks); |
884 | } | 885 | } |
885 | calc_load(ticks); | 886 | calc_load(ticks); |
886 | } | 887 | } |
887 | 888 | ||
888 | /* | 889 | /* |
889 | * The 64-bit jiffies value is not atomic - you MUST NOT read it | 890 | * The 64-bit jiffies value is not atomic - you MUST NOT read it |
890 | * without sampling the sequence number in xtime_lock. | 891 | * without sampling the sequence number in xtime_lock. |
891 | * jiffies is defined in the linker script... | 892 | * jiffies is defined in the linker script... |
892 | */ | 893 | */ |
893 | 894 | ||
894 | void do_timer(struct pt_regs *regs) | 895 | void do_timer(struct pt_regs *regs) |
895 | { | 896 | { |
896 | jiffies_64++; | 897 | jiffies_64++; |
897 | update_times(); | 898 | update_times(); |
898 | softlockup_tick(regs); | 899 | softlockup_tick(regs); |
899 | } | 900 | } |
900 | 901 | ||
901 | #ifdef __ARCH_WANT_SYS_ALARM | 902 | #ifdef __ARCH_WANT_SYS_ALARM |
902 | 903 | ||
903 | /* | 904 | /* |
904 | * For backwards compatibility? This can be done in libc so Alpha | 905 | * For backwards compatibility? This can be done in libc so Alpha |
905 | * and all newer ports shouldn't need it. | 906 | * and all newer ports shouldn't need it. |
906 | */ | 907 | */ |
907 | asmlinkage unsigned long sys_alarm(unsigned int seconds) | 908 | asmlinkage unsigned long sys_alarm(unsigned int seconds) |
908 | { | 909 | { |
909 | struct itimerval it_new, it_old; | 910 | struct itimerval it_new, it_old; |
910 | unsigned int oldalarm; | 911 | unsigned int oldalarm; |
911 | 912 | ||
912 | it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0; | 913 | it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0; |
913 | it_new.it_value.tv_sec = seconds; | 914 | it_new.it_value.tv_sec = seconds; |
914 | it_new.it_value.tv_usec = 0; | 915 | it_new.it_value.tv_usec = 0; |
915 | do_setitimer(ITIMER_REAL, &it_new, &it_old); | 916 | do_setitimer(ITIMER_REAL, &it_new, &it_old); |
916 | oldalarm = it_old.it_value.tv_sec; | 917 | oldalarm = it_old.it_value.tv_sec; |
917 | /* ehhh.. We can't return 0 if we have an alarm pending.. */ | 918 | /* ehhh.. We can't return 0 if we have an alarm pending.. */ |
918 | /* And we'd better return too much than too little anyway */ | 919 | /* And we'd better return too much than too little anyway */ |
919 | if ((!oldalarm && it_old.it_value.tv_usec) || it_old.it_value.tv_usec >= 500000) | 920 | if ((!oldalarm && it_old.it_value.tv_usec) || it_old.it_value.tv_usec >= 500000) |
920 | oldalarm++; | 921 | oldalarm++; |
921 | return oldalarm; | 922 | return oldalarm; |
922 | } | 923 | } |
923 | 924 | ||
924 | #endif | 925 | #endif |
925 | 926 | ||
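
As the comment above suggests, the same behaviour can live in libc. A rough user-space sketch of alarm() on top of setitimer() (illustrative only; it rounds any leftover microseconds up, whereas the kernel version above rounds half-up):

	#include <sys/time.h>

	unsigned int alarm_via_setitimer(unsigned int seconds)
	{
		struct itimerval it_new = { { 0, 0 }, { seconds, 0 } }, it_old;

		setitimer(ITIMER_REAL, &it_new, &it_old);
		if (it_old.it_value.tv_usec)
			it_old.it_value.tv_sec++;	/* never report less time than remains */
		return it_old.it_value.tv_sec;
	}
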
926 | #ifndef __alpha__ | 927 | #ifndef __alpha__ |
927 | 928 | ||
928 | /* | 929 | /* |
929 | * The Alpha uses getxpid, getxuid, and getxgid instead. Maybe this | 930 | * The Alpha uses getxpid, getxuid, and getxgid instead. Maybe this |
930 | * should be moved into arch/i386 instead? | 931 | * should be moved into arch/i386 instead? |
931 | */ | 932 | */ |
932 | 933 | ||
933 | /** | 934 | /** |
934 | * sys_getpid - return the thread group id of the current process | 935 | * sys_getpid - return the thread group id of the current process |
935 | * | 936 | * |
936 | * Note, despite the name, this returns the tgid not the pid. The tgid and | 937 | * Note, despite the name, this returns the tgid not the pid. The tgid and |
937 | * the pid are identical unless CLONE_THREAD was specified on clone() in | 938 | * the pid are identical unless CLONE_THREAD was specified on clone() in |
938 | * which case the tgid is the same in all threads of the same group. | 939 | * which case the tgid is the same in all threads of the same group. |
939 | * | 940 | * |
940 | * This is SMP safe as current->tgid does not change. | 941 | * This is SMP safe as current->tgid does not change. |
941 | */ | 942 | */ |
942 | asmlinkage long sys_getpid(void) | 943 | asmlinkage long sys_getpid(void) |
943 | { | 944 | { |
944 | return current->tgid; | 945 | return current->tgid; |
945 | } | 946 | } |
946 | 947 | ||
947 | /* | 948 | /* |
948 | * Accessing ->group_leader->real_parent is not SMP-safe; it could | 949 | * Accessing ->group_leader->real_parent is not SMP-safe; it could |
949 | * change from under us. However, rather than getting any lock | 950 | * change from under us. However, rather than getting any lock |
950 | * we can use an optimistic algorithm: get the parent | 951 | * we can use an optimistic algorithm: get the parent |
951 | * pid, and go back and check that the parent is still | 952 | * pid, and go back and check that the parent is still |
952 | * the same. If it has changed (which is extremely unlikely | 953 | * the same. If it has changed (which is extremely unlikely |
953 | * indeed), we just try again.. | 954 | * indeed), we just try again.. |
954 | * | 955 | * |
955 | * NOTE! This depends on the fact that even if we _do_ | 956 | * NOTE! This depends on the fact that even if we _do_ |
956 | * get an old value of "parent", we can happily dereference | 957 | * get an old value of "parent", we can happily dereference |
957 | * the pointer (it was and remains a dereferenceable kernel pointer | 958 | * the pointer (it was and remains a dereferenceable kernel pointer |
958 | * no matter what): we just can't necessarily trust the result | 959 | * no matter what): we just can't necessarily trust the result |
959 | * until we know that the parent pointer is valid. | 960 | * until we know that the parent pointer is valid. |
960 | * | 961 | * |
961 | * NOTE2: ->group_leader never changes from under us. | 962 | * NOTE2: ->group_leader never changes from under us. |
962 | */ | 963 | */ |
963 | asmlinkage long sys_getppid(void) | 964 | asmlinkage long sys_getppid(void) |
964 | { | 965 | { |
965 | int pid; | 966 | int pid; |
966 | struct task_struct *me = current; | 967 | struct task_struct *me = current; |
967 | struct task_struct *parent; | 968 | struct task_struct *parent; |
968 | 969 | ||
969 | parent = me->group_leader->real_parent; | 970 | parent = me->group_leader->real_parent; |
970 | for (;;) { | 971 | for (;;) { |
971 | pid = parent->tgid; | 972 | pid = parent->tgid; |
972 | #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) | 973 | #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) |
973 | { | 974 | { |
974 | struct task_struct *old = parent; | 975 | struct task_struct *old = parent; |
975 | 976 | ||
976 | /* | 977 | /* |
977 | * Make sure we read the pid before re-reading the | 978 | * Make sure we read the pid before re-reading the |
978 | * parent pointer: | 979 | * parent pointer: |
979 | */ | 980 | */ |
980 | smp_rmb(); | 981 | smp_rmb(); |
981 | parent = me->group_leader->real_parent; | 982 | parent = me->group_leader->real_parent; |
982 | if (old != parent) | 983 | if (old != parent) |
983 | continue; | 984 | continue; |
984 | } | 985 | } |
985 | #endif | 986 | #endif |
986 | break; | 987 | break; |
987 | } | 988 | } |
988 | return pid; | 989 | return pid; |
989 | } | 990 | } |
990 | 991 | ||
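
The lock-free read above is an instance of a general optimistic pattern: read a value through a pointer, then re-check that the pointer itself did not change, retrying if it did. A generic sketch with hypothetical names:

	struct foo {
		int field;
	};

	static int optimistic_read(struct foo **slot)
	{
		struct foo *p = *slot;
		int val;

		for (;;) {
			struct foo *old = p;

			val = p->field;
			smp_rmb();	/* order the field read before the pointer re-read */
			p = *slot;
			if (old == p)
				break;	/* pointer was stable, so val can be trusted */
		}
		return val;
	}
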
991 | asmlinkage long sys_getuid(void) | 992 | asmlinkage long sys_getuid(void) |
992 | { | 993 | { |
993 | /* Only we change this so SMP safe */ | 994 | /* Only we change this so SMP safe */ |
994 | return current->uid; | 995 | return current->uid; |
995 | } | 996 | } |
996 | 997 | ||
997 | asmlinkage long sys_geteuid(void) | 998 | asmlinkage long sys_geteuid(void) |
998 | { | 999 | { |
999 | /* Only we change this so SMP safe */ | 1000 | /* Only we change this so SMP safe */ |
1000 | return current->euid; | 1001 | return current->euid; |
1001 | } | 1002 | } |
1002 | 1003 | ||
1003 | asmlinkage long sys_getgid(void) | 1004 | asmlinkage long sys_getgid(void) |
1004 | { | 1005 | { |
1005 | /* Only we change this so SMP safe */ | 1006 | /* Only we change this so SMP safe */ |
1006 | return current->gid; | 1007 | return current->gid; |
1007 | } | 1008 | } |
1008 | 1009 | ||
1009 | asmlinkage long sys_getegid(void) | 1010 | asmlinkage long sys_getegid(void) |
1010 | { | 1011 | { |
1011 | /* Only we change this so SMP safe */ | 1012 | /* Only we change this so SMP safe */ |
1012 | return current->egid; | 1013 | return current->egid; |
1013 | } | 1014 | } |
1014 | 1015 | ||
1015 | #endif | 1016 | #endif |
1016 | 1017 | ||
1017 | static void process_timeout(unsigned long __data) | 1018 | static void process_timeout(unsigned long __data) |
1018 | { | 1019 | { |
1019 | wake_up_process((task_t *)__data); | 1020 | wake_up_process((task_t *)__data); |
1020 | } | 1021 | } |
1021 | 1022 | ||
1022 | /** | 1023 | /** |
1023 | * schedule_timeout - sleep until timeout | 1024 | * schedule_timeout - sleep until timeout |
1024 | * @timeout: timeout value in jiffies | 1025 | * @timeout: timeout value in jiffies |
1025 | * | 1026 | * |
1026 | * Make the current task sleep until @timeout jiffies have | 1027 | * Make the current task sleep until @timeout jiffies have |
1027 | * elapsed. The routine will return immediately unless | 1028 | * elapsed. The routine will return immediately unless |
1028 | * the current task state has been set (see set_current_state()). | 1029 | * the current task state has been set (see set_current_state()). |
1029 | * | 1030 | * |
1030 | * You can set the task state as follows - | 1031 | * You can set the task state as follows - |
1031 | * | 1032 | * |
1032 | * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to | 1033 | * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to |
1033 | * pass before the routine returns. The routine will return 0. | 1034 | * pass before the routine returns. The routine will return 0. |
1034 | * | 1035 | * |
1035 | * %TASK_INTERRUPTIBLE - the routine may return early if a signal is | 1036 | * %TASK_INTERRUPTIBLE - the routine may return early if a signal is |
1036 | * delivered to the current task. In this case the remaining time | 1037 | * delivered to the current task. In this case the remaining time |
1037 | * in jiffies will be returned, or 0 if the timer expired in time | 1038 | * in jiffies will be returned, or 0 if the timer expired in time |
1038 | * | 1039 | * |
1039 | * The current task state is guaranteed to be TASK_RUNNING when this | 1040 | * The current task state is guaranteed to be TASK_RUNNING when this |
1040 | * routine returns. | 1041 | * routine returns. |
1041 | * | 1042 | * |
1042 | * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule | 1043 | * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule |
1043 | * the CPU away without a bound on the timeout. In this case the return | 1044 | * the CPU away without a bound on the timeout. In this case the return |
1044 | * value will be %MAX_SCHEDULE_TIMEOUT. | 1045 | * value will be %MAX_SCHEDULE_TIMEOUT. |
1045 | * | 1046 | * |
1046 | * In all cases the return value is guaranteed to be non-negative. | 1047 | * In all cases the return value is guaranteed to be non-negative. |
1047 | */ | 1048 | */ |
1048 | fastcall signed long __sched schedule_timeout(signed long timeout) | 1049 | fastcall signed long __sched schedule_timeout(signed long timeout) |
1049 | { | 1050 | { |
1050 | struct timer_list timer; | 1051 | struct timer_list timer; |
1051 | unsigned long expire; | 1052 | unsigned long expire; |
1052 | 1053 | ||
1053 | switch (timeout) | 1054 | switch (timeout) |
1054 | { | 1055 | { |
1055 | case MAX_SCHEDULE_TIMEOUT: | 1056 | case MAX_SCHEDULE_TIMEOUT: |
1056 | /* | 1057 | /* |
1057 | * These two special cases are useful to be comfortable | 1058 | * These two special cases are useful to be comfortable |
1058 | * in the caller. Nothing more. We could take | 1059 | * in the caller. Nothing more. We could take |
1059 | * MAX_SCHEDULE_TIMEOUT from one of the negative values | 1060 | * MAX_SCHEDULE_TIMEOUT from one of the negative values |
1060 | * but I'd like to return a valid offset (>=0) to allow | 1061 | * but I'd like to return a valid offset (>=0) to allow |
1061 | * the caller to do everything it wants with the retval. | 1062 | * the caller to do everything it wants with the retval. |
1062 | */ | 1063 | */ |
1063 | schedule(); | 1064 | schedule(); |
1064 | goto out; | 1065 | goto out; |
1065 | default: | 1066 | default: |
1066 | /* | 1067 | /* |
1067 | * Another bit of PARANOID. Note that the retval will be | 1068 | * Another bit of PARANOID. Note that the retval will be |
1068 | * 0 since no piece of kernel is supposed to do a check | 1069 | * 0 since no piece of kernel is supposed to do a check |
1069 | * for a negative retval of schedule_timeout() (since it | 1070 | * for a negative retval of schedule_timeout() (since it |
1070 | * should never happen anyway). You just have the printk() | 1071 | * should never happen anyway). You just have the printk() |
1071 | * that will tell you if something has gone wrong and where. | 1072 | * that will tell you if something has gone wrong and where. |
1072 | */ | 1073 | */ |
1073 | if (timeout < 0) | 1074 | if (timeout < 0) |
1074 | { | 1075 | { |
1075 | printk(KERN_ERR "schedule_timeout: wrong timeout " | 1076 | printk(KERN_ERR "schedule_timeout: wrong timeout " |
1076 | "value %lx from %p\n", timeout, | 1077 | "value %lx from %p\n", timeout, |
1077 | __builtin_return_address(0)); | 1078 | __builtin_return_address(0)); |
1078 | current->state = TASK_RUNNING; | 1079 | current->state = TASK_RUNNING; |
1079 | goto out; | 1080 | goto out; |
1080 | } | 1081 | } |
1081 | } | 1082 | } |
1082 | 1083 | ||
1083 | expire = timeout + jiffies; | 1084 | expire = timeout + jiffies; |
1084 | 1085 | ||
1085 | setup_timer(&timer, process_timeout, (unsigned long)current); | 1086 | setup_timer(&timer, process_timeout, (unsigned long)current); |
1086 | __mod_timer(&timer, expire); | 1087 | __mod_timer(&timer, expire); |
1087 | schedule(); | 1088 | schedule(); |
1088 | del_singleshot_timer_sync(&timer); | 1089 | del_singleshot_timer_sync(&timer); |
1089 | 1090 | ||
1090 | timeout = expire - jiffies; | 1091 | timeout = expire - jiffies; |
1091 | 1092 | ||
1092 | out: | 1093 | out: |
1093 | return timeout < 0 ? 0 : timeout; | 1094 | return timeout < 0 ? 0 : timeout; |
1094 | } | 1095 | } |
1095 | EXPORT_SYMBOL(schedule_timeout); | 1096 | EXPORT_SYMBOL(schedule_timeout); |
1096 | 1097 | ||
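
A typical caller follows the contract spelled out in the kernel-doc above: set the task state first, then let schedule_timeout() arm the one-shot timer and sleep. A hedged usage sketch:

	signed long remaining;

	set_current_state(TASK_INTERRUPTIBLE);
	remaining = schedule_timeout(msecs_to_jiffies(100));
	if (remaining)
		printk(KERN_DEBUG "woken early, %ld jiffies left\n", remaining);
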
1097 | /* | 1098 | /* |
1098 | * We can use __set_current_state() here because schedule_timeout() calls | 1099 | * We can use __set_current_state() here because schedule_timeout() calls |
1099 | * schedule() unconditionally. | 1100 | * schedule() unconditionally. |
1100 | */ | 1101 | */ |
1101 | signed long __sched schedule_timeout_interruptible(signed long timeout) | 1102 | signed long __sched schedule_timeout_interruptible(signed long timeout) |
1102 | { | 1103 | { |
1103 | __set_current_state(TASK_INTERRUPTIBLE); | 1104 | __set_current_state(TASK_INTERRUPTIBLE); |
1104 | return schedule_timeout(timeout); | 1105 | return schedule_timeout(timeout); |
1105 | } | 1106 | } |
1106 | EXPORT_SYMBOL(schedule_timeout_interruptible); | 1107 | EXPORT_SYMBOL(schedule_timeout_interruptible); |
1107 | 1108 | ||
1108 | signed long __sched schedule_timeout_uninterruptible(signed long timeout) | 1109 | signed long __sched schedule_timeout_uninterruptible(signed long timeout) |
1109 | { | 1110 | { |
1110 | __set_current_state(TASK_UNINTERRUPTIBLE); | 1111 | __set_current_state(TASK_UNINTERRUPTIBLE); |
1111 | return schedule_timeout(timeout); | 1112 | return schedule_timeout(timeout); |
1112 | } | 1113 | } |
1113 | EXPORT_SYMBOL(schedule_timeout_uninterruptible); | 1114 | EXPORT_SYMBOL(schedule_timeout_uninterruptible); |
1114 | 1115 | ||
1115 | /* Thread ID - the internal kernel "pid" */ | 1116 | /* Thread ID - the internal kernel "pid" */ |
1116 | asmlinkage long sys_gettid(void) | 1117 | asmlinkage long sys_gettid(void) |
1117 | { | 1118 | { |
1118 | return current->pid; | 1119 | return current->pid; |
1119 | } | 1120 | } |
1120 | 1121 | ||
1121 | static long __sched nanosleep_restart(struct restart_block *restart) | 1122 | static long __sched nanosleep_restart(struct restart_block *restart) |
1122 | { | 1123 | { |
1123 | unsigned long expire = restart->arg0, now = jiffies; | 1124 | unsigned long expire = restart->arg0, now = jiffies; |
1124 | struct timespec __user *rmtp = (struct timespec __user *) restart->arg1; | 1125 | struct timespec __user *rmtp = (struct timespec __user *) restart->arg1; |
1125 | long ret; | 1126 | long ret; |
1126 | 1127 | ||
1127 | /* Did it expire while we handled signals? */ | 1128 | /* Did it expire while we handled signals? */ |
1128 | if (!time_after(expire, now)) | 1129 | if (!time_after(expire, now)) |
1129 | return 0; | 1130 | return 0; |
1130 | 1131 | ||
1131 | expire = schedule_timeout_interruptible(expire - now); | 1132 | expire = schedule_timeout_interruptible(expire - now); |
1132 | 1133 | ||
1133 | ret = 0; | 1134 | ret = 0; |
1134 | if (expire) { | 1135 | if (expire) { |
1135 | struct timespec t; | 1136 | struct timespec t; |
1136 | jiffies_to_timespec(expire, &t); | 1137 | jiffies_to_timespec(expire, &t); |
1137 | 1138 | ||
1138 | ret = -ERESTART_RESTARTBLOCK; | 1139 | ret = -ERESTART_RESTARTBLOCK; |
1139 | if (rmtp && copy_to_user(rmtp, &t, sizeof(t))) | 1140 | if (rmtp && copy_to_user(rmtp, &t, sizeof(t))) |
1140 | ret = -EFAULT; | 1141 | ret = -EFAULT; |
1141 | /* The 'restart' block is already filled in */ | 1142 | /* The 'restart' block is already filled in */ |
1142 | } | 1143 | } |
1143 | return ret; | 1144 | return ret; |
1144 | } | 1145 | } |
1145 | 1146 | ||
1146 | asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp) | 1147 | asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp) |
1147 | { | 1148 | { |
1148 | struct timespec t; | 1149 | struct timespec t; |
1149 | unsigned long expire; | 1150 | unsigned long expire; |
1150 | long ret; | 1151 | long ret; |
1151 | 1152 | ||
1152 | if (copy_from_user(&t, rqtp, sizeof(t))) | 1153 | if (copy_from_user(&t, rqtp, sizeof(t))) |
1153 | return -EFAULT; | 1154 | return -EFAULT; |
1154 | 1155 | ||
1155 | if ((t.tv_nsec >= 1000000000L) || (t.tv_nsec < 0) || (t.tv_sec < 0)) | 1156 | if ((t.tv_nsec >= 1000000000L) || (t.tv_nsec < 0) || (t.tv_sec < 0)) |
1156 | return -EINVAL; | 1157 | return -EINVAL; |
1157 | 1158 | ||
1158 | expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec); | 1159 | expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec); |
1159 | expire = schedule_timeout_interruptible(expire); | 1160 | expire = schedule_timeout_interruptible(expire); |
1160 | 1161 | ||
1161 | ret = 0; | 1162 | ret = 0; |
1162 | if (expire) { | 1163 | if (expire) { |
1163 | struct restart_block *restart; | 1164 | struct restart_block *restart; |
1164 | jiffies_to_timespec(expire, &t); | 1165 | jiffies_to_timespec(expire, &t); |
1165 | if (rmtp && copy_to_user(rmtp, &t, sizeof(t))) | 1166 | if (rmtp && copy_to_user(rmtp, &t, sizeof(t))) |
1166 | return -EFAULT; | 1167 | return -EFAULT; |
1167 | 1168 | ||
1168 | restart = &current_thread_info()->restart_block; | 1169 | restart = &current_thread_info()->restart_block; |
1169 | restart->fn = nanosleep_restart; | 1170 | restart->fn = nanosleep_restart; |
1170 | restart->arg0 = jiffies + expire; | 1171 | restart->arg0 = jiffies + expire; |
1171 | restart->arg1 = (unsigned long) rmtp; | 1172 | restart->arg1 = (unsigned long) rmtp; |
1172 | ret = -ERESTART_RESTARTBLOCK; | 1173 | ret = -ERESTART_RESTARTBLOCK; |
1173 | } | 1174 | } |
1174 | return ret; | 1175 | return ret; |
1175 | } | 1176 | } |
1176 | 1177 | ||
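
One detail worth calling out in sys_nanosleep() above: the "+ (t.tv_sec || t.tv_nsec)" term adds one extra jiffy to any non-zero request, so the sleep always covers at least the asked-for interval even when the call lands near the end of the current tick.
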
1177 | /* | 1178 | /* |
1178 | * sys_sysinfo - fill in sysinfo struct | 1179 | * sys_sysinfo - fill in sysinfo struct |
1179 | */ | 1180 | */ |
1180 | asmlinkage long sys_sysinfo(struct sysinfo __user *info) | 1181 | asmlinkage long sys_sysinfo(struct sysinfo __user *info) |
1181 | { | 1182 | { |
1182 | struct sysinfo val; | 1183 | struct sysinfo val; |
1183 | unsigned long mem_total, sav_total; | 1184 | unsigned long mem_total, sav_total; |
1184 | unsigned int mem_unit, bitcount; | 1185 | unsigned int mem_unit, bitcount; |
1185 | unsigned long seq; | 1186 | unsigned long seq; |
1186 | 1187 | ||
1187 | memset((char *)&val, 0, sizeof(struct sysinfo)); | 1188 | memset((char *)&val, 0, sizeof(struct sysinfo)); |
1188 | 1189 | ||
1189 | do { | 1190 | do { |
1190 | struct timespec tp; | 1191 | struct timespec tp; |
1191 | seq = read_seqbegin(&xtime_lock); | 1192 | seq = read_seqbegin(&xtime_lock); |
1192 | 1193 | ||
1193 | /* | 1194 | /* |
1194 | * This is annoying. The below is the same thing | 1195 | * This is annoying. The below is the same thing |
1195 | * posix_get_clock_monotonic() does, but it wants to | 1196 | * posix_get_clock_monotonic() does, but it wants to |
1196 | * take the lock which we want to cover the loads stuff | 1197 | * take the lock which we want to cover the loads stuff |
1197 | * too. | 1198 | * too. |
1198 | */ | 1199 | */ |
1199 | 1200 | ||
1200 | getnstimeofday(&tp); | 1201 | getnstimeofday(&tp); |
1201 | tp.tv_sec += wall_to_monotonic.tv_sec; | 1202 | tp.tv_sec += wall_to_monotonic.tv_sec; |
1202 | tp.tv_nsec += wall_to_monotonic.tv_nsec; | 1203 | tp.tv_nsec += wall_to_monotonic.tv_nsec; |
1203 | if (tp.tv_nsec - NSEC_PER_SEC >= 0) { | 1204 | if (tp.tv_nsec - NSEC_PER_SEC >= 0) { |
1204 | tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC; | 1205 | tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC; |
1205 | tp.tv_sec++; | 1206 | tp.tv_sec++; |
1206 | } | 1207 | } |
1207 | val.uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); | 1208 | val.uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); |
1208 | 1209 | ||
1209 | val.loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT); | 1210 | val.loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT); |
1210 | val.loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT); | 1211 | val.loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT); |
1211 | val.loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT); | 1212 | val.loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT); |
1212 | 1213 | ||
1213 | val.procs = nr_threads; | 1214 | val.procs = nr_threads; |
1214 | } while (read_seqretry(&xtime_lock, seq)); | 1215 | } while (read_seqretry(&xtime_lock, seq)); |
1215 | 1216 | ||
1216 | si_meminfo(&val); | 1217 | si_meminfo(&val); |
1217 | si_swapinfo(&val); | 1218 | si_swapinfo(&val); |
1218 | 1219 | ||
1219 | /* | 1220 | /* |
1220 | * If the sum of all the available memory (i.e. ram + swap) | 1221 | * If the sum of all the available memory (i.e. ram + swap) |
1221 | * is less than can be stored in a 32 bit unsigned long then | 1222 | * is less than can be stored in a 32 bit unsigned long then |
1222 | * we can be binary compatible with 2.2.x kernels. If not, | 1223 | * we can be binary compatible with 2.2.x kernels. If not, |
1223 | * well, in that case 2.2.x was broken anyways... | 1224 | * well, in that case 2.2.x was broken anyways... |
1224 | * | 1225 | * |
1225 | * -Erik Andersen <andersee@debian.org> | 1226 | * -Erik Andersen <andersee@debian.org> |
1226 | */ | 1227 | */ |
1227 | 1228 | ||
1228 | mem_total = val.totalram + val.totalswap; | 1229 | mem_total = val.totalram + val.totalswap; |
1229 | if (mem_total < val.totalram || mem_total < val.totalswap) | 1230 | if (mem_total < val.totalram || mem_total < val.totalswap) |
1230 | goto out; | 1231 | goto out; |
1231 | bitcount = 0; | 1232 | bitcount = 0; |
1232 | mem_unit = val.mem_unit; | 1233 | mem_unit = val.mem_unit; |
1233 | while (mem_unit > 1) { | 1234 | while (mem_unit > 1) { |
1234 | bitcount++; | 1235 | bitcount++; |
1235 | mem_unit >>= 1; | 1236 | mem_unit >>= 1; |
1236 | sav_total = mem_total; | 1237 | sav_total = mem_total; |
1237 | mem_total <<= 1; | 1238 | mem_total <<= 1; |
1238 | if (mem_total < sav_total) | 1239 | if (mem_total < sav_total) |
1239 | goto out; | 1240 | goto out; |
1240 | } | 1241 | } |
1241 | 1242 | ||
1242 | /* | 1243 | /* |
1243 | * If mem_total did not overflow, multiply all memory values by | 1244 | * If mem_total did not overflow, multiply all memory values by |
1244 | * val.mem_unit and set it to 1. This leaves things compatible | 1245 | * val.mem_unit and set it to 1. This leaves things compatible |
1245 | * with 2.2.x, and also retains compatibility with earlier 2.4.x | 1246 | * with 2.2.x, and also retains compatibility with earlier 2.4.x |
1246 | * kernels... | 1247 | * kernels... |
1247 | */ | 1248 | */ |
1248 | 1249 | ||
1249 | val.mem_unit = 1; | 1250 | val.mem_unit = 1; |
1250 | val.totalram <<= bitcount; | 1251 | val.totalram <<= bitcount; |
1251 | val.freeram <<= bitcount; | 1252 | val.freeram <<= bitcount; |
1252 | val.sharedram <<= bitcount; | 1253 | val.sharedram <<= bitcount; |
1253 | val.bufferram <<= bitcount; | 1254 | val.bufferram <<= bitcount; |
1254 | val.totalswap <<= bitcount; | 1255 | val.totalswap <<= bitcount; |
1255 | val.freeswap <<= bitcount; | 1256 | val.freeswap <<= bitcount; |
1256 | val.totalhigh <<= bitcount; | 1257 | val.totalhigh <<= bitcount; |
1257 | val.freehigh <<= bitcount; | 1258 | val.freehigh <<= bitcount; |
1258 | 1259 | ||
1259 | out: | 1260 | out: |
1260 | if (copy_to_user(info, &val, sizeof(struct sysinfo))) | 1261 | if (copy_to_user(info, &val, sizeof(struct sysinfo))) |
1261 | return -EFAULT; | 1262 | return -EFAULT; |
1262 | 1263 | ||
1263 | return 0; | 1264 | return 0; |
1264 | } | 1265 | } |
1265 | 1266 | ||
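
To make the mem_unit normalisation above concrete with illustrative numbers: if si_meminfo() reports val.mem_unit as 4096 and val.totalram as 262144 pages, the loop halves mem_unit twelve times, so bitcount ends up as 12 and the reported totalram becomes 262144 << 12 = 1073741824 bytes with mem_unit forced to 1. If doubling mem_total would ever overflow the unsigned long, the function instead bails out to the copy_to_user() with the values left in their original units.
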
1266 | static void __devinit init_timers_cpu(int cpu) | 1267 | static void __devinit init_timers_cpu(int cpu) |
1267 | { | 1268 | { |
1268 | int j; | 1269 | int j; |
1269 | tvec_base_t *base; | 1270 | tvec_base_t *base; |
1270 | 1271 | ||
1271 | base = &per_cpu(tvec_bases, cpu); | 1272 | base = &per_cpu(tvec_bases, cpu); |
1272 | spin_lock_init(&base->t_base.lock); | 1273 | spin_lock_init(&base->t_base.lock); |
1273 | for (j = 0; j < TVN_SIZE; j++) { | 1274 | for (j = 0; j < TVN_SIZE; j++) { |
1274 | INIT_LIST_HEAD(base->tv5.vec + j); | 1275 | INIT_LIST_HEAD(base->tv5.vec + j); |
1275 | INIT_LIST_HEAD(base->tv4.vec + j); | 1276 | INIT_LIST_HEAD(base->tv4.vec + j); |
1276 | INIT_LIST_HEAD(base->tv3.vec + j); | 1277 | INIT_LIST_HEAD(base->tv3.vec + j); |
1277 | INIT_LIST_HEAD(base->tv2.vec + j); | 1278 | INIT_LIST_HEAD(base->tv2.vec + j); |
1278 | } | 1279 | } |
1279 | for (j = 0; j < TVR_SIZE; j++) | 1280 | for (j = 0; j < TVR_SIZE; j++) |
1280 | INIT_LIST_HEAD(base->tv1.vec + j); | 1281 | INIT_LIST_HEAD(base->tv1.vec + j); |
1281 | 1282 | ||
1282 | base->timer_jiffies = jiffies; | 1283 | base->timer_jiffies = jiffies; |
1283 | } | 1284 | } |
1284 | 1285 | ||
1285 | #ifdef CONFIG_HOTPLUG_CPU | 1286 | #ifdef CONFIG_HOTPLUG_CPU |
1286 | static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head) | 1287 | static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head) |
1287 | { | 1288 | { |
1288 | struct timer_list *timer; | 1289 | struct timer_list *timer; |
1289 | 1290 | ||
1290 | while (!list_empty(head)) { | 1291 | while (!list_empty(head)) { |
1291 | timer = list_entry(head->next, struct timer_list, entry); | 1292 | timer = list_entry(head->next, struct timer_list, entry); |
1292 | detach_timer(timer, 0); | 1293 | detach_timer(timer, 0); |
1293 | timer->base = &new_base->t_base; | 1294 | timer->base = &new_base->t_base; |
1294 | internal_add_timer(new_base, timer); | 1295 | internal_add_timer(new_base, timer); |
1295 | } | 1296 | } |
1296 | } | 1297 | } |
1297 | 1298 | ||
1298 | static void __devinit migrate_timers(int cpu) | 1299 | static void __devinit migrate_timers(int cpu) |
1299 | { | 1300 | { |
1300 | tvec_base_t *old_base; | 1301 | tvec_base_t *old_base; |
1301 | tvec_base_t *new_base; | 1302 | tvec_base_t *new_base; |
1302 | int i; | 1303 | int i; |
1303 | 1304 | ||
1304 | BUG_ON(cpu_online(cpu)); | 1305 | BUG_ON(cpu_online(cpu)); |
1305 | old_base = &per_cpu(tvec_bases, cpu); | 1306 | old_base = &per_cpu(tvec_bases, cpu); |
1306 | new_base = &get_cpu_var(tvec_bases); | 1307 | new_base = &get_cpu_var(tvec_bases); |
1307 | 1308 | ||
1308 | local_irq_disable(); | 1309 | local_irq_disable(); |
1309 | spin_lock(&new_base->t_base.lock); | 1310 | spin_lock(&new_base->t_base.lock); |
1310 | spin_lock(&old_base->t_base.lock); | 1311 | spin_lock(&old_base->t_base.lock); |
1311 | 1312 | ||
1312 | if (old_base->t_base.running_timer) | 1313 | if (old_base->t_base.running_timer) |
1313 | BUG(); | 1314 | BUG(); |
1314 | for (i = 0; i < TVR_SIZE; i++) | 1315 | for (i = 0; i < TVR_SIZE; i++) |
1315 | migrate_timer_list(new_base, old_base->tv1.vec + i); | 1316 | migrate_timer_list(new_base, old_base->tv1.vec + i); |
1316 | for (i = 0; i < TVN_SIZE; i++) { | 1317 | for (i = 0; i < TVN_SIZE; i++) { |
1317 | migrate_timer_list(new_base, old_base->tv2.vec + i); | 1318 | migrate_timer_list(new_base, old_base->tv2.vec + i); |
1318 | migrate_timer_list(new_base, old_base->tv3.vec + i); | 1319 | migrate_timer_list(new_base, old_base->tv3.vec + i); |
1319 | migrate_timer_list(new_base, old_base->tv4.vec + i); | 1320 | migrate_timer_list(new_base, old_base->tv4.vec + i); |
1320 | migrate_timer_list(new_base, old_base->tv5.vec + i); | 1321 | migrate_timer_list(new_base, old_base->tv5.vec + i); |
1321 | } | 1322 | } |
1322 | 1323 | ||
1323 | spin_unlock(&old_base->t_base.lock); | 1324 | spin_unlock(&old_base->t_base.lock); |
1324 | spin_unlock(&new_base->t_base.lock); | 1325 | spin_unlock(&new_base->t_base.lock); |
1325 | local_irq_enable(); | 1326 | local_irq_enable(); |
1326 | put_cpu_var(tvec_bases); | 1327 | put_cpu_var(tvec_bases); |
1327 | } | 1328 | } |
1328 | #endif /* CONFIG_HOTPLUG_CPU */ | 1329 | #endif /* CONFIG_HOTPLUG_CPU */ |
1329 | 1330 | ||
1330 | static int __devinit timer_cpu_notify(struct notifier_block *self, | 1331 | static int __devinit timer_cpu_notify(struct notifier_block *self, |
1331 | unsigned long action, void *hcpu) | 1332 | unsigned long action, void *hcpu) |
1332 | { | 1333 | { |
1333 | long cpu = (long)hcpu; | 1334 | long cpu = (long)hcpu; |
1334 | switch(action) { | 1335 | switch(action) { |
1335 | case CPU_UP_PREPARE: | 1336 | case CPU_UP_PREPARE: |
1336 | init_timers_cpu(cpu); | 1337 | init_timers_cpu(cpu); |
1337 | break; | 1338 | break; |
1338 | #ifdef CONFIG_HOTPLUG_CPU | 1339 | #ifdef CONFIG_HOTPLUG_CPU |
1339 | case CPU_DEAD: | 1340 | case CPU_DEAD: |
1340 | migrate_timers(cpu); | 1341 | migrate_timers(cpu); |
1341 | break; | 1342 | break; |
1342 | #endif | 1343 | #endif |
1343 | default: | 1344 | default: |
1344 | break; | 1345 | break; |
1345 | } | 1346 | } |
1346 | return NOTIFY_OK; | 1347 | return NOTIFY_OK; |
1347 | } | 1348 | } |
1348 | 1349 | ||
1349 | static struct notifier_block __devinitdata timers_nb = { | 1350 | static struct notifier_block __devinitdata timers_nb = { |
1350 | .notifier_call = timer_cpu_notify, | 1351 | .notifier_call = timer_cpu_notify, |
1351 | }; | 1352 | }; |
1352 | 1353 | ||
1353 | 1354 | ||
1354 | void __init init_timers(void) | 1355 | void __init init_timers(void) |
1355 | { | 1356 | { |
1356 | timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, | 1357 | timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, |
1357 | (void *)(long)smp_processor_id()); | 1358 | (void *)(long)smp_processor_id()); |
1358 | register_cpu_notifier(&timers_nb); | 1359 | register_cpu_notifier(&timers_nb); |
1359 | open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL); | 1360 | open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL); |
1360 | } | 1361 | } |
1361 | 1362 | ||
1362 | #ifdef CONFIG_TIME_INTERPOLATION | 1363 | #ifdef CONFIG_TIME_INTERPOLATION |
1363 | 1364 | ||
1364 | struct time_interpolator *time_interpolator; | 1365 | struct time_interpolator *time_interpolator; |
1365 | static struct time_interpolator *time_interpolator_list; | 1366 | static struct time_interpolator *time_interpolator_list; |
1366 | static DEFINE_SPINLOCK(time_interpolator_lock); | 1367 | static DEFINE_SPINLOCK(time_interpolator_lock); |
1367 | 1368 | ||
1368 | static inline u64 time_interpolator_get_cycles(unsigned int src) | 1369 | static inline u64 time_interpolator_get_cycles(unsigned int src) |
1369 | { | 1370 | { |
1370 | unsigned long (*x)(void); | 1371 | unsigned long (*x)(void); |
1371 | 1372 | ||
1372 | switch (src) | 1373 | switch (src) |
1373 | { | 1374 | { |
1374 | case TIME_SOURCE_FUNCTION: | 1375 | case TIME_SOURCE_FUNCTION: |
1375 | x = time_interpolator->addr; | 1376 | x = time_interpolator->addr; |
1376 | return x(); | 1377 | return x(); |
1377 | 1378 | ||
1378 | case TIME_SOURCE_MMIO64 : | 1379 | case TIME_SOURCE_MMIO64 : |
1379 | return readq((void __iomem *) time_interpolator->addr); | 1380 | return readq((void __iomem *) time_interpolator->addr); |
1380 | 1381 | ||
1381 | case TIME_SOURCE_MMIO32 : | 1382 | case TIME_SOURCE_MMIO32 : |
1382 | return readl((void __iomem *) time_interpolator->addr); | 1383 | return readl((void __iomem *) time_interpolator->addr); |
1383 | 1384 | ||
1384 | default: return get_cycles(); | 1385 | default: return get_cycles(); |
1385 | } | 1386 | } |
1386 | } | 1387 | } |
1387 | 1388 | ||
1388 | static inline u64 time_interpolator_get_counter(int writelock) | 1389 | static inline u64 time_interpolator_get_counter(int writelock) |
1389 | { | 1390 | { |
1390 | unsigned int src = time_interpolator->source; | 1391 | unsigned int src = time_interpolator->source; |
1391 | 1392 | ||
1392 | if (time_interpolator->jitter) | 1393 | if (time_interpolator->jitter) |
1393 | { | 1394 | { |
1394 | u64 lcycle; | 1395 | u64 lcycle; |
1395 | u64 now; | 1396 | u64 now; |
1396 | 1397 | ||
1397 | do { | 1398 | do { |
1398 | lcycle = time_interpolator->last_cycle; | 1399 | lcycle = time_interpolator->last_cycle; |
1399 | now = time_interpolator_get_cycles(src); | 1400 | now = time_interpolator_get_cycles(src); |
1400 | if (lcycle && time_after(lcycle, now)) | 1401 | if (lcycle && time_after(lcycle, now)) |
1401 | return lcycle; | 1402 | return lcycle; |
1402 | 1403 | ||
1403 | /* When holding the xtime write lock, there's no need | 1404 | /* When holding the xtime write lock, there's no need |
1404 | * to add the overhead of the cmpxchg. Readers are | 1405 | * to add the overhead of the cmpxchg. Readers are |
1405 | * forced to retry until the write lock is released. | 1406 | * forced to retry until the write lock is released. |
1406 | */ | 1407 | */ |
1407 | if (writelock) { | 1408 | if (writelock) { |
1408 | time_interpolator->last_cycle = now; | 1409 | time_interpolator->last_cycle = now; |
1409 | return now; | 1410 | return now; |
1410 | } | 1411 | } |
1411 | /* Keep track of the last timer value returned. The use of cmpxchg here | 1412 | /* Keep track of the last timer value returned. The use of cmpxchg here |
1412 | * will cause contention in an SMP environment. | 1413 | * will cause contention in an SMP environment. |
1413 | */ | 1414 | */ |
1414 | } while (unlikely(cmpxchg(&time_interpolator->last_cycle, lcycle, now) != lcycle)); | 1415 | } while (unlikely(cmpxchg(&time_interpolator->last_cycle, lcycle, now) != lcycle)); |
1415 | return now; | 1416 | return now; |
1416 | } | 1417 | } |
1417 | else | 1418 | else |
1418 | return time_interpolator_get_cycles(src); | 1419 | return time_interpolator_get_cycles(src); |
1419 | } | 1420 | } |
1420 | 1421 | ||
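
Stripped of the interpolator specifics, the cmpxchg loop above is a generic "never go backwards" sample of a shared counter; a sketch with a hypothetical helper name:

	static u64 monotonic_sample(u64 *last, u64 now)
	{
		u64 prev;

		for (;;) {
			prev = *last;
			if (now <= prev)
				return prev;	/* someone already published a later value */
			if (cmpxchg(last, prev, now) == prev)
				return now;	/* we advanced the shared value ourselves */
		}
	}
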
1421 | void time_interpolator_reset(void) | 1422 | void time_interpolator_reset(void) |
1422 | { | 1423 | { |
1423 | time_interpolator->offset = 0; | 1424 | time_interpolator->offset = 0; |
1424 | time_interpolator->last_counter = time_interpolator_get_counter(1); | 1425 | time_interpolator->last_counter = time_interpolator_get_counter(1); |
1425 | } | 1426 | } |
1426 | 1427 | ||
1427 | #define GET_TI_NSECS(count,i) (((((count) - i->last_counter) & (i)->mask) * (i)->nsec_per_cyc) >> (i)->shift) | 1428 | #define GET_TI_NSECS(count,i) (((((count) - i->last_counter) & (i)->mask) * (i)->nsec_per_cyc) >> (i)->shift) |
1428 | 1429 | ||
1429 | unsigned long time_interpolator_get_offset(void) | 1430 | unsigned long time_interpolator_get_offset(void) |
1430 | { | 1431 | { |
1431 | /* If we do not have a time interpolator set up then just return zero */ | 1432 | /* If we do not have a time interpolator set up then just return zero */ |
1432 | if (!time_interpolator) | 1433 | if (!time_interpolator) |
1433 | return 0; | 1434 | return 0; |
1434 | 1435 | ||
1435 | return time_interpolator->offset + | 1436 | return time_interpolator->offset + |
1436 | GET_TI_NSECS(time_interpolator_get_counter(0), time_interpolator); | 1437 | GET_TI_NSECS(time_interpolator_get_counter(0), time_interpolator); |
1437 | } | 1438 | } |
1438 | 1439 | ||
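
GET_TI_NSECS() above is a fixed-point conversion: the elapsed cycle count since last_counter (masked to the width of the hardware counter) is multiplied by nsec_per_cyc, which register_time_interpolator() below precomputes as (NSEC_PER_SEC << shift) / frequency, and the result is shifted back down, yielding nanoseconds.
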
1439 | #define INTERPOLATOR_ADJUST 65536 | 1440 | #define INTERPOLATOR_ADJUST 65536 |
1440 | #define INTERPOLATOR_MAX_SKIP 10*INTERPOLATOR_ADJUST | 1441 | #define INTERPOLATOR_MAX_SKIP 10*INTERPOLATOR_ADJUST |
1441 | 1442 | ||
1442 | static void time_interpolator_update(long delta_nsec) | 1443 | static void time_interpolator_update(long delta_nsec) |
1443 | { | 1444 | { |
1444 | u64 counter; | 1445 | u64 counter; |
1445 | unsigned long offset; | 1446 | unsigned long offset; |
1446 | 1447 | ||
1447 | /* If there is no time interpolator set up then do nothing */ | 1448 | /* If there is no time interpolator set up then do nothing */ |
1448 | if (!time_interpolator) | 1449 | if (!time_interpolator) |
1449 | return; | 1450 | return; |
1450 | 1451 | ||
1451 | /* | 1452 | /* |
1452 | * The interpolator compensates for late ticks by accumulating the late | 1453 | * The interpolator compensates for late ticks by accumulating the late |
1453 | * time in time_interpolator->offset. A tick earlier than expected will | 1454 | * time in time_interpolator->offset. A tick earlier than expected will |
1454 | * lead to a reset of the offset and a corresponding jump of the clock | 1455 | * lead to a reset of the offset and a corresponding jump of the clock |
1455 | * forward. Again this only works if the interpolator clock is running | 1456 | * forward. Again this only works if the interpolator clock is running |
1456 | * slightly slower than the regular clock and the tuning logic ensures | 1457 | * slightly slower than the regular clock and the tuning logic ensures |
1457 | * that. | 1458 | * that. |
1458 | */ | 1459 | */ |
1459 | 1460 | ||
1460 | counter = time_interpolator_get_counter(1); | 1461 | counter = time_interpolator_get_counter(1); |
1461 | offset = time_interpolator->offset + | 1462 | offset = time_interpolator->offset + |
1462 | GET_TI_NSECS(counter, time_interpolator); | 1463 | GET_TI_NSECS(counter, time_interpolator); |
1463 | 1464 | ||
1464 | if (delta_nsec < 0 || (unsigned long) delta_nsec < offset) | 1465 | if (delta_nsec < 0 || (unsigned long) delta_nsec < offset) |
1465 | time_interpolator->offset = offset - delta_nsec; | 1466 | time_interpolator->offset = offset - delta_nsec; |
1466 | else { | 1467 | else { |
1467 | time_interpolator->skips++; | 1468 | time_interpolator->skips++; |
1468 | time_interpolator->ns_skipped += delta_nsec - offset; | 1469 | time_interpolator->ns_skipped += delta_nsec - offset; |
1469 | time_interpolator->offset = 0; | 1470 | time_interpolator->offset = 0; |
1470 | } | 1471 | } |
1471 | time_interpolator->last_counter = counter; | 1472 | time_interpolator->last_counter = counter; |
1472 | 1473 | ||
1473 | /* Tuning logic for time interpolator invoked every minute or so. | 1474 | /* Tuning logic for time interpolator invoked every minute or so. |
1474 | * Decrease interpolator clock speed if no skips occurred and an offset is carried. | 1475 | * Decrease interpolator clock speed if no skips occurred and an offset is carried. |
1475 | * Increase interpolator clock speed if we skip too much time. | 1476 | * Increase interpolator clock speed if we skip too much time. |
1476 | */ | 1477 | */ |
1477 | if (jiffies % INTERPOLATOR_ADJUST == 0) | 1478 | if (jiffies % INTERPOLATOR_ADJUST == 0) |
1478 | { | 1479 | { |
1479 | if (time_interpolator->skips == 0 && time_interpolator->offset > TICK_NSEC) | 1480 | if (time_interpolator->skips == 0 && time_interpolator->offset > TICK_NSEC) |
1480 | time_interpolator->nsec_per_cyc--; | 1481 | time_interpolator->nsec_per_cyc--; |
1481 | if (time_interpolator->ns_skipped > INTERPOLATOR_MAX_SKIP && time_interpolator->offset == 0) | 1482 | if (time_interpolator->ns_skipped > INTERPOLATOR_MAX_SKIP && time_interpolator->offset == 0) |
1482 | time_interpolator->nsec_per_cyc++; | 1483 | time_interpolator->nsec_per_cyc++; |
1483 | time_interpolator->skips = 0; | 1484 | time_interpolator->skips = 0; |
1484 | time_interpolator->ns_skipped = 0; | 1485 | time_interpolator->ns_skipped = 0; |
1485 | } | 1486 | } |
1486 | } | 1487 | } |
1487 | 1488 | ||
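
As a quick sanity check on the comment above: with HZ=1000 the jiffies % INTERPOLATOR_ADJUST == 0 test fires every 65536 jiffies, i.e. roughly every 65 seconds, matching the promised "every minute or so"; at HZ=250 the interval would stretch to about four and a half minutes.
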
1488 | static inline int | 1489 | static inline int |
1489 | is_better_time_interpolator(struct time_interpolator *new) | 1490 | is_better_time_interpolator(struct time_interpolator *new) |
1490 | { | 1491 | { |
1491 | if (!time_interpolator) | 1492 | if (!time_interpolator) |
1492 | return 1; | 1493 | return 1; |
1493 | return new->frequency > 2*time_interpolator->frequency || | 1494 | return new->frequency > 2*time_interpolator->frequency || |
1494 | (unsigned long)new->drift < (unsigned long)time_interpolator->drift; | 1495 | (unsigned long)new->drift < (unsigned long)time_interpolator->drift; |
1495 | } | 1496 | } |
1496 | 1497 | ||
1497 | void | 1498 | void |
1498 | register_time_interpolator(struct time_interpolator *ti) | 1499 | register_time_interpolator(struct time_interpolator *ti) |
1499 | { | 1500 | { |
1500 | unsigned long flags; | 1501 | unsigned long flags; |
1501 | 1502 | ||
1502 | /* Sanity check */ | 1503 | /* Sanity check */ |
1503 | if (ti->frequency == 0 || ti->mask == 0) | 1504 | if (ti->frequency == 0 || ti->mask == 0) |
1504 | BUG(); | 1505 | BUG(); |
1505 | 1506 | ||
1506 | ti->nsec_per_cyc = ((u64)NSEC_PER_SEC << ti->shift) / ti->frequency; | 1507 | ti->nsec_per_cyc = ((u64)NSEC_PER_SEC << ti->shift) / ti->frequency; |
1507 | spin_lock(&time_interpolator_lock); | 1508 | spin_lock(&time_interpolator_lock); |
1508 | write_seqlock_irqsave(&xtime_lock, flags); | 1509 | write_seqlock_irqsave(&xtime_lock, flags); |
1509 | if (is_better_time_interpolator(ti)) { | 1510 | if (is_better_time_interpolator(ti)) { |
1510 | time_interpolator = ti; | 1511 | time_interpolator = ti; |
1511 | time_interpolator_reset(); | 1512 | time_interpolator_reset(); |
1512 | } | 1513 | } |
1513 | write_sequnlock_irqrestore(&xtime_lock, flags); | 1514 | write_sequnlock_irqrestore(&xtime_lock, flags); |
1514 | 1515 | ||
1515 | ti->next = time_interpolator_list; | 1516 | ti->next = time_interpolator_list; |
1516 | time_interpolator_list = ti; | 1517 | time_interpolator_list = ti; |
1517 | spin_unlock(&time_interpolator_lock); | 1518 | spin_unlock(&time_interpolator_lock); |
1518 | } | 1519 | } |
1519 | 1520 | ||
1520 | void | 1521 | void |
1521 | unregister_time_interpolator(struct time_interpolator *ti) | 1522 | unregister_time_interpolator(struct time_interpolator *ti) |
1522 | { | 1523 | { |
1523 | struct time_interpolator *curr, **prev; | 1524 | struct time_interpolator *curr, **prev; |
1524 | unsigned long flags; | 1525 | unsigned long flags; |
1525 | 1526 | ||
1526 | spin_lock(&time_interpolator_lock); | 1527 | spin_lock(&time_interpolator_lock); |
1527 | prev = &time_interpolator_list; | 1528 | prev = &time_interpolator_list; |
1528 | for (curr = *prev; curr; curr = curr->next) { | 1529 | for (curr = *prev; curr; curr = curr->next) { |
1529 | if (curr == ti) { | 1530 | if (curr == ti) { |
1530 | *prev = curr->next; | 1531 | *prev = curr->next; |
1531 | break; | 1532 | break; |
1532 | } | 1533 | } |
1533 | prev = &curr->next; | 1534 | prev = &curr->next; |
1534 | } | 1535 | } |
1535 | 1536 | ||
1536 | write_seqlock_irqsave(&xtime_lock, flags); | 1537 | write_seqlock_irqsave(&xtime_lock, flags); |
1537 | if (ti == time_interpolator) { | 1538 | if (ti == time_interpolator) { |
1538 | /* we lost the best time-interpolator: */ | 1539 | /* we lost the best time-interpolator: */ |
1539 | time_interpolator = NULL; | 1540 | time_interpolator = NULL; |
1540 | /* find the next-best interpolator */ | 1541 | /* find the next-best interpolator */ |
1541 | for (curr = time_interpolator_list; curr; curr = curr->next) | 1542 | for (curr = time_interpolator_list; curr; curr = curr->next) |
1542 | if (is_better_time_interpolator(curr)) | 1543 | if (is_better_time_interpolator(curr)) |
1543 | time_interpolator = curr; | 1544 | time_interpolator = curr; |
1544 | time_interpolator_reset(); | 1545 | time_interpolator_reset(); |
1545 | } | 1546 | } |
1546 | write_sequnlock_irqrestore(&xtime_lock, flags); | 1547 | write_sequnlock_irqrestore(&xtime_lock, flags); |
1547 | spin_unlock(&time_interpolator_lock); | 1548 | spin_unlock(&time_interpolator_lock); |
1548 | } | 1549 | } |
1549 | #endif /* CONFIG_TIME_INTERPOLATION */ | 1550 | #endif /* CONFIG_TIME_INTERPOLATION */ |
1550 | 1551 | ||
1551 | /** | 1552 | /** |
1552 | * msleep - sleep safely even with waitqueue interruptions | 1553 | * msleep - sleep safely even with waitqueue interruptions |
1553 | * @msecs: Time in milliseconds to sleep for | 1554 | * @msecs: Time in milliseconds to sleep for |
1554 | */ | 1555 | */ |
1555 | void msleep(unsigned int msecs) | 1556 | void msleep(unsigned int msecs) |
1556 | { | 1557 | { |
1557 | unsigned long timeout = msecs_to_jiffies(msecs) + 1; | 1558 | unsigned long timeout = msecs_to_jiffies(msecs) + 1; |
1558 | 1559 | ||
1559 | while (timeout) | 1560 | while (timeout) |
1560 | timeout = schedule_timeout_uninterruptible(timeout); | 1561 | timeout = schedule_timeout_uninterruptible(timeout); |
1561 | } | 1562 | } |
1562 | 1563 | ||
1563 | EXPORT_SYMBOL(msleep); | 1564 | EXPORT_SYMBOL(msleep); |
1564 | 1565 | ||
1565 | /** | 1566 | /** |
1566 | * msleep_interruptible - sleep waiting for signals | 1567 | * msleep_interruptible - sleep waiting for signals |
1567 | * @msecs: Time in milliseconds to sleep for | 1568 | * @msecs: Time in milliseconds to sleep for |
1568 | */ | 1569 | */ |
1569 | unsigned long msleep_interruptible(unsigned int msecs) | 1570 | unsigned long msleep_interruptible(unsigned int msecs) |
1570 | { | 1571 | { |
1571 | unsigned long timeout = msecs_to_jiffies(msecs) + 1; | 1572 | unsigned long timeout = msecs_to_jiffies(msecs) + 1; |
1572 | 1573 | ||
1573 | while (timeout && !signal_pending(current)) | 1574 | while (timeout && !signal_pending(current)) |
1574 | timeout = schedule_timeout_interruptible(timeout); | 1575 | timeout = schedule_timeout_interruptible(timeout); |
1575 | return jiffies_to_msecs(timeout); | 1576 | return jiffies_to_msecs(timeout); |
1576 | } | 1577 | } |
1577 | 1578 | ||
1578 | EXPORT_SYMBOL(msleep_interruptible); | 1579 | EXPORT_SYMBOL(msleep_interruptible); |
1579 | 1580 |
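
Finally, a caller-side sketch for the two helpers above (the function name is hypothetical): use msleep() when the delay must not be cut short, and msleep_interruptible() when a pending signal should win:

	static int settle_hardware(void)
	{
		/* back out early if the calling task is signalled during the wait */
		if (msleep_interruptible(50))
			return -EINTR;
		msleep(10);	/* short uninterruptible tail delay */
		return 0;
	}
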