Commit fd450b7318b75343fd76b3d95416853e34e72c95

Authored by Oleg Nesterov
Committed by Linus Torvalds
1 parent 55c888d6d0

[PATCH] timers: introduce try_to_del_timer_sync()

This patch splits del_timer_sync() into two functions.  The new one,
try_to_del_timer_sync(), returns -1 when it hits an executing timer.

It can be used in interrupt context, or when the caller holds locks
which can prevent completion of the timer's handler.

NOTE: Currently it can't be used in interrupt context in the UP case,
because ->running_timer is used only with CONFIG_SMP.

Should the need arise, it is possible to kill the #ifdef CONFIG_SMP in
set_running_timer(); it is cheap.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
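
As a usage sketch (not part of this patch), the new primitive lets a caller
that holds a lock also taken by the timer handler retry the deletion instead
of deadlocking in del_timer_sync().  The lock, timer and handler names below
are purely illustrative:

/*
 * Illustrative only: stop a timer while holding a lock that the timer
 * handler itself acquires.  del_timer_sync() would deadlock here, so
 * retry with try_to_del_timer_sync(), dropping the lock between
 * attempts so the running handler can complete.
 */
static void example_stop_timer(void)
{
	spin_lock_bh(&example_lock);
	while (try_to_del_timer_sync(&example_timer) < 0) {
		spin_unlock_bh(&example_lock);
		cpu_relax();
		spin_lock_bh(&example_lock);
	}
	/* timer is not queued and its handler is not running */
	spin_unlock_bh(&example_lock);
}

A return value of -1 only means the handler was found running; 0 and 1 keep
del_timer()'s meaning (timer was inactive or pending, respectively).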

Showing 2 changed files with 36 additions and 21 deletions

include/linux/timer.h
1 #ifndef _LINUX_TIMER_H 1 #ifndef _LINUX_TIMER_H
2 #define _LINUX_TIMER_H 2 #define _LINUX_TIMER_H
3 3
4 #include <linux/config.h> 4 #include <linux/config.h>
5 #include <linux/list.h> 5 #include <linux/list.h>
6 #include <linux/spinlock.h> 6 #include <linux/spinlock.h>
7 #include <linux/stddef.h> 7 #include <linux/stddef.h>
8 8
9 struct timer_base_s; 9 struct timer_base_s;
10 10
11 struct timer_list { 11 struct timer_list {
12 struct list_head entry; 12 struct list_head entry;
13 unsigned long expires; 13 unsigned long expires;
14 14
15 unsigned long magic; 15 unsigned long magic;
16 16
17 void (*function)(unsigned long); 17 void (*function)(unsigned long);
18 unsigned long data; 18 unsigned long data;
19 19
20 struct timer_base_s *base; 20 struct timer_base_s *base;
21 }; 21 };
22 22
23 #define TIMER_MAGIC 0x4b87ad6e 23 #define TIMER_MAGIC 0x4b87ad6e
24 24
25 extern struct timer_base_s __init_timer_base; 25 extern struct timer_base_s __init_timer_base;
26 26
27 #define TIMER_INITIALIZER(_function, _expires, _data) { \ 27 #define TIMER_INITIALIZER(_function, _expires, _data) { \
28 .function = (_function), \ 28 .function = (_function), \
29 .expires = (_expires), \ 29 .expires = (_expires), \
30 .data = (_data), \ 30 .data = (_data), \
31 .base = &__init_timer_base, \ 31 .base = &__init_timer_base, \
32 .magic = TIMER_MAGIC, \ 32 .magic = TIMER_MAGIC, \
33 } 33 }
34 34
35 void fastcall init_timer(struct timer_list * timer); 35 void fastcall init_timer(struct timer_list * timer);
36 36
37 /*** 37 /***
38 * timer_pending - is a timer pending? 38 * timer_pending - is a timer pending?
39 * @timer: the timer in question 39 * @timer: the timer in question
40 * 40 *
41 * timer_pending will tell whether a given timer is currently pending, 41 * timer_pending will tell whether a given timer is currently pending,
42 * or not. Callers must ensure serialization wrt. other operations done 42 * or not. Callers must ensure serialization wrt. other operations done
43 * to this timer, eg. interrupt contexts, or other CPUs on SMP. 43 * to this timer, eg. interrupt contexts, or other CPUs on SMP.
44 * 44 *
45 * return value: 1 if the timer is pending, 0 if not. 45 * return value: 1 if the timer is pending, 0 if not.
46 */ 46 */
47 static inline int timer_pending(const struct timer_list * timer) 47 static inline int timer_pending(const struct timer_list * timer)
48 { 48 {
49 return timer->entry.next != NULL; 49 return timer->entry.next != NULL;
50 } 50 }
51 51
52 extern void add_timer_on(struct timer_list *timer, int cpu); 52 extern void add_timer_on(struct timer_list *timer, int cpu);
53 extern int del_timer(struct timer_list * timer); 53 extern int del_timer(struct timer_list * timer);
54 extern int __mod_timer(struct timer_list *timer, unsigned long expires); 54 extern int __mod_timer(struct timer_list *timer, unsigned long expires);
55 extern int mod_timer(struct timer_list *timer, unsigned long expires); 55 extern int mod_timer(struct timer_list *timer, unsigned long expires);
56 56
57 extern unsigned long next_timer_interrupt(void); 57 extern unsigned long next_timer_interrupt(void);
58 58
59 /*** 59 /***
60 * add_timer - start a timer 60 * add_timer - start a timer
61 * @timer: the timer to be added 61 * @timer: the timer to be added
62 * 62 *
63 * The kernel will do a ->function(->data) callback from the 63 * The kernel will do a ->function(->data) callback from the
64 * timer interrupt at the ->expired point in the future. The 64 * timer interrupt at the ->expired point in the future. The
65 * current time is 'jiffies'. 65 * current time is 'jiffies'.
66 * 66 *
67 * The timer's ->expired, ->function (and if the handler uses it, ->data) 67 * The timer's ->expired, ->function (and if the handler uses it, ->data)
68 * fields must be set prior calling this function. 68 * fields must be set prior calling this function.
69 * 69 *
70 * Timers with an ->expired field in the past will be executed in the next 70 * Timers with an ->expired field in the past will be executed in the next
71 * timer tick. 71 * timer tick.
72 */ 72 */
73 static inline void add_timer(struct timer_list * timer) 73 static inline void add_timer(struct timer_list * timer)
74 { 74 {
75 __mod_timer(timer, timer->expires); 75 __mod_timer(timer, timer->expires);
76 } 76 }
77 77
78 #ifdef CONFIG_SMP 78 #ifdef CONFIG_SMP
79 extern int try_to_del_timer_sync(struct timer_list *timer);
79 extern int del_timer_sync(struct timer_list *timer); 80 extern int del_timer_sync(struct timer_list *timer);
80 #else 81 #else
81 # define del_timer_sync(t) del_timer(t) 82 # define try_to_del_timer_sync(t) del_timer(t)
83 # define del_timer_sync(t) del_timer(t)
82 #endif 84 #endif
83 85
84 #define del_singleshot_timer_sync(t) del_timer_sync(t) 86 #define del_singleshot_timer_sync(t) del_timer_sync(t)
85 87
86 extern void init_timers(void); 88 extern void init_timers(void);
87 extern void run_local_timers(void); 89 extern void run_local_timers(void);
88 extern void it_real_fn(unsigned long); 90 extern void it_real_fn(unsigned long);
89 91
90 #endif 92 #endif
91 93
kernel/timer.c
1 /* 1 /*
2 * linux/kernel/timer.c 2 * linux/kernel/timer.c
3 * 3 *
4 * Kernel internal timers, kernel timekeeping, basic process system calls 4 * Kernel internal timers, kernel timekeeping, basic process system calls
5 * 5 *
6 * Copyright (C) 1991, 1992 Linus Torvalds 6 * Copyright (C) 1991, 1992 Linus Torvalds
7 * 7 *
8 * 1997-01-28 Modified by Finn Arne Gangstad to make timers scale better. 8 * 1997-01-28 Modified by Finn Arne Gangstad to make timers scale better.
9 * 9 *
10 * 1997-09-10 Updated NTP code according to technical memorandum Jan '96 10 * 1997-09-10 Updated NTP code according to technical memorandum Jan '96
11 * "A Kernel Model for Precision Timekeeping" by Dave Mills 11 * "A Kernel Model for Precision Timekeeping" by Dave Mills
12 * 1998-12-24 Fixed a xtime SMP race (we need the xtime_lock rw spinlock to 12 * 1998-12-24 Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
13 * serialize accesses to xtime/lost_ticks). 13 * serialize accesses to xtime/lost_ticks).
14 * Copyright (C) 1998 Andrea Arcangeli 14 * Copyright (C) 1998 Andrea Arcangeli
15 * 1999-03-10 Improved NTP compatibility by Ulrich Windl 15 * 1999-03-10 Improved NTP compatibility by Ulrich Windl
16 * 2002-05-31 Move sys_sysinfo here and make its locking sane, Robert Love 16 * 2002-05-31 Move sys_sysinfo here and make its locking sane, Robert Love
17 * 2000-10-05 Implemented scalable SMP per-CPU timer handling. 17 * 2000-10-05 Implemented scalable SMP per-CPU timer handling.
18 * Copyright (C) 2000, 2001, 2002 Ingo Molnar 18 * Copyright (C) 2000, 2001, 2002 Ingo Molnar
19 * Designed by David S. Miller, Alexey Kuznetsov and Ingo Molnar 19 * Designed by David S. Miller, Alexey Kuznetsov and Ingo Molnar
20 */ 20 */
21 21
22 #include <linux/kernel_stat.h> 22 #include <linux/kernel_stat.h>
23 #include <linux/module.h> 23 #include <linux/module.h>
24 #include <linux/interrupt.h> 24 #include <linux/interrupt.h>
25 #include <linux/percpu.h> 25 #include <linux/percpu.h>
26 #include <linux/init.h> 26 #include <linux/init.h>
27 #include <linux/mm.h> 27 #include <linux/mm.h>
28 #include <linux/swap.h> 28 #include <linux/swap.h>
29 #include <linux/notifier.h> 29 #include <linux/notifier.h>
30 #include <linux/thread_info.h> 30 #include <linux/thread_info.h>
31 #include <linux/time.h> 31 #include <linux/time.h>
32 #include <linux/jiffies.h> 32 #include <linux/jiffies.h>
33 #include <linux/posix-timers.h> 33 #include <linux/posix-timers.h>
34 #include <linux/cpu.h> 34 #include <linux/cpu.h>
35 #include <linux/syscalls.h> 35 #include <linux/syscalls.h>
36 36
37 #include <asm/uaccess.h> 37 #include <asm/uaccess.h>
38 #include <asm/unistd.h> 38 #include <asm/unistd.h>
39 #include <asm/div64.h> 39 #include <asm/div64.h>
40 #include <asm/timex.h> 40 #include <asm/timex.h>
41 #include <asm/io.h> 41 #include <asm/io.h>
42 42
43 #ifdef CONFIG_TIME_INTERPOLATION 43 #ifdef CONFIG_TIME_INTERPOLATION
44 static void time_interpolator_update(long delta_nsec); 44 static void time_interpolator_update(long delta_nsec);
45 #else 45 #else
46 #define time_interpolator_update(x) 46 #define time_interpolator_update(x)
47 #endif 47 #endif
48 48
49 /* 49 /*
50 * per-CPU timer vector definitions: 50 * per-CPU timer vector definitions:
51 */ 51 */
52 52
53 #define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6) 53 #define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6)
54 #define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8) 54 #define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8)
55 #define TVN_SIZE (1 << TVN_BITS) 55 #define TVN_SIZE (1 << TVN_BITS)
56 #define TVR_SIZE (1 << TVR_BITS) 56 #define TVR_SIZE (1 << TVR_BITS)
57 #define TVN_MASK (TVN_SIZE - 1) 57 #define TVN_MASK (TVN_SIZE - 1)
58 #define TVR_MASK (TVR_SIZE - 1) 58 #define TVR_MASK (TVR_SIZE - 1)
59 59
60 struct timer_base_s { 60 struct timer_base_s {
61 spinlock_t lock; 61 spinlock_t lock;
62 struct timer_list *running_timer; 62 struct timer_list *running_timer;
63 }; 63 };
64 64
65 typedef struct tvec_s { 65 typedef struct tvec_s {
66 struct list_head vec[TVN_SIZE]; 66 struct list_head vec[TVN_SIZE];
67 } tvec_t; 67 } tvec_t;
68 68
69 typedef struct tvec_root_s { 69 typedef struct tvec_root_s {
70 struct list_head vec[TVR_SIZE]; 70 struct list_head vec[TVR_SIZE];
71 } tvec_root_t; 71 } tvec_root_t;
72 72
73 struct tvec_t_base_s { 73 struct tvec_t_base_s {
74 struct timer_base_s t_base; 74 struct timer_base_s t_base;
75 unsigned long timer_jiffies; 75 unsigned long timer_jiffies;
76 tvec_root_t tv1; 76 tvec_root_t tv1;
77 tvec_t tv2; 77 tvec_t tv2;
78 tvec_t tv3; 78 tvec_t tv3;
79 tvec_t tv4; 79 tvec_t tv4;
80 tvec_t tv5; 80 tvec_t tv5;
81 } ____cacheline_aligned_in_smp; 81 } ____cacheline_aligned_in_smp;
82 82
83 typedef struct tvec_t_base_s tvec_base_t; 83 typedef struct tvec_t_base_s tvec_base_t;
84 static DEFINE_PER_CPU(tvec_base_t, tvec_bases); 84 static DEFINE_PER_CPU(tvec_base_t, tvec_bases);
85 85
86 static inline void set_running_timer(tvec_base_t *base, 86 static inline void set_running_timer(tvec_base_t *base,
87 struct timer_list *timer) 87 struct timer_list *timer)
88 { 88 {
89 #ifdef CONFIG_SMP 89 #ifdef CONFIG_SMP
90 base->t_base.running_timer = timer; 90 base->t_base.running_timer = timer;
91 #endif 91 #endif
92 } 92 }
93 93
94 static void check_timer_failed(struct timer_list *timer) 94 static void check_timer_failed(struct timer_list *timer)
95 { 95 {
96 static int whine_count; 96 static int whine_count;
97 if (whine_count < 16) { 97 if (whine_count < 16) {
98 whine_count++; 98 whine_count++;
99 printk("Uninitialised timer!\n"); 99 printk("Uninitialised timer!\n");
100 printk("This is just a warning. Your computer is OK\n"); 100 printk("This is just a warning. Your computer is OK\n");
101 printk("function=0x%p, data=0x%lx\n", 101 printk("function=0x%p, data=0x%lx\n",
102 timer->function, timer->data); 102 timer->function, timer->data);
103 dump_stack(); 103 dump_stack();
104 } 104 }
105 /* 105 /*
106 * Now fix it up 106 * Now fix it up
107 */ 107 */
108 timer->magic = TIMER_MAGIC; 108 timer->magic = TIMER_MAGIC;
109 } 109 }
110 110
111 static inline void check_timer(struct timer_list *timer) 111 static inline void check_timer(struct timer_list *timer)
112 { 112 {
113 if (timer->magic != TIMER_MAGIC) 113 if (timer->magic != TIMER_MAGIC)
114 check_timer_failed(timer); 114 check_timer_failed(timer);
115 } 115 }
116 116
117 117
118 static void internal_add_timer(tvec_base_t *base, struct timer_list *timer) 118 static void internal_add_timer(tvec_base_t *base, struct timer_list *timer)
119 { 119 {
120 unsigned long expires = timer->expires; 120 unsigned long expires = timer->expires;
121 unsigned long idx = expires - base->timer_jiffies; 121 unsigned long idx = expires - base->timer_jiffies;
122 struct list_head *vec; 122 struct list_head *vec;
123 123
124 if (idx < TVR_SIZE) { 124 if (idx < TVR_SIZE) {
125 int i = expires & TVR_MASK; 125 int i = expires & TVR_MASK;
126 vec = base->tv1.vec + i; 126 vec = base->tv1.vec + i;
127 } else if (idx < 1 << (TVR_BITS + TVN_BITS)) { 127 } else if (idx < 1 << (TVR_BITS + TVN_BITS)) {
128 int i = (expires >> TVR_BITS) & TVN_MASK; 128 int i = (expires >> TVR_BITS) & TVN_MASK;
129 vec = base->tv2.vec + i; 129 vec = base->tv2.vec + i;
130 } else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) { 130 } else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) {
131 int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK; 131 int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK;
132 vec = base->tv3.vec + i; 132 vec = base->tv3.vec + i;
133 } else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) { 133 } else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) {
134 int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK; 134 int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK;
135 vec = base->tv4.vec + i; 135 vec = base->tv4.vec + i;
136 } else if ((signed long) idx < 0) { 136 } else if ((signed long) idx < 0) {
137 /* 137 /*
138 * Can happen if you add a timer with expires == jiffies, 138 * Can happen if you add a timer with expires == jiffies,
139 * or you set a timer to go off in the past 139 * or you set a timer to go off in the past
140 */ 140 */
141 vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK); 141 vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK);
142 } else { 142 } else {
143 int i; 143 int i;
144 /* If the timeout is larger than 0xffffffff on 64-bit 144 /* If the timeout is larger than 0xffffffff on 64-bit
145 * architectures then we use the maximum timeout: 145 * architectures then we use the maximum timeout:
146 */ 146 */
147 if (idx > 0xffffffffUL) { 147 if (idx > 0xffffffffUL) {
148 idx = 0xffffffffUL; 148 idx = 0xffffffffUL;
149 expires = idx + base->timer_jiffies; 149 expires = idx + base->timer_jiffies;
150 } 150 }
151 i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK; 151 i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
152 vec = base->tv5.vec + i; 152 vec = base->tv5.vec + i;
153 } 153 }
154 /* 154 /*
155 * Timers are FIFO: 155 * Timers are FIFO:
156 */ 156 */
157 list_add_tail(&timer->entry, vec); 157 list_add_tail(&timer->entry, vec);
158 } 158 }
159 159
160 typedef struct timer_base_s timer_base_t; 160 typedef struct timer_base_s timer_base_t;
161 /* 161 /*
162 * Used by TIMER_INITIALIZER, we can't use per_cpu(tvec_bases) 162 * Used by TIMER_INITIALIZER, we can't use per_cpu(tvec_bases)
163 * at compile time, and we need timer->base to lock the timer. 163 * at compile time, and we need timer->base to lock the timer.
164 */ 164 */
165 timer_base_t __init_timer_base 165 timer_base_t __init_timer_base
166 ____cacheline_aligned_in_smp = { .lock = SPIN_LOCK_UNLOCKED }; 166 ____cacheline_aligned_in_smp = { .lock = SPIN_LOCK_UNLOCKED };
167 EXPORT_SYMBOL(__init_timer_base); 167 EXPORT_SYMBOL(__init_timer_base);
168 168
169 /*** 169 /***
170 * init_timer - initialize a timer. 170 * init_timer - initialize a timer.
171 * @timer: the timer to be initialized 171 * @timer: the timer to be initialized
172 * 172 *
173 * init_timer() must be done to a timer prior calling *any* of the 173 * init_timer() must be done to a timer prior calling *any* of the
174 * other timer functions. 174 * other timer functions.
175 */ 175 */
176 void fastcall init_timer(struct timer_list *timer) 176 void fastcall init_timer(struct timer_list *timer)
177 { 177 {
178 timer->entry.next = NULL; 178 timer->entry.next = NULL;
179 timer->base = &per_cpu(tvec_bases, raw_smp_processor_id()).t_base; 179 timer->base = &per_cpu(tvec_bases, raw_smp_processor_id()).t_base;
180 timer->magic = TIMER_MAGIC; 180 timer->magic = TIMER_MAGIC;
181 } 181 }
182 EXPORT_SYMBOL(init_timer); 182 EXPORT_SYMBOL(init_timer);
183 183
184 static inline void detach_timer(struct timer_list *timer, 184 static inline void detach_timer(struct timer_list *timer,
185 int clear_pending) 185 int clear_pending)
186 { 186 {
187 struct list_head *entry = &timer->entry; 187 struct list_head *entry = &timer->entry;
188 188
189 __list_del(entry->prev, entry->next); 189 __list_del(entry->prev, entry->next);
190 if (clear_pending) 190 if (clear_pending)
191 entry->next = NULL; 191 entry->next = NULL;
192 entry->prev = LIST_POISON2; 192 entry->prev = LIST_POISON2;
193 } 193 }
194 194
195 /* 195 /*
196 * We are using hashed locking: holding per_cpu(tvec_bases).t_base.lock 196 * We are using hashed locking: holding per_cpu(tvec_bases).t_base.lock
197 * means that all timers which are tied to this base via timer->base are 197 * means that all timers which are tied to this base via timer->base are
198 * locked, and the base itself is locked too. 198 * locked, and the base itself is locked too.
199 * 199 *
200 * So __run_timers/migrate_timers can safely modify all timers which could 200 * So __run_timers/migrate_timers can safely modify all timers which could
201 * be found on ->tvX lists. 201 * be found on ->tvX lists.
202 * 202 *
203 * When the timer's base is locked, and the timer removed from list, it is 203 * When the timer's base is locked, and the timer removed from list, it is
204 * possible to set timer->base = NULL and drop the lock: the timer remains 204 * possible to set timer->base = NULL and drop the lock: the timer remains
205 * locked. 205 * locked.
206 */ 206 */
207 static timer_base_t *lock_timer_base(struct timer_list *timer, 207 static timer_base_t *lock_timer_base(struct timer_list *timer,
208 unsigned long *flags) 208 unsigned long *flags)
209 { 209 {
210 timer_base_t *base; 210 timer_base_t *base;
211 211
212 for (;;) { 212 for (;;) {
213 base = timer->base; 213 base = timer->base;
214 if (likely(base != NULL)) { 214 if (likely(base != NULL)) {
215 spin_lock_irqsave(&base->lock, *flags); 215 spin_lock_irqsave(&base->lock, *flags);
216 if (likely(base == timer->base)) 216 if (likely(base == timer->base))
217 return base; 217 return base;
218 /* The timer has migrated to another CPU */ 218 /* The timer has migrated to another CPU */
219 spin_unlock_irqrestore(&base->lock, *flags); 219 spin_unlock_irqrestore(&base->lock, *flags);
220 } 220 }
221 cpu_relax(); 221 cpu_relax();
222 } 222 }
223 } 223 }
224 224
225 int __mod_timer(struct timer_list *timer, unsigned long expires) 225 int __mod_timer(struct timer_list *timer, unsigned long expires)
226 { 226 {
227 timer_base_t *base; 227 timer_base_t *base;
228 tvec_base_t *new_base; 228 tvec_base_t *new_base;
229 unsigned long flags; 229 unsigned long flags;
230 int ret = 0; 230 int ret = 0;
231 231
232 BUG_ON(!timer->function); 232 BUG_ON(!timer->function);
233 check_timer(timer); 233 check_timer(timer);
234 234
235 base = lock_timer_base(timer, &flags); 235 base = lock_timer_base(timer, &flags);
236 236
237 if (timer_pending(timer)) { 237 if (timer_pending(timer)) {
238 detach_timer(timer, 0); 238 detach_timer(timer, 0);
239 ret = 1; 239 ret = 1;
240 } 240 }
241 241
242 new_base = &__get_cpu_var(tvec_bases); 242 new_base = &__get_cpu_var(tvec_bases);
243 243
244 if (base != &new_base->t_base) { 244 if (base != &new_base->t_base) {
245 /* 245 /*
246 * We are trying to schedule the timer on the local CPU. 246 * We are trying to schedule the timer on the local CPU.
247 * However we can't change timer's base while it is running, 247 * However we can't change timer's base while it is running,
248 * otherwise del_timer_sync() can't detect that the timer's 248 * otherwise del_timer_sync() can't detect that the timer's
249 * handler yet has not finished. This also guarantees that 249 * handler yet has not finished. This also guarantees that
250 * the timer is serialized wrt itself. 250 * the timer is serialized wrt itself.
251 */ 251 */
252 if (unlikely(base->running_timer == timer)) { 252 if (unlikely(base->running_timer == timer)) {
253 /* The timer remains on a former base */ 253 /* The timer remains on a former base */
254 new_base = container_of(base, tvec_base_t, t_base); 254 new_base = container_of(base, tvec_base_t, t_base);
255 } else { 255 } else {
256 /* See the comment in lock_timer_base() */ 256 /* See the comment in lock_timer_base() */
257 timer->base = NULL; 257 timer->base = NULL;
258 spin_unlock(&base->lock); 258 spin_unlock(&base->lock);
259 spin_lock(&new_base->t_base.lock); 259 spin_lock(&new_base->t_base.lock);
260 timer->base = &new_base->t_base; 260 timer->base = &new_base->t_base;
261 } 261 }
262 } 262 }
263 263
264 timer->expires = expires; 264 timer->expires = expires;
265 internal_add_timer(new_base, timer); 265 internal_add_timer(new_base, timer);
266 spin_unlock_irqrestore(&new_base->t_base.lock, flags); 266 spin_unlock_irqrestore(&new_base->t_base.lock, flags);
267 267
268 return ret; 268 return ret;
269 } 269 }
270 270
271 EXPORT_SYMBOL(__mod_timer); 271 EXPORT_SYMBOL(__mod_timer);
272 272
273 /*** 273 /***
274 * add_timer_on - start a timer on a particular CPU 274 * add_timer_on - start a timer on a particular CPU
275 * @timer: the timer to be added 275 * @timer: the timer to be added
276 * @cpu: the CPU to start it on 276 * @cpu: the CPU to start it on
277 * 277 *
278 * This is not very scalable on SMP. Double adds are not possible. 278 * This is not very scalable on SMP. Double adds are not possible.
279 */ 279 */
280 void add_timer_on(struct timer_list *timer, int cpu) 280 void add_timer_on(struct timer_list *timer, int cpu)
281 { 281 {
282 tvec_base_t *base = &per_cpu(tvec_bases, cpu); 282 tvec_base_t *base = &per_cpu(tvec_bases, cpu);
283 unsigned long flags; 283 unsigned long flags;
284 284
285 BUG_ON(timer_pending(timer) || !timer->function); 285 BUG_ON(timer_pending(timer) || !timer->function);
286 286
287 check_timer(timer); 287 check_timer(timer);
288 288
289 spin_lock_irqsave(&base->t_base.lock, flags); 289 spin_lock_irqsave(&base->t_base.lock, flags);
290 timer->base = &base->t_base; 290 timer->base = &base->t_base;
291 internal_add_timer(base, timer); 291 internal_add_timer(base, timer);
292 spin_unlock_irqrestore(&base->t_base.lock, flags); 292 spin_unlock_irqrestore(&base->t_base.lock, flags);
293 } 293 }
294 294
295 295
296 /*** 296 /***
297 * mod_timer - modify a timer's timeout 297 * mod_timer - modify a timer's timeout
298 * @timer: the timer to be modified 298 * @timer: the timer to be modified
299 * 299 *
300 * mod_timer is a more efficient way to update the expire field of an 300 * mod_timer is a more efficient way to update the expire field of an
301 * active timer (if the timer is inactive it will be activated) 301 * active timer (if the timer is inactive it will be activated)
302 * 302 *
303 * mod_timer(timer, expires) is equivalent to: 303 * mod_timer(timer, expires) is equivalent to:
304 * 304 *
305 * del_timer(timer); timer->expires = expires; add_timer(timer); 305 * del_timer(timer); timer->expires = expires; add_timer(timer);
306 * 306 *
307 * Note that if there are multiple unserialized concurrent users of the 307 * Note that if there are multiple unserialized concurrent users of the
308 * same timer, then mod_timer() is the only safe way to modify the timeout, 308 * same timer, then mod_timer() is the only safe way to modify the timeout,
309 * since add_timer() cannot modify an already running timer. 309 * since add_timer() cannot modify an already running timer.
310 * 310 *
311 * The function returns whether it has modified a pending timer or not. 311 * The function returns whether it has modified a pending timer or not.
312 * (ie. mod_timer() of an inactive timer returns 0, mod_timer() of an 312 * (ie. mod_timer() of an inactive timer returns 0, mod_timer() of an
313 * active timer returns 1.) 313 * active timer returns 1.)
314 */ 314 */
315 int mod_timer(struct timer_list *timer, unsigned long expires) 315 int mod_timer(struct timer_list *timer, unsigned long expires)
316 { 316 {
317 BUG_ON(!timer->function); 317 BUG_ON(!timer->function);
318 318
319 check_timer(timer); 319 check_timer(timer);
320 320
321 /* 321 /*
322 * This is a common optimization triggered by the 322 * This is a common optimization triggered by the
323 * networking code - if the timer is re-modified 323 * networking code - if the timer is re-modified
324 * to be the same thing then just return: 324 * to be the same thing then just return:
325 */ 325 */
326 if (timer->expires == expires && timer_pending(timer)) 326 if (timer->expires == expires && timer_pending(timer))
327 return 1; 327 return 1;
328 328
329 return __mod_timer(timer, expires); 329 return __mod_timer(timer, expires);
330 } 330 }
331 331
332 EXPORT_SYMBOL(mod_timer); 332 EXPORT_SYMBOL(mod_timer);
333 333
334 /*** 334 /***
335 * del_timer - deactive a timer. 335 * del_timer - deactive a timer.
336 * @timer: the timer to be deactivated 336 * @timer: the timer to be deactivated
337 * 337 *
338 * del_timer() deactivates a timer - this works on both active and inactive 338 * del_timer() deactivates a timer - this works on both active and inactive
339 * timers. 339 * timers.
340 * 340 *
341 * The function returns whether it has deactivated a pending timer or not. 341 * The function returns whether it has deactivated a pending timer or not.
342 * (ie. del_timer() of an inactive timer returns 0, del_timer() of an 342 * (ie. del_timer() of an inactive timer returns 0, del_timer() of an
343 * active timer returns 1.) 343 * active timer returns 1.)
344 */ 344 */
345 int del_timer(struct timer_list *timer) 345 int del_timer(struct timer_list *timer)
346 { 346 {
347 timer_base_t *base; 347 timer_base_t *base;
348 unsigned long flags; 348 unsigned long flags;
349 int ret = 0; 349 int ret = 0;
350 350
351 check_timer(timer); 351 check_timer(timer);
352 352
353 if (timer_pending(timer)) { 353 if (timer_pending(timer)) {
354 base = lock_timer_base(timer, &flags); 354 base = lock_timer_base(timer, &flags);
355 if (timer_pending(timer)) { 355 if (timer_pending(timer)) {
356 detach_timer(timer, 1); 356 detach_timer(timer, 1);
357 ret = 1; 357 ret = 1;
358 } 358 }
359 spin_unlock_irqrestore(&base->lock, flags); 359 spin_unlock_irqrestore(&base->lock, flags);
360 } 360 }
361 361
362 return ret; 362 return ret;
363 } 363 }
364 364
365 EXPORT_SYMBOL(del_timer); 365 EXPORT_SYMBOL(del_timer);
366 366
367 #ifdef CONFIG_SMP 367 #ifdef CONFIG_SMP
368 /*
369 * This function tries to deactivate a timer. Upon successful (ret >= 0)
370 * exit the timer is not queued and the handler is not running on any CPU.
371 *
372 * It must not be called from interrupt contexts.
373 */
374 int try_to_del_timer_sync(struct timer_list *timer)
375 {
376 timer_base_t *base;
377 unsigned long flags;
378 int ret = -1;
379
380 base = lock_timer_base(timer, &flags);
381
382 if (base->running_timer == timer)
383 goto out;
384
385 ret = 0;
386 if (timer_pending(timer)) {
387 detach_timer(timer, 1);
388 ret = 1;
389 }
390 out:
391 spin_unlock_irqrestore(&base->lock, flags);
392
393 return ret;
394 }
395
368 /*** 396 /***
369 * del_timer_sync - deactivate a timer and wait for the handler to finish. 397 * del_timer_sync - deactivate a timer and wait for the handler to finish.
370 * @timer: the timer to be deactivated 398 * @timer: the timer to be deactivated
371 * 399 *
372 * This function only differs from del_timer() on SMP: besides deactivating 400 * This function only differs from del_timer() on SMP: besides deactivating
373 * the timer it also makes sure the handler has finished executing on other 401 * the timer it also makes sure the handler has finished executing on other
374 * CPUs. 402 * CPUs.
375 * 403 *
376 * Synchronization rules: callers must prevent restarting of the timer, 404 * Synchronization rules: callers must prevent restarting of the timer,
377 * otherwise this function is meaningless. It must not be called from 405 * otherwise this function is meaningless. It must not be called from
378 * interrupt contexts. The caller must not hold locks which would prevent 406 * interrupt contexts. The caller must not hold locks which would prevent
379 * completion of the timer's handler. The timer's handler must not call 407 * completion of the timer's handler. The timer's handler must not call
380 * add_timer_on(). Upon exit the timer is not queued and the handler is 408 * add_timer_on(). Upon exit the timer is not queued and the handler is
381 * not running on any CPU. 409 * not running on any CPU.
382 * 410 *
383 * The function returns whether it has deactivated a pending timer or not. 411 * The function returns whether it has deactivated a pending timer or not.
384 */ 412 */
385 int del_timer_sync(struct timer_list *timer) 413 int del_timer_sync(struct timer_list *timer)
386 { 414 {
387 timer_base_t *base;
388 unsigned long flags;
389 int ret = -1;
390
391 check_timer(timer); 415 check_timer(timer);
392 416
393 do { 417 for (;;) {
394 base = lock_timer_base(timer, &flags); 418 int ret = try_to_del_timer_sync(timer);
395 419 if (ret >= 0)
396 if (base->running_timer == timer) 420 return ret;
397 goto unlock; 421 }
398
399 ret = 0;
400 if (timer_pending(timer)) {
401 detach_timer(timer, 1);
402 ret = 1;
403 }
404 unlock:
405 spin_unlock_irqrestore(&base->lock, flags);
406 } while (ret < 0);
407
408 return ret;
409 } 422 }
410 423
411 EXPORT_SYMBOL(del_timer_sync); 424 EXPORT_SYMBOL(del_timer_sync);
412 #endif 425 #endif
413 426
414 static int cascade(tvec_base_t *base, tvec_t *tv, int index) 427 static int cascade(tvec_base_t *base, tvec_t *tv, int index)
415 { 428 {
416 /* cascade all the timers from tv up one level */ 429 /* cascade all the timers from tv up one level */
417 struct list_head *head, *curr; 430 struct list_head *head, *curr;
418 431
419 head = tv->vec + index; 432 head = tv->vec + index;
420 curr = head->next; 433 curr = head->next;
421 /* 434 /*
422 * We are removing _all_ timers from the list, so we don't have to 435 * We are removing _all_ timers from the list, so we don't have to
423 * detach them individually, just clear the list afterwards. 436 * detach them individually, just clear the list afterwards.
424 */ 437 */
425 while (curr != head) { 438 while (curr != head) {
426 struct timer_list *tmp; 439 struct timer_list *tmp;
427 440
428 tmp = list_entry(curr, struct timer_list, entry); 441 tmp = list_entry(curr, struct timer_list, entry);
429 BUG_ON(tmp->base != &base->t_base); 442 BUG_ON(tmp->base != &base->t_base);
430 curr = curr->next; 443 curr = curr->next;
431 internal_add_timer(base, tmp); 444 internal_add_timer(base, tmp);
432 } 445 }
433 INIT_LIST_HEAD(head); 446 INIT_LIST_HEAD(head);
434 447
435 return index; 448 return index;
436 } 449 }
437 450
438 /*** 451 /***
439 * __run_timers - run all expired timers (if any) on this CPU. 452 * __run_timers - run all expired timers (if any) on this CPU.
440 * @base: the timer vector to be processed. 453 * @base: the timer vector to be processed.
441 * 454 *
442 * This function cascades all vectors and executes all expired timer 455 * This function cascades all vectors and executes all expired timer
443 * vectors. 456 * vectors.
444 */ 457 */
445 #define INDEX(N) (base->timer_jiffies >> (TVR_BITS + N * TVN_BITS)) & TVN_MASK 458 #define INDEX(N) (base->timer_jiffies >> (TVR_BITS + N * TVN_BITS)) & TVN_MASK
446 459
447 static inline void __run_timers(tvec_base_t *base) 460 static inline void __run_timers(tvec_base_t *base)
448 { 461 {
449 struct timer_list *timer; 462 struct timer_list *timer;
450 463
451 spin_lock_irq(&base->t_base.lock); 464 spin_lock_irq(&base->t_base.lock);
452 while (time_after_eq(jiffies, base->timer_jiffies)) { 465 while (time_after_eq(jiffies, base->timer_jiffies)) {
453 struct list_head work_list = LIST_HEAD_INIT(work_list); 466 struct list_head work_list = LIST_HEAD_INIT(work_list);
454 struct list_head *head = &work_list; 467 struct list_head *head = &work_list;
455 int index = base->timer_jiffies & TVR_MASK; 468 int index = base->timer_jiffies & TVR_MASK;
456 469
457 /* 470 /*
458 * Cascade timers: 471 * Cascade timers:
459 */ 472 */
460 if (!index && 473 if (!index &&
461 (!cascade(base, &base->tv2, INDEX(0))) && 474 (!cascade(base, &base->tv2, INDEX(0))) &&
462 (!cascade(base, &base->tv3, INDEX(1))) && 475 (!cascade(base, &base->tv3, INDEX(1))) &&
463 !cascade(base, &base->tv4, INDEX(2))) 476 !cascade(base, &base->tv4, INDEX(2)))
464 cascade(base, &base->tv5, INDEX(3)); 477 cascade(base, &base->tv5, INDEX(3));
465 ++base->timer_jiffies; 478 ++base->timer_jiffies;
466 list_splice_init(base->tv1.vec + index, &work_list); 479 list_splice_init(base->tv1.vec + index, &work_list);
467 while (!list_empty(head)) { 480 while (!list_empty(head)) {
468 void (*fn)(unsigned long); 481 void (*fn)(unsigned long);
469 unsigned long data; 482 unsigned long data;
470 483
471 timer = list_entry(head->next,struct timer_list,entry); 484 timer = list_entry(head->next,struct timer_list,entry);
472 fn = timer->function; 485 fn = timer->function;
473 data = timer->data; 486 data = timer->data;
474 487
475 set_running_timer(base, timer); 488 set_running_timer(base, timer);
476 detach_timer(timer, 1); 489 detach_timer(timer, 1);
477 spin_unlock_irq(&base->t_base.lock); 490 spin_unlock_irq(&base->t_base.lock);
478 { 491 {
479 u32 preempt_count = preempt_count(); 492 u32 preempt_count = preempt_count();
480 fn(data); 493 fn(data);
481 if (preempt_count != preempt_count()) { 494 if (preempt_count != preempt_count()) {
482 printk("huh, entered %p with %08x, exited with %08x?\n", fn, preempt_count, preempt_count()); 495 printk("huh, entered %p with %08x, exited with %08x?\n", fn, preempt_count, preempt_count());
483 BUG(); 496 BUG();
484 } 497 }
485 } 498 }
486 spin_lock_irq(&base->t_base.lock); 499 spin_lock_irq(&base->t_base.lock);
487 } 500 }
488 } 501 }
489 set_running_timer(base, NULL); 502 set_running_timer(base, NULL);
490 spin_unlock_irq(&base->t_base.lock); 503 spin_unlock_irq(&base->t_base.lock);
491 } 504 }
492 505
493 #ifdef CONFIG_NO_IDLE_HZ 506 #ifdef CONFIG_NO_IDLE_HZ
494 /* 507 /*
495 * Find out when the next timer event is due to happen. This 508 * Find out when the next timer event is due to happen. This
496 * is used on S/390 to stop all activity when a cpus is idle. 509 * is used on S/390 to stop all activity when a cpus is idle.
497 * This functions needs to be called disabled. 510 * This functions needs to be called disabled.
498 */ 511 */
499 unsigned long next_timer_interrupt(void) 512 unsigned long next_timer_interrupt(void)
500 { 513 {
501 tvec_base_t *base; 514 tvec_base_t *base;
502 struct list_head *list; 515 struct list_head *list;
503 struct timer_list *nte; 516 struct timer_list *nte;
504 unsigned long expires; 517 unsigned long expires;
505 tvec_t *varray[4]; 518 tvec_t *varray[4];
506 int i, j; 519 int i, j;
507 520
508 base = &__get_cpu_var(tvec_bases); 521 base = &__get_cpu_var(tvec_bases);
509 spin_lock(&base->t_base.lock); 522 spin_lock(&base->t_base.lock);
510 expires = base->timer_jiffies + (LONG_MAX >> 1); 523 expires = base->timer_jiffies + (LONG_MAX >> 1);
511 list = 0; 524 list = 0;
512 525
513 /* Look for timer events in tv1. */ 526 /* Look for timer events in tv1. */
514 j = base->timer_jiffies & TVR_MASK; 527 j = base->timer_jiffies & TVR_MASK;
515 do { 528 do {
516 list_for_each_entry(nte, base->tv1.vec + j, entry) { 529 list_for_each_entry(nte, base->tv1.vec + j, entry) {
517 expires = nte->expires; 530 expires = nte->expires;
518 if (j < (base->timer_jiffies & TVR_MASK)) 531 if (j < (base->timer_jiffies & TVR_MASK))
519 list = base->tv2.vec + (INDEX(0)); 532 list = base->tv2.vec + (INDEX(0));
520 goto found; 533 goto found;
521 } 534 }
522 j = (j + 1) & TVR_MASK; 535 j = (j + 1) & TVR_MASK;
523 } while (j != (base->timer_jiffies & TVR_MASK)); 536 } while (j != (base->timer_jiffies & TVR_MASK));
524 537
525 /* Check tv2-tv5. */ 538 /* Check tv2-tv5. */
526 varray[0] = &base->tv2; 539 varray[0] = &base->tv2;
527 varray[1] = &base->tv3; 540 varray[1] = &base->tv3;
528 varray[2] = &base->tv4; 541 varray[2] = &base->tv4;
529 varray[3] = &base->tv5; 542 varray[3] = &base->tv5;
530 for (i = 0; i < 4; i++) { 543 for (i = 0; i < 4; i++) {
531 j = INDEX(i); 544 j = INDEX(i);
532 do { 545 do {
533 if (list_empty(varray[i]->vec + j)) { 546 if (list_empty(varray[i]->vec + j)) {
534 j = (j + 1) & TVN_MASK; 547 j = (j + 1) & TVN_MASK;
535 continue; 548 continue;
536 } 549 }
537 list_for_each_entry(nte, varray[i]->vec + j, entry) 550 list_for_each_entry(nte, varray[i]->vec + j, entry)
538 if (time_before(nte->expires, expires)) 551 if (time_before(nte->expires, expires))
539 expires = nte->expires; 552 expires = nte->expires;
540 if (j < (INDEX(i)) && i < 3) 553 if (j < (INDEX(i)) && i < 3)
541 list = varray[i + 1]->vec + (INDEX(i + 1)); 554 list = varray[i + 1]->vec + (INDEX(i + 1));
542 goto found; 555 goto found;
543 } while (j != (INDEX(i))); 556 } while (j != (INDEX(i)));
544 } 557 }
545 found: 558 found:
546 if (list) { 559 if (list) {
547 /* 560 /*
548 * The search wrapped. We need to look at the next list 561 * The search wrapped. We need to look at the next list
549 * from next tv element that would cascade into tv element 562 * from next tv element that would cascade into tv element
550 * where we found the timer element. 563 * where we found the timer element.
551 */ 564 */
552 list_for_each_entry(nte, list, entry) { 565 list_for_each_entry(nte, list, entry) {
553 if (time_before(nte->expires, expires)) 566 if (time_before(nte->expires, expires))
554 expires = nte->expires; 567 expires = nte->expires;
555 } 568 }
556 } 569 }
557 spin_unlock(&base->t_base.lock); 570 spin_unlock(&base->t_base.lock);
558 return expires; 571 return expires;
559 } 572 }
560 #endif 573 #endif
561 574
562 /******************************************************************/ 575 /******************************************************************/
563 576
564 /* 577 /*
565 * Timekeeping variables 578 * Timekeeping variables
566 */ 579 */
567 unsigned long tick_usec = TICK_USEC; /* USER_HZ period (usec) */ 580 unsigned long tick_usec = TICK_USEC; /* USER_HZ period (usec) */
568 unsigned long tick_nsec = TICK_NSEC; /* ACTHZ period (nsec) */ 581 unsigned long tick_nsec = TICK_NSEC; /* ACTHZ period (nsec) */
569 582
570 /* 583 /*
571 * The current time 584 * The current time
572 * wall_to_monotonic is what we need to add to xtime (or xtime corrected 585 * wall_to_monotonic is what we need to add to xtime (or xtime corrected
573 * for sub jiffie times) to get to monotonic time. Monotonic is pegged 586 * for sub jiffie times) to get to monotonic time. Monotonic is pegged
574 * at zero at system boot time, so wall_to_monotonic will be negative, 587 * at zero at system boot time, so wall_to_monotonic will be negative,
575 * however, we will ALWAYS keep the tv_nsec part positive so we can use 588 * however, we will ALWAYS keep the tv_nsec part positive so we can use
576 * the usual normalization. 589 * the usual normalization.
577 */ 590 */
578 struct timespec xtime __attribute__ ((aligned (16))); 591 struct timespec xtime __attribute__ ((aligned (16)));
579 struct timespec wall_to_monotonic __attribute__ ((aligned (16))); 592 struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
580 593
581 EXPORT_SYMBOL(xtime); 594 EXPORT_SYMBOL(xtime);
582 595
583 /* Don't completely fail for HZ > 500. */ 596 /* Don't completely fail for HZ > 500. */
584 int tickadj = 500/HZ ? : 1; /* microsecs */ 597 int tickadj = 500/HZ ? : 1; /* microsecs */
585 598
586 599
587 /* 600 /*
588 * phase-lock loop variables 601 * phase-lock loop variables
589 */ 602 */
590 /* TIME_ERROR prevents overwriting the CMOS clock */ 603 /* TIME_ERROR prevents overwriting the CMOS clock */
591 int time_state = TIME_OK; /* clock synchronization status */ 604 int time_state = TIME_OK; /* clock synchronization status */
592 int time_status = STA_UNSYNC; /* clock status bits */ 605 int time_status = STA_UNSYNC; /* clock status bits */
593 long time_offset; /* time adjustment (us) */ 606 long time_offset; /* time adjustment (us) */
594 long time_constant = 2; /* pll time constant */ 607 long time_constant = 2; /* pll time constant */
595 long time_tolerance = MAXFREQ; /* frequency tolerance (ppm) */ 608 long time_tolerance = MAXFREQ; /* frequency tolerance (ppm) */
596 long time_precision = 1; /* clock precision (us) */ 609 long time_precision = 1; /* clock precision (us) */
597 long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */ 610 long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */
598 long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ 611 long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */
599 static long time_phase; /* phase offset (scaled us) */ 612 static long time_phase; /* phase offset (scaled us) */
600 long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC; 613 long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC;
601 /* frequency offset (scaled ppm)*/ 614 /* frequency offset (scaled ppm)*/
602 static long time_adj; /* tick adjust (scaled 1 / HZ) */ 615 static long time_adj; /* tick adjust (scaled 1 / HZ) */
603 long time_reftime; /* time at last adjustment (s) */ 616 long time_reftime; /* time at last adjustment (s) */
604 long time_adjust; 617 long time_adjust;
605 long time_next_adjust; 618 long time_next_adjust;
606 619
607 /* 620 /*
608 * this routine handles the overflow of the microsecond field 621 * this routine handles the overflow of the microsecond field
609 * 622 *
610 * The tricky bits of code to handle the accurate clock support 623 * The tricky bits of code to handle the accurate clock support
611 * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame. 624 * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
612 * They were originally developed for SUN and DEC kernels. 625 * They were originally developed for SUN and DEC kernels.
613 * All the kudos should go to Dave for this stuff. 626 * All the kudos should go to Dave for this stuff.
614 * 627 *
615 */ 628 */
616 static void second_overflow(void) 629 static void second_overflow(void)
617 { 630 {
618 long ltemp; 631 long ltemp;
619 632
620 /* Bump the maxerror field */ 633 /* Bump the maxerror field */
621 time_maxerror += time_tolerance >> SHIFT_USEC; 634 time_maxerror += time_tolerance >> SHIFT_USEC;
622 if ( time_maxerror > NTP_PHASE_LIMIT ) { 635 if ( time_maxerror > NTP_PHASE_LIMIT ) {
623 time_maxerror = NTP_PHASE_LIMIT; 636 time_maxerror = NTP_PHASE_LIMIT;
624 time_status |= STA_UNSYNC; 637 time_status |= STA_UNSYNC;
625 } 638 }
626 639
627 /* 640 /*
628 * Leap second processing. If in leap-insert state at 641 * Leap second processing. If in leap-insert state at
629 * the end of the day, the system clock is set back one 642 * the end of the day, the system clock is set back one
630 * second; if in leap-delete state, the system clock is 643 * second; if in leap-delete state, the system clock is
631 * set ahead one second. The microtime() routine or 644 * set ahead one second. The microtime() routine or
632 * external clock driver will insure that reported time 645 * external clock driver will insure that reported time
633 * is always monotonic. The ugly divides should be 646 * is always monotonic. The ugly divides should be
634 * replaced. 647 * replaced.
635 */ 648 */
636 switch (time_state) { 649 switch (time_state) {
637 650
638 case TIME_OK: 651 case TIME_OK:
639 if (time_status & STA_INS) 652 if (time_status & STA_INS)
640 time_state = TIME_INS; 653 time_state = TIME_INS;
641 else if (time_status & STA_DEL) 654 else if (time_status & STA_DEL)
642 time_state = TIME_DEL; 655 time_state = TIME_DEL;
643 break; 656 break;
644 657
645 case TIME_INS: 658 case TIME_INS:
646 if (xtime.tv_sec % 86400 == 0) { 659 if (xtime.tv_sec % 86400 == 0) {
647 xtime.tv_sec--; 660 xtime.tv_sec--;
648 wall_to_monotonic.tv_sec++; 661 wall_to_monotonic.tv_sec++;
649 /* The timer interpolator will make time change gradually instead 662 /* The timer interpolator will make time change gradually instead
650 * of an immediate jump by one second. 663 * of an immediate jump by one second.
651 */ 664 */
652 time_interpolator_update(-NSEC_PER_SEC); 665 time_interpolator_update(-NSEC_PER_SEC);
653 time_state = TIME_OOP; 666 time_state = TIME_OOP;
654 clock_was_set(); 667 clock_was_set();
655 printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n"); 668 printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n");
656 } 669 }
657 break; 670 break;
658 671
659 case TIME_DEL: 672 case TIME_DEL:
660 if ((xtime.tv_sec + 1) % 86400 == 0) { 673 if ((xtime.tv_sec + 1) % 86400 == 0) {
661 xtime.tv_sec++; 674 xtime.tv_sec++;
662 wall_to_monotonic.tv_sec--; 675 wall_to_monotonic.tv_sec--;
663 /* Use of time interpolator for a gradual change of time */ 676 /* Use of time interpolator for a gradual change of time */
664 time_interpolator_update(NSEC_PER_SEC); 677 time_interpolator_update(NSEC_PER_SEC);
665 time_state = TIME_WAIT; 678 time_state = TIME_WAIT;
666 clock_was_set(); 679 clock_was_set();
667 printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n"); 680 printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n");
668 } 681 }
669 break; 682 break;
670 683
671 case TIME_OOP: 684 case TIME_OOP:
672 time_state = TIME_WAIT; 685 time_state = TIME_WAIT;
673 break; 686 break;
674 687
675 case TIME_WAIT: 688 case TIME_WAIT:
676 if (!(time_status & (STA_INS | STA_DEL))) 689 if (!(time_status & (STA_INS | STA_DEL)))
677 time_state = TIME_OK; 690 time_state = TIME_OK;
678 } 691 }
679 692
680 /* 693 /*
681 * Compute the phase adjustment for the next second. In 694 * Compute the phase adjustment for the next second. In
682 * PLL mode, the offset is reduced by a fixed factor 695 * PLL mode, the offset is reduced by a fixed factor
683 * times the time constant. In FLL mode the offset is 696 * times the time constant. In FLL mode the offset is
684 * used directly. In either mode, the maximum phase 697 * used directly. In either mode, the maximum phase
685 * adjustment for each second is clamped so as to spread 698 * adjustment for each second is clamped so as to spread
686 * the adjustment over not more than the number of 699 * the adjustment over not more than the number of
687 * seconds between updates. 700 * seconds between updates.
688 */ 701 */
689 if (time_offset < 0) { 702 if (time_offset < 0) {
690 ltemp = -time_offset; 703 ltemp = -time_offset;
691 if (!(time_status & STA_FLL)) 704 if (!(time_status & STA_FLL))
692 ltemp >>= SHIFT_KG + time_constant; 705 ltemp >>= SHIFT_KG + time_constant;
693 if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE) 706 if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
694 ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE; 707 ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE;
695 time_offset += ltemp; 708 time_offset += ltemp;
696 time_adj = -ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); 709 time_adj = -ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
697 } else { 710 } else {
698 ltemp = time_offset; 711 ltemp = time_offset;
699 if (!(time_status & STA_FLL)) 712 if (!(time_status & STA_FLL))
700 ltemp >>= SHIFT_KG + time_constant; 713 ltemp >>= SHIFT_KG + time_constant;
701 if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE) 714 if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
702 ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE; 715 ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE;
703 time_offset -= ltemp; 716 time_offset -= ltemp;
704 time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); 717 time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
705 } 718 }
706 719
707 /* 720 /*
708 * Compute the frequency estimate and additional phase 721 * Compute the frequency estimate and additional phase
709 * adjustment due to frequency error for the next 722 * adjustment due to frequency error for the next
710 * second. When the PPS signal is engaged, gnaw on the 723 * second. When the PPS signal is engaged, gnaw on the
711 * watchdog counter and update the frequency computed by 724 * watchdog counter and update the frequency computed by
712 * the pll and the PPS signal. 725 * the pll and the PPS signal.
713 */ 726 */
714 pps_valid++; 727 pps_valid++;
715 if (pps_valid == PPS_VALID) { /* PPS signal lost */ 728 if (pps_valid == PPS_VALID) { /* PPS signal lost */
716 pps_jitter = MAXTIME; 729 pps_jitter = MAXTIME;
717 pps_stabil = MAXFREQ; 730 pps_stabil = MAXFREQ;
718 time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | 731 time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER |
719 STA_PPSWANDER | STA_PPSERROR); 732 STA_PPSWANDER | STA_PPSERROR);
720 } 733 }
721 ltemp = time_freq + pps_freq; 734 ltemp = time_freq + pps_freq;
722 if (ltemp < 0) 735 if (ltemp < 0)
723 time_adj -= -ltemp >> 736 time_adj -= -ltemp >>
724 (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE); 737 (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);
725 else 738 else
726 time_adj += ltemp >> 739 time_adj += ltemp >>
727 (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE); 740 (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);
728 741
729 #if HZ == 100 742 #if HZ == 100
730 /* Compensate for (HZ==100) != (1 << SHIFT_HZ). 743 /* Compensate for (HZ==100) != (1 << SHIFT_HZ).
731 * Add 25% and 3.125% to get 128.125; => only 0.125% error (p. 14) 744 * Add 25% and 3.125% to get 128.125; => only 0.125% error (p. 14)
732 */ 745 */
733 if (time_adj < 0) 746 if (time_adj < 0)
734 time_adj -= (-time_adj >> 2) + (-time_adj >> 5); 747 time_adj -= (-time_adj >> 2) + (-time_adj >> 5);
735 else 748 else
736 time_adj += (time_adj >> 2) + (time_adj >> 5); 749 time_adj += (time_adj >> 2) + (time_adj >> 5);
737 #endif 750 #endif
738 #if HZ == 1000 751 #if HZ == 1000
739 /* Compensate for (HZ==1000) != (1 << SHIFT_HZ). 752 /* Compensate for (HZ==1000) != (1 << SHIFT_HZ).
740 * Add 1.5625% and 0.78125% to get 1023.4375; => only 0.05% error (p. 14) 753 * Add 1.5625% and 0.78125% to get 1023.4375; => only 0.05% error (p. 14)
741 */ 754 */
742 if (time_adj < 0) 755 if (time_adj < 0)
743 time_adj -= (-time_adj >> 6) + (-time_adj >> 7); 756 time_adj -= (-time_adj >> 6) + (-time_adj >> 7);
744 else 757 else
745 time_adj += (time_adj >> 6) + (time_adj >> 7); 758 time_adj += (time_adj >> 6) + (time_adj >> 7);
746 #endif 759 #endif
747 } 760 }
748 761
749 /* in the NTP reference this is called "hardclock()" */ 762 /* in the NTP reference this is called "hardclock()" */
750 static void update_wall_time_one_tick(void) 763 static void update_wall_time_one_tick(void)
751 { 764 {
752 long time_adjust_step, delta_nsec; 765 long time_adjust_step, delta_nsec;
753 766
754 if ( (time_adjust_step = time_adjust) != 0 ) { 767 if ( (time_adjust_step = time_adjust) != 0 ) {
755 /* We are doing an adjtime thing. 768 /* We are doing an adjtime thing.
756 * 769 *
757 * Prepare time_adjust_step to be within bounds. 770 * Prepare time_adjust_step to be within bounds.
758 * Note that a positive time_adjust means we want the clock 771 * Note that a positive time_adjust means we want the clock
759 * to run faster. 772 * to run faster.
760 * 773 *
761 * Limit the amount of the step to be in the range 774 * Limit the amount of the step to be in the range
762 * -tickadj .. +tickadj 775 * -tickadj .. +tickadj
763 */ 776 */
764 if (time_adjust > tickadj) 777 if (time_adjust > tickadj)
765 time_adjust_step = tickadj; 778 time_adjust_step = tickadj;
766 else if (time_adjust < -tickadj) 779 else if (time_adjust < -tickadj)
767 time_adjust_step = -tickadj; 780 time_adjust_step = -tickadj;
768 781
769 /* Reduce by this step the amount of time left */ 782 /* Reduce by this step the amount of time left */
770 time_adjust -= time_adjust_step; 783 time_adjust -= time_adjust_step;
771 } 784 }
772 delta_nsec = tick_nsec + time_adjust_step * 1000; 785 delta_nsec = tick_nsec + time_adjust_step * 1000;
773 /* 786 /*
774 * Advance the phase, once it gets to one microsecond, then 787 * Advance the phase, once it gets to one microsecond, then
775 * advance the tick more. 788 * advance the tick more.
776 */ 789 */
777 time_phase += time_adj; 790 time_phase += time_adj;
778 if (time_phase <= -FINENSEC) { 791 if (time_phase <= -FINENSEC) {
779 long ltemp = -time_phase >> (SHIFT_SCALE - 10); 792 long ltemp = -time_phase >> (SHIFT_SCALE - 10);
780 time_phase += ltemp << (SHIFT_SCALE - 10); 793 time_phase += ltemp << (SHIFT_SCALE - 10);
781 delta_nsec -= ltemp; 794 delta_nsec -= ltemp;
782 } 795 }
783 else if (time_phase >= FINENSEC) { 796 else if (time_phase >= FINENSEC) {
784 long ltemp = time_phase >> (SHIFT_SCALE - 10); 797 long ltemp = time_phase >> (SHIFT_SCALE - 10);
785 time_phase -= ltemp << (SHIFT_SCALE - 10); 798 time_phase -= ltemp << (SHIFT_SCALE - 10);
786 delta_nsec += ltemp; 799 delta_nsec += ltemp;
787 } 800 }
788 xtime.tv_nsec += delta_nsec; 801 xtime.tv_nsec += delta_nsec;
789 time_interpolator_update(delta_nsec); 802 time_interpolator_update(delta_nsec);
790 803
791 /* Changes by adjtime() do not take effect till next tick. */ 804 /* Changes by adjtime() do not take effect till next tick. */
792 if (time_next_adjust != 0) { 805 if (time_next_adjust != 0) {
793 time_adjust = time_next_adjust; 806 time_adjust = time_next_adjust;
794 time_next_adjust = 0; 807 time_next_adjust = 0;
795 } 808 }
796 } 809 }
797 810
798 /* 811 /*
799 * Using a loop looks inefficient, but "ticks" is 812 * Using a loop looks inefficient, but "ticks" is
800 * usually just one (we shouldn't be losing ticks, 813 * usually just one (we shouldn't be losing ticks,
801 * we're doing this this way mainly for interrupt 814 * we're doing this this way mainly for interrupt
802 * latency reasons, not because we think we'll 815 * latency reasons, not because we think we'll
803 * have lots of lost timer ticks 816 * have lots of lost timer ticks
804 */ 817 */
805 static void update_wall_time(unsigned long ticks) 818 static void update_wall_time(unsigned long ticks)
806 { 819 {
807 do { 820 do {
808 ticks--; 821 ticks--;
809 update_wall_time_one_tick(); 822 update_wall_time_one_tick();
810 if (xtime.tv_nsec >= 1000000000) { 823 if (xtime.tv_nsec >= 1000000000) {
811 xtime.tv_nsec -= 1000000000; 824 xtime.tv_nsec -= 1000000000;
812 xtime.tv_sec++; 825 xtime.tv_sec++;
813 second_overflow(); 826 second_overflow();
814 } 827 }
815 } while (ticks); 828 } while (ticks);
816 } 829 }
817 830
818 /* 831 /*
819 * Called from the timer interrupt handler to charge one tick to the current 832 * Called from the timer interrupt handler to charge one tick to the current
820 * process. user_tick is 1 if the tick is user time, 0 for system. 833 * process. user_tick is 1 if the tick is user time, 0 for system.
821 */ 834 */
822 void update_process_times(int user_tick) 835 void update_process_times(int user_tick)
823 { 836 {
824 struct task_struct *p = current; 837 struct task_struct *p = current;
825 int cpu = smp_processor_id(); 838 int cpu = smp_processor_id();
826 839
827 /* Note: this timer irq context must be accounted for as well. */ 840 /* Note: this timer irq context must be accounted for as well. */
828 if (user_tick) 841 if (user_tick)
829 account_user_time(p, jiffies_to_cputime(1)); 842 account_user_time(p, jiffies_to_cputime(1));
830 else 843 else
831 account_system_time(p, HARDIRQ_OFFSET, jiffies_to_cputime(1)); 844 account_system_time(p, HARDIRQ_OFFSET, jiffies_to_cputime(1));
832 run_local_timers(); 845 run_local_timers();
833 if (rcu_pending(cpu)) 846 if (rcu_pending(cpu))
834 rcu_check_callbacks(cpu, user_tick); 847 rcu_check_callbacks(cpu, user_tick);
835 scheduler_tick(); 848 scheduler_tick();
836 run_posix_cpu_timers(p); 849 run_posix_cpu_timers(p);
837 } 850 }
838 851
839 /* 852 /*
840 * Nr of active tasks - counted in fixed-point numbers 853 * Nr of active tasks - counted in fixed-point numbers
841 */ 854 */
842 static unsigned long count_active_tasks(void) 855 static unsigned long count_active_tasks(void)
843 { 856 {
844 return (nr_running() + nr_uninterruptible()) * FIXED_1; 857 return (nr_running() + nr_uninterruptible()) * FIXED_1;
845 } 858 }
846 859
847 /* 860 /*
848 * Hmm.. Changed this, as the GNU make sources (load.c) seems to 861 * Hmm.. Changed this, as the GNU make sources (load.c) seems to
849 * imply that avenrun[] is the standard name for this kind of thing. 862 * imply that avenrun[] is the standard name for this kind of thing.
850 * Nothing else seems to be standardized: the fractional size etc 863 * Nothing else seems to be standardized: the fractional size etc
851 * all seem to differ on different machines. 864 * all seem to differ on different machines.
852 * 865 *
853 * Requires xtime_lock to access. 866 * Requires xtime_lock to access.
854 */ 867 */
855 unsigned long avenrun[3]; 868 unsigned long avenrun[3];
856 869
857 EXPORT_SYMBOL(avenrun); 870 EXPORT_SYMBOL(avenrun);
858 871
859 /* 872 /*
860 * calc_load - given tick count, update the avenrun load estimates. 873 * calc_load - given tick count, update the avenrun load estimates.
861 * This is called while holding a write_lock on xtime_lock. 874 * This is called while holding a write_lock on xtime_lock.
862 */ 875 */
863 static inline void calc_load(unsigned long ticks) 876 static inline void calc_load(unsigned long ticks)
864 { 877 {
865 unsigned long active_tasks; /* fixed-point */ 878 unsigned long active_tasks; /* fixed-point */
866 static int count = LOAD_FREQ; 879 static int count = LOAD_FREQ;
867 880
868 count -= ticks; 881 count -= ticks;
869 if (count < 0) { 882 if (count < 0) {
870 count += LOAD_FREQ; 883 count += LOAD_FREQ;
871 active_tasks = count_active_tasks(); 884 active_tasks = count_active_tasks();
872 CALC_LOAD(avenrun[0], EXP_1, active_tasks); 885 CALC_LOAD(avenrun[0], EXP_1, active_tasks);
873 CALC_LOAD(avenrun[1], EXP_5, active_tasks); 886 CALC_LOAD(avenrun[1], EXP_5, active_tasks);
874 CALC_LOAD(avenrun[2], EXP_15, active_tasks); 887 CALC_LOAD(avenrun[2], EXP_15, active_tasks);
875 } 888 }
876 } 889 }
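As an illustration of the CALC_LOAD() arithmetic used above, here is a minimal, self-contained sketch of the fixed-point exponential average; it is not part of this patch, and the constants mirror include/linux/sched.h of this era.

	#include <stdio.h>

	#define FSHIFT   11              /* bits of fractional precision */
	#define FIXED_1  (1 << FSHIFT)   /* 1.0 in fixed point */
	#define EXP_1    1884            /* 1/exp(5sec/1min), fixed point */

	int main(void)
	{
		unsigned long avenrun0 = 0;          /* 1-minute average, fixed point */
		unsigned long active = 3 * FIXED_1;  /* pretend 3 runnable tasks */
		int i;

		/* One CALC_LOAD step per LOAD_FREQ (5 second) interval:
		 * load = load * exp + n * (1 - exp), all in fixed point. */
		for (i = 0; i < 24; i++) {           /* ~2 minutes of samples */
			avenrun0 *= EXP_1;
			avenrun0 += active * (FIXED_1 - EXP_1);
			avenrun0 >>= FSHIFT;
		}
		/* Same scaling /proc/loadavg uses: integer part plus 2 decimals. */
		printf("load: %lu.%02lu\n", avenrun0 >> FSHIFT,
		       ((avenrun0 & (FIXED_1 - 1)) * 100) >> FSHIFT);
		return 0;
	}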
877 890
878 /* jiffies at the most recent update of wall time */ 891 /* jiffies at the most recent update of wall time */
879 unsigned long wall_jiffies = INITIAL_JIFFIES; 892 unsigned long wall_jiffies = INITIAL_JIFFIES;
880 893
881 /* 894 /*
882 * This read-write spinlock protects us from races in SMP while 895 * This read-write spinlock protects us from races in SMP while
883 * playing with xtime and avenrun. 896 * playing with xtime and avenrun.
884 */ 897 */
885 #ifndef ARCH_HAVE_XTIME_LOCK 898 #ifndef ARCH_HAVE_XTIME_LOCK
886 seqlock_t xtime_lock __cacheline_aligned_in_smp = SEQLOCK_UNLOCKED; 899 seqlock_t xtime_lock __cacheline_aligned_in_smp = SEQLOCK_UNLOCKED;
887 900
888 EXPORT_SYMBOL(xtime_lock); 901 EXPORT_SYMBOL(xtime_lock);
889 #endif 902 #endif
890 903
891 /* 904 /*
892 * This function runs timers and the timer-tq in bottom half context. 905 * This function runs timers and the timer-tq in bottom half context.
893 */ 906 */
894 static void run_timer_softirq(struct softirq_action *h) 907 static void run_timer_softirq(struct softirq_action *h)
895 { 908 {
896 tvec_base_t *base = &__get_cpu_var(tvec_bases); 909 tvec_base_t *base = &__get_cpu_var(tvec_bases);
897 910
898 if (time_after_eq(jiffies, base->timer_jiffies)) 911 if (time_after_eq(jiffies, base->timer_jiffies))
899 __run_timers(base); 912 __run_timers(base);
900 } 913 }
901 914
902 /* 915 /*
903 * Called by the local, per-CPU timer interrupt on SMP. 916 * Called by the local, per-CPU timer interrupt on SMP.
904 */ 917 */
905 void run_local_timers(void) 918 void run_local_timers(void)
906 { 919 {
907 raise_softirq(TIMER_SOFTIRQ); 920 raise_softirq(TIMER_SOFTIRQ);
908 } 921 }
909 922
910 /* 923 /*
911 * Called by the timer interrupt. xtime_lock must already be taken 924 * Called by the timer interrupt. xtime_lock must already be taken
912 * by the timer IRQ! 925 * by the timer IRQ!
913 */ 926 */
914 static inline void update_times(void) 927 static inline void update_times(void)
915 { 928 {
916 unsigned long ticks; 929 unsigned long ticks;
917 930
918 ticks = jiffies - wall_jiffies; 931 ticks = jiffies - wall_jiffies;
919 if (ticks) { 932 if (ticks) {
920 wall_jiffies += ticks; 933 wall_jiffies += ticks;
921 update_wall_time(ticks); 934 update_wall_time(ticks);
922 } 935 }
923 calc_load(ticks); 936 calc_load(ticks);
924 } 937 }
925 938
926 /* 939 /*
927 * The 64-bit jiffies value is not atomic - you MUST NOT read it 940 * The 64-bit jiffies value is not atomic - you MUST NOT read it
928 * without sampling the sequence number in xtime_lock. 941 * without sampling the sequence number in xtime_lock.
929 * jiffies is defined in the linker script... 942 * jiffies is defined in the linker script...
930 */ 943 */
931 944
932 void do_timer(struct pt_regs *regs) 945 void do_timer(struct pt_regs *regs)
933 { 946 {
934 jiffies_64++; 947 jiffies_64++;
935 update_times(); 948 update_times();
936 } 949 }
937 950
938 #ifdef __ARCH_WANT_SYS_ALARM 951 #ifdef __ARCH_WANT_SYS_ALARM
939 952
940 /* 953 /*
941 * For backwards compatibility? This can be done in libc so Alpha 954 * For backwards compatibility? This can be done in libc so Alpha
942 * and all newer ports shouldn't need it. 955 * and all newer ports shouldn't need it.
943 */ 956 */
944 asmlinkage unsigned long sys_alarm(unsigned int seconds) 957 asmlinkage unsigned long sys_alarm(unsigned int seconds)
945 { 958 {
946 struct itimerval it_new, it_old; 959 struct itimerval it_new, it_old;
947 unsigned int oldalarm; 960 unsigned int oldalarm;
948 961
949 it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0; 962 it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0;
950 it_new.it_value.tv_sec = seconds; 963 it_new.it_value.tv_sec = seconds;
951 it_new.it_value.tv_usec = 0; 964 it_new.it_value.tv_usec = 0;
952 do_setitimer(ITIMER_REAL, &it_new, &it_old); 965 do_setitimer(ITIMER_REAL, &it_new, &it_old);
953 oldalarm = it_old.it_value.tv_sec; 966 oldalarm = it_old.it_value.tv_sec;
954 /* ehhh.. We can't return 0 if we have an alarm pending.. */ 967 /* ehhh.. We can't return 0 if we have an alarm pending.. */
955 /* And we'd better return too much than too little anyway */ 968 /* And we'd better return too much than too little anyway */
956 if ((!oldalarm && it_old.it_value.tv_usec) || it_old.it_value.tv_usec >= 500000) 969 if ((!oldalarm && it_old.it_value.tv_usec) || it_old.it_value.tv_usec >= 500000)
957 oldalarm++; 970 oldalarm++;
958 return oldalarm; 971 return oldalarm;
959 } 972 }
960 973
961 #endif 974 #endif
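As the comment above notes, alarm() can be emulated in libc with setitimer(). A userspace sketch of that idea (illustrative only, with a simplified rounding rule rather than the exact one sys_alarm() applies):

	#include <sys/time.h>

	unsigned int my_alarm(unsigned int seconds)	/* illustrative name */
	{
		struct itimerval it_new = { .it_value = { .tv_sec = seconds } };
		struct itimerval it_old;

		setitimer(ITIMER_REAL, &it_new, &it_old);
		/* Round up so a pending alarm never reports 0 remaining. */
		return it_old.it_value.tv_sec + (it_old.it_value.tv_usec ? 1 : 0);
	}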
962 975
963 #ifndef __alpha__ 976 #ifndef __alpha__
964 977
965 /* 978 /*
966 * The Alpha uses getxpid, getxuid, and getxgid instead. Maybe this 979 * The Alpha uses getxpid, getxuid, and getxgid instead. Maybe this
967 * should be moved into arch/i386 instead? 980 * should be moved into arch/i386 instead?
968 */ 981 */
969 982
970 /** 983 /**
971 * sys_getpid - return the thread group id of the current process 984 * sys_getpid - return the thread group id of the current process
972 * 985 *
973 * Note, despite the name, this returns the tgid not the pid. The tgid and 986 * Note, despite the name, this returns the tgid not the pid. The tgid and
974 * the pid are identical unless CLONE_THREAD was specified on clone() in 987 * the pid are identical unless CLONE_THREAD was specified on clone() in
975 * which case the tgid is the same in all threads of the same group. 988 * which case the tgid is the same in all threads of the same group.
976 * 989 *
977 * This is SMP safe as current->tgid does not change. 990 * This is SMP safe as current->tgid does not change.
978 */ 991 */
979 asmlinkage long sys_getpid(void) 992 asmlinkage long sys_getpid(void)
980 { 993 {
981 return current->tgid; 994 return current->tgid;
982 } 995 }
983 996
984 /* 997 /*
985 * Accessing ->group_leader->real_parent is not SMP-safe, it could 998 * Accessing ->group_leader->real_parent is not SMP-safe, it could
986 * change from under us. However, rather than getting any lock 999 * change from under us. However, rather than getting any lock
987 * we can use an optimistic algorithm: get the parent 1000 * we can use an optimistic algorithm: get the parent
988 * pid, and go back and check that the parent is still 1001 * pid, and go back and check that the parent is still
989 * the same. If it has changed (which is extremely unlikely 1002 * the same. If it has changed (which is extremely unlikely
990 * indeed), we just try again.. 1003 * indeed), we just try again..
991 * 1004 *
992 * NOTE! This depends on the fact that even if we _do_ 1005 * NOTE! This depends on the fact that even if we _do_
993 * get an old value of "parent", we can happily dereference 1006 * get an old value of "parent", we can happily dereference
994 * the pointer (it was and remains a dereferenceable kernel pointer 1007 * the pointer (it was and remains a dereferenceable kernel pointer
995 * no matter what): we just can't necessarily trust the result 1008 * no matter what): we just can't necessarily trust the result
996 * until we know that the parent pointer is valid. 1009 * until we know that the parent pointer is valid.
997 * 1010 *
998 * NOTE2: ->group_leader never changes from under us. 1011 * NOTE2: ->group_leader never changes from under us.
999 */ 1012 */
1000 asmlinkage long sys_getppid(void) 1013 asmlinkage long sys_getppid(void)
1001 { 1014 {
1002 int pid; 1015 int pid;
1003 struct task_struct *me = current; 1016 struct task_struct *me = current;
1004 struct task_struct *parent; 1017 struct task_struct *parent;
1005 1018
1006 parent = me->group_leader->real_parent; 1019 parent = me->group_leader->real_parent;
1007 for (;;) { 1020 for (;;) {
1008 pid = parent->tgid; 1021 pid = parent->tgid;
1009 #ifdef CONFIG_SMP 1022 #ifdef CONFIG_SMP
1010 { 1023 {
1011 struct task_struct *old = parent; 1024 struct task_struct *old = parent;
1012 1025
1013 /* 1026 /*
1014 * Make sure we read the pid before re-reading the 1027 * Make sure we read the pid before re-reading the
1015 * parent pointer: 1028 * parent pointer:
1016 */ 1029 */
1017 smp_rmb(); 1030 smp_rmb();
1018 parent = me->group_leader->real_parent; 1031 parent = me->group_leader->real_parent;
1019 if (old != parent) 1032 if (old != parent)
1020 continue; 1033 continue;
1021 } 1034 }
1022 #endif 1035 #endif
1023 break; 1036 break;
1024 } 1037 }
1025 return pid; 1038 return pid;
1026 } 1039 }
1027 1040
1028 asmlinkage long sys_getuid(void) 1041 asmlinkage long sys_getuid(void)
1029 { 1042 {
1030 /* Only we change this so SMP safe */ 1043 /* Only we change this so SMP safe */
1031 return current->uid; 1044 return current->uid;
1032 } 1045 }
1033 1046
1034 asmlinkage long sys_geteuid(void) 1047 asmlinkage long sys_geteuid(void)
1035 { 1048 {
1036 /* Only we change this so SMP safe */ 1049 /* Only we change this so SMP safe */
1037 return current->euid; 1050 return current->euid;
1038 } 1051 }
1039 1052
1040 asmlinkage long sys_getgid(void) 1053 asmlinkage long sys_getgid(void)
1041 { 1054 {
1042 /* Only we change this so SMP safe */ 1055 /* Only we change this so SMP safe */
1043 return current->gid; 1056 return current->gid;
1044 } 1057 }
1045 1058
1046 asmlinkage long sys_getegid(void) 1059 asmlinkage long sys_getegid(void)
1047 { 1060 {
1048 /* Only we change this so SMP safe */ 1061 /* Only we change this so SMP safe */
1049 return current->egid; 1062 return current->egid;
1050 } 1063 }
1051 1064
1052 #endif 1065 #endif
1053 1066
1054 static void process_timeout(unsigned long __data) 1067 static void process_timeout(unsigned long __data)
1055 { 1068 {
1056 wake_up_process((task_t *)__data); 1069 wake_up_process((task_t *)__data);
1057 } 1070 }
1058 1071
1059 /** 1072 /**
1060 * schedule_timeout - sleep until timeout 1073 * schedule_timeout - sleep until timeout
1061 * @timeout: timeout value in jiffies 1074 * @timeout: timeout value in jiffies
1062 * 1075 *
1063 * Make the current task sleep until @timeout jiffies have 1076 * Make the current task sleep until @timeout jiffies have
1064 * elapsed. The routine will return immediately unless 1077 * elapsed. The routine will return immediately unless
1065 * the current task state has been set (see set_current_state()). 1078 * the current task state has been set (see set_current_state()).
1066 * 1079 *
1067 * You can set the task state as follows - 1080 * You can set the task state as follows -
1068 * 1081 *
1069 * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to 1082 * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to
1070 * pass before the routine returns. The routine will return 0 1083 * pass before the routine returns. The routine will return 0
1071 * 1084 *
1072 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is 1085 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
1073 * delivered to the current task. In this case the remaining time 1086 * delivered to the current task. In this case the remaining time
1074 * in jiffies will be returned, or 0 if the timer expired in time 1087 * in jiffies will be returned, or 0 if the timer expired in time
1075 * 1088 *
1076 * The current task state is guaranteed to be TASK_RUNNING when this 1089 * The current task state is guaranteed to be TASK_RUNNING when this
1077 * routine returns. 1090 * routine returns.
1078 * 1091 *
1079 * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule 1092 * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule
1080 * the CPU away without a bound on the timeout. In this case the return 1093 * the CPU away without a bound on the timeout. In this case the return
1081 * value will be %MAX_SCHEDULE_TIMEOUT. 1094 * value will be %MAX_SCHEDULE_TIMEOUT.
1082 * 1095 *
1083 * In all cases the return value is guaranteed to be non-negative. 1096 * In all cases the return value is guaranteed to be non-negative.
1084 */ 1097 */
1085 fastcall signed long __sched schedule_timeout(signed long timeout) 1098 fastcall signed long __sched schedule_timeout(signed long timeout)
1086 { 1099 {
1087 struct timer_list timer; 1100 struct timer_list timer;
1088 unsigned long expire; 1101 unsigned long expire;
1089 1102
1090 switch (timeout) 1103 switch (timeout)
1091 { 1104 {
1092 case MAX_SCHEDULE_TIMEOUT: 1105 case MAX_SCHEDULE_TIMEOUT:
1093 /* 1106 /*
1094 * These two special cases are useful to be comfortable 1107 * These two special cases are useful to be comfortable
1095 * in the caller. Nothing more. We could take 1108 * in the caller. Nothing more. We could take
1096 * MAX_SCHEDULE_TIMEOUT from one of the negative values 1109 * MAX_SCHEDULE_TIMEOUT from one of the negative values
1097 * but I'd like to return a valid offset (>=0) to allow 1110 * but I'd like to return a valid offset (>=0) to allow
1098 * the caller to do everything it wants with the retval. 1111 * the caller to do everything it wants with the retval.
1099 */ 1112 */
1100 schedule(); 1113 schedule();
1101 goto out; 1114 goto out;
1102 default: 1115 default:
1103 /* 1116 /*
1104 * Another bit of PARANOID. Note that the retval will be 1117 * Another bit of PARANOID. Note that the retval will be
1105 * 0 since no piece of kernel is supposed to do a check 1118 * 0 since no piece of kernel is supposed to do a check
1106 * for a negative retval of schedule_timeout() (since it 1119 * for a negative retval of schedule_timeout() (since it
1107 * should never happen anyway). You just have the printk() 1120 * should never happen anyway). You just have the printk()
1108 * that will tell you if something has gone wrong and where. 1121 * that will tell you if something has gone wrong and where.
1109 */ 1122 */
1110 if (timeout < 0) 1123 if (timeout < 0)
1111 { 1124 {
1112 printk(KERN_ERR "schedule_timeout: wrong timeout " 1125 printk(KERN_ERR "schedule_timeout: wrong timeout "
1113 "value %lx from %p\n", timeout, 1126 "value %lx from %p\n", timeout,
1114 __builtin_return_address(0)); 1127 __builtin_return_address(0));
1115 current->state = TASK_RUNNING; 1128 current->state = TASK_RUNNING;
1116 goto out; 1129 goto out;
1117 } 1130 }
1118 } 1131 }
1119 1132
1120 expire = timeout + jiffies; 1133 expire = timeout + jiffies;
1121 1134
1122 init_timer(&timer); 1135 init_timer(&timer);
1123 timer.expires = expire; 1136 timer.expires = expire;
1124 timer.data = (unsigned long) current; 1137 timer.data = (unsigned long) current;
1125 timer.function = process_timeout; 1138 timer.function = process_timeout;
1126 1139
1127 add_timer(&timer); 1140 add_timer(&timer);
1128 schedule(); 1141 schedule();
1129 del_singleshot_timer_sync(&timer); 1142 del_singleshot_timer_sync(&timer);
1130 1143
1131 timeout = expire - jiffies; 1144 timeout = expire - jiffies;
1132 1145
1133 out: 1146 out:
1134 return timeout < 0 ? 0 : timeout; 1147 return timeout < 0 ? 0 : timeout;
1135 } 1148 }
1136 1149
1137 EXPORT_SYMBOL(schedule_timeout); 1150 EXPORT_SYMBOL(schedule_timeout);
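A hypothetical driver fragment showing the calling convention documented above (not part of this patch): the task state must be set before schedule_timeout() is called, and the return value is the unelapsed time in jiffies.

	signed long wait_for_device(void)	/* illustrative name only */
	{
		signed long remaining;

		set_current_state(TASK_INTERRUPTIBLE);
		remaining = schedule_timeout(HZ);	/* sleep up to one second */

		if (remaining)
			/* woken early, e.g. by a signal; 'remaining' jiffies left */
			return remaining;
		return 0;				/* the full timeout elapsed */
	}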
1138 1151
1139 /* Thread ID - the internal kernel "pid" */ 1152 /* Thread ID - the internal kernel "pid" */
1140 asmlinkage long sys_gettid(void) 1153 asmlinkage long sys_gettid(void)
1141 { 1154 {
1142 return current->pid; 1155 return current->pid;
1143 } 1156 }
1144 1157
1145 static long __sched nanosleep_restart(struct restart_block *restart) 1158 static long __sched nanosleep_restart(struct restart_block *restart)
1146 { 1159 {
1147 unsigned long expire = restart->arg0, now = jiffies; 1160 unsigned long expire = restart->arg0, now = jiffies;
1148 struct timespec __user *rmtp = (struct timespec __user *) restart->arg1; 1161 struct timespec __user *rmtp = (struct timespec __user *) restart->arg1;
1149 long ret; 1162 long ret;
1150 1163
1151 /* Did it expire while we handled signals? */ 1164 /* Did it expire while we handled signals? */
1152 if (!time_after(expire, now)) 1165 if (!time_after(expire, now))
1153 return 0; 1166 return 0;
1154 1167
1155 current->state = TASK_INTERRUPTIBLE; 1168 current->state = TASK_INTERRUPTIBLE;
1156 expire = schedule_timeout(expire - now); 1169 expire = schedule_timeout(expire - now);
1157 1170
1158 ret = 0; 1171 ret = 0;
1159 if (expire) { 1172 if (expire) {
1160 struct timespec t; 1173 struct timespec t;
1161 jiffies_to_timespec(expire, &t); 1174 jiffies_to_timespec(expire, &t);
1162 1175
1163 ret = -ERESTART_RESTARTBLOCK; 1176 ret = -ERESTART_RESTARTBLOCK;
1164 if (rmtp && copy_to_user(rmtp, &t, sizeof(t))) 1177 if (rmtp && copy_to_user(rmtp, &t, sizeof(t)))
1165 ret = -EFAULT; 1178 ret = -EFAULT;
1166 /* The 'restart' block is already filled in */ 1179 /* The 'restart' block is already filled in */
1167 } 1180 }
1168 return ret; 1181 return ret;
1169 } 1182 }
1170 1183
1171 asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp) 1184 asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp)
1172 { 1185 {
1173 struct timespec t; 1186 struct timespec t;
1174 unsigned long expire; 1187 unsigned long expire;
1175 long ret; 1188 long ret;
1176 1189
1177 if (copy_from_user(&t, rqtp, sizeof(t))) 1190 if (copy_from_user(&t, rqtp, sizeof(t)))
1178 return -EFAULT; 1191 return -EFAULT;
1179 1192
1180 if ((t.tv_nsec >= 1000000000L) || (t.tv_nsec < 0) || (t.tv_sec < 0)) 1193 if ((t.tv_nsec >= 1000000000L) || (t.tv_nsec < 0) || (t.tv_sec < 0))
1181 return -EINVAL; 1194 return -EINVAL;
1182 1195
1183 expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec); 1196 expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec);
1184 current->state = TASK_INTERRUPTIBLE; 1197 current->state = TASK_INTERRUPTIBLE;
1185 expire = schedule_timeout(expire); 1198 expire = schedule_timeout(expire);
1186 1199
1187 ret = 0; 1200 ret = 0;
1188 if (expire) { 1201 if (expire) {
1189 struct restart_block *restart; 1202 struct restart_block *restart;
1190 jiffies_to_timespec(expire, &t); 1203 jiffies_to_timespec(expire, &t);
1191 if (rmtp && copy_to_user(rmtp, &t, sizeof(t))) 1204 if (rmtp && copy_to_user(rmtp, &t, sizeof(t)))
1192 return -EFAULT; 1205 return -EFAULT;
1193 1206
1194 restart = &current_thread_info()->restart_block; 1207 restart = &current_thread_info()->restart_block;
1195 restart->fn = nanosleep_restart; 1208 restart->fn = nanosleep_restart;
1196 restart->arg0 = jiffies + expire; 1209 restart->arg0 = jiffies + expire;
1197 restart->arg1 = (unsigned long) rmtp; 1210 restart->arg1 = (unsigned long) rmtp;
1198 ret = -ERESTART_RESTARTBLOCK; 1211 ret = -ERESTART_RESTARTBLOCK;
1199 } 1212 }
1200 return ret; 1213 return ret;
1201 } 1214 }
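From userspace, the rmtp contract implemented above looks like the sketch below (illustrative only): when the sleep is interrupted by a signal, the unslept time is written back so the caller can resume.

	#include <time.h>
	#include <errno.h>

	static void sleep_full(struct timespec req)
	{
		struct timespec rem;

		while (nanosleep(&req, &rem) == -1 && errno == EINTR)
			req = rem;	/* retry with whatever time is left */
	}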
1202 1215
1203 /* 1216 /*
1204 * sys_sysinfo - fill in sysinfo struct 1217 * sys_sysinfo - fill in sysinfo struct
1205 */ 1218 */
1206 asmlinkage long sys_sysinfo(struct sysinfo __user *info) 1219 asmlinkage long sys_sysinfo(struct sysinfo __user *info)
1207 { 1220 {
1208 struct sysinfo val; 1221 struct sysinfo val;
1209 unsigned long mem_total, sav_total; 1222 unsigned long mem_total, sav_total;
1210 unsigned int mem_unit, bitcount; 1223 unsigned int mem_unit, bitcount;
1211 unsigned long seq; 1224 unsigned long seq;
1212 1225
1213 memset((char *)&val, 0, sizeof(struct sysinfo)); 1226 memset((char *)&val, 0, sizeof(struct sysinfo));
1214 1227
1215 do { 1228 do {
1216 struct timespec tp; 1229 struct timespec tp;
1217 seq = read_seqbegin(&xtime_lock); 1230 seq = read_seqbegin(&xtime_lock);
1218 1231
1219 /* 1232 /*
1220 * This is annoying. The below is the same thing 1233 * This is annoying. The below is the same thing
1221 * posix_get_clock_monotonic() does, but it takes the 1234 * posix_get_clock_monotonic() does, but it takes the
1222 * lock itself, and here the lock must also cover the 1235 * lock itself, and here the lock must also cover the
1223 * load-average reads. 1236 * load-average reads.
1224 */ 1237 */
1225 1238
1226 getnstimeofday(&tp); 1239 getnstimeofday(&tp);
1227 tp.tv_sec += wall_to_monotonic.tv_sec; 1240 tp.tv_sec += wall_to_monotonic.tv_sec;
1228 tp.tv_nsec += wall_to_monotonic.tv_nsec; 1241 tp.tv_nsec += wall_to_monotonic.tv_nsec;
1229 if (tp.tv_nsec - NSEC_PER_SEC >= 0) { 1242 if (tp.tv_nsec - NSEC_PER_SEC >= 0) {
1230 tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC; 1243 tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC;
1231 tp.tv_sec++; 1244 tp.tv_sec++;
1232 } 1245 }
1233 val.uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); 1246 val.uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
1234 1247
1235 val.loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT); 1248 val.loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT);
1236 val.loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT); 1249 val.loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT);
1237 val.loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT); 1250 val.loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT);
1238 1251
1239 val.procs = nr_threads; 1252 val.procs = nr_threads;
1240 } while (read_seqretry(&xtime_lock, seq)); 1253 } while (read_seqretry(&xtime_lock, seq));
1241 1254
1242 si_meminfo(&val); 1255 si_meminfo(&val);
1243 si_swapinfo(&val); 1256 si_swapinfo(&val);
1244 1257
1245 /* 1258 /*
1246 * If the sum of all the available memory (i.e. ram + swap) 1259 * If the sum of all the available memory (i.e. ram + swap)
1247 * is less than can be stored in a 32 bit unsigned long then 1260 * is less than can be stored in a 32 bit unsigned long then
1248 * we can be binary compatible with 2.2.x kernels. If not, 1261 * we can be binary compatible with 2.2.x kernels. If not,
1249 * well, in that case 2.2.x was broken anyways... 1262 * well, in that case 2.2.x was broken anyways...
1250 * 1263 *
1251 * -Erik Andersen <andersee@debian.org> 1264 * -Erik Andersen <andersee@debian.org>
1252 */ 1265 */
1253 1266
1254 mem_total = val.totalram + val.totalswap; 1267 mem_total = val.totalram + val.totalswap;
1255 if (mem_total < val.totalram || mem_total < val.totalswap) 1268 if (mem_total < val.totalram || mem_total < val.totalswap)
1256 goto out; 1269 goto out;
1257 bitcount = 0; 1270 bitcount = 0;
1258 mem_unit = val.mem_unit; 1271 mem_unit = val.mem_unit;
1259 while (mem_unit > 1) { 1272 while (mem_unit > 1) {
1260 bitcount++; 1273 bitcount++;
1261 mem_unit >>= 1; 1274 mem_unit >>= 1;
1262 sav_total = mem_total; 1275 sav_total = mem_total;
1263 mem_total <<= 1; 1276 mem_total <<= 1;
1264 if (mem_total < sav_total) 1277 if (mem_total < sav_total)
1265 goto out; 1278 goto out;
1266 } 1279 }
1267 1280
1268 /* 1281 /*
1269 * If mem_total did not overflow, multiply all memory values by 1282 * If mem_total did not overflow, multiply all memory values by
1270 * val.mem_unit and set it to 1. This leaves things compatible 1283 * val.mem_unit and set it to 1. This leaves things compatible
1271 * with 2.2.x, and also retains compatibility with earlier 2.4.x 1284 * with 2.2.x, and also retains compatibility with earlier 2.4.x
1272 * kernels... 1285 * kernels...
1273 */ 1286 */
1274 1287
1275 val.mem_unit = 1; 1288 val.mem_unit = 1;
1276 val.totalram <<= bitcount; 1289 val.totalram <<= bitcount;
1277 val.freeram <<= bitcount; 1290 val.freeram <<= bitcount;
1278 val.sharedram <<= bitcount; 1291 val.sharedram <<= bitcount;
1279 val.bufferram <<= bitcount; 1292 val.bufferram <<= bitcount;
1280 val.totalswap <<= bitcount; 1293 val.totalswap <<= bitcount;
1281 val.freeswap <<= bitcount; 1294 val.freeswap <<= bitcount;
1282 val.totalhigh <<= bitcount; 1295 val.totalhigh <<= bitcount;
1283 val.freehigh <<= bitcount; 1296 val.freehigh <<= bitcount;
1284 1297
1285 out: 1298 out:
1286 if (copy_to_user(info, &val, sizeof(struct sysinfo))) 1299 if (copy_to_user(info, &val, sizeof(struct sysinfo)))
1287 return -EFAULT; 1300 return -EFAULT;
1288 1301
1289 return 0; 1302 return 0;
1290 } 1303 }
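A userspace sketch of how the mem_unit normalization above is consumed (not part of this patch): regardless of whether the kernel folded mem_unit into the counters, total RAM in bytes is always totalram * mem_unit, and the load averages are scaled by 2^SI_LOAD_SHIFT.

	#include <sys/sysinfo.h>
	#include <stdio.h>

	int main(void)
	{
		struct sysinfo si;

		if (sysinfo(&si) == 0)
			printf("ram: %llu bytes, load1: %lu/65536\n",
			       (unsigned long long)si.totalram * si.mem_unit,
			       si.loads[0]);
		return 0;
	}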
1291 1304
1292 static void __devinit init_timers_cpu(int cpu) 1305 static void __devinit init_timers_cpu(int cpu)
1293 { 1306 {
1294 int j; 1307 int j;
1295 tvec_base_t *base; 1308 tvec_base_t *base;
1296 1309
1297 base = &per_cpu(tvec_bases, cpu); 1310 base = &per_cpu(tvec_bases, cpu);
1298 spin_lock_init(&base->t_base.lock); 1311 spin_lock_init(&base->t_base.lock);
1299 for (j = 0; j < TVN_SIZE; j++) { 1312 for (j = 0; j < TVN_SIZE; j++) {
1300 INIT_LIST_HEAD(base->tv5.vec + j); 1313 INIT_LIST_HEAD(base->tv5.vec + j);
1301 INIT_LIST_HEAD(base->tv4.vec + j); 1314 INIT_LIST_HEAD(base->tv4.vec + j);
1302 INIT_LIST_HEAD(base->tv3.vec + j); 1315 INIT_LIST_HEAD(base->tv3.vec + j);
1303 INIT_LIST_HEAD(base->tv2.vec + j); 1316 INIT_LIST_HEAD(base->tv2.vec + j);
1304 } 1317 }
1305 for (j = 0; j < TVR_SIZE; j++) 1318 for (j = 0; j < TVR_SIZE; j++)
1306 INIT_LIST_HEAD(base->tv1.vec + j); 1319 INIT_LIST_HEAD(base->tv1.vec + j);
1307 1320
1308 base->timer_jiffies = jiffies; 1321 base->timer_jiffies = jiffies;
1309 } 1322 }
1310 1323
1311 #ifdef CONFIG_HOTPLUG_CPU 1324 #ifdef CONFIG_HOTPLUG_CPU
1312 static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head) 1325 static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head)
1313 { 1326 {
1314 struct timer_list *timer; 1327 struct timer_list *timer;
1315 1328
1316 while (!list_empty(head)) { 1329 while (!list_empty(head)) {
1317 timer = list_entry(head->next, struct timer_list, entry); 1330 timer = list_entry(head->next, struct timer_list, entry);
1318 detach_timer(timer, 0); 1331 detach_timer(timer, 0);
1319 timer->base = &new_base->t_base; 1332 timer->base = &new_base->t_base;
1320 internal_add_timer(new_base, timer); 1333 internal_add_timer(new_base, timer);
1321 } 1334 }
1322 } 1335 }
1323 1336
1324 static void __devinit migrate_timers(int cpu) 1337 static void __devinit migrate_timers(int cpu)
1325 { 1338 {
1326 tvec_base_t *old_base; 1339 tvec_base_t *old_base;
1327 tvec_base_t *new_base; 1340 tvec_base_t *new_base;
1328 int i; 1341 int i;
1329 1342
1330 BUG_ON(cpu_online(cpu)); 1343 BUG_ON(cpu_online(cpu));
1331 old_base = &per_cpu(tvec_bases, cpu); 1344 old_base = &per_cpu(tvec_bases, cpu);
1332 new_base = &get_cpu_var(tvec_bases); 1345 new_base = &get_cpu_var(tvec_bases);
1333 1346
1334 local_irq_disable(); 1347 local_irq_disable();
1335 spin_lock(&new_base->t_base.lock); 1348 spin_lock(&new_base->t_base.lock);
1336 spin_lock(&old_base->t_base.lock); 1349 spin_lock(&old_base->t_base.lock);
1337 1350
1338 if (old_base->t_base.running_timer) 1351 if (old_base->t_base.running_timer)
1339 BUG(); 1352 BUG();
1340 for (i = 0; i < TVR_SIZE; i++) 1353 for (i = 0; i < TVR_SIZE; i++)
1341 migrate_timer_list(new_base, old_base->tv1.vec + i); 1354 migrate_timer_list(new_base, old_base->tv1.vec + i);
1342 for (i = 0; i < TVN_SIZE; i++) { 1355 for (i = 0; i < TVN_SIZE; i++) {
1343 migrate_timer_list(new_base, old_base->tv2.vec + i); 1356 migrate_timer_list(new_base, old_base->tv2.vec + i);
1344 migrate_timer_list(new_base, old_base->tv3.vec + i); 1357 migrate_timer_list(new_base, old_base->tv3.vec + i);
1345 migrate_timer_list(new_base, old_base->tv4.vec + i); 1358 migrate_timer_list(new_base, old_base->tv4.vec + i);
1346 migrate_timer_list(new_base, old_base->tv5.vec + i); 1359 migrate_timer_list(new_base, old_base->tv5.vec + i);
1347 } 1360 }
1348 1361
1349 spin_unlock(&old_base->t_base.lock); 1362 spin_unlock(&old_base->t_base.lock);
1350 spin_unlock(&new_base->t_base.lock); 1363 spin_unlock(&new_base->t_base.lock);
1351 local_irq_enable(); 1364 local_irq_enable();
1352 put_cpu_var(tvec_bases); 1365 put_cpu_var(tvec_bases);
1353 } 1366 }
1354 #endif /* CONFIG_HOTPLUG_CPU */ 1367 #endif /* CONFIG_HOTPLUG_CPU */
1355 1368
1356 static int __devinit timer_cpu_notify(struct notifier_block *self, 1369 static int __devinit timer_cpu_notify(struct notifier_block *self,
1357 unsigned long action, void *hcpu) 1370 unsigned long action, void *hcpu)
1358 { 1371 {
1359 long cpu = (long)hcpu; 1372 long cpu = (long)hcpu;
1360 switch(action) { 1373 switch(action) {
1361 case CPU_UP_PREPARE: 1374 case CPU_UP_PREPARE:
1362 init_timers_cpu(cpu); 1375 init_timers_cpu(cpu);
1363 break; 1376 break;
1364 #ifdef CONFIG_HOTPLUG_CPU 1377 #ifdef CONFIG_HOTPLUG_CPU
1365 case CPU_DEAD: 1378 case CPU_DEAD:
1366 migrate_timers(cpu); 1379 migrate_timers(cpu);
1367 break; 1380 break;
1368 #endif 1381 #endif
1369 default: 1382 default:
1370 break; 1383 break;
1371 } 1384 }
1372 return NOTIFY_OK; 1385 return NOTIFY_OK;
1373 } 1386 }
1374 1387
1375 static struct notifier_block __devinitdata timers_nb = { 1388 static struct notifier_block __devinitdata timers_nb = {
1376 .notifier_call = timer_cpu_notify, 1389 .notifier_call = timer_cpu_notify,
1377 }; 1390 };
1378 1391
1379 1392
1380 void __init init_timers(void) 1393 void __init init_timers(void)
1381 { 1394 {
1382 timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, 1395 timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE,
1383 (void *)(long)smp_processor_id()); 1396 (void *)(long)smp_processor_id());
1384 register_cpu_notifier(&timers_nb); 1397 register_cpu_notifier(&timers_nb);
1385 open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL); 1398 open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL);
1386 } 1399 }
1387 1400
1388 #ifdef CONFIG_TIME_INTERPOLATION 1401 #ifdef CONFIG_TIME_INTERPOLATION
1389 1402
1390 struct time_interpolator *time_interpolator; 1403 struct time_interpolator *time_interpolator;
1391 static struct time_interpolator *time_interpolator_list; 1404 static struct time_interpolator *time_interpolator_list;
1392 static DEFINE_SPINLOCK(time_interpolator_lock); 1405 static DEFINE_SPINLOCK(time_interpolator_lock);
1393 1406
1394 static inline u64 time_interpolator_get_cycles(unsigned int src) 1407 static inline u64 time_interpolator_get_cycles(unsigned int src)
1395 { 1408 {
1396 unsigned long (*x)(void); 1409 unsigned long (*x)(void);
1397 1410
1398 switch (src) 1411 switch (src)
1399 { 1412 {
1400 case TIME_SOURCE_FUNCTION: 1413 case TIME_SOURCE_FUNCTION:
1401 x = time_interpolator->addr; 1414 x = time_interpolator->addr;
1402 return x(); 1415 return x();
1403 1416
1404 case TIME_SOURCE_MMIO64 : 1417 case TIME_SOURCE_MMIO64 :
1405 return readq((void __iomem *) time_interpolator->addr); 1418 return readq((void __iomem *) time_interpolator->addr);
1406 1419
1407 case TIME_SOURCE_MMIO32 : 1420 case TIME_SOURCE_MMIO32 :
1408 return readl((void __iomem *) time_interpolator->addr); 1421 return readl((void __iomem *) time_interpolator->addr);
1409 1422
1410 default: return get_cycles(); 1423 default: return get_cycles();
1411 } 1424 }
1412 } 1425 }
1413 1426
1414 static inline u64 time_interpolator_get_counter(void) 1427 static inline u64 time_interpolator_get_counter(void)
1415 { 1428 {
1416 unsigned int src = time_interpolator->source; 1429 unsigned int src = time_interpolator->source;
1417 1430
1418 if (time_interpolator->jitter) 1431 if (time_interpolator->jitter)
1419 { 1432 {
1420 u64 lcycle; 1433 u64 lcycle;
1421 u64 now; 1434 u64 now;
1422 1435
1423 do { 1436 do {
1424 lcycle = time_interpolator->last_cycle; 1437 lcycle = time_interpolator->last_cycle;
1425 now = time_interpolator_get_cycles(src); 1438 now = time_interpolator_get_cycles(src);
1426 if (lcycle && time_after(lcycle, now)) 1439 if (lcycle && time_after(lcycle, now))
1427 return lcycle; 1440 return lcycle;
1428 /* Keep track of the last timer value returned. The use of cmpxchg here 1441 /* Keep track of the last timer value returned. The use of cmpxchg here
1429 * will cause contention in an SMP environment. 1442 * will cause contention in an SMP environment.
1430 */ 1443 */
1431 } while (unlikely(cmpxchg(&time_interpolator->last_cycle, lcycle, now) != lcycle)); 1444 } while (unlikely(cmpxchg(&time_interpolator->last_cycle, lcycle, now) != lcycle));
1432 return now; 1445 return now;
1433 } 1446 }
1434 else 1447 else
1435 return time_interpolator_get_cycles(src); 1448 return time_interpolator_get_cycles(src);
1436 } 1449 }
1437 1450
1438 void time_interpolator_reset(void) 1451 void time_interpolator_reset(void)
1439 { 1452 {
1440 time_interpolator->offset = 0; 1453 time_interpolator->offset = 0;
1441 time_interpolator->last_counter = time_interpolator_get_counter(); 1454 time_interpolator->last_counter = time_interpolator_get_counter();
1442 } 1455 }
1443 1456
1444 #define GET_TI_NSECS(count,i) (((((count) - i->last_counter) & (i)->mask) * (i)->nsec_per_cyc) >> (i)->shift) 1457 #define GET_TI_NSECS(count,i) (((((count) - i->last_counter) & (i)->mask) * (i)->nsec_per_cyc) >> (i)->shift)
1445 1458
1446 unsigned long time_interpolator_get_offset(void) 1459 unsigned long time_interpolator_get_offset(void)
1447 { 1460 {
1448 /* If we do not have a time interpolator set up then just return zero */ 1461 /* If we do not have a time interpolator set up then just return zero */
1449 if (!time_interpolator) 1462 if (!time_interpolator)
1450 return 0; 1463 return 0;
1451 1464
1452 return time_interpolator->offset + 1465 return time_interpolator->offset +
1453 GET_TI_NSECS(time_interpolator_get_counter(), time_interpolator); 1466 GET_TI_NSECS(time_interpolator_get_counter(), time_interpolator);
1454 } 1467 }
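For concreteness, GET_TI_NSECS() above is just (elapsed counter cycles, masked to the counter width) * nsec_per_cyc >> shift. A minimal sketch with assumed numbers (a 10 MHz counter, shift = 16, full 64-bit mask), not part of this patch:

	#include <stdint.h>

	#define NSEC_PER_SEC 1000000000ULL

	/* Same arithmetic as GET_TI_NSECS(); the mask is omitted because a
	 * full-width 64-bit counter is assumed here. */
	static uint64_t elapsed_ns(uint64_t now, uint64_t last,
				   uint64_t nsec_per_cyc, unsigned int shift)
	{
		return ((now - last) * nsec_per_cyc) >> shift;
	}

	/* register_time_interpolator() computes
	 *   nsec_per_cyc = (NSEC_PER_SEC << 16) / 10000000 = 6553600,
	 * so elapsed_ns(100, 0, 6553600, 16) == 10000 ns, i.e. 10 us for
	 * 100 cycles of a 10 MHz counter. */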
1455 1468
1456 #define INTERPOLATOR_ADJUST 65536 1469 #define INTERPOLATOR_ADJUST 65536
1457 #define INTERPOLATOR_MAX_SKIP 10*INTERPOLATOR_ADJUST 1470 #define INTERPOLATOR_MAX_SKIP 10*INTERPOLATOR_ADJUST
1458 1471
1459 static void time_interpolator_update(long delta_nsec) 1472 static void time_interpolator_update(long delta_nsec)
1460 { 1473 {
1461 u64 counter; 1474 u64 counter;
1462 unsigned long offset; 1475 unsigned long offset;
1463 1476
1464 /* If there is no time interpolator set up then do nothing */ 1477 /* If there is no time interpolator set up then do nothing */
1465 if (!time_interpolator) 1478 if (!time_interpolator)
1466 return; 1479 return;
1467 1480
1468 /* The interpolator compensates for late ticks by accumulating 1481 /* The interpolator compensates for late ticks by accumulating
1469 * the late time in time_interpolator->offset. A tick earlier than 1482 * the late time in time_interpolator->offset. A tick earlier than
1470 * expected will lead to a reset of the offset and a corresponding 1483 * expected will lead to a reset of the offset and a corresponding
1471 * jump of the clock forward. Again this only works if the 1484 * jump of the clock forward. Again this only works if the
1472 * interpolator clock is running slightly slower than the regular clock 1485 * interpolator clock is running slightly slower than the regular clock
1473 * and the tuning logic ensures that. 1486 * and the tuning logic ensures that.
1474 */ 1487 */
1475 1488
1476 counter = time_interpolator_get_counter(); 1489 counter = time_interpolator_get_counter();
1477 offset = time_interpolator->offset + GET_TI_NSECS(counter, time_interpolator); 1490 offset = time_interpolator->offset + GET_TI_NSECS(counter, time_interpolator);
1478 1491
1479 if (delta_nsec < 0 || (unsigned long) delta_nsec < offset) 1492 if (delta_nsec < 0 || (unsigned long) delta_nsec < offset)
1480 time_interpolator->offset = offset - delta_nsec; 1493 time_interpolator->offset = offset - delta_nsec;
1481 else { 1494 else {
1482 time_interpolator->skips++; 1495 time_interpolator->skips++;
1483 time_interpolator->ns_skipped += delta_nsec - offset; 1496 time_interpolator->ns_skipped += delta_nsec - offset;
1484 time_interpolator->offset = 0; 1497 time_interpolator->offset = 0;
1485 } 1498 }
1486 time_interpolator->last_counter = counter; 1499 time_interpolator->last_counter = counter;
1487 1500
1488 /* Tuning logic for time interpolator invoked every minute or so. 1501 /* Tuning logic for time interpolator invoked every minute or so.
1489 * Decrease interpolator clock speed if no skips occurred and an offset is carried. 1502 * Decrease interpolator clock speed if no skips occurred and an offset is carried.
1490 * Increase interpolator clock speed if we skip too much time. 1503 * Increase interpolator clock speed if we skip too much time.
1491 */ 1504 */
1492 if (jiffies % INTERPOLATOR_ADJUST == 0) 1505 if (jiffies % INTERPOLATOR_ADJUST == 0)
1493 { 1506 {
1494 if (time_interpolator->skips == 0 && time_interpolator->offset > TICK_NSEC) 1507 if (time_interpolator->skips == 0 && time_interpolator->offset > TICK_NSEC)
1495 time_interpolator->nsec_per_cyc--; 1508 time_interpolator->nsec_per_cyc--;
1496 if (time_interpolator->ns_skipped > INTERPOLATOR_MAX_SKIP && time_interpolator->offset == 0) 1509 if (time_interpolator->ns_skipped > INTERPOLATOR_MAX_SKIP && time_interpolator->offset == 0)
1497 time_interpolator->nsec_per_cyc++; 1510 time_interpolator->nsec_per_cyc++;
1498 time_interpolator->skips = 0; 1511 time_interpolator->skips = 0;
1499 time_interpolator->ns_skipped = 0; 1512 time_interpolator->ns_skipped = 0;
1500 } 1513 }
1501 } 1514 }
1502 1515
1503 static inline int 1516 static inline int
1504 is_better_time_interpolator(struct time_interpolator *new) 1517 is_better_time_interpolator(struct time_interpolator *new)
1505 { 1518 {
1506 if (!time_interpolator) 1519 if (!time_interpolator)
1507 return 1; 1520 return 1;
1508 return new->frequency > 2*time_interpolator->frequency || 1521 return new->frequency > 2*time_interpolator->frequency ||
1509 (unsigned long)new->drift < (unsigned long)time_interpolator->drift; 1522 (unsigned long)new->drift < (unsigned long)time_interpolator->drift;
1510 } 1523 }
1511 1524
1512 void 1525 void
1513 register_time_interpolator(struct time_interpolator *ti) 1526 register_time_interpolator(struct time_interpolator *ti)
1514 { 1527 {
1515 unsigned long flags; 1528 unsigned long flags;
1516 1529
1517 /* Sanity check */ 1530 /* Sanity check */
1518 if (ti->frequency == 0 || ti->mask == 0) 1531 if (ti->frequency == 0 || ti->mask == 0)
1519 BUG(); 1532 BUG();
1520 1533
1521 ti->nsec_per_cyc = ((u64)NSEC_PER_SEC << ti->shift) / ti->frequency; 1534 ti->nsec_per_cyc = ((u64)NSEC_PER_SEC << ti->shift) / ti->frequency;
1522 spin_lock(&time_interpolator_lock); 1535 spin_lock(&time_interpolator_lock);
1523 write_seqlock_irqsave(&xtime_lock, flags); 1536 write_seqlock_irqsave(&xtime_lock, flags);
1524 if (is_better_time_interpolator(ti)) { 1537 if (is_better_time_interpolator(ti)) {
1525 time_interpolator = ti; 1538 time_interpolator = ti;
1526 time_interpolator_reset(); 1539 time_interpolator_reset();
1527 } 1540 }
1528 write_sequnlock_irqrestore(&xtime_lock, flags); 1541 write_sequnlock_irqrestore(&xtime_lock, flags);
1529 1542
1530 ti->next = time_interpolator_list; 1543 ti->next = time_interpolator_list;
1531 time_interpolator_list = ti; 1544 time_interpolator_list = ti;
1532 spin_unlock(&time_interpolator_lock); 1545 spin_unlock(&time_interpolator_lock);
1533 } 1546 }
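A hypothetical platform snippet registering a memory-mapped 32-bit counter as a time source (not part of this patch). Field names follow the uses above (source, addr, mask, frequency, drift, shift); the counter address and init hook are assumptions, and the addr field's exact type is whatever include/linux/timex.h declares for this era.

	static struct time_interpolator my_interpolator = {
		.source    = TIME_SOURCE_MMIO32,
		.shift     = 16,
		.frequency = 10000000,		/* assumed 10 MHz counter */
		.drift     = -1,		/* drift unknown */
		.mask      = 0xffffffffUL,	/* counter wraps at 32 bits */
	};

	static int __init my_timesource_init(void)
	{
		/* my_counter_base: hypothetical, already-ioremapped counter */
		my_interpolator.addr = my_counter_base;
		register_time_interpolator(&my_interpolator);
		return 0;
	}
	__initcall(my_timesource_init);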
1534 1547
1535 void 1548 void
1536 unregister_time_interpolator(struct time_interpolator *ti) 1549 unregister_time_interpolator(struct time_interpolator *ti)
1537 { 1550 {
1538 struct time_interpolator *curr, **prev; 1551 struct time_interpolator *curr, **prev;
1539 unsigned long flags; 1552 unsigned long flags;
1540 1553
1541 spin_lock(&time_interpolator_lock); 1554 spin_lock(&time_interpolator_lock);
1542 prev = &time_interpolator_list; 1555 prev = &time_interpolator_list;
1543 for (curr = *prev; curr; curr = curr->next) { 1556 for (curr = *prev; curr; curr = curr->next) {
1544 if (curr == ti) { 1557 if (curr == ti) {
1545 *prev = curr->next; 1558 *prev = curr->next;
1546 break; 1559 break;
1547 } 1560 }
1548 prev = &curr->next; 1561 prev = &curr->next;
1549 } 1562 }
1550 1563
1551 write_seqlock_irqsave(&xtime_lock, flags); 1564 write_seqlock_irqsave(&xtime_lock, flags);
1552 if (ti == time_interpolator) { 1565 if (ti == time_interpolator) {
1553 /* we lost the best time-interpolator: */ 1566 /* we lost the best time-interpolator: */
1554 time_interpolator = NULL; 1567 time_interpolator = NULL;
1555 /* find the next-best interpolator */ 1568 /* find the next-best interpolator */
1556 for (curr = time_interpolator_list; curr; curr = curr->next) 1569 for (curr = time_interpolator_list; curr; curr = curr->next)
1557 if (is_better_time_interpolator(curr)) 1570 if (is_better_time_interpolator(curr))
1558 time_interpolator = curr; 1571 time_interpolator = curr;
1559 time_interpolator_reset(); 1572 time_interpolator_reset();
1560 } 1573 }
1561 write_sequnlock_irqrestore(&xtime_lock, flags); 1574 write_sequnlock_irqrestore(&xtime_lock, flags);
1562 spin_unlock(&time_interpolator_lock); 1575 spin_unlock(&time_interpolator_lock);
1563 } 1576 }
1564 #endif /* CONFIG_TIME_INTERPOLATION */ 1577 #endif /* CONFIG_TIME_INTERPOLATION */
1565 1578
1566 /** 1579 /**
1567 * msleep - sleep safely even with waitqueue interruptions 1580 * msleep - sleep safely even with waitqueue interruptions
1568 * @msecs: Time in milliseconds to sleep for 1581 * @msecs: Time in milliseconds to sleep for
1569 */ 1582 */
1570 void msleep(unsigned int msecs) 1583 void msleep(unsigned int msecs)
1571 { 1584 {
1572 unsigned long timeout = msecs_to_jiffies(msecs) + 1; 1585 unsigned long timeout = msecs_to_jiffies(msecs) + 1;
1573 1586
1574 while (timeout) { 1587 while (timeout) {
1575 set_current_state(TASK_UNINTERRUPTIBLE); 1588 set_current_state(TASK_UNINTERRUPTIBLE);
1576 timeout = schedule_timeout(timeout); 1589 timeout = schedule_timeout(timeout);
1577 } 1590 }
1578 } 1591 }
1579 1592
1580 EXPORT_SYMBOL(msleep); 1593 EXPORT_SYMBOL(msleep);
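A hypothetical polling helper built on msleep() (not part of this patch): it waits up to roughly one second for a ready predicate, checking every 10 ms; all names are illustrative.

	static int wait_until_ready(int (*ready)(void))
	{
		int i;

		for (i = 0; i < 100; i++) {
			if (ready())
				return 0;
			msleep(10);	/* sleeps at least 10 ms per poll */
		}
		return -ETIMEDOUT;
	}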
1581 1594
1582 /** 1595 /**
1583 * msleep_interruptible - sleep waiting for waitqueue interruptions 1596 * msleep_interruptible - sleep waiting for waitqueue interruptions