Blame view

drivers/char/hangcheck-timer.c 6.2 KB
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1
2
3
4
5
  /*
   * hangcheck-timer.c
   *
   * Driver for a little io fencing timer.
   *
696f9486d   Joel Becker   [PATCH] hangcheck...
6
   * Copyright (C) 2002, 2003 Oracle.  All rights reserved.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
   *
   * Author: Joel Becker <joel.becker@oracle.com>
   *
   * This program is free software; you can redistribute it and/or
   * modify it under the terms of the GNU General Public
   * License version 2 as published by the Free Software Foundation.
   * 
   * This program is distributed in the hope that it will be useful,
   * but WITHOUT ANY WARRANTY; without even the implied warranty of
   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   * General Public License for more details.
   * 
   * You should have received a copy of the GNU General Public
   * License along with this program; if not, write to the
   * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
   * Boston, MA 02111-1307, USA.
   */
  
  /*
   * The hangcheck-timer driver uses the TSC to catch delays that
   * jiffies does not notice.  A timer is set.  When the timer fires, it
   * checks whether it was delayed and if that delay exceeds a given
8dfba4d71   Joe Perches   drivers/char/: Sp...
29
   * margin of error.  The hangcheck_tick module parameter takes the timer
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
   * duration in seconds.  The hangcheck_margin parameter defines the
   * margin of error, in seconds.  The defaults are 180 seconds for the
   * timer and 60 seconds for the margin of error.  IOW, a timer is set
   * for 180 seconds.  When the timer fires, the callback checks the
   * actual duration that the timer waited.  If the duration exceeds the
   * allotted time and margin (here 180 + 60, or 240 seconds), the machine
   * is restarted.  A healthy machine will have the duration match the
   * expected timeout very closely.
   */
  
  #include <linux/module.h>
  #include <linux/moduleparam.h>
  #include <linux/types.h>
  #include <linux/kernel.h>
  #include <linux/fs.h>
  #include <linux/mm.h>
  #include <linux/reboot.h>
  #include <linux/init.h>
696f9486d   Joel Becker   [PATCH] hangcheck...
48
  #include <linux/delay.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
49
  #include <asm/uaccess.h>
696f9486d   Joel Becker   [PATCH] hangcheck...
50
  #include <linux/sysrq.h>
e8edc6e03   Alexey Dobriyan   Detach sched.h fr...
51
  #include <linux/timer.h>
940370fc8   Yury Polyanskiy   hangcheck-timer: ...
52
  #include <linux/time.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
53

940370fc8   Yury Polyanskiy   hangcheck-timer: ...
54
  #define VERSION_STR "0.9.1"
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
55
56
57
58
59
60
61
  
  /* Compile-time defaults for the two timing parameters below. */
  #define DEFAULT_IOFENCE_MARGIN 60	/* Default fudge factor, in seconds */
  #define DEFAULT_IOFENCE_TICK 180	/* Default timer timeout, in seconds */
  
  /* Timer period in seconds; the timer is re-armed for hangcheck_tick*HZ jiffies. */
  static int hangcheck_tick = DEFAULT_IOFENCE_TICK;
  /* Extra delay, in seconds, tolerated on top of hangcheck_tick before firing. */
  static int hangcheck_margin = DEFAULT_IOFENCE_MARGIN;
  static int hangcheck_reboot;  /* Defaults to not reboot */
696f9486d   Joel Becker   [PATCH] hangcheck...
62
  static int hangcheck_dump_tasks;  /* Defaults to not dumping SysRQ T */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
63

696f9486d   Joel Becker   [PATCH] hangcheck...
64
  /* options - modular */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
65
66
67
68
69
70
  /*
   * Expose the timing and reboot knobs as integer module parameters.
   * hangcheck_tick and hangcheck_margin are interpreted in seconds.
   */
  module_param(hangcheck_tick, int, 0);
  MODULE_PARM_DESC(hangcheck_tick, "Timer delay.");
  module_param(hangcheck_margin, int, 0);
  MODULE_PARM_DESC(hangcheck_margin, "If the hangcheck timer has been delayed more than hangcheck_margin seconds, the driver will fire.");
  module_param(hangcheck_reboot, int, 0);
  MODULE_PARM_DESC(hangcheck_reboot, "If nonzero, the machine will reboot when the timer margin is exceeded.");
696f9486d   Joel Becker   [PATCH] hangcheck...
71
72
  module_param(hangcheck_dump_tasks, int, 0);
  MODULE_PARM_DESC(hangcheck_dump_tasks, "If nonzero, the machine will dump the system task state when the timer margin is exceeded.");
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
73

696f9486d   Joel Becker   [PATCH] hangcheck...
74
  MODULE_AUTHOR("Oracle");
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
75
76
  MODULE_DESCRIPTION("Hangcheck-timer detects when the system has gone out to lunch past a certain margin.");
  MODULE_LICENSE("GPL");
696f9486d   Joel Becker   [PATCH] hangcheck...
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
  MODULE_VERSION(VERSION_STR);
  
  /* options - nonmodular */
  #ifndef MODULE
  
  /*
   * Boot-option handler for the timer period.
   * @str: option text handed over by the __setup() machinery.
   *
   * Stores the parsed integer in hangcheck_tick when get_option() reports
   * success; otherwise the current value is left alone.  Always returns 1.
   */
  static int __init hangcheck_parse_tick(char *str)
  {
  	int value;

  	if (!get_option(&str, &value))
  		return 1;

  	hangcheck_tick = value;
  	return 1;
  }
  
  /*
   * Boot-option handler for the margin of error.
   * @str: option text handed over by the __setup() machinery.
   *
   * Stores the parsed integer in hangcheck_margin when get_option() reports
   * success; otherwise the current value is left alone.  Always returns 1.
   */
  static int __init hangcheck_parse_margin(char *str)
  {
  	int value;

  	if (!get_option(&str, &value))
  		return 1;

  	hangcheck_margin = value;
  	return 1;
  }
  
  /*
   * Boot-option handler for the reboot-on-hang switch.
   * @str: option text handed over by the __setup() machinery.
   *
   * Stores the parsed integer in hangcheck_reboot when get_option() reports
   * success; otherwise the current value is left alone.  Always returns 1.
   */
  static int __init hangcheck_parse_reboot(char *str)
  {
  	int value;

  	if (!get_option(&str, &value))
  		return 1;

  	hangcheck_reboot = value;
  	return 1;
  }
  
  /*
   * Boot-option handler for the dump-task-state switch.
   * @str: option text handed over by the __setup() machinery.
   *
   * Stores the parsed integer in hangcheck_dump_tasks when get_option()
   * reports success; otherwise the current value is left alone.
   * Always returns 1.
   */
  static int __init hangcheck_parse_dump_tasks(char *str)
  {
  	int value;

  	if (!get_option(&str, &value))
  		return 1;

  	hangcheck_dump_tasks = value;
  	return 1;
  }
  
  /*
   * Register the boot command-line handlers; this section is only compiled
   * when MODULE is not defined.
   * NOTE(review): the option strings have no trailing '='.  Presumably the
   * handlers are then called with text that starts at the '=' -- confirm
   * that get_option() can parse that, otherwise these options are ignored.
   */
  __setup("hcheck_tick", hangcheck_parse_tick);
  __setup("hcheck_margin", hangcheck_parse_margin);
  __setup("hcheck_reboot", hangcheck_parse_reboot);
  __setup("hcheck_dump_tasks", hangcheck_parse_dump_tasks);
  #endif /* not MODULE */
1489939f0   John Stultz   [PATCH] time: x86...
119
  #if defined(CONFIG_S390)
696f9486d   Joel Becker   [PATCH] hangcheck...
120
121
  # define HAVE_MONOTONIC
  # define TIMER_FREQ 1000000000ULL
ede65f392   Andrew Morton   [PATCH] hangcheck...
122
  #else
940370fc8   Yury Polyanskiy   hangcheck-timer: ...
123
  # define TIMER_FREQ 1000000000ULL
696f9486d   Joel Becker   [PATCH] hangcheck...
124
125
126
127
128
129
130
  #endif
  
  #ifdef HAVE_MONOTONIC
  extern unsigned long long monotonic_clock(void);
  #else
  static inline unsigned long long monotonic_clock(void)
  {
940370fc8   Yury Polyanskiy   hangcheck-timer: ...
131
132
133
  	struct timespec ts;
  	getrawmonotonic(&ts);
  	return timespec_to_ns(&ts);
696f9486d   Joel Becker   [PATCH] hangcheck...
134
135
  }
  #endif  /* HAVE_MONOTONIC */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
136
137
138
139
140
141
  
  
  /*
   * hangcheck_tsc: monotonic_clock() reading stored each time the timer
   * is armed (in hangcheck_init() and at the end of hangcheck_fire()).
   * hangcheck_tsc_margin: largest elapsed reading tolerated before the
   * driver fires; set in hangcheck_init() to
   * (hangcheck_margin + hangcheck_tick) * TIMER_FREQ.
   */
  static unsigned long long hangcheck_tsc, hangcheck_tsc_margin;
  
  static void hangcheck_fire(unsigned long);
8d06afab7   Ingo Molnar   [PATCH] timer ini...
142
  static DEFINE_TIMER(hangcheck_ticktock, hangcheck_fire, 0, 0);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
143

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
144
145
146
147
148
149
150
151
152
153
154
155
156
  
  static void hangcheck_fire(unsigned long data)
  {
  	unsigned long long cur_tsc, tsc_diff;
  
  	cur_tsc = monotonic_clock();
  
  	if (cur_tsc > hangcheck_tsc)
  		tsc_diff = cur_tsc - hangcheck_tsc;
  	else
  		tsc_diff = (cur_tsc + (~0ULL - hangcheck_tsc)); /* or something */
  
  	if (tsc_diff > hangcheck_tsc_margin) {
696f9486d   Joel Becker   [PATCH] hangcheck...
157
158
159
160
  		if (hangcheck_dump_tasks) {
  			printk(KERN_CRIT "Hangcheck: Task state:
  ");
  #ifdef CONFIG_MAGIC_SYSRQ
f335397d1   Dmitry Torokhov   Input: sysrq - dr...
161
  			handle_sysrq('t');
696f9486d   Joel Becker   [PATCH] hangcheck...
162
163
  #endif  /* CONFIG_MAGIC_SYSRQ */
  		}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
164
165
166
  		if (hangcheck_reboot) {
  			printk(KERN_CRIT "Hangcheck: hangcheck is restarting the machine.
  ");
970d32443   Eric W. Biederman   [PATCH] In hangch...
167
  			emergency_restart();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
168
169
170
171
172
  		} else {
  			printk(KERN_CRIT "Hangcheck: hangcheck value past margin!
  ");
  		}
  	}
940370fc8   Yury Polyanskiy   hangcheck-timer: ...
173
174
175
176
177
178
179
180
  #if 0
  	/*
  	 * Enable to investigate delays in detail
  	 */
  	printk("Hangcheck: called %Ld ns since last time (%Ld ns overshoot)
  ",
  			tsc_diff, tsc_diff - hangcheck_tick*TIMER_FREQ);
  #endif
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
181
182
183
184
185
186
187
188
189
190
  	mod_timer(&hangcheck_ticktock, jiffies + (hangcheck_tick*HZ));
  	hangcheck_tsc = monotonic_clock();
  }
  
  
  /*
   * hangcheck_init - module entry point.
   *
   * Logs the configured tick and margin and which clock source was compiled
   * in, converts (hangcheck_margin + hangcheck_tick) seconds into
   * TIMER_FREQ units for hangcheck_tsc_margin, stores the first clock
   * reading and arms the timer for hangcheck_tick seconds.
   *
   * Returns 0.
   */
  static int __init hangcheck_init(void)
  {
  	printk("Hangcheck: starting hangcheck timer %s (tick is %d seconds, margin is %d seconds).\n",
  	       VERSION_STR, hangcheck_tick, hangcheck_margin);
  #if defined (HAVE_MONOTONIC)
  	printk("Hangcheck: Using monotonic_clock().\n");
  #else
  	printk("Hangcheck: Using getrawmonotonic().\n");
  #endif  /* HAVE_MONOTONIC */
  	/*
  	 * Widen each operand before adding so the sum is computed in
  	 * unsigned long long rather than in int, where overflow of large
  	 * parameter values would be undefined.
  	 */
  	hangcheck_tsc_margin =
  		(unsigned long long)hangcheck_margin +
  		(unsigned long long)hangcheck_tick;
  	hangcheck_tsc_margin *= (unsigned long long)TIMER_FREQ;

  	hangcheck_tsc = monotonic_clock();
  	mod_timer(&hangcheck_ticktock, jiffies + (hangcheck_tick*HZ));

  	return 0;
  }
  
  
  /*
   * hangcheck_exit - module exit point.
   *
   * Removes the hangcheck timer with del_timer_sync() and logs that the
   * timer has been stopped.
   */
  static void __exit hangcheck_exit(void)
  {
  	del_timer_sync(&hangcheck_ticktock);
  	printk("Hangcheck: Stopped hangcheck timer.\n");
  }
  
  module_init(hangcheck_init);
  module_exit(hangcheck_exit);