Commit eaa958402ea40851097d051f52ba1bb7a885efe9

Authored by Yinghai Lu
Committed by Rusty Russell
1 parent 0281b5dc03

cpumask: alloc zeroed cpumask for static cpumask_var_ts

These masks are defined as static cpumask_var_t, so when MAXSMP is not set
they are already cleared (they live in zero-initialized static storage).
Allocate them zeroed as well, to avoid surprises when MAXSMP is enabled.

Signed-off-by: Yinghai Lu <yinghai.lu@kernel.org>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
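
For context, here is a minimal, hypothetical sketch of the pattern this commit standardizes (the mask name and initcall are made up, not taken from any of the patched drivers): without MAXSMP, cpumask_var_t is a one-element array, so a static definition is already all-zero; with MAXSMP (CONFIG_CPUMASK_OFFSTACK) it is a pointer that must be allocated, and using the zalloc variant keeps both configurations starting from an empty mask.

#include <linux/cpumask.h>
#include <linux/gfp.h>
#include <linux/init.h>

/* Hypothetical mask; not one of the masks touched by this commit. */
static cpumask_var_t tracked_cpus;

static int __init tracked_cpus_init(void)
{
	/*
	 * Without MAXSMP the static definition above is already cleared.
	 * With MAXSMP, zalloc_cpumask_var() allocates the mask and returns
	 * it zeroed, so the two configurations behave identically.
	 */
	if (!zalloc_cpumask_var(&tracked_cpus, GFP_KERNEL))
		return -ENOMEM;
	return 0;
}
core_initcall(tracked_cpus_init);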

Showing 11 changed files with 11 additions and 11 deletions

arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -550,7 +550,7 @@ static int __init acpi_cpufreq_early_init(void)
 		return -ENOMEM;
 	}
 	for_each_possible_cpu(i) {
-		if (!alloc_cpumask_var_node(
+		if (!zalloc_cpumask_var_node(
 			&per_cpu_ptr(acpi_perf_data, i)->shared_cpu_map,
 			GFP_KERNEL, cpu_to_node(i))) {
(all other lines of this file are unchanged context)
arch/x86/kernel/cpu/cpufreq/powernow-k7.c
(inline view truncated in this excerpt: only unchanged context from the top of powernow-k7.c is shown, ending before the changed hunk)
245 if (fidvidctl.bits.VID != vid) { 245 if (fidvidctl.bits.VID != vid) {
246 fidvidctl.bits.SGTC = latency; 246 fidvidctl.bits.SGTC = latency;
247 fidvidctl.bits.VID = vid; 247 fidvidctl.bits.VID = vid;
248 fidvidctl.bits.FIDC = 0; 248 fidvidctl.bits.FIDC = 0;
249 fidvidctl.bits.VIDC = 1; 249 fidvidctl.bits.VIDC = 1;
250 wrmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val); 250 wrmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val);
251 } 251 }
252 } 252 }
253 253
254 254
255 static void change_speed(unsigned int index) 255 static void change_speed(unsigned int index)
256 { 256 {
257 u8 fid, vid; 257 u8 fid, vid;
258 struct cpufreq_freqs freqs; 258 struct cpufreq_freqs freqs;
259 union msr_fidvidstatus fidvidstatus; 259 union msr_fidvidstatus fidvidstatus;
260 int cfid; 260 int cfid;
261 261
262 /* fid are the lower 8 bits of the index we stored into 262 /* fid are the lower 8 bits of the index we stored into
263 * the cpufreq frequency table in powernow_decode_bios, 263 * the cpufreq frequency table in powernow_decode_bios,
264 * vid are the upper 8 bits. 264 * vid are the upper 8 bits.
265 */ 265 */
266 266
267 fid = powernow_table[index].index & 0xFF; 267 fid = powernow_table[index].index & 0xFF;
268 vid = (powernow_table[index].index & 0xFF00) >> 8; 268 vid = (powernow_table[index].index & 0xFF00) >> 8;
269 269
270 freqs.cpu = 0; 270 freqs.cpu = 0;
271 271
272 rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val); 272 rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val);
273 cfid = fidvidstatus.bits.CFID; 273 cfid = fidvidstatus.bits.CFID;
274 freqs.old = fsb * fid_codes[cfid] / 10; 274 freqs.old = fsb * fid_codes[cfid] / 10;
275 275
276 freqs.new = powernow_table[index].frequency; 276 freqs.new = powernow_table[index].frequency;
277 277
278 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 278 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
279 279
280 /* Now do the magic poking into the MSRs. */ 280 /* Now do the magic poking into the MSRs. */
281 281
282 if (have_a0 == 1) /* A0 errata 5 */ 282 if (have_a0 == 1) /* A0 errata 5 */
283 local_irq_disable(); 283 local_irq_disable();
284 284
285 if (freqs.old > freqs.new) { 285 if (freqs.old > freqs.new) {
286 /* Going down, so change FID first */ 286 /* Going down, so change FID first */
287 change_FID(fid); 287 change_FID(fid);
288 change_VID(vid); 288 change_VID(vid);
289 } else { 289 } else {
290 /* Going up, so change VID first */ 290 /* Going up, so change VID first */
291 change_VID(vid); 291 change_VID(vid);
292 change_FID(fid); 292 change_FID(fid);
293 } 293 }
294 294
295 295
296 if (have_a0 == 1) 296 if (have_a0 == 1)
297 local_irq_enable(); 297 local_irq_enable();
298 298
299 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 299 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
300 } 300 }
301 301
302 302
303 #ifdef CONFIG_X86_POWERNOW_K7_ACPI 303 #ifdef CONFIG_X86_POWERNOW_K7_ACPI
304 304
305 static struct acpi_processor_performance *acpi_processor_perf; 305 static struct acpi_processor_performance *acpi_processor_perf;
306 306
307 static int powernow_acpi_init(void) 307 static int powernow_acpi_init(void)
308 { 308 {
309 int i; 309 int i;
310 int retval = 0; 310 int retval = 0;
311 union powernow_acpi_control_t pc; 311 union powernow_acpi_control_t pc;
312 312
313 if (acpi_processor_perf != NULL && powernow_table != NULL) { 313 if (acpi_processor_perf != NULL && powernow_table != NULL) {
314 retval = -EINVAL; 314 retval = -EINVAL;
315 goto err0; 315 goto err0;
316 } 316 }
317 317
318 acpi_processor_perf = kzalloc(sizeof(struct acpi_processor_performance), 318 acpi_processor_perf = kzalloc(sizeof(struct acpi_processor_performance),
319 GFP_KERNEL); 319 GFP_KERNEL);
320 if (!acpi_processor_perf) { 320 if (!acpi_processor_perf) {
321 retval = -ENOMEM; 321 retval = -ENOMEM;
322 goto err0; 322 goto err0;
323 } 323 }
324 324
325 if (!alloc_cpumask_var(&acpi_processor_perf->shared_cpu_map, 325 if (!zalloc_cpumask_var(&acpi_processor_perf->shared_cpu_map,
326 GFP_KERNEL)) { 326 GFP_KERNEL)) {
327 retval = -ENOMEM; 327 retval = -ENOMEM;
328 goto err05; 328 goto err05;
329 } 329 }
330 330
331 if (acpi_processor_register_performance(acpi_processor_perf, 0)) { 331 if (acpi_processor_register_performance(acpi_processor_perf, 0)) {
332 retval = -EIO; 332 retval = -EIO;
333 goto err1; 333 goto err1;
334 } 334 }
335 335
336 if (acpi_processor_perf->control_register.space_id != 336 if (acpi_processor_perf->control_register.space_id !=
337 ACPI_ADR_SPACE_FIXED_HARDWARE) { 337 ACPI_ADR_SPACE_FIXED_HARDWARE) {
338 retval = -ENODEV; 338 retval = -ENODEV;
339 goto err2; 339 goto err2;
340 } 340 }
341 341
342 if (acpi_processor_perf->status_register.space_id != 342 if (acpi_processor_perf->status_register.space_id !=
343 ACPI_ADR_SPACE_FIXED_HARDWARE) { 343 ACPI_ADR_SPACE_FIXED_HARDWARE) {
344 retval = -ENODEV; 344 retval = -ENODEV;
345 goto err2; 345 goto err2;
346 } 346 }
347 347
348 number_scales = acpi_processor_perf->state_count; 348 number_scales = acpi_processor_perf->state_count;
349 349
350 if (number_scales < 2) { 350 if (number_scales < 2) {
351 retval = -ENODEV; 351 retval = -ENODEV;
352 goto err2; 352 goto err2;
353 } 353 }
354 354
355 powernow_table = kzalloc((sizeof(struct cpufreq_frequency_table) * 355 powernow_table = kzalloc((sizeof(struct cpufreq_frequency_table) *
356 (number_scales + 1)), GFP_KERNEL); 356 (number_scales + 1)), GFP_KERNEL);
357 if (!powernow_table) { 357 if (!powernow_table) {
358 retval = -ENOMEM; 358 retval = -ENOMEM;
359 goto err2; 359 goto err2;
360 } 360 }
361 361
362 pc.val = (unsigned long) acpi_processor_perf->states[0].control; 362 pc.val = (unsigned long) acpi_processor_perf->states[0].control;
363 for (i = 0; i < number_scales; i++) { 363 for (i = 0; i < number_scales; i++) {
364 u8 fid, vid; 364 u8 fid, vid;
365 struct acpi_processor_px *state = 365 struct acpi_processor_px *state =
366 &acpi_processor_perf->states[i]; 366 &acpi_processor_perf->states[i];
367 unsigned int speed, speed_mhz; 367 unsigned int speed, speed_mhz;
368 368
369 pc.val = (unsigned long) state->control; 369 pc.val = (unsigned long) state->control;
370 dprintk("acpi: P%d: %d MHz %d mW %d uS control %08x SGTC %d\n", 370 dprintk("acpi: P%d: %d MHz %d mW %d uS control %08x SGTC %d\n",
371 i, 371 i,
372 (u32) state->core_frequency, 372 (u32) state->core_frequency,
373 (u32) state->power, 373 (u32) state->power,
374 (u32) state->transition_latency, 374 (u32) state->transition_latency,
375 (u32) state->control, 375 (u32) state->control,
376 pc.bits.sgtc); 376 pc.bits.sgtc);
377 377
378 vid = pc.bits.vid; 378 vid = pc.bits.vid;
379 fid = pc.bits.fid; 379 fid = pc.bits.fid;
380 380
381 powernow_table[i].frequency = fsb * fid_codes[fid] / 10; 381 powernow_table[i].frequency = fsb * fid_codes[fid] / 10;
382 powernow_table[i].index = fid; /* lower 8 bits */ 382 powernow_table[i].index = fid; /* lower 8 bits */
383 powernow_table[i].index |= (vid << 8); /* upper 8 bits */ 383 powernow_table[i].index |= (vid << 8); /* upper 8 bits */
384 384
385 speed = powernow_table[i].frequency; 385 speed = powernow_table[i].frequency;
386 speed_mhz = speed / 1000; 386 speed_mhz = speed / 1000;
387 387
388 /* processor_perflib will multiply the MHz value by 1000 to 388 /* processor_perflib will multiply the MHz value by 1000 to
389 * get a KHz value (e.g. 1266000). However, powernow-k7 works 389 * get a KHz value (e.g. 1266000). However, powernow-k7 works
390 * with true KHz values (e.g. 1266768). To ensure that all 390 * with true KHz values (e.g. 1266768). To ensure that all
391 * powernow frequencies are available, we must ensure that 391 * powernow frequencies are available, we must ensure that
392 * ACPI doesn't restrict them, so we round up the MHz value 392 * ACPI doesn't restrict them, so we round up the MHz value
393 * to ensure that perflib's computed KHz value is greater than 393 * to ensure that perflib's computed KHz value is greater than
394 * or equal to powernow's KHz value. 394 * or equal to powernow's KHz value.
395 */ 395 */
396 if (speed % 1000 > 0) 396 if (speed % 1000 > 0)
397 speed_mhz++; 397 speed_mhz++;
398 398
399 if ((fid_codes[fid] % 10) == 5) { 399 if ((fid_codes[fid] % 10) == 5) {
400 if (have_a0 == 1) 400 if (have_a0 == 1)
401 invalidate_entry(i); 401 invalidate_entry(i);
402 } 402 }
403 403
404 dprintk(" FID: 0x%x (%d.%dx [%dMHz]) " 404 dprintk(" FID: 0x%x (%d.%dx [%dMHz]) "
405 "VID: 0x%x (%d.%03dV)\n", fid, fid_codes[fid] / 10, 405 "VID: 0x%x (%d.%03dV)\n", fid, fid_codes[fid] / 10,
406 fid_codes[fid] % 10, speed_mhz, vid, 406 fid_codes[fid] % 10, speed_mhz, vid,
407 mobile_vid_table[vid]/1000, 407 mobile_vid_table[vid]/1000,
408 mobile_vid_table[vid]%1000); 408 mobile_vid_table[vid]%1000);
409 409
410 if (state->core_frequency != speed_mhz) { 410 if (state->core_frequency != speed_mhz) {
411 state->core_frequency = speed_mhz; 411 state->core_frequency = speed_mhz;
412 dprintk(" Corrected ACPI frequency to %d\n", 412 dprintk(" Corrected ACPI frequency to %d\n",
413 speed_mhz); 413 speed_mhz);
414 } 414 }
415 415
416 if (latency < pc.bits.sgtc) 416 if (latency < pc.bits.sgtc)
417 latency = pc.bits.sgtc; 417 latency = pc.bits.sgtc;
418 418
419 if (speed < minimum_speed) 419 if (speed < minimum_speed)
420 minimum_speed = speed; 420 minimum_speed = speed;
421 if (speed > maximum_speed) 421 if (speed > maximum_speed)
422 maximum_speed = speed; 422 maximum_speed = speed;
423 } 423 }
424 424
425 powernow_table[i].frequency = CPUFREQ_TABLE_END; 425 powernow_table[i].frequency = CPUFREQ_TABLE_END;
426 powernow_table[i].index = 0; 426 powernow_table[i].index = 0;
427 427
428 /* notify BIOS that we exist */ 428 /* notify BIOS that we exist */
429 acpi_processor_notify_smm(THIS_MODULE); 429 acpi_processor_notify_smm(THIS_MODULE);
430 430
431 return 0; 431 return 0;
432 432
433 err2: 433 err2:
434 acpi_processor_unregister_performance(acpi_processor_perf, 0); 434 acpi_processor_unregister_performance(acpi_processor_perf, 0);
435 err1: 435 err1:
436 free_cpumask_var(acpi_processor_perf->shared_cpu_map); 436 free_cpumask_var(acpi_processor_perf->shared_cpu_map);
437 err05: 437 err05:
438 kfree(acpi_processor_perf); 438 kfree(acpi_processor_perf);
439 err0: 439 err0:
440 printk(KERN_WARNING PFX "ACPI perflib can not be used on " 440 printk(KERN_WARNING PFX "ACPI perflib can not be used on "
441 "this platform\n"); 441 "this platform\n");
442 acpi_processor_perf = NULL; 442 acpi_processor_perf = NULL;
443 return retval; 443 return retval;
444 } 444 }
445 #else 445 #else
446 static int powernow_acpi_init(void) 446 static int powernow_acpi_init(void)
447 { 447 {
448 printk(KERN_INFO PFX "no support for ACPI processor found." 448 printk(KERN_INFO PFX "no support for ACPI processor found."
449 " Please recompile your kernel with ACPI processor\n"); 449 " Please recompile your kernel with ACPI processor\n");
450 return -EINVAL; 450 return -EINVAL;
451 } 451 }
452 #endif 452 #endif
453 453
454 static void print_pst_entry(struct pst_s *pst, unsigned int j) 454 static void print_pst_entry(struct pst_s *pst, unsigned int j)
455 { 455 {
456 dprintk("PST:%d (@%p)\n", j, pst); 456 dprintk("PST:%d (@%p)\n", j, pst);
457 dprintk(" cpuid: 0x%x fsb: %d maxFID: 0x%x startvid: 0x%x\n", 457 dprintk(" cpuid: 0x%x fsb: %d maxFID: 0x%x startvid: 0x%x\n",
458 pst->cpuid, pst->fsbspeed, pst->maxfid, pst->startvid); 458 pst->cpuid, pst->fsbspeed, pst->maxfid, pst->startvid);
459 } 459 }
460 460
461 static int powernow_decode_bios(int maxfid, int startvid) 461 static int powernow_decode_bios(int maxfid, int startvid)
462 { 462 {
463 struct psb_s *psb; 463 struct psb_s *psb;
464 struct pst_s *pst; 464 struct pst_s *pst;
465 unsigned int i, j; 465 unsigned int i, j;
466 unsigned char *p; 466 unsigned char *p;
467 unsigned int etuple; 467 unsigned int etuple;
468 unsigned int ret; 468 unsigned int ret;
469 469
470 etuple = cpuid_eax(0x80000001); 470 etuple = cpuid_eax(0x80000001);
471 471
472 for (i = 0xC0000; i < 0xffff0 ; i += 16) { 472 for (i = 0xC0000; i < 0xffff0 ; i += 16) {
473 473
474 p = phys_to_virt(i); 474 p = phys_to_virt(i);
475 475
476 if (memcmp(p, "AMDK7PNOW!", 10) == 0) { 476 if (memcmp(p, "AMDK7PNOW!", 10) == 0) {
477 dprintk("Found PSB header at %p\n", p); 477 dprintk("Found PSB header at %p\n", p);
478 psb = (struct psb_s *) p; 478 psb = (struct psb_s *) p;
479 dprintk("Table version: 0x%x\n", psb->tableversion); 479 dprintk("Table version: 0x%x\n", psb->tableversion);
480 if (psb->tableversion != 0x12) { 480 if (psb->tableversion != 0x12) {
481 printk(KERN_INFO PFX "Sorry, only v1.2 tables" 481 printk(KERN_INFO PFX "Sorry, only v1.2 tables"
482 " supported right now\n"); 482 " supported right now\n");
483 return -ENODEV; 483 return -ENODEV;
484 } 484 }
485 485
486 dprintk("Flags: 0x%x\n", psb->flags); 486 dprintk("Flags: 0x%x\n", psb->flags);
487 if ((psb->flags & 1) == 0) 487 if ((psb->flags & 1) == 0)
488 dprintk("Mobile voltage regulator\n"); 488 dprintk("Mobile voltage regulator\n");
489 else 489 else
490 dprintk("Desktop voltage regulator\n"); 490 dprintk("Desktop voltage regulator\n");
491 491
492 latency = psb->settlingtime; 492 latency = psb->settlingtime;
493 if (latency < 100) { 493 if (latency < 100) {
494 printk(KERN_INFO PFX "BIOS set settling time " 494 printk(KERN_INFO PFX "BIOS set settling time "
495 "to %d microseconds. " 495 "to %d microseconds. "
496 "Should be at least 100. " 496 "Should be at least 100. "
497 "Correcting.\n", latency); 497 "Correcting.\n", latency);
498 latency = 100; 498 latency = 100;
499 } 499 }
500 dprintk("Settling Time: %d microseconds.\n", 500 dprintk("Settling Time: %d microseconds.\n",
501 psb->settlingtime); 501 psb->settlingtime);
502 dprintk("Has %d PST tables. (Only dumping ones " 502 dprintk("Has %d PST tables. (Only dumping ones "
503 "relevant to this CPU).\n", 503 "relevant to this CPU).\n",
504 psb->numpst); 504 psb->numpst);
505 505
506 p += sizeof(struct psb_s); 506 p += sizeof(struct psb_s);
507 507
508 pst = (struct pst_s *) p; 508 pst = (struct pst_s *) p;
509 509
510 for (j = 0; j < psb->numpst; j++) { 510 for (j = 0; j < psb->numpst; j++) {
511 pst = (struct pst_s *) p; 511 pst = (struct pst_s *) p;
512 number_scales = pst->numpstates; 512 number_scales = pst->numpstates;
513 513
514 if ((etuple == pst->cpuid) && 514 if ((etuple == pst->cpuid) &&
515 check_fsb(pst->fsbspeed) && 515 check_fsb(pst->fsbspeed) &&
516 (maxfid == pst->maxfid) && 516 (maxfid == pst->maxfid) &&
517 (startvid == pst->startvid)) { 517 (startvid == pst->startvid)) {
518 print_pst_entry(pst, j); 518 print_pst_entry(pst, j);
519 p = (char *)pst + sizeof(struct pst_s); 519 p = (char *)pst + sizeof(struct pst_s);
520 ret = get_ranges(p); 520 ret = get_ranges(p);
521 return ret; 521 return ret;
522 } else { 522 } else {
523 unsigned int k; 523 unsigned int k;
524 p = (char *)pst + sizeof(struct pst_s); 524 p = (char *)pst + sizeof(struct pst_s);
525 for (k = 0; k < number_scales; k++) 525 for (k = 0; k < number_scales; k++)
526 p += 2; 526 p += 2;
527 } 527 }
528 } 528 }
529 printk(KERN_INFO PFX "No PST tables match this cpuid " 529 printk(KERN_INFO PFX "No PST tables match this cpuid "
530 "(0x%x)\n", etuple); 530 "(0x%x)\n", etuple);
531 printk(KERN_INFO PFX "This is indicative of a broken " 531 printk(KERN_INFO PFX "This is indicative of a broken "
532 "BIOS.\n"); 532 "BIOS.\n");
533 533
534 return -EINVAL; 534 return -EINVAL;
535 } 535 }
536 p++; 536 p++;
537 } 537 }
538 538
539 return -ENODEV; 539 return -ENODEV;
540 } 540 }
541 541
542 542
543 static int powernow_target(struct cpufreq_policy *policy, 543 static int powernow_target(struct cpufreq_policy *policy,
544 unsigned int target_freq, 544 unsigned int target_freq,
545 unsigned int relation) 545 unsigned int relation)
546 { 546 {
547 unsigned int newstate; 547 unsigned int newstate;
548 548
549 if (cpufreq_frequency_table_target(policy, powernow_table, target_freq, 549 if (cpufreq_frequency_table_target(policy, powernow_table, target_freq,
550 relation, &newstate)) 550 relation, &newstate))
551 return -EINVAL; 551 return -EINVAL;
552 552
553 change_speed(newstate); 553 change_speed(newstate);
554 554
555 return 0; 555 return 0;
556 } 556 }
557 557
558 558
559 static int powernow_verify(struct cpufreq_policy *policy) 559 static int powernow_verify(struct cpufreq_policy *policy)
560 { 560 {
561 return cpufreq_frequency_table_verify(policy, powernow_table); 561 return cpufreq_frequency_table_verify(policy, powernow_table);
562 } 562 }
563 563
564 /* 564 /*
565 * We use the fact that the bus frequency is somehow 565 * We use the fact that the bus frequency is somehow
566 * a multiple of 100000/3 khz, then we compute sgtc according 566 * a multiple of 100000/3 khz, then we compute sgtc according
567 * to this multiple. 567 * to this multiple.
568 * That way, we match more how AMD thinks all of that work. 568 * That way, we match more how AMD thinks all of that work.
569 * We will then get the same kind of behaviour already tested under 569 * We will then get the same kind of behaviour already tested under
570 * the "well-known" other OS. 570 * the "well-known" other OS.
571 */ 571 */
572 static int __init fixup_sgtc(void) 572 static int __init fixup_sgtc(void)
573 { 573 {
574 unsigned int sgtc; 574 unsigned int sgtc;
575 unsigned int m; 575 unsigned int m;
576 576
577 m = fsb / 3333; 577 m = fsb / 3333;
578 if ((m % 10) >= 5) 578 if ((m % 10) >= 5)
579 m += 5; 579 m += 5;
580 580
581 m /= 10; 581 m /= 10;
582 582
583 sgtc = 100 * m * latency; 583 sgtc = 100 * m * latency;
584 sgtc = sgtc / 3; 584 sgtc = sgtc / 3;
585 if (sgtc > 0xfffff) { 585 if (sgtc > 0xfffff) {
586 printk(KERN_WARNING PFX "SGTC too large %d\n", sgtc); 586 printk(KERN_WARNING PFX "SGTC too large %d\n", sgtc);
587 sgtc = 0xfffff; 587 sgtc = 0xfffff;
588 } 588 }
589 return sgtc; 589 return sgtc;
590 } 590 }
591 591
592 static unsigned int powernow_get(unsigned int cpu) 592 static unsigned int powernow_get(unsigned int cpu)
593 { 593 {
594 union msr_fidvidstatus fidvidstatus; 594 union msr_fidvidstatus fidvidstatus;
595 unsigned int cfid; 595 unsigned int cfid;
596 596
597 if (cpu) 597 if (cpu)
598 return 0; 598 return 0;
599 rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val); 599 rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val);
600 cfid = fidvidstatus.bits.CFID; 600 cfid = fidvidstatus.bits.CFID;
601 601
602 return fsb * fid_codes[cfid] / 10; 602 return fsb * fid_codes[cfid] / 10;
603 } 603 }
604 604
605 605
606 static int __init acer_cpufreq_pst(const struct dmi_system_id *d) 606 static int __init acer_cpufreq_pst(const struct dmi_system_id *d)
607 { 607 {
608 printk(KERN_WARNING PFX 608 printk(KERN_WARNING PFX
609 "%s laptop with broken PST tables in BIOS detected.\n", 609 "%s laptop with broken PST tables in BIOS detected.\n",
610 d->ident); 610 d->ident);
611 printk(KERN_WARNING PFX 611 printk(KERN_WARNING PFX
612 "You need to downgrade to 3A21 (09/09/2002), or try a newer " 612 "You need to downgrade to 3A21 (09/09/2002), or try a newer "
613 "BIOS than 3A71 (01/20/2003)\n"); 613 "BIOS than 3A71 (01/20/2003)\n");
614 printk(KERN_WARNING PFX 614 printk(KERN_WARNING PFX
615 "cpufreq scaling has been disabled as a result of this.\n"); 615 "cpufreq scaling has been disabled as a result of this.\n");
616 return 0; 616 return 0;
617 } 617 }
618 618
619 /* 619 /*
620 * Some Athlon laptops have really fucked PST tables. 620 * Some Athlon laptops have really fucked PST tables.
621 * A BIOS update is all that can save them. 621 * A BIOS update is all that can save them.
622 * Mention this, and disable cpufreq. 622 * Mention this, and disable cpufreq.
623 */ 623 */
624 static struct dmi_system_id __initdata powernow_dmi_table[] = { 624 static struct dmi_system_id __initdata powernow_dmi_table[] = {
625 { 625 {
626 .callback = acer_cpufreq_pst, 626 .callback = acer_cpufreq_pst,
627 .ident = "Acer Aspire", 627 .ident = "Acer Aspire",
628 .matches = { 628 .matches = {
629 DMI_MATCH(DMI_SYS_VENDOR, "Insyde Software"), 629 DMI_MATCH(DMI_SYS_VENDOR, "Insyde Software"),
630 DMI_MATCH(DMI_BIOS_VERSION, "3A71"), 630 DMI_MATCH(DMI_BIOS_VERSION, "3A71"),
631 }, 631 },
632 }, 632 },
633 { } 633 { }
634 }; 634 };
635 635
636 static int __init powernow_cpu_init(struct cpufreq_policy *policy) 636 static int __init powernow_cpu_init(struct cpufreq_policy *policy)
637 { 637 {
638 union msr_fidvidstatus fidvidstatus; 638 union msr_fidvidstatus fidvidstatus;
639 int result; 639 int result;
640 640
641 if (policy->cpu != 0) 641 if (policy->cpu != 0)
642 return -ENODEV; 642 return -ENODEV;
643 643
644 rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val); 644 rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val);
645 645
646 recalibrate_cpu_khz(); 646 recalibrate_cpu_khz();
647 647
648 fsb = (10 * cpu_khz) / fid_codes[fidvidstatus.bits.CFID]; 648 fsb = (10 * cpu_khz) / fid_codes[fidvidstatus.bits.CFID];
649 if (!fsb) { 649 if (!fsb) {
650 printk(KERN_WARNING PFX "can not determine bus frequency\n"); 650 printk(KERN_WARNING PFX "can not determine bus frequency\n");
651 return -EINVAL; 651 return -EINVAL;
652 } 652 }
653 dprintk("FSB: %3dMHz\n", fsb/1000); 653 dprintk("FSB: %3dMHz\n", fsb/1000);
654 654
655 if (dmi_check_system(powernow_dmi_table) || acpi_force) { 655 if (dmi_check_system(powernow_dmi_table) || acpi_force) {
656 printk(KERN_INFO PFX "PSB/PST known to be broken. " 656 printk(KERN_INFO PFX "PSB/PST known to be broken. "
657 "Trying ACPI instead\n"); 657 "Trying ACPI instead\n");
658 result = powernow_acpi_init(); 658 result = powernow_acpi_init();
659 } else { 659 } else {
660 result = powernow_decode_bios(fidvidstatus.bits.MFID, 660 result = powernow_decode_bios(fidvidstatus.bits.MFID,
661 fidvidstatus.bits.SVID); 661 fidvidstatus.bits.SVID);
662 if (result) { 662 if (result) {
663 printk(KERN_INFO PFX "Trying ACPI perflib\n"); 663 printk(KERN_INFO PFX "Trying ACPI perflib\n");
664 maximum_speed = 0; 664 maximum_speed = 0;
665 minimum_speed = -1; 665 minimum_speed = -1;
666 latency = 0; 666 latency = 0;
667 result = powernow_acpi_init(); 667 result = powernow_acpi_init();
668 if (result) { 668 if (result) {
669 printk(KERN_INFO PFX 669 printk(KERN_INFO PFX
670 "ACPI and legacy methods failed\n"); 670 "ACPI and legacy methods failed\n");
671 } 671 }
672 } else { 672 } else {
673 /* SGTC use the bus clock as timer */ 673 /* SGTC use the bus clock as timer */
674 latency = fixup_sgtc(); 674 latency = fixup_sgtc();
675 printk(KERN_INFO PFX "SGTC: %d\n", latency); 675 printk(KERN_INFO PFX "SGTC: %d\n", latency);
676 } 676 }
677 } 677 }
678 678
679 if (result) 679 if (result)
680 return result; 680 return result;
681 681
682 printk(KERN_INFO PFX "Minimum speed %d MHz. Maximum speed %d MHz.\n", 682 printk(KERN_INFO PFX "Minimum speed %d MHz. Maximum speed %d MHz.\n",
683 minimum_speed/1000, maximum_speed/1000); 683 minimum_speed/1000, maximum_speed/1000);
684 684
685 policy->cpuinfo.transition_latency = 685 policy->cpuinfo.transition_latency =
686 cpufreq_scale(2000000UL, fsb, latency); 686 cpufreq_scale(2000000UL, fsb, latency);
687 687
688 policy->cur = powernow_get(0); 688 policy->cur = powernow_get(0);
689 689
690 cpufreq_frequency_table_get_attr(powernow_table, policy->cpu); 690 cpufreq_frequency_table_get_attr(powernow_table, policy->cpu);
691 691
692 return cpufreq_frequency_table_cpuinfo(policy, powernow_table); 692 return cpufreq_frequency_table_cpuinfo(policy, powernow_table);
693 } 693 }
694 694
695 static int powernow_cpu_exit(struct cpufreq_policy *policy) 695 static int powernow_cpu_exit(struct cpufreq_policy *policy)
696 { 696 {
697 cpufreq_frequency_table_put_attr(policy->cpu); 697 cpufreq_frequency_table_put_attr(policy->cpu);
698 698
699 #ifdef CONFIG_X86_POWERNOW_K7_ACPI 699 #ifdef CONFIG_X86_POWERNOW_K7_ACPI
700 if (acpi_processor_perf) { 700 if (acpi_processor_perf) {
701 acpi_processor_unregister_performance(acpi_processor_perf, 0); 701 acpi_processor_unregister_performance(acpi_processor_perf, 0);
702 free_cpumask_var(acpi_processor_perf->shared_cpu_map); 702 free_cpumask_var(acpi_processor_perf->shared_cpu_map);
703 kfree(acpi_processor_perf); 703 kfree(acpi_processor_perf);
704 } 704 }
705 #endif 705 #endif
706 706
707 kfree(powernow_table); 707 kfree(powernow_table);
708 return 0; 708 return 0;
709 } 709 }
710 710
711 static struct freq_attr *powernow_table_attr[] = { 711 static struct freq_attr *powernow_table_attr[] = {
712 &cpufreq_freq_attr_scaling_available_freqs, 712 &cpufreq_freq_attr_scaling_available_freqs,
713 NULL, 713 NULL,
714 }; 714 };
715 715
716 static struct cpufreq_driver powernow_driver = { 716 static struct cpufreq_driver powernow_driver = {
717 .verify = powernow_verify, 717 .verify = powernow_verify,
718 .target = powernow_target, 718 .target = powernow_target,
719 .get = powernow_get, 719 .get = powernow_get,
720 .init = powernow_cpu_init, 720 .init = powernow_cpu_init,
721 .exit = powernow_cpu_exit, 721 .exit = powernow_cpu_exit,
722 .name = "powernow-k7", 722 .name = "powernow-k7",
723 .owner = THIS_MODULE, 723 .owner = THIS_MODULE,
724 .attr = powernow_table_attr, 724 .attr = powernow_table_attr,
725 }; 725 };
726 726
727 static int __init powernow_init(void) 727 static int __init powernow_init(void)
728 { 728 {
729 if (check_powernow() == 0) 729 if (check_powernow() == 0)
730 return -ENODEV; 730 return -ENODEV;
731 return cpufreq_register_driver(&powernow_driver); 731 return cpufreq_register_driver(&powernow_driver);
732 } 732 }
733 733
734 734
735 static void __exit powernow_exit(void) 735 static void __exit powernow_exit(void)
736 { 736 {
737 cpufreq_unregister_driver(&powernow_driver); 737 cpufreq_unregister_driver(&powernow_driver);
738 } 738 }
739 739
740 module_param(acpi_force, int, 0444); 740 module_param(acpi_force, int, 0444);
741 MODULE_PARM_DESC(acpi_force, "Force ACPI to be used."); 741 MODULE_PARM_DESC(acpi_force, "Force ACPI to be used.");
742 742
743 MODULE_AUTHOR("Dave Jones <davej@redhat.com>"); 743 MODULE_AUTHOR("Dave Jones <davej@redhat.com>");
744 MODULE_DESCRIPTION("Powernow driver for AMD K7 processors."); 744 MODULE_DESCRIPTION("Powernow driver for AMD K7 processors.");
745 MODULE_LICENSE("GPL"); 745 MODULE_LICENSE("GPL");
746 746
747 late_initcall(powernow_init); 747 late_initcall(powernow_init);
748 module_exit(powernow_exit); 748 module_exit(powernow_exit);
749 749
750 750
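
Editorial note (not part of the diff): the only functional change in the powernow-k7.c hunk above is the switch from alloc_cpumask_var() to zalloc_cpumask_var() at line 325, for the shared_cpu_map field of the kzalloc'd acpi_processor_perf structure. When CONFIG_CPUMASK_OFFSTACK is disabled, cpumask_var_t is an embedded struct cpumask and kzalloc() already leaves it zeroed; when it is enabled (as with MAXSMP), the mask is a separately allocated buffer that alloc_cpumask_var() returns uninitialized. Below is a minimal, illustrative sketch of the two allocation patterns, assuming the usual <linux/cpumask.h> helpers of this kernel generation; the function names old_style/new_style are hypothetical and do not appear in the driver.

#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/gfp.h>

/* Before: callers had to clear the freshly allocated mask themselves. */
static int old_style(cpumask_var_t *mask)
{
	if (!alloc_cpumask_var(mask, GFP_KERNEL))
		return -ENOMEM;
	cpumask_clear(*mask);	/* easy to forget when OFFSTACK is enabled */
	return 0;
}

/* After: zalloc_cpumask_var() hands back the mask already zeroed. */
static int new_style(cpumask_var_t *mask)
{
	if (!zalloc_cpumask_var(mask, GFP_KERNEL))
		return -ENOMEM;
	return 0;
}

Using the zeroing allocator keeps the behaviour identical in both configurations without an explicit cpumask_clear() at every call site, which is why the hunks in this commit are all one-line substitutions.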
arch/x86/kernel/cpu/cpufreq/powernow-k8.c
1 /* 1 /*
2 * (c) 2003-2006 Advanced Micro Devices, Inc. 2 * (c) 2003-2006 Advanced Micro Devices, Inc.
3 * Your use of this code is subject to the terms and conditions of the 3 * Your use of this code is subject to the terms and conditions of the
4 * GNU general public license version 2. See "COPYING" or 4 * GNU general public license version 2. See "COPYING" or
5 * http://www.gnu.org/licenses/gpl.html 5 * http://www.gnu.org/licenses/gpl.html
6 * 6 *
7 * Support : mark.langsdorf@amd.com 7 * Support : mark.langsdorf@amd.com
8 * 8 *
9 * Based on the powernow-k7.c module written by Dave Jones. 9 * Based on the powernow-k7.c module written by Dave Jones.
10 * (C) 2003 Dave Jones on behalf of SuSE Labs 10 * (C) 2003 Dave Jones on behalf of SuSE Labs
11 * (C) 2004 Dominik Brodowski <linux@brodo.de> 11 * (C) 2004 Dominik Brodowski <linux@brodo.de>
12 * (C) 2004 Pavel Machek <pavel@suse.cz> 12 * (C) 2004 Pavel Machek <pavel@suse.cz>
13 * Licensed under the terms of the GNU GPL License version 2. 13 * Licensed under the terms of the GNU GPL License version 2.
14 * Based upon datasheets & sample CPUs kindly provided by AMD. 14 * Based upon datasheets & sample CPUs kindly provided by AMD.
15 * 15 *
16 * Valuable input gratefully received from Dave Jones, Pavel Machek, 16 * Valuable input gratefully received from Dave Jones, Pavel Machek,
17 * Dominik Brodowski, Jacob Shin, and others. 17 * Dominik Brodowski, Jacob Shin, and others.
18 * Originally developed by Paul Devriendt. 18 * Originally developed by Paul Devriendt.
19 * Processor information obtained from Chapter 9 (Power and Thermal Management) 19 * Processor information obtained from Chapter 9 (Power and Thermal Management)
20 * of the "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD 20 * of the "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD
21 * Opteron Processors" available for download from www.amd.com 21 * Opteron Processors" available for download from www.amd.com
22 * 22 *
23 * Tables for specific CPUs can be inferred from 23 * Tables for specific CPUs can be inferred from
24 * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/30430.pdf 24 * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/30430.pdf
25 */ 25 */
26 26
27 #include <linux/kernel.h> 27 #include <linux/kernel.h>
28 #include <linux/smp.h> 28 #include <linux/smp.h>
29 #include <linux/module.h> 29 #include <linux/module.h>
30 #include <linux/init.h> 30 #include <linux/init.h>
31 #include <linux/cpufreq.h> 31 #include <linux/cpufreq.h>
32 #include <linux/slab.h> 32 #include <linux/slab.h>
33 #include <linux/string.h> 33 #include <linux/string.h>
34 #include <linux/cpumask.h> 34 #include <linux/cpumask.h>
35 #include <linux/sched.h> /* for current / set_cpus_allowed() */ 35 #include <linux/sched.h> /* for current / set_cpus_allowed() */
36 #include <linux/io.h> 36 #include <linux/io.h>
37 #include <linux/delay.h> 37 #include <linux/delay.h>
38 38
39 #include <asm/msr.h> 39 #include <asm/msr.h>
40 40
41 #include <linux/acpi.h> 41 #include <linux/acpi.h>
42 #include <linux/mutex.h> 42 #include <linux/mutex.h>
43 #include <acpi/processor.h> 43 #include <acpi/processor.h>
44 44
45 #define PFX "powernow-k8: " 45 #define PFX "powernow-k8: "
46 #define VERSION "version 2.20.00" 46 #define VERSION "version 2.20.00"
47 #include "powernow-k8.h" 47 #include "powernow-k8.h"
48 48
49 /* serialize freq changes */ 49 /* serialize freq changes */
50 static DEFINE_MUTEX(fidvid_mutex); 50 static DEFINE_MUTEX(fidvid_mutex);
51 51
52 static DEFINE_PER_CPU(struct powernow_k8_data *, powernow_data); 52 static DEFINE_PER_CPU(struct powernow_k8_data *, powernow_data);
53 53
54 static int cpu_family = CPU_OPTERON; 54 static int cpu_family = CPU_OPTERON;
55 55
56 #ifndef CONFIG_SMP 56 #ifndef CONFIG_SMP
57 static inline const struct cpumask *cpu_core_mask(int cpu) 57 static inline const struct cpumask *cpu_core_mask(int cpu)
58 { 58 {
59 return cpumask_of(0); 59 return cpumask_of(0);
60 } 60 }
61 #endif 61 #endif
62 62
63 /* Return a frequency in MHz, given an input fid */ 63 /* Return a frequency in MHz, given an input fid */
64 static u32 find_freq_from_fid(u32 fid) 64 static u32 find_freq_from_fid(u32 fid)
65 { 65 {
66 return 800 + (fid * 100); 66 return 800 + (fid * 100);
67 } 67 }
68 68
69 /* Return a frequency in KHz, given an input fid */ 69 /* Return a frequency in KHz, given an input fid */
70 static u32 find_khz_freq_from_fid(u32 fid) 70 static u32 find_khz_freq_from_fid(u32 fid)
71 { 71 {
72 return 1000 * find_freq_from_fid(fid); 72 return 1000 * find_freq_from_fid(fid);
73 } 73 }
74 74
75 static u32 find_khz_freq_from_pstate(struct cpufreq_frequency_table *data, 75 static u32 find_khz_freq_from_pstate(struct cpufreq_frequency_table *data,
76 u32 pstate) 76 u32 pstate)
77 { 77 {
78 return data[pstate].frequency; 78 return data[pstate].frequency;
79 } 79 }
80 80
81 /* Return the vco fid for an input fid 81 /* Return the vco fid for an input fid
82 * 82 *
83 * Each "low" fid has corresponding "high" fid, and you can get to "low" fids 83 * Each "low" fid has corresponding "high" fid, and you can get to "low" fids
84 * only from corresponding high fids. This returns "high" fid corresponding to 84 * only from corresponding high fids. This returns "high" fid corresponding to
85 * "low" one. 85 * "low" one.
86 */ 86 */
87 static u32 convert_fid_to_vco_fid(u32 fid) 87 static u32 convert_fid_to_vco_fid(u32 fid)
88 { 88 {
89 if (fid < HI_FID_TABLE_BOTTOM) 89 if (fid < HI_FID_TABLE_BOTTOM)
90 return 8 + (2 * fid); 90 return 8 + (2 * fid);
91 else 91 else
92 return fid; 92 return fid;
93 } 93 }
94 94
95 /* 95 /*
96 * Return 1 if the pending bit is set. Unless we just instructed the processor 96 * Return 1 if the pending bit is set. Unless we just instructed the processor
97 * to transition to a new state, seeing this bit set is really bad news. 97 * to transition to a new state, seeing this bit set is really bad news.
98 */ 98 */
99 static int pending_bit_stuck(void) 99 static int pending_bit_stuck(void)
100 { 100 {
101 u32 lo, hi; 101 u32 lo, hi;
102 102
103 if (cpu_family == CPU_HW_PSTATE) 103 if (cpu_family == CPU_HW_PSTATE)
104 return 0; 104 return 0;
105 105
106 rdmsr(MSR_FIDVID_STATUS, lo, hi); 106 rdmsr(MSR_FIDVID_STATUS, lo, hi);
107 return lo & MSR_S_LO_CHANGE_PENDING ? 1 : 0; 107 return lo & MSR_S_LO_CHANGE_PENDING ? 1 : 0;
108 } 108 }
109 109
110 /* 110 /*
111 * Update the global current fid / vid values from the status msr. 111 * Update the global current fid / vid values from the status msr.
112 * Returns 1 on error. 112 * Returns 1 on error.
113 */ 113 */
114 static int query_current_values_with_pending_wait(struct powernow_k8_data *data) 114 static int query_current_values_with_pending_wait(struct powernow_k8_data *data)
115 { 115 {
116 u32 lo, hi; 116 u32 lo, hi;
117 u32 i = 0; 117 u32 i = 0;
118 118
119 if (cpu_family == CPU_HW_PSTATE) { 119 if (cpu_family == CPU_HW_PSTATE) {
120 if (data->currpstate == HW_PSTATE_INVALID) { 120 if (data->currpstate == HW_PSTATE_INVALID) {
121 /* read (initial) hw pstate if not yet set */ 121 /* read (initial) hw pstate if not yet set */
122 rdmsr(MSR_PSTATE_STATUS, lo, hi); 122 rdmsr(MSR_PSTATE_STATUS, lo, hi);
123 i = lo & HW_PSTATE_MASK; 123 i = lo & HW_PSTATE_MASK;
124 124
125 /* 125 /*
126 * a workaround for family 11h erratum 311 might cause 126 * a workaround for family 11h erratum 311 might cause
127 * an "out-of-range Pstate if the core is in Pstate-0 127 * an "out-of-range Pstate if the core is in Pstate-0
128 */ 128 */
129 if (i >= data->numps) 129 if (i >= data->numps)
130 data->currpstate = HW_PSTATE_0; 130 data->currpstate = HW_PSTATE_0;
131 else 131 else
132 data->currpstate = i; 132 data->currpstate = i;
133 } 133 }
134 return 0; 134 return 0;
135 } 135 }
136 do { 136 do {
137 if (i++ > 10000) { 137 if (i++ > 10000) {
138 dprintk("detected change pending stuck\n"); 138 dprintk("detected change pending stuck\n");
139 return 1; 139 return 1;
140 } 140 }
141 rdmsr(MSR_FIDVID_STATUS, lo, hi); 141 rdmsr(MSR_FIDVID_STATUS, lo, hi);
142 } while (lo & MSR_S_LO_CHANGE_PENDING); 142 } while (lo & MSR_S_LO_CHANGE_PENDING);
143 143
144 data->currvid = hi & MSR_S_HI_CURRENT_VID; 144 data->currvid = hi & MSR_S_HI_CURRENT_VID;
145 data->currfid = lo & MSR_S_LO_CURRENT_FID; 145 data->currfid = lo & MSR_S_LO_CURRENT_FID;
146 146
147 return 0; 147 return 0;
148 } 148 }
149 149
150 /* the isochronous relief time */ 150 /* the isochronous relief time */
151 static void count_off_irt(struct powernow_k8_data *data) 151 static void count_off_irt(struct powernow_k8_data *data)
152 { 152 {
153 udelay((1 << data->irt) * 10); 153 udelay((1 << data->irt) * 10);
154 return; 154 return;
155 } 155 }
156 156
157 /* the voltage stabilization time */ 157 /* the voltage stabilization time */
158 static void count_off_vst(struct powernow_k8_data *data) 158 static void count_off_vst(struct powernow_k8_data *data)
159 { 159 {
160 udelay(data->vstable * VST_UNITS_20US); 160 udelay(data->vstable * VST_UNITS_20US);
161 return; 161 return;
162 } 162 }
163 163
164 /* need to init the control msr to a safe value (for each cpu) */ 164 /* need to init the control msr to a safe value (for each cpu) */
165 static void fidvid_msr_init(void) 165 static void fidvid_msr_init(void)
166 { 166 {
167 u32 lo, hi; 167 u32 lo, hi;
168 u8 fid, vid; 168 u8 fid, vid;
169 169
170 rdmsr(MSR_FIDVID_STATUS, lo, hi); 170 rdmsr(MSR_FIDVID_STATUS, lo, hi);
171 vid = hi & MSR_S_HI_CURRENT_VID; 171 vid = hi & MSR_S_HI_CURRENT_VID;
172 fid = lo & MSR_S_LO_CURRENT_FID; 172 fid = lo & MSR_S_LO_CURRENT_FID;
173 lo = fid | (vid << MSR_C_LO_VID_SHIFT); 173 lo = fid | (vid << MSR_C_LO_VID_SHIFT);
174 hi = MSR_C_HI_STP_GNT_BENIGN; 174 hi = MSR_C_HI_STP_GNT_BENIGN;
175 dprintk("cpu%d, init lo 0x%x, hi 0x%x\n", smp_processor_id(), lo, hi); 175 dprintk("cpu%d, init lo 0x%x, hi 0x%x\n", smp_processor_id(), lo, hi);
176 wrmsr(MSR_FIDVID_CTL, lo, hi); 176 wrmsr(MSR_FIDVID_CTL, lo, hi);
177 } 177 }
178 178
179 /* write the new fid value along with the other control fields to the msr */ 179 /* write the new fid value along with the other control fields to the msr */
180 static int write_new_fid(struct powernow_k8_data *data, u32 fid) 180 static int write_new_fid(struct powernow_k8_data *data, u32 fid)
181 { 181 {
182 u32 lo; 182 u32 lo;
183 u32 savevid = data->currvid; 183 u32 savevid = data->currvid;
184 u32 i = 0; 184 u32 i = 0;
185 185
186 if ((fid & INVALID_FID_MASK) || (data->currvid & INVALID_VID_MASK)) { 186 if ((fid & INVALID_FID_MASK) || (data->currvid & INVALID_VID_MASK)) {
187 printk(KERN_ERR PFX "internal error - overflow on fid write\n"); 187 printk(KERN_ERR PFX "internal error - overflow on fid write\n");
188 return 1; 188 return 1;
189 } 189 }
190 190
191 lo = fid; 191 lo = fid;
192 lo |= (data->currvid << MSR_C_LO_VID_SHIFT); 192 lo |= (data->currvid << MSR_C_LO_VID_SHIFT);
193 lo |= MSR_C_LO_INIT_FID_VID; 193 lo |= MSR_C_LO_INIT_FID_VID;
194 194
195 dprintk("writing fid 0x%x, lo 0x%x, hi 0x%x\n", 195 dprintk("writing fid 0x%x, lo 0x%x, hi 0x%x\n",
196 fid, lo, data->plllock * PLL_LOCK_CONVERSION); 196 fid, lo, data->plllock * PLL_LOCK_CONVERSION);
197 197
198 do { 198 do {
199 wrmsr(MSR_FIDVID_CTL, lo, data->plllock * PLL_LOCK_CONVERSION); 199 wrmsr(MSR_FIDVID_CTL, lo, data->plllock * PLL_LOCK_CONVERSION);
200 if (i++ > 100) { 200 if (i++ > 100) {
201 printk(KERN_ERR PFX 201 printk(KERN_ERR PFX
202 "Hardware error - pending bit very stuck - " 202 "Hardware error - pending bit very stuck - "
203 "no further pstate changes possible\n"); 203 "no further pstate changes possible\n");
204 return 1; 204 return 1;
205 } 205 }
206 } while (query_current_values_with_pending_wait(data)); 206 } while (query_current_values_with_pending_wait(data));
207 207
208 count_off_irt(data); 208 count_off_irt(data);
209 209
210 if (savevid != data->currvid) { 210 if (savevid != data->currvid) {
211 printk(KERN_ERR PFX 211 printk(KERN_ERR PFX
212 "vid change on fid trans, old 0x%x, new 0x%x\n", 212 "vid change on fid trans, old 0x%x, new 0x%x\n",
213 savevid, data->currvid); 213 savevid, data->currvid);
214 return 1; 214 return 1;
215 } 215 }
216 216
217 if (fid != data->currfid) { 217 if (fid != data->currfid) {
218 printk(KERN_ERR PFX 218 printk(KERN_ERR PFX
219 "fid trans failed, fid 0x%x, curr 0x%x\n", fid, 219 "fid trans failed, fid 0x%x, curr 0x%x\n", fid,
220 data->currfid); 220 data->currfid);
221 return 1; 221 return 1;
222 } 222 }
223 223
224 return 0; 224 return 0;
225 } 225 }
226 226
227 /* Write a new vid to the hardware */ 227 /* Write a new vid to the hardware */
228 static int write_new_vid(struct powernow_k8_data *data, u32 vid) 228 static int write_new_vid(struct powernow_k8_data *data, u32 vid)
229 { 229 {
230 u32 lo; 230 u32 lo;
231 u32 savefid = data->currfid; 231 u32 savefid = data->currfid;
232 int i = 0; 232 int i = 0;
233 233
234 if ((data->currfid & INVALID_FID_MASK) || (vid & INVALID_VID_MASK)) { 234 if ((data->currfid & INVALID_FID_MASK) || (vid & INVALID_VID_MASK)) {
235 printk(KERN_ERR PFX "internal error - overflow on vid write\n"); 235 printk(KERN_ERR PFX "internal error - overflow on vid write\n");
236 return 1; 236 return 1;
237 } 237 }
238 238
239 lo = data->currfid; 239 lo = data->currfid;
240 lo |= (vid << MSR_C_LO_VID_SHIFT); 240 lo |= (vid << MSR_C_LO_VID_SHIFT);
241 lo |= MSR_C_LO_INIT_FID_VID; 241 lo |= MSR_C_LO_INIT_FID_VID;
242 242
243 dprintk("writing vid 0x%x, lo 0x%x, hi 0x%x\n", 243 dprintk("writing vid 0x%x, lo 0x%x, hi 0x%x\n",
244 vid, lo, STOP_GRANT_5NS); 244 vid, lo, STOP_GRANT_5NS);
245 245
246 do { 246 do {
247 wrmsr(MSR_FIDVID_CTL, lo, STOP_GRANT_5NS); 247 wrmsr(MSR_FIDVID_CTL, lo, STOP_GRANT_5NS);
248 if (i++ > 100) { 248 if (i++ > 100) {
249 printk(KERN_ERR PFX "internal error - pending bit " 249 printk(KERN_ERR PFX "internal error - pending bit "
250 "very stuck - no further pstate " 250 "very stuck - no further pstate "
251 "changes possible\n"); 251 "changes possible\n");
252 return 1; 252 return 1;
253 } 253 }
254 } while (query_current_values_with_pending_wait(data)); 254 } while (query_current_values_with_pending_wait(data));
255 255
256 if (savefid != data->currfid) { 256 if (savefid != data->currfid) {
257 printk(KERN_ERR PFX "fid changed on vid trans, old " 257 printk(KERN_ERR PFX "fid changed on vid trans, old "
258 "0x%x new 0x%x\n", 258 "0x%x new 0x%x\n",
259 savefid, data->currfid); 259 savefid, data->currfid);
260 return 1; 260 return 1;
261 } 261 }
262 262
263 if (vid != data->currvid) { 263 if (vid != data->currvid) {
264 printk(KERN_ERR PFX "vid trans failed, vid 0x%x, " 264 printk(KERN_ERR PFX "vid trans failed, vid 0x%x, "
265 "curr 0x%x\n", 265 "curr 0x%x\n",
266 vid, data->currvid); 266 vid, data->currvid);
267 return 1; 267 return 1;
268 } 268 }
269 269
270 return 0; 270 return 0;
271 } 271 }
272 272
273 /* 273 /*
274 * Reduce the vid by the max of step or reqvid. 274 * Reduce the vid by the max of step or reqvid.
275 * Decreasing vid codes represent increasing voltages: 275 * Decreasing vid codes represent increasing voltages:
276 * vid of 0 is 1.550V, vid of 0x1e is 0.800V, vid of VID_OFF is off. 276 * vid of 0 is 1.550V, vid of 0x1e is 0.800V, vid of VID_OFF is off.
277 */ 277 */
278 static int decrease_vid_code_by_step(struct powernow_k8_data *data, 278 static int decrease_vid_code_by_step(struct powernow_k8_data *data,
279 u32 reqvid, u32 step) 279 u32 reqvid, u32 step)
280 { 280 {
281 if ((data->currvid - reqvid) > step) 281 if ((data->currvid - reqvid) > step)
282 reqvid = data->currvid - step; 282 reqvid = data->currvid - step;
283 283
284 if (write_new_vid(data, reqvid)) 284 if (write_new_vid(data, reqvid))
285 return 1; 285 return 1;
286 286
287 count_off_vst(data); 287 count_off_vst(data);
288 288
289 return 0; 289 return 0;
290 } 290 }
291 291
292 /* Change hardware pstate by single MSR write */ 292 /* Change hardware pstate by single MSR write */
293 static int transition_pstate(struct powernow_k8_data *data, u32 pstate) 293 static int transition_pstate(struct powernow_k8_data *data, u32 pstate)
294 { 294 {
295 wrmsr(MSR_PSTATE_CTRL, pstate, 0); 295 wrmsr(MSR_PSTATE_CTRL, pstate, 0);
296 data->currpstate = pstate; 296 data->currpstate = pstate;
297 return 0; 297 return 0;
298 } 298 }
299 299
300 /* Change Opteron/Athlon64 fid and vid, by the 3 phases. */ 300 /* Change Opteron/Athlon64 fid and vid, by the 3 phases. */
301 static int transition_fid_vid(struct powernow_k8_data *data, 301 static int transition_fid_vid(struct powernow_k8_data *data,
302 u32 reqfid, u32 reqvid) 302 u32 reqfid, u32 reqvid)
303 { 303 {
304 if (core_voltage_pre_transition(data, reqvid)) 304 if (core_voltage_pre_transition(data, reqvid))
305 return 1; 305 return 1;
306 306
307 if (core_frequency_transition(data, reqfid)) 307 if (core_frequency_transition(data, reqfid))
308 return 1; 308 return 1;
309 309
310 if (core_voltage_post_transition(data, reqvid)) 310 if (core_voltage_post_transition(data, reqvid))
311 return 1; 311 return 1;
312 312
313 if (query_current_values_with_pending_wait(data)) 313 if (query_current_values_with_pending_wait(data))
314 return 1; 314 return 1;
315 315
316 if ((reqfid != data->currfid) || (reqvid != data->currvid)) { 316 if ((reqfid != data->currfid) || (reqvid != data->currvid)) {
317 printk(KERN_ERR PFX "failed (cpu%d): req 0x%x 0x%x, " 317 printk(KERN_ERR PFX "failed (cpu%d): req 0x%x 0x%x, "
318 "curr 0x%x 0x%x\n", 318 "curr 0x%x 0x%x\n",
319 smp_processor_id(), 319 smp_processor_id(),
320 reqfid, reqvid, data->currfid, data->currvid); 320 reqfid, reqvid, data->currfid, data->currvid);
321 return 1; 321 return 1;
322 } 322 }
323 323
324 dprintk("transitioned (cpu%d): new fid 0x%x, vid 0x%x\n", 324 dprintk("transitioned (cpu%d): new fid 0x%x, vid 0x%x\n",
325 smp_processor_id(), data->currfid, data->currvid); 325 smp_processor_id(), data->currfid, data->currvid);
326 326
327 return 0; 327 return 0;
328 } 328 }
329 329
330 /* Phase 1 - core voltage transition ... setup voltage */ 330 /* Phase 1 - core voltage transition ... setup voltage */
331 static int core_voltage_pre_transition(struct powernow_k8_data *data, 331 static int core_voltage_pre_transition(struct powernow_k8_data *data,
332 u32 reqvid) 332 u32 reqvid)
333 { 333 {
334 u32 rvosteps = data->rvo; 334 u32 rvosteps = data->rvo;
335 u32 savefid = data->currfid; 335 u32 savefid = data->currfid;
336 u32 maxvid, lo; 336 u32 maxvid, lo;
337 337
338 dprintk("ph1 (cpu%d): start, currfid 0x%x, currvid 0x%x, " 338 dprintk("ph1 (cpu%d): start, currfid 0x%x, currvid 0x%x, "
339 "reqvid 0x%x, rvo 0x%x\n", 339 "reqvid 0x%x, rvo 0x%x\n",
340 smp_processor_id(), 340 smp_processor_id(),
341 data->currfid, data->currvid, reqvid, data->rvo); 341 data->currfid, data->currvid, reqvid, data->rvo);
342 342
343 rdmsr(MSR_FIDVID_STATUS, lo, maxvid); 343 rdmsr(MSR_FIDVID_STATUS, lo, maxvid);
344 maxvid = 0x1f & (maxvid >> 16); 344 maxvid = 0x1f & (maxvid >> 16);
345 dprintk("ph1 maxvid=0x%x\n", maxvid); 345 dprintk("ph1 maxvid=0x%x\n", maxvid);
346 if (reqvid < maxvid) /* lower numbers are higher voltages */ 346 if (reqvid < maxvid) /* lower numbers are higher voltages */
347 reqvid = maxvid; 347 reqvid = maxvid;
348 348
349 while (data->currvid > reqvid) { 349 while (data->currvid > reqvid) {
350 dprintk("ph1: curr 0x%x, req vid 0x%x\n", 350 dprintk("ph1: curr 0x%x, req vid 0x%x\n",
351 data->currvid, reqvid); 351 data->currvid, reqvid);
352 if (decrease_vid_code_by_step(data, reqvid, data->vidmvs)) 352 if (decrease_vid_code_by_step(data, reqvid, data->vidmvs))
353 return 1; 353 return 1;
354 } 354 }
355 355
356 while ((rvosteps > 0) && ((data->rvo + data->currvid) > reqvid)) { 356 while ((rvosteps > 0) && ((data->rvo + data->currvid) > reqvid)) {
357 if (data->currvid == maxvid) { 357 if (data->currvid == maxvid) {
358 rvosteps = 0; 358 rvosteps = 0;
359 } else { 359 } else {
360 dprintk("ph1: changing vid for rvo, req 0x%x\n", 360 dprintk("ph1: changing vid for rvo, req 0x%x\n",
361 data->currvid - 1); 361 data->currvid - 1);
362 if (decrease_vid_code_by_step(data, data->currvid-1, 1)) 362 if (decrease_vid_code_by_step(data, data->currvid-1, 1))
363 return 1; 363 return 1;
364 rvosteps--; 364 rvosteps--;
365 } 365 }
366 } 366 }
367 367
368 if (query_current_values_with_pending_wait(data)) 368 if (query_current_values_with_pending_wait(data))
369 return 1; 369 return 1;
370 370
371 if (savefid != data->currfid) { 371 if (savefid != data->currfid) {
372 printk(KERN_ERR PFX "ph1 err, currfid changed 0x%x\n", 372 printk(KERN_ERR PFX "ph1 err, currfid changed 0x%x\n",
373 data->currfid); 373 data->currfid);
374 return 1; 374 return 1;
375 } 375 }
376 376
377 dprintk("ph1 complete, currfid 0x%x, currvid 0x%x\n", 377 dprintk("ph1 complete, currfid 0x%x, currvid 0x%x\n",
378 data->currfid, data->currvid); 378 data->currfid, data->currvid);
379 379
380 return 0; 380 return 0;
381 } 381 }
382 382
383 /* Phase 2 - core frequency transition */ 383 /* Phase 2 - core frequency transition */
384 static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid) 384 static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid)
385 { 385 {
386 u32 vcoreqfid, vcocurrfid, vcofiddiff; 386 u32 vcoreqfid, vcocurrfid, vcofiddiff;
387 u32 fid_interval, savevid = data->currvid; 387 u32 fid_interval, savevid = data->currvid;
388 388
389 if ((reqfid < HI_FID_TABLE_BOTTOM) && 389 if ((reqfid < HI_FID_TABLE_BOTTOM) &&
390 (data->currfid < HI_FID_TABLE_BOTTOM)) { 390 (data->currfid < HI_FID_TABLE_BOTTOM)) {
391 printk(KERN_ERR PFX "ph2: illegal lo-lo transition " 391 printk(KERN_ERR PFX "ph2: illegal lo-lo transition "
392 "0x%x 0x%x\n", reqfid, data->currfid); 392 "0x%x 0x%x\n", reqfid, data->currfid);
393 return 1; 393 return 1;
394 } 394 }
395 395
396 if (data->currfid == reqfid) { 396 if (data->currfid == reqfid) {
397 printk(KERN_ERR PFX "ph2 null fid transition 0x%x\n", 397 printk(KERN_ERR PFX "ph2 null fid transition 0x%x\n",
398 data->currfid); 398 data->currfid);
399 return 0; 399 return 0;
400 } 400 }
401 401
402 dprintk("ph2 (cpu%d): starting, currfid 0x%x, currvid 0x%x, " 402 dprintk("ph2 (cpu%d): starting, currfid 0x%x, currvid 0x%x, "
403 "reqfid 0x%x\n", 403 "reqfid 0x%x\n",
404 smp_processor_id(), 404 smp_processor_id(),
405 data->currfid, data->currvid, reqfid); 405 data->currfid, data->currvid, reqfid);
406 406
407 vcoreqfid = convert_fid_to_vco_fid(reqfid); 407 vcoreqfid = convert_fid_to_vco_fid(reqfid);
408 vcocurrfid = convert_fid_to_vco_fid(data->currfid); 408 vcocurrfid = convert_fid_to_vco_fid(data->currfid);
409 vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid 409 vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid
410 : vcoreqfid - vcocurrfid; 410 : vcoreqfid - vcocurrfid;
411 411
412 while (vcofiddiff > 2) { 412 while (vcofiddiff > 2) {
413 (data->currfid & 1) ? (fid_interval = 1) : (fid_interval = 2); 413 (data->currfid & 1) ? (fid_interval = 1) : (fid_interval = 2);
414 414
415 if (reqfid > data->currfid) { 415 if (reqfid > data->currfid) {
416 if (data->currfid > LO_FID_TABLE_TOP) { 416 if (data->currfid > LO_FID_TABLE_TOP) {
417 if (write_new_fid(data, 417 if (write_new_fid(data,
418 data->currfid + fid_interval)) 418 data->currfid + fid_interval))
419 return 1; 419 return 1;
420 } else { 420 } else {
421 if (write_new_fid 421 if (write_new_fid
422 (data, 422 (data,
423 2 + convert_fid_to_vco_fid(data->currfid))) 423 2 + convert_fid_to_vco_fid(data->currfid)))
424 return 1; 424 return 1;
425 } 425 }
426 } else { 426 } else {
427 if (write_new_fid(data, data->currfid - fid_interval)) 427 if (write_new_fid(data, data->currfid - fid_interval))
428 return 1; 428 return 1;
429 } 429 }
430 430
431 vcocurrfid = convert_fid_to_vco_fid(data->currfid); 431 vcocurrfid = convert_fid_to_vco_fid(data->currfid);
432 vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid 432 vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid
433 : vcoreqfid - vcocurrfid; 433 : vcoreqfid - vcocurrfid;
434 } 434 }
435 435
436 if (write_new_fid(data, reqfid)) 436 if (write_new_fid(data, reqfid))
437 return 1; 437 return 1;
438 438
439 if (query_current_values_with_pending_wait(data)) 439 if (query_current_values_with_pending_wait(data))
440 return 1; 440 return 1;
441 441
442 if (data->currfid != reqfid) { 442 if (data->currfid != reqfid) {
443 printk(KERN_ERR PFX 443 printk(KERN_ERR PFX
444 "ph2: mismatch, failed fid transition, " 444 "ph2: mismatch, failed fid transition, "
445 "curr 0x%x, req 0x%x\n", 445 "curr 0x%x, req 0x%x\n",
446 data->currfid, reqfid); 446 data->currfid, reqfid);
447 return 1; 447 return 1;
448 } 448 }
449 449
450 if (savevid != data->currvid) { 450 if (savevid != data->currvid) {
451 printk(KERN_ERR PFX "ph2: vid changed, save 0x%x, curr 0x%x\n", 451 printk(KERN_ERR PFX "ph2: vid changed, save 0x%x, curr 0x%x\n",
452 savevid, data->currvid); 452 savevid, data->currvid);
453 return 1; 453 return 1;
454 } 454 }
455 455
456 dprintk("ph2 complete, currfid 0x%x, currvid 0x%x\n", 456 dprintk("ph2 complete, currfid 0x%x, currvid 0x%x\n",
457 data->currfid, data->currvid); 457 data->currfid, data->currvid);
458 458
459 return 0; 459 return 0;
460 } 460 }
461 461
462 /* Phase 3 - core voltage transition flow ... jump to the final vid. */ 462 /* Phase 3 - core voltage transition flow ... jump to the final vid. */
463 static int core_voltage_post_transition(struct powernow_k8_data *data, 463 static int core_voltage_post_transition(struct powernow_k8_data *data,
464 u32 reqvid) 464 u32 reqvid)
465 { 465 {
466 u32 savefid = data->currfid; 466 u32 savefid = data->currfid;
467 u32 savereqvid = reqvid; 467 u32 savereqvid = reqvid;
468 468
469 dprintk("ph3 (cpu%d): starting, currfid 0x%x, currvid 0x%x\n", 469 dprintk("ph3 (cpu%d): starting, currfid 0x%x, currvid 0x%x\n",
470 smp_processor_id(), 470 smp_processor_id(),
471 data->currfid, data->currvid); 471 data->currfid, data->currvid);
472 472
473 if (reqvid != data->currvid) { 473 if (reqvid != data->currvid) {
474 if (write_new_vid(data, reqvid)) 474 if (write_new_vid(data, reqvid))
475 return 1; 475 return 1;
476 476
477 if (savefid != data->currfid) { 477 if (savefid != data->currfid) {
478 printk(KERN_ERR PFX 478 printk(KERN_ERR PFX
479 "ph3: bad fid change, save 0x%x, curr 0x%x\n", 479 "ph3: bad fid change, save 0x%x, curr 0x%x\n",
480 savefid, data->currfid); 480 savefid, data->currfid);
481 return 1; 481 return 1;
482 } 482 }
483 483
484 if (data->currvid != reqvid) { 484 if (data->currvid != reqvid) {
485 printk(KERN_ERR PFX 485 printk(KERN_ERR PFX
486 "ph3: failed vid transition\n, " 486 "ph3: failed vid transition\n, "
487 "req 0x%x, curr 0x%x", 487 "req 0x%x, curr 0x%x",
488 reqvid, data->currvid); 488 reqvid, data->currvid);
489 return 1; 489 return 1;
490 } 490 }
491 } 491 }
492 492
493 if (query_current_values_with_pending_wait(data)) 493 if (query_current_values_with_pending_wait(data))
494 return 1; 494 return 1;
495 495
496 if (savereqvid != data->currvid) { 496 if (savereqvid != data->currvid) {
497 dprintk("ph3 failed, currvid 0x%x\n", data->currvid); 497 dprintk("ph3 failed, currvid 0x%x\n", data->currvid);
498 return 1; 498 return 1;
499 } 499 }
500 500
501 if (savefid != data->currfid) { 501 if (savefid != data->currfid) {
502 dprintk("ph3 failed, currfid changed 0x%x\n", 502 dprintk("ph3 failed, currfid changed 0x%x\n",
503 data->currfid); 503 data->currfid);
504 return 1; 504 return 1;
505 } 505 }
506 506
507 dprintk("ph3 complete, currfid 0x%x, currvid 0x%x\n", 507 dprintk("ph3 complete, currfid 0x%x, currvid 0x%x\n",
508 data->currfid, data->currvid); 508 data->currfid, data->currvid);
509 509
510 return 0; 510 return 0;
511 } 511 }
512 512
513 static int check_supported_cpu(unsigned int cpu) 513 static int check_supported_cpu(unsigned int cpu)
514 { 514 {
515 cpumask_t oldmask; 515 cpumask_t oldmask;
516 u32 eax, ebx, ecx, edx; 516 u32 eax, ebx, ecx, edx;
517 unsigned int rc = 0; 517 unsigned int rc = 0;
518 518
519 oldmask = current->cpus_allowed; 519 oldmask = current->cpus_allowed;
520 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); 520 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
521 521
522 if (smp_processor_id() != cpu) { 522 if (smp_processor_id() != cpu) {
523 printk(KERN_ERR PFX "limiting to cpu %u failed\n", cpu); 523 printk(KERN_ERR PFX "limiting to cpu %u failed\n", cpu);
524 goto out; 524 goto out;
525 } 525 }
526 526
527 if (current_cpu_data.x86_vendor != X86_VENDOR_AMD) 527 if (current_cpu_data.x86_vendor != X86_VENDOR_AMD)
528 goto out; 528 goto out;
529 529
530 eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE); 530 eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
531 if (((eax & CPUID_XFAM) != CPUID_XFAM_K8) && 531 if (((eax & CPUID_XFAM) != CPUID_XFAM_K8) &&
532 ((eax & CPUID_XFAM) < CPUID_XFAM_10H)) 532 ((eax & CPUID_XFAM) < CPUID_XFAM_10H))
533 goto out; 533 goto out;
534 534
535 if ((eax & CPUID_XFAM) == CPUID_XFAM_K8) { 535 if ((eax & CPUID_XFAM) == CPUID_XFAM_K8) {
536 if (((eax & CPUID_USE_XFAM_XMOD) != CPUID_USE_XFAM_XMOD) || 536 if (((eax & CPUID_USE_XFAM_XMOD) != CPUID_USE_XFAM_XMOD) ||
537 ((eax & CPUID_XMOD) > CPUID_XMOD_REV_MASK)) { 537 ((eax & CPUID_XMOD) > CPUID_XMOD_REV_MASK)) {
538 printk(KERN_INFO PFX 538 printk(KERN_INFO PFX
539 "Processor cpuid %x not supported\n", eax); 539 "Processor cpuid %x not supported\n", eax);
540 goto out; 540 goto out;
541 } 541 }
542 542
543 eax = cpuid_eax(CPUID_GET_MAX_CAPABILITIES); 543 eax = cpuid_eax(CPUID_GET_MAX_CAPABILITIES);
544 if (eax < CPUID_FREQ_VOLT_CAPABILITIES) { 544 if (eax < CPUID_FREQ_VOLT_CAPABILITIES) {
545 printk(KERN_INFO PFX 545 printk(KERN_INFO PFX
546 "No frequency change capabilities detected\n"); 546 "No frequency change capabilities detected\n");
547 goto out; 547 goto out;
548 } 548 }
549 549
550 cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx); 550 cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx);
551 if ((edx & P_STATE_TRANSITION_CAPABLE) 551 if ((edx & P_STATE_TRANSITION_CAPABLE)
552 != P_STATE_TRANSITION_CAPABLE) { 552 != P_STATE_TRANSITION_CAPABLE) {
553 printk(KERN_INFO PFX 553 printk(KERN_INFO PFX
554 "Power state transitions not supported\n"); 554 "Power state transitions not supported\n");
555 goto out; 555 goto out;
556 } 556 }
557 } else { /* must be a HW Pstate capable processor */ 557 } else { /* must be a HW Pstate capable processor */
558 cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx); 558 cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx);
559 if ((edx & USE_HW_PSTATE) == USE_HW_PSTATE) 559 if ((edx & USE_HW_PSTATE) == USE_HW_PSTATE)
560 cpu_family = CPU_HW_PSTATE; 560 cpu_family = CPU_HW_PSTATE;
561 else 561 else
562 goto out; 562 goto out;
563 } 563 }
564 564
565 rc = 1; 565 rc = 1;
566 566
567 out: 567 out:
568 set_cpus_allowed_ptr(current, &oldmask); 568 set_cpus_allowed_ptr(current, &oldmask);
569 return rc; 569 return rc;
570 } 570 }
571 571
572 static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst, 572 static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst,
573 u8 maxvid) 573 u8 maxvid)
574 { 574 {
575 unsigned int j; 575 unsigned int j;
576 u8 lastfid = 0xff; 576 u8 lastfid = 0xff;
577 577
578 for (j = 0; j < data->numps; j++) { 578 for (j = 0; j < data->numps; j++) {
579 if (pst[j].vid > LEAST_VID) { 579 if (pst[j].vid > LEAST_VID) {
580 printk(KERN_ERR FW_BUG PFX "vid %d invalid : 0x%x\n", 580 printk(KERN_ERR FW_BUG PFX "vid %d invalid : 0x%x\n",
581 j, pst[j].vid); 581 j, pst[j].vid);
582 return -EINVAL; 582 return -EINVAL;
583 } 583 }
584 if (pst[j].vid < data->rvo) { 584 if (pst[j].vid < data->rvo) {
585 /* vid + rvo >= 0 */ 585 /* vid + rvo >= 0 */
586 printk(KERN_ERR FW_BUG PFX "0 vid exceeded with pstate" 586 printk(KERN_ERR FW_BUG PFX "0 vid exceeded with pstate"
587 " %d\n", j); 587 " %d\n", j);
588 return -ENODEV; 588 return -ENODEV;
589 } 589 }
590 if (pst[j].vid < maxvid + data->rvo) { 590 if (pst[j].vid < maxvid + data->rvo) {
591 /* vid + rvo >= maxvid */ 591 /* vid + rvo >= maxvid */
592 printk(KERN_ERR FW_BUG PFX "maxvid exceeded with pstate" 592 printk(KERN_ERR FW_BUG PFX "maxvid exceeded with pstate"
593 " %d\n", j); 593 " %d\n", j);
594 return -ENODEV; 594 return -ENODEV;
595 } 595 }
596 if (pst[j].fid > MAX_FID) { 596 if (pst[j].fid > MAX_FID) {
597 printk(KERN_ERR FW_BUG PFX "maxfid exceeded with pstate" 597 printk(KERN_ERR FW_BUG PFX "maxfid exceeded with pstate"
598 " %d\n", j); 598 " %d\n", j);
599 return -ENODEV; 599 return -ENODEV;
600 } 600 }
601 if (j && (pst[j].fid < HI_FID_TABLE_BOTTOM)) { 601 if (j && (pst[j].fid < HI_FID_TABLE_BOTTOM)) {
602 /* Only first fid is allowed to be in "low" range */ 602 /* Only first fid is allowed to be in "low" range */
603 printk(KERN_ERR FW_BUG PFX "two low fids - %d : " 603 printk(KERN_ERR FW_BUG PFX "two low fids - %d : "
604 "0x%x\n", j, pst[j].fid); 604 "0x%x\n", j, pst[j].fid);
605 return -EINVAL; 605 return -EINVAL;
606 } 606 }
607 if (pst[j].fid < lastfid) 607 if (pst[j].fid < lastfid)
608 lastfid = pst[j].fid; 608 lastfid = pst[j].fid;
609 } 609 }
610 if (lastfid & 1) { 610 if (lastfid & 1) {
611 printk(KERN_ERR FW_BUG PFX "lastfid invalid\n"); 611 printk(KERN_ERR FW_BUG PFX "lastfid invalid\n");
612 return -EINVAL; 612 return -EINVAL;
613 } 613 }
614 if (lastfid > LO_FID_TABLE_TOP) 614 if (lastfid > LO_FID_TABLE_TOP)
615 printk(KERN_INFO FW_BUG PFX 615 printk(KERN_INFO FW_BUG PFX
616 "first fid not from lo freq table\n"); 616 "first fid not from lo freq table\n");
617 617
618 return 0; 618 return 0;
619 } 619 }
620 620
621 static void invalidate_entry(struct powernow_k8_data *data, unsigned int entry) 621 static void invalidate_entry(struct powernow_k8_data *data, unsigned int entry)
622 { 622 {
623 data->powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID; 623 data->powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID;
624 } 624 }
625 625
626 static void print_basics(struct powernow_k8_data *data) 626 static void print_basics(struct powernow_k8_data *data)
627 { 627 {
628 int j; 628 int j;
629 for (j = 0; j < data->numps; j++) { 629 for (j = 0; j < data->numps; j++) {
630 if (data->powernow_table[j].frequency != 630 if (data->powernow_table[j].frequency !=
631 CPUFREQ_ENTRY_INVALID) { 631 CPUFREQ_ENTRY_INVALID) {
632 if (cpu_family == CPU_HW_PSTATE) { 632 if (cpu_family == CPU_HW_PSTATE) {
633 printk(KERN_INFO PFX 633 printk(KERN_INFO PFX
634 " %d : pstate %d (%d MHz)\n", j, 634 " %d : pstate %d (%d MHz)\n", j,
635 data->powernow_table[j].index, 635 data->powernow_table[j].index,
636 data->powernow_table[j].frequency/1000); 636 data->powernow_table[j].frequency/1000);
637 } else { 637 } else {
638 printk(KERN_INFO PFX 638 printk(KERN_INFO PFX
639 " %d : fid 0x%x (%d MHz), vid 0x%x\n", 639 " %d : fid 0x%x (%d MHz), vid 0x%x\n",
640 j, 640 j,
641 data->powernow_table[j].index & 0xff, 641 data->powernow_table[j].index & 0xff,
642 data->powernow_table[j].frequency/1000, 642 data->powernow_table[j].frequency/1000,
643 data->powernow_table[j].index >> 8); 643 data->powernow_table[j].index >> 8);
644 } 644 }
645 } 645 }
646 } 646 }
647 if (data->batps) 647 if (data->batps)
648 printk(KERN_INFO PFX "Only %d pstates on battery\n", 648 printk(KERN_INFO PFX "Only %d pstates on battery\n",
649 data->batps); 649 data->batps);
650 } 650 }
651 651
652 static u32 freq_from_fid_did(u32 fid, u32 did) 652 static u32 freq_from_fid_did(u32 fid, u32 did)
653 { 653 {
654 u32 mhz = 0; 654 u32 mhz = 0;
655 655
656 if (boot_cpu_data.x86 == 0x10) 656 if (boot_cpu_data.x86 == 0x10)
657 mhz = (100 * (fid + 0x10)) >> did; 657 mhz = (100 * (fid + 0x10)) >> did;
658 else if (boot_cpu_data.x86 == 0x11) 658 else if (boot_cpu_data.x86 == 0x11)
659 mhz = (100 * (fid + 8)) >> did; 659 mhz = (100 * (fid + 8)) >> did;
660 else 660 else
661 BUG(); 661 BUG();
662 662
663 return mhz * 1000; 663 return mhz * 1000;
664 } 664 }
665 665
666 static int fill_powernow_table(struct powernow_k8_data *data, 666 static int fill_powernow_table(struct powernow_k8_data *data,
667 struct pst_s *pst, u8 maxvid) 667 struct pst_s *pst, u8 maxvid)
668 { 668 {
669 struct cpufreq_frequency_table *powernow_table; 669 struct cpufreq_frequency_table *powernow_table;
670 unsigned int j; 670 unsigned int j;
671 671
672 if (data->batps) { 672 if (data->batps) {
673 /* use ACPI support to get full speed on mains power */ 673 /* use ACPI support to get full speed on mains power */
674 printk(KERN_WARNING PFX 674 printk(KERN_WARNING PFX
675 "Only %d pstates usable (use ACPI driver for full " 675 "Only %d pstates usable (use ACPI driver for full "
676 "range\n", data->batps); 676 "range\n", data->batps);
677 data->numps = data->batps; 677 data->numps = data->batps;
678 } 678 }
679 679
680 for (j = 1; j < data->numps; j++) { 680 for (j = 1; j < data->numps; j++) {
681 if (pst[j-1].fid >= pst[j].fid) { 681 if (pst[j-1].fid >= pst[j].fid) {
682 printk(KERN_ERR PFX "PST out of sequence\n"); 682 printk(KERN_ERR PFX "PST out of sequence\n");
683 return -EINVAL; 683 return -EINVAL;
684 } 684 }
685 } 685 }
686 686
687 if (data->numps < 2) { 687 if (data->numps < 2) {
688 printk(KERN_ERR PFX "no p states to transition\n"); 688 printk(KERN_ERR PFX "no p states to transition\n");
689 return -ENODEV; 689 return -ENODEV;
690 } 690 }
691 691
692 if (check_pst_table(data, pst, maxvid)) 692 if (check_pst_table(data, pst, maxvid))
693 return -EINVAL; 693 return -EINVAL;
694 694
695 powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table) 695 powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table)
696 * (data->numps + 1)), GFP_KERNEL); 696 * (data->numps + 1)), GFP_KERNEL);
697 if (!powernow_table) { 697 if (!powernow_table) {
698 printk(KERN_ERR PFX "powernow_table memory alloc failure\n"); 698 printk(KERN_ERR PFX "powernow_table memory alloc failure\n");
699 return -ENOMEM; 699 return -ENOMEM;
700 } 700 }
701 701
702 for (j = 0; j < data->numps; j++) { 702 for (j = 0; j < data->numps; j++) {
703 int freq; 703 int freq;
704 powernow_table[j].index = pst[j].fid; /* lower 8 bits */ 704 powernow_table[j].index = pst[j].fid; /* lower 8 bits */
705 powernow_table[j].index |= (pst[j].vid << 8); /* upper 8 bits */ 705 powernow_table[j].index |= (pst[j].vid << 8); /* upper 8 bits */
706 freq = find_khz_freq_from_fid(pst[j].fid); 706 freq = find_khz_freq_from_fid(pst[j].fid);
707 powernow_table[j].frequency = freq; 707 powernow_table[j].frequency = freq;
708 } 708 }
709 powernow_table[data->numps].frequency = CPUFREQ_TABLE_END; 709 powernow_table[data->numps].frequency = CPUFREQ_TABLE_END;
710 powernow_table[data->numps].index = 0; 710 powernow_table[data->numps].index = 0;
711 711
712 if (query_current_values_with_pending_wait(data)) { 712 if (query_current_values_with_pending_wait(data)) {
713 kfree(powernow_table); 713 kfree(powernow_table);
714 return -EIO; 714 return -EIO;
715 } 715 }
716 716
717 dprintk("cfid 0x%x, cvid 0x%x\n", data->currfid, data->currvid); 717 dprintk("cfid 0x%x, cvid 0x%x\n", data->currfid, data->currvid);
718 data->powernow_table = powernow_table; 718 data->powernow_table = powernow_table;
719 if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu) 719 if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu)
720 print_basics(data); 720 print_basics(data);
721 721
722 for (j = 0; j < data->numps; j++) 722 for (j = 0; j < data->numps; j++)
723 if ((pst[j].fid == data->currfid) && 723 if ((pst[j].fid == data->currfid) &&
724 (pst[j].vid == data->currvid)) 724 (pst[j].vid == data->currvid))
725 return 0; 725 return 0;
726 726
727 dprintk("currfid/vid do not match PST, ignoring\n"); 727 dprintk("currfid/vid do not match PST, ignoring\n");
728 return 0; 728 return 0;
729 } 729 }
730 730
731 /* Find and validate the PSB/PST table in BIOS. */ 731 /* Find and validate the PSB/PST table in BIOS. */
732 static int find_psb_table(struct powernow_k8_data *data) 732 static int find_psb_table(struct powernow_k8_data *data)
733 { 733 {
734 struct psb_s *psb; 734 struct psb_s *psb;
735 unsigned int i; 735 unsigned int i;
736 u32 mvs; 736 u32 mvs;
737 u8 maxvid; 737 u8 maxvid;
738 u32 cpst = 0; 738 u32 cpst = 0;
739 u32 thiscpuid; 739 u32 thiscpuid;
740 740
741 for (i = 0xc0000; i < 0xffff0; i += 0x10) { 741 for (i = 0xc0000; i < 0xffff0; i += 0x10) {
742 /* Scan BIOS looking for the signature. */ 742 /* Scan BIOS looking for the signature. */
743 /* It can not be at ffff0 - it is too big. */ 743 /* It can not be at ffff0 - it is too big. */
744 744
745 psb = phys_to_virt(i); 745 psb = phys_to_virt(i);
746 if (memcmp(psb, PSB_ID_STRING, PSB_ID_STRING_LEN) != 0) 746 if (memcmp(psb, PSB_ID_STRING, PSB_ID_STRING_LEN) != 0)
747 continue; 747 continue;
748 748
749 dprintk("found PSB header at 0x%p\n", psb); 749 dprintk("found PSB header at 0x%p\n", psb);
750 750
751 dprintk("table vers: 0x%x\n", psb->tableversion); 751 dprintk("table vers: 0x%x\n", psb->tableversion);
752 if (psb->tableversion != PSB_VERSION_1_4) { 752 if (psb->tableversion != PSB_VERSION_1_4) {
753 printk(KERN_ERR FW_BUG PFX "PSB table is not v1.4\n"); 753 printk(KERN_ERR FW_BUG PFX "PSB table is not v1.4\n");
754 return -ENODEV; 754 return -ENODEV;
755 } 755 }
756 756
757 dprintk("flags: 0x%x\n", psb->flags1); 757 dprintk("flags: 0x%x\n", psb->flags1);
758 if (psb->flags1) { 758 if (psb->flags1) {
759 printk(KERN_ERR FW_BUG PFX "unknown flags\n"); 759 printk(KERN_ERR FW_BUG PFX "unknown flags\n");
760 return -ENODEV; 760 return -ENODEV;
761 } 761 }
762 762
763 data->vstable = psb->vstable; 763 data->vstable = psb->vstable;
764 dprintk("voltage stabilization time: %d(*20us)\n", 764 dprintk("voltage stabilization time: %d(*20us)\n",
765 data->vstable); 765 data->vstable);
766 766
767 dprintk("flags2: 0x%x\n", psb->flags2); 767 dprintk("flags2: 0x%x\n", psb->flags2);
768 data->rvo = psb->flags2 & 3; 768 data->rvo = psb->flags2 & 3;
769 data->irt = ((psb->flags2) >> 2) & 3; 769 data->irt = ((psb->flags2) >> 2) & 3;
770 mvs = ((psb->flags2) >> 4) & 3; 770 mvs = ((psb->flags2) >> 4) & 3;
771 data->vidmvs = 1 << mvs; 771 data->vidmvs = 1 << mvs;
772 data->batps = ((psb->flags2) >> 6) & 3; 772 data->batps = ((psb->flags2) >> 6) & 3;
773 773
774 dprintk("ramp voltage offset: %d\n", data->rvo); 774 dprintk("ramp voltage offset: %d\n", data->rvo);
775 dprintk("isochronous relief time: %d\n", data->irt); 775 dprintk("isochronous relief time: %d\n", data->irt);
776 dprintk("maximum voltage step: %d - 0x%x\n", mvs, data->vidmvs); 776 dprintk("maximum voltage step: %d - 0x%x\n", mvs, data->vidmvs);
777 777
778 dprintk("numpst: 0x%x\n", psb->num_tables); 778 dprintk("numpst: 0x%x\n", psb->num_tables);
779 cpst = psb->num_tables; 779 cpst = psb->num_tables;
780 if ((psb->cpuid == 0x00000fc0) || 780 if ((psb->cpuid == 0x00000fc0) ||
781 (psb->cpuid == 0x00000fe0)) { 781 (psb->cpuid == 0x00000fe0)) {
782 thiscpuid = cpuid_eax(CPUID_PROCESSOR_SIGNATURE); 782 thiscpuid = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
783 if ((thiscpuid == 0x00000fc0) || 783 if ((thiscpuid == 0x00000fc0) ||
784 (thiscpuid == 0x00000fe0)) 784 (thiscpuid == 0x00000fe0))
785 cpst = 1; 785 cpst = 1;
786 } 786 }
787 if (cpst != 1) { 787 if (cpst != 1) {
788 printk(KERN_ERR FW_BUG PFX "numpst must be 1\n"); 788 printk(KERN_ERR FW_BUG PFX "numpst must be 1\n");
789 return -ENODEV; 789 return -ENODEV;
790 } 790 }
791 791
792 data->plllock = psb->plllocktime; 792 data->plllock = psb->plllocktime;
793 dprintk("plllocktime: 0x%x (units 1us)\n", psb->plllocktime); 793 dprintk("plllocktime: 0x%x (units 1us)\n", psb->plllocktime);
794 dprintk("maxfid: 0x%x\n", psb->maxfid); 794 dprintk("maxfid: 0x%x\n", psb->maxfid);
795 dprintk("maxvid: 0x%x\n", psb->maxvid); 795 dprintk("maxvid: 0x%x\n", psb->maxvid);
796 maxvid = psb->maxvid; 796 maxvid = psb->maxvid;
797 797
798 data->numps = psb->numps; 798 data->numps = psb->numps;
799 dprintk("numpstates: 0x%x\n", data->numps); 799 dprintk("numpstates: 0x%x\n", data->numps);
800 return fill_powernow_table(data, 800 return fill_powernow_table(data,
801 (struct pst_s *)(psb+1), maxvid); 801 (struct pst_s *)(psb+1), maxvid);
802 } 802 }
803 /* 803 /*
804 * If you see this message, complain to BIOS manufacturer. If 804 * If you see this message, complain to BIOS manufacturer. If
805 * he tells you "we do not support Linux" or some similar 805 * he tells you "we do not support Linux" or some similar
806 * nonsense, remember that Windows 2000 uses the same legacy 806 * nonsense, remember that Windows 2000 uses the same legacy
807 * mechanism that the old Linux PSB driver uses. Tell them it 807 * mechanism that the old Linux PSB driver uses. Tell them it
808 * is broken with Windows 2000. 808 * is broken with Windows 2000.
809 * 809 *
810 * The reference to the AMD documentation is chapter 9 in the 810 * The reference to the AMD documentation is chapter 9 in the
811 * BIOS and Kernel Developer's Guide, which is available on 811 * BIOS and Kernel Developer's Guide, which is available on
812 * www.amd.com 812 * www.amd.com
813 */ 813 */
814 printk(KERN_ERR FW_BUG PFX "No PSB or ACPI _PSS objects\n"); 814 printk(KERN_ERR FW_BUG PFX "No PSB or ACPI _PSS objects\n");
815 return -ENODEV; 815 return -ENODEV;
816 } 816 }
817 817
818 static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, 818 static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data,
819 unsigned int index) 819 unsigned int index)
820 { 820 {
821 acpi_integer control; 821 acpi_integer control;
822 822
823 if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE)) 823 if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE))
824 return; 824 return;
825 825
826 control = data->acpi_data.states[index].control; 826 control = data->acpi_data.states[index].control;
827 data->irt = (control >> IRT_SHIFT) & IRT_MASK; 827 data->irt = (control >> IRT_SHIFT) & IRT_MASK;
828 data->rvo = (control >> RVO_SHIFT) & RVO_MASK; 828 data->rvo = (control >> RVO_SHIFT) & RVO_MASK;
829 data->exttype = (control >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK; 829 data->exttype = (control >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK;
830 data->plllock = (control >> PLL_L_SHIFT) & PLL_L_MASK; 830 data->plllock = (control >> PLL_L_SHIFT) & PLL_L_MASK;
831 data->vidmvs = 1 << ((control >> MVS_SHIFT) & MVS_MASK); 831 data->vidmvs = 1 << ((control >> MVS_SHIFT) & MVS_MASK);
832 data->vstable = (control >> VST_SHIFT) & VST_MASK; } 832 data->vstable = (control >> VST_SHIFT) & VST_MASK; }
833 833
834 static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) 834 static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
835 { 835 {
836 struct cpufreq_frequency_table *powernow_table; 836 struct cpufreq_frequency_table *powernow_table;
837 int ret_val = -ENODEV; 837 int ret_val = -ENODEV;
838 acpi_integer control, status; 838 acpi_integer control, status;
839 839
840 if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) { 840 if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) {
841 dprintk("register performance failed: bad ACPI data\n"); 841 dprintk("register performance failed: bad ACPI data\n");
842 return -EIO; 842 return -EIO;
843 } 843 }
844 844
845 /* verify the data contained in the ACPI structures */ 845 /* verify the data contained in the ACPI structures */
846 if (data->acpi_data.state_count <= 1) { 846 if (data->acpi_data.state_count <= 1) {
847 dprintk("No ACPI P-States\n"); 847 dprintk("No ACPI P-States\n");
848 goto err_out; 848 goto err_out;
849 } 849 }
850 850
851 control = data->acpi_data.control_register.space_id; 851 control = data->acpi_data.control_register.space_id;
852 status = data->acpi_data.status_register.space_id; 852 status = data->acpi_data.status_register.space_id;
853 853
854 if ((control != ACPI_ADR_SPACE_FIXED_HARDWARE) || 854 if ((control != ACPI_ADR_SPACE_FIXED_HARDWARE) ||
855 (status != ACPI_ADR_SPACE_FIXED_HARDWARE)) { 855 (status != ACPI_ADR_SPACE_FIXED_HARDWARE)) {
856 dprintk("Invalid control/status registers (%x - %x)\n", 856 dprintk("Invalid control/status registers (%x - %x)\n",
857 control, status); 857 control, status);
858 goto err_out; 858 goto err_out;
859 } 859 }
860 860
861 /* fill in data->powernow_table */ 861 /* fill in data->powernow_table */
862 powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table) 862 powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table)
863 * (data->acpi_data.state_count + 1)), GFP_KERNEL); 863 * (data->acpi_data.state_count + 1)), GFP_KERNEL);
864 if (!powernow_table) { 864 if (!powernow_table) {
865 dprintk("powernow_table memory alloc failure\n"); 865 dprintk("powernow_table memory alloc failure\n");
866 goto err_out; 866 goto err_out;
867 } 867 }
868 868
869 if (cpu_family == CPU_HW_PSTATE) 869 if (cpu_family == CPU_HW_PSTATE)
870 ret_val = fill_powernow_table_pstate(data, powernow_table); 870 ret_val = fill_powernow_table_pstate(data, powernow_table);
871 else 871 else
872 ret_val = fill_powernow_table_fidvid(data, powernow_table); 872 ret_val = fill_powernow_table_fidvid(data, powernow_table);
873 if (ret_val) 873 if (ret_val)
874 goto err_out_mem; 874 goto err_out_mem;
875 875
876 powernow_table[data->acpi_data.state_count].frequency = 876 powernow_table[data->acpi_data.state_count].frequency =
877 CPUFREQ_TABLE_END; 877 CPUFREQ_TABLE_END;
878 powernow_table[data->acpi_data.state_count].index = 0; 878 powernow_table[data->acpi_data.state_count].index = 0;
879 data->powernow_table = powernow_table; 879 data->powernow_table = powernow_table;
880 880
881 /* fill in data */ 881 /* fill in data */
882 data->numps = data->acpi_data.state_count; 882 data->numps = data->acpi_data.state_count;
883 if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu) 883 if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu)
884 print_basics(data); 884 print_basics(data);
885 powernow_k8_acpi_pst_values(data, 0); 885 powernow_k8_acpi_pst_values(data, 0);
886 886
887 /* notify BIOS that we exist */ 887 /* notify BIOS that we exist */
888 acpi_processor_notify_smm(THIS_MODULE); 888 acpi_processor_notify_smm(THIS_MODULE);
889 889
890 if (!alloc_cpumask_var(&data->acpi_data.shared_cpu_map, GFP_KERNEL)) { 890 if (!zalloc_cpumask_var(&data->acpi_data.shared_cpu_map, GFP_KERNEL)) {
891 printk(KERN_ERR PFX 891 printk(KERN_ERR PFX
892 "unable to alloc powernow_k8_data cpumask\n"); 892 "unable to alloc powernow_k8_data cpumask\n");
893 ret_val = -ENOMEM; 893 ret_val = -ENOMEM;
894 goto err_out_mem; 894 goto err_out_mem;
895 } 895 }
896 896
897 return 0; 897 return 0;
898 898
899 err_out_mem: 899 err_out_mem:
900 kfree(powernow_table); 900 kfree(powernow_table);
901 901
902 err_out: 902 err_out:
903 acpi_processor_unregister_performance(&data->acpi_data, data->cpu); 903 acpi_processor_unregister_performance(&data->acpi_data, data->cpu);
904 904
905 /* data->acpi_data.state_count informs us at ->exit() 905 /* data->acpi_data.state_count informs us at ->exit()
906 * whether ACPI was used */ 906 * whether ACPI was used */
907 data->acpi_data.state_count = 0; 907 data->acpi_data.state_count = 0;
908 908
909 return ret_val; 909 return ret_val;
910 } 910 }
911 911
912 static int fill_powernow_table_pstate(struct powernow_k8_data *data, 912 static int fill_powernow_table_pstate(struct powernow_k8_data *data,
913 struct cpufreq_frequency_table *powernow_table) 913 struct cpufreq_frequency_table *powernow_table)
914 { 914 {
915 int i; 915 int i;
916 u32 hi = 0, lo = 0; 916 u32 hi = 0, lo = 0;
917 rdmsr(MSR_PSTATE_CUR_LIMIT, hi, lo); 917 rdmsr(MSR_PSTATE_CUR_LIMIT, hi, lo);
918 data->max_hw_pstate = (hi & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT; 918 data->max_hw_pstate = (hi & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT;
919 919
920 for (i = 0; i < data->acpi_data.state_count; i++) { 920 for (i = 0; i < data->acpi_data.state_count; i++) {
921 u32 index; 921 u32 index;
922 922
923 index = data->acpi_data.states[i].control & HW_PSTATE_MASK; 923 index = data->acpi_data.states[i].control & HW_PSTATE_MASK;
924 if (index > data->max_hw_pstate) { 924 if (index > data->max_hw_pstate) {
925 printk(KERN_ERR PFX "invalid pstate %d - " 925 printk(KERN_ERR PFX "invalid pstate %d - "
926 "bad value %d.\n", i, index); 926 "bad value %d.\n", i, index);
927 printk(KERN_ERR PFX "Please report to BIOS " 927 printk(KERN_ERR PFX "Please report to BIOS "
928 "manufacturer\n"); 928 "manufacturer\n");
929 invalidate_entry(data, i); 929 invalidate_entry(data, i);
930 continue; 930 continue;
931 } 931 }
932 rdmsr(MSR_PSTATE_DEF_BASE + index, lo, hi); 932 rdmsr(MSR_PSTATE_DEF_BASE + index, lo, hi);
933 if (!(hi & HW_PSTATE_VALID_MASK)) { 933 if (!(hi & HW_PSTATE_VALID_MASK)) {
934 dprintk("invalid pstate %d, ignoring\n", index); 934 dprintk("invalid pstate %d, ignoring\n", index);
935 invalidate_entry(data, i); 935 invalidate_entry(data, i);
936 continue; 936 continue;
937 } 937 }
938 938
939 powernow_table[i].index = index; 939 powernow_table[i].index = index;
940 940
941 /* Frequency may be rounded for these */ 941 /* Frequency may be rounded for these */
942 if (boot_cpu_data.x86 == 0x10 || boot_cpu_data.x86 == 0x11) { 942 if (boot_cpu_data.x86 == 0x10 || boot_cpu_data.x86 == 0x11) {
943 powernow_table[i].frequency = 943 powernow_table[i].frequency =
944 freq_from_fid_did(lo & 0x3f, (lo >> 6) & 7); 944 freq_from_fid_did(lo & 0x3f, (lo >> 6) & 7);
945 } else 945 } else
946 powernow_table[i].frequency = 946 powernow_table[i].frequency =
947 data->acpi_data.states[i].core_frequency * 1000; 947 data->acpi_data.states[i].core_frequency * 1000;
948 } 948 }
949 return 0; 949 return 0;
950 } 950 }
951 951
952 static int fill_powernow_table_fidvid(struct powernow_k8_data *data, 952 static int fill_powernow_table_fidvid(struct powernow_k8_data *data,
953 struct cpufreq_frequency_table *powernow_table) 953 struct cpufreq_frequency_table *powernow_table)
954 { 954 {
955 int i; 955 int i;
956 int cntlofreq = 0; 956 int cntlofreq = 0;
957 957
958 for (i = 0; i < data->acpi_data.state_count; i++) { 958 for (i = 0; i < data->acpi_data.state_count; i++) {
959 u32 fid; 959 u32 fid;
960 u32 vid; 960 u32 vid;
961 u32 freq, index; 961 u32 freq, index;
962 acpi_integer status, control; 962 acpi_integer status, control;
963 963
964 if (data->exttype) { 964 if (data->exttype) {
965 status = data->acpi_data.states[i].status; 965 status = data->acpi_data.states[i].status;
966 fid = status & EXT_FID_MASK; 966 fid = status & EXT_FID_MASK;
967 vid = (status >> VID_SHIFT) & EXT_VID_MASK; 967 vid = (status >> VID_SHIFT) & EXT_VID_MASK;
968 } else { 968 } else {
969 control = data->acpi_data.states[i].control; 969 control = data->acpi_data.states[i].control;
970 fid = control & FID_MASK; 970 fid = control & FID_MASK;
971 vid = (control >> VID_SHIFT) & VID_MASK; 971 vid = (control >> VID_SHIFT) & VID_MASK;
972 } 972 }
973 973
974 dprintk(" %d : fid 0x%x, vid 0x%x\n", i, fid, vid); 974 dprintk(" %d : fid 0x%x, vid 0x%x\n", i, fid, vid);
975 975
976 index = fid | (vid<<8); 976 index = fid | (vid<<8);
977 powernow_table[i].index = index; 977 powernow_table[i].index = index;
978 978
979 freq = find_khz_freq_from_fid(fid); 979 freq = find_khz_freq_from_fid(fid);
980 powernow_table[i].frequency = freq; 980 powernow_table[i].frequency = freq;
981 981
982 /* verify frequency is OK */ 982 /* verify frequency is OK */
983 if ((freq > (MAX_FREQ * 1000)) || (freq < (MIN_FREQ * 1000))) { 983 if ((freq > (MAX_FREQ * 1000)) || (freq < (MIN_FREQ * 1000))) {
984 dprintk("invalid freq %u kHz, ignoring\n", freq); 984 dprintk("invalid freq %u kHz, ignoring\n", freq);
985 invalidate_entry(data, i); 985 invalidate_entry(data, i);
986 continue; 986 continue;
987 } 987 }
988 988
989 /* verify voltage is OK - 989 /* verify voltage is OK -
990 * BIOSs are using "off" to indicate invalid */ 990 * BIOSs are using "off" to indicate invalid */
991 if (vid == VID_OFF) { 991 if (vid == VID_OFF) {
992 dprintk("invalid vid %u, ignoring\n", vid); 992 dprintk("invalid vid %u, ignoring\n", vid);
993 invalidate_entry(data, i); 993 invalidate_entry(data, i);
994 continue; 994 continue;
995 } 995 }
996 996
997 /* verify only 1 entry from the lo frequency table */ 997 /* verify only 1 entry from the lo frequency table */
998 if (fid < HI_FID_TABLE_BOTTOM) { 998 if (fid < HI_FID_TABLE_BOTTOM) {
999 if (cntlofreq) { 999 if (cntlofreq) {
1000 /* if both entries are the same, 1000 /* if both entries are the same,
1001 * ignore this one ... */ 1001 * ignore this one ... */
1002 if ((freq != powernow_table[cntlofreq].frequency) || 1002 if ((freq != powernow_table[cntlofreq].frequency) ||
1003 (index != powernow_table[cntlofreq].index)) { 1003 (index != powernow_table[cntlofreq].index)) {
1004 printk(KERN_ERR PFX 1004 printk(KERN_ERR PFX
1005 "Too many lo freq table " 1005 "Too many lo freq table "
1006 "entries\n"); 1006 "entries\n");
1007 return 1; 1007 return 1;
1008 } 1008 }
1009 1009
1010 dprintk("double low frequency table entry, " 1010 dprintk("double low frequency table entry, "
1011 "ignoring it.\n"); 1011 "ignoring it.\n");
1012 invalidate_entry(data, i); 1012 invalidate_entry(data, i);
1013 continue; 1013 continue;
1014 } else 1014 } else
1015 cntlofreq = i; 1015 cntlofreq = i;
1016 } 1016 }
1017 1017
1018 if (freq != (data->acpi_data.states[i].core_frequency * 1000)) { 1018 if (freq != (data->acpi_data.states[i].core_frequency * 1000)) {
1019 printk(KERN_INFO PFX "invalid freq entries " 1019 printk(KERN_INFO PFX "invalid freq entries "
1020 "%u kHz vs. %u kHz\n", freq, 1020 "%u kHz vs. %u kHz\n", freq,
1021 (unsigned int) 1021 (unsigned int)
1022 (data->acpi_data.states[i].core_frequency 1022 (data->acpi_data.states[i].core_frequency
1023 * 1000)); 1023 * 1000));
1024 invalidate_entry(data, i); 1024 invalidate_entry(data, i);
1025 continue; 1025 continue;
1026 } 1026 }
1027 } 1027 }
1028 return 0; 1028 return 0;
1029 } 1029 }
1030 1030
1031 static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data) 1031 static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data)
1032 { 1032 {
1033 if (data->acpi_data.state_count) 1033 if (data->acpi_data.state_count)
1034 acpi_processor_unregister_performance(&data->acpi_data, 1034 acpi_processor_unregister_performance(&data->acpi_data,
1035 data->cpu); 1035 data->cpu);
1036 free_cpumask_var(data->acpi_data.shared_cpu_map); 1036 free_cpumask_var(data->acpi_data.shared_cpu_map);
1037 } 1037 }
1038 1038
1039 static int get_transition_latency(struct powernow_k8_data *data) 1039 static int get_transition_latency(struct powernow_k8_data *data)
1040 { 1040 {
1041 int max_latency = 0; 1041 int max_latency = 0;
1042 int i; 1042 int i;
1043 for (i = 0; i < data->acpi_data.state_count; i++) { 1043 for (i = 0; i < data->acpi_data.state_count; i++) {
1044 int cur_latency = data->acpi_data.states[i].transition_latency 1044 int cur_latency = data->acpi_data.states[i].transition_latency
1045 + data->acpi_data.states[i].bus_master_latency; 1045 + data->acpi_data.states[i].bus_master_latency;
1046 if (cur_latency > max_latency) 1046 if (cur_latency > max_latency)
1047 max_latency = cur_latency; 1047 max_latency = cur_latency;
1048 } 1048 }
1049 /* value in usecs, needs to be in nanoseconds */ 1049 /* value in usecs, needs to be in nanoseconds */
1050 return 1000 * max_latency; 1050 return 1000 * max_latency;
1051 } 1051 }
1052 1052
1053 /* Take a frequency, and issue the fid/vid transition command */ 1053 /* Take a frequency, and issue the fid/vid transition command */
1054 static int transition_frequency_fidvid(struct powernow_k8_data *data, 1054 static int transition_frequency_fidvid(struct powernow_k8_data *data,
1055 unsigned int index) 1055 unsigned int index)
1056 { 1056 {
1057 u32 fid = 0; 1057 u32 fid = 0;
1058 u32 vid = 0; 1058 u32 vid = 0;
1059 int res, i; 1059 int res, i;
1060 struct cpufreq_freqs freqs; 1060 struct cpufreq_freqs freqs;
1061 1061
1062 dprintk("cpu %d transition to index %u\n", smp_processor_id(), index); 1062 dprintk("cpu %d transition to index %u\n", smp_processor_id(), index);
1063 1063
1064 /* fid/vid correctness check for k8 */ 1064 /* fid/vid correctness check for k8 */
1065 /* fid are the lower 8 bits of the index we stored into 1065 /* fid are the lower 8 bits of the index we stored into
1066 * the cpufreq frequency table in find_psb_table, vid 1066 * the cpufreq frequency table in find_psb_table, vid
1067 * are the upper 8 bits. 1067 * are the upper 8 bits.
1068 */ 1068 */
1069 fid = data->powernow_table[index].index & 0xFF; 1069 fid = data->powernow_table[index].index & 0xFF;
1070 vid = (data->powernow_table[index].index & 0xFF00) >> 8; 1070 vid = (data->powernow_table[index].index & 0xFF00) >> 8;
1071 1071
1072 dprintk("table matched fid 0x%x, giving vid 0x%x\n", fid, vid); 1072 dprintk("table matched fid 0x%x, giving vid 0x%x\n", fid, vid);
1073 1073
1074 if (query_current_values_with_pending_wait(data)) 1074 if (query_current_values_with_pending_wait(data))
1075 return 1; 1075 return 1;
1076 1076
1077 if ((data->currvid == vid) && (data->currfid == fid)) { 1077 if ((data->currvid == vid) && (data->currfid == fid)) {
1078 dprintk("target matches current values (fid 0x%x, vid 0x%x)\n", 1078 dprintk("target matches current values (fid 0x%x, vid 0x%x)\n",
1079 fid, vid); 1079 fid, vid);
1080 return 0; 1080 return 0;
1081 } 1081 }
1082 1082
1083 if ((fid < HI_FID_TABLE_BOTTOM) && 1083 if ((fid < HI_FID_TABLE_BOTTOM) &&
1084 (data->currfid < HI_FID_TABLE_BOTTOM)) { 1084 (data->currfid < HI_FID_TABLE_BOTTOM)) {
1085 printk(KERN_ERR PFX 1085 printk(KERN_ERR PFX
1086 "ignoring illegal change in lo freq table-%x to 0x%x\n", 1086 "ignoring illegal change in lo freq table-%x to 0x%x\n",
1087 data->currfid, fid); 1087 data->currfid, fid);
1088 return 1; 1088 return 1;
1089 } 1089 }
1090 1090
1091 dprintk("cpu %d, changing to fid 0x%x, vid 0x%x\n", 1091 dprintk("cpu %d, changing to fid 0x%x, vid 0x%x\n",
1092 smp_processor_id(), fid, vid); 1092 smp_processor_id(), fid, vid);
1093 freqs.old = find_khz_freq_from_fid(data->currfid); 1093 freqs.old = find_khz_freq_from_fid(data->currfid);
1094 freqs.new = find_khz_freq_from_fid(fid); 1094 freqs.new = find_khz_freq_from_fid(fid);
1095 1095
1096 for_each_cpu_mask_nr(i, *(data->available_cores)) { 1096 for_each_cpu_mask_nr(i, *(data->available_cores)) {
1097 freqs.cpu = i; 1097 freqs.cpu = i;
1098 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 1098 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
1099 } 1099 }
1100 1100
1101 res = transition_fid_vid(data, fid, vid); 1101 res = transition_fid_vid(data, fid, vid);
1102 freqs.new = find_khz_freq_from_fid(data->currfid); 1102 freqs.new = find_khz_freq_from_fid(data->currfid);
1103 1103
1104 for_each_cpu_mask_nr(i, *(data->available_cores)) { 1104 for_each_cpu_mask_nr(i, *(data->available_cores)) {
1105 freqs.cpu = i; 1105 freqs.cpu = i;
1106 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 1106 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
1107 } 1107 }
1108 return res; 1108 return res;
1109 } 1109 }
1110 1110
1111 /* Take a frequency, and issue the hardware pstate transition command */ 1111 /* Take a frequency, and issue the hardware pstate transition command */
1112 static int transition_frequency_pstate(struct powernow_k8_data *data, 1112 static int transition_frequency_pstate(struct powernow_k8_data *data,
1113 unsigned int index) 1113 unsigned int index)
1114 { 1114 {
1115 u32 pstate = 0; 1115 u32 pstate = 0;
1116 int res, i; 1116 int res, i;
1117 struct cpufreq_freqs freqs; 1117 struct cpufreq_freqs freqs;
1118 1118
1119 dprintk("cpu %d transition to index %u\n", smp_processor_id(), index); 1119 dprintk("cpu %d transition to index %u\n", smp_processor_id(), index);
1120 1120
1121 /* get MSR index for hardware pstate transition */ 1121 /* get MSR index for hardware pstate transition */
1122 pstate = index & HW_PSTATE_MASK; 1122 pstate = index & HW_PSTATE_MASK;
1123 if (pstate > data->max_hw_pstate) 1123 if (pstate > data->max_hw_pstate)
1124 return 0; 1124 return 0;
1125 freqs.old = find_khz_freq_from_pstate(data->powernow_table, 1125 freqs.old = find_khz_freq_from_pstate(data->powernow_table,
1126 data->currpstate); 1126 data->currpstate);
1127 freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); 1127 freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate);
1128 1128
1129 for_each_cpu_mask_nr(i, *(data->available_cores)) { 1129 for_each_cpu_mask_nr(i, *(data->available_cores)) {
1130 freqs.cpu = i; 1130 freqs.cpu = i;
1131 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 1131 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
1132 } 1132 }
1133 1133
1134 res = transition_pstate(data, pstate); 1134 res = transition_pstate(data, pstate);
1135 freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); 1135 freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate);
1136 1136
1137 for_each_cpu_mask_nr(i, *(data->available_cores)) { 1137 for_each_cpu_mask_nr(i, *(data->available_cores)) {
1138 freqs.cpu = i; 1138 freqs.cpu = i;
1139 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 1139 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
1140 } 1140 }
1141 return res; 1141 return res;
1142 } 1142 }
1143 1143
1144 /* Driver entry point to switch to the target frequency */ 1144 /* Driver entry point to switch to the target frequency */
1145 static int powernowk8_target(struct cpufreq_policy *pol, 1145 static int powernowk8_target(struct cpufreq_policy *pol,
1146 unsigned targfreq, unsigned relation) 1146 unsigned targfreq, unsigned relation)
1147 { 1147 {
1148 cpumask_t oldmask; 1148 cpumask_t oldmask;
1149 struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu); 1149 struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
1150 u32 checkfid; 1150 u32 checkfid;
1151 u32 checkvid; 1151 u32 checkvid;
1152 unsigned int newstate; 1152 unsigned int newstate;
1153 int ret = -EIO; 1153 int ret = -EIO;
1154 1154
1155 if (!data) 1155 if (!data)
1156 return -EINVAL; 1156 return -EINVAL;
1157 1157
1158 checkfid = data->currfid; 1158 checkfid = data->currfid;
1159 checkvid = data->currvid; 1159 checkvid = data->currvid;
1160 1160
1161 /* only run on specific CPU from here on */ 1161 /* only run on specific CPU from here on */
1162 oldmask = current->cpus_allowed; 1162 oldmask = current->cpus_allowed;
1163 set_cpus_allowed_ptr(current, &cpumask_of_cpu(pol->cpu)); 1163 set_cpus_allowed_ptr(current, &cpumask_of_cpu(pol->cpu));
1164 1164
1165 if (smp_processor_id() != pol->cpu) { 1165 if (smp_processor_id() != pol->cpu) {
1166 printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu); 1166 printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu);
1167 goto err_out; 1167 goto err_out;
1168 } 1168 }
1169 1169
1170 if (pending_bit_stuck()) { 1170 if (pending_bit_stuck()) {
1171 printk(KERN_ERR PFX "failing targ, change pending bit set\n"); 1171 printk(KERN_ERR PFX "failing targ, change pending bit set\n");
1172 goto err_out; 1172 goto err_out;
1173 } 1173 }
1174 1174
1175 dprintk("targ: cpu %d, %d kHz, min %d, max %d, relation %d\n", 1175 dprintk("targ: cpu %d, %d kHz, min %d, max %d, relation %d\n",
1176 pol->cpu, targfreq, pol->min, pol->max, relation); 1176 pol->cpu, targfreq, pol->min, pol->max, relation);
1177 1177
1178 if (query_current_values_with_pending_wait(data)) 1178 if (query_current_values_with_pending_wait(data))
1179 goto err_out; 1179 goto err_out;
1180 1180
1181 if (cpu_family != CPU_HW_PSTATE) { 1181 if (cpu_family != CPU_HW_PSTATE) {
1182 dprintk("targ: curr fid 0x%x, vid 0x%x\n", 1182 dprintk("targ: curr fid 0x%x, vid 0x%x\n",
1183 data->currfid, data->currvid); 1183 data->currfid, data->currvid);
1184 1184
1185 if ((checkvid != data->currvid) || 1185 if ((checkvid != data->currvid) ||
1186 (checkfid != data->currfid)) { 1186 (checkfid != data->currfid)) {
1187 printk(KERN_INFO PFX 1187 printk(KERN_INFO PFX
1188 "error - out of sync, fix 0x%x 0x%x, " 1188 "error - out of sync, fix 0x%x 0x%x, "
1189 "vid 0x%x 0x%x\n", 1189 "vid 0x%x 0x%x\n",
1190 checkfid, data->currfid, 1190 checkfid, data->currfid,
1191 checkvid, data->currvid); 1191 checkvid, data->currvid);
1192 } 1192 }
1193 } 1193 }
1194 1194
1195 if (cpufreq_frequency_table_target(pol, data->powernow_table, 1195 if (cpufreq_frequency_table_target(pol, data->powernow_table,
1196 targfreq, relation, &newstate)) 1196 targfreq, relation, &newstate))
1197 goto err_out; 1197 goto err_out;
1198 1198
1199 mutex_lock(&fidvid_mutex); 1199 mutex_lock(&fidvid_mutex);
1200 1200
1201 powernow_k8_acpi_pst_values(data, newstate); 1201 powernow_k8_acpi_pst_values(data, newstate);
1202 1202
1203 if (cpu_family == CPU_HW_PSTATE) 1203 if (cpu_family == CPU_HW_PSTATE)
1204 ret = transition_frequency_pstate(data, newstate); 1204 ret = transition_frequency_pstate(data, newstate);
1205 else 1205 else
1206 ret = transition_frequency_fidvid(data, newstate); 1206 ret = transition_frequency_fidvid(data, newstate);
1207 if (ret) { 1207 if (ret) {
1208 printk(KERN_ERR PFX "transition frequency failed\n"); 1208 printk(KERN_ERR PFX "transition frequency failed\n");
1209 ret = 1; 1209 ret = 1;
1210 mutex_unlock(&fidvid_mutex); 1210 mutex_unlock(&fidvid_mutex);
1211 goto err_out; 1211 goto err_out;
1212 } 1212 }
1213 mutex_unlock(&fidvid_mutex); 1213 mutex_unlock(&fidvid_mutex);
1214 1214
1215 if (cpu_family == CPU_HW_PSTATE) 1215 if (cpu_family == CPU_HW_PSTATE)
1216 pol->cur = find_khz_freq_from_pstate(data->powernow_table, 1216 pol->cur = find_khz_freq_from_pstate(data->powernow_table,
1217 newstate); 1217 newstate);
1218 else 1218 else
1219 pol->cur = find_khz_freq_from_fid(data->currfid); 1219 pol->cur = find_khz_freq_from_fid(data->currfid);
1220 ret = 0; 1220 ret = 0;
1221 1221
1222 err_out: 1222 err_out:
1223 set_cpus_allowed_ptr(current, &oldmask); 1223 set_cpus_allowed_ptr(current, &oldmask);
1224 return ret; 1224 return ret;
1225 } 1225 }
1226 1226
1227 /* Driver entry point to verify the policy and range of frequencies */ 1227 /* Driver entry point to verify the policy and range of frequencies */
1228 static int powernowk8_verify(struct cpufreq_policy *pol) 1228 static int powernowk8_verify(struct cpufreq_policy *pol)
1229 { 1229 {
1230 struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu); 1230 struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
1231 1231
1232 if (!data) 1232 if (!data)
1233 return -EINVAL; 1233 return -EINVAL;
1234 1234
1235 return cpufreq_frequency_table_verify(pol, data->powernow_table); 1235 return cpufreq_frequency_table_verify(pol, data->powernow_table);
1236 } 1236 }
1237 1237
1238 static const char ACPI_PSS_BIOS_BUG_MSG[] = 1238 static const char ACPI_PSS_BIOS_BUG_MSG[] =
1239 KERN_ERR FW_BUG PFX "No compatible ACPI _PSS objects found.\n" 1239 KERN_ERR FW_BUG PFX "No compatible ACPI _PSS objects found.\n"
1240 KERN_ERR FW_BUG PFX "Try again with latest BIOS.\n"; 1240 KERN_ERR FW_BUG PFX "Try again with latest BIOS.\n";
1241 1241
1242 /* per CPU init entry point to the driver */ 1242 /* per CPU init entry point to the driver */
1243 static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) 1243 static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1244 { 1244 {
1245 struct powernow_k8_data *data; 1245 struct powernow_k8_data *data;
1246 cpumask_t oldmask; 1246 cpumask_t oldmask;
1247 int rc; 1247 int rc;
1248 1248
1249 if (!cpu_online(pol->cpu)) 1249 if (!cpu_online(pol->cpu))
1250 return -ENODEV; 1250 return -ENODEV;
1251 1251
1252 if (!check_supported_cpu(pol->cpu)) 1252 if (!check_supported_cpu(pol->cpu))
1253 return -ENODEV; 1253 return -ENODEV;
1254 1254
1255 data = kzalloc(sizeof(struct powernow_k8_data), GFP_KERNEL); 1255 data = kzalloc(sizeof(struct powernow_k8_data), GFP_KERNEL);
1256 if (!data) { 1256 if (!data) {
1257 printk(KERN_ERR PFX "unable to alloc powernow_k8_data"); 1257 printk(KERN_ERR PFX "unable to alloc powernow_k8_data");
1258 return -ENOMEM; 1258 return -ENOMEM;
1259 } 1259 }
1260 1260
1261 data->cpu = pol->cpu; 1261 data->cpu = pol->cpu;
1262 data->currpstate = HW_PSTATE_INVALID; 1262 data->currpstate = HW_PSTATE_INVALID;
1263 1263
1264 if (powernow_k8_cpu_init_acpi(data)) { 1264 if (powernow_k8_cpu_init_acpi(data)) {
1265 /* 1265 /*
1266 * Use the PSB BIOS structure. This is only available on 1266 * Use the PSB BIOS structure. This is only available on
1267 * an UP version, and is deprecated by AMD. 1267 * an UP version, and is deprecated by AMD.
1268 */ 1268 */
1269 if (num_online_cpus() != 1) { 1269 if (num_online_cpus() != 1) {
1270 printk_once(ACPI_PSS_BIOS_BUG_MSG); 1270 printk_once(ACPI_PSS_BIOS_BUG_MSG);
1271 goto err_out; 1271 goto err_out;
1272 } 1272 }
1273 if (pol->cpu != 0) { 1273 if (pol->cpu != 0) {
1274 printk(KERN_ERR FW_BUG PFX "No ACPI _PSS objects for " 1274 printk(KERN_ERR FW_BUG PFX "No ACPI _PSS objects for "
1275 "CPU other than CPU0. Complain to your BIOS " 1275 "CPU other than CPU0. Complain to your BIOS "
1276 "vendor.\n"); 1276 "vendor.\n");
1277 goto err_out; 1277 goto err_out;
1278 } 1278 }
1279 rc = find_psb_table(data); 1279 rc = find_psb_table(data);
1280 if (rc) 1280 if (rc)
1281 goto err_out; 1281 goto err_out;
1282 1282
1283 /* Take a crude guess here. 1283 /* Take a crude guess here.
1284 * That guess was in microseconds, so multiply with 1000 */ 1284 * That guess was in microseconds, so multiply with 1000 */
1285 pol->cpuinfo.transition_latency = ( 1285 pol->cpuinfo.transition_latency = (
1286 ((data->rvo + 8) * data->vstable * VST_UNITS_20US) + 1286 ((data->rvo + 8) * data->vstable * VST_UNITS_20US) +
1287 ((1 << data->irt) * 30)) * 1000; 1287 ((1 << data->irt) * 30)) * 1000;
1288 } else /* ACPI _PSS objects available */ 1288 } else /* ACPI _PSS objects available */
1289 pol->cpuinfo.transition_latency = get_transition_latency(data); 1289 pol->cpuinfo.transition_latency = get_transition_latency(data);
1290 1290
1291 /* only run on specific CPU from here on */ 1291 /* only run on specific CPU from here on */
1292 oldmask = current->cpus_allowed; 1292 oldmask = current->cpus_allowed;
1293 set_cpus_allowed_ptr(current, &cpumask_of_cpu(pol->cpu)); 1293 set_cpus_allowed_ptr(current, &cpumask_of_cpu(pol->cpu));
1294 1294
1295 if (smp_processor_id() != pol->cpu) { 1295 if (smp_processor_id() != pol->cpu) {
1296 printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu); 1296 printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu);
1297 goto err_out_unmask; 1297 goto err_out_unmask;
1298 } 1298 }
1299 1299
1300 if (pending_bit_stuck()) { 1300 if (pending_bit_stuck()) {
1301 printk(KERN_ERR PFX "failing init, change pending bit set\n"); 1301 printk(KERN_ERR PFX "failing init, change pending bit set\n");
1302 goto err_out_unmask; 1302 goto err_out_unmask;
1303 } 1303 }
1304 1304
1305 if (query_current_values_with_pending_wait(data)) 1305 if (query_current_values_with_pending_wait(data))
1306 goto err_out_unmask; 1306 goto err_out_unmask;
1307 1307
1308 if (cpu_family == CPU_OPTERON) 1308 if (cpu_family == CPU_OPTERON)
1309 fidvid_msr_init(); 1309 fidvid_msr_init();
1310 1310
1311 /* run on any CPU again */ 1311 /* run on any CPU again */
1312 set_cpus_allowed_ptr(current, &oldmask); 1312 set_cpus_allowed_ptr(current, &oldmask);
1313 1313
1314 if (cpu_family == CPU_HW_PSTATE) 1314 if (cpu_family == CPU_HW_PSTATE)
1315 cpumask_copy(pol->cpus, cpumask_of(pol->cpu)); 1315 cpumask_copy(pol->cpus, cpumask_of(pol->cpu));
1316 else 1316 else
1317 cpumask_copy(pol->cpus, cpu_core_mask(pol->cpu)); 1317 cpumask_copy(pol->cpus, cpu_core_mask(pol->cpu));
1318 data->available_cores = pol->cpus; 1318 data->available_cores = pol->cpus;
1319 1319
1320 if (cpu_family == CPU_HW_PSTATE) 1320 if (cpu_family == CPU_HW_PSTATE)
1321 pol->cur = find_khz_freq_from_pstate(data->powernow_table, 1321 pol->cur = find_khz_freq_from_pstate(data->powernow_table,
1322 data->currpstate); 1322 data->currpstate);
1323 else 1323 else
1324 pol->cur = find_khz_freq_from_fid(data->currfid); 1324 pol->cur = find_khz_freq_from_fid(data->currfid);
1325 dprintk("policy current frequency %d kHz\n", pol->cur); 1325 dprintk("policy current frequency %d kHz\n", pol->cur);
1326 1326
1327 /* min/max the cpu is capable of */ 1327 /* min/max the cpu is capable of */
1328 if (cpufreq_frequency_table_cpuinfo(pol, data->powernow_table)) { 1328 if (cpufreq_frequency_table_cpuinfo(pol, data->powernow_table)) {
1329 printk(KERN_ERR FW_BUG PFX "invalid powernow_table\n"); 1329 printk(KERN_ERR FW_BUG PFX "invalid powernow_table\n");
1330 powernow_k8_cpu_exit_acpi(data); 1330 powernow_k8_cpu_exit_acpi(data);
1331 kfree(data->powernow_table); 1331 kfree(data->powernow_table);
1332 kfree(data); 1332 kfree(data);
1333 return -EINVAL; 1333 return -EINVAL;
1334 } 1334 }
1335 1335
1336 cpufreq_frequency_table_get_attr(data->powernow_table, pol->cpu); 1336 cpufreq_frequency_table_get_attr(data->powernow_table, pol->cpu);
1337 1337
1338 if (cpu_family == CPU_HW_PSTATE) 1338 if (cpu_family == CPU_HW_PSTATE)
1339 dprintk("cpu_init done, current pstate 0x%x\n", 1339 dprintk("cpu_init done, current pstate 0x%x\n",
1340 data->currpstate); 1340 data->currpstate);
1341 else 1341 else
1342 dprintk("cpu_init done, current fid 0x%x, vid 0x%x\n", 1342 dprintk("cpu_init done, current fid 0x%x, vid 0x%x\n",
1343 data->currfid, data->currvid); 1343 data->currfid, data->currvid);
1344 1344
1345 per_cpu(powernow_data, pol->cpu) = data; 1345 per_cpu(powernow_data, pol->cpu) = data;
1346 1346
1347 return 0; 1347 return 0;
1348 1348
1349 err_out_unmask: 1349 err_out_unmask:
1350 set_cpus_allowed_ptr(current, &oldmask); 1350 set_cpus_allowed_ptr(current, &oldmask);
1351 powernow_k8_cpu_exit_acpi(data); 1351 powernow_k8_cpu_exit_acpi(data);
1352 1352
1353 err_out: 1353 err_out:
1354 kfree(data); 1354 kfree(data);
1355 return -ENODEV; 1355 return -ENODEV;
1356 } 1356 }
1357 1357
1358 static int __devexit powernowk8_cpu_exit(struct cpufreq_policy *pol) 1358 static int __devexit powernowk8_cpu_exit(struct cpufreq_policy *pol)
1359 { 1359 {
1360 struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu); 1360 struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
1361 1361
1362 if (!data) 1362 if (!data)
1363 return -EINVAL; 1363 return -EINVAL;
1364 1364
1365 powernow_k8_cpu_exit_acpi(data); 1365 powernow_k8_cpu_exit_acpi(data);
1366 1366
1367 cpufreq_frequency_table_put_attr(pol->cpu); 1367 cpufreq_frequency_table_put_attr(pol->cpu);
1368 1368
1369 kfree(data->powernow_table); 1369 kfree(data->powernow_table);
1370 kfree(data); 1370 kfree(data);
1371 1371
1372 return 0; 1372 return 0;
1373 } 1373 }
1374 1374
1375 static unsigned int powernowk8_get(unsigned int cpu) 1375 static unsigned int powernowk8_get(unsigned int cpu)
1376 { 1376 {
1377 struct powernow_k8_data *data; 1377 struct powernow_k8_data *data;
1378 cpumask_t oldmask = current->cpus_allowed; 1378 cpumask_t oldmask = current->cpus_allowed;
1379 unsigned int khz = 0; 1379 unsigned int khz = 0;
1380 unsigned int first; 1380 unsigned int first;
1381 1381
1382 first = cpumask_first(cpu_core_mask(cpu)); 1382 first = cpumask_first(cpu_core_mask(cpu));
1383 data = per_cpu(powernow_data, first); 1383 data = per_cpu(powernow_data, first);
1384 1384
1385 if (!data) 1385 if (!data)
1386 return -EINVAL; 1386 return -EINVAL;
1387 1387
1388 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); 1388 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
1389 if (smp_processor_id() != cpu) { 1389 if (smp_processor_id() != cpu) {
1390 printk(KERN_ERR PFX 1390 printk(KERN_ERR PFX
1391 "limiting to CPU %d failed in powernowk8_get\n", cpu); 1391 "limiting to CPU %d failed in powernowk8_get\n", cpu);
1392 set_cpus_allowed_ptr(current, &oldmask); 1392 set_cpus_allowed_ptr(current, &oldmask);
1393 return 0; 1393 return 0;
1394 } 1394 }
1395 1395
1396 if (query_current_values_with_pending_wait(data)) 1396 if (query_current_values_with_pending_wait(data))
1397 goto out; 1397 goto out;
1398 1398
1399 if (cpu_family == CPU_HW_PSTATE) 1399 if (cpu_family == CPU_HW_PSTATE)
1400 khz = find_khz_freq_from_pstate(data->powernow_table, 1400 khz = find_khz_freq_from_pstate(data->powernow_table,
1401 data->currpstate); 1401 data->currpstate);
1402 else 1402 else
1403 khz = find_khz_freq_from_fid(data->currfid); 1403 khz = find_khz_freq_from_fid(data->currfid);
1404 1404
1405 1405
1406 out: 1406 out:
1407 set_cpus_allowed_ptr(current, &oldmask); 1407 set_cpus_allowed_ptr(current, &oldmask);
1408 return khz; 1408 return khz;
1409 } 1409 }
1410 1410
1411 static struct freq_attr *powernow_k8_attr[] = { 1411 static struct freq_attr *powernow_k8_attr[] = {
1412 &cpufreq_freq_attr_scaling_available_freqs, 1412 &cpufreq_freq_attr_scaling_available_freqs,
1413 NULL, 1413 NULL,
1414 }; 1414 };
1415 1415
1416 static struct cpufreq_driver cpufreq_amd64_driver = { 1416 static struct cpufreq_driver cpufreq_amd64_driver = {
1417 .verify = powernowk8_verify, 1417 .verify = powernowk8_verify,
1418 .target = powernowk8_target, 1418 .target = powernowk8_target,
1419 .init = powernowk8_cpu_init, 1419 .init = powernowk8_cpu_init,
1420 .exit = __devexit_p(powernowk8_cpu_exit), 1420 .exit = __devexit_p(powernowk8_cpu_exit),
1421 .get = powernowk8_get, 1421 .get = powernowk8_get,
1422 .name = "powernow-k8", 1422 .name = "powernow-k8",
1423 .owner = THIS_MODULE, 1423 .owner = THIS_MODULE,
1424 .attr = powernow_k8_attr, 1424 .attr = powernow_k8_attr,
1425 }; 1425 };
1426 1426
1427 /* driver entry point for init */ 1427 /* driver entry point for init */
1428 static int __cpuinit powernowk8_init(void) 1428 static int __cpuinit powernowk8_init(void)
1429 { 1429 {
1430 unsigned int i, supported_cpus = 0; 1430 unsigned int i, supported_cpus = 0;
1431 1431
1432 for_each_online_cpu(i) { 1432 for_each_online_cpu(i) {
1433 if (check_supported_cpu(i)) 1433 if (check_supported_cpu(i))
1434 supported_cpus++; 1434 supported_cpus++;
1435 } 1435 }
1436 1436
1437 if (supported_cpus == num_online_cpus()) { 1437 if (supported_cpus == num_online_cpus()) {
1438 printk(KERN_INFO PFX "Found %d %s " 1438 printk(KERN_INFO PFX "Found %d %s "
1439 "processors (%d cpu cores) (" VERSION ")\n", 1439 "processors (%d cpu cores) (" VERSION ")\n",
1440 num_online_nodes(), 1440 num_online_nodes(),
1441 boot_cpu_data.x86_model_id, supported_cpus); 1441 boot_cpu_data.x86_model_id, supported_cpus);
1442 return cpufreq_register_driver(&cpufreq_amd64_driver); 1442 return cpufreq_register_driver(&cpufreq_amd64_driver);
1443 } 1443 }
1444 1444
1445 return -ENODEV; 1445 return -ENODEV;
1446 } 1446 }
1447 1447
1448 /* driver entry point for term */ 1448 /* driver entry point for term */
1449 static void __exit powernowk8_exit(void) 1449 static void __exit powernowk8_exit(void)
1450 { 1450 {
1451 dprintk("exit\n"); 1451 dprintk("exit\n");
1452 1452
1453 cpufreq_unregister_driver(&cpufreq_amd64_driver); 1453 cpufreq_unregister_driver(&cpufreq_amd64_driver);
1454 } 1454 }
1455 1455
1456 MODULE_AUTHOR("Paul Devriendt <paul.devriendt@amd.com> and " 1456 MODULE_AUTHOR("Paul Devriendt <paul.devriendt@amd.com> and "
1457 "Mark Langsdorf <mark.langsdorf@amd.com>"); 1457 "Mark Langsdorf <mark.langsdorf@amd.com>");
1458 MODULE_DESCRIPTION("AMD Athlon 64 and Opteron processor frequency driver."); 1458 MODULE_DESCRIPTION("AMD Athlon 64 and Opteron processor frequency driver.");
1459 MODULE_LICENSE("GPL"); 1459 MODULE_LICENSE("GPL");
1460 1460
1461 late_initcall(powernowk8_init); 1461 late_initcall(powernowk8_init);
1462 module_exit(powernowk8_exit); 1462 module_exit(powernowk8_exit);
1463 1463
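
The only functional change in this file is on its line 890, where the allocation of data->acpi_data.shared_cpu_map switches from alloc_cpumask_var() to zalloc_cpumask_var(). When cpumasks are allocated off-stack (CONFIG_CPUMASK_OFFSTACK, as selected by large-NR_CPUS configurations), the plain allocator hands back uninitialized bits, while the zeroed variant guarantees a cleared mask before any CPU bit is tested or set. The userspace sketch below only mimics that distinction; the struct cpumask layout, the *_sketch helper names, and the NR_CPUS value are illustrative assumptions, not the kernel's actual implementation.

/*
 * Minimal userspace sketch (not kernel code) of why a zeroed cpumask
 * allocation matters.  The names below only mimic the kernel's
 * cpumask_var_t and alloc_cpumask_var()/zalloc_cpumask_var() helpers.
 */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#define NR_CPUS 4096                       /* assumed large-NR_CPUS build */
#define BITS_PER_LONG (8 * sizeof(long))
#define MASK_LONGS ((NR_CPUS + BITS_PER_LONG - 1) / BITS_PER_LONG)

struct cpumask { unsigned long bits[MASK_LONGS]; };

/* Off-stack flavour: the "mask" is really a pointer that must be allocated. */
typedef struct cpumask *cpumask_var_t;

static bool alloc_cpumask_var_sketch(cpumask_var_t *mask)
{
	*mask = malloc(sizeof(struct cpumask));   /* contents are undefined */
	return *mask != NULL;
}

static bool zalloc_cpumask_var_sketch(cpumask_var_t *mask)
{
	*mask = calloc(1, sizeof(struct cpumask)); /* contents are all-zero */
	return *mask != NULL;
}

int main(void)
{
	cpumask_var_t m;

	if (!zalloc_cpumask_var_sketch(&m))
		return 1;
	/* Safe to inspect immediately: every CPU bit starts cleared. */
	printf("first word of mask: %lx\n", m->bits[0]);
	free(m);
	return 0;
}

Under this analogy, calloc() plays the role of the zeroed allocator: the caller can rely on every bit starting cleared, instead of remembering to clear the freshly allocated mask in a separate step.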
arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
1 /* 1 /*
2 * cpufreq driver for Enhanced SpeedStep, as found in Intel's Pentium 2 * cpufreq driver for Enhanced SpeedStep, as found in Intel's Pentium
3 * M (part of the Centrino chipset). 3 * M (part of the Centrino chipset).
4 * 4 *
5 * Since the original Pentium M, most new Intel CPUs support Enhanced 5 * Since the original Pentium M, most new Intel CPUs support Enhanced
6 * SpeedStep. 6 * SpeedStep.
7 * 7 *
8 * Despite the "SpeedStep" in the name, this is almost entirely unlike 8 * Despite the "SpeedStep" in the name, this is almost entirely unlike
9 * traditional SpeedStep. 9 * traditional SpeedStep.
10 * 10 *
11 * Modelled on speedstep.c 11 * Modelled on speedstep.c
12 * 12 *
13 * Copyright (C) 2003 Jeremy Fitzhardinge <jeremy@goop.org> 13 * Copyright (C) 2003 Jeremy Fitzhardinge <jeremy@goop.org>
14 */ 14 */
15 15
16 #include <linux/kernel.h> 16 #include <linux/kernel.h>
17 #include <linux/module.h> 17 #include <linux/module.h>
18 #include <linux/init.h> 18 #include <linux/init.h>
19 #include <linux/cpufreq.h> 19 #include <linux/cpufreq.h>
20 #include <linux/sched.h> /* current */ 20 #include <linux/sched.h> /* current */
21 #include <linux/delay.h> 21 #include <linux/delay.h>
22 #include <linux/compiler.h> 22 #include <linux/compiler.h>
23 23
24 #include <asm/msr.h> 24 #include <asm/msr.h>
25 #include <asm/processor.h> 25 #include <asm/processor.h>
26 #include <asm/cpufeature.h> 26 #include <asm/cpufeature.h>
27 27
28 #define PFX "speedstep-centrino: " 28 #define PFX "speedstep-centrino: "
29 #define MAINTAINER "cpufreq@vger.kernel.org" 29 #define MAINTAINER "cpufreq@vger.kernel.org"
30 30
31 #define dprintk(msg...) \ 31 #define dprintk(msg...) \
32 cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg) 32 cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg)
33 33
34 #define INTEL_MSR_RANGE (0xffff) 34 #define INTEL_MSR_RANGE (0xffff)
35 35
36 struct cpu_id 36 struct cpu_id
37 { 37 {
38 __u8 x86; /* CPU family */ 38 __u8 x86; /* CPU family */
39 __u8 x86_model; /* model */ 39 __u8 x86_model; /* model */
40 __u8 x86_mask; /* stepping */ 40 __u8 x86_mask; /* stepping */
41 }; 41 };
42 42
43 enum { 43 enum {
44 CPU_BANIAS, 44 CPU_BANIAS,
45 CPU_DOTHAN_A1, 45 CPU_DOTHAN_A1,
46 CPU_DOTHAN_A2, 46 CPU_DOTHAN_A2,
47 CPU_DOTHAN_B0, 47 CPU_DOTHAN_B0,
48 CPU_MP4HT_D0, 48 CPU_MP4HT_D0,
49 CPU_MP4HT_E0, 49 CPU_MP4HT_E0,
50 }; 50 };
51 51
52 static const struct cpu_id cpu_ids[] = { 52 static const struct cpu_id cpu_ids[] = {
53 [CPU_BANIAS] = { 6, 9, 5 }, 53 [CPU_BANIAS] = { 6, 9, 5 },
54 [CPU_DOTHAN_A1] = { 6, 13, 1 }, 54 [CPU_DOTHAN_A1] = { 6, 13, 1 },
55 [CPU_DOTHAN_A2] = { 6, 13, 2 }, 55 [CPU_DOTHAN_A2] = { 6, 13, 2 },
56 [CPU_DOTHAN_B0] = { 6, 13, 6 }, 56 [CPU_DOTHAN_B0] = { 6, 13, 6 },
57 [CPU_MP4HT_D0] = {15, 3, 4 }, 57 [CPU_MP4HT_D0] = {15, 3, 4 },
58 [CPU_MP4HT_E0] = {15, 4, 1 }, 58 [CPU_MP4HT_E0] = {15, 4, 1 },
59 }; 59 };
60 #define N_IDS ARRAY_SIZE(cpu_ids) 60 #define N_IDS ARRAY_SIZE(cpu_ids)
61 61
62 struct cpu_model 62 struct cpu_model
63 { 63 {
64 const struct cpu_id *cpu_id; 64 const struct cpu_id *cpu_id;
65 const char *model_name; 65 const char *model_name;
66 unsigned max_freq; /* max clock in kHz */ 66 unsigned max_freq; /* max clock in kHz */
67 67
68 struct cpufreq_frequency_table *op_points; /* clock/voltage pairs */ 68 struct cpufreq_frequency_table *op_points; /* clock/voltage pairs */
69 }; 69 };
70 static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, 70 static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c,
71 const struct cpu_id *x); 71 const struct cpu_id *x);
72 72
73 /* Operating points for current CPU */ 73 /* Operating points for current CPU */
74 static DEFINE_PER_CPU(struct cpu_model *, centrino_model); 74 static DEFINE_PER_CPU(struct cpu_model *, centrino_model);
75 static DEFINE_PER_CPU(const struct cpu_id *, centrino_cpu); 75 static DEFINE_PER_CPU(const struct cpu_id *, centrino_cpu);
76 76
77 static struct cpufreq_driver centrino_driver; 77 static struct cpufreq_driver centrino_driver;
78 78
79 #ifdef CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE 79 #ifdef CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE
80 80
81 /* Computes the correct form for IA32_PERF_CTL MSR for a particular 81 /* Computes the correct form for IA32_PERF_CTL MSR for a particular
82 frequency/voltage operating point; frequency in MHz, volts in mV. 82 frequency/voltage operating point; frequency in MHz, volts in mV.
83 This is stored as "index" in the structure. */ 83 This is stored as "index" in the structure. */
84 #define OP(mhz, mv) \ 84 #define OP(mhz, mv) \
85 { \ 85 { \
86 .frequency = (mhz) * 1000, \ 86 .frequency = (mhz) * 1000, \
87 .index = (((mhz)/100) << 8) | ((mv - 700) / 16) \ 87 .index = (((mhz)/100) << 8) | ((mv - 700) / 16) \
88 } 88 }
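As a worked check of the encoding above (a user-space sketch, not part of this diff, assuming only the arithmetic visible in the OP() macro and in extract_clock() further down): OP(1300, 1388), the top entry of banias_1300 below, packs into index 0x0d2b, because 1300/100 = 0x0d and (1388 - 700)/16 = 0x2b, and the high byte decodes back to 13 * 100000 kHz.

        /* Sketch: reproduce the OP(mhz, mv) packing used by the tables below. */
        #include <assert.h>
        #include <stdio.h>

        static unsigned op_index(unsigned mhz, unsigned mv)
        {
                return ((mhz / 100) << 8) | ((mv - 700) / 16);
        }

        int main(void)
        {
                assert(op_index(1300, 1388) == 0x0d2b);  /* banias_1300 entry */
                /* extract_clock() reads the high byte as bus ratio * 100 MHz */
                assert(((op_index(1300, 1388) >> 8) & 0xff) * 100000 == 1300000);
                printf("index = %#06x\n", op_index(1300, 1388));
                return 0;
        }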
89 89
90 /* 90 /*
91 * These voltage tables were derived from the Intel Pentium M 91 * These voltage tables were derived from the Intel Pentium M
92 * datasheet, document 25261202.pdf, Table 5. I have verified they 92 * datasheet, document 25261202.pdf, Table 5. I have verified they
93 * are consistent with my IBM ThinkPad X31, which has a 1.3GHz Pentium 93 * are consistent with my IBM ThinkPad X31, which has a 1.3GHz Pentium
94 * M. 94 * M.
95 */ 95 */
96 96
97 /* Ultra Low Voltage Intel Pentium M processor 900MHz (Banias) */ 97 /* Ultra Low Voltage Intel Pentium M processor 900MHz (Banias) */
98 static struct cpufreq_frequency_table banias_900[] = 98 static struct cpufreq_frequency_table banias_900[] =
99 { 99 {
100 OP(600, 844), 100 OP(600, 844),
101 OP(800, 988), 101 OP(800, 988),
102 OP(900, 1004), 102 OP(900, 1004),
103 { .frequency = CPUFREQ_TABLE_END } 103 { .frequency = CPUFREQ_TABLE_END }
104 }; 104 };
105 105
106 /* Ultra Low Voltage Intel Pentium M processor 1000MHz (Banias) */ 106 /* Ultra Low Voltage Intel Pentium M processor 1000MHz (Banias) */
107 static struct cpufreq_frequency_table banias_1000[] = 107 static struct cpufreq_frequency_table banias_1000[] =
108 { 108 {
109 OP(600, 844), 109 OP(600, 844),
110 OP(800, 972), 110 OP(800, 972),
111 OP(900, 988), 111 OP(900, 988),
112 OP(1000, 1004), 112 OP(1000, 1004),
113 { .frequency = CPUFREQ_TABLE_END } 113 { .frequency = CPUFREQ_TABLE_END }
114 }; 114 };
115 115
116 /* Low Voltage Intel Pentium M processor 1.10GHz (Banias) */ 116 /* Low Voltage Intel Pentium M processor 1.10GHz (Banias) */
117 static struct cpufreq_frequency_table banias_1100[] = 117 static struct cpufreq_frequency_table banias_1100[] =
118 { 118 {
119 OP( 600, 956), 119 OP( 600, 956),
120 OP( 800, 1020), 120 OP( 800, 1020),
121 OP( 900, 1100), 121 OP( 900, 1100),
122 OP(1000, 1164), 122 OP(1000, 1164),
123 OP(1100, 1180), 123 OP(1100, 1180),
124 { .frequency = CPUFREQ_TABLE_END } 124 { .frequency = CPUFREQ_TABLE_END }
125 }; 125 };
126 126
127 127
128 /* Low Voltage Intel Pentium M processor 1.20GHz (Banias) */ 128 /* Low Voltage Intel Pentium M processor 1.20GHz (Banias) */
129 static struct cpufreq_frequency_table banias_1200[] = 129 static struct cpufreq_frequency_table banias_1200[] =
130 { 130 {
131 OP( 600, 956), 131 OP( 600, 956),
132 OP( 800, 1004), 132 OP( 800, 1004),
133 OP( 900, 1020), 133 OP( 900, 1020),
134 OP(1000, 1100), 134 OP(1000, 1100),
135 OP(1100, 1164), 135 OP(1100, 1164),
136 OP(1200, 1180), 136 OP(1200, 1180),
137 { .frequency = CPUFREQ_TABLE_END } 137 { .frequency = CPUFREQ_TABLE_END }
138 }; 138 };
139 139
140 /* Intel Pentium M processor 1.30GHz (Banias) */ 140 /* Intel Pentium M processor 1.30GHz (Banias) */
141 static struct cpufreq_frequency_table banias_1300[] = 141 static struct cpufreq_frequency_table banias_1300[] =
142 { 142 {
143 OP( 600, 956), 143 OP( 600, 956),
144 OP( 800, 1260), 144 OP( 800, 1260),
145 OP(1000, 1292), 145 OP(1000, 1292),
146 OP(1200, 1356), 146 OP(1200, 1356),
147 OP(1300, 1388), 147 OP(1300, 1388),
148 { .frequency = CPUFREQ_TABLE_END } 148 { .frequency = CPUFREQ_TABLE_END }
149 }; 149 };
150 150
151 /* Intel Pentium M processor 1.40GHz (Banias) */ 151 /* Intel Pentium M processor 1.40GHz (Banias) */
152 static struct cpufreq_frequency_table banias_1400[] = 152 static struct cpufreq_frequency_table banias_1400[] =
153 { 153 {
154 OP( 600, 956), 154 OP( 600, 956),
155 OP( 800, 1180), 155 OP( 800, 1180),
156 OP(1000, 1308), 156 OP(1000, 1308),
157 OP(1200, 1436), 157 OP(1200, 1436),
158 OP(1400, 1484), 158 OP(1400, 1484),
159 { .frequency = CPUFREQ_TABLE_END } 159 { .frequency = CPUFREQ_TABLE_END }
160 }; 160 };
161 161
162 /* Intel Pentium M processor 1.50GHz (Banias) */ 162 /* Intel Pentium M processor 1.50GHz (Banias) */
163 static struct cpufreq_frequency_table banias_1500[] = 163 static struct cpufreq_frequency_table banias_1500[] =
164 { 164 {
165 OP( 600, 956), 165 OP( 600, 956),
166 OP( 800, 1116), 166 OP( 800, 1116),
167 OP(1000, 1228), 167 OP(1000, 1228),
168 OP(1200, 1356), 168 OP(1200, 1356),
169 OP(1400, 1452), 169 OP(1400, 1452),
170 OP(1500, 1484), 170 OP(1500, 1484),
171 { .frequency = CPUFREQ_TABLE_END } 171 { .frequency = CPUFREQ_TABLE_END }
172 }; 172 };
173 173
174 /* Intel Pentium M processor 1.60GHz (Banias) */ 174 /* Intel Pentium M processor 1.60GHz (Banias) */
175 static struct cpufreq_frequency_table banias_1600[] = 175 static struct cpufreq_frequency_table banias_1600[] =
176 { 176 {
177 OP( 600, 956), 177 OP( 600, 956),
178 OP( 800, 1036), 178 OP( 800, 1036),
179 OP(1000, 1164), 179 OP(1000, 1164),
180 OP(1200, 1276), 180 OP(1200, 1276),
181 OP(1400, 1420), 181 OP(1400, 1420),
182 OP(1600, 1484), 182 OP(1600, 1484),
183 { .frequency = CPUFREQ_TABLE_END } 183 { .frequency = CPUFREQ_TABLE_END }
184 }; 184 };
185 185
186 /* Intel Pentium M processor 1.70GHz (Banias) */ 186 /* Intel Pentium M processor 1.70GHz (Banias) */
187 static struct cpufreq_frequency_table banias_1700[] = 187 static struct cpufreq_frequency_table banias_1700[] =
188 { 188 {
189 OP( 600, 956), 189 OP( 600, 956),
190 OP( 800, 1004), 190 OP( 800, 1004),
191 OP(1000, 1116), 191 OP(1000, 1116),
192 OP(1200, 1228), 192 OP(1200, 1228),
193 OP(1400, 1308), 193 OP(1400, 1308),
194 OP(1700, 1484), 194 OP(1700, 1484),
195 { .frequency = CPUFREQ_TABLE_END } 195 { .frequency = CPUFREQ_TABLE_END }
196 }; 196 };
197 #undef OP 197 #undef OP
198 198
199 #define _BANIAS(cpuid, max, name) \ 199 #define _BANIAS(cpuid, max, name) \
200 { .cpu_id = cpuid, \ 200 { .cpu_id = cpuid, \
201 .model_name = "Intel(R) Pentium(R) M processor " name "MHz", \ 201 .model_name = "Intel(R) Pentium(R) M processor " name "MHz", \
202 .max_freq = (max)*1000, \ 202 .max_freq = (max)*1000, \
203 .op_points = banias_##max, \ 203 .op_points = banias_##max, \
204 } 204 }
205 #define BANIAS(max) _BANIAS(&cpu_ids[CPU_BANIAS], max, #max) 205 #define BANIAS(max) _BANIAS(&cpu_ids[CPU_BANIAS], max, #max)
206 206
207 /* CPU models, their operating frequency range, and freq/voltage 207 /* CPU models, their operating frequency range, and freq/voltage
208 operating points */ 208 operating points */
209 static struct cpu_model models[] = 209 static struct cpu_model models[] =
210 { 210 {
211 _BANIAS(&cpu_ids[CPU_BANIAS], 900, " 900"), 211 _BANIAS(&cpu_ids[CPU_BANIAS], 900, " 900"),
212 BANIAS(1000), 212 BANIAS(1000),
213 BANIAS(1100), 213 BANIAS(1100),
214 BANIAS(1200), 214 BANIAS(1200),
215 BANIAS(1300), 215 BANIAS(1300),
216 BANIAS(1400), 216 BANIAS(1400),
217 BANIAS(1500), 217 BANIAS(1500),
218 BANIAS(1600), 218 BANIAS(1600),
219 BANIAS(1700), 219 BANIAS(1700),
220 220
221 /* NULL model_name is a wildcard */ 221 /* NULL model_name is a wildcard */
222 { &cpu_ids[CPU_DOTHAN_A1], NULL, 0, NULL }, 222 { &cpu_ids[CPU_DOTHAN_A1], NULL, 0, NULL },
223 { &cpu_ids[CPU_DOTHAN_A2], NULL, 0, NULL }, 223 { &cpu_ids[CPU_DOTHAN_A2], NULL, 0, NULL },
224 { &cpu_ids[CPU_DOTHAN_B0], NULL, 0, NULL }, 224 { &cpu_ids[CPU_DOTHAN_B0], NULL, 0, NULL },
225 { &cpu_ids[CPU_MP4HT_D0], NULL, 0, NULL }, 225 { &cpu_ids[CPU_MP4HT_D0], NULL, 0, NULL },
226 { &cpu_ids[CPU_MP4HT_E0], NULL, 0, NULL }, 226 { &cpu_ids[CPU_MP4HT_E0], NULL, 0, NULL },
227 227
228 { NULL, } 228 { NULL, }
229 }; 229 };
230 #undef _BANIAS 230 #undef _BANIAS
231 #undef BANIAS 231 #undef BANIAS
232 232
233 static int centrino_cpu_init_table(struct cpufreq_policy *policy) 233 static int centrino_cpu_init_table(struct cpufreq_policy *policy)
234 { 234 {
235 struct cpuinfo_x86 *cpu = &cpu_data(policy->cpu); 235 struct cpuinfo_x86 *cpu = &cpu_data(policy->cpu);
236 struct cpu_model *model; 236 struct cpu_model *model;
237 237
238 for(model = models; model->cpu_id != NULL; model++) 238 for(model = models; model->cpu_id != NULL; model++)
239 if (centrino_verify_cpu_id(cpu, model->cpu_id) && 239 if (centrino_verify_cpu_id(cpu, model->cpu_id) &&
240 (model->model_name == NULL || 240 (model->model_name == NULL ||
241 strcmp(cpu->x86_model_id, model->model_name) == 0)) 241 strcmp(cpu->x86_model_id, model->model_name) == 0))
242 break; 242 break;
243 243
244 if (model->cpu_id == NULL) { 244 if (model->cpu_id == NULL) {
245 /* No match at all */ 245 /* No match at all */
246 dprintk("no support for CPU model \"%s\": " 246 dprintk("no support for CPU model \"%s\": "
247 "send /proc/cpuinfo to " MAINTAINER "\n", 247 "send /proc/cpuinfo to " MAINTAINER "\n",
248 cpu->x86_model_id); 248 cpu->x86_model_id);
249 return -ENOENT; 249 return -ENOENT;
250 } 250 }
251 251
252 if (model->op_points == NULL) { 252 if (model->op_points == NULL) {
253 /* Matched a non-match */ 253 /* Matched a non-match */
254 dprintk("no table support for CPU model \"%s\"\n", 254 dprintk("no table support for CPU model \"%s\"\n",
255 cpu->x86_model_id); 255 cpu->x86_model_id);
256 dprintk("try using the acpi-cpufreq driver\n"); 256 dprintk("try using the acpi-cpufreq driver\n");
257 return -ENOENT; 257 return -ENOENT;
258 } 258 }
259 259
260 per_cpu(centrino_model, policy->cpu) = model; 260 per_cpu(centrino_model, policy->cpu) = model;
261 261
262 dprintk("found \"%s\": max frequency: %dkHz\n", 262 dprintk("found \"%s\": max frequency: %dkHz\n",
263 model->model_name, model->max_freq); 263 model->model_name, model->max_freq);
264 264
265 return 0; 265 return 0;
266 } 266 }
267 267
268 #else 268 #else
269 static inline int centrino_cpu_init_table(struct cpufreq_policy *policy) 269 static inline int centrino_cpu_init_table(struct cpufreq_policy *policy)
270 { 270 {
271 return -ENODEV; 271 return -ENODEV;
272 } 272 }
273 #endif /* CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE */ 273 #endif /* CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE */
274 274
275 static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, 275 static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c,
276 const struct cpu_id *x) 276 const struct cpu_id *x)
277 { 277 {
278 if ((c->x86 == x->x86) && 278 if ((c->x86 == x->x86) &&
279 (c->x86_model == x->x86_model) && 279 (c->x86_model == x->x86_model) &&
280 (c->x86_mask == x->x86_mask)) 280 (c->x86_mask == x->x86_mask))
281 return 1; 281 return 1;
282 return 0; 282 return 0;
283 } 283 }
284 284
285 /* To be called only after centrino_model is initialized */ 285 /* To be called only after centrino_model is initialized */
286 static unsigned extract_clock(unsigned msr, unsigned int cpu, int failsafe) 286 static unsigned extract_clock(unsigned msr, unsigned int cpu, int failsafe)
287 { 287 {
288 int i; 288 int i;
289 289
290 /* 290 /*
291 * Extract clock in kHz from PERF_CTL value 291 * Extract clock in kHz from PERF_CTL value
292 * for centrino, as some DSDTs are buggy. 292 * for centrino, as some DSDTs are buggy.
293 * Ideally, this can be done using the acpi_data structure. 293 * Ideally, this can be done using the acpi_data structure.
294 */ 294 */
295 if ((per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_BANIAS]) || 295 if ((per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_BANIAS]) ||
296 (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_A1]) || 296 (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_A1]) ||
297 (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_B0])) { 297 (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_B0])) {
298 msr = (msr >> 8) & 0xff; 298 msr = (msr >> 8) & 0xff;
299 return msr * 100000; 299 return msr * 100000;
300 } 300 }
301 301
302 if ((!per_cpu(centrino_model, cpu)) || 302 if ((!per_cpu(centrino_model, cpu)) ||
303 (!per_cpu(centrino_model, cpu)->op_points)) 303 (!per_cpu(centrino_model, cpu)->op_points))
304 return 0; 304 return 0;
305 305
306 msr &= 0xffff; 306 msr &= 0xffff;
307 for (i = 0; 307 for (i = 0;
308 per_cpu(centrino_model, cpu)->op_points[i].frequency 308 per_cpu(centrino_model, cpu)->op_points[i].frequency
309 != CPUFREQ_TABLE_END; 309 != CPUFREQ_TABLE_END;
310 i++) { 310 i++) {
311 if (msr == per_cpu(centrino_model, cpu)->op_points[i].index) 311 if (msr == per_cpu(centrino_model, cpu)->op_points[i].index)
312 return per_cpu(centrino_model, cpu)-> 312 return per_cpu(centrino_model, cpu)->
313 op_points[i].frequency; 313 op_points[i].frequency;
314 } 314 }
315 if (failsafe) 315 if (failsafe)
316 return per_cpu(centrino_model, cpu)->op_points[i-1].frequency; 316 return per_cpu(centrino_model, cpu)->op_points[i-1].frequency;
317 else 317 else
318 return 0; 318 return 0;
319 } 319 }
320 320
321 /* Return the current CPU frequency in kHz */ 321 /* Return the current CPU frequency in kHz */
322 static unsigned int get_cur_freq(unsigned int cpu) 322 static unsigned int get_cur_freq(unsigned int cpu)
323 { 323 {
324 unsigned l, h; 324 unsigned l, h;
325 unsigned clock_freq; 325 unsigned clock_freq;
326 cpumask_t saved_mask; 326 cpumask_t saved_mask;
327 327
328 saved_mask = current->cpus_allowed; 328 saved_mask = current->cpus_allowed;
329 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); 329 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
330 if (smp_processor_id() != cpu) 330 if (smp_processor_id() != cpu)
331 return 0; 331 return 0;
332 332
333 rdmsr(MSR_IA32_PERF_STATUS, l, h); 333 rdmsr(MSR_IA32_PERF_STATUS, l, h);
334 clock_freq = extract_clock(l, cpu, 0); 334 clock_freq = extract_clock(l, cpu, 0);
335 335
336 if (unlikely(clock_freq == 0)) { 336 if (unlikely(clock_freq == 0)) {
337 /* 337 /*
338 * On some CPUs, we can see transient MSR values (which are 338 * On some CPUs, we can see transient MSR values (which are
339 * not present in _PSS), while the CPU is doing some automatic 339 * not present in _PSS), while the CPU is doing some automatic
340 * P-state transition (like TM2). Get the last freq set 340 * P-state transition (like TM2). Get the last freq set
341 * in PERF_CTL. 341 * in PERF_CTL.
342 */ 342 */
343 rdmsr(MSR_IA32_PERF_CTL, l, h); 343 rdmsr(MSR_IA32_PERF_CTL, l, h);
344 clock_freq = extract_clock(l, cpu, 1); 344 clock_freq = extract_clock(l, cpu, 1);
345 } 345 }
346 346
347 set_cpus_allowed_ptr(current, &saved_mask); 347 set_cpus_allowed_ptr(current, &saved_mask);
348 return clock_freq; 348 return clock_freq;
349 } 349 }
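The save/pin/read/restore sequence in get_cur_freq() is the usual way to read a per-CPU MSR from process context. A rough user-space analogy of the same shape (assuming Linux with glibc's sched_setaffinity(); shown only to illustrate the pattern, not the driver itself):

        #define _GNU_SOURCE
        #include <sched.h>

        /* Pin the calling thread to 'cpu', run the per-CPU work, then put the
         * original affinity back, mirroring saved_mask handling above. */
        static int run_on_cpu(int cpu)
        {
                cpu_set_t saved, target;

                if (sched_getaffinity(0, sizeof(saved), &saved))
                        return -1;      /* like saving current->cpus_allowed */

                CPU_ZERO(&target);
                CPU_SET(cpu, &target);
                if (sched_setaffinity(0, sizeof(target), &target))
                        return -1;      /* like set_cpus_allowed_ptr(current, ...) */

                /* per-CPU work goes here (the driver reads MSR_IA32_PERF_STATUS) */

                return sched_setaffinity(0, sizeof(saved), &saved);     /* restore */
        }

        int main(void)
        {
                return run_on_cpu(0) ? 1 : 0;
        }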
350 350
351 351
352 static int centrino_cpu_init(struct cpufreq_policy *policy) 352 static int centrino_cpu_init(struct cpufreq_policy *policy)
353 { 353 {
354 struct cpuinfo_x86 *cpu = &cpu_data(policy->cpu); 354 struct cpuinfo_x86 *cpu = &cpu_data(policy->cpu);
355 unsigned freq; 355 unsigned freq;
356 unsigned l, h; 356 unsigned l, h;
357 int ret; 357 int ret;
358 int i; 358 int i;
359 359
360 /* Only Intel makes Enhanced Speedstep-capable CPUs */ 360 /* Only Intel makes Enhanced Speedstep-capable CPUs */
361 if (cpu->x86_vendor != X86_VENDOR_INTEL || 361 if (cpu->x86_vendor != X86_VENDOR_INTEL ||
362 !cpu_has(cpu, X86_FEATURE_EST)) 362 !cpu_has(cpu, X86_FEATURE_EST))
363 return -ENODEV; 363 return -ENODEV;
364 364
365 if (cpu_has(cpu, X86_FEATURE_CONSTANT_TSC)) 365 if (cpu_has(cpu, X86_FEATURE_CONSTANT_TSC))
366 centrino_driver.flags |= CPUFREQ_CONST_LOOPS; 366 centrino_driver.flags |= CPUFREQ_CONST_LOOPS;
367 367
368 if (policy->cpu != 0) 368 if (policy->cpu != 0)
369 return -ENODEV; 369 return -ENODEV;
370 370
371 for (i = 0; i < N_IDS; i++) 371 for (i = 0; i < N_IDS; i++)
372 if (centrino_verify_cpu_id(cpu, &cpu_ids[i])) 372 if (centrino_verify_cpu_id(cpu, &cpu_ids[i]))
373 break; 373 break;
374 374
375 if (i != N_IDS) 375 if (i != N_IDS)
376 per_cpu(centrino_cpu, policy->cpu) = &cpu_ids[i]; 376 per_cpu(centrino_cpu, policy->cpu) = &cpu_ids[i];
377 377
378 if (!per_cpu(centrino_cpu, policy->cpu)) { 378 if (!per_cpu(centrino_cpu, policy->cpu)) {
379 dprintk("found unsupported CPU with " 379 dprintk("found unsupported CPU with "
380 "Enhanced SpeedStep: send /proc/cpuinfo to " 380 "Enhanced SpeedStep: send /proc/cpuinfo to "
381 MAINTAINER "\n"); 381 MAINTAINER "\n");
382 return -ENODEV; 382 return -ENODEV;
383 } 383 }
384 384
385 if (centrino_cpu_init_table(policy)) { 385 if (centrino_cpu_init_table(policy)) {
386 return -ENODEV; 386 return -ENODEV;
387 } 387 }
388 388
389 /* Check to see if Enhanced SpeedStep is enabled, and try to 389 /* Check to see if Enhanced SpeedStep is enabled, and try to
390 enable it if not. */ 390 enable it if not. */
391 rdmsr(MSR_IA32_MISC_ENABLE, l, h); 391 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
392 392
393 if (!(l & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) { 393 if (!(l & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) {
394 l |= MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP; 394 l |= MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP;
395 dprintk("trying to enable Enhanced SpeedStep (%x)\n", l); 395 dprintk("trying to enable Enhanced SpeedStep (%x)\n", l);
396 wrmsr(MSR_IA32_MISC_ENABLE, l, h); 396 wrmsr(MSR_IA32_MISC_ENABLE, l, h);
397 397
398 /* check to see if it stuck */ 398 /* check to see if it stuck */
399 rdmsr(MSR_IA32_MISC_ENABLE, l, h); 399 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
400 if (!(l & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) { 400 if (!(l & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) {
401 printk(KERN_INFO PFX 401 printk(KERN_INFO PFX
402 "couldn't enable Enhanced SpeedStep\n"); 402 "couldn't enable Enhanced SpeedStep\n");
403 return -ENODEV; 403 return -ENODEV;
404 } 404 }
405 } 405 }
406 406
407 freq = get_cur_freq(policy->cpu); 407 freq = get_cur_freq(policy->cpu);
408 policy->cpuinfo.transition_latency = 10000; 408 policy->cpuinfo.transition_latency = 10000;
409 /* 10uS transition latency */ 409 /* 10uS transition latency */
410 policy->cur = freq; 410 policy->cur = freq;
411 411
412 dprintk("centrino_cpu_init: cur=%dkHz\n", policy->cur); 412 dprintk("centrino_cpu_init: cur=%dkHz\n", policy->cur);
413 413
414 ret = cpufreq_frequency_table_cpuinfo(policy, 414 ret = cpufreq_frequency_table_cpuinfo(policy,
415 per_cpu(centrino_model, policy->cpu)->op_points); 415 per_cpu(centrino_model, policy->cpu)->op_points);
416 if (ret) 416 if (ret)
417 return (ret); 417 return (ret);
418 418
419 cpufreq_frequency_table_get_attr( 419 cpufreq_frequency_table_get_attr(
420 per_cpu(centrino_model, policy->cpu)->op_points, policy->cpu); 420 per_cpu(centrino_model, policy->cpu)->op_points, policy->cpu);
421 421
422 return 0; 422 return 0;
423 } 423 }
424 424
425 static int centrino_cpu_exit(struct cpufreq_policy *policy) 425 static int centrino_cpu_exit(struct cpufreq_policy *policy)
426 { 426 {
427 unsigned int cpu = policy->cpu; 427 unsigned int cpu = policy->cpu;
428 428
429 if (!per_cpu(centrino_model, cpu)) 429 if (!per_cpu(centrino_model, cpu))
430 return -ENODEV; 430 return -ENODEV;
431 431
432 cpufreq_frequency_table_put_attr(cpu); 432 cpufreq_frequency_table_put_attr(cpu);
433 433
434 per_cpu(centrino_model, cpu) = NULL; 434 per_cpu(centrino_model, cpu) = NULL;
435 435
436 return 0; 436 return 0;
437 } 437 }
438 438
439 /** 439 /**
440 * centrino_verify - verifies a new CPUFreq policy 440 * centrino_verify - verifies a new CPUFreq policy
441 * @policy: new policy 441 * @policy: new policy
442 * 442 *
443 * Limit must be within this model's frequency range, with at 443 * Limit must be within this model's frequency range, with at
444 * least one border included. 444 * least one border included.
445 */ 445 */
446 static int centrino_verify (struct cpufreq_policy *policy) 446 static int centrino_verify (struct cpufreq_policy *policy)
447 { 447 {
448 return cpufreq_frequency_table_verify(policy, 448 return cpufreq_frequency_table_verify(policy,
449 per_cpu(centrino_model, policy->cpu)->op_points); 449 per_cpu(centrino_model, policy->cpu)->op_points);
450 } 450 }
451 451
452 /** 452 /**
453 * centrino_target - set a new CPUFreq policy 453 * centrino_target - set a new CPUFreq policy
454 * @policy: new policy 454 * @policy: new policy
455 * @target_freq: the target frequency 455 * @target_freq: the target frequency
456 * @relation: how that frequency relates to achieved frequency 456 * @relation: how that frequency relates to achieved frequency
457 * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H) 457 * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
458 * 458 *
459 * Sets a new CPUFreq policy. 459 * Sets a new CPUFreq policy.
460 */ 460 */
461 static int centrino_target (struct cpufreq_policy *policy, 461 static int centrino_target (struct cpufreq_policy *policy,
462 unsigned int target_freq, 462 unsigned int target_freq,
463 unsigned int relation) 463 unsigned int relation)
464 { 464 {
465 unsigned int newstate = 0; 465 unsigned int newstate = 0;
466 unsigned int msr, oldmsr = 0, h = 0, cpu = policy->cpu; 466 unsigned int msr, oldmsr = 0, h = 0, cpu = policy->cpu;
467 struct cpufreq_freqs freqs; 467 struct cpufreq_freqs freqs;
468 int retval = 0; 468 int retval = 0;
469 unsigned int j, k, first_cpu, tmp; 469 unsigned int j, k, first_cpu, tmp;
470 cpumask_var_t saved_mask, covered_cpus; 470 cpumask_var_t saved_mask, covered_cpus;
471 471
472 if (unlikely(!alloc_cpumask_var(&saved_mask, GFP_KERNEL))) 472 if (unlikely(!alloc_cpumask_var(&saved_mask, GFP_KERNEL)))
473 return -ENOMEM; 473 return -ENOMEM;
474 if (unlikely(!alloc_cpumask_var(&covered_cpus, GFP_KERNEL))) { 474 if (unlikely(!zalloc_cpumask_var(&covered_cpus, GFP_KERNEL))) {
475 free_cpumask_var(saved_mask); 475 free_cpumask_var(saved_mask);
476 return -ENOMEM; 476 return -ENOMEM;
477 } 477 }
478 cpumask_copy(saved_mask, &current->cpus_allowed); 478 cpumask_copy(saved_mask, &current->cpus_allowed);
479 479
480 if (unlikely(per_cpu(centrino_model, cpu) == NULL)) { 480 if (unlikely(per_cpu(centrino_model, cpu) == NULL)) {
481 retval = -ENODEV; 481 retval = -ENODEV;
482 goto out; 482 goto out;
483 } 483 }
484 484
485 if (unlikely(cpufreq_frequency_table_target(policy, 485 if (unlikely(cpufreq_frequency_table_target(policy,
486 per_cpu(centrino_model, cpu)->op_points, 486 per_cpu(centrino_model, cpu)->op_points,
487 target_freq, 487 target_freq,
488 relation, 488 relation,
489 &newstate))) { 489 &newstate))) {
490 retval = -EINVAL; 490 retval = -EINVAL;
491 goto out; 491 goto out;
492 } 492 }
493 493
494 first_cpu = 1; 494 first_cpu = 1;
495 for_each_cpu(j, policy->cpus) { 495 for_each_cpu(j, policy->cpus) {
496 const struct cpumask *mask; 496 const struct cpumask *mask;
497 497
498 /* cpufreq holds the hotplug lock, so we are safe here */ 498 /* cpufreq holds the hotplug lock, so we are safe here */
499 if (!cpu_online(j)) 499 if (!cpu_online(j))
500 continue; 500 continue;
501 501
502 /* 502 /*
503 * Support for SMP systems. 503 * Support for SMP systems.
504 * Make sure we are running on the CPU that wants to change freq 504 * Make sure we are running on the CPU that wants to change freq
505 */ 505 */
506 if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) 506 if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY)
507 mask = policy->cpus; 507 mask = policy->cpus;
508 else 508 else
509 mask = cpumask_of(j); 509 mask = cpumask_of(j);
510 510
511 set_cpus_allowed_ptr(current, mask); 511 set_cpus_allowed_ptr(current, mask);
512 preempt_disable(); 512 preempt_disable();
513 if (unlikely(!cpu_isset(smp_processor_id(), *mask))) { 513 if (unlikely(!cpu_isset(smp_processor_id(), *mask))) {
514 dprintk("couldn't limit to CPUs in this domain\n"); 514 dprintk("couldn't limit to CPUs in this domain\n");
515 retval = -EAGAIN; 515 retval = -EAGAIN;
516 if (first_cpu) { 516 if (first_cpu) {
517 /* We haven't started the transition yet. */ 517 /* We haven't started the transition yet. */
518 goto migrate_end; 518 goto migrate_end;
519 } 519 }
520 preempt_enable(); 520 preempt_enable();
521 break; 521 break;
522 } 522 }
523 523
524 msr = per_cpu(centrino_model, cpu)->op_points[newstate].index; 524 msr = per_cpu(centrino_model, cpu)->op_points[newstate].index;
525 525
526 if (first_cpu) { 526 if (first_cpu) {
527 rdmsr(MSR_IA32_PERF_CTL, oldmsr, h); 527 rdmsr(MSR_IA32_PERF_CTL, oldmsr, h);
528 if (msr == (oldmsr & 0xffff)) { 528 if (msr == (oldmsr & 0xffff)) {
529 dprintk("no change needed - msr was and needs " 529 dprintk("no change needed - msr was and needs "
530 "to be %x\n", oldmsr); 530 "to be %x\n", oldmsr);
531 retval = 0; 531 retval = 0;
532 goto migrate_end; 532 goto migrate_end;
533 } 533 }
534 534
535 freqs.old = extract_clock(oldmsr, cpu, 0); 535 freqs.old = extract_clock(oldmsr, cpu, 0);
536 freqs.new = extract_clock(msr, cpu, 0); 536 freqs.new = extract_clock(msr, cpu, 0);
537 537
538 dprintk("target=%dkHz old=%d new=%d msr=%04x\n", 538 dprintk("target=%dkHz old=%d new=%d msr=%04x\n",
539 target_freq, freqs.old, freqs.new, msr); 539 target_freq, freqs.old, freqs.new, msr);
540 540
541 for_each_cpu(k, policy->cpus) { 541 for_each_cpu(k, policy->cpus) {
542 if (!cpu_online(k)) 542 if (!cpu_online(k))
543 continue; 543 continue;
544 freqs.cpu = k; 544 freqs.cpu = k;
545 cpufreq_notify_transition(&freqs, 545 cpufreq_notify_transition(&freqs,
546 CPUFREQ_PRECHANGE); 546 CPUFREQ_PRECHANGE);
547 } 547 }
548 548
549 first_cpu = 0; 549 first_cpu = 0;
550 /* all but 16 LSB are reserved, treat them with care */ 550 /* all but 16 LSB are reserved, treat them with care */
551 oldmsr &= ~0xffff; 551 oldmsr &= ~0xffff;
552 msr &= 0xffff; 552 msr &= 0xffff;
553 oldmsr |= msr; 553 oldmsr |= msr;
554 } 554 }
555 555
556 wrmsr(MSR_IA32_PERF_CTL, oldmsr, h); 556 wrmsr(MSR_IA32_PERF_CTL, oldmsr, h);
557 if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) { 557 if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) {
558 preempt_enable(); 558 preempt_enable();
559 break; 559 break;
560 } 560 }
561 561
562 cpu_set(j, *covered_cpus); 562 cpu_set(j, *covered_cpus);
563 preempt_enable(); 563 preempt_enable();
564 } 564 }
565 565
566 for_each_cpu(k, policy->cpus) { 566 for_each_cpu(k, policy->cpus) {
567 if (!cpu_online(k)) 567 if (!cpu_online(k))
568 continue; 568 continue;
569 freqs.cpu = k; 569 freqs.cpu = k;
570 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 570 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
571 } 571 }
572 572
573 if (unlikely(retval)) { 573 if (unlikely(retval)) {
574 /* 574 /*
575 * We have failed halfway through the frequency change. 575 * We have failed halfway through the frequency change.
576 * We have sent callbacks to policy->cpus and 576 * We have sent callbacks to policy->cpus and
577 * MSRs have already been written on covered_cpus. 577 * MSRs have already been written on covered_cpus.
578 * Best effort undo.. 578 * Best effort undo..
579 */ 579 */
580 580
581 for_each_cpu_mask_nr(j, *covered_cpus) { 581 for_each_cpu_mask_nr(j, *covered_cpus) {
582 set_cpus_allowed_ptr(current, &cpumask_of_cpu(j)); 582 set_cpus_allowed_ptr(current, &cpumask_of_cpu(j));
583 wrmsr(MSR_IA32_PERF_CTL, oldmsr, h); 583 wrmsr(MSR_IA32_PERF_CTL, oldmsr, h);
584 } 584 }
585 585
586 tmp = freqs.new; 586 tmp = freqs.new;
587 freqs.new = freqs.old; 587 freqs.new = freqs.old;
588 freqs.old = tmp; 588 freqs.old = tmp;
589 for_each_cpu(j, policy->cpus) { 589 for_each_cpu(j, policy->cpus) {
590 if (!cpu_online(j)) 590 if (!cpu_online(j))
591 continue; 591 continue;
592 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 592 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
593 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 593 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
594 } 594 }
595 } 595 }
596 set_cpus_allowed_ptr(current, saved_mask); 596 set_cpus_allowed_ptr(current, saved_mask);
597 retval = 0; 597 retval = 0;
598 goto out; 598 goto out;
599 599
600 migrate_end: 600 migrate_end:
601 preempt_enable(); 601 preempt_enable();
602 set_cpus_allowed_ptr(current, saved_mask); 602 set_cpus_allowed_ptr(current, saved_mask);
603 out: 603 out:
604 free_cpumask_var(saved_mask); 604 free_cpumask_var(saved_mask);
605 free_cpumask_var(covered_cpus); 605 free_cpumask_var(covered_cpus);
606 return retval; 606 return retval;
607 } 607 }
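The functional change in this file's hunk swaps alloc_cpumask_var() for zalloc_cpumask_var() when allocating covered_cpus. That mask is only ever OR-ed into (cpu_set()) and then walked by the best-effort undo loop, so it must start with every bit clear; zalloc_cpumask_var() hands it back zeroed, which alloc_cpumask_var() does not promise (under MAXSMP, i.e. CONFIG_CPUMASK_OFFSTACK, it is a plain heap allocation). A rough user-space analogy, not kernel code, is malloc versus calloc:

        #include <stdio.h>
        #include <stdlib.h>
        #include <string.h>

        #define MASK_BYTES 64   /* stand-in for cpumask_size() under MAXSMP */

        int main(void)
        {
                unsigned char *zeroed = calloc(1, MASK_BYTES);  /* ~ zalloc_cpumask_var() */
                unsigned char *raw    = malloc(MASK_BYTES);     /* ~ alloc_cpumask_var()  */

                if (!zeroed || !raw)
                        return 1;

                /* Nothing in centrino_target() clears bits before the undo walk,
                 * so stale data in 'raw' would read as CPUs already covered. */
                memset(raw, 0xa5, MASK_BYTES);  /* simulate uninitialized heap data */
                printf("zeroed[0] = %#x, raw[0] = %#x\n", zeroed[0], raw[0]);

                free(zeroed);
                free(raw);
                return 0;
        }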
608 608
609 static struct freq_attr* centrino_attr[] = { 609 static struct freq_attr* centrino_attr[] = {
610 &cpufreq_freq_attr_scaling_available_freqs, 610 &cpufreq_freq_attr_scaling_available_freqs,
611 NULL, 611 NULL,
612 }; 612 };
613 613
614 static struct cpufreq_driver centrino_driver = { 614 static struct cpufreq_driver centrino_driver = {
615 .name = "centrino", /* should be speedstep-centrino, 615 .name = "centrino", /* should be speedstep-centrino,
616 but there's a 16 char limit */ 616 but there's a 16 char limit */
617 .init = centrino_cpu_init, 617 .init = centrino_cpu_init,
618 .exit = centrino_cpu_exit, 618 .exit = centrino_cpu_exit,
619 .verify = centrino_verify, 619 .verify = centrino_verify,
620 .target = centrino_target, 620 .target = centrino_target,
621 .get = get_cur_freq, 621 .get = get_cur_freq,
622 .attr = centrino_attr, 622 .attr = centrino_attr,
623 .owner = THIS_MODULE, 623 .owner = THIS_MODULE,
624 }; 624 };
625 625
626 626
627 /** 627 /**
628 * centrino_init - initializes the Enhanced SpeedStep CPUFreq driver 628 * centrino_init - initializes the Enhanced SpeedStep CPUFreq driver
629 * 629 *
630 * Initializes the Enhanced SpeedStep support. Returns -ENODEV on 630 * Initializes the Enhanced SpeedStep support. Returns -ENODEV on
631 * unsupported devices, -ENOENT if there's no voltage table for this 631 * unsupported devices, -ENOENT if there's no voltage table for this
632 * particular CPU model, -EINVAL on problems during initialization, 632 * particular CPU model, -EINVAL on problems during initialization,
633 * and zero on success. 633 * and zero on success.
634 * 634 *
635 * This is quite picky. Not only does the CPU have to advertise the 635 * This is quite picky. Not only does the CPU have to advertise the
636 * "est" flag in the cpuid capability flags, we look for a specific 636 * "est" flag in the cpuid capability flags, we look for a specific
637 * CPU model and stepping, and we need to have the exact model name in 637 * CPU model and stepping, and we need to have the exact model name in
638 * our voltage tables. That is, be paranoid about not releasing 638 * our voltage tables. That is, be paranoid about not releasing
639 * someone's valuable magic smoke. 639 * someone's valuable magic smoke.
640 */ 640 */
641 static int __init centrino_init(void) 641 static int __init centrino_init(void)
642 { 642 {
643 struct cpuinfo_x86 *cpu = &cpu_data(0); 643 struct cpuinfo_x86 *cpu = &cpu_data(0);
644 644
645 if (!cpu_has(cpu, X86_FEATURE_EST)) 645 if (!cpu_has(cpu, X86_FEATURE_EST))
646 return -ENODEV; 646 return -ENODEV;
647 647
648 return cpufreq_register_driver(&centrino_driver); 648 return cpufreq_register_driver(&centrino_driver);
649 } 649 }
650 650
651 static void __exit centrino_exit(void) 651 static void __exit centrino_exit(void)
652 { 652 {
653 cpufreq_unregister_driver(&centrino_driver); 653 cpufreq_unregister_driver(&centrino_driver);
654 } 654 }
655 655
656 MODULE_AUTHOR ("Jeremy Fitzhardinge <jeremy@goop.org>"); 656 MODULE_AUTHOR ("Jeremy Fitzhardinge <jeremy@goop.org>");
657 MODULE_DESCRIPTION ("Enhanced SpeedStep driver for Intel Pentium M processors."); 657 MODULE_DESCRIPTION ("Enhanced SpeedStep driver for Intel Pentium M processors.");
658 MODULE_LICENSE ("GPL"); 658 MODULE_LICENSE ("GPL");
659 659
660 late_initcall(centrino_init); 660 late_initcall(centrino_init);
661 module_exit(centrino_exit); 661 module_exit(centrino_exit);
662 662
arch/x86/kernel/cpu/mcheck/mce_64.c
1 /* 1 /*
2 * Machine check handler. 2 * Machine check handler.
3 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. 3 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
4 * Rest from unknown author(s). 4 * Rest from unknown author(s).
5 * 2004 Andi Kleen. Rewrote most of it. 5 * 2004 Andi Kleen. Rewrote most of it.
6 * Copyright 2008 Intel Corporation 6 * Copyright 2008 Intel Corporation
7 * Author: Andi Kleen 7 * Author: Andi Kleen
8 */ 8 */
9 9
10 #include <linux/init.h> 10 #include <linux/init.h>
11 #include <linux/types.h> 11 #include <linux/types.h>
12 #include <linux/kernel.h> 12 #include <linux/kernel.h>
13 #include <linux/sched.h> 13 #include <linux/sched.h>
14 #include <linux/smp_lock.h> 14 #include <linux/smp_lock.h>
15 #include <linux/string.h> 15 #include <linux/string.h>
16 #include <linux/rcupdate.h> 16 #include <linux/rcupdate.h>
17 #include <linux/kallsyms.h> 17 #include <linux/kallsyms.h>
18 #include <linux/sysdev.h> 18 #include <linux/sysdev.h>
19 #include <linux/miscdevice.h> 19 #include <linux/miscdevice.h>
20 #include <linux/fs.h> 20 #include <linux/fs.h>
21 #include <linux/capability.h> 21 #include <linux/capability.h>
22 #include <linux/cpu.h> 22 #include <linux/cpu.h>
23 #include <linux/percpu.h> 23 #include <linux/percpu.h>
24 #include <linux/poll.h> 24 #include <linux/poll.h>
25 #include <linux/thread_info.h> 25 #include <linux/thread_info.h>
26 #include <linux/ctype.h> 26 #include <linux/ctype.h>
27 #include <linux/kmod.h> 27 #include <linux/kmod.h>
28 #include <linux/kdebug.h> 28 #include <linux/kdebug.h>
29 #include <linux/kobject.h> 29 #include <linux/kobject.h>
30 #include <linux/sysfs.h> 30 #include <linux/sysfs.h>
31 #include <linux/ratelimit.h> 31 #include <linux/ratelimit.h>
32 #include <asm/processor.h> 32 #include <asm/processor.h>
33 #include <asm/msr.h> 33 #include <asm/msr.h>
34 #include <asm/mce.h> 34 #include <asm/mce.h>
35 #include <asm/uaccess.h> 35 #include <asm/uaccess.h>
36 #include <asm/smp.h> 36 #include <asm/smp.h>
37 #include <asm/idle.h> 37 #include <asm/idle.h>
38 38
39 #define MISC_MCELOG_MINOR 227 39 #define MISC_MCELOG_MINOR 227
40 40
41 atomic_t mce_entry; 41 atomic_t mce_entry;
42 42
43 static int mce_dont_init; 43 static int mce_dont_init;
44 44
45 /* 45 /*
46 * Tolerant levels: 46 * Tolerant levels:
47 * 0: always panic on uncorrected errors, log corrected errors 47 * 0: always panic on uncorrected errors, log corrected errors
48 * 1: panic or SIGBUS on uncorrected errors, log corrected errors 48 * 1: panic or SIGBUS on uncorrected errors, log corrected errors
49 * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors 49 * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors
50 * 3: never panic or SIGBUS, log all errors (for testing only) 50 * 3: never panic or SIGBUS, log all errors (for testing only)
51 */ 51 */
52 static int tolerant = 1; 52 static int tolerant = 1;
53 static int banks; 53 static int banks;
54 static u64 *bank; 54 static u64 *bank;
55 static unsigned long notify_user; 55 static unsigned long notify_user;
56 static int rip_msr; 56 static int rip_msr;
57 static int mce_bootlog = -1; 57 static int mce_bootlog = -1;
58 static atomic_t mce_events; 58 static atomic_t mce_events;
59 59
60 static char trigger[128]; 60 static char trigger[128];
61 static char *trigger_argv[2] = { trigger, NULL }; 61 static char *trigger_argv[2] = { trigger, NULL };
62 62
63 static DECLARE_WAIT_QUEUE_HEAD(mce_wait); 63 static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
64 64
65 /* MCA banks polled by the periodic polling timer for corrected events */ 65 /* MCA banks polled by the periodic polling timer for corrected events */
66 DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { 66 DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
67 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL 67 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
68 }; 68 };
69 69
70 /* Do initial initialization of a struct mce */ 70 /* Do initial initialization of a struct mce */
71 void mce_setup(struct mce *m) 71 void mce_setup(struct mce *m)
72 { 72 {
73 memset(m, 0, sizeof(struct mce)); 73 memset(m, 0, sizeof(struct mce));
74 m->cpu = smp_processor_id(); 74 m->cpu = smp_processor_id();
75 rdtscll(m->tsc); 75 rdtscll(m->tsc);
76 } 76 }
77 77
78 /* 78 /*
79 * Lockless MCE logging infrastructure. 79 * Lockless MCE logging infrastructure.
80 * This avoids deadlocks on printk locks without having to break locks. Also 80 * This avoids deadlocks on printk locks without having to break locks. Also
81 * separate MCEs from kernel messages to avoid bogus bug reports. 81 * separate MCEs from kernel messages to avoid bogus bug reports.
82 */ 82 */
83 83
84 static struct mce_log mcelog = { 84 static struct mce_log mcelog = {
85 MCE_LOG_SIGNATURE, 85 MCE_LOG_SIGNATURE,
86 MCE_LOG_LEN, 86 MCE_LOG_LEN,
87 }; 87 };
88 88
89 void mce_log(struct mce *mce) 89 void mce_log(struct mce *mce)
90 { 90 {
91 unsigned next, entry; 91 unsigned next, entry;
92 atomic_inc(&mce_events); 92 atomic_inc(&mce_events);
93 mce->finished = 0; 93 mce->finished = 0;
94 wmb(); 94 wmb();
95 for (;;) { 95 for (;;) {
96 entry = rcu_dereference(mcelog.next); 96 entry = rcu_dereference(mcelog.next);
97 for (;;) { 97 for (;;) {
98 /* When the buffer fills up, discard new entries. Assume 98 /* When the buffer fills up, discard new entries. Assume
99 that the earlier errors are more interesting. */ 99 that the earlier errors are more interesting. */
100 if (entry >= MCE_LOG_LEN) { 100 if (entry >= MCE_LOG_LEN) {
101 set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags); 101 set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags);
102 return; 102 return;
103 } 103 }
104 /* Old left over entry. Skip. */ 104 /* Old left over entry. Skip. */
105 if (mcelog.entry[entry].finished) { 105 if (mcelog.entry[entry].finished) {
106 entry++; 106 entry++;
107 continue; 107 continue;
108 } 108 }
109 break; 109 break;
110 } 110 }
111 smp_rmb(); 111 smp_rmb();
112 next = entry + 1; 112 next = entry + 1;
113 if (cmpxchg(&mcelog.next, entry, next) == entry) 113 if (cmpxchg(&mcelog.next, entry, next) == entry)
114 break; 114 break;
115 } 115 }
116 memcpy(mcelog.entry + entry, mce, sizeof(struct mce)); 116 memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
117 wmb(); 117 wmb();
118 mcelog.entry[entry].finished = 1; 118 mcelog.entry[entry].finished = 1;
119 wmb(); 119 wmb();
120 120
121 set_bit(0, &notify_user); 121 set_bit(0, &notify_user);
122 } 122 }
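The loop in mce_log() reserves a log slot locklessly: each would-be writer reads mcelog.next, skips finished leftovers, and only the writer whose cmpxchg() on next succeeds owns that entry. A condensed user-space sketch of the claim step (C11 atomics, with the finished-entry skip omitted):

        #include <stdatomic.h>
        #include <stdio.h>

        #define LOG_LEN 32                      /* like MCE_LOG_LEN */

        static _Atomic unsigned next;
        static int entries[LOG_LEN];

        /* Returns the claimed slot, or -1 when full (the kernel sets
         * MCE_OVERFLOW and drops the record in that case). */
        static int claim_slot(void)
        {
                unsigned entry;

                for (;;) {
                        entry = atomic_load(&next);
                        if (entry >= LOG_LEN)
                                return -1;
                        /* cmpxchg(&mcelog.next, entry, entry + 1) in the kernel */
                        if (atomic_compare_exchange_weak(&next, &entry, entry + 1))
                                return (int)entry;
                }
        }

        int main(void)
        {
                int slot = claim_slot();

                if (slot >= 0)
                        entries[slot] = 42;     /* the record is now ours to fill */
                printf("claimed slot %d\n", slot);
                return 0;
        }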
123 123
124 static void print_mce(struct mce *m) 124 static void print_mce(struct mce *m)
125 { 125 {
126 printk(KERN_EMERG "\n" 126 printk(KERN_EMERG "\n"
127 KERN_EMERG "HARDWARE ERROR\n" 127 KERN_EMERG "HARDWARE ERROR\n"
128 KERN_EMERG 128 KERN_EMERG
129 "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", 129 "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
130 m->cpu, m->mcgstatus, m->bank, m->status); 130 m->cpu, m->mcgstatus, m->bank, m->status);
131 if (m->ip) { 131 if (m->ip) {
132 printk(KERN_EMERG "RIP%s %02x:<%016Lx> ", 132 printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
133 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", 133 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
134 m->cs, m->ip); 134 m->cs, m->ip);
135 if (m->cs == __KERNEL_CS) 135 if (m->cs == __KERNEL_CS)
136 print_symbol("{%s}", m->ip); 136 print_symbol("{%s}", m->ip);
137 printk("\n"); 137 printk("\n");
138 } 138 }
139 printk(KERN_EMERG "TSC %llx ", m->tsc); 139 printk(KERN_EMERG "TSC %llx ", m->tsc);
140 if (m->addr) 140 if (m->addr)
141 printk("ADDR %llx ", m->addr); 141 printk("ADDR %llx ", m->addr);
142 if (m->misc) 142 if (m->misc)
143 printk("MISC %llx ", m->misc); 143 printk("MISC %llx ", m->misc);
144 printk("\n"); 144 printk("\n");
145 printk(KERN_EMERG "This is not a software problem!\n"); 145 printk(KERN_EMERG "This is not a software problem!\n");
146 printk(KERN_EMERG "Run through mcelog --ascii to decode " 146 printk(KERN_EMERG "Run through mcelog --ascii to decode "
147 "and contact your hardware vendor\n"); 147 "and contact your hardware vendor\n");
148 } 148 }
149 149
150 static void mce_panic(char *msg, struct mce *backup, unsigned long start) 150 static void mce_panic(char *msg, struct mce *backup, unsigned long start)
151 { 151 {
152 int i; 152 int i;
153 153
154 oops_begin(); 154 oops_begin();
155 for (i = 0; i < MCE_LOG_LEN; i++) { 155 for (i = 0; i < MCE_LOG_LEN; i++) {
156 unsigned long tsc = mcelog.entry[i].tsc; 156 unsigned long tsc = mcelog.entry[i].tsc;
157 157
158 if (time_before(tsc, start)) 158 if (time_before(tsc, start))
159 continue; 159 continue;
160 print_mce(&mcelog.entry[i]); 160 print_mce(&mcelog.entry[i]);
161 if (backup && mcelog.entry[i].tsc == backup->tsc) 161 if (backup && mcelog.entry[i].tsc == backup->tsc)
162 backup = NULL; 162 backup = NULL;
163 } 163 }
164 if (backup) 164 if (backup)
165 print_mce(backup); 165 print_mce(backup);
166 panic(msg); 166 panic(msg);
167 } 167 }
168 168
169 int mce_available(struct cpuinfo_x86 *c) 169 int mce_available(struct cpuinfo_x86 *c)
170 { 170 {
171 if (mce_dont_init) 171 if (mce_dont_init)
172 return 0; 172 return 0;
173 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); 173 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
174 } 174 }
175 175
176 static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) 176 static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
177 { 177 {
178 if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) { 178 if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
179 m->ip = regs->ip; 179 m->ip = regs->ip;
180 m->cs = regs->cs; 180 m->cs = regs->cs;
181 } else { 181 } else {
182 m->ip = 0; 182 m->ip = 0;
183 m->cs = 0; 183 m->cs = 0;
184 } 184 }
185 if (rip_msr) { 185 if (rip_msr) {
186 /* Assume the RIP in the MSR is exact. Is this true? */ 186 /* Assume the RIP in the MSR is exact. Is this true? */
187 m->mcgstatus |= MCG_STATUS_EIPV; 187 m->mcgstatus |= MCG_STATUS_EIPV;
188 rdmsrl(rip_msr, m->ip); 188 rdmsrl(rip_msr, m->ip);
189 m->cs = 0; 189 m->cs = 0;
190 } 190 }
191 } 191 }
192 192
193 /* 193 /*
194 * Poll for corrected events or events that happened before reset. 194 * Poll for corrected events or events that happened before reset.
195 * Those are just logged through /dev/mcelog. 195 * Those are just logged through /dev/mcelog.
196 * 196 *
197 * This is executed in standard interrupt context. 197 * This is executed in standard interrupt context.
198 */ 198 */
199 void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) 199 void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
200 { 200 {
201 struct mce m; 201 struct mce m;
202 int i; 202 int i;
203 203
204 mce_setup(&m); 204 mce_setup(&m);
205 205
206 rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); 206 rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
207 for (i = 0; i < banks; i++) { 207 for (i = 0; i < banks; i++) {
208 if (!bank[i] || !test_bit(i, *b)) 208 if (!bank[i] || !test_bit(i, *b))
209 continue; 209 continue;
210 210
211 m.misc = 0; 211 m.misc = 0;
212 m.addr = 0; 212 m.addr = 0;
213 m.bank = i; 213 m.bank = i;
214 m.tsc = 0; 214 m.tsc = 0;
215 215
216 barrier(); 216 barrier();
217 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); 217 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
218 if (!(m.status & MCI_STATUS_VAL)) 218 if (!(m.status & MCI_STATUS_VAL))
219 continue; 219 continue;
220 220
221 /* 221 /*
222 * Uncorrected events are handled by the exception handler 222 * Uncorrected events are handled by the exception handler
223 * when it is enabled. But when the exception is disabled log 223 * when it is enabled. But when the exception is disabled log
224 * everything. 224 * everything.
225 * 225 *
226 * TBD do the same check for MCI_STATUS_EN here? 226 * TBD do the same check for MCI_STATUS_EN here?
227 */ 227 */
228 if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC)) 228 if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
229 continue; 229 continue;
230 230
231 if (m.status & MCI_STATUS_MISCV) 231 if (m.status & MCI_STATUS_MISCV)
232 rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc); 232 rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
233 if (m.status & MCI_STATUS_ADDRV) 233 if (m.status & MCI_STATUS_ADDRV)
234 rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); 234 rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
235 235
236 if (!(flags & MCP_TIMESTAMP)) 236 if (!(flags & MCP_TIMESTAMP))
237 m.tsc = 0; 237 m.tsc = 0;
238 /* 238 /*
239 * Don't get the IP here because it's unlikely to 239 * Don't get the IP here because it's unlikely to
240 * have anything to do with the actual error location. 240 * have anything to do with the actual error location.
241 */ 241 */
242 if (!(flags & MCP_DONTLOG)) { 242 if (!(flags & MCP_DONTLOG)) {
243 mce_log(&m); 243 mce_log(&m);
244 add_taint(TAINT_MACHINE_CHECK); 244 add_taint(TAINT_MACHINE_CHECK);
245 } 245 }
246 246
247 /* 247 /*
248 * Clear state for this bank. 248 * Clear state for this bank.
249 */ 249 */
250 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); 250 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
251 } 251 }
252 252
253 /* 253 /*
254 * Don't clear MCG_STATUS here because it's only defined for 254 * Don't clear MCG_STATUS here because it's only defined for
255 * exceptions. 255 * exceptions.
256 */ 256 */
257 } 257 }
258 258
259 /* 259 /*
260 * The actual machine check handler. This only handles real 260 * The actual machine check handler. This only handles real
261 * exceptions when something got corrupted coming in through int 18. 261 * exceptions when something got corrupted coming in through int 18.
262 * 262 *
263 * This is executed in NMI context not subject to normal locking rules. This 263 * This is executed in NMI context not subject to normal locking rules. This
264 * implies that most kernel services cannot be safely used. Don't even 264 * implies that most kernel services cannot be safely used. Don't even
265 * think about putting a printk in there! 265 * think about putting a printk in there!
266 */ 266 */
267 void do_machine_check(struct pt_regs * regs, long error_code) 267 void do_machine_check(struct pt_regs * regs, long error_code)
268 { 268 {
269 struct mce m, panicm; 269 struct mce m, panicm;
270 u64 mcestart = 0; 270 u64 mcestart = 0;
271 int i; 271 int i;
272 int panicm_found = 0; 272 int panicm_found = 0;
273 /* 273 /*
274 * If no_way_out gets set, there is no safe way to recover from this 274 * If no_way_out gets set, there is no safe way to recover from this
275 * MCE. If tolerant is cranked up, we'll try anyway. 275 * MCE. If tolerant is cranked up, we'll try anyway.
276 */ 276 */
277 int no_way_out = 0; 277 int no_way_out = 0;
278 /* 278 /*
279 * If kill_it gets set, there might be a way to recover from this 279 * If kill_it gets set, there might be a way to recover from this
280 * error. 280 * error.
281 */ 281 */
282 int kill_it = 0; 282 int kill_it = 0;
283 DECLARE_BITMAP(toclear, MAX_NR_BANKS); 283 DECLARE_BITMAP(toclear, MAX_NR_BANKS);
284 284
285 atomic_inc(&mce_entry); 285 atomic_inc(&mce_entry);
286 286
287 if (notify_die(DIE_NMI, "machine check", regs, error_code, 287 if (notify_die(DIE_NMI, "machine check", regs, error_code,
288 18, SIGKILL) == NOTIFY_STOP) 288 18, SIGKILL) == NOTIFY_STOP)
289 goto out2; 289 goto out2;
290 if (!banks) 290 if (!banks)
291 goto out2; 291 goto out2;
292 292
293 mce_setup(&m); 293 mce_setup(&m);
294 294
295 rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); 295 rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
296 /* if the restart IP is not valid, we're done for */ 296 /* if the restart IP is not valid, we're done for */
297 if (!(m.mcgstatus & MCG_STATUS_RIPV)) 297 if (!(m.mcgstatus & MCG_STATUS_RIPV))
298 no_way_out = 1; 298 no_way_out = 1;
299 299
300 rdtscll(mcestart); 300 rdtscll(mcestart);
301 barrier(); 301 barrier();
302 302
303 for (i = 0; i < banks; i++) { 303 for (i = 0; i < banks; i++) {
304 __clear_bit(i, toclear); 304 __clear_bit(i, toclear);
305 if (!bank[i]) 305 if (!bank[i])
306 continue; 306 continue;
307 307
308 m.misc = 0; 308 m.misc = 0;
309 m.addr = 0; 309 m.addr = 0;
310 m.bank = i; 310 m.bank = i;
311 311
312 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); 312 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
313 if ((m.status & MCI_STATUS_VAL) == 0) 313 if ((m.status & MCI_STATUS_VAL) == 0)
314 continue; 314 continue;
315 315
316 /* 316 /*
317 * Non-uncorrected errors are handled by machine_check_poll. 317 * Non-uncorrected errors are handled by machine_check_poll.
318 * Leave them alone. 318 * Leave them alone.
319 */ 319 */
320 if ((m.status & MCI_STATUS_UC) == 0) 320 if ((m.status & MCI_STATUS_UC) == 0)
321 continue; 321 continue;
322 322
323 /* 323 /*
324 * Set taint even when machine check was not enabled. 324 * Set taint even when machine check was not enabled.
325 */ 325 */
326 add_taint(TAINT_MACHINE_CHECK); 326 add_taint(TAINT_MACHINE_CHECK);
327 327
328 __set_bit(i, toclear); 328 __set_bit(i, toclear);
329 329
330 if (m.status & MCI_STATUS_EN) { 330 if (m.status & MCI_STATUS_EN) {
331 /* if PCC was set, there's no way out */ 331 /* if PCC was set, there's no way out */
332 no_way_out |= !!(m.status & MCI_STATUS_PCC); 332 no_way_out |= !!(m.status & MCI_STATUS_PCC);
333 /* 333 /*
334 * If this error was uncorrectable and there was 334 * If this error was uncorrectable and there was
335 * an overflow, we're in trouble. If no overflow, 335 * an overflow, we're in trouble. If no overflow,
336 * we might get away with just killing a task. 336 * we might get away with just killing a task.
337 */ 337 */
338 if (m.status & MCI_STATUS_UC) { 338 if (m.status & MCI_STATUS_UC) {
339 if (tolerant < 1 || m.status & MCI_STATUS_OVER) 339 if (tolerant < 1 || m.status & MCI_STATUS_OVER)
340 no_way_out = 1; 340 no_way_out = 1;
341 kill_it = 1; 341 kill_it = 1;
342 } 342 }
343 } else { 343 } else {
344 /* 344 /*
345 * Machine check event was not enabled. Clear, but 345 * Machine check event was not enabled. Clear, but
346 * ignore. 346 * ignore.
347 */ 347 */
348 continue; 348 continue;
349 } 349 }
350 350
351 if (m.status & MCI_STATUS_MISCV) 351 if (m.status & MCI_STATUS_MISCV)
352 rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc); 352 rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
353 if (m.status & MCI_STATUS_ADDRV) 353 if (m.status & MCI_STATUS_ADDRV)
354 rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); 354 rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
355 355
356 mce_get_rip(&m, regs); 356 mce_get_rip(&m, regs);
357 mce_log(&m); 357 mce_log(&m);
358 358
359 /* Did this bank cause the exception? */ 359 /* Did this bank cause the exception? */
360 /* Assume that the bank with uncorrectable errors did it, 360 /* Assume that the bank with uncorrectable errors did it,
361 and that there is only a single one. */ 361 and that there is only a single one. */
362 if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) { 362 if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
363 panicm = m; 363 panicm = m;
364 panicm_found = 1; 364 panicm_found = 1;
365 } 365 }
366 } 366 }
367 367
368 /* If we didn't find an uncorrectable error, pick 368 /* If we didn't find an uncorrectable error, pick
369 the last one (shouldn't happen, just being safe). */ 369 the last one (shouldn't happen, just being safe). */
370 if (!panicm_found) 370 if (!panicm_found)
371 panicm = m; 371 panicm = m;
372 372
373 /* 373 /*
374 * If we have decided that we just CAN'T continue, and the user 374 * If we have decided that we just CAN'T continue, and the user
375 * has not set tolerant to an insane level, give up and die. 375 * has not set tolerant to an insane level, give up and die.
376 */ 376 */
377 if (no_way_out && tolerant < 3) 377 if (no_way_out && tolerant < 3)
378 mce_panic("Machine check", &panicm, mcestart); 378 mce_panic("Machine check", &panicm, mcestart);
379 379
380 /* 380 /*
381 * If the error seems to be unrecoverable, something should be 381 * If the error seems to be unrecoverable, something should be
382 * done. Try to kill as little as possible. If we can kill just 382 * done. Try to kill as little as possible. If we can kill just
383 * one task, do that. If the user has set the tolerance very 383 * one task, do that. If the user has set the tolerance very
384 * high, don't try to do anything at all. 384 * high, don't try to do anything at all.
385 */ 385 */
386 if (kill_it && tolerant < 3) { 386 if (kill_it && tolerant < 3) {
387 int user_space = 0; 387 int user_space = 0;
388 388
389 /* 389 /*
390 * If the EIPV bit is set, it means the saved IP is the 390 * If the EIPV bit is set, it means the saved IP is the
391 * instruction which caused the MCE. 391 * instruction which caused the MCE.
392 */ 392 */
393 if (m.mcgstatus & MCG_STATUS_EIPV) 393 if (m.mcgstatus & MCG_STATUS_EIPV)
394 user_space = panicm.ip && (panicm.cs & 3); 394 user_space = panicm.ip && (panicm.cs & 3);
395 395
396 /* 396 /*
397 * If we know that the error was in user space, send a 397 * If we know that the error was in user space, send a
398 * SIGBUS. Otherwise, panic if tolerance is low. 398 * SIGBUS. Otherwise, panic if tolerance is low.
399 * 399 *
400 * force_sig() takes an awful lot of locks and has a slight 400 * force_sig() takes an awful lot of locks and has a slight
401 * risk of deadlocking. 401 * risk of deadlocking.
402 */ 402 */
403 if (user_space) { 403 if (user_space) {
404 force_sig(SIGBUS, current); 404 force_sig(SIGBUS, current);
405 } else if (panic_on_oops || tolerant < 2) { 405 } else if (panic_on_oops || tolerant < 2) {
406 mce_panic("Uncorrected machine check", 406 mce_panic("Uncorrected machine check",
407 &panicm, mcestart); 407 &panicm, mcestart);
408 } 408 }
409 } 409 }
410 410
411 /* notify userspace ASAP */ 411 /* notify userspace ASAP */
412 set_thread_flag(TIF_MCE_NOTIFY); 412 set_thread_flag(TIF_MCE_NOTIFY);
413 413
414 /* the last thing we do is clear state */ 414 /* the last thing we do is clear state */
415 for (i = 0; i < banks; i++) { 415 for (i = 0; i < banks; i++) {
416 if (test_bit(i, toclear)) 416 if (test_bit(i, toclear))
417 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); 417 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
418 } 418 }
419 wrmsrl(MSR_IA32_MCG_STATUS, 0); 419 wrmsrl(MSR_IA32_MCG_STATUS, 0);
420 out2: 420 out2:
421 atomic_dec(&mce_entry); 421 atomic_dec(&mce_entry);
422 } 422 }
423 423
424 #ifdef CONFIG_X86_MCE_INTEL 424 #ifdef CONFIG_X86_MCE_INTEL
425 /*** 425 /***
426 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog 426 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
427 * @cpu: The CPU on which the event occurred. 427 * @cpu: The CPU on which the event occurred.
428 * @status: Event status information 428 * @status: Event status information
429 * 429 *
430 * This function should be called by the thermal interrupt after the 430 * This function should be called by the thermal interrupt after the
431 * event has been processed and the decision was made to log the event 431 * event has been processed and the decision was made to log the event
432 * further. 432 * further.
433 * 433 *
434 * The status parameter will be saved to the 'status' field of 'struct mce' 434 * The status parameter will be saved to the 'status' field of 'struct mce'
435 * and historically has been the register value of the 435 * and historically has been the register value of the
436 * MSR_IA32_THERMAL_STATUS (Intel) msr. 436 * MSR_IA32_THERMAL_STATUS (Intel) msr.
437 */ 437 */
438 void mce_log_therm_throt_event(__u64 status) 438 void mce_log_therm_throt_event(__u64 status)
439 { 439 {
440 struct mce m; 440 struct mce m;
441 441
442 mce_setup(&m); 442 mce_setup(&m);
443 m.bank = MCE_THERMAL_BANK; 443 m.bank = MCE_THERMAL_BANK;
444 m.status = status; 444 m.status = status;
445 mce_log(&m); 445 mce_log(&m);
446 } 446 }
447 #endif /* CONFIG_X86_MCE_INTEL */ 447 #endif /* CONFIG_X86_MCE_INTEL */
448 448
449 /* 449 /*
450 * Periodic polling timer for "silent" machine check errors. If the 450 * Periodic polling timer for "silent" machine check errors. If the
451 * poller finds an MCE, poll 2x faster. When the poller finds no more 451 * poller finds an MCE, poll 2x faster. When the poller finds no more
452 * errors, poll 2x slower (up to check_interval seconds). 452 * errors, poll 2x slower (up to check_interval seconds).
453 */ 453 */
454 454
455 static int check_interval = 5 * 60; /* 5 minutes */ 455 static int check_interval = 5 * 60; /* 5 minutes */
456 static DEFINE_PER_CPU(int, next_interval); /* in jiffies */ 456 static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
457 static void mcheck_timer(unsigned long); 457 static void mcheck_timer(unsigned long);
458 static DEFINE_PER_CPU(struct timer_list, mce_timer); 458 static DEFINE_PER_CPU(struct timer_list, mce_timer);
459 459
460 static void mcheck_timer(unsigned long data) 460 static void mcheck_timer(unsigned long data)
461 { 461 {
462 struct timer_list *t = &per_cpu(mce_timer, data); 462 struct timer_list *t = &per_cpu(mce_timer, data);
463 int *n; 463 int *n;
464 464
465 WARN_ON(smp_processor_id() != data); 465 WARN_ON(smp_processor_id() != data);
466 466
467 if (mce_available(&current_cpu_data)) 467 if (mce_available(&current_cpu_data))
468 machine_check_poll(MCP_TIMESTAMP, 468 machine_check_poll(MCP_TIMESTAMP,
469 &__get_cpu_var(mce_poll_banks)); 469 &__get_cpu_var(mce_poll_banks));
470 470
471 /* 471 /*
472 * Alert userspace if needed. If we logged an MCE, reduce the 472 * Alert userspace if needed. If we logged an MCE, reduce the
473 * polling interval, otherwise increase the polling interval. 473 * polling interval, otherwise increase the polling interval.
474 */ 474 */
475 n = &__get_cpu_var(next_interval); 475 n = &__get_cpu_var(next_interval);
476 if (mce_notify_user()) { 476 if (mce_notify_user()) {
477 *n = max(*n/2, HZ/100); 477 *n = max(*n/2, HZ/100);
478 } else { 478 } else {
479 *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ)); 479 *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
480 } 480 }
481 481
482 t->expires = jiffies + *n; 482 t->expires = jiffies + *n;
483 add_timer(t); 483 add_timer(t);
484 } 484 }
485 485
486 static void mce_do_trigger(struct work_struct *work) 486 static void mce_do_trigger(struct work_struct *work)
487 { 487 {
488 call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT); 488 call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
489 } 489 }
490 490
491 static DECLARE_WORK(mce_trigger_work, mce_do_trigger); 491 static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
492 492
493 /* 493 /*
494 * Notify the user(s) about new machine check events. 494 * Notify the user(s) about new machine check events.
495 * Can be called from interrupt context, but not from machine check/NMI 495 * Can be called from interrupt context, but not from machine check/NMI
496 * context. 496 * context.
497 */ 497 */
498 int mce_notify_user(void) 498 int mce_notify_user(void)
499 { 499 {
500 /* Not more than two messages every minute */ 500 /* Not more than two messages every minute */
501 static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); 501 static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
502 502
503 clear_thread_flag(TIF_MCE_NOTIFY); 503 clear_thread_flag(TIF_MCE_NOTIFY);
504 if (test_and_clear_bit(0, &notify_user)) { 504 if (test_and_clear_bit(0, &notify_user)) {
505 wake_up_interruptible(&mce_wait); 505 wake_up_interruptible(&mce_wait);
506 506
507 /* 507 /*
508 * There is no risk of missing notifications because 508 * There is no risk of missing notifications because
509 * work_pending is always cleared before the function is 509 * work_pending is always cleared before the function is
510 * executed. 510 * executed.
511 */ 511 */
512 if (trigger[0] && !work_pending(&mce_trigger_work)) 512 if (trigger[0] && !work_pending(&mce_trigger_work))
513 schedule_work(&mce_trigger_work); 513 schedule_work(&mce_trigger_work);
514 514
515 if (__ratelimit(&ratelimit)) 515 if (__ratelimit(&ratelimit))
516 printk(KERN_INFO "Machine check events logged\n"); 516 printk(KERN_INFO "Machine check events logged\n");
517 517
518 return 1; 518 return 1;
519 } 519 }
520 return 0; 520 return 0;
521 } 521 }
522 522
523 /* see if the idle task needs to notify userspace */ 523 /* see if the idle task needs to notify userspace */
524 static int 524 static int
525 mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk) 525 mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk)
526 { 526 {
527 /* IDLE_END should be safe - interrupts are back on */ 527 /* IDLE_END should be safe - interrupts are back on */
528 if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY)) 528 if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY))
529 mce_notify_user(); 529 mce_notify_user();
530 530
531 return NOTIFY_OK; 531 return NOTIFY_OK;
532 } 532 }
533 533
534 static struct notifier_block mce_idle_notifier = { 534 static struct notifier_block mce_idle_notifier = {
535 .notifier_call = mce_idle_callback, 535 .notifier_call = mce_idle_callback,
536 }; 536 };
537 537
538 static __init int periodic_mcheck_init(void) 538 static __init int periodic_mcheck_init(void)
539 { 539 {
540 idle_notifier_register(&mce_idle_notifier); 540 idle_notifier_register(&mce_idle_notifier);
541 return 0; 541 return 0;
542 } 542 }
543 __initcall(periodic_mcheck_init); 543 __initcall(periodic_mcheck_init);
544 544
545 /* 545 /*
546 * Initialize Machine Checks for a CPU. 546 * Initialize Machine Checks for a CPU.
547 */ 547 */
548 static int mce_cap_init(void) 548 static int mce_cap_init(void)
549 { 549 {
550 u64 cap; 550 u64 cap;
551 unsigned b; 551 unsigned b;
552 552
553 rdmsrl(MSR_IA32_MCG_CAP, cap); 553 rdmsrl(MSR_IA32_MCG_CAP, cap);
554 b = cap & 0xff; 554 b = cap & 0xff;
555 if (b > MAX_NR_BANKS) { 555 if (b > MAX_NR_BANKS) {
556 printk(KERN_WARNING 556 printk(KERN_WARNING
557 "MCE: Using only %u machine check banks out of %u\n", 557 "MCE: Using only %u machine check banks out of %u\n",
558 MAX_NR_BANKS, b); 558 MAX_NR_BANKS, b);
559 b = MAX_NR_BANKS; 559 b = MAX_NR_BANKS;
560 } 560 }
561 561
562 /* Don't support asymmetric configurations today */ 562 /* Don't support asymmetric configurations today */
563 WARN_ON(banks != 0 && b != banks); 563 WARN_ON(banks != 0 && b != banks);
564 banks = b; 564 banks = b;
565 if (!bank) { 565 if (!bank) {
566 bank = kmalloc(banks * sizeof(u64), GFP_KERNEL); 566 bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
567 if (!bank) 567 if (!bank)
568 return -ENOMEM; 568 return -ENOMEM;
569 memset(bank, 0xff, banks * sizeof(u64)); 569 memset(bank, 0xff, banks * sizeof(u64));
570 } 570 }
571 571
572 /* Use accurate RIP reporting if available. */ 572 /* Use accurate RIP reporting if available. */
573 if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9) 573 if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
574 rip_msr = MSR_IA32_MCG_EIP; 574 rip_msr = MSR_IA32_MCG_EIP;
575 575
576 return 0; 576 return 0;
577 } 577 }
578 578
579 static void mce_init(void *dummy) 579 static void mce_init(void *dummy)
580 { 580 {
581 u64 cap; 581 u64 cap;
582 int i; 582 int i;
583 mce_banks_t all_banks; 583 mce_banks_t all_banks;
584 584
585 /* 585 /*
586 * Log the machine checks left over from the previous reset. 586 * Log the machine checks left over from the previous reset.
587 */ 587 */
588 bitmap_fill(all_banks, MAX_NR_BANKS); 588 bitmap_fill(all_banks, MAX_NR_BANKS);
589 machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks); 589 machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);
590 590
591 set_in_cr4(X86_CR4_MCE); 591 set_in_cr4(X86_CR4_MCE);
592 592
593 rdmsrl(MSR_IA32_MCG_CAP, cap); 593 rdmsrl(MSR_IA32_MCG_CAP, cap);
594 if (cap & MCG_CTL_P) 594 if (cap & MCG_CTL_P)
595 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); 595 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
596 596
597 for (i = 0; i < banks; i++) { 597 for (i = 0; i < banks; i++) {
598 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); 598 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
599 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); 599 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
600 } 600 }
601 } 601 }
602 602
603 /* Add per CPU specific workarounds here */ 603 /* Add per CPU specific workarounds here */
604 static void mce_cpu_quirks(struct cpuinfo_x86 *c) 604 static void mce_cpu_quirks(struct cpuinfo_x86 *c)
605 { 605 {
606 /* This should be disabled by the BIOS, but isn't always */ 606 /* This should be disabled by the BIOS, but isn't always */
607 if (c->x86_vendor == X86_VENDOR_AMD) { 607 if (c->x86_vendor == X86_VENDOR_AMD) {
608 if (c->x86 == 15 && banks > 4) 608 if (c->x86 == 15 && banks > 4)
609 /* disable GART TBL walk error reporting, which trips off 609 /* disable GART TBL walk error reporting, which trips off
610 incorrectly with the IOMMU & 3ware & Cerberus. */ 610 incorrectly with the IOMMU & 3ware & Cerberus. */
611 clear_bit(10, (unsigned long *)&bank[4]); 611 clear_bit(10, (unsigned long *)&bank[4]);
612 if(c->x86 <= 17 && mce_bootlog < 0) 612 if(c->x86 <= 17 && mce_bootlog < 0)
613 /* Lots of broken BIOS around that don't clear them 613 /* Lots of broken BIOS around that don't clear them
614 by default and leave crap in there. Don't log. */ 614 by default and leave crap in there. Don't log. */
615 mce_bootlog = 0; 615 mce_bootlog = 0;
616 } 616 }
617 617
618 } 618 }
619 619
620 static void mce_cpu_features(struct cpuinfo_x86 *c) 620 static void mce_cpu_features(struct cpuinfo_x86 *c)
621 { 621 {
622 switch (c->x86_vendor) { 622 switch (c->x86_vendor) {
623 case X86_VENDOR_INTEL: 623 case X86_VENDOR_INTEL:
624 mce_intel_feature_init(c); 624 mce_intel_feature_init(c);
625 break; 625 break;
626 case X86_VENDOR_AMD: 626 case X86_VENDOR_AMD:
627 mce_amd_feature_init(c); 627 mce_amd_feature_init(c);
628 break; 628 break;
629 default: 629 default:
630 break; 630 break;
631 } 631 }
632 } 632 }
633 633
634 static void mce_init_timer(void) 634 static void mce_init_timer(void)
635 { 635 {
636 struct timer_list *t = &__get_cpu_var(mce_timer); 636 struct timer_list *t = &__get_cpu_var(mce_timer);
637 int *n = &__get_cpu_var(next_interval); 637 int *n = &__get_cpu_var(next_interval);
638 638
639 *n = check_interval * HZ; 639 *n = check_interval * HZ;
640 if (!*n) 640 if (!*n)
641 return; 641 return;
642 setup_timer(t, mcheck_timer, smp_processor_id()); 642 setup_timer(t, mcheck_timer, smp_processor_id());
643 t->expires = round_jiffies(jiffies + *n); 643 t->expires = round_jiffies(jiffies + *n);
644 add_timer(t); 644 add_timer(t);
645 } 645 }
646 646
647 /* 647 /*
648 * Called for each booted CPU to set up machine checks. 648 * Called for each booted CPU to set up machine checks.
649 * Must be called with preempt off. 649 * Must be called with preempt off.
650 */ 650 */
651 void __cpuinit mcheck_init(struct cpuinfo_x86 *c) 651 void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
652 { 652 {
653 if (!mce_available(c)) 653 if (!mce_available(c))
654 return; 654 return;
655 655
656 if (mce_cap_init() < 0) { 656 if (mce_cap_init() < 0) {
657 mce_dont_init = 1; 657 mce_dont_init = 1;
658 return; 658 return;
659 } 659 }
660 mce_cpu_quirks(c); 660 mce_cpu_quirks(c);
661 661
662 mce_init(NULL); 662 mce_init(NULL);
663 mce_cpu_features(c); 663 mce_cpu_features(c);
664 mce_init_timer(); 664 mce_init_timer();
665 } 665 }
666 666
667 /* 667 /*
668 * Character device to read and clear the MCE log. 668 * Character device to read and clear the MCE log.
669 */ 669 */
670 670
671 static DEFINE_SPINLOCK(mce_state_lock); 671 static DEFINE_SPINLOCK(mce_state_lock);
672 static int open_count; /* #times opened */ 672 static int open_count; /* #times opened */
673 static int open_exclu; /* already open exclusive? */ 673 static int open_exclu; /* already open exclusive? */
674 674
675 static int mce_open(struct inode *inode, struct file *file) 675 static int mce_open(struct inode *inode, struct file *file)
676 { 676 {
677 lock_kernel(); 677 lock_kernel();
678 spin_lock(&mce_state_lock); 678 spin_lock(&mce_state_lock);
679 679
680 if (open_exclu || (open_count && (file->f_flags & O_EXCL))) { 680 if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
681 spin_unlock(&mce_state_lock); 681 spin_unlock(&mce_state_lock);
682 unlock_kernel(); 682 unlock_kernel();
683 return -EBUSY; 683 return -EBUSY;
684 } 684 }
685 685
686 if (file->f_flags & O_EXCL) 686 if (file->f_flags & O_EXCL)
687 open_exclu = 1; 687 open_exclu = 1;
688 open_count++; 688 open_count++;
689 689
690 spin_unlock(&mce_state_lock); 690 spin_unlock(&mce_state_lock);
691 unlock_kernel(); 691 unlock_kernel();
692 692
693 return nonseekable_open(inode, file); 693 return nonseekable_open(inode, file);
694 } 694 }
695 695
696 static int mce_release(struct inode *inode, struct file *file) 696 static int mce_release(struct inode *inode, struct file *file)
697 { 697 {
698 spin_lock(&mce_state_lock); 698 spin_lock(&mce_state_lock);
699 699
700 open_count--; 700 open_count--;
701 open_exclu = 0; 701 open_exclu = 0;
702 702
703 spin_unlock(&mce_state_lock); 703 spin_unlock(&mce_state_lock);
704 704
705 return 0; 705 return 0;
706 } 706 }
707 707
708 static void collect_tscs(void *data) 708 static void collect_tscs(void *data)
709 { 709 {
710 unsigned long *cpu_tsc = (unsigned long *)data; 710 unsigned long *cpu_tsc = (unsigned long *)data;
711 711
712 rdtscll(cpu_tsc[smp_processor_id()]); 712 rdtscll(cpu_tsc[smp_processor_id()]);
713 } 713 }
714 714
715 static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, 715 static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
716 loff_t *off) 716 loff_t *off)
717 { 717 {
718 unsigned long *cpu_tsc; 718 unsigned long *cpu_tsc;
719 static DEFINE_MUTEX(mce_read_mutex); 719 static DEFINE_MUTEX(mce_read_mutex);
720 unsigned prev, next; 720 unsigned prev, next;
721 char __user *buf = ubuf; 721 char __user *buf = ubuf;
722 int i, err; 722 int i, err;
723 723
724 cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL); 724 cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
725 if (!cpu_tsc) 725 if (!cpu_tsc)
726 return -ENOMEM; 726 return -ENOMEM;
727 727
728 mutex_lock(&mce_read_mutex); 728 mutex_lock(&mce_read_mutex);
729 next = rcu_dereference(mcelog.next); 729 next = rcu_dereference(mcelog.next);
730 730
731 /* Only supports full reads right now */ 731 /* Only supports full reads right now */
732 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { 732 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
733 mutex_unlock(&mce_read_mutex); 733 mutex_unlock(&mce_read_mutex);
734 kfree(cpu_tsc); 734 kfree(cpu_tsc);
735 return -EINVAL; 735 return -EINVAL;
736 } 736 }
737 737
738 err = 0; 738 err = 0;
739 prev = 0; 739 prev = 0;
740 do { 740 do {
741 for (i = prev; i < next; i++) { 741 for (i = prev; i < next; i++) {
742 unsigned long start = jiffies; 742 unsigned long start = jiffies;
743 743
744 while (!mcelog.entry[i].finished) { 744 while (!mcelog.entry[i].finished) {
745 if (time_after_eq(jiffies, start + 2)) { 745 if (time_after_eq(jiffies, start + 2)) {
746 memset(mcelog.entry + i, 0, 746 memset(mcelog.entry + i, 0,
747 sizeof(struct mce)); 747 sizeof(struct mce));
748 goto timeout; 748 goto timeout;
749 } 749 }
750 cpu_relax(); 750 cpu_relax();
751 } 751 }
752 smp_rmb(); 752 smp_rmb();
753 err |= copy_to_user(buf, mcelog.entry + i, 753 err |= copy_to_user(buf, mcelog.entry + i,
754 sizeof(struct mce)); 754 sizeof(struct mce));
755 buf += sizeof(struct mce); 755 buf += sizeof(struct mce);
756 timeout: 756 timeout:
757 ; 757 ;
758 } 758 }
759 759
760 memset(mcelog.entry + prev, 0, 760 memset(mcelog.entry + prev, 0,
761 (next - prev) * sizeof(struct mce)); 761 (next - prev) * sizeof(struct mce));
762 prev = next; 762 prev = next;
763 next = cmpxchg(&mcelog.next, prev, 0); 763 next = cmpxchg(&mcelog.next, prev, 0);
764 } while (next != prev); 764 } while (next != prev);
765 765
766 synchronize_sched(); 766 synchronize_sched();
767 767
768 /* 768 /*
769 * Collect entries that were still getting written before the 769 * Collect entries that were still getting written before the
770 * synchronize. 770 * synchronize.
771 */ 771 */
772 on_each_cpu(collect_tscs, cpu_tsc, 1); 772 on_each_cpu(collect_tscs, cpu_tsc, 1);
773 for (i = next; i < MCE_LOG_LEN; i++) { 773 for (i = next; i < MCE_LOG_LEN; i++) {
774 if (mcelog.entry[i].finished && 774 if (mcelog.entry[i].finished &&
775 mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) { 775 mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
776 err |= copy_to_user(buf, mcelog.entry+i, 776 err |= copy_to_user(buf, mcelog.entry+i,
777 sizeof(struct mce)); 777 sizeof(struct mce));
778 smp_rmb(); 778 smp_rmb();
779 buf += sizeof(struct mce); 779 buf += sizeof(struct mce);
780 memset(&mcelog.entry[i], 0, sizeof(struct mce)); 780 memset(&mcelog.entry[i], 0, sizeof(struct mce));
781 } 781 }
782 } 782 }
783 mutex_unlock(&mce_read_mutex); 783 mutex_unlock(&mce_read_mutex);
784 kfree(cpu_tsc); 784 kfree(cpu_tsc);
785 return err ? -EFAULT : buf - ubuf; 785 return err ? -EFAULT : buf - ubuf;
786 } 786 }
787 787
788 static unsigned int mce_poll(struct file *file, poll_table *wait) 788 static unsigned int mce_poll(struct file *file, poll_table *wait)
789 { 789 {
790 poll_wait(file, &mce_wait, wait); 790 poll_wait(file, &mce_wait, wait);
791 if (rcu_dereference(mcelog.next)) 791 if (rcu_dereference(mcelog.next))
792 return POLLIN | POLLRDNORM; 792 return POLLIN | POLLRDNORM;
793 return 0; 793 return 0;
794 } 794 }
795 795
796 static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg) 796 static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
797 { 797 {
798 int __user *p = (int __user *)arg; 798 int __user *p = (int __user *)arg;
799 799
800 if (!capable(CAP_SYS_ADMIN)) 800 if (!capable(CAP_SYS_ADMIN))
801 return -EPERM; 801 return -EPERM;
802 switch (cmd) { 802 switch (cmd) {
803 case MCE_GET_RECORD_LEN: 803 case MCE_GET_RECORD_LEN:
804 return put_user(sizeof(struct mce), p); 804 return put_user(sizeof(struct mce), p);
805 case MCE_GET_LOG_LEN: 805 case MCE_GET_LOG_LEN:
806 return put_user(MCE_LOG_LEN, p); 806 return put_user(MCE_LOG_LEN, p);
807 case MCE_GETCLEAR_FLAGS: { 807 case MCE_GETCLEAR_FLAGS: {
808 unsigned flags; 808 unsigned flags;
809 809
810 do { 810 do {
811 flags = mcelog.flags; 811 flags = mcelog.flags;
812 } while (cmpxchg(&mcelog.flags, flags, 0) != flags); 812 } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
813 return put_user(flags, p); 813 return put_user(flags, p);
814 } 814 }
815 default: 815 default:
816 return -ENOTTY; 816 return -ENOTTY;
817 } 817 }
818 } 818 }
819 819
820 static const struct file_operations mce_chrdev_ops = { 820 static const struct file_operations mce_chrdev_ops = {
821 .open = mce_open, 821 .open = mce_open,
822 .release = mce_release, 822 .release = mce_release,
823 .read = mce_read, 823 .read = mce_read,
824 .poll = mce_poll, 824 .poll = mce_poll,
825 .unlocked_ioctl = mce_ioctl, 825 .unlocked_ioctl = mce_ioctl,
826 }; 826 };
827 827
828 static struct miscdevice mce_log_device = { 828 static struct miscdevice mce_log_device = {
829 MISC_MCELOG_MINOR, 829 MISC_MCELOG_MINOR,
830 "mcelog", 830 "mcelog",
831 &mce_chrdev_ops, 831 &mce_chrdev_ops,
832 }; 832 };
833 833
834 /* 834 /*
835 * Old style boot options parsing. Only for compatibility. 835 * Old style boot options parsing. Only for compatibility.
836 */ 836 */
837 static int __init mcheck_disable(char *str) 837 static int __init mcheck_disable(char *str)
838 { 838 {
839 mce_dont_init = 1; 839 mce_dont_init = 1;
840 return 1; 840 return 1;
841 } 841 }
842 842
843 /* mce=off disables machine check. 843 /* mce=off disables machine check.
844 mce=TOLERANCELEVEL (number, see above) 844 mce=TOLERANCELEVEL (number, see above)
845 mce=bootlog Log MCEs from before booting. Disabled by default on AMD. 845 mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
846 mce=nobootlog Don't log MCEs from before booting. */ 846 mce=nobootlog Don't log MCEs from before booting. */
847 static int __init mcheck_enable(char *str) 847 static int __init mcheck_enable(char *str)
848 { 848 {
849 if (!strcmp(str, "off")) 849 if (!strcmp(str, "off"))
850 mce_dont_init = 1; 850 mce_dont_init = 1;
851 else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog")) 851 else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog"))
852 mce_bootlog = str[0] == 'b'; 852 mce_bootlog = str[0] == 'b';
853 else if (isdigit(str[0])) 853 else if (isdigit(str[0]))
854 get_option(&str, &tolerant); 854 get_option(&str, &tolerant);
855 else 855 else
856 printk("mce= argument %s ignored. Please use /sys", str); 856 printk("mce= argument %s ignored. Please use /sys", str);
857 return 1; 857 return 1;
858 } 858 }
859 859
860 __setup("nomce", mcheck_disable); 860 __setup("nomce", mcheck_disable);
861 __setup("mce=", mcheck_enable); 861 __setup("mce=", mcheck_enable);
862 862
863 /* 863 /*
864 * Sysfs support 864 * Sysfs support
865 */ 865 */
866 866
867 /* 867 /*
868 * Disable machine checks on suspend and shutdown. We can't really handle 868 * Disable machine checks on suspend and shutdown. We can't really handle
869 * them later. 869 * them later.
870 */ 870 */
871 static int mce_disable(void) 871 static int mce_disable(void)
872 { 872 {
873 int i; 873 int i;
874 874
875 for (i = 0; i < banks; i++) 875 for (i = 0; i < banks; i++)
876 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); 876 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
877 return 0; 877 return 0;
878 } 878 }
879 879
880 static int mce_suspend(struct sys_device *dev, pm_message_t state) 880 static int mce_suspend(struct sys_device *dev, pm_message_t state)
881 { 881 {
882 return mce_disable(); 882 return mce_disable();
883 } 883 }
884 884
885 static int mce_shutdown(struct sys_device *dev) 885 static int mce_shutdown(struct sys_device *dev)
886 { 886 {
887 return mce_disable(); 887 return mce_disable();
888 } 888 }
889 889
890 /* On resume clear all MCE state. Don't want to see leftovers from the BIOS. 890 /* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
891 Only one CPU is active at this time, the others get readded later using 891 Only one CPU is active at this time, the others get readded later using
892 CPU hotplug. */ 892 CPU hotplug. */
893 static int mce_resume(struct sys_device *dev) 893 static int mce_resume(struct sys_device *dev)
894 { 894 {
895 mce_init(NULL); 895 mce_init(NULL);
896 mce_cpu_features(&current_cpu_data); 896 mce_cpu_features(&current_cpu_data);
897 return 0; 897 return 0;
898 } 898 }
899 899
900 static void mce_cpu_restart(void *data) 900 static void mce_cpu_restart(void *data)
901 { 901 {
902 del_timer_sync(&__get_cpu_var(mce_timer)); 902 del_timer_sync(&__get_cpu_var(mce_timer));
903 if (mce_available(&current_cpu_data)) 903 if (mce_available(&current_cpu_data))
904 mce_init(NULL); 904 mce_init(NULL);
905 mce_init_timer(); 905 mce_init_timer();
906 } 906 }
907 907
908 /* Reinit MCEs after user configuration changes */ 908 /* Reinit MCEs after user configuration changes */
909 static void mce_restart(void) 909 static void mce_restart(void)
910 { 910 {
911 on_each_cpu(mce_cpu_restart, NULL, 1); 911 on_each_cpu(mce_cpu_restart, NULL, 1);
912 } 912 }
913 913
914 static struct sysdev_class mce_sysclass = { 914 static struct sysdev_class mce_sysclass = {
915 .suspend = mce_suspend, 915 .suspend = mce_suspend,
916 .shutdown = mce_shutdown, 916 .shutdown = mce_shutdown,
917 .resume = mce_resume, 917 .resume = mce_resume,
918 .name = "machinecheck", 918 .name = "machinecheck",
919 }; 919 };
920 920
921 DEFINE_PER_CPU(struct sys_device, device_mce); 921 DEFINE_PER_CPU(struct sys_device, device_mce);
922 void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinitdata; 922 void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinitdata;
923 923
924 /* Why are there no generic functions for this? */ 924 /* Why are there no generic functions for this? */
925 #define ACCESSOR(name, var, start) \ 925 #define ACCESSOR(name, var, start) \
926 static ssize_t show_ ## name(struct sys_device *s, \ 926 static ssize_t show_ ## name(struct sys_device *s, \
927 struct sysdev_attribute *attr, \ 927 struct sysdev_attribute *attr, \
928 char *buf) { \ 928 char *buf) { \
929 return sprintf(buf, "%lx\n", (unsigned long)var); \ 929 return sprintf(buf, "%lx\n", (unsigned long)var); \
930 } \ 930 } \
931 static ssize_t set_ ## name(struct sys_device *s, \ 931 static ssize_t set_ ## name(struct sys_device *s, \
932 struct sysdev_attribute *attr, \ 932 struct sysdev_attribute *attr, \
933 const char *buf, size_t siz) { \ 933 const char *buf, size_t siz) { \
934 char *end; \ 934 char *end; \
935 unsigned long new = simple_strtoul(buf, &end, 0); \ 935 unsigned long new = simple_strtoul(buf, &end, 0); \
936 if (end == buf) return -EINVAL; \ 936 if (end == buf) return -EINVAL; \
937 var = new; \ 937 var = new; \
938 start; \ 938 start; \
939 return end-buf; \ 939 return end-buf; \
940 } \ 940 } \
941 static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); 941 static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
942 942
943 static struct sysdev_attribute *bank_attrs; 943 static struct sysdev_attribute *bank_attrs;
944 944
945 static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, 945 static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
946 char *buf) 946 char *buf)
947 { 947 {
948 u64 b = bank[attr - bank_attrs]; 948 u64 b = bank[attr - bank_attrs];
949 return sprintf(buf, "%llx\n", b); 949 return sprintf(buf, "%llx\n", b);
950 } 950 }
951 951
952 static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, 952 static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
953 const char *buf, size_t siz) 953 const char *buf, size_t siz)
954 { 954 {
955 char *end; 955 char *end;
956 u64 new = simple_strtoull(buf, &end, 0); 956 u64 new = simple_strtoull(buf, &end, 0);
957 if (end == buf) 957 if (end == buf)
958 return -EINVAL; 958 return -EINVAL;
959 bank[attr - bank_attrs] = new; 959 bank[attr - bank_attrs] = new;
960 mce_restart(); 960 mce_restart();
961 return end-buf; 961 return end-buf;
962 } 962 }
963 963
964 static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr, 964 static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr,
965 char *buf) 965 char *buf)
966 { 966 {
967 strcpy(buf, trigger); 967 strcpy(buf, trigger);
968 strcat(buf, "\n"); 968 strcat(buf, "\n");
969 return strlen(trigger) + 1; 969 return strlen(trigger) + 1;
970 } 970 }
971 971
972 static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, 972 static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
973 const char *buf,size_t siz) 973 const char *buf,size_t siz)
974 { 974 {
975 char *p; 975 char *p;
976 int len; 976 int len;
977 strncpy(trigger, buf, sizeof(trigger)); 977 strncpy(trigger, buf, sizeof(trigger));
978 trigger[sizeof(trigger)-1] = 0; 978 trigger[sizeof(trigger)-1] = 0;
979 len = strlen(trigger); 979 len = strlen(trigger);
980 p = strchr(trigger, '\n'); 980 p = strchr(trigger, '\n');
981 if (*p) *p = 0; 981 if (*p) *p = 0;
982 return len; 982 return len;
983 } 983 }
984 984
985 static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); 985 static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
986 static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); 986 static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
987 ACCESSOR(check_interval,check_interval,mce_restart()) 987 ACCESSOR(check_interval,check_interval,mce_restart())
988 static struct sysdev_attribute *mce_attributes[] = { 988 static struct sysdev_attribute *mce_attributes[] = {
989 &attr_tolerant.attr, &attr_check_interval, &attr_trigger, 989 &attr_tolerant.attr, &attr_check_interval, &attr_trigger,
990 NULL 990 NULL
991 }; 991 };
992 992
993 static cpumask_var_t mce_device_initialized; 993 static cpumask_var_t mce_device_initialized;
994 994
995 /* Per cpu sysdev init. All of the cpus still share the same ctl bank */ 995 /* Per cpu sysdev init. All of the cpus still share the same ctl bank */
996 static __cpuinit int mce_create_device(unsigned int cpu) 996 static __cpuinit int mce_create_device(unsigned int cpu)
997 { 997 {
998 int err; 998 int err;
999 int i; 999 int i;
1000 1000
1001 if (!mce_available(&boot_cpu_data)) 1001 if (!mce_available(&boot_cpu_data))
1002 return -EIO; 1002 return -EIO;
1003 1003
1004 memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject)); 1004 memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject));
1005 per_cpu(device_mce,cpu).id = cpu; 1005 per_cpu(device_mce,cpu).id = cpu;
1006 per_cpu(device_mce,cpu).cls = &mce_sysclass; 1006 per_cpu(device_mce,cpu).cls = &mce_sysclass;
1007 1007
1008 err = sysdev_register(&per_cpu(device_mce,cpu)); 1008 err = sysdev_register(&per_cpu(device_mce,cpu));
1009 if (err) 1009 if (err)
1010 return err; 1010 return err;
1011 1011
1012 for (i = 0; mce_attributes[i]; i++) { 1012 for (i = 0; mce_attributes[i]; i++) {
1013 err = sysdev_create_file(&per_cpu(device_mce,cpu), 1013 err = sysdev_create_file(&per_cpu(device_mce,cpu),
1014 mce_attributes[i]); 1014 mce_attributes[i]);
1015 if (err) 1015 if (err)
1016 goto error; 1016 goto error;
1017 } 1017 }
1018 for (i = 0; i < banks; i++) { 1018 for (i = 0; i < banks; i++) {
1019 err = sysdev_create_file(&per_cpu(device_mce, cpu), 1019 err = sysdev_create_file(&per_cpu(device_mce, cpu),
1020 &bank_attrs[i]); 1020 &bank_attrs[i]);
1021 if (err) 1021 if (err)
1022 goto error2; 1022 goto error2;
1023 } 1023 }
1024 cpumask_set_cpu(cpu, mce_device_initialized); 1024 cpumask_set_cpu(cpu, mce_device_initialized);
1025 1025
1026 return 0; 1026 return 0;
1027 error2: 1027 error2:
1028 while (--i >= 0) { 1028 while (--i >= 0) {
1029 sysdev_remove_file(&per_cpu(device_mce, cpu), 1029 sysdev_remove_file(&per_cpu(device_mce, cpu),
1030 &bank_attrs[i]); 1030 &bank_attrs[i]);
1031 } 1031 }
1032 error: 1032 error:
1033 while (--i >= 0) { 1033 while (--i >= 0) {
1034 sysdev_remove_file(&per_cpu(device_mce,cpu), 1034 sysdev_remove_file(&per_cpu(device_mce,cpu),
1035 mce_attributes[i]); 1035 mce_attributes[i]);
1036 } 1036 }
1037 sysdev_unregister(&per_cpu(device_mce,cpu)); 1037 sysdev_unregister(&per_cpu(device_mce,cpu));
1038 1038
1039 return err; 1039 return err;
1040 } 1040 }
1041 1041
1042 static __cpuinit void mce_remove_device(unsigned int cpu) 1042 static __cpuinit void mce_remove_device(unsigned int cpu)
1043 { 1043 {
1044 int i; 1044 int i;
1045 1045
1046 if (!cpumask_test_cpu(cpu, mce_device_initialized)) 1046 if (!cpumask_test_cpu(cpu, mce_device_initialized))
1047 return; 1047 return;
1048 1048
1049 for (i = 0; mce_attributes[i]; i++) 1049 for (i = 0; mce_attributes[i]; i++)
1050 sysdev_remove_file(&per_cpu(device_mce,cpu), 1050 sysdev_remove_file(&per_cpu(device_mce,cpu),
1051 mce_attributes[i]); 1051 mce_attributes[i]);
1052 for (i = 0; i < banks; i++) 1052 for (i = 0; i < banks; i++)
1053 sysdev_remove_file(&per_cpu(device_mce, cpu), 1053 sysdev_remove_file(&per_cpu(device_mce, cpu),
1054 &bank_attrs[i]); 1054 &bank_attrs[i]);
1055 sysdev_unregister(&per_cpu(device_mce,cpu)); 1055 sysdev_unregister(&per_cpu(device_mce,cpu));
1056 cpumask_clear_cpu(cpu, mce_device_initialized); 1056 cpumask_clear_cpu(cpu, mce_device_initialized);
1057 } 1057 }
1058 1058
1059 /* Make sure there are no machine checks on offlined CPUs. */ 1059 /* Make sure there are no machine checks on offlined CPUs. */
1060 static void mce_disable_cpu(void *h) 1060 static void mce_disable_cpu(void *h)
1061 { 1061 {
1062 int i; 1062 int i;
1063 unsigned long action = *(unsigned long *)h; 1063 unsigned long action = *(unsigned long *)h;
1064 1064
1065 if (!mce_available(&current_cpu_data)) 1065 if (!mce_available(&current_cpu_data))
1066 return; 1066 return;
1067 if (!(action & CPU_TASKS_FROZEN)) 1067 if (!(action & CPU_TASKS_FROZEN))
1068 cmci_clear(); 1068 cmci_clear();
1069 for (i = 0; i < banks; i++) 1069 for (i = 0; i < banks; i++)
1070 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); 1070 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
1071 } 1071 }
1072 1072
1073 static void mce_reenable_cpu(void *h) 1073 static void mce_reenable_cpu(void *h)
1074 { 1074 {
1075 int i; 1075 int i;
1076 unsigned long action = *(unsigned long *)h; 1076 unsigned long action = *(unsigned long *)h;
1077 1077
1078 if (!mce_available(&current_cpu_data)) 1078 if (!mce_available(&current_cpu_data))
1079 return; 1079 return;
1080 if (!(action & CPU_TASKS_FROZEN)) 1080 if (!(action & CPU_TASKS_FROZEN))
1081 cmci_reenable(); 1081 cmci_reenable();
1082 for (i = 0; i < banks; i++) 1082 for (i = 0; i < banks; i++)
1083 wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]); 1083 wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
1084 } 1084 }
1085 1085
1086 /* Get notified when a cpu comes on/off. Be hotplug friendly. */ 1086 /* Get notified when a cpu comes on/off. Be hotplug friendly. */
1087 static int __cpuinit mce_cpu_callback(struct notifier_block *nfb, 1087 static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
1088 unsigned long action, void *hcpu) 1088 unsigned long action, void *hcpu)
1089 { 1089 {
1090 unsigned int cpu = (unsigned long)hcpu; 1090 unsigned int cpu = (unsigned long)hcpu;
1091 struct timer_list *t = &per_cpu(mce_timer, cpu); 1091 struct timer_list *t = &per_cpu(mce_timer, cpu);
1092 1092
1093 switch (action) { 1093 switch (action) {
1094 case CPU_ONLINE: 1094 case CPU_ONLINE:
1095 case CPU_ONLINE_FROZEN: 1095 case CPU_ONLINE_FROZEN:
1096 mce_create_device(cpu); 1096 mce_create_device(cpu);
1097 if (threshold_cpu_callback) 1097 if (threshold_cpu_callback)
1098 threshold_cpu_callback(action, cpu); 1098 threshold_cpu_callback(action, cpu);
1099 break; 1099 break;
1100 case CPU_DEAD: 1100 case CPU_DEAD:
1101 case CPU_DEAD_FROZEN: 1101 case CPU_DEAD_FROZEN:
1102 if (threshold_cpu_callback) 1102 if (threshold_cpu_callback)
1103 threshold_cpu_callback(action, cpu); 1103 threshold_cpu_callback(action, cpu);
1104 mce_remove_device(cpu); 1104 mce_remove_device(cpu);
1105 break; 1105 break;
1106 case CPU_DOWN_PREPARE: 1106 case CPU_DOWN_PREPARE:
1107 case CPU_DOWN_PREPARE_FROZEN: 1107 case CPU_DOWN_PREPARE_FROZEN:
1108 del_timer_sync(t); 1108 del_timer_sync(t);
1109 smp_call_function_single(cpu, mce_disable_cpu, &action, 1); 1109 smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
1110 break; 1110 break;
1111 case CPU_DOWN_FAILED: 1111 case CPU_DOWN_FAILED:
1112 case CPU_DOWN_FAILED_FROZEN: 1112 case CPU_DOWN_FAILED_FROZEN:
1113 t->expires = round_jiffies(jiffies + 1113 t->expires = round_jiffies(jiffies +
1114 __get_cpu_var(next_interval)); 1114 __get_cpu_var(next_interval));
1115 add_timer_on(t, cpu); 1115 add_timer_on(t, cpu);
1116 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); 1116 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
1117 break; 1117 break;
1118 case CPU_POST_DEAD: 1118 case CPU_POST_DEAD:
1119 /* intentionally ignoring frozen here */ 1119 /* intentionally ignoring frozen here */
1120 cmci_rediscover(cpu); 1120 cmci_rediscover(cpu);
1121 break; 1121 break;
1122 } 1122 }
1123 return NOTIFY_OK; 1123 return NOTIFY_OK;
1124 } 1124 }
1125 1125
1126 static struct notifier_block mce_cpu_notifier __cpuinitdata = { 1126 static struct notifier_block mce_cpu_notifier __cpuinitdata = {
1127 .notifier_call = mce_cpu_callback, 1127 .notifier_call = mce_cpu_callback,
1128 }; 1128 };
1129 1129
1130 static __init int mce_init_banks(void) 1130 static __init int mce_init_banks(void)
1131 { 1131 {
1132 int i; 1132 int i;
1133 1133
1134 bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks, 1134 bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
1135 GFP_KERNEL); 1135 GFP_KERNEL);
1136 if (!bank_attrs) 1136 if (!bank_attrs)
1137 return -ENOMEM; 1137 return -ENOMEM;
1138 1138
1139 for (i = 0; i < banks; i++) { 1139 for (i = 0; i < banks; i++) {
1140 struct sysdev_attribute *a = &bank_attrs[i]; 1140 struct sysdev_attribute *a = &bank_attrs[i];
1141 a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i); 1141 a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i);
1142 if (!a->attr.name) 1142 if (!a->attr.name)
1143 goto nomem; 1143 goto nomem;
1144 a->attr.mode = 0644; 1144 a->attr.mode = 0644;
1145 a->show = show_bank; 1145 a->show = show_bank;
1146 a->store = set_bank; 1146 a->store = set_bank;
1147 } 1147 }
1148 return 0; 1148 return 0;
1149 1149
1150 nomem: 1150 nomem:
1151 while (--i >= 0) 1151 while (--i >= 0)
1152 kfree(bank_attrs[i].attr.name); 1152 kfree(bank_attrs[i].attr.name);
1153 kfree(bank_attrs); 1153 kfree(bank_attrs);
1154 bank_attrs = NULL; 1154 bank_attrs = NULL;
1155 return -ENOMEM; 1155 return -ENOMEM;
1156 } 1156 }
1157 1157
1158 static __init int mce_init_device(void) 1158 static __init int mce_init_device(void)
1159 { 1159 {
1160 int err; 1160 int err;
1161 int i = 0; 1161 int i = 0;
1162 1162
1163 if (!mce_available(&boot_cpu_data)) 1163 if (!mce_available(&boot_cpu_data))
1164 return -EIO; 1164 return -EIO;
1165 1165
1166 alloc_cpumask_var(&mce_device_initialized, GFP_KERNEL); 1166 zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL);
1167 1167
1168 err = mce_init_banks(); 1168 err = mce_init_banks();
1169 if (err) 1169 if (err)
1170 return err; 1170 return err;
1171 1171
1172 err = sysdev_class_register(&mce_sysclass); 1172 err = sysdev_class_register(&mce_sysclass);
1173 if (err) 1173 if (err)
1174 return err; 1174 return err;
1175 1175
1176 for_each_online_cpu(i) { 1176 for_each_online_cpu(i) {
1177 err = mce_create_device(i); 1177 err = mce_create_device(i);
1178 if (err) 1178 if (err)
1179 return err; 1179 return err;
1180 } 1180 }
1181 1181
1182 register_hotcpu_notifier(&mce_cpu_notifier); 1182 register_hotcpu_notifier(&mce_cpu_notifier);
1183 misc_register(&mce_log_device); 1183 misc_register(&mce_log_device);
1184 return err; 1184 return err;
1185 } 1185 }
1186 1186
1187 device_initcall(mce_init_device); 1187 device_initcall(mce_init_device);
1188 1188
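The only functional change in the mce_init_device() hunk above is the switch from alloc_cpumask_var() to zalloc_cpumask_var() for mce_device_initialized. As a hedged aside (not part of the commit), the sketch below contrasts the two helpers under the assumption that CONFIG_CPUMASK_OFFSTACK=y, so cpumask_var_t is heap-backed; the demo_* identifiers are invented purely for illustration.

/* Illustrative sketch only -- demo_* names are hypothetical, not from the commit. */
#include <linux/module.h>
#include <linux/cpumask.h>
#include <linux/slab.h>

static cpumask_var_t demo_mask;

static int __init demo_init(void)
{
	/*
	 * alloc_cpumask_var() would hand back uninitialized storage here;
	 * zalloc_cpumask_var() returns it zero-filled, so the mask can be
	 * tested immediately without an explicit cpumask_clear().
	 */
	if (!zalloc_cpumask_var(&demo_mask, GFP_KERNEL))
		return -ENOMEM;

	WARN_ON(!cpumask_empty(demo_mask));	/* every bit starts cleared */
	cpumask_set_cpu(0, demo_mask);		/* mark cpu 0 as initialized */
	return 0;
}

static void __exit demo_exit(void)
{
	free_cpumask_var(demo_mask);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
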
arch/x86/kernel/tlb_uv.c
1 /* 1 /*
2 * SGI UltraViolet TLB flush routines. 2 * SGI UltraViolet TLB flush routines.
3 * 3 *
4 * (c) 2008 Cliff Wickman <cpw@sgi.com>, SGI. 4 * (c) 2008 Cliff Wickman <cpw@sgi.com>, SGI.
5 * 5 *
6 * This code is released under the GNU General Public License version 2 or 6 * This code is released under the GNU General Public License version 2 or
7 * later. 7 * later.
8 */ 8 */
9 #include <linux/seq_file.h> 9 #include <linux/seq_file.h>
10 #include <linux/proc_fs.h> 10 #include <linux/proc_fs.h>
11 #include <linux/kernel.h> 11 #include <linux/kernel.h>
12 12
13 #include <asm/mmu_context.h> 13 #include <asm/mmu_context.h>
14 #include <asm/uv/uv.h> 14 #include <asm/uv/uv.h>
15 #include <asm/uv/uv_mmrs.h> 15 #include <asm/uv/uv_mmrs.h>
16 #include <asm/uv/uv_hub.h> 16 #include <asm/uv/uv_hub.h>
17 #include <asm/uv/uv_bau.h> 17 #include <asm/uv/uv_bau.h>
18 #include <asm/apic.h> 18 #include <asm/apic.h>
19 #include <asm/idle.h> 19 #include <asm/idle.h>
20 #include <asm/tsc.h> 20 #include <asm/tsc.h>
21 #include <asm/irq_vectors.h> 21 #include <asm/irq_vectors.h>
22 22
23 static struct bau_control **uv_bau_table_bases __read_mostly; 23 static struct bau_control **uv_bau_table_bases __read_mostly;
24 static int uv_bau_retry_limit __read_mostly; 24 static int uv_bau_retry_limit __read_mostly;
25 25
26 /* position of pnode (which is nasid>>1): */ 26 /* position of pnode (which is nasid>>1): */
27 static int uv_nshift __read_mostly; 27 static int uv_nshift __read_mostly;
28 /* base pnode in this partition */ 28 /* base pnode in this partition */
29 static int uv_partition_base_pnode __read_mostly; 29 static int uv_partition_base_pnode __read_mostly;
30 30
31 static unsigned long uv_mmask __read_mostly; 31 static unsigned long uv_mmask __read_mostly;
32 32
33 static DEFINE_PER_CPU(struct ptc_stats, ptcstats); 33 static DEFINE_PER_CPU(struct ptc_stats, ptcstats);
34 static DEFINE_PER_CPU(struct bau_control, bau_control); 34 static DEFINE_PER_CPU(struct bau_control, bau_control);
35 35
36 /* 36 /*
37 * Determine the first node on a blade. 37 * Determine the first node on a blade.
38 */ 38 */
39 static int __init blade_to_first_node(int blade) 39 static int __init blade_to_first_node(int blade)
40 { 40 {
41 int node, b; 41 int node, b;
42 42
43 for_each_online_node(node) { 43 for_each_online_node(node) {
44 b = uv_node_to_blade_id(node); 44 b = uv_node_to_blade_id(node);
45 if (blade == b) 45 if (blade == b)
46 return node; 46 return node;
47 } 47 }
48 return -1; /* shouldn't happen */ 48 return -1; /* shouldn't happen */
49 } 49 }
50 50
51 /* 51 /*
52 * Determine the apicid of the first cpu on a blade. 52 * Determine the apicid of the first cpu on a blade.
53 */ 53 */
54 static int __init blade_to_first_apicid(int blade) 54 static int __init blade_to_first_apicid(int blade)
55 { 55 {
56 int cpu; 56 int cpu;
57 57
58 for_each_present_cpu(cpu) 58 for_each_present_cpu(cpu)
59 if (blade == uv_cpu_to_blade_id(cpu)) 59 if (blade == uv_cpu_to_blade_id(cpu))
60 return per_cpu(x86_cpu_to_apicid, cpu); 60 return per_cpu(x86_cpu_to_apicid, cpu);
61 return -1; 61 return -1;
62 } 62 }
63 63
64 /* 64 /*
65 * Free a software acknowledge hardware resource by clearing its Pending 65 * Free a software acknowledge hardware resource by clearing its Pending
66 * bit. This will return a reply to the sender. 66 * bit. This will return a reply to the sender.
67 * If the message has timed out, a reply has already been sent by the 67 * If the message has timed out, a reply has already been sent by the
68 * hardware but the resource has not been released. In that case our 68 * hardware but the resource has not been released. In that case our
69 * clear of the Timeout bit (as well) will free the resource. No reply will 69 * clear of the Timeout bit (as well) will free the resource. No reply will
70 * be sent (the hardware will only do one reply per message). 70 * be sent (the hardware will only do one reply per message).
71 */ 71 */
72 static void uv_reply_to_message(int resource, 72 static void uv_reply_to_message(int resource,
73 struct bau_payload_queue_entry *msg, 73 struct bau_payload_queue_entry *msg,
74 struct bau_msg_status *msp) 74 struct bau_msg_status *msp)
75 { 75 {
76 unsigned long dw; 76 unsigned long dw;
77 77
78 dw = (1 << (resource + UV_SW_ACK_NPENDING)) | (1 << resource); 78 dw = (1 << (resource + UV_SW_ACK_NPENDING)) | (1 << resource);
79 msg->replied_to = 1; 79 msg->replied_to = 1;
80 msg->sw_ack_vector = 0; 80 msg->sw_ack_vector = 0;
81 if (msp) 81 if (msp)
82 msp->seen_by.bits = 0; 82 msp->seen_by.bits = 0;
83 uv_write_local_mmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, dw); 83 uv_write_local_mmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, dw);
84 } 84 }
85 85
86 /* 86 /*
87 * Do all the things a cpu should do for a TLB shootdown message. 87 * Do all the things a cpu should do for a TLB shootdown message.
88 * Other cpu's may come here at the same time for this message. 88 * Other cpu's may come here at the same time for this message.
89 */ 89 */
90 static void uv_bau_process_message(struct bau_payload_queue_entry *msg, 90 static void uv_bau_process_message(struct bau_payload_queue_entry *msg,
91 int msg_slot, int sw_ack_slot) 91 int msg_slot, int sw_ack_slot)
92 { 92 {
93 unsigned long this_cpu_mask; 93 unsigned long this_cpu_mask;
94 struct bau_msg_status *msp; 94 struct bau_msg_status *msp;
95 int cpu; 95 int cpu;
96 96
97 msp = __get_cpu_var(bau_control).msg_statuses + msg_slot; 97 msp = __get_cpu_var(bau_control).msg_statuses + msg_slot;
98 cpu = uv_blade_processor_id(); 98 cpu = uv_blade_processor_id();
99 msg->number_of_cpus = 99 msg->number_of_cpus =
100 uv_blade_nr_online_cpus(uv_node_to_blade_id(numa_node_id())); 100 uv_blade_nr_online_cpus(uv_node_to_blade_id(numa_node_id()));
101 this_cpu_mask = 1UL << cpu; 101 this_cpu_mask = 1UL << cpu;
102 if (msp->seen_by.bits & this_cpu_mask) 102 if (msp->seen_by.bits & this_cpu_mask)
103 return; 103 return;
104 atomic_or_long(&msp->seen_by.bits, this_cpu_mask); 104 atomic_or_long(&msp->seen_by.bits, this_cpu_mask);
105 105
106 if (msg->replied_to == 1) 106 if (msg->replied_to == 1)
107 return; 107 return;
108 108
109 if (msg->address == TLB_FLUSH_ALL) { 109 if (msg->address == TLB_FLUSH_ALL) {
110 local_flush_tlb(); 110 local_flush_tlb();
111 __get_cpu_var(ptcstats).alltlb++; 111 __get_cpu_var(ptcstats).alltlb++;
112 } else { 112 } else {
113 __flush_tlb_one(msg->address); 113 __flush_tlb_one(msg->address);
114 __get_cpu_var(ptcstats).onetlb++; 114 __get_cpu_var(ptcstats).onetlb++;
115 } 115 }
116 116
117 __get_cpu_var(ptcstats).requestee++; 117 __get_cpu_var(ptcstats).requestee++;
118 118
119 atomic_inc_short(&msg->acknowledge_count); 119 atomic_inc_short(&msg->acknowledge_count);
120 if (msg->number_of_cpus == msg->acknowledge_count) 120 if (msg->number_of_cpus == msg->acknowledge_count)
121 uv_reply_to_message(sw_ack_slot, msg, msp); 121 uv_reply_to_message(sw_ack_slot, msg, msp);
122 } 122 }
123 123
124 /* 124 /*
125 * Examine the payload queue on one distribution node to see 125 * Examine the payload queue on one distribution node to see
126 * which messages have not been seen, and which cpu(s) have not seen them. 126 * which messages have not been seen, and which cpu(s) have not seen them.
127 * 127 *
128 * Returns the number of cpu's that have not responded. 128 * Returns the number of cpu's that have not responded.
129 */ 129 */
130 static int uv_examine_destination(struct bau_control *bau_tablesp, int sender) 130 static int uv_examine_destination(struct bau_control *bau_tablesp, int sender)
131 { 131 {
132 struct bau_payload_queue_entry *msg; 132 struct bau_payload_queue_entry *msg;
133 struct bau_msg_status *msp; 133 struct bau_msg_status *msp;
134 int count = 0; 134 int count = 0;
135 int i; 135 int i;
136 int j; 136 int j;
137 137
138 for (msg = bau_tablesp->va_queue_first, i = 0; i < DEST_Q_SIZE; 138 for (msg = bau_tablesp->va_queue_first, i = 0; i < DEST_Q_SIZE;
139 msg++, i++) { 139 msg++, i++) {
140 if ((msg->sending_cpu == sender) && (!msg->replied_to)) { 140 if ((msg->sending_cpu == sender) && (!msg->replied_to)) {
141 msp = bau_tablesp->msg_statuses + i; 141 msp = bau_tablesp->msg_statuses + i;
142 printk(KERN_DEBUG 142 printk(KERN_DEBUG
143 "blade %d: address:%#lx %d of %d, not cpu(s): ", 143 "blade %d: address:%#lx %d of %d, not cpu(s): ",
144 i, msg->address, msg->acknowledge_count, 144 i, msg->address, msg->acknowledge_count,
145 msg->number_of_cpus); 145 msg->number_of_cpus);
146 for (j = 0; j < msg->number_of_cpus; j++) { 146 for (j = 0; j < msg->number_of_cpus; j++) {
147 if (!((1L << j) & msp->seen_by.bits)) { 147 if (!((1L << j) & msp->seen_by.bits)) {
148 count++; 148 count++;
149 printk("%d ", j); 149 printk("%d ", j);
150 } 150 }
151 } 151 }
152 printk("\n"); 152 printk("\n");
153 } 153 }
154 } 154 }
155 return count; 155 return count;
156 } 156 }
157 157
158 /* 158 /*
159 * Examine the payload queue on all the distribution nodes to see 159 * Examine the payload queue on all the distribution nodes to see
160 * which messages have not been seen, and which cpu(s) have not seen them. 160 * which messages have not been seen, and which cpu(s) have not seen them.
161 * 161 *
162 * Returns the number of cpu's that have not responded. 162 * Returns the number of cpu's that have not responded.
163 */ 163 */
164 static int uv_examine_destinations(struct bau_target_nodemask *distribution) 164 static int uv_examine_destinations(struct bau_target_nodemask *distribution)
165 { 165 {
166 int sender; 166 int sender;
167 int i; 167 int i;
168 int count = 0; 168 int count = 0;
169 169
170 sender = smp_processor_id(); 170 sender = smp_processor_id();
171 for (i = 0; i < sizeof(struct bau_target_nodemask) * BITSPERBYTE; i++) { 171 for (i = 0; i < sizeof(struct bau_target_nodemask) * BITSPERBYTE; i++) {
172 if (!bau_node_isset(i, distribution)) 172 if (!bau_node_isset(i, distribution))
173 continue; 173 continue;
174 count += uv_examine_destination(uv_bau_table_bases[i], sender); 174 count += uv_examine_destination(uv_bau_table_bases[i], sender);
175 } 175 }
176 return count; 176 return count;
177 } 177 }
178 178
179 /* 179 /*
180 * wait for completion of a broadcast message 180 * wait for completion of a broadcast message
181 * 181 *
182 * return COMPLETE, RETRY or GIVEUP 182 * return COMPLETE, RETRY or GIVEUP
183 */ 183 */
184 static int uv_wait_completion(struct bau_desc *bau_desc, 184 static int uv_wait_completion(struct bau_desc *bau_desc,
185 unsigned long mmr_offset, int right_shift) 185 unsigned long mmr_offset, int right_shift)
186 { 186 {
187 int exams = 0; 187 int exams = 0;
188 long destination_timeouts = 0; 188 long destination_timeouts = 0;
189 long source_timeouts = 0; 189 long source_timeouts = 0;
190 unsigned long descriptor_status; 190 unsigned long descriptor_status;
191 191
192 while ((descriptor_status = (((unsigned long) 192 while ((descriptor_status = (((unsigned long)
193 uv_read_local_mmr(mmr_offset) >> 193 uv_read_local_mmr(mmr_offset) >>
194 right_shift) & UV_ACT_STATUS_MASK)) != 194 right_shift) & UV_ACT_STATUS_MASK)) !=
195 DESC_STATUS_IDLE) { 195 DESC_STATUS_IDLE) {
196 if (descriptor_status == DESC_STATUS_SOURCE_TIMEOUT) { 196 if (descriptor_status == DESC_STATUS_SOURCE_TIMEOUT) {
197 source_timeouts++; 197 source_timeouts++;
198 if (source_timeouts > SOURCE_TIMEOUT_LIMIT) 198 if (source_timeouts > SOURCE_TIMEOUT_LIMIT)
199 source_timeouts = 0; 199 source_timeouts = 0;
200 __get_cpu_var(ptcstats).s_retry++; 200 __get_cpu_var(ptcstats).s_retry++;
201 return FLUSH_RETRY; 201 return FLUSH_RETRY;
202 } 202 }
203 /* 203 /*
204 * spin here looking for progress at the destinations 204 * spin here looking for progress at the destinations
205 */ 205 */
206 if (descriptor_status == DESC_STATUS_DESTINATION_TIMEOUT) { 206 if (descriptor_status == DESC_STATUS_DESTINATION_TIMEOUT) {
207 destination_timeouts++; 207 destination_timeouts++;
208 if (destination_timeouts > DESTINATION_TIMEOUT_LIMIT) { 208 if (destination_timeouts > DESTINATION_TIMEOUT_LIMIT) {
209 /* 209 /*
210 * returns number of cpus not responding 210 * returns number of cpus not responding
211 */ 211 */
212 if (uv_examine_destinations 212 if (uv_examine_destinations
213 (&bau_desc->distribution) == 0) { 213 (&bau_desc->distribution) == 0) {
214 __get_cpu_var(ptcstats).d_retry++; 214 __get_cpu_var(ptcstats).d_retry++;
215 return FLUSH_RETRY; 215 return FLUSH_RETRY;
216 } 216 }
217 exams++; 217 exams++;
218 if (exams >= uv_bau_retry_limit) { 218 if (exams >= uv_bau_retry_limit) {
219 printk(KERN_DEBUG 219 printk(KERN_DEBUG
220 "uv_flush_tlb_others"); 220 "uv_flush_tlb_others");
221 printk("giving up on cpu %d\n", 221 printk("giving up on cpu %d\n",
222 smp_processor_id()); 222 smp_processor_id());
223 return FLUSH_GIVEUP; 223 return FLUSH_GIVEUP;
224 } 224 }
225 /* 225 /*
226 * delays can hang the simulator 226 * delays can hang the simulator
227 udelay(1000); 227 udelay(1000);
228 */ 228 */
229 destination_timeouts = 0; 229 destination_timeouts = 0;
230 } 230 }
231 } 231 }
232 cpu_relax(); 232 cpu_relax();
233 } 233 }
234 return FLUSH_COMPLETE; 234 return FLUSH_COMPLETE;
235 } 235 }
236 236
237 /** 237 /**
238 * uv_flush_send_and_wait 238 * uv_flush_send_and_wait
239 * 239 *
240 * Send a broadcast and wait for a broadcast message to complete. 240 * Send a broadcast and wait for a broadcast message to complete.
241 * 241 *
242 * The flush_mask contains the cpus the broadcast was sent to. 242 * The flush_mask contains the cpus the broadcast was sent to.
243 * 243 *
244 * Returns NULL if all remote flushing was done. The mask is zeroed. 244 * Returns NULL if all remote flushing was done. The mask is zeroed.
245 * Returns @flush_mask if some remote flushing remains to be done. The 245 * Returns @flush_mask if some remote flushing remains to be done. The
246 * mask will have some bits still set. 246 * mask will have some bits still set.
247 */ 247 */
248 const struct cpumask *uv_flush_send_and_wait(int cpu, int this_pnode, 248 const struct cpumask *uv_flush_send_and_wait(int cpu, int this_pnode,
249 struct bau_desc *bau_desc, 249 struct bau_desc *bau_desc,
250 struct cpumask *flush_mask) 250 struct cpumask *flush_mask)
251 { 251 {
252 int completion_status = 0; 252 int completion_status = 0;
253 int right_shift; 253 int right_shift;
254 int tries = 0; 254 int tries = 0;
255 int pnode; 255 int pnode;
256 int bit; 256 int bit;
257 unsigned long mmr_offset; 257 unsigned long mmr_offset;
258 unsigned long index; 258 unsigned long index;
259 cycles_t time1; 259 cycles_t time1;
260 cycles_t time2; 260 cycles_t time2;
261 261
262 if (cpu < UV_CPUS_PER_ACT_STATUS) { 262 if (cpu < UV_CPUS_PER_ACT_STATUS) {
263 mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0; 263 mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
264 right_shift = cpu * UV_ACT_STATUS_SIZE; 264 right_shift = cpu * UV_ACT_STATUS_SIZE;
265 } else { 265 } else {
266 mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1; 266 mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1;
267 right_shift = 267 right_shift =
268 ((cpu - UV_CPUS_PER_ACT_STATUS) * UV_ACT_STATUS_SIZE); 268 ((cpu - UV_CPUS_PER_ACT_STATUS) * UV_ACT_STATUS_SIZE);
269 } 269 }
270 time1 = get_cycles(); 270 time1 = get_cycles();
271 do { 271 do {
272 tries++; 272 tries++;
273 index = (1UL << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) | 273 index = (1UL << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) |
274 cpu; 274 cpu;
275 uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index); 275 uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index);
276 completion_status = uv_wait_completion(bau_desc, mmr_offset, 276 completion_status = uv_wait_completion(bau_desc, mmr_offset,
277 right_shift); 277 right_shift);
278 } while (completion_status == FLUSH_RETRY); 278 } while (completion_status == FLUSH_RETRY);
279 time2 = get_cycles(); 279 time2 = get_cycles();
280 __get_cpu_var(ptcstats).sflush += (time2 - time1); 280 __get_cpu_var(ptcstats).sflush += (time2 - time1);
281 if (tries > 1) 281 if (tries > 1)
282 __get_cpu_var(ptcstats).retriesok++; 282 __get_cpu_var(ptcstats).retriesok++;
283 283
284 if (completion_status == FLUSH_GIVEUP) { 284 if (completion_status == FLUSH_GIVEUP) {
285 /* 285 /*
286 * Cause the caller to do an IPI-style TLB shootdown on 286 * Cause the caller to do an IPI-style TLB shootdown on
287 * the cpu's, all of which are still in the mask. 287 * the cpu's, all of which are still in the mask.
288 */ 288 */
289 __get_cpu_var(ptcstats).ptc_i++; 289 __get_cpu_var(ptcstats).ptc_i++;
290 return flush_mask; 290 return flush_mask;
291 } 291 }
292 292
293 /* 293 /*
294 * Success, so clear the remote cpu's from the mask so we don't 294 * Success, so clear the remote cpu's from the mask so we don't
295 * use the IPI method of shootdown on them. 295 * use the IPI method of shootdown on them.
296 */ 296 */
297 for_each_cpu(bit, flush_mask) { 297 for_each_cpu(bit, flush_mask) {
298 pnode = uv_cpu_to_pnode(bit); 298 pnode = uv_cpu_to_pnode(bit);
299 if (pnode == this_pnode) 299 if (pnode == this_pnode)
300 continue; 300 continue;
301 cpumask_clear_cpu(bit, flush_mask); 301 cpumask_clear_cpu(bit, flush_mask);
302 } 302 }
303 if (!cpumask_empty(flush_mask)) 303 if (!cpumask_empty(flush_mask))
304 return flush_mask; 304 return flush_mask;
305 return NULL; 305 return NULL;
306 } 306 }
307 307
308 static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask); 308 static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask);
309 309
310 /** 310 /**
311 * uv_flush_tlb_others - globally purge translation cache of a virtual 311 * uv_flush_tlb_others - globally purge translation cache of a virtual
312 * address or all TLB's 312 * address or all TLB's
313 * @cpumask: mask of all cpu's in which the address is to be removed 313 * @cpumask: mask of all cpu's in which the address is to be removed
314 * @mm: mm_struct containing virtual address range 314 * @mm: mm_struct containing virtual address range
315 * @va: virtual address to be removed (or TLB_FLUSH_ALL for all TLB's on cpu) 315 * @va: virtual address to be removed (or TLB_FLUSH_ALL for all TLB's on cpu)
316 * @cpu: the current cpu 316 * @cpu: the current cpu
317 * 317 *
318 * This is the entry point for initiating any UV global TLB shootdown. 318 * This is the entry point for initiating any UV global TLB shootdown.
319 * 319 *
320 * Purges the translation caches of all specified processors of the given 320 * Purges the translation caches of all specified processors of the given
321 * virtual address, or purges all TLB's on specified processors. 321 * virtual address, or purges all TLB's on specified processors.
322 * 322 *
323 * The caller has derived the cpumask from the mm_struct. This function 323 * The caller has derived the cpumask from the mm_struct. This function
324 * is called only if there are bits set in the mask. (e.g. flush_tlb_page()) 324 * is called only if there are bits set in the mask. (e.g. flush_tlb_page())
325 * 325 *
326 * The cpumask is converted into a nodemask of the nodes containing 326 * The cpumask is converted into a nodemask of the nodes containing
327 * the cpus. 327 * the cpus.
328 * 328 *
329 * Note that this function should be called with preemption disabled. 329 * Note that this function should be called with preemption disabled.
330 * 330 *
331 * Returns NULL if all remote flushing was done. 331 * Returns NULL if all remote flushing was done.
332 * Returns pointer to cpumask if some remote flushing remains to be 332 * Returns pointer to cpumask if some remote flushing remains to be
333 * done. The returned pointer is valid till preemption is re-enabled. 333 * done. The returned pointer is valid till preemption is re-enabled.
334 */ 334 */
335 const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, 335 const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
336 struct mm_struct *mm, 336 struct mm_struct *mm,
337 unsigned long va, unsigned int cpu) 337 unsigned long va, unsigned int cpu)
338 { 338 {
339 struct cpumask *flush_mask = __get_cpu_var(uv_flush_tlb_mask); 339 struct cpumask *flush_mask = __get_cpu_var(uv_flush_tlb_mask);
340 int i; 340 int i;
341 int bit; 341 int bit;
342 int pnode; 342 int pnode;
343 int uv_cpu; 343 int uv_cpu;
344 int this_pnode; 344 int this_pnode;
345 int locals = 0; 345 int locals = 0;
346 struct bau_desc *bau_desc; 346 struct bau_desc *bau_desc;
347 347
348 cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu)); 348 cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
349 349
350 uv_cpu = uv_blade_processor_id(); 350 uv_cpu = uv_blade_processor_id();
351 this_pnode = uv_hub_info->pnode; 351 this_pnode = uv_hub_info->pnode;
352 bau_desc = __get_cpu_var(bau_control).descriptor_base; 352 bau_desc = __get_cpu_var(bau_control).descriptor_base;
353 bau_desc += UV_ITEMS_PER_DESCRIPTOR * uv_cpu; 353 bau_desc += UV_ITEMS_PER_DESCRIPTOR * uv_cpu;
354 354
355 bau_nodes_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); 355 bau_nodes_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
356 356
357 i = 0; 357 i = 0;
358 for_each_cpu(bit, flush_mask) { 358 for_each_cpu(bit, flush_mask) {
359 pnode = uv_cpu_to_pnode(bit); 359 pnode = uv_cpu_to_pnode(bit);
360 BUG_ON(pnode > (UV_DISTRIBUTION_SIZE - 1)); 360 BUG_ON(pnode > (UV_DISTRIBUTION_SIZE - 1));
361 if (pnode == this_pnode) { 361 if (pnode == this_pnode) {
362 locals++; 362 locals++;
363 continue; 363 continue;
364 } 364 }
365 bau_node_set(pnode - uv_partition_base_pnode, 365 bau_node_set(pnode - uv_partition_base_pnode,
366 &bau_desc->distribution); 366 &bau_desc->distribution);
367 i++; 367 i++;
368 } 368 }
369 if (i == 0) { 369 if (i == 0) {
370 /* 370 /*
371 * no off_node flushing; return status for local node 371 * no off_node flushing; return status for local node
372 */ 372 */
373 if (locals) 373 if (locals)
374 return flush_mask; 374 return flush_mask;
375 else 375 else
376 return NULL; 376 return NULL;
377 } 377 }
378 __get_cpu_var(ptcstats).requestor++; 378 __get_cpu_var(ptcstats).requestor++;
379 __get_cpu_var(ptcstats).ntargeted += i; 379 __get_cpu_var(ptcstats).ntargeted += i;
380 380
381 bau_desc->payload.address = va; 381 bau_desc->payload.address = va;
382 bau_desc->payload.sending_cpu = cpu; 382 bau_desc->payload.sending_cpu = cpu;
383 383
384 return uv_flush_send_and_wait(uv_cpu, this_pnode, bau_desc, flush_mask); 384 return uv_flush_send_and_wait(uv_cpu, this_pnode, bau_desc, flush_mask);
385 } 385 }
386 386
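The contract documented above -- NULL when the hardware broadcast covered every remote cpu, otherwise a pointer whose set bits are the cpus that still need flushing -- is what lets the generic flush path fall back to IPIs only when necessary. A minimal, hypothetical caller sketch (example_flush() and ipi_flush_remaining() are placeholder names, not something defined in this commit; preemption is assumed to be disabled by the caller, as the comment block requires):

    /* illustrative wrapper only; ipi_flush_remaining() is a placeholder */
    static void example_flush(const struct cpumask *cpumask,
                              struct mm_struct *mm, unsigned long va)
    {
        const struct cpumask *remaining;

        /* preemption is assumed to be disabled here */
        remaining = uv_flush_tlb_others(cpumask, mm, va, smp_processor_id());
        if (remaining)
            ipi_flush_remaining(remaining, mm, va);  /* fall back to IPIs */
        /* 'remaining' aliases a per-cpu mask: it must not be used once
         * preemption is re-enabled */
    }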
387 /* 387 /*
388 * The BAU message interrupt comes here. (registered by set_intr_gate) 388 * The BAU message interrupt comes here. (registered by set_intr_gate)
389 * See entry_64.S 389 * See entry_64.S
390 * 390 *
391 * We received a broadcast assist message. 391 * We received a broadcast assist message.
392 * 392 *
393 * Interrupts may have been disabled; this interrupt could represent 393 * Interrupts may have been disabled; this interrupt could represent
394 * the receipt of several messages. 394 * the receipt of several messages.
395 * 395 *
396 * All cores/threads on this node get this interrupt. 396 * All cores/threads on this node get this interrupt.
397 * The last one to see it does the s/w ack. 397 * The last one to see it does the s/w ack.
398 * (the resource will not be freed until noninterruptable cpus see this 398 * (the resource will not be freed until noninterruptable cpus see this
399 * interrupt; hardware will timeout the s/w ack and reply ERROR) 399 * interrupt; hardware will timeout the s/w ack and reply ERROR)
400 */ 400 */
401 void uv_bau_message_interrupt(struct pt_regs *regs) 401 void uv_bau_message_interrupt(struct pt_regs *regs)
402 { 402 {
403 struct bau_payload_queue_entry *va_queue_first; 403 struct bau_payload_queue_entry *va_queue_first;
404 struct bau_payload_queue_entry *va_queue_last; 404 struct bau_payload_queue_entry *va_queue_last;
405 struct bau_payload_queue_entry *msg; 405 struct bau_payload_queue_entry *msg;
406 struct pt_regs *old_regs = set_irq_regs(regs); 406 struct pt_regs *old_regs = set_irq_regs(regs);
407 cycles_t time1; 407 cycles_t time1;
408 cycles_t time2; 408 cycles_t time2;
409 int msg_slot; 409 int msg_slot;
410 int sw_ack_slot; 410 int sw_ack_slot;
411 int fw; 411 int fw;
412 int count = 0; 412 int count = 0;
413 unsigned long local_pnode; 413 unsigned long local_pnode;
414 414
415 ack_APIC_irq(); 415 ack_APIC_irq();
416 exit_idle(); 416 exit_idle();
417 irq_enter(); 417 irq_enter();
418 418
419 time1 = get_cycles(); 419 time1 = get_cycles();
420 420
421 local_pnode = uv_blade_to_pnode(uv_numa_blade_id()); 421 local_pnode = uv_blade_to_pnode(uv_numa_blade_id());
422 422
423 va_queue_first = __get_cpu_var(bau_control).va_queue_first; 423 va_queue_first = __get_cpu_var(bau_control).va_queue_first;
424 va_queue_last = __get_cpu_var(bau_control).va_queue_last; 424 va_queue_last = __get_cpu_var(bau_control).va_queue_last;
425 425
426 msg = __get_cpu_var(bau_control).bau_msg_head; 426 msg = __get_cpu_var(bau_control).bau_msg_head;
427 while (msg->sw_ack_vector) { 427 while (msg->sw_ack_vector) {
428 count++; 428 count++;
429 fw = msg->sw_ack_vector; 429 fw = msg->sw_ack_vector;
430 msg_slot = msg - va_queue_first; 430 msg_slot = msg - va_queue_first;
431 sw_ack_slot = ffs(fw) - 1; 431 sw_ack_slot = ffs(fw) - 1;
432 432
433 uv_bau_process_message(msg, msg_slot, sw_ack_slot); 433 uv_bau_process_message(msg, msg_slot, sw_ack_slot);
434 434
435 msg++; 435 msg++;
436 if (msg > va_queue_last) 436 if (msg > va_queue_last)
437 msg = va_queue_first; 437 msg = va_queue_first;
438 __get_cpu_var(bau_control).bau_msg_head = msg; 438 __get_cpu_var(bau_control).bau_msg_head = msg;
439 } 439 }
440 if (!count) 440 if (!count)
441 __get_cpu_var(ptcstats).nomsg++; 441 __get_cpu_var(ptcstats).nomsg++;
442 else if (count > 1) 442 else if (count > 1)
443 __get_cpu_var(ptcstats).multmsg++; 443 __get_cpu_var(ptcstats).multmsg++;
444 444
445 time2 = get_cycles(); 445 time2 = get_cycles();
446 __get_cpu_var(ptcstats).dflush += (time2 - time1); 446 __get_cpu_var(ptcstats).dflush += (time2 - time1);
447 447
448 irq_exit(); 448 irq_exit();
449 set_irq_regs(old_regs); 449 set_irq_regs(old_regs);
450 } 450 }
451 451
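One detail of the receive loop above that is easy to misread: sw_ack_vector is a bitmask of the software-ack resources pending for the message, and ffs() is 1-based, so ffs(fw) - 1 converts the lowest set bit into a zero-based slot index, while msg - va_queue_first yields the payload-queue slot by plain pointer arithmetic. A two-line illustration of the arithmetic only:

    int fw = 0x08;                  /* bit 3 set */
    int sw_ack_slot = ffs(fw) - 1;  /* ffs() returns 4, so the slot is 3 */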
452 /* 452 /*
453 * uv_enable_timeouts 453 * uv_enable_timeouts
454 * 454 *
455 * Each target blade (i.e. blades that have cpu's) needs to have 455 * Each target blade (i.e. blades that have cpu's) needs to have
456 * shootdown message timeouts enabled. The timeout does not cause 456 * shootdown message timeouts enabled. The timeout does not cause
457 * an interrupt, but causes an error message to be returned to 457 * an interrupt, but causes an error message to be returned to
458 * the sender. 458 * the sender.
459 */ 459 */
460 static void uv_enable_timeouts(void) 460 static void uv_enable_timeouts(void)
461 { 461 {
462 int blade; 462 int blade;
463 int nblades; 463 int nblades;
464 int pnode; 464 int pnode;
465 unsigned long mmr_image; 465 unsigned long mmr_image;
466 466
467 nblades = uv_num_possible_blades(); 467 nblades = uv_num_possible_blades();
468 468
469 for (blade = 0; blade < nblades; blade++) { 469 for (blade = 0; blade < nblades; blade++) {
470 if (!uv_blade_nr_possible_cpus(blade)) 470 if (!uv_blade_nr_possible_cpus(blade))
471 continue; 471 continue;
472 472
473 pnode = uv_blade_to_pnode(blade); 473 pnode = uv_blade_to_pnode(blade);
474 mmr_image = 474 mmr_image =
475 uv_read_global_mmr64(pnode, UVH_LB_BAU_MISC_CONTROL); 475 uv_read_global_mmr64(pnode, UVH_LB_BAU_MISC_CONTROL);
476 /* 476 /*
477 * Set the timeout period and then lock it in, in three 477 * Set the timeout period and then lock it in, in three
478 * steps; captures and locks in the period. 478 * steps; captures and locks in the period.
479 * 479 *
480 * To program the period, the SOFT_ACK_MODE must be off. 480 * To program the period, the SOFT_ACK_MODE must be off.
481 */ 481 */
482 mmr_image &= ~((unsigned long)1 << 482 mmr_image &= ~((unsigned long)1 <<
483 UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT); 483 UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT);
484 uv_write_global_mmr64 484 uv_write_global_mmr64
485 (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image); 485 (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
486 /* 486 /*
487 * Set the 4-bit period. 487 * Set the 4-bit period.
488 */ 488 */
489 mmr_image &= ~((unsigned long)0xf << 489 mmr_image &= ~((unsigned long)0xf <<
490 UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT); 490 UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT);
491 mmr_image |= (UV_INTD_SOFT_ACK_TIMEOUT_PERIOD << 491 mmr_image |= (UV_INTD_SOFT_ACK_TIMEOUT_PERIOD <<
492 UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT); 492 UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT);
493 uv_write_global_mmr64 493 uv_write_global_mmr64
494 (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image); 494 (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
495 /* 495 /*
496 * Subsequent reversals of the timebase bit (3) cause an 496 * Subsequent reversals of the timebase bit (3) cause an
497 * immediate timeout of one or all INTD resources as 497 * immediate timeout of one or all INTD resources as
498 * indicated in bits 2:0 (7 causes all of them to timeout). 498 * indicated in bits 2:0 (7 causes all of them to timeout).
499 */ 499 */
500 mmr_image |= ((unsigned long)1 << 500 mmr_image |= ((unsigned long)1 <<
501 UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT); 501 UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT);
502 uv_write_global_mmr64 502 uv_write_global_mmr64
503 (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image); 503 (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
504 } 504 }
505 } 505 }
506 506
507 static void *uv_ptc_seq_start(struct seq_file *file, loff_t *offset) 507 static void *uv_ptc_seq_start(struct seq_file *file, loff_t *offset)
508 { 508 {
509 if (*offset < num_possible_cpus()) 509 if (*offset < num_possible_cpus())
510 return offset; 510 return offset;
511 return NULL; 511 return NULL;
512 } 512 }
513 513
514 static void *uv_ptc_seq_next(struct seq_file *file, void *data, loff_t *offset) 514 static void *uv_ptc_seq_next(struct seq_file *file, void *data, loff_t *offset)
515 { 515 {
516 (*offset)++; 516 (*offset)++;
517 if (*offset < num_possible_cpus()) 517 if (*offset < num_possible_cpus())
518 return offset; 518 return offset;
519 return NULL; 519 return NULL;
520 } 520 }
521 521
522 static void uv_ptc_seq_stop(struct seq_file *file, void *data) 522 static void uv_ptc_seq_stop(struct seq_file *file, void *data)
523 { 523 {
524 } 524 }
525 525
526 /* 526 /*
527 * Display the statistics thru /proc 527 * Display the statistics thru /proc
528 * data points to the cpu number 528 * data points to the cpu number
529 */ 529 */
530 static int uv_ptc_seq_show(struct seq_file *file, void *data) 530 static int uv_ptc_seq_show(struct seq_file *file, void *data)
531 { 531 {
532 struct ptc_stats *stat; 532 struct ptc_stats *stat;
533 int cpu; 533 int cpu;
534 534
535 cpu = *(loff_t *)data; 535 cpu = *(loff_t *)data;
536 536
537 if (!cpu) { 537 if (!cpu) {
538 seq_printf(file, 538 seq_printf(file,
539 "# cpu requestor requestee one all sretry dretry ptc_i "); 539 "# cpu requestor requestee one all sretry dretry ptc_i ");
540 seq_printf(file, 540 seq_printf(file,
541 "sw_ack sflush dflush sok dnomsg dmult starget\n"); 541 "sw_ack sflush dflush sok dnomsg dmult starget\n");
542 } 542 }
543 if (cpu < num_possible_cpus() && cpu_online(cpu)) { 543 if (cpu < num_possible_cpus() && cpu_online(cpu)) {
544 stat = &per_cpu(ptcstats, cpu); 544 stat = &per_cpu(ptcstats, cpu);
545 seq_printf(file, "cpu %d %ld %ld %ld %ld %ld %ld %ld ", 545 seq_printf(file, "cpu %d %ld %ld %ld %ld %ld %ld %ld ",
546 cpu, stat->requestor, 546 cpu, stat->requestor,
547 stat->requestee, stat->onetlb, stat->alltlb, 547 stat->requestee, stat->onetlb, stat->alltlb,
548 stat->s_retry, stat->d_retry, stat->ptc_i); 548 stat->s_retry, stat->d_retry, stat->ptc_i);
549 seq_printf(file, "%lx %ld %ld %ld %ld %ld %ld\n", 549 seq_printf(file, "%lx %ld %ld %ld %ld %ld %ld\n",
550 uv_read_global_mmr64(uv_cpu_to_pnode(cpu), 550 uv_read_global_mmr64(uv_cpu_to_pnode(cpu),
551 UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE), 551 UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE),
552 stat->sflush, stat->dflush, 552 stat->sflush, stat->dflush,
553 stat->retriesok, stat->nomsg, 553 stat->retriesok, stat->nomsg,
554 stat->multmsg, stat->ntargeted); 554 stat->multmsg, stat->ntargeted);
555 } 555 }
556 556
557 return 0; 557 return 0;
558 } 558 }
559 559
560 /* 560 /*
561 * 0: display meaning of the statistics 561 * 0: display meaning of the statistics
562 * >0: retry limit 562 * >0: retry limit
563 */ 563 */
564 static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user, 564 static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user,
565 size_t count, loff_t *data) 565 size_t count, loff_t *data)
566 { 566 {
567 long newmode; 567 long newmode;
568 char optstr[64]; 568 char optstr[64];
569 569
570 if (count == 0 || count > sizeof(optstr)) 570 if (count == 0 || count > sizeof(optstr))
571 return -EINVAL; 571 return -EINVAL;
572 if (copy_from_user(optstr, user, count)) 572 if (copy_from_user(optstr, user, count))
573 return -EFAULT; 573 return -EFAULT;
574 optstr[count - 1] = '\0'; 574 optstr[count - 1] = '\0';
575 if (strict_strtoul(optstr, 10, &newmode) < 0) { 575 if (strict_strtoul(optstr, 10, &newmode) < 0) {
576 printk(KERN_DEBUG "%s is invalid\n", optstr); 576 printk(KERN_DEBUG "%s is invalid\n", optstr);
577 return -EINVAL; 577 return -EINVAL;
578 } 578 }
579 579
580 if (newmode == 0) { 580 if (newmode == 0) {
581 printk(KERN_DEBUG "# cpu: cpu number\n"); 581 printk(KERN_DEBUG "# cpu: cpu number\n");
582 printk(KERN_DEBUG 582 printk(KERN_DEBUG
583 "requestor: times this cpu was the flush requestor\n"); 583 "requestor: times this cpu was the flush requestor\n");
584 printk(KERN_DEBUG 584 printk(KERN_DEBUG
585 "requestee: times this cpu was requested to flush its TLBs\n"); 585 "requestee: times this cpu was requested to flush its TLBs\n");
586 printk(KERN_DEBUG 586 printk(KERN_DEBUG
587 "one: times requested to flush a single address\n"); 587 "one: times requested to flush a single address\n");
588 printk(KERN_DEBUG 588 printk(KERN_DEBUG
589 "all: times requested to flush all TLB's\n"); 589 "all: times requested to flush all TLB's\n");
590 printk(KERN_DEBUG 590 printk(KERN_DEBUG
591 "sretry: number of retries of source-side timeouts\n"); 591 "sretry: number of retries of source-side timeouts\n");
592 printk(KERN_DEBUG 592 printk(KERN_DEBUG
593 "dretry: number of retries of destination-side timeouts\n"); 593 "dretry: number of retries of destination-side timeouts\n");
594 printk(KERN_DEBUG 594 printk(KERN_DEBUG
595 "ptc_i: times UV fell through to IPI-style flushes\n"); 595 "ptc_i: times UV fell through to IPI-style flushes\n");
596 printk(KERN_DEBUG 596 printk(KERN_DEBUG
597 "sw_ack: image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE\n"); 597 "sw_ack: image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE\n");
598 printk(KERN_DEBUG 598 printk(KERN_DEBUG
599 "sflush_us: cycles spent in uv_flush_tlb_others()\n"); 599 "sflush_us: cycles spent in uv_flush_tlb_others()\n");
600 printk(KERN_DEBUG 600 printk(KERN_DEBUG
601 "dflush_us: cycles spent in handling flush requests\n"); 601 "dflush_us: cycles spent in handling flush requests\n");
602 printk(KERN_DEBUG "sok: successes on retry\n"); 602 printk(KERN_DEBUG "sok: successes on retry\n");
603 printk(KERN_DEBUG "dnomsg: interrupts with no message\n"); 603 printk(KERN_DEBUG "dnomsg: interrupts with no message\n");
604 printk(KERN_DEBUG 604 printk(KERN_DEBUG
605 "dmult: interrupts with multiple messages\n"); 605 "dmult: interrupts with multiple messages\n");
606 printk(KERN_DEBUG "starget: nodes targeted\n"); 606 printk(KERN_DEBUG "starget: nodes targeted\n");
607 } else { 607 } else {
608 uv_bau_retry_limit = newmode; 608 uv_bau_retry_limit = newmode;
609 printk(KERN_DEBUG "timeout retry limit:%d\n", 609 printk(KERN_DEBUG "timeout retry limit:%d\n",
610 uv_bau_retry_limit); 610 uv_bau_retry_limit);
611 } 611 }
612 612
613 return count; 613 return count;
614 } 614 }
615 615
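In practice this write handler is a small debugging knob: writing the string "0" to the proc file registered below dumps the legend for the statistics columns into the kernel log, while writing a positive decimal number sets uv_bau_retry_limit to that value (for example, echoing 8 into the /proc file named by UV_PTC_BASENAME, whose definition lives in a header outside this hunk). Note that the handler rejects writes larger than its 64-byte buffer and NUL-terminates over the last byte written -- for a typical echo that is the trailing newline -- before parsing with strict_strtoul().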
616 static const struct seq_operations uv_ptc_seq_ops = { 616 static const struct seq_operations uv_ptc_seq_ops = {
617 .start = uv_ptc_seq_start, 617 .start = uv_ptc_seq_start,
618 .next = uv_ptc_seq_next, 618 .next = uv_ptc_seq_next,
619 .stop = uv_ptc_seq_stop, 619 .stop = uv_ptc_seq_stop,
620 .show = uv_ptc_seq_show 620 .show = uv_ptc_seq_show
621 }; 621 };
622 622
623 static int uv_ptc_proc_open(struct inode *inode, struct file *file) 623 static int uv_ptc_proc_open(struct inode *inode, struct file *file)
624 { 624 {
625 return seq_open(file, &uv_ptc_seq_ops); 625 return seq_open(file, &uv_ptc_seq_ops);
626 } 626 }
627 627
628 static const struct file_operations proc_uv_ptc_operations = { 628 static const struct file_operations proc_uv_ptc_operations = {
629 .open = uv_ptc_proc_open, 629 .open = uv_ptc_proc_open,
630 .read = seq_read, 630 .read = seq_read,
631 .write = uv_ptc_proc_write, 631 .write = uv_ptc_proc_write,
632 .llseek = seq_lseek, 632 .llseek = seq_lseek,
633 .release = seq_release, 633 .release = seq_release,
634 }; 634 };
635 635
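Taken together, uv_ptc_seq_ops and proc_uv_ptc_operations are the standard seq_file wiring: seq_open() in the ->open handler binds the four iterator callbacks to the file, seq_read()/seq_lseek()/seq_release() then handle the buffering, and the loff_t position itself doubles as the cpu index. A stripped-down sketch of the same iterator contract, independent of the UV specifics (item_count and the ex_* names are placeholders):

    static const loff_t item_count = 4;     /* placeholder record count */

    static void *ex_seq_start(struct seq_file *file, loff_t *pos)
    {
        return (*pos < item_count) ? pos : NULL;  /* position is the index */
    }

    static void *ex_seq_next(struct seq_file *file, void *data, loff_t *pos)
    {
        (*pos)++;
        return (*pos < item_count) ? pos : NULL;
    }

    static void ex_seq_stop(struct seq_file *file, void *data)
    {
        /* nothing to release */
    }

    static int ex_seq_show(struct seq_file *file, void *data)
    {
        seq_printf(file, "item %lld\n", (long long)*(loff_t *)data);
        return 0;
    }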
636 static int __init uv_ptc_init(void) 636 static int __init uv_ptc_init(void)
637 { 637 {
638 struct proc_dir_entry *proc_uv_ptc; 638 struct proc_dir_entry *proc_uv_ptc;
639 639
640 if (!is_uv_system()) 640 if (!is_uv_system())
641 return 0; 641 return 0;
642 642
643 proc_uv_ptc = create_proc_entry(UV_PTC_BASENAME, 0444, NULL); 643 proc_uv_ptc = create_proc_entry(UV_PTC_BASENAME, 0444, NULL);
644 if (!proc_uv_ptc) { 644 if (!proc_uv_ptc) {
645 printk(KERN_ERR "unable to create %s proc entry\n", 645 printk(KERN_ERR "unable to create %s proc entry\n",
646 UV_PTC_BASENAME); 646 UV_PTC_BASENAME);
647 return -EINVAL; 647 return -EINVAL;
648 } 648 }
649 proc_uv_ptc->proc_fops = &proc_uv_ptc_operations; 649 proc_uv_ptc->proc_fops = &proc_uv_ptc_operations;
650 return 0; 650 return 0;
651 } 651 }
652 652
653 /* 653 /*
654 * begin the initialization of the per-blade control structures 654 * begin the initialization of the per-blade control structures
655 */ 655 */
656 static struct bau_control * __init uv_table_bases_init(int blade, int node) 656 static struct bau_control * __init uv_table_bases_init(int blade, int node)
657 { 657 {
658 int i; 658 int i;
659 struct bau_msg_status *msp; 659 struct bau_msg_status *msp;
660 struct bau_control *bau_tabp; 660 struct bau_control *bau_tabp;
661 661
662 bau_tabp = 662 bau_tabp =
663 kmalloc_node(sizeof(struct bau_control), GFP_KERNEL, node); 663 kmalloc_node(sizeof(struct bau_control), GFP_KERNEL, node);
664 BUG_ON(!bau_tabp); 664 BUG_ON(!bau_tabp);
665 665
666 bau_tabp->msg_statuses = 666 bau_tabp->msg_statuses =
667 kmalloc_node(sizeof(struct bau_msg_status) * 667 kmalloc_node(sizeof(struct bau_msg_status) *
668 DEST_Q_SIZE, GFP_KERNEL, node); 668 DEST_Q_SIZE, GFP_KERNEL, node);
669 BUG_ON(!bau_tabp->msg_statuses); 669 BUG_ON(!bau_tabp->msg_statuses);
670 670
671 for (i = 0, msp = bau_tabp->msg_statuses; i < DEST_Q_SIZE; i++, msp++) 671 for (i = 0, msp = bau_tabp->msg_statuses; i < DEST_Q_SIZE; i++, msp++)
672 bau_cpubits_clear(&msp->seen_by, (int) 672 bau_cpubits_clear(&msp->seen_by, (int)
673 uv_blade_nr_possible_cpus(blade)); 673 uv_blade_nr_possible_cpus(blade));
674 674
675 uv_bau_table_bases[blade] = bau_tabp; 675 uv_bau_table_bases[blade] = bau_tabp;
676 676
677 return bau_tabp; 677 return bau_tabp;
678 } 678 }
679 679
680 /* 680 /*
681 * finish the initialization of the per-blade control structures 681 * finish the initialization of the per-blade control structures
682 */ 682 */
683 static void __init 683 static void __init
684 uv_table_bases_finish(int blade, 684 uv_table_bases_finish(int blade,
685 struct bau_control *bau_tablesp, 685 struct bau_control *bau_tablesp,
686 struct bau_desc *adp) 686 struct bau_desc *adp)
687 { 687 {
688 struct bau_control *bcp; 688 struct bau_control *bcp;
689 int cpu; 689 int cpu;
690 690
691 for_each_present_cpu(cpu) { 691 for_each_present_cpu(cpu) {
692 if (blade != uv_cpu_to_blade_id(cpu)) 692 if (blade != uv_cpu_to_blade_id(cpu))
693 continue; 693 continue;
694 694
695 bcp = (struct bau_control *)&per_cpu(bau_control, cpu); 695 bcp = (struct bau_control *)&per_cpu(bau_control, cpu);
696 bcp->bau_msg_head = bau_tablesp->va_queue_first; 696 bcp->bau_msg_head = bau_tablesp->va_queue_first;
697 bcp->va_queue_first = bau_tablesp->va_queue_first; 697 bcp->va_queue_first = bau_tablesp->va_queue_first;
698 bcp->va_queue_last = bau_tablesp->va_queue_last; 698 bcp->va_queue_last = bau_tablesp->va_queue_last;
699 bcp->msg_statuses = bau_tablesp->msg_statuses; 699 bcp->msg_statuses = bau_tablesp->msg_statuses;
700 bcp->descriptor_base = adp; 700 bcp->descriptor_base = adp;
701 } 701 }
702 } 702 }
703 703
704 /* 704 /*
705 * initialize the sending side's sending buffers 705 * initialize the sending side's sending buffers
706 */ 706 */
707 static struct bau_desc * __init 707 static struct bau_desc * __init
708 uv_activation_descriptor_init(int node, int pnode) 708 uv_activation_descriptor_init(int node, int pnode)
709 { 709 {
710 int i; 710 int i;
711 unsigned long pa; 711 unsigned long pa;
712 unsigned long m; 712 unsigned long m;
713 unsigned long n; 713 unsigned long n;
714 unsigned long mmr_image; 714 unsigned long mmr_image;
715 struct bau_desc *adp; 715 struct bau_desc *adp;
716 struct bau_desc *ad2; 716 struct bau_desc *ad2;
717 717
718 adp = (struct bau_desc *)kmalloc_node(16384, GFP_KERNEL, node); 718 adp = (struct bau_desc *)kmalloc_node(16384, GFP_KERNEL, node);
719 BUG_ON(!adp); 719 BUG_ON(!adp);
720 720
721 pa = uv_gpa(adp); /* need the real nasid*/ 721 pa = uv_gpa(adp); /* need the real nasid*/
722 n = pa >> uv_nshift; 722 n = pa >> uv_nshift;
723 m = pa & uv_mmask; 723 m = pa & uv_mmask;
724 724
725 mmr_image = uv_read_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE); 725 mmr_image = uv_read_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE);
726 if (mmr_image) { 726 if (mmr_image) {
727 uv_write_global_mmr64(pnode, (unsigned long) 727 uv_write_global_mmr64(pnode, (unsigned long)
728 UVH_LB_BAU_SB_DESCRIPTOR_BASE, 728 UVH_LB_BAU_SB_DESCRIPTOR_BASE,
729 (n << UV_DESC_BASE_PNODE_SHIFT | m)); 729 (n << UV_DESC_BASE_PNODE_SHIFT | m));
730 } 730 }
731 731
732 for (i = 0, ad2 = adp; i < UV_ACTIVATION_DESCRIPTOR_SIZE; i++, ad2++) { 732 for (i = 0, ad2 = adp; i < UV_ACTIVATION_DESCRIPTOR_SIZE; i++, ad2++) {
733 memset(ad2, 0, sizeof(struct bau_desc)); 733 memset(ad2, 0, sizeof(struct bau_desc));
734 ad2->header.sw_ack_flag = 1; 734 ad2->header.sw_ack_flag = 1;
735 /* 735 /*
736 * base_dest_nodeid is the first node in the partition, so 736 * base_dest_nodeid is the first node in the partition, so
737 * the bit map will indicate partition-relative node numbers. 737 * the bit map will indicate partition-relative node numbers.
738 * note that base_dest_nodeid is actually a nasid. 738 * note that base_dest_nodeid is actually a nasid.
739 */ 739 */
740 ad2->header.base_dest_nodeid = uv_partition_base_pnode << 1; 740 ad2->header.base_dest_nodeid = uv_partition_base_pnode << 1;
741 ad2->header.command = UV_NET_ENDPOINT_INTD; 741 ad2->header.command = UV_NET_ENDPOINT_INTD;
742 ad2->header.int_both = 1; 742 ad2->header.int_both = 1;
743 /* 743 /*
744 * all others need to be set to zero: 744 * all others need to be set to zero:
745 * fairness chaining multilevel count replied_to 745 * fairness chaining multilevel count replied_to
746 */ 746 */
747 } 747 }
748 return adp; 748 return adp;
749 } 749 }
750 750
751 /* 751 /*
752 * initialize the destination side's receiving buffers 752 * initialize the destination side's receiving buffers
753 */ 753 */
754 static struct bau_payload_queue_entry * __init 754 static struct bau_payload_queue_entry * __init
755 uv_payload_queue_init(int node, int pnode, struct bau_control *bau_tablesp) 755 uv_payload_queue_init(int node, int pnode, struct bau_control *bau_tablesp)
756 { 756 {
757 struct bau_payload_queue_entry *pqp; 757 struct bau_payload_queue_entry *pqp;
758 unsigned long pa; 758 unsigned long pa;
759 int pn; 759 int pn;
760 char *cp; 760 char *cp;
761 761
762 pqp = (struct bau_payload_queue_entry *) kmalloc_node( 762 pqp = (struct bau_payload_queue_entry *) kmalloc_node(
763 (DEST_Q_SIZE + 1) * sizeof(struct bau_payload_queue_entry), 763 (DEST_Q_SIZE + 1) * sizeof(struct bau_payload_queue_entry),
764 GFP_KERNEL, node); 764 GFP_KERNEL, node);
765 BUG_ON(!pqp); 765 BUG_ON(!pqp);
766 766
767 cp = (char *)pqp + 31; 767 cp = (char *)pqp + 31;
768 pqp = (struct bau_payload_queue_entry *)(((unsigned long)cp >> 5) << 5); 768 pqp = (struct bau_payload_queue_entry *)(((unsigned long)cp >> 5) << 5);
769 bau_tablesp->va_queue_first = pqp; 769 bau_tablesp->va_queue_first = pqp;
770 /* 770 /*
771 * need the pnode of where the memory was really allocated 771 * need the pnode of where the memory was really allocated
772 */ 772 */
773 pa = uv_gpa(pqp); 773 pa = uv_gpa(pqp);
774 pn = pa >> uv_nshift; 774 pn = pa >> uv_nshift;
775 uv_write_global_mmr64(pnode, 775 uv_write_global_mmr64(pnode,
776 UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST, 776 UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST,
777 ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) | 777 ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) |
778 uv_physnodeaddr(pqp)); 778 uv_physnodeaddr(pqp));
779 uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL, 779 uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL,
780 uv_physnodeaddr(pqp)); 780 uv_physnodeaddr(pqp));
781 bau_tablesp->va_queue_last = pqp + (DEST_Q_SIZE - 1); 781 bau_tablesp->va_queue_last = pqp + (DEST_Q_SIZE - 1);
782 uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST, 782 uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST,
783 (unsigned long) 783 (unsigned long)
784 uv_physnodeaddr(bau_tablesp->va_queue_last)); 784 uv_physnodeaddr(bau_tablesp->va_queue_last));
785 memset(pqp, 0, sizeof(struct bau_payload_queue_entry) * DEST_Q_SIZE); 785 memset(pqp, 0, sizeof(struct bau_payload_queue_entry) * DEST_Q_SIZE);
786 786
787 return pqp; 787 return pqp;
788 } 788 }
789 789
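The pointer arithmetic near the top of uv_payload_queue_init() is a hand-rolled align-up: the allocation asks for DEST_Q_SIZE + 1 entries of slack, 31 is added to the address, and shifting right then left by 5 clears the low five bits, yielding the first 32-byte-aligned entry inside the buffer. The same operation written the more conventional way (illustrative only):

    unsigned long addr = (unsigned long)pqp;

    addr = (addr + 31) & ~31UL;   /* round up to a multiple of 32 (1 << 5) */
    pqp = (struct bau_payload_queue_entry *)addr;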
790 /* 790 /*
791 * Initialization of each UV blade's structures 791 * Initialization of each UV blade's structures
792 */ 792 */
793 static int __init uv_init_blade(int blade) 793 static int __init uv_init_blade(int blade)
794 { 794 {
795 int node; 795 int node;
796 int pnode; 796 int pnode;
797 unsigned long pa; 797 unsigned long pa;
798 unsigned long apicid; 798 unsigned long apicid;
799 struct bau_desc *adp; 799 struct bau_desc *adp;
800 struct bau_payload_queue_entry *pqp; 800 struct bau_payload_queue_entry *pqp;
801 struct bau_control *bau_tablesp; 801 struct bau_control *bau_tablesp;
802 802
803 node = blade_to_first_node(blade); 803 node = blade_to_first_node(blade);
804 bau_tablesp = uv_table_bases_init(blade, node); 804 bau_tablesp = uv_table_bases_init(blade, node);
805 pnode = uv_blade_to_pnode(blade); 805 pnode = uv_blade_to_pnode(blade);
806 adp = uv_activation_descriptor_init(node, pnode); 806 adp = uv_activation_descriptor_init(node, pnode);
807 pqp = uv_payload_queue_init(node, pnode, bau_tablesp); 807 pqp = uv_payload_queue_init(node, pnode, bau_tablesp);
808 uv_table_bases_finish(blade, bau_tablesp, adp); 808 uv_table_bases_finish(blade, bau_tablesp, adp);
809 /* 809 /*
810 * the below initialization can't be in firmware because the 810 * the below initialization can't be in firmware because the
811 * messaging IRQ will be determined by the OS 811 * messaging IRQ will be determined by the OS
812 */ 812 */
813 apicid = blade_to_first_apicid(blade); 813 apicid = blade_to_first_apicid(blade);
814 pa = uv_read_global_mmr64(pnode, UVH_BAU_DATA_CONFIG); 814 pa = uv_read_global_mmr64(pnode, UVH_BAU_DATA_CONFIG);
815 if ((pa & 0xff) != UV_BAU_MESSAGE) { 815 if ((pa & 0xff) != UV_BAU_MESSAGE) {
816 uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG, 816 uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG,
817 ((apicid << 32) | UV_BAU_MESSAGE)); 817 ((apicid << 32) | UV_BAU_MESSAGE));
818 } 818 }
819 return 0; 819 return 0;
820 } 820 }
821 821
822 /* 822 /*
823 * Initialization of BAU-related structures 823 * Initialization of BAU-related structures
824 */ 824 */
825 static int __init uv_bau_init(void) 825 static int __init uv_bau_init(void)
826 { 826 {
827 int blade; 827 int blade;
828 int nblades; 828 int nblades;
829 int cur_cpu; 829 int cur_cpu;
830 830
831 if (!is_uv_system()) 831 if (!is_uv_system())
832 return 0; 832 return 0;
833 833
834 for_each_possible_cpu(cur_cpu) 834 for_each_possible_cpu(cur_cpu)
835 alloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu), 835 zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu),
836 GFP_KERNEL, cpu_to_node(cur_cpu)); 836 GFP_KERNEL, cpu_to_node(cur_cpu));
837 837
838 uv_bau_retry_limit = 1; 838 uv_bau_retry_limit = 1;
839 uv_nshift = uv_hub_info->n_val; 839 uv_nshift = uv_hub_info->n_val;
840 uv_mmask = (1UL << uv_hub_info->n_val) - 1; 840 uv_mmask = (1UL << uv_hub_info->n_val) - 1;
841 nblades = uv_num_possible_blades(); 841 nblades = uv_num_possible_blades();
842 842
843 uv_bau_table_bases = (struct bau_control **) 843 uv_bau_table_bases = (struct bau_control **)
844 kmalloc(nblades * sizeof(struct bau_control *), GFP_KERNEL); 844 kmalloc(nblades * sizeof(struct bau_control *), GFP_KERNEL);
845 BUG_ON(!uv_bau_table_bases); 845 BUG_ON(!uv_bau_table_bases);
846 846
847 uv_partition_base_pnode = 0x7fffffff; 847 uv_partition_base_pnode = 0x7fffffff;
848 for (blade = 0; blade < nblades; blade++) 848 for (blade = 0; blade < nblades; blade++)
849 if (uv_blade_nr_possible_cpus(blade) && 849 if (uv_blade_nr_possible_cpus(blade) &&
850 (uv_blade_to_pnode(blade) < uv_partition_base_pnode)) 850 (uv_blade_to_pnode(blade) < uv_partition_base_pnode))
851 uv_partition_base_pnode = uv_blade_to_pnode(blade); 851 uv_partition_base_pnode = uv_blade_to_pnode(blade);
852 for (blade = 0; blade < nblades; blade++) 852 for (blade = 0; blade < nblades; blade++)
853 if (uv_blade_nr_possible_cpus(blade)) 853 if (uv_blade_nr_possible_cpus(blade))
854 uv_init_blade(blade); 854 uv_init_blade(blade);
855 855
856 alloc_intr_gate(UV_BAU_MESSAGE, uv_bau_message_intr1); 856 alloc_intr_gate(UV_BAU_MESSAGE, uv_bau_message_intr1);
857 uv_enable_timeouts(); 857 uv_enable_timeouts();
858 858
859 return 0; 859 return 0;
860 } 860 }
861 __initcall(uv_bau_init); 861 __initcall(uv_bau_init);
862 __initcall(uv_ptc_init); 862 __initcall(uv_ptc_init);
863 863
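The one-line substitution in uv_bau_init() -- alloc_cpumask_var_node() becoming zalloc_cpumask_var_node() -- is the whole change to this file. uv_flush_tlb_mask is declared with DEFINE_PER_CPU(cpumask_var_t, ...): when cpumask_var_t is a fixed-size array, the per-cpu storage starts out zeroed and the allocation call is effectively a no-op, but when CONFIG_CPUMASK_OFFSTACK turns it into a pointer, the plain allocator returns uninitialized memory and the mask would begin life with stray bits set. A short sketch of the two call patterns, with a hypothetical example_mask standing in for the real variable (only one of the two alternatives would appear in real code):

    static DEFINE_PER_CPU(cpumask_var_t, example_mask);   /* hypothetical */

    static int __init example_init(void)
    {
        int cpu;

        for_each_possible_cpu(cpu) {
            /* alternative 1: plain allocation may hand back garbage
             * when the mask is an off-stack allocation, so it needs
             * an explicit clear afterwards */
            alloc_cpumask_var_node(&per_cpu(example_mask, cpu),
                                   GFP_KERNEL, cpu_to_node(cpu));
            cpumask_clear(per_cpu(example_mask, cpu));

            /* alternative 2: allocate and zero in one step, matching
             * the already-zeroed array case */
            zalloc_cpumask_var_node(&per_cpu(example_mask, cpu),
                                    GFP_KERNEL, cpu_to_node(cpu));
        }
        return 0;
    }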
drivers/acpi/processor_core.c
1 /* 1 /*
2 * acpi_processor.c - ACPI Processor Driver ($Revision: 71 $) 2 * acpi_processor.c - ACPI Processor Driver ($Revision: 71 $)
3 * 3 *
4 * Copyright (C) 2001, 2002 Andy Grover <andrew.grover@intel.com> 4 * Copyright (C) 2001, 2002 Andy Grover <andrew.grover@intel.com>
5 * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com> 5 * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
6 * Copyright (C) 2004 Dominik Brodowski <linux@brodo.de> 6 * Copyright (C) 2004 Dominik Brodowski <linux@brodo.de>
7 * Copyright (C) 2004 Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com> 7 * Copyright (C) 2004 Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
8 * - Added processor hotplug support 8 * - Added processor hotplug support
9 * 9 *
10 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 10 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
11 * 11 *
12 * This program is free software; you can redistribute it and/or modify 12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by 13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or (at 14 * the Free Software Foundation; either version 2 of the License, or (at
15 * your option) any later version. 15 * your option) any later version.
16 * 16 *
17 * This program is distributed in the hope that it will be useful, but 17 * This program is distributed in the hope that it will be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of 18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * General Public License for more details. 20 * General Public License for more details.
21 * 21 *
22 * You should have received a copy of the GNU General Public License along 22 * You should have received a copy of the GNU General Public License along
23 * with this program; if not, write to the Free Software Foundation, Inc., 23 * with this program; if not, write to the Free Software Foundation, Inc.,
24 * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. 24 * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
25 * 25 *
26 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 26 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
27 * TBD: 27 * TBD:
28 * 1. Make # power states dynamic. 28 * 1. Make # power states dynamic.
29 * 2. Support duty_cycle values that span bit 4. 29 * 2. Support duty_cycle values that span bit 4.
30 * 3. Optimize by having scheduler determine business instead of 30 * 3. Optimize by having scheduler determine business instead of
31 * having us try to calculate it here. 31 * having us try to calculate it here.
32 * 4. Need C1 timing -- must modify kernel (IRQ handler) to get this. 32 * 4. Need C1 timing -- must modify kernel (IRQ handler) to get this.
33 */ 33 */
34 34
35 #include <linux/kernel.h> 35 #include <linux/kernel.h>
36 #include <linux/module.h> 36 #include <linux/module.h>
37 #include <linux/init.h> 37 #include <linux/init.h>
38 #include <linux/types.h> 38 #include <linux/types.h>
39 #include <linux/pci.h> 39 #include <linux/pci.h>
40 #include <linux/pm.h> 40 #include <linux/pm.h>
41 #include <linux/cpufreq.h> 41 #include <linux/cpufreq.h>
42 #include <linux/cpu.h> 42 #include <linux/cpu.h>
43 #include <linux/proc_fs.h> 43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h> 44 #include <linux/seq_file.h>
45 #include <linux/dmi.h> 45 #include <linux/dmi.h>
46 #include <linux/moduleparam.h> 46 #include <linux/moduleparam.h>
47 #include <linux/cpuidle.h> 47 #include <linux/cpuidle.h>
48 48
49 #include <asm/io.h> 49 #include <asm/io.h>
50 #include <asm/system.h> 50 #include <asm/system.h>
51 #include <asm/cpu.h> 51 #include <asm/cpu.h>
52 #include <asm/delay.h> 52 #include <asm/delay.h>
53 #include <asm/uaccess.h> 53 #include <asm/uaccess.h>
54 #include <asm/processor.h> 54 #include <asm/processor.h>
55 #include <asm/smp.h> 55 #include <asm/smp.h>
56 #include <asm/acpi.h> 56 #include <asm/acpi.h>
57 57
58 #include <acpi/acpi_bus.h> 58 #include <acpi/acpi_bus.h>
59 #include <acpi/acpi_drivers.h> 59 #include <acpi/acpi_drivers.h>
60 #include <acpi/processor.h> 60 #include <acpi/processor.h>
61 61
62 #define ACPI_PROCESSOR_CLASS "processor" 62 #define ACPI_PROCESSOR_CLASS "processor"
63 #define ACPI_PROCESSOR_DEVICE_NAME "Processor" 63 #define ACPI_PROCESSOR_DEVICE_NAME "Processor"
64 #define ACPI_PROCESSOR_FILE_INFO "info" 64 #define ACPI_PROCESSOR_FILE_INFO "info"
65 #define ACPI_PROCESSOR_FILE_THROTTLING "throttling" 65 #define ACPI_PROCESSOR_FILE_THROTTLING "throttling"
66 #define ACPI_PROCESSOR_FILE_LIMIT "limit" 66 #define ACPI_PROCESSOR_FILE_LIMIT "limit"
67 #define ACPI_PROCESSOR_NOTIFY_PERFORMANCE 0x80 67 #define ACPI_PROCESSOR_NOTIFY_PERFORMANCE 0x80
68 #define ACPI_PROCESSOR_NOTIFY_POWER 0x81 68 #define ACPI_PROCESSOR_NOTIFY_POWER 0x81
69 #define ACPI_PROCESSOR_NOTIFY_THROTTLING 0x82 69 #define ACPI_PROCESSOR_NOTIFY_THROTTLING 0x82
70 70
71 #define ACPI_PROCESSOR_LIMIT_USER 0 71 #define ACPI_PROCESSOR_LIMIT_USER 0
72 #define ACPI_PROCESSOR_LIMIT_THERMAL 1 72 #define ACPI_PROCESSOR_LIMIT_THERMAL 1
73 73
74 #define _COMPONENT ACPI_PROCESSOR_COMPONENT 74 #define _COMPONENT ACPI_PROCESSOR_COMPONENT
75 ACPI_MODULE_NAME("processor_core"); 75 ACPI_MODULE_NAME("processor_core");
76 76
77 MODULE_AUTHOR("Paul Diefenbaugh"); 77 MODULE_AUTHOR("Paul Diefenbaugh");
78 MODULE_DESCRIPTION("ACPI Processor Driver"); 78 MODULE_DESCRIPTION("ACPI Processor Driver");
79 MODULE_LICENSE("GPL"); 79 MODULE_LICENSE("GPL");
80 80
81 static int acpi_processor_add(struct acpi_device *device); 81 static int acpi_processor_add(struct acpi_device *device);
82 static int acpi_processor_start(struct acpi_device *device); 82 static int acpi_processor_start(struct acpi_device *device);
83 static int acpi_processor_remove(struct acpi_device *device, int type); 83 static int acpi_processor_remove(struct acpi_device *device, int type);
84 static int acpi_processor_info_open_fs(struct inode *inode, struct file *file); 84 static int acpi_processor_info_open_fs(struct inode *inode, struct file *file);
85 static void acpi_processor_notify(struct acpi_device *device, u32 event); 85 static void acpi_processor_notify(struct acpi_device *device, u32 event);
86 static acpi_status acpi_processor_hotadd_init(acpi_handle handle, int *p_cpu); 86 static acpi_status acpi_processor_hotadd_init(acpi_handle handle, int *p_cpu);
87 static int acpi_processor_handle_eject(struct acpi_processor *pr); 87 static int acpi_processor_handle_eject(struct acpi_processor *pr);
88 88
89 89
90 static const struct acpi_device_id processor_device_ids[] = { 90 static const struct acpi_device_id processor_device_ids[] = {
91 {ACPI_PROCESSOR_OBJECT_HID, 0}, 91 {ACPI_PROCESSOR_OBJECT_HID, 0},
92 {ACPI_PROCESSOR_HID, 0}, 92 {ACPI_PROCESSOR_HID, 0},
93 {"", 0}, 93 {"", 0},
94 }; 94 };
95 MODULE_DEVICE_TABLE(acpi, processor_device_ids); 95 MODULE_DEVICE_TABLE(acpi, processor_device_ids);
96 96
97 static struct acpi_driver acpi_processor_driver = { 97 static struct acpi_driver acpi_processor_driver = {
98 .name = "processor", 98 .name = "processor",
99 .class = ACPI_PROCESSOR_CLASS, 99 .class = ACPI_PROCESSOR_CLASS,
100 .ids = processor_device_ids, 100 .ids = processor_device_ids,
101 .ops = { 101 .ops = {
102 .add = acpi_processor_add, 102 .add = acpi_processor_add,
103 .remove = acpi_processor_remove, 103 .remove = acpi_processor_remove,
104 .start = acpi_processor_start, 104 .start = acpi_processor_start,
105 .suspend = acpi_processor_suspend, 105 .suspend = acpi_processor_suspend,
106 .resume = acpi_processor_resume, 106 .resume = acpi_processor_resume,
107 .notify = acpi_processor_notify, 107 .notify = acpi_processor_notify,
108 }, 108 },
109 }; 109 };
110 110
111 #define INSTALL_NOTIFY_HANDLER 1 111 #define INSTALL_NOTIFY_HANDLER 1
112 #define UNINSTALL_NOTIFY_HANDLER 2 112 #define UNINSTALL_NOTIFY_HANDLER 2
113 113
114 static const struct file_operations acpi_processor_info_fops = { 114 static const struct file_operations acpi_processor_info_fops = {
115 .owner = THIS_MODULE, 115 .owner = THIS_MODULE,
116 .open = acpi_processor_info_open_fs, 116 .open = acpi_processor_info_open_fs,
117 .read = seq_read, 117 .read = seq_read,
118 .llseek = seq_lseek, 118 .llseek = seq_lseek,
119 .release = single_release, 119 .release = single_release,
120 }; 120 };
121 121
122 DEFINE_PER_CPU(struct acpi_processor *, processors); 122 DEFINE_PER_CPU(struct acpi_processor *, processors);
123 struct acpi_processor_errata errata __read_mostly; 123 struct acpi_processor_errata errata __read_mostly;
124 static int set_no_mwait(const struct dmi_system_id *id) 124 static int set_no_mwait(const struct dmi_system_id *id)
125 { 125 {
126 printk(KERN_NOTICE PREFIX "%s detected - " 126 printk(KERN_NOTICE PREFIX "%s detected - "
127 "disabling mwait for CPU C-states\n", id->ident); 127 "disabling mwait for CPU C-states\n", id->ident);
128 idle_nomwait = 1; 128 idle_nomwait = 1;
129 return 0; 129 return 0;
130 } 130 }
131 131
132 static struct dmi_system_id __cpuinitdata processor_idle_dmi_table[] = { 132 static struct dmi_system_id __cpuinitdata processor_idle_dmi_table[] = {
133 { 133 {
134 set_no_mwait, "IFL91 board", { 134 set_no_mwait, "IFL91 board", {
135 DMI_MATCH(DMI_BIOS_VENDOR, "COMPAL"), 135 DMI_MATCH(DMI_BIOS_VENDOR, "COMPAL"),
136 DMI_MATCH(DMI_SYS_VENDOR, "ZEPTO"), 136 DMI_MATCH(DMI_SYS_VENDOR, "ZEPTO"),
137 DMI_MATCH(DMI_PRODUCT_VERSION, "3215W"), 137 DMI_MATCH(DMI_PRODUCT_VERSION, "3215W"),
138 DMI_MATCH(DMI_BOARD_NAME, "IFL91") }, NULL}, 138 DMI_MATCH(DMI_BOARD_NAME, "IFL91") }, NULL},
139 { 139 {
140 set_no_mwait, "Extensa 5220", { 140 set_no_mwait, "Extensa 5220", {
141 DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), 141 DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
142 DMI_MATCH(DMI_SYS_VENDOR, "Acer"), 142 DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
143 DMI_MATCH(DMI_PRODUCT_VERSION, "0100"), 143 DMI_MATCH(DMI_PRODUCT_VERSION, "0100"),
144 DMI_MATCH(DMI_BOARD_NAME, "Columbia") }, NULL}, 144 DMI_MATCH(DMI_BOARD_NAME, "Columbia") }, NULL},
145 {}, 145 {},
146 }; 146 };
147 147
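The DMI table above is only a table; it takes effect when something runs it through the DMI core. That call is not visible in this hunk, but the conventional consumer is dmi_check_system(), which compares each entry's DMI_MATCH fields against the running machine's DMI strings and invokes the set_no_mwait() callback for every entry that matches -- presumably along these lines somewhere in the driver's init path:

    /* exact call site not shown in this hunk */
    dmi_check_system(processor_idle_dmi_table);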
148 /* -------------------------------------------------------------------------- 148 /* --------------------------------------------------------------------------
149 Errata Handling 149 Errata Handling
150 -------------------------------------------------------------------------- */ 150 -------------------------------------------------------------------------- */
151 151
152 static int acpi_processor_errata_piix4(struct pci_dev *dev) 152 static int acpi_processor_errata_piix4(struct pci_dev *dev)
153 { 153 {
154 u8 value1 = 0; 154 u8 value1 = 0;
155 u8 value2 = 0; 155 u8 value2 = 0;
156 156
157 157
158 if (!dev) 158 if (!dev)
159 return -EINVAL; 159 return -EINVAL;
160 160
161 /* 161 /*
162 * Note that 'dev' references the PIIX4 ACPI Controller. 162 * Note that 'dev' references the PIIX4 ACPI Controller.
163 */ 163 */
164 164
165 switch (dev->revision) { 165 switch (dev->revision) {
166 case 0: 166 case 0:
167 ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Found PIIX4 A-step\n")); 167 ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Found PIIX4 A-step\n"));
168 break; 168 break;
169 case 1: 169 case 1:
170 ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Found PIIX4 B-step\n")); 170 ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Found PIIX4 B-step\n"));
171 break; 171 break;
172 case 2: 172 case 2:
173 ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Found PIIX4E\n")); 173 ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Found PIIX4E\n"));
174 break; 174 break;
175 case 3: 175 case 3:
176 ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Found PIIX4M\n")); 176 ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Found PIIX4M\n"));
177 break; 177 break;
178 default: 178 default:
179 ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Found unknown PIIX4\n")); 179 ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Found unknown PIIX4\n"));
180 break; 180 break;
181 } 181 }
182 182
183 switch (dev->revision) { 183 switch (dev->revision) {
184 184
185 case 0: /* PIIX4 A-step */ 185 case 0: /* PIIX4 A-step */
186 case 1: /* PIIX4 B-step */ 186 case 1: /* PIIX4 B-step */
187 /* 187 /*
188 * See specification changes #13 ("Manual Throttle Duty Cycle") 188 * See specification changes #13 ("Manual Throttle Duty Cycle")
189 * and #14 ("Enabling and Disabling Manual Throttle"), plus 189 * and #14 ("Enabling and Disabling Manual Throttle"), plus
190 * erratum #5 ("STPCLK# Deassertion Time") from the January 190 * erratum #5 ("STPCLK# Deassertion Time") from the January
191 * 2002 PIIX4 specification update. Applies to only older 191 * 2002 PIIX4 specification update. Applies to only older
192 * PIIX4 models. 192 * PIIX4 models.
193 */ 193 */
194 errata.piix4.throttle = 1; 194 errata.piix4.throttle = 1;
195 195
196 case 2: /* PIIX4E */ 196 case 2: /* PIIX4E */
197 case 3: /* PIIX4M */ 197 case 3: /* PIIX4M */
198 /* 198 /*
199 * See erratum #18 ("C3 Power State/BMIDE and Type-F DMA 199 * See erratum #18 ("C3 Power State/BMIDE and Type-F DMA
200 * Livelock") from the January 2002 PIIX4 specification update. 200 * Livelock") from the January 2002 PIIX4 specification update.
201 * Applies to all PIIX4 models. 201 * Applies to all PIIX4 models.
202 */ 202 */
203 203
204 /* 204 /*
205 * BM-IDE 205 * BM-IDE
206 * ------ 206 * ------
207 * Find the PIIX4 IDE Controller and get the Bus Master IDE 207 * Find the PIIX4 IDE Controller and get the Bus Master IDE
208 * Status register address. We'll use this later to read 208 * Status register address. We'll use this later to read
209 * each IDE controller's DMA status to make sure we catch all 209 * each IDE controller's DMA status to make sure we catch all
210 * DMA activity. 210 * DMA activity.
211 */ 211 */
212 dev = pci_get_subsys(PCI_VENDOR_ID_INTEL, 212 dev = pci_get_subsys(PCI_VENDOR_ID_INTEL,
213 PCI_DEVICE_ID_INTEL_82371AB, 213 PCI_DEVICE_ID_INTEL_82371AB,
214 PCI_ANY_ID, PCI_ANY_ID, NULL); 214 PCI_ANY_ID, PCI_ANY_ID, NULL);
215 if (dev) { 215 if (dev) {
216 errata.piix4.bmisx = pci_resource_start(dev, 4); 216 errata.piix4.bmisx = pci_resource_start(dev, 4);
217 pci_dev_put(dev); 217 pci_dev_put(dev);
218 } 218 }
219 219
220 /* 220 /*
221 * Type-F DMA 221 * Type-F DMA
222 * ---------- 222 * ----------
223 * Find the PIIX4 ISA Controller and read the Motherboard 223 * Find the PIIX4 ISA Controller and read the Motherboard
224 * DMA controller's status to see if Type-F (Fast) DMA mode 224 * DMA controller's status to see if Type-F (Fast) DMA mode
225 * is enabled (bit 7) on either channel. Note that we'll 225 * is enabled (bit 7) on either channel. Note that we'll
226 * disable C3 support if this is enabled, as some legacy 226 * disable C3 support if this is enabled, as some legacy
227 * devices won't operate well if fast DMA is disabled. 227 * devices won't operate well if fast DMA is disabled.
228 */ 228 */
229 dev = pci_get_subsys(PCI_VENDOR_ID_INTEL, 229 dev = pci_get_subsys(PCI_VENDOR_ID_INTEL,
230 PCI_DEVICE_ID_INTEL_82371AB_0, 230 PCI_DEVICE_ID_INTEL_82371AB_0,
231 PCI_ANY_ID, PCI_ANY_ID, NULL); 231 PCI_ANY_ID, PCI_ANY_ID, NULL);
232 if (dev) { 232 if (dev) {
233 pci_read_config_byte(dev, 0x76, &value1); 233 pci_read_config_byte(dev, 0x76, &value1);
234 pci_read_config_byte(dev, 0x77, &value2); 234 pci_read_config_byte(dev, 0x77, &value2);
235 if ((value1 & 0x80) || (value2 & 0x80)) 235 if ((value1 & 0x80) || (value2 & 0x80))
236 errata.piix4.fdma = 1; 236 errata.piix4.fdma = 1;
237 pci_dev_put(dev); 237 pci_dev_put(dev);
238 } 238 }
239 239
240 break; 240 break;
241 } 241 }
242 242
243 if (errata.piix4.bmisx) 243 if (errata.piix4.bmisx)
244 ACPI_DEBUG_PRINT((ACPI_DB_INFO, 244 ACPI_DEBUG_PRINT((ACPI_DB_INFO,
245 "Bus master activity detection (BM-IDE) erratum enabled\n")); 245 "Bus master activity detection (BM-IDE) erratum enabled\n"));
246 if (errata.piix4.fdma) 246 if (errata.piix4.fdma)
247 ACPI_DEBUG_PRINT((ACPI_DB_INFO, 247 ACPI_DEBUG_PRINT((ACPI_DB_INFO,
248 "Type-F DMA livelock erratum (C3 disabled)\n")); 248 "Type-F DMA livelock erratum (C3 disabled)\n"));
249 249
250 return 0; 250 return 0;
251 } 251 }
252 252
253 static int acpi_processor_errata(struct acpi_processor *pr) 253 static int acpi_processor_errata(struct acpi_processor *pr)
254 { 254 {
255 int result = 0; 255 int result = 0;
256 struct pci_dev *dev = NULL; 256 struct pci_dev *dev = NULL;
257 257
258 258
259 if (!pr) 259 if (!pr)
260 return -EINVAL; 260 return -EINVAL;
261 261
262 /* 262 /*
263 * PIIX4 263 * PIIX4
264 */ 264 */
265 dev = pci_get_subsys(PCI_VENDOR_ID_INTEL, 265 dev = pci_get_subsys(PCI_VENDOR_ID_INTEL,
266 PCI_DEVICE_ID_INTEL_82371AB_3, PCI_ANY_ID, 266 PCI_DEVICE_ID_INTEL_82371AB_3, PCI_ANY_ID,
267 PCI_ANY_ID, NULL); 267 PCI_ANY_ID, NULL);
268 if (dev) { 268 if (dev) {
269 result = acpi_processor_errata_piix4(dev); 269 result = acpi_processor_errata_piix4(dev);
270 pci_dev_put(dev); 270 pci_dev_put(dev);
271 } 271 }
272 272
273 return result; 273 return result;
274 } 274 }
275 275
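A small note on reference counting in the two errata routines: pci_get_subsys() returns its match with an elevated reference count, which is why each successful lookup above is paired with pci_dev_put(). When more than one match has to be examined, the usual idiom is to feed the previous device back in as the final ('from') argument, which drops its reference automatically -- a hedged sketch of that loop (the vendor/device IDs here are placeholders):

    struct pci_dev *dev = NULL;

    /* each call releases the previous 'dev' and takes a reference on
     * the next match; the loop ends with dev == NULL, so nothing leaks */
    while ((dev = pci_get_subsys(PCI_VENDOR_ID_INTEL, PCI_ANY_ID,
                                 PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
        /* ... inspect dev ... */
    }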
276 /* -------------------------------------------------------------------------- 276 /* --------------------------------------------------------------------------
277 Common ACPI processor functions 277 Common ACPI processor functions
278 -------------------------------------------------------------------------- */ 278 -------------------------------------------------------------------------- */
279 279
280 /* 280 /*
281 * _PDC is required for a BIOS-OS handshake for most of the newer 281 * _PDC is required for a BIOS-OS handshake for most of the newer
282 * ACPI processor features. 282 * ACPI processor features.
283 */ 283 */
284 static int acpi_processor_set_pdc(struct acpi_processor *pr) 284 static int acpi_processor_set_pdc(struct acpi_processor *pr)
285 { 285 {
286 struct acpi_object_list *pdc_in = pr->pdc; 286 struct acpi_object_list *pdc_in = pr->pdc;
287 acpi_status status = AE_OK; 287 acpi_status status = AE_OK;
288 288
289 289
290 if (!pdc_in) 290 if (!pdc_in)
291 return status; 291 return status;
292 if (idle_nomwait) { 292 if (idle_nomwait) {
293 /* 293 /*
294 * If mwait is disabled for CPU C-states, the C2C3_FFH access 294 * If mwait is disabled for CPU C-states, the C2C3_FFH access
295 * mode will be disabled in the parameter of _PDC object. 295 * mode will be disabled in the parameter of _PDC object.
296 * Of course C1_FFH access mode will also be disabled. 296 * Of course C1_FFH access mode will also be disabled.
297 */ 297 */
298 union acpi_object *obj; 298 union acpi_object *obj;
299 u32 *buffer = NULL; 299 u32 *buffer = NULL;
300 300
301 obj = pdc_in->pointer; 301 obj = pdc_in->pointer;
302 buffer = (u32 *)(obj->buffer.pointer); 302 buffer = (u32 *)(obj->buffer.pointer);
303 buffer[2] &= ~(ACPI_PDC_C_C2C3_FFH | ACPI_PDC_C_C1_FFH); 303 buffer[2] &= ~(ACPI_PDC_C_C2C3_FFH | ACPI_PDC_C_C1_FFH);
304 304
305 } 305 }
306 status = acpi_evaluate_object(pr->handle, "_PDC", pdc_in, NULL); 306 status = acpi_evaluate_object(pr->handle, "_PDC", pdc_in, NULL);
307 307
308 if (ACPI_FAILURE(status)) 308 if (ACPI_FAILURE(status))
309 ACPI_DEBUG_PRINT((ACPI_DB_INFO, 309 ACPI_DEBUG_PRINT((ACPI_DB_INFO,
310 "Could not evaluate _PDC, using legacy perf. control...\n")); 310 "Could not evaluate _PDC, using legacy perf. control...\n"));
311 311
312 return status; 312 return status;
313 } 313 }
314 314
315 /* -------------------------------------------------------------------------- 315 /* --------------------------------------------------------------------------
316 FS Interface (/proc) 316 FS Interface (/proc)
317 -------------------------------------------------------------------------- */ 317 -------------------------------------------------------------------------- */
318 318
319 static struct proc_dir_entry *acpi_processor_dir = NULL; 319 static struct proc_dir_entry *acpi_processor_dir = NULL;
320 320
321 static int acpi_processor_info_seq_show(struct seq_file *seq, void *offset) 321 static int acpi_processor_info_seq_show(struct seq_file *seq, void *offset)
322 { 322 {
323 struct acpi_processor *pr = seq->private; 323 struct acpi_processor *pr = seq->private;
324 324
325 325
326 if (!pr) 326 if (!pr)
327 goto end; 327 goto end;
328 328
329 seq_printf(seq, "processor id: %d\n" 329 seq_printf(seq, "processor id: %d\n"
330 "acpi id: %d\n" 330 "acpi id: %d\n"
331 "bus mastering control: %s\n" 331 "bus mastering control: %s\n"
332 "power management: %s\n" 332 "power management: %s\n"
333 "throttling control: %s\n" 333 "throttling control: %s\n"
334 "limit interface: %s\n", 334 "limit interface: %s\n",
335 pr->id, 335 pr->id,
336 pr->acpi_id, 336 pr->acpi_id,
337 pr->flags.bm_control ? "yes" : "no", 337 pr->flags.bm_control ? "yes" : "no",
338 pr->flags.power ? "yes" : "no", 338 pr->flags.power ? "yes" : "no",
339 pr->flags.throttling ? "yes" : "no", 339 pr->flags.throttling ? "yes" : "no",
340 pr->flags.limit ? "yes" : "no"); 340 pr->flags.limit ? "yes" : "no");
341 341
342 end: 342 end:
343 return 0; 343 return 0;
344 } 344 }
345 345
346 static int acpi_processor_info_open_fs(struct inode *inode, struct file *file) 346 static int acpi_processor_info_open_fs(struct inode *inode, struct file *file)
347 { 347 {
348 return single_open(file, acpi_processor_info_seq_show, 348 return single_open(file, acpi_processor_info_seq_show,
349 PDE(inode)->data); 349 PDE(inode)->data);
350 } 350 }
351 351
352 static int acpi_processor_add_fs(struct acpi_device *device) 352 static int acpi_processor_add_fs(struct acpi_device *device)
353 { 353 {
354 struct proc_dir_entry *entry = NULL; 354 struct proc_dir_entry *entry = NULL;
355 355
356 356
357 if (!acpi_device_dir(device)) { 357 if (!acpi_device_dir(device)) {
358 acpi_device_dir(device) = proc_mkdir(acpi_device_bid(device), 358 acpi_device_dir(device) = proc_mkdir(acpi_device_bid(device),
359 acpi_processor_dir); 359 acpi_processor_dir);
360 if (!acpi_device_dir(device)) 360 if (!acpi_device_dir(device))
361 return -ENODEV; 361 return -ENODEV;
362 } 362 }
363 363
364 /* 'info' [R] */ 364 /* 'info' [R] */
365 entry = proc_create_data(ACPI_PROCESSOR_FILE_INFO, 365 entry = proc_create_data(ACPI_PROCESSOR_FILE_INFO,
366 S_IRUGO, acpi_device_dir(device), 366 S_IRUGO, acpi_device_dir(device),
367 &acpi_processor_info_fops, 367 &acpi_processor_info_fops,
368 acpi_driver_data(device)); 368 acpi_driver_data(device));
369 if (!entry) 369 if (!entry)
370 return -EIO; 370 return -EIO;
371 371
372 /* 'throttling' [R/W] */ 372 /* 'throttling' [R/W] */
373 entry = proc_create_data(ACPI_PROCESSOR_FILE_THROTTLING, 373 entry = proc_create_data(ACPI_PROCESSOR_FILE_THROTTLING,
374 S_IFREG | S_IRUGO | S_IWUSR, 374 S_IFREG | S_IRUGO | S_IWUSR,
375 acpi_device_dir(device), 375 acpi_device_dir(device),
376 &acpi_processor_throttling_fops, 376 &acpi_processor_throttling_fops,
377 acpi_driver_data(device)); 377 acpi_driver_data(device));
378 if (!entry) 378 if (!entry)
379 return -EIO; 379 return -EIO;
380 380
381 /* 'limit' [R/W] */ 381 /* 'limit' [R/W] */
382 entry = proc_create_data(ACPI_PROCESSOR_FILE_LIMIT, 382 entry = proc_create_data(ACPI_PROCESSOR_FILE_LIMIT,
383 S_IFREG | S_IRUGO | S_IWUSR, 383 S_IFREG | S_IRUGO | S_IWUSR,
384 acpi_device_dir(device), 384 acpi_device_dir(device),
385 &acpi_processor_limit_fops, 385 &acpi_processor_limit_fops,
386 acpi_driver_data(device)); 386 acpi_driver_data(device));
387 if (!entry) 387 if (!entry)
388 return -EIO; 388 return -EIO;
389 return 0; 389 return 0;
390 } 390 }
391 391
392 static int acpi_processor_remove_fs(struct acpi_device *device) 392 static int acpi_processor_remove_fs(struct acpi_device *device)
393 { 393 {
394 394
395 if (acpi_device_dir(device)) { 395 if (acpi_device_dir(device)) {
396 remove_proc_entry(ACPI_PROCESSOR_FILE_INFO, 396 remove_proc_entry(ACPI_PROCESSOR_FILE_INFO,
397 acpi_device_dir(device)); 397 acpi_device_dir(device));
398 remove_proc_entry(ACPI_PROCESSOR_FILE_THROTTLING, 398 remove_proc_entry(ACPI_PROCESSOR_FILE_THROTTLING,
399 acpi_device_dir(device)); 399 acpi_device_dir(device));
400 remove_proc_entry(ACPI_PROCESSOR_FILE_LIMIT, 400 remove_proc_entry(ACPI_PROCESSOR_FILE_LIMIT,
401 acpi_device_dir(device)); 401 acpi_device_dir(device));
402 remove_proc_entry(acpi_device_bid(device), acpi_processor_dir); 402 remove_proc_entry(acpi_device_bid(device), acpi_processor_dir);
403 acpi_device_dir(device) = NULL; 403 acpi_device_dir(device) = NULL;
404 } 404 }
405 405
406 return 0; 406 return 0;
407 } 407 }
408 408
409 /* Use the acpiid in MADT to map cpus in case of SMP */ 409 /* Use the acpiid in MADT to map cpus in case of SMP */
410 410
411 #ifndef CONFIG_SMP 411 #ifndef CONFIG_SMP
412 static int get_cpu_id(acpi_handle handle, int type, u32 acpi_id) { return -1; } 412 static int get_cpu_id(acpi_handle handle, int type, u32 acpi_id) { return -1; }
413 #else 413 #else
414 414
415 static struct acpi_table_madt *madt; 415 static struct acpi_table_madt *madt;
416 416
417 static int map_lapic_id(struct acpi_subtable_header *entry, 417 static int map_lapic_id(struct acpi_subtable_header *entry,
418 u32 acpi_id, int *apic_id) 418 u32 acpi_id, int *apic_id)
419 { 419 {
420 struct acpi_madt_local_apic *lapic = 420 struct acpi_madt_local_apic *lapic =
421 (struct acpi_madt_local_apic *)entry; 421 (struct acpi_madt_local_apic *)entry;
422 if ((lapic->lapic_flags & ACPI_MADT_ENABLED) && 422 if ((lapic->lapic_flags & ACPI_MADT_ENABLED) &&
423 lapic->processor_id == acpi_id) { 423 lapic->processor_id == acpi_id) {
424 *apic_id = lapic->id; 424 *apic_id = lapic->id;
425 return 1; 425 return 1;
426 } 426 }
427 return 0; 427 return 0;
428 } 428 }
429 429
430 static int map_x2apic_id(struct acpi_subtable_header *entry, 430 static int map_x2apic_id(struct acpi_subtable_header *entry,
431 int device_declaration, u32 acpi_id, int *apic_id) 431 int device_declaration, u32 acpi_id, int *apic_id)
432 { 432 {
433 struct acpi_madt_local_x2apic *apic = 433 struct acpi_madt_local_x2apic *apic =
434 (struct acpi_madt_local_x2apic *)entry; 434 (struct acpi_madt_local_x2apic *)entry;
435 u32 tmp = apic->local_apic_id; 435 u32 tmp = apic->local_apic_id;
436 436
437 /* Only check enabled APICs */ 437 /* Only check enabled APICs */
438 if (!(apic->lapic_flags & ACPI_MADT_ENABLED)) 438 if (!(apic->lapic_flags & ACPI_MADT_ENABLED))
439 return 0; 439 return 0;
440 440
441 /* Device statement declaration type */ 441 /* Device statement declaration type */
442 if (device_declaration) { 442 if (device_declaration) {
443 if (apic->uid == acpi_id) 443 if (apic->uid == acpi_id)
444 goto found; 444 goto found;
445 } 445 }
446 446
447 return 0; 447 return 0;
448 found: 448 found:
449 *apic_id = tmp; 449 *apic_id = tmp;
450 return 1; 450 return 1;
451 } 451 }
452 452
453 static int map_lsapic_id(struct acpi_subtable_header *entry, 453 static int map_lsapic_id(struct acpi_subtable_header *entry,
454 int device_declaration, u32 acpi_id, int *apic_id) 454 int device_declaration, u32 acpi_id, int *apic_id)
455 { 455 {
456 struct acpi_madt_local_sapic *lsapic = 456 struct acpi_madt_local_sapic *lsapic =
457 (struct acpi_madt_local_sapic *)entry; 457 (struct acpi_madt_local_sapic *)entry;
458 u32 tmp = (lsapic->id << 8) | lsapic->eid; 458 u32 tmp = (lsapic->id << 8) | lsapic->eid;
459 459
460 /* Only check enabled APICs */ 460 /* Only check enabled APICs */
461 if (!(lsapic->lapic_flags & ACPI_MADT_ENABLED)) 461 if (!(lsapic->lapic_flags & ACPI_MADT_ENABLED))
462 return 0; 462 return 0;
463 463
464 /* Device statement declaration type */ 464 /* Device statement declaration type */
465 if (device_declaration) { 465 if (device_declaration) {
466 if (entry->length < 16) 466 if (entry->length < 16)
467 printk(KERN_ERR PREFIX 467 printk(KERN_ERR PREFIX
468 "Invalid LSAPIC with Device type processor (SAPIC ID %#x)\n", 468 "Invalid LSAPIC with Device type processor (SAPIC ID %#x)\n",
469 tmp); 469 tmp);
470 else if (lsapic->uid == acpi_id) 470 else if (lsapic->uid == acpi_id)
471 goto found; 471 goto found;
472 /* Processor statement declaration type */ 472 /* Processor statement declaration type */
473 } else if (lsapic->processor_id == acpi_id) 473 } else if (lsapic->processor_id == acpi_id)
474 goto found; 474 goto found;
475 475
476 return 0; 476 return 0;
477 found: 477 found:
478 *apic_id = tmp; 478 *apic_id = tmp;
479 return 1; 479 return 1;
480 } 480 }
481 481
482 static int map_madt_entry(int type, u32 acpi_id) 482 static int map_madt_entry(int type, u32 acpi_id)
483 { 483 {
484 unsigned long madt_end, entry; 484 unsigned long madt_end, entry;
485 int apic_id = -1; 485 int apic_id = -1;
486 486
487 if (!madt) 487 if (!madt)
488 return apic_id; 488 return apic_id;
489 489
490 entry = (unsigned long)madt; 490 entry = (unsigned long)madt;
491 madt_end = entry + madt->header.length; 491 madt_end = entry + madt->header.length;
492 492
493 /* Parse all entries looking for a match. */ 493 /* Parse all entries looking for a match. */
494 494
495 entry += sizeof(struct acpi_table_madt); 495 entry += sizeof(struct acpi_table_madt);
496 while (entry + sizeof(struct acpi_subtable_header) < madt_end) { 496 while (entry + sizeof(struct acpi_subtable_header) < madt_end) {
497 struct acpi_subtable_header *header = 497 struct acpi_subtable_header *header =
498 (struct acpi_subtable_header *)entry; 498 (struct acpi_subtable_header *)entry;
499 if (header->type == ACPI_MADT_TYPE_LOCAL_APIC) { 499 if (header->type == ACPI_MADT_TYPE_LOCAL_APIC) {
500 if (map_lapic_id(header, acpi_id, &apic_id)) 500 if (map_lapic_id(header, acpi_id, &apic_id))
501 break; 501 break;
502 } else if (header->type == ACPI_MADT_TYPE_LOCAL_X2APIC) { 502 } else if (header->type == ACPI_MADT_TYPE_LOCAL_X2APIC) {
503 if (map_x2apic_id(header, type, acpi_id, &apic_id)) 503 if (map_x2apic_id(header, type, acpi_id, &apic_id))
504 break; 504 break;
505 } else if (header->type == ACPI_MADT_TYPE_LOCAL_SAPIC) { 505 } else if (header->type == ACPI_MADT_TYPE_LOCAL_SAPIC) {
506 if (map_lsapic_id(header, type, acpi_id, &apic_id)) 506 if (map_lsapic_id(header, type, acpi_id, &apic_id))
507 break; 507 break;
508 } 508 }
509 entry += header->length; 509 entry += header->length;
510 } 510 }
511 return apic_id; 511 return apic_id;
512 } 512 }
513 513
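map_madt_entry() above walks the MADT as a flat sequence of variable-length subtables: skip the fixed table header, read each subtable's type and length, dispatch on the type, and advance by the subtable's own length until the end of the table. A minimal standalone sketch of that walking pattern, using simplified stand-in structures rather than the real ACPICA types (walk_subtables() and the fake buffer are illustrative only):

	#include <stdio.h>
	#include <stdint.h>

	/* Simplified stand-in for struct acpi_subtable_header. */
	struct fake_subtable {
		uint8_t type;
		uint8_t length;
	};

	/* Walk the byte range after the fixed header, one subtable at a time. */
	static void walk_subtables(const uint8_t *table, uint32_t total_length,
				   uint32_t header_size)
	{
		uint32_t offset = header_size;

		while (offset + sizeof(struct fake_subtable) < total_length) {
			const struct fake_subtable *sub =
				(const struct fake_subtable *)(table + offset);

			printf("subtable type %u, length %u\n",
			       (unsigned)sub->type, (unsigned)sub->length);

			if (sub->length == 0)	/* guard against a malformed table */
				break;
			offset += sub->length;	/* advance by the subtable's length */
		}
	}

	int main(void)
	{
		/* Fake table: 4-byte "header", then subtables (0,8) and (9,4). */
		uint8_t buf[16] = { [4] = 0, [5] = 8, [12] = 9, [13] = 4 };

		walk_subtables(buf, sizeof(buf), 4);
		return 0;
	}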
514 static int map_mat_entry(acpi_handle handle, int type, u32 acpi_id) 514 static int map_mat_entry(acpi_handle handle, int type, u32 acpi_id)
515 { 515 {
516 struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; 516 struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
517 union acpi_object *obj; 517 union acpi_object *obj;
518 struct acpi_subtable_header *header; 518 struct acpi_subtable_header *header;
519 int apic_id = -1; 519 int apic_id = -1;
520 520
521 if (ACPI_FAILURE(acpi_evaluate_object(handle, "_MAT", NULL, &buffer))) 521 if (ACPI_FAILURE(acpi_evaluate_object(handle, "_MAT", NULL, &buffer)))
522 goto exit; 522 goto exit;
523 523
524 if (!buffer.length || !buffer.pointer) 524 if (!buffer.length || !buffer.pointer)
525 goto exit; 525 goto exit;
526 526
527 obj = buffer.pointer; 527 obj = buffer.pointer;
528 if (obj->type != ACPI_TYPE_BUFFER || 528 if (obj->type != ACPI_TYPE_BUFFER ||
529 obj->buffer.length < sizeof(struct acpi_subtable_header)) { 529 obj->buffer.length < sizeof(struct acpi_subtable_header)) {
530 goto exit; 530 goto exit;
531 } 531 }
532 532
533 header = (struct acpi_subtable_header *)obj->buffer.pointer; 533 header = (struct acpi_subtable_header *)obj->buffer.pointer;
534 if (header->type == ACPI_MADT_TYPE_LOCAL_APIC) { 534 if (header->type == ACPI_MADT_TYPE_LOCAL_APIC) {
535 map_lapic_id(header, acpi_id, &apic_id); 535 map_lapic_id(header, acpi_id, &apic_id);
536 } else if (header->type == ACPI_MADT_TYPE_LOCAL_SAPIC) { 536 } else if (header->type == ACPI_MADT_TYPE_LOCAL_SAPIC) {
537 map_lsapic_id(header, type, acpi_id, &apic_id); 537 map_lsapic_id(header, type, acpi_id, &apic_id);
538 } 538 }
539 539
540 exit: 540 exit:
541 if (buffer.pointer) 541 if (buffer.pointer)
542 kfree(buffer.pointer); 542 kfree(buffer.pointer);
543 return apic_id; 543 return apic_id;
544 } 544 }
545 545
546 static int get_cpu_id(acpi_handle handle, int type, u32 acpi_id) 546 static int get_cpu_id(acpi_handle handle, int type, u32 acpi_id)
547 { 547 {
548 int i; 548 int i;
549 int apic_id = -1; 549 int apic_id = -1;
550 550
551 apic_id = map_mat_entry(handle, type, acpi_id); 551 apic_id = map_mat_entry(handle, type, acpi_id);
552 if (apic_id == -1) 552 if (apic_id == -1)
553 apic_id = map_madt_entry(type, acpi_id); 553 apic_id = map_madt_entry(type, acpi_id);
554 if (apic_id == -1) 554 if (apic_id == -1)
555 return apic_id; 555 return apic_id;
556 556
557 for_each_possible_cpu(i) { 557 for_each_possible_cpu(i) {
558 if (cpu_physical_id(i) == apic_id) 558 if (cpu_physical_id(i) == apic_id)
559 return i; 559 return i;
560 } 560 }
561 return -1; 561 return -1;
562 } 562 }
563 #endif 563 #endif
564 564
565 /* -------------------------------------------------------------------------- 565 /* --------------------------------------------------------------------------
566 Driver Interface 566 Driver Interface
567 -------------------------------------------------------------------------- */ 567 -------------------------------------------------------------------------- */
568 568
569 static int acpi_processor_get_info(struct acpi_device *device) 569 static int acpi_processor_get_info(struct acpi_device *device)
570 { 570 {
571 acpi_status status = 0; 571 acpi_status status = 0;
572 union acpi_object object = { 0 }; 572 union acpi_object object = { 0 };
573 struct acpi_buffer buffer = { sizeof(union acpi_object), &object }; 573 struct acpi_buffer buffer = { sizeof(union acpi_object), &object };
574 struct acpi_processor *pr; 574 struct acpi_processor *pr;
575 int cpu_index, device_declaration = 0; 575 int cpu_index, device_declaration = 0;
576 static int cpu0_initialized; 576 static int cpu0_initialized;
577 577
578 pr = acpi_driver_data(device); 578 pr = acpi_driver_data(device);
579 if (!pr) 579 if (!pr)
580 return -EINVAL; 580 return -EINVAL;
581 581
582 if (num_online_cpus() > 1) 582 if (num_online_cpus() > 1)
583 errata.smp = TRUE; 583 errata.smp = TRUE;
584 584
585 acpi_processor_errata(pr); 585 acpi_processor_errata(pr);
586 586
587 /* 587 /*
588 * Check to see if we have bus mastering arbitration control. This 588 * Check to see if we have bus mastering arbitration control. This
589 * is required for proper C3 usage (to maintain cache coherency). 589 * is required for proper C3 usage (to maintain cache coherency).
590 */ 590 */
591 if (acpi_gbl_FADT.pm2_control_block && acpi_gbl_FADT.pm2_control_length) { 591 if (acpi_gbl_FADT.pm2_control_block && acpi_gbl_FADT.pm2_control_length) {
592 pr->flags.bm_control = 1; 592 pr->flags.bm_control = 1;
593 ACPI_DEBUG_PRINT((ACPI_DB_INFO, 593 ACPI_DEBUG_PRINT((ACPI_DB_INFO,
594 "Bus mastering arbitration control present\n")); 594 "Bus mastering arbitration control present\n"));
595 } else 595 } else
596 ACPI_DEBUG_PRINT((ACPI_DB_INFO, 596 ACPI_DEBUG_PRINT((ACPI_DB_INFO,
597 "No bus mastering arbitration control\n")); 597 "No bus mastering arbitration control\n"));
598 598
599 if (!strcmp(acpi_device_hid(device), ACPI_PROCESSOR_HID)) { 599 if (!strcmp(acpi_device_hid(device), ACPI_PROCESSOR_HID)) {
600 /* 600 /*
601 * Declared with "Device" statement; match _UID. 601 * Declared with "Device" statement; match _UID.
602 * Note that we don't handle string _UIDs yet. 602 * Note that we don't handle string _UIDs yet.
603 */ 603 */
604 unsigned long long value; 604 unsigned long long value;
605 status = acpi_evaluate_integer(pr->handle, METHOD_NAME__UID, 605 status = acpi_evaluate_integer(pr->handle, METHOD_NAME__UID,
606 NULL, &value); 606 NULL, &value);
607 if (ACPI_FAILURE(status)) { 607 if (ACPI_FAILURE(status)) {
608 printk(KERN_ERR PREFIX 608 printk(KERN_ERR PREFIX
609 "Evaluating processor _UID [%#x]\n", status); 609 "Evaluating processor _UID [%#x]\n", status);
610 return -ENODEV; 610 return -ENODEV;
611 } 611 }
612 device_declaration = 1; 612 device_declaration = 1;
613 pr->acpi_id = value; 613 pr->acpi_id = value;
614 } else { 614 } else {
615 /* Declared with "Processor" statement; match ProcessorID */ 615 /* Declared with "Processor" statement; match ProcessorID */
616 status = acpi_evaluate_object(pr->handle, NULL, NULL, &buffer); 616 status = acpi_evaluate_object(pr->handle, NULL, NULL, &buffer);
617 if (ACPI_FAILURE(status)) { 617 if (ACPI_FAILURE(status)) {
618 printk(KERN_ERR PREFIX "Evaluating processor object\n"); 618 printk(KERN_ERR PREFIX "Evaluating processor object\n");
619 return -ENODEV; 619 return -ENODEV;
620 } 620 }
621 621
622 /* 622 /*
623 * TBD: Synch processor ID (via LAPIC/LSAPIC structures) on SMP. 623 * TBD: Synch processor ID (via LAPIC/LSAPIC structures) on SMP.
624 * >>> 'acpi_get_processor_id(acpi_id, &id)' in 624 * >>> 'acpi_get_processor_id(acpi_id, &id)' in
625 * arch/xxx/acpi.c 625 * arch/xxx/acpi.c
626 */ 626 */
627 pr->acpi_id = object.processor.proc_id; 627 pr->acpi_id = object.processor.proc_id;
628 } 628 }
629 cpu_index = get_cpu_id(pr->handle, device_declaration, pr->acpi_id); 629 cpu_index = get_cpu_id(pr->handle, device_declaration, pr->acpi_id);
630 630
631 /* Handle UP system running SMP kernel, with no LAPIC in MADT */ 631 /* Handle UP system running SMP kernel, with no LAPIC in MADT */
632 if (!cpu0_initialized && (cpu_index == -1) && 632 if (!cpu0_initialized && (cpu_index == -1) &&
633 (num_online_cpus() == 1)) { 633 (num_online_cpus() == 1)) {
634 cpu_index = 0; 634 cpu_index = 0;
635 } 635 }
636 636
637 cpu0_initialized = 1; 637 cpu0_initialized = 1;
638 638
639 pr->id = cpu_index; 639 pr->id = cpu_index;
640 640
641 /* 641 /*
642 * Extra Processor objects may be enumerated on MP systems with 642 * Extra Processor objects may be enumerated on MP systems with
643 * less than the max # of CPUs. They should be ignored _iff 643 * less than the max # of CPUs. They should be ignored _iff
644 * they are physically not present. 644 * they are physically not present.
645 */ 645 */
646 if (pr->id == -1) { 646 if (pr->id == -1) {
647 if (ACPI_FAILURE 647 if (ACPI_FAILURE
648 (acpi_processor_hotadd_init(pr->handle, &pr->id))) { 648 (acpi_processor_hotadd_init(pr->handle, &pr->id))) {
649 return -ENODEV; 649 return -ENODEV;
650 } 650 }
651 } 651 }
652 652
653 ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Processor [%d:%d]\n", pr->id, 653 ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Processor [%d:%d]\n", pr->id,
654 pr->acpi_id)); 654 pr->acpi_id));
655 655
656 if (!object.processor.pblk_address) 656 if (!object.processor.pblk_address)
657 ACPI_DEBUG_PRINT((ACPI_DB_INFO, "No PBLK (NULL address)\n")); 657 ACPI_DEBUG_PRINT((ACPI_DB_INFO, "No PBLK (NULL address)\n"));
658 else if (object.processor.pblk_length != 6) 658 else if (object.processor.pblk_length != 6)
659 printk(KERN_ERR PREFIX "Invalid PBLK length [%d]\n", 659 printk(KERN_ERR PREFIX "Invalid PBLK length [%d]\n",
660 object.processor.pblk_length); 660 object.processor.pblk_length);
661 else { 661 else {
662 pr->throttling.address = object.processor.pblk_address; 662 pr->throttling.address = object.processor.pblk_address;
663 pr->throttling.duty_offset = acpi_gbl_FADT.duty_offset; 663 pr->throttling.duty_offset = acpi_gbl_FADT.duty_offset;
664 pr->throttling.duty_width = acpi_gbl_FADT.duty_width; 664 pr->throttling.duty_width = acpi_gbl_FADT.duty_width;
665 665
666 pr->pblk = object.processor.pblk_address; 666 pr->pblk = object.processor.pblk_address;
667 667
668 /* 668 /*
669 * We don't care about error returns - we just try to mark 669 * We don't care about error returns - we just try to mark
670 * these reserved so that nobody else is confused into thinking 670 * these reserved so that nobody else is confused into thinking
671 * that this region might be unused. 671 * that this region might be unused.
672 * 672 *
673 * (In particular, allocating the IO range for Cardbus) 673 * (In particular, allocating the IO range for Cardbus)
674 */ 674 */
675 request_region(pr->throttling.address, 6, "ACPI CPU throttle"); 675 request_region(pr->throttling.address, 6, "ACPI CPU throttle");
676 } 676 }
677 677
678 /* 678 /*
679 * If ACPI describes a slot number for this CPU, we can use it 679 * If ACPI describes a slot number for this CPU, we can use it
680 * to ensure we get the right value in the "physical id" field 680 * to ensure we get the right value in the "physical id" field
681 * of /proc/cpuinfo 681 * of /proc/cpuinfo
682 */ 682 */
683 status = acpi_evaluate_object(pr->handle, "_SUN", NULL, &buffer); 683 status = acpi_evaluate_object(pr->handle, "_SUN", NULL, &buffer);
684 if (ACPI_SUCCESS(status)) 684 if (ACPI_SUCCESS(status))
685 arch_fix_phys_package_id(pr->id, object.integer.value); 685 arch_fix_phys_package_id(pr->id, object.integer.value);
686 686
687 return 0; 687 return 0;
688 } 688 }
689 689
690 static DEFINE_PER_CPU(void *, processor_device_array); 690 static DEFINE_PER_CPU(void *, processor_device_array);
691 691
692 static int __cpuinit acpi_processor_start(struct acpi_device *device) 692 static int __cpuinit acpi_processor_start(struct acpi_device *device)
693 { 693 {
694 int result = 0; 694 int result = 0;
695 struct acpi_processor *pr; 695 struct acpi_processor *pr;
696 struct sys_device *sysdev; 696 struct sys_device *sysdev;
697 697
698 pr = acpi_driver_data(device); 698 pr = acpi_driver_data(device);
699 699
700 result = acpi_processor_get_info(device); 700 result = acpi_processor_get_info(device);
701 if (result) { 701 if (result) {
702 /* Processor is physically not present */ 702 /* Processor is physically not present */
703 return 0; 703 return 0;
704 } 704 }
705 705
706 BUG_ON((pr->id >= nr_cpu_ids) || (pr->id < 0)); 706 BUG_ON((pr->id >= nr_cpu_ids) || (pr->id < 0));
707 707
708 /* 708 /*
709 * Buggy BIOS check 709 * Buggy BIOS check
710 * ACPI id of processors can be reported wrongly by the BIOS. 710 * ACPI id of processors can be reported wrongly by the BIOS.
711 * Don't trust it blindly 711 * Don't trust it blindly
712 */ 712 */
713 if (per_cpu(processor_device_array, pr->id) != NULL && 713 if (per_cpu(processor_device_array, pr->id) != NULL &&
714 per_cpu(processor_device_array, pr->id) != device) { 714 per_cpu(processor_device_array, pr->id) != device) {
715 printk(KERN_WARNING "BIOS reported wrong ACPI id " 715 printk(KERN_WARNING "BIOS reported wrong ACPI id "
716 "for the processor\n"); 716 "for the processor\n");
717 return -ENODEV; 717 return -ENODEV;
718 } 718 }
719 per_cpu(processor_device_array, pr->id) = device; 719 per_cpu(processor_device_array, pr->id) = device;
720 720
721 per_cpu(processors, pr->id) = pr; 721 per_cpu(processors, pr->id) = pr;
722 722
723 result = acpi_processor_add_fs(device); 723 result = acpi_processor_add_fs(device);
724 if (result) 724 if (result)
725 goto end; 725 goto end;
726 726
727 sysdev = get_cpu_sysdev(pr->id); 727 sysdev = get_cpu_sysdev(pr->id);
728 if (sysfs_create_link(&device->dev.kobj, &sysdev->kobj, "sysdev")) 728 if (sysfs_create_link(&device->dev.kobj, &sysdev->kobj, "sysdev"))
729 return -EFAULT; 729 return -EFAULT;
730 730
731 /* _PDC call should be done before doing anything else (if reqd.). */ 731 /* _PDC call should be done before doing anything else (if reqd.). */
732 arch_acpi_processor_init_pdc(pr); 732 arch_acpi_processor_init_pdc(pr);
733 acpi_processor_set_pdc(pr); 733 acpi_processor_set_pdc(pr);
734 #ifdef CONFIG_CPU_FREQ 734 #ifdef CONFIG_CPU_FREQ
735 acpi_processor_ppc_has_changed(pr); 735 acpi_processor_ppc_has_changed(pr);
736 #endif 736 #endif
737 acpi_processor_get_throttling_info(pr); 737 acpi_processor_get_throttling_info(pr);
738 acpi_processor_get_limit_info(pr); 738 acpi_processor_get_limit_info(pr);
739 739
740 740
741 acpi_processor_power_init(pr, device); 741 acpi_processor_power_init(pr, device);
742 742
743 pr->cdev = thermal_cooling_device_register("Processor", device, 743 pr->cdev = thermal_cooling_device_register("Processor", device,
744 &processor_cooling_ops); 744 &processor_cooling_ops);
745 if (IS_ERR(pr->cdev)) { 745 if (IS_ERR(pr->cdev)) {
746 result = PTR_ERR(pr->cdev); 746 result = PTR_ERR(pr->cdev);
747 goto end; 747 goto end;
748 } 748 }
749 749
750 dev_info(&device->dev, "registered as cooling_device%d\n", 750 dev_info(&device->dev, "registered as cooling_device%d\n",
751 pr->cdev->id); 751 pr->cdev->id);
752 752
753 result = sysfs_create_link(&device->dev.kobj, 753 result = sysfs_create_link(&device->dev.kobj,
754 &pr->cdev->device.kobj, 754 &pr->cdev->device.kobj,
755 "thermal_cooling"); 755 "thermal_cooling");
756 if (result) 756 if (result)
757 printk(KERN_ERR PREFIX "Create sysfs link\n"); 757 printk(KERN_ERR PREFIX "Create sysfs link\n");
758 result = sysfs_create_link(&pr->cdev->device.kobj, 758 result = sysfs_create_link(&pr->cdev->device.kobj,
759 &device->dev.kobj, 759 &device->dev.kobj,
760 "device"); 760 "device");
761 if (result) 761 if (result)
762 printk(KERN_ERR PREFIX "Create sysfs link\n"); 762 printk(KERN_ERR PREFIX "Create sysfs link\n");
763 763
764 if (pr->flags.throttling) { 764 if (pr->flags.throttling) {
765 printk(KERN_INFO PREFIX "%s [%s] (supports", 765 printk(KERN_INFO PREFIX "%s [%s] (supports",
766 acpi_device_name(device), acpi_device_bid(device)); 766 acpi_device_name(device), acpi_device_bid(device));
767 printk(" %d throttling states", pr->throttling.state_count); 767 printk(" %d throttling states", pr->throttling.state_count);
768 printk(")\n"); 768 printk(")\n");
769 } 769 }
770 770
771 end: 771 end:
772 772
773 return result; 773 return result;
774 } 774 }
775 775
776 static void acpi_processor_notify(struct acpi_device *device, u32 event) 776 static void acpi_processor_notify(struct acpi_device *device, u32 event)
777 { 777 {
778 struct acpi_processor *pr = acpi_driver_data(device); 778 struct acpi_processor *pr = acpi_driver_data(device);
779 int saved; 779 int saved;
780 780
781 if (!pr) 781 if (!pr)
782 return; 782 return;
783 783
784 switch (event) { 784 switch (event) {
785 case ACPI_PROCESSOR_NOTIFY_PERFORMANCE: 785 case ACPI_PROCESSOR_NOTIFY_PERFORMANCE:
786 saved = pr->performance_platform_limit; 786 saved = pr->performance_platform_limit;
787 acpi_processor_ppc_has_changed(pr); 787 acpi_processor_ppc_has_changed(pr);
788 if (saved == pr->performance_platform_limit) 788 if (saved == pr->performance_platform_limit)
789 break; 789 break;
790 acpi_bus_generate_proc_event(device, event, 790 acpi_bus_generate_proc_event(device, event,
791 pr->performance_platform_limit); 791 pr->performance_platform_limit);
792 acpi_bus_generate_netlink_event(device->pnp.device_class, 792 acpi_bus_generate_netlink_event(device->pnp.device_class,
793 dev_name(&device->dev), event, 793 dev_name(&device->dev), event,
794 pr->performance_platform_limit); 794 pr->performance_platform_limit);
795 break; 795 break;
796 case ACPI_PROCESSOR_NOTIFY_POWER: 796 case ACPI_PROCESSOR_NOTIFY_POWER:
797 acpi_processor_cst_has_changed(pr); 797 acpi_processor_cst_has_changed(pr);
798 acpi_bus_generate_proc_event(device, event, 0); 798 acpi_bus_generate_proc_event(device, event, 0);
799 acpi_bus_generate_netlink_event(device->pnp.device_class, 799 acpi_bus_generate_netlink_event(device->pnp.device_class,
800 dev_name(&device->dev), event, 0); 800 dev_name(&device->dev), event, 0);
801 break; 801 break;
802 case ACPI_PROCESSOR_NOTIFY_THROTTLING: 802 case ACPI_PROCESSOR_NOTIFY_THROTTLING:
803 acpi_processor_tstate_has_changed(pr); 803 acpi_processor_tstate_has_changed(pr);
804 acpi_bus_generate_proc_event(device, event, 0); 804 acpi_bus_generate_proc_event(device, event, 0);
805 acpi_bus_generate_netlink_event(device->pnp.device_class, 805 acpi_bus_generate_netlink_event(device->pnp.device_class,
806 dev_name(&device->dev), event, 0); 806 dev_name(&device->dev), event, 0);
807 default: 807 default:
808 ACPI_DEBUG_PRINT((ACPI_DB_INFO, 808 ACPI_DEBUG_PRINT((ACPI_DB_INFO,
809 "Unsupported event [0x%x]\n", event)); 809 "Unsupported event [0x%x]\n", event));
810 break; 810 break;
811 } 811 }
812 812
813 return; 813 return;
814 } 814 }
815 815
816 static int acpi_cpu_soft_notify(struct notifier_block *nfb, 816 static int acpi_cpu_soft_notify(struct notifier_block *nfb,
817 unsigned long action, void *hcpu) 817 unsigned long action, void *hcpu)
818 { 818 {
819 unsigned int cpu = (unsigned long)hcpu; 819 unsigned int cpu = (unsigned long)hcpu;
820 struct acpi_processor *pr = per_cpu(processors, cpu); 820 struct acpi_processor *pr = per_cpu(processors, cpu);
821 821
822 if (action == CPU_ONLINE && pr) { 822 if (action == CPU_ONLINE && pr) {
823 acpi_processor_ppc_has_changed(pr); 823 acpi_processor_ppc_has_changed(pr);
824 acpi_processor_cst_has_changed(pr); 824 acpi_processor_cst_has_changed(pr);
825 acpi_processor_tstate_has_changed(pr); 825 acpi_processor_tstate_has_changed(pr);
826 } 826 }
827 return NOTIFY_OK; 827 return NOTIFY_OK;
828 } 828 }
829 829
830 static struct notifier_block acpi_cpu_notifier = 830 static struct notifier_block acpi_cpu_notifier =
831 { 831 {
832 .notifier_call = acpi_cpu_soft_notify, 832 .notifier_call = acpi_cpu_soft_notify,
833 }; 833 };
834 834
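acpi_cpu_soft_notify() above re-reads the _PPC, _CST and throttling state whenever a CPU comes back online, and acpi_cpu_notifier wires it into the CPU hotplug notifier chain. A minimal sketch of the same notifier_block pattern in a hypothetical module (the demo_* names and the log message are illustrative; register_hotcpu_notifier()/unregister_hotcpu_notifier() are the same helpers this driver uses further down):

	#include <linux/module.h>
	#include <linux/kernel.h>
	#include <linux/cpu.h>
	#include <linux/notifier.h>

	/* React only once the CPU is fully online, as the ACPI callback does. */
	static int demo_cpu_notify(struct notifier_block *nfb,
				   unsigned long action, void *hcpu)
	{
		unsigned int cpu = (unsigned long)hcpu;

		if (action == CPU_ONLINE)
			pr_info("demo: cpu %u came online\n", cpu);

		return NOTIFY_OK;
	}

	static struct notifier_block demo_cpu_notifier = {
		.notifier_call = demo_cpu_notify,
	};

	static int __init demo_init(void)
	{
		register_hotcpu_notifier(&demo_cpu_notifier);
		return 0;
	}

	static void __exit demo_exit(void)
	{
		unregister_hotcpu_notifier(&demo_cpu_notifier);
	}

	module_init(demo_init);
	module_exit(demo_exit);
	MODULE_LICENSE("GPL");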
835 static int acpi_processor_add(struct acpi_device *device) 835 static int acpi_processor_add(struct acpi_device *device)
836 { 836 {
837 struct acpi_processor *pr = NULL; 837 struct acpi_processor *pr = NULL;
838 838
839 839
840 if (!device) 840 if (!device)
841 return -EINVAL; 841 return -EINVAL;
842 842
843 pr = kzalloc(sizeof(struct acpi_processor), GFP_KERNEL); 843 pr = kzalloc(sizeof(struct acpi_processor), GFP_KERNEL);
844 if (!pr) 844 if (!pr)
845 return -ENOMEM; 845 return -ENOMEM;
846 846
847 if (!alloc_cpumask_var(&pr->throttling.shared_cpu_map, GFP_KERNEL)) { 847 if (!zalloc_cpumask_var(&pr->throttling.shared_cpu_map, GFP_KERNEL)) {
848 kfree(pr); 848 kfree(pr);
849 return -ENOMEM; 849 return -ENOMEM;
850 } 850 }
851 851
852 pr->handle = device->handle; 852 pr->handle = device->handle;
853 strcpy(acpi_device_name(device), ACPI_PROCESSOR_DEVICE_NAME); 853 strcpy(acpi_device_name(device), ACPI_PROCESSOR_DEVICE_NAME);
854 strcpy(acpi_device_class(device), ACPI_PROCESSOR_CLASS); 854 strcpy(acpi_device_class(device), ACPI_PROCESSOR_CLASS);
855 device->driver_data = pr; 855 device->driver_data = pr;
856 856
857 return 0; 857 return 0;
858 } 858 }
859 859
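The zalloc_cpumask_var() call in acpi_processor_add() above is the one-line change made to this file: with CONFIG_CPUMASK_OFFSTACK (selected by MAXSMP), cpumask_var_t is heap-allocated and alloc_cpumask_var() hands back uninitialized memory, so a caller that assumes a cleared mask must ask for a zeroed one. A minimal kernel-module sketch of the allocate/use/free lifecycle (the demo_mask name is illustrative):

	#include <linux/module.h>
	#include <linux/kernel.h>
	#include <linux/errno.h>
	#include <linux/cpumask.h>
	#include <linux/gfp.h>

	static cpumask_var_t demo_mask;

	static int __init demo_mask_init(void)
	{
		/*
		 * zalloc_cpumask_var() returns the mask already cleared.  With
		 * CONFIG_CPUMASK_OFFSTACK the mask lives on the heap, and plain
		 * alloc_cpumask_var() would leave its contents undefined.
		 */
		if (!zalloc_cpumask_var(&demo_mask, GFP_KERNEL))
			return -ENOMEM;

		cpumask_set_cpu(0, demo_mask);
		pr_info("demo: cpu0 %s set in mask\n",
			cpumask_test_cpu(0, demo_mask) ? "is" : "is not");
		return 0;
	}

	static void __exit demo_mask_exit(void)
	{
		free_cpumask_var(demo_mask);	/* no-op when the mask is on-stack */
	}

	module_init(demo_mask_init);
	module_exit(demo_mask_exit);
	MODULE_LICENSE("GPL");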
860 static int acpi_processor_remove(struct acpi_device *device, int type) 860 static int acpi_processor_remove(struct acpi_device *device, int type)
861 { 861 {
862 struct acpi_processor *pr = NULL; 862 struct acpi_processor *pr = NULL;
863 863
864 864
865 if (!device || !acpi_driver_data(device)) 865 if (!device || !acpi_driver_data(device))
866 return -EINVAL; 866 return -EINVAL;
867 867
868 pr = acpi_driver_data(device); 868 pr = acpi_driver_data(device);
869 869
870 if (pr->id >= nr_cpu_ids) 870 if (pr->id >= nr_cpu_ids)
871 goto free; 871 goto free;
872 872
873 if (type == ACPI_BUS_REMOVAL_EJECT) { 873 if (type == ACPI_BUS_REMOVAL_EJECT) {
874 if (acpi_processor_handle_eject(pr)) 874 if (acpi_processor_handle_eject(pr))
875 return -EINVAL; 875 return -EINVAL;
876 } 876 }
877 877
878 acpi_processor_power_exit(pr, device); 878 acpi_processor_power_exit(pr, device);
879 879
880 sysfs_remove_link(&device->dev.kobj, "sysdev"); 880 sysfs_remove_link(&device->dev.kobj, "sysdev");
881 881
882 acpi_processor_remove_fs(device); 882 acpi_processor_remove_fs(device);
883 883
884 if (pr->cdev) { 884 if (pr->cdev) {
885 sysfs_remove_link(&device->dev.kobj, "thermal_cooling"); 885 sysfs_remove_link(&device->dev.kobj, "thermal_cooling");
886 sysfs_remove_link(&pr->cdev->device.kobj, "device"); 886 sysfs_remove_link(&pr->cdev->device.kobj, "device");
887 thermal_cooling_device_unregister(pr->cdev); 887 thermal_cooling_device_unregister(pr->cdev);
888 pr->cdev = NULL; 888 pr->cdev = NULL;
889 } 889 }
890 890
891 per_cpu(processors, pr->id) = NULL; 891 per_cpu(processors, pr->id) = NULL;
892 per_cpu(processor_device_array, pr->id) = NULL; 892 per_cpu(processor_device_array, pr->id) = NULL;
893 893
894 free: 894 free:
895 free_cpumask_var(pr->throttling.shared_cpu_map); 895 free_cpumask_var(pr->throttling.shared_cpu_map);
896 kfree(pr); 896 kfree(pr);
897 897
898 return 0; 898 return 0;
899 } 899 }
900 900
901 #ifdef CONFIG_ACPI_HOTPLUG_CPU 901 #ifdef CONFIG_ACPI_HOTPLUG_CPU
902 /**************************************************************************** 902 /****************************************************************************
903 * Acpi processor hotplug support * 903 * Acpi processor hotplug support *
904 ****************************************************************************/ 904 ****************************************************************************/
905 905
906 static int is_processor_present(acpi_handle handle) 906 static int is_processor_present(acpi_handle handle)
907 { 907 {
908 acpi_status status; 908 acpi_status status;
909 unsigned long long sta = 0; 909 unsigned long long sta = 0;
910 910
911 911
912 status = acpi_evaluate_integer(handle, "_STA", NULL, &sta); 912 status = acpi_evaluate_integer(handle, "_STA", NULL, &sta);
913 913
914 if (ACPI_SUCCESS(status) && (sta & ACPI_STA_DEVICE_PRESENT)) 914 if (ACPI_SUCCESS(status) && (sta & ACPI_STA_DEVICE_PRESENT))
915 return 1; 915 return 1;
916 916
917 /* 917 /*
918 * _STA is mandatory for a processor that supports hot plug 918 * _STA is mandatory for a processor that supports hot plug
919 */ 919 */
920 if (status == AE_NOT_FOUND) 920 if (status == AE_NOT_FOUND)
921 ACPI_DEBUG_PRINT((ACPI_DB_INFO, 921 ACPI_DEBUG_PRINT((ACPI_DB_INFO,
922 "Processor does not support hot plug\n")); 922 "Processor does not support hot plug\n"));
923 else 923 else
924 ACPI_EXCEPTION((AE_INFO, status, 924 ACPI_EXCEPTION((AE_INFO, status,
925 "Processor Device is not present")); 925 "Processor Device is not present"));
926 return 0; 926 return 0;
927 } 927 }
928 928
929 static 929 static
930 int acpi_processor_device_add(acpi_handle handle, struct acpi_device **device) 930 int acpi_processor_device_add(acpi_handle handle, struct acpi_device **device)
931 { 931 {
932 acpi_handle phandle; 932 acpi_handle phandle;
933 struct acpi_device *pdev; 933 struct acpi_device *pdev;
934 struct acpi_processor *pr; 934 struct acpi_processor *pr;
935 935
936 936
937 if (acpi_get_parent(handle, &phandle)) { 937 if (acpi_get_parent(handle, &phandle)) {
938 return -ENODEV; 938 return -ENODEV;
939 } 939 }
940 940
941 if (acpi_bus_get_device(phandle, &pdev)) { 941 if (acpi_bus_get_device(phandle, &pdev)) {
942 return -ENODEV; 942 return -ENODEV;
943 } 943 }
944 944
945 if (acpi_bus_add(device, pdev, handle, ACPI_BUS_TYPE_PROCESSOR)) { 945 if (acpi_bus_add(device, pdev, handle, ACPI_BUS_TYPE_PROCESSOR)) {
946 return -ENODEV; 946 return -ENODEV;
947 } 947 }
948 948
949 acpi_bus_start(*device); 949 acpi_bus_start(*device);
950 950
951 pr = acpi_driver_data(*device); 951 pr = acpi_driver_data(*device);
952 if (!pr) 952 if (!pr)
953 return -ENODEV; 953 return -ENODEV;
954 954
955 if ((pr->id >= 0) && (pr->id < nr_cpu_ids)) { 955 if ((pr->id >= 0) && (pr->id < nr_cpu_ids)) {
956 kobject_uevent(&(*device)->dev.kobj, KOBJ_ONLINE); 956 kobject_uevent(&(*device)->dev.kobj, KOBJ_ONLINE);
957 } 957 }
958 return 0; 958 return 0;
959 } 959 }
960 960
961 static void __ref acpi_processor_hotplug_notify(acpi_handle handle, 961 static void __ref acpi_processor_hotplug_notify(acpi_handle handle,
962 u32 event, void *data) 962 u32 event, void *data)
963 { 963 {
964 struct acpi_processor *pr; 964 struct acpi_processor *pr;
965 struct acpi_device *device = NULL; 965 struct acpi_device *device = NULL;
966 int result; 966 int result;
967 967
968 968
969 switch (event) { 969 switch (event) {
970 case ACPI_NOTIFY_BUS_CHECK: 970 case ACPI_NOTIFY_BUS_CHECK:
971 case ACPI_NOTIFY_DEVICE_CHECK: 971 case ACPI_NOTIFY_DEVICE_CHECK:
972 ACPI_DEBUG_PRINT((ACPI_DB_INFO, 972 ACPI_DEBUG_PRINT((ACPI_DB_INFO,
973 "Processor driver received %s event\n", 973 "Processor driver received %s event\n",
974 (event == ACPI_NOTIFY_BUS_CHECK) ? 974 (event == ACPI_NOTIFY_BUS_CHECK) ?
975 "ACPI_NOTIFY_BUS_CHECK" : "ACPI_NOTIFY_DEVICE_CHECK")); 975 "ACPI_NOTIFY_BUS_CHECK" : "ACPI_NOTIFY_DEVICE_CHECK"));
976 976
977 if (!is_processor_present(handle)) 977 if (!is_processor_present(handle))
978 break; 978 break;
979 979
980 if (acpi_bus_get_device(handle, &device)) { 980 if (acpi_bus_get_device(handle, &device)) {
981 result = acpi_processor_device_add(handle, &device); 981 result = acpi_processor_device_add(handle, &device);
982 if (result) 982 if (result)
983 printk(KERN_ERR PREFIX 983 printk(KERN_ERR PREFIX
984 "Unable to add the device\n"); 984 "Unable to add the device\n");
985 break; 985 break;
986 } 986 }
987 987
988 pr = acpi_driver_data(device); 988 pr = acpi_driver_data(device);
989 if (!pr) { 989 if (!pr) {
990 printk(KERN_ERR PREFIX "Driver data is NULL\n"); 990 printk(KERN_ERR PREFIX "Driver data is NULL\n");
991 break; 991 break;
992 } 992 }
993 993
994 if (pr->id >= 0 && (pr->id < nr_cpu_ids)) { 994 if (pr->id >= 0 && (pr->id < nr_cpu_ids)) {
995 kobject_uevent(&device->dev.kobj, KOBJ_OFFLINE); 995 kobject_uevent(&device->dev.kobj, KOBJ_OFFLINE);
996 break; 996 break;
997 } 997 }
998 998
999 result = acpi_processor_start(device); 999 result = acpi_processor_start(device);
1000 if ((!result) && ((pr->id >= 0) && (pr->id < nr_cpu_ids))) { 1000 if ((!result) && ((pr->id >= 0) && (pr->id < nr_cpu_ids))) {
1001 kobject_uevent(&device->dev.kobj, KOBJ_ONLINE); 1001 kobject_uevent(&device->dev.kobj, KOBJ_ONLINE);
1002 } else { 1002 } else {
1003 printk(KERN_ERR PREFIX "Device [%s] failed to start\n", 1003 printk(KERN_ERR PREFIX "Device [%s] failed to start\n",
1004 acpi_device_bid(device)); 1004 acpi_device_bid(device));
1005 } 1005 }
1006 break; 1006 break;
1007 case ACPI_NOTIFY_EJECT_REQUEST: 1007 case ACPI_NOTIFY_EJECT_REQUEST:
1008 ACPI_DEBUG_PRINT((ACPI_DB_INFO, 1008 ACPI_DEBUG_PRINT((ACPI_DB_INFO,
1009 "received ACPI_NOTIFY_EJECT_REQUEST\n")); 1009 "received ACPI_NOTIFY_EJECT_REQUEST\n"));
1010 1010
1011 if (acpi_bus_get_device(handle, &device)) { 1011 if (acpi_bus_get_device(handle, &device)) {
1012 printk(KERN_ERR PREFIX 1012 printk(KERN_ERR PREFIX
1013 "Device doesn't exist, dropping EJECT\n"); 1013 "Device doesn't exist, dropping EJECT\n");
1014 break; 1014 break;
1015 } 1015 }
1016 pr = acpi_driver_data(device); 1016 pr = acpi_driver_data(device);
1017 if (!pr) { 1017 if (!pr) {
1018 printk(KERN_ERR PREFIX 1018 printk(KERN_ERR PREFIX
1019 "Driver data is NULL, dropping EJECT\n"); 1019 "Driver data is NULL, dropping EJECT\n");
1020 return; 1020 return;
1021 } 1021 }
1022 1022
1023 if ((pr->id < nr_cpu_ids) && (cpu_present(pr->id))) 1023 if ((pr->id < nr_cpu_ids) && (cpu_present(pr->id)))
1024 kobject_uevent(&device->dev.kobj, KOBJ_OFFLINE); 1024 kobject_uevent(&device->dev.kobj, KOBJ_OFFLINE);
1025 break; 1025 break;
1026 default: 1026 default:
1027 ACPI_DEBUG_PRINT((ACPI_DB_INFO, 1027 ACPI_DEBUG_PRINT((ACPI_DB_INFO,
1028 "Unsupported event [0x%x]\n", event)); 1028 "Unsupported event [0x%x]\n", event));
1029 break; 1029 break;
1030 } 1030 }
1031 1031
1032 return; 1032 return;
1033 } 1033 }
1034 1034
1035 static acpi_status 1035 static acpi_status
1036 processor_walk_namespace_cb(acpi_handle handle, 1036 processor_walk_namespace_cb(acpi_handle handle,
1037 u32 lvl, void *context, void **rv) 1037 u32 lvl, void *context, void **rv)
1038 { 1038 {
1039 acpi_status status; 1039 acpi_status status;
1040 int *action = context; 1040 int *action = context;
1041 acpi_object_type type = 0; 1041 acpi_object_type type = 0;
1042 1042
1043 status = acpi_get_type(handle, &type); 1043 status = acpi_get_type(handle, &type);
1044 if (ACPI_FAILURE(status)) 1044 if (ACPI_FAILURE(status))
1045 return (AE_OK); 1045 return (AE_OK);
1046 1046
1047 if (type != ACPI_TYPE_PROCESSOR) 1047 if (type != ACPI_TYPE_PROCESSOR)
1048 return (AE_OK); 1048 return (AE_OK);
1049 1049
1050 switch (*action) { 1050 switch (*action) {
1051 case INSTALL_NOTIFY_HANDLER: 1051 case INSTALL_NOTIFY_HANDLER:
1052 acpi_install_notify_handler(handle, 1052 acpi_install_notify_handler(handle,
1053 ACPI_SYSTEM_NOTIFY, 1053 ACPI_SYSTEM_NOTIFY,
1054 acpi_processor_hotplug_notify, 1054 acpi_processor_hotplug_notify,
1055 NULL); 1055 NULL);
1056 break; 1056 break;
1057 case UNINSTALL_NOTIFY_HANDLER: 1057 case UNINSTALL_NOTIFY_HANDLER:
1058 acpi_remove_notify_handler(handle, 1058 acpi_remove_notify_handler(handle,
1059 ACPI_SYSTEM_NOTIFY, 1059 ACPI_SYSTEM_NOTIFY,
1060 acpi_processor_hotplug_notify); 1060 acpi_processor_hotplug_notify);
1061 break; 1061 break;
1062 default: 1062 default:
1063 break; 1063 break;
1064 } 1064 }
1065 1065
1066 return (AE_OK); 1066 return (AE_OK);
1067 } 1067 }
1068 1068
1069 static acpi_status acpi_processor_hotadd_init(acpi_handle handle, int *p_cpu) 1069 static acpi_status acpi_processor_hotadd_init(acpi_handle handle, int *p_cpu)
1070 { 1070 {
1071 1071
1072 if (!is_processor_present(handle)) { 1072 if (!is_processor_present(handle)) {
1073 return AE_ERROR; 1073 return AE_ERROR;
1074 } 1074 }
1075 1075
1076 if (acpi_map_lsapic(handle, p_cpu)) 1076 if (acpi_map_lsapic(handle, p_cpu))
1077 return AE_ERROR; 1077 return AE_ERROR;
1078 1078
1079 if (arch_register_cpu(*p_cpu)) { 1079 if (arch_register_cpu(*p_cpu)) {
1080 acpi_unmap_lsapic(*p_cpu); 1080 acpi_unmap_lsapic(*p_cpu);
1081 return AE_ERROR; 1081 return AE_ERROR;
1082 } 1082 }
1083 1083
1084 return AE_OK; 1084 return AE_OK;
1085 } 1085 }
1086 1086
1087 static int acpi_processor_handle_eject(struct acpi_processor *pr) 1087 static int acpi_processor_handle_eject(struct acpi_processor *pr)
1088 { 1088 {
1089 if (cpu_online(pr->id)) 1089 if (cpu_online(pr->id))
1090 cpu_down(pr->id); 1090 cpu_down(pr->id);
1091 1091
1092 arch_unregister_cpu(pr->id); 1092 arch_unregister_cpu(pr->id);
1093 acpi_unmap_lsapic(pr->id); 1093 acpi_unmap_lsapic(pr->id);
1094 return (0); 1094 return (0);
1095 } 1095 }
1096 #else 1096 #else
1097 static acpi_status acpi_processor_hotadd_init(acpi_handle handle, int *p_cpu) 1097 static acpi_status acpi_processor_hotadd_init(acpi_handle handle, int *p_cpu)
1098 { 1098 {
1099 return AE_ERROR; 1099 return AE_ERROR;
1100 } 1100 }
1101 static int acpi_processor_handle_eject(struct acpi_processor *pr) 1101 static int acpi_processor_handle_eject(struct acpi_processor *pr)
1102 { 1102 {
1103 return (-EINVAL); 1103 return (-EINVAL);
1104 } 1104 }
1105 #endif 1105 #endif
1106 1106
1107 static 1107 static
1108 void acpi_processor_install_hotplug_notify(void) 1108 void acpi_processor_install_hotplug_notify(void)
1109 { 1109 {
1110 #ifdef CONFIG_ACPI_HOTPLUG_CPU 1110 #ifdef CONFIG_ACPI_HOTPLUG_CPU
1111 int action = INSTALL_NOTIFY_HANDLER; 1111 int action = INSTALL_NOTIFY_HANDLER;
1112 acpi_walk_namespace(ACPI_TYPE_PROCESSOR, 1112 acpi_walk_namespace(ACPI_TYPE_PROCESSOR,
1113 ACPI_ROOT_OBJECT, 1113 ACPI_ROOT_OBJECT,
1114 ACPI_UINT32_MAX, 1114 ACPI_UINT32_MAX,
1115 processor_walk_namespace_cb, &action, NULL); 1115 processor_walk_namespace_cb, &action, NULL);
1116 #endif 1116 #endif
1117 register_hotcpu_notifier(&acpi_cpu_notifier); 1117 register_hotcpu_notifier(&acpi_cpu_notifier);
1118 } 1118 }
1119 1119
1120 static 1120 static
1121 void acpi_processor_uninstall_hotplug_notify(void) 1121 void acpi_processor_uninstall_hotplug_notify(void)
1122 { 1122 {
1123 #ifdef CONFIG_ACPI_HOTPLUG_CPU 1123 #ifdef CONFIG_ACPI_HOTPLUG_CPU
1124 int action = UNINSTALL_NOTIFY_HANDLER; 1124 int action = UNINSTALL_NOTIFY_HANDLER;
1125 acpi_walk_namespace(ACPI_TYPE_PROCESSOR, 1125 acpi_walk_namespace(ACPI_TYPE_PROCESSOR,
1126 ACPI_ROOT_OBJECT, 1126 ACPI_ROOT_OBJECT,
1127 ACPI_UINT32_MAX, 1127 ACPI_UINT32_MAX,
1128 processor_walk_namespace_cb, &action, NULL); 1128 processor_walk_namespace_cb, &action, NULL);
1129 #endif 1129 #endif
1130 unregister_hotcpu_notifier(&acpi_cpu_notifier); 1130 unregister_hotcpu_notifier(&acpi_cpu_notifier);
1131 } 1131 }
1132 1132
1133 /* 1133 /*
1134 * We keep the driver loaded even when ACPI is not running. 1134 * We keep the driver loaded even when ACPI is not running.
1135 * This is needed for the powernow-k8 driver, which works even without 1135 * This is needed for the powernow-k8 driver, which works even without
1136 * ACPI, but needs symbols from this driver. 1136 * ACPI, but needs symbols from this driver.
1137 */ 1137 */
1138 1138
1139 static int __init acpi_processor_init(void) 1139 static int __init acpi_processor_init(void)
1140 { 1140 {
1141 int result = 0; 1141 int result = 0;
1142 1142
1143 memset(&errata, 0, sizeof(errata)); 1143 memset(&errata, 0, sizeof(errata));
1144 1144
1145 #ifdef CONFIG_SMP 1145 #ifdef CONFIG_SMP
1146 if (ACPI_FAILURE(acpi_get_table(ACPI_SIG_MADT, 0, 1146 if (ACPI_FAILURE(acpi_get_table(ACPI_SIG_MADT, 0,
1147 (struct acpi_table_header **)&madt))) 1147 (struct acpi_table_header **)&madt)))
1148 madt = NULL; 1148 madt = NULL;
1149 #endif 1149 #endif
1150 1150
1151 acpi_processor_dir = proc_mkdir(ACPI_PROCESSOR_CLASS, acpi_root_dir); 1151 acpi_processor_dir = proc_mkdir(ACPI_PROCESSOR_CLASS, acpi_root_dir);
1152 if (!acpi_processor_dir) 1152 if (!acpi_processor_dir)
1153 return -ENOMEM; 1153 return -ENOMEM;
1154 1154
1155 /* 1155 /*
1156 * Check whether the system matches an entry in the DMI table. If 1156 * Check whether the system matches an entry in the DMI table. If
1157 * so, OSPM should not use mwait for CPU C-states. 1157 * so, OSPM should not use mwait for CPU C-states.
1158 */ 1158 */
1159 dmi_check_system(processor_idle_dmi_table); 1159 dmi_check_system(processor_idle_dmi_table);
1160 result = cpuidle_register_driver(&acpi_idle_driver); 1160 result = cpuidle_register_driver(&acpi_idle_driver);
1161 if (result < 0) 1161 if (result < 0)
1162 goto out_proc; 1162 goto out_proc;
1163 1163
1164 result = acpi_bus_register_driver(&acpi_processor_driver); 1164 result = acpi_bus_register_driver(&acpi_processor_driver);
1165 if (result < 0) 1165 if (result < 0)
1166 goto out_cpuidle; 1166 goto out_cpuidle;
1167 1167
1168 acpi_processor_install_hotplug_notify(); 1168 acpi_processor_install_hotplug_notify();
1169 1169
1170 acpi_thermal_cpufreq_init(); 1170 acpi_thermal_cpufreq_init();
1171 1171
1172 acpi_processor_ppc_init(); 1172 acpi_processor_ppc_init();
1173 1173
1174 acpi_processor_throttling_init(); 1174 acpi_processor_throttling_init();
1175 1175
1176 return 0; 1176 return 0;
1177 1177
1178 out_cpuidle: 1178 out_cpuidle:
1179 cpuidle_unregister_driver(&acpi_idle_driver); 1179 cpuidle_unregister_driver(&acpi_idle_driver);
1180 1180
1181 out_proc: 1181 out_proc:
1182 remove_proc_entry(ACPI_PROCESSOR_CLASS, acpi_root_dir); 1182 remove_proc_entry(ACPI_PROCESSOR_CLASS, acpi_root_dir);
1183 1183
1184 return result; 1184 return result;
1185 } 1185 }
1186 1186
1187 static void __exit acpi_processor_exit(void) 1187 static void __exit acpi_processor_exit(void)
1188 { 1188 {
1189 acpi_processor_ppc_exit(); 1189 acpi_processor_ppc_exit();
1190 1190
1191 acpi_thermal_cpufreq_exit(); 1191 acpi_thermal_cpufreq_exit();
1192 1192
1193 acpi_processor_uninstall_hotplug_notify(); 1193 acpi_processor_uninstall_hotplug_notify();
1194 1194
1195 acpi_bus_unregister_driver(&acpi_processor_driver); 1195 acpi_bus_unregister_driver(&acpi_processor_driver);
1196 1196
1197 cpuidle_unregister_driver(&acpi_idle_driver); 1197 cpuidle_unregister_driver(&acpi_idle_driver);
1198 1198
1199 remove_proc_entry(ACPI_PROCESSOR_CLASS, acpi_root_dir); 1199 remove_proc_entry(ACPI_PROCESSOR_CLASS, acpi_root_dir);
1200 1200
1201 return; 1201 return;
1202 } 1202 }
1203 1203
1204 module_init(acpi_processor_init); 1204 module_init(acpi_processor_init);
1205 module_exit(acpi_processor_exit); 1205 module_exit(acpi_processor_exit);
1206 1206
1207 EXPORT_SYMBOL(acpi_processor_set_thermal_limit); 1207 EXPORT_SYMBOL(acpi_processor_set_thermal_limit);
1208 1208
1209 MODULE_ALIAS("processor"); 1209 MODULE_ALIAS("processor");
1210 1210
drivers/cpufreq/cpufreq.c
1 /* 1 /*
2 * linux/drivers/cpufreq/cpufreq.c 2 * linux/drivers/cpufreq/cpufreq.c
3 * 3 *
4 * Copyright (C) 2001 Russell King 4 * Copyright (C) 2001 Russell King
5 * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de> 5 * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
6 * 6 *
7 * Oct 2005 - Ashok Raj <ashok.raj@intel.com> 7 * Oct 2005 - Ashok Raj <ashok.raj@intel.com>
8 * Added handling for CPU hotplug 8 * Added handling for CPU hotplug
9 * Feb 2006 - Jacob Shin <jacob.shin@amd.com> 9 * Feb 2006 - Jacob Shin <jacob.shin@amd.com>
10 * Fix handling for CPU hotplug -- affected CPUs 10 * Fix handling for CPU hotplug -- affected CPUs
11 * 11 *
12 * This program is free software; you can redistribute it and/or modify 12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License version 2 as 13 * it under the terms of the GNU General Public License version 2 as
14 * published by the Free Software Foundation. 14 * published by the Free Software Foundation.
15 * 15 *
16 */ 16 */
17 17
18 #include <linux/kernel.h> 18 #include <linux/kernel.h>
19 #include <linux/module.h> 19 #include <linux/module.h>
20 #include <linux/init.h> 20 #include <linux/init.h>
21 #include <linux/notifier.h> 21 #include <linux/notifier.h>
22 #include <linux/cpufreq.h> 22 #include <linux/cpufreq.h>
23 #include <linux/delay.h> 23 #include <linux/delay.h>
24 #include <linux/interrupt.h> 24 #include <linux/interrupt.h>
25 #include <linux/spinlock.h> 25 #include <linux/spinlock.h>
26 #include <linux/device.h> 26 #include <linux/device.h>
27 #include <linux/slab.h> 27 #include <linux/slab.h>
28 #include <linux/cpu.h> 28 #include <linux/cpu.h>
29 #include <linux/completion.h> 29 #include <linux/completion.h>
30 #include <linux/mutex.h> 30 #include <linux/mutex.h>
31 31
32 #define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_CORE, \ 32 #define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_CORE, \
33 "cpufreq-core", msg) 33 "cpufreq-core", msg)
34 34
35 /** 35 /**
36 * The "cpufreq driver" - the arch- or hardware-dependent low 36 * The "cpufreq driver" - the arch- or hardware-dependent low
37 * level driver of CPUFreq support, and its spinlock. This lock 37 * level driver of CPUFreq support, and its spinlock. This lock
38 * also protects the cpufreq_cpu_data array. 38 * also protects the cpufreq_cpu_data array.
39 */ 39 */
40 static struct cpufreq_driver *cpufreq_driver; 40 static struct cpufreq_driver *cpufreq_driver;
41 static DEFINE_PER_CPU(struct cpufreq_policy *, cpufreq_cpu_data); 41 static DEFINE_PER_CPU(struct cpufreq_policy *, cpufreq_cpu_data);
42 #ifdef CONFIG_HOTPLUG_CPU 42 #ifdef CONFIG_HOTPLUG_CPU
43 /* This one keeps track of the previously set governor of a removed CPU */ 43 /* This one keeps track of the previously set governor of a removed CPU */
44 static DEFINE_PER_CPU(struct cpufreq_governor *, cpufreq_cpu_governor); 44 static DEFINE_PER_CPU(struct cpufreq_governor *, cpufreq_cpu_governor);
45 #endif 45 #endif
46 static DEFINE_SPINLOCK(cpufreq_driver_lock); 46 static DEFINE_SPINLOCK(cpufreq_driver_lock);
47 47
48 /* 48 /*
49 * cpu_policy_rwsem is a per CPU reader-writer semaphore designed to cure 49 * cpu_policy_rwsem is a per CPU reader-writer semaphore designed to cure
50 * all cpufreq/hotplug/workqueue/etc related lock issues. 50 * all cpufreq/hotplug/workqueue/etc related lock issues.
51 * 51 *
52 * The rules for this semaphore: 52 * The rules for this semaphore:
53 * - Any routine that wants to read from the policy structure will 53 * - Any routine that wants to read from the policy structure will
54 * do a down_read on this semaphore. 54 * do a down_read on this semaphore.
55 * - Any routine that will write to the policy structure and/or may take away 55 * - Any routine that will write to the policy structure and/or may take away
56 * the policy altogether (eg. CPU hotplug), will hold this lock in write 56 * the policy altogether (eg. CPU hotplug), will hold this lock in write
57 * mode before doing so. 57 * mode before doing so.
58 * 58 *
59 * Additional rules: 59 * Additional rules:
60 * - All holders of the lock should check to make sure that the CPU they 60 * - All holders of the lock should check to make sure that the CPU they
61 * are concerned with are online after they get the lock. 61 * are concerned with are online after they get the lock.
62 * - Governor routines that can be called in cpufreq hotplug path should not 62 * - Governor routines that can be called in cpufreq hotplug path should not
63 * take this sem as top level hotplug notifier handler takes this. 63 * take this sem as top level hotplug notifier handler takes this.
64 */ 64 */
65 static DEFINE_PER_CPU(int, policy_cpu); 65 static DEFINE_PER_CPU(int, policy_cpu);
66 static DEFINE_PER_CPU(struct rw_semaphore, cpu_policy_rwsem); 66 static DEFINE_PER_CPU(struct rw_semaphore, cpu_policy_rwsem);
67 67
68 #define lock_policy_rwsem(mode, cpu) \ 68 #define lock_policy_rwsem(mode, cpu) \
69 int lock_policy_rwsem_##mode \ 69 int lock_policy_rwsem_##mode \
70 (int cpu) \ 70 (int cpu) \
71 { \ 71 { \
72 int policy_cpu = per_cpu(policy_cpu, cpu); \ 72 int policy_cpu = per_cpu(policy_cpu, cpu); \
73 BUG_ON(policy_cpu == -1); \ 73 BUG_ON(policy_cpu == -1); \
74 down_##mode(&per_cpu(cpu_policy_rwsem, policy_cpu)); \ 74 down_##mode(&per_cpu(cpu_policy_rwsem, policy_cpu)); \
75 if (unlikely(!cpu_online(cpu))) { \ 75 if (unlikely(!cpu_online(cpu))) { \
76 up_##mode(&per_cpu(cpu_policy_rwsem, policy_cpu)); \ 76 up_##mode(&per_cpu(cpu_policy_rwsem, policy_cpu)); \
77 return -1; \ 77 return -1; \
78 } \ 78 } \
79 \ 79 \
80 return 0; \ 80 return 0; \
81 } 81 }
82 82
83 lock_policy_rwsem(read, cpu); 83 lock_policy_rwsem(read, cpu);
84 EXPORT_SYMBOL_GPL(lock_policy_rwsem_read); 84 EXPORT_SYMBOL_GPL(lock_policy_rwsem_read);
85 85
86 lock_policy_rwsem(write, cpu); 86 lock_policy_rwsem(write, cpu);
87 EXPORT_SYMBOL_GPL(lock_policy_rwsem_write); 87 EXPORT_SYMBOL_GPL(lock_policy_rwsem_write);
88 88
89 void unlock_policy_rwsem_read(int cpu) 89 void unlock_policy_rwsem_read(int cpu)
90 { 90 {
91 int policy_cpu = per_cpu(policy_cpu, cpu); 91 int policy_cpu = per_cpu(policy_cpu, cpu);
92 BUG_ON(policy_cpu == -1); 92 BUG_ON(policy_cpu == -1);
93 up_read(&per_cpu(cpu_policy_rwsem, policy_cpu)); 93 up_read(&per_cpu(cpu_policy_rwsem, policy_cpu));
94 } 94 }
95 EXPORT_SYMBOL_GPL(unlock_policy_rwsem_read); 95 EXPORT_SYMBOL_GPL(unlock_policy_rwsem_read);
96 96
97 void unlock_policy_rwsem_write(int cpu) 97 void unlock_policy_rwsem_write(int cpu)
98 { 98 {
99 int policy_cpu = per_cpu(policy_cpu, cpu); 99 int policy_cpu = per_cpu(policy_cpu, cpu);
100 BUG_ON(policy_cpu == -1); 100 BUG_ON(policy_cpu == -1);
101 up_write(&per_cpu(cpu_policy_rwsem, policy_cpu)); 101 up_write(&per_cpu(cpu_policy_rwsem, policy_cpu));
102 } 102 }
103 EXPORT_SYMBOL_GPL(unlock_policy_rwsem_write); 103 EXPORT_SYMBOL_GPL(unlock_policy_rwsem_write);
104 104
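The lock_policy_rwsem() macro above token-pastes the mode into both the generated function name and the down_/up_ helpers. For readers unfamiliar with the trick, the `lock_policy_rwsem(read, cpu);` invocation expands to roughly the function below; a zero return means the semaphore is held and must later be released with unlock_policy_rwsem_read(), while -1 means the CPU went offline and the lock has already been dropped:

	int lock_policy_rwsem_read(int cpu)
	{
		int policy_cpu = per_cpu(policy_cpu, cpu);

		BUG_ON(policy_cpu == -1);
		down_read(&per_cpu(cpu_policy_rwsem, policy_cpu));
		if (unlikely(!cpu_online(cpu))) {
			up_read(&per_cpu(cpu_policy_rwsem, policy_cpu));
			return -1;
		}

		return 0;
	}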
105 105
106 /* internal prototypes */ 106 /* internal prototypes */
107 static int __cpufreq_governor(struct cpufreq_policy *policy, 107 static int __cpufreq_governor(struct cpufreq_policy *policy,
108 unsigned int event); 108 unsigned int event);
109 static unsigned int __cpufreq_get(unsigned int cpu); 109 static unsigned int __cpufreq_get(unsigned int cpu);
110 static void handle_update(struct work_struct *work); 110 static void handle_update(struct work_struct *work);
111 111
112 /** 112 /**
113 * Two notifier lists: the "policy" list is involved in the 113 * Two notifier lists: the "policy" list is involved in the
114 * validation process for a new CPU frequency policy; the 114 * validation process for a new CPU frequency policy; the
115 * "transition" list for kernel code that needs to handle 115 * "transition" list for kernel code that needs to handle
116 * changes to devices when the CPU clock speed changes. 116 * changes to devices when the CPU clock speed changes.
117 * The mutex locks both lists. 117 * The mutex locks both lists.
118 */ 118 */
119 static BLOCKING_NOTIFIER_HEAD(cpufreq_policy_notifier_list); 119 static BLOCKING_NOTIFIER_HEAD(cpufreq_policy_notifier_list);
120 static struct srcu_notifier_head cpufreq_transition_notifier_list; 120 static struct srcu_notifier_head cpufreq_transition_notifier_list;
121 121
122 static bool init_cpufreq_transition_notifier_list_called; 122 static bool init_cpufreq_transition_notifier_list_called;
123 static int __init init_cpufreq_transition_notifier_list(void) 123 static int __init init_cpufreq_transition_notifier_list(void)
124 { 124 {
125 srcu_init_notifier_head(&cpufreq_transition_notifier_list); 125 srcu_init_notifier_head(&cpufreq_transition_notifier_list);
126 init_cpufreq_transition_notifier_list_called = true; 126 init_cpufreq_transition_notifier_list_called = true;
127 return 0; 127 return 0;
128 } 128 }
129 pure_initcall(init_cpufreq_transition_notifier_list); 129 pure_initcall(init_cpufreq_transition_notifier_list);
130 130
131 static LIST_HEAD(cpufreq_governor_list); 131 static LIST_HEAD(cpufreq_governor_list);
132 static DEFINE_MUTEX(cpufreq_governor_mutex); 132 static DEFINE_MUTEX(cpufreq_governor_mutex);
133 133
134 struct cpufreq_policy *cpufreq_cpu_get(unsigned int cpu) 134 struct cpufreq_policy *cpufreq_cpu_get(unsigned int cpu)
135 { 135 {
136 struct cpufreq_policy *data; 136 struct cpufreq_policy *data;
137 unsigned long flags; 137 unsigned long flags;
138 138
139 if (cpu >= nr_cpu_ids) 139 if (cpu >= nr_cpu_ids)
140 goto err_out; 140 goto err_out;
141 141
142 /* get the cpufreq driver */ 142 /* get the cpufreq driver */
143 spin_lock_irqsave(&cpufreq_driver_lock, flags); 143 spin_lock_irqsave(&cpufreq_driver_lock, flags);
144 144
145 if (!cpufreq_driver) 145 if (!cpufreq_driver)
146 goto err_out_unlock; 146 goto err_out_unlock;
147 147
148 if (!try_module_get(cpufreq_driver->owner)) 148 if (!try_module_get(cpufreq_driver->owner))
149 goto err_out_unlock; 149 goto err_out_unlock;
150 150
151 151
152 /* get the CPU */ 152 /* get the CPU */
153 data = per_cpu(cpufreq_cpu_data, cpu); 153 data = per_cpu(cpufreq_cpu_data, cpu);
154 154
155 if (!data) 155 if (!data)
156 goto err_out_put_module; 156 goto err_out_put_module;
157 157
158 if (!kobject_get(&data->kobj)) 158 if (!kobject_get(&data->kobj))
159 goto err_out_put_module; 159 goto err_out_put_module;
160 160
161 spin_unlock_irqrestore(&cpufreq_driver_lock, flags); 161 spin_unlock_irqrestore(&cpufreq_driver_lock, flags);
162 return data; 162 return data;
163 163
164 err_out_put_module: 164 err_out_put_module:
165 module_put(cpufreq_driver->owner); 165 module_put(cpufreq_driver->owner);
166 err_out_unlock: 166 err_out_unlock:
167 spin_unlock_irqrestore(&cpufreq_driver_lock, flags); 167 spin_unlock_irqrestore(&cpufreq_driver_lock, flags);
168 err_out: 168 err_out:
169 return NULL; 169 return NULL;
170 } 170 }
171 EXPORT_SYMBOL_GPL(cpufreq_cpu_get); 171 EXPORT_SYMBOL_GPL(cpufreq_cpu_get);
172 172
173 173
174 void cpufreq_cpu_put(struct cpufreq_policy *data) 174 void cpufreq_cpu_put(struct cpufreq_policy *data)
175 { 175 {
176 kobject_put(&data->kobj); 176 kobject_put(&data->kobj);
177 module_put(cpufreq_driver->owner); 177 module_put(cpufreq_driver->owner);
178 } 178 }
179 EXPORT_SYMBOL_GPL(cpufreq_cpu_put); 179 EXPORT_SYMBOL_GPL(cpufreq_cpu_put);
180 180
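cpufreq_cpu_get() pins both the cpufreq driver module and the per-CPU policy kobject, returning NULL if either is unavailable, and every successful call must be balanced by cpufreq_cpu_put(). A short sketch of the expected get/check/use/put pattern (the demo_query_cpu() wrapper is illustrative):

	#include <linux/kernel.h>
	#include <linux/errno.h>
	#include <linux/cpufreq.h>

	/* Illustrative wrapper: report a CPU's current policy limits, if any. */
	static int demo_query_cpu(unsigned int cpu)
	{
		struct cpufreq_policy *policy = cpufreq_cpu_get(cpu);

		if (!policy)
			return -ENODEV;	/* no driver loaded, or no policy for cpu */

		pr_info("cpu%u: %u..%u kHz\n", cpu, policy->min, policy->max);

		cpufreq_cpu_put(policy);	/* balances the get above */
		return 0;
	}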
181 181
182 /********************************************************************* 182 /*********************************************************************
183 * UNIFIED DEBUG HELPERS * 183 * UNIFIED DEBUG HELPERS *
184 *********************************************************************/ 184 *********************************************************************/
185 #ifdef CONFIG_CPU_FREQ_DEBUG 185 #ifdef CONFIG_CPU_FREQ_DEBUG
186 186
187 /* what part(s) of the CPUfreq subsystem are debugged? */ 187 /* what part(s) of the CPUfreq subsystem are debugged? */
188 static unsigned int debug; 188 static unsigned int debug;
189 189
190 /* is the debug output ratelimit'ed using printk_ratelimit? User can 190 /* is the debug output ratelimit'ed using printk_ratelimit? User can
191 * set or modify this value. 191 * set or modify this value.
192 */ 192 */
193 static unsigned int debug_ratelimit = 1; 193 static unsigned int debug_ratelimit = 1;
194 194
195 /* is the printk_ratelimit'ing enabled? It's enabled after a successful 195 /* is the printk_ratelimit'ing enabled? It's enabled after a successful
196 * loading of a cpufreq driver, temporarily disabled when a new policy 196 * loading of a cpufreq driver, temporarily disabled when a new policy
197 * is set, and disabled upon cpufreq driver removal 197 * is set, and disabled upon cpufreq driver removal
198 */ 198 */
199 static unsigned int disable_ratelimit = 1; 199 static unsigned int disable_ratelimit = 1;
200 static DEFINE_SPINLOCK(disable_ratelimit_lock); 200 static DEFINE_SPINLOCK(disable_ratelimit_lock);
201 201
202 static void cpufreq_debug_enable_ratelimit(void) 202 static void cpufreq_debug_enable_ratelimit(void)
203 { 203 {
204 unsigned long flags; 204 unsigned long flags;
205 205
206 spin_lock_irqsave(&disable_ratelimit_lock, flags); 206 spin_lock_irqsave(&disable_ratelimit_lock, flags);
207 if (disable_ratelimit) 207 if (disable_ratelimit)
208 disable_ratelimit--; 208 disable_ratelimit--;
209 spin_unlock_irqrestore(&disable_ratelimit_lock, flags); 209 spin_unlock_irqrestore(&disable_ratelimit_lock, flags);
210 } 210 }
211 211
212 static void cpufreq_debug_disable_ratelimit(void) 212 static void cpufreq_debug_disable_ratelimit(void)
213 { 213 {
214 unsigned long flags; 214 unsigned long flags;
215 215
216 spin_lock_irqsave(&disable_ratelimit_lock, flags); 216 spin_lock_irqsave(&disable_ratelimit_lock, flags);
217 disable_ratelimit++; 217 disable_ratelimit++;
218 spin_unlock_irqrestore(&disable_ratelimit_lock, flags); 218 spin_unlock_irqrestore(&disable_ratelimit_lock, flags);
219 } 219 }
220 220
221 void cpufreq_debug_printk(unsigned int type, const char *prefix, 221 void cpufreq_debug_printk(unsigned int type, const char *prefix,
222 const char *fmt, ...) 222 const char *fmt, ...)
223 { 223 {
224 char s[256]; 224 char s[256];
225 va_list args; 225 va_list args;
226 unsigned int len; 226 unsigned int len;
227 unsigned long flags; 227 unsigned long flags;
228 228
229 WARN_ON(!prefix); 229 WARN_ON(!prefix);
230 if (type & debug) { 230 if (type & debug) {
231 spin_lock_irqsave(&disable_ratelimit_lock, flags); 231 spin_lock_irqsave(&disable_ratelimit_lock, flags);
232 if (!disable_ratelimit && debug_ratelimit 232 if (!disable_ratelimit && debug_ratelimit
233 && !printk_ratelimit()) { 233 && !printk_ratelimit()) {
234 spin_unlock_irqrestore(&disable_ratelimit_lock, flags); 234 spin_unlock_irqrestore(&disable_ratelimit_lock, flags);
235 return; 235 return;
236 } 236 }
237 spin_unlock_irqrestore(&disable_ratelimit_lock, flags); 237 spin_unlock_irqrestore(&disable_ratelimit_lock, flags);
238 238
239 len = snprintf(s, 256, KERN_DEBUG "%s: ", prefix); 239 len = snprintf(s, 256, KERN_DEBUG "%s: ", prefix);
240 240
241 va_start(args, fmt); 241 va_start(args, fmt);
242 len += vsnprintf(&s[len], (256 - len), fmt, args); 242 len += vsnprintf(&s[len], (256 - len), fmt, args);
243 va_end(args); 243 va_end(args);
244 244
245 printk(s); 245 printk(s);
246 246
247 WARN_ON(len < 5); 247 WARN_ON(len < 5);
248 } 248 }
249 } 249 }
250 EXPORT_SYMBOL(cpufreq_debug_printk); 250 EXPORT_SYMBOL(cpufreq_debug_printk);
251 251
252 252
253 module_param(debug, uint, 0644); 253 module_param(debug, uint, 0644);
254 MODULE_PARM_DESC(debug, "CPUfreq debugging: add 1 to debug core," 254 MODULE_PARM_DESC(debug, "CPUfreq debugging: add 1 to debug core,"
255 " 2 to debug drivers, and 4 to debug governors."); 255 " 2 to debug drivers, and 4 to debug governors.");
256 256
257 module_param(debug_ratelimit, uint, 0644); 257 module_param(debug_ratelimit, uint, 0644);
258 MODULE_PARM_DESC(debug_ratelimit, "CPUfreq debugging:" 258 MODULE_PARM_DESC(debug_ratelimit, "CPUfreq debugging:"
259 " set to 0 to disable ratelimiting."); 259 " set to 0 to disable ratelimiting.");
260 260
261 #else /* !CONFIG_CPU_FREQ_DEBUG */ 261 #else /* !CONFIG_CPU_FREQ_DEBUG */
262 262
263 static inline void cpufreq_debug_enable_ratelimit(void) { return; } 263 static inline void cpufreq_debug_enable_ratelimit(void) { return; }
264 static inline void cpufreq_debug_disable_ratelimit(void) { return; } 264 static inline void cpufreq_debug_disable_ratelimit(void) { return; }
265 265
266 #endif /* CONFIG_CPU_FREQ_DEBUG */ 266 #endif /* CONFIG_CPU_FREQ_DEBUG */
267 267
268 268
269 /********************************************************************* 269 /*********************************************************************
270 * EXTERNALLY AFFECTING FREQUENCY CHANGES * 270 * EXTERNALLY AFFECTING FREQUENCY CHANGES *
271 *********************************************************************/ 271 *********************************************************************/
272 272
273 /** 273 /**
274 * adjust_jiffies - adjust the system "loops_per_jiffy" 274 * adjust_jiffies - adjust the system "loops_per_jiffy"
275 * 275 *
276 * This function alters the system "loops_per_jiffy" for the clock 276 * This function alters the system "loops_per_jiffy" for the clock
277 * speed change. Note that loops_per_jiffy cannot be updated on SMP 277 * speed change. Note that loops_per_jiffy cannot be updated on SMP
278 * systems as each CPU might be scaled differently. So, use the arch 278 * systems as each CPU might be scaled differently. So, use the arch
279 * per-CPU loops_per_jiffy value wherever possible. 279 * per-CPU loops_per_jiffy value wherever possible.
280 */ 280 */
281 #ifndef CONFIG_SMP 281 #ifndef CONFIG_SMP
282 static unsigned long l_p_j_ref; 282 static unsigned long l_p_j_ref;
283 static unsigned int l_p_j_ref_freq; 283 static unsigned int l_p_j_ref_freq;
284 284
285 static void adjust_jiffies(unsigned long val, struct cpufreq_freqs *ci) 285 static void adjust_jiffies(unsigned long val, struct cpufreq_freqs *ci)
286 { 286 {
287 if (ci->flags & CPUFREQ_CONST_LOOPS) 287 if (ci->flags & CPUFREQ_CONST_LOOPS)
288 return; 288 return;
289 289
290 if (!l_p_j_ref_freq) { 290 if (!l_p_j_ref_freq) {
291 l_p_j_ref = loops_per_jiffy; 291 l_p_j_ref = loops_per_jiffy;
292 l_p_j_ref_freq = ci->old; 292 l_p_j_ref_freq = ci->old;
293 dprintk("saving %lu as reference value for loops_per_jiffy; " 293 dprintk("saving %lu as reference value for loops_per_jiffy; "
294 "freq is %u kHz\n", l_p_j_ref, l_p_j_ref_freq); 294 "freq is %u kHz\n", l_p_j_ref, l_p_j_ref_freq);
295 } 295 }
296 if ((val == CPUFREQ_PRECHANGE && ci->old < ci->new) || 296 if ((val == CPUFREQ_PRECHANGE && ci->old < ci->new) ||
297 (val == CPUFREQ_POSTCHANGE && ci->old > ci->new) || 297 (val == CPUFREQ_POSTCHANGE && ci->old > ci->new) ||
298 (val == CPUFREQ_RESUMECHANGE || val == CPUFREQ_SUSPENDCHANGE)) { 298 (val == CPUFREQ_RESUMECHANGE || val == CPUFREQ_SUSPENDCHANGE)) {
299 loops_per_jiffy = cpufreq_scale(l_p_j_ref, l_p_j_ref_freq, 299 loops_per_jiffy = cpufreq_scale(l_p_j_ref, l_p_j_ref_freq,
300 ci->new); 300 ci->new);
301 dprintk("scaling loops_per_jiffy to %lu " 301 dprintk("scaling loops_per_jiffy to %lu "
302 "for frequency %u kHz\n", loops_per_jiffy, ci->new); 302 "for frequency %u kHz\n", loops_per_jiffy, ci->new);
303 } 303 }
304 } 304 }
305 #else 305 #else
306 static inline void adjust_jiffies(unsigned long val, struct cpufreq_freqs *ci) 306 static inline void adjust_jiffies(unsigned long val, struct cpufreq_freqs *ci)
307 { 307 {
308 return; 308 return;
309 } 309 }
310 #endif 310 #endif
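/*
 * Worked example for the scaling above, with illustrative numbers only:
 * assume loops_per_jiffy == 4000000 was saved as the reference at
 * l_p_j_ref_freq == 1000000 kHz.  cpufreq_scale(old, div, mult) computes
 * roughly old * mult / div, so a transition to 2000000 kHz gives
 *
 *	cpufreq_scale(4000000, 1000000, 2000000) == 8000000
 *
 * which keeps udelay() roughly calibrated after the frequency change.
 */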
311 311
312 312
313 /** 313 /**
314 * cpufreq_notify_transition - call notifier chain and adjust_jiffies 314 * cpufreq_notify_transition - call notifier chain and adjust_jiffies
315 * on frequency transition. 315 * on frequency transition.
316 * 316 *
317 * This function calls the transition notifiers and the "adjust_jiffies" 317 * This function calls the transition notifiers and the "adjust_jiffies"
318 * function. It is called twice on all CPU frequency changes that have 318 * function. It is called twice on all CPU frequency changes that have
319 * external effects. 319 * external effects.
320 */ 320 */
321 void cpufreq_notify_transition(struct cpufreq_freqs *freqs, unsigned int state) 321 void cpufreq_notify_transition(struct cpufreq_freqs *freqs, unsigned int state)
322 { 322 {
323 struct cpufreq_policy *policy; 323 struct cpufreq_policy *policy;
324 324
325 BUG_ON(irqs_disabled()); 325 BUG_ON(irqs_disabled());
326 326
327 freqs->flags = cpufreq_driver->flags; 327 freqs->flags = cpufreq_driver->flags;
328 dprintk("notification %u of frequency transition to %u kHz\n", 328 dprintk("notification %u of frequency transition to %u kHz\n",
329 state, freqs->new); 329 state, freqs->new);
330 330
331 policy = per_cpu(cpufreq_cpu_data, freqs->cpu); 331 policy = per_cpu(cpufreq_cpu_data, freqs->cpu);
332 switch (state) { 332 switch (state) {
333 333
334 case CPUFREQ_PRECHANGE: 334 case CPUFREQ_PRECHANGE:
335 /* detect if the driver reported a value as "old frequency" 335 /* detect if the driver reported a value as "old frequency"
336 * which is not equal to what the cpufreq core thinks is 336 * which is not equal to what the cpufreq core thinks is
337 * "old frequency". 337 * "old frequency".
338 */ 338 */
339 if (!(cpufreq_driver->flags & CPUFREQ_CONST_LOOPS)) { 339 if (!(cpufreq_driver->flags & CPUFREQ_CONST_LOOPS)) {
340 if ((policy) && (policy->cpu == freqs->cpu) && 340 if ((policy) && (policy->cpu == freqs->cpu) &&
341 (policy->cur) && (policy->cur != freqs->old)) { 341 (policy->cur) && (policy->cur != freqs->old)) {
342 dprintk("Warning: CPU frequency is" 342 dprintk("Warning: CPU frequency is"
343 " %u, cpufreq assumed %u kHz.\n", 343 " %u, cpufreq assumed %u kHz.\n",
344 freqs->old, policy->cur); 344 freqs->old, policy->cur);
345 freqs->old = policy->cur; 345 freqs->old = policy->cur;
346 } 346 }
347 } 347 }
348 srcu_notifier_call_chain(&cpufreq_transition_notifier_list, 348 srcu_notifier_call_chain(&cpufreq_transition_notifier_list,
349 CPUFREQ_PRECHANGE, freqs); 349 CPUFREQ_PRECHANGE, freqs);
350 adjust_jiffies(CPUFREQ_PRECHANGE, freqs); 350 adjust_jiffies(CPUFREQ_PRECHANGE, freqs);
351 break; 351 break;
352 352
353 case CPUFREQ_POSTCHANGE: 353 case CPUFREQ_POSTCHANGE:
354 adjust_jiffies(CPUFREQ_POSTCHANGE, freqs); 354 adjust_jiffies(CPUFREQ_POSTCHANGE, freqs);
355 srcu_notifier_call_chain(&cpufreq_transition_notifier_list, 355 srcu_notifier_call_chain(&cpufreq_transition_notifier_list,
356 CPUFREQ_POSTCHANGE, freqs); 356 CPUFREQ_POSTCHANGE, freqs);
357 if (likely(policy) && likely(policy->cpu == freqs->cpu)) 357 if (likely(policy) && likely(policy->cpu == freqs->cpu))
358 policy->cur = freqs->new; 358 policy->cur = freqs->new;
359 break; 359 break;
360 } 360 }
361 } 361 }
362 EXPORT_SYMBOL_GPL(cpufreq_notify_transition); 362 EXPORT_SYMBOL_GPL(cpufreq_notify_transition);
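/*
 * A minimal sketch of how a scaling driver is expected to bracket a
 * frequency change with the two notifications handled above.  Only the
 * function name example_driver_set_freq() and the "program the
 * hardware" step are placeholders; the struct fields and calls are the
 * ones used elsewhere in this file.
 */
static void example_driver_set_freq(unsigned int cpu, unsigned int old_khz,
				    unsigned int new_khz)
{
	struct cpufreq_freqs freqs = {
		.cpu = cpu,
		.old = old_khz,
		.new = new_khz,
	};

	cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);

	/* ... program the hardware to new_khz here ... */

	cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
}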
363 363
364 364
365 365
366 /********************************************************************* 366 /*********************************************************************
367 * SYSFS INTERFACE * 367 * SYSFS INTERFACE *
368 *********************************************************************/ 368 *********************************************************************/
369 369
370 static struct cpufreq_governor *__find_governor(const char *str_governor) 370 static struct cpufreq_governor *__find_governor(const char *str_governor)
371 { 371 {
372 struct cpufreq_governor *t; 372 struct cpufreq_governor *t;
373 373
374 list_for_each_entry(t, &cpufreq_governor_list, governor_list) 374 list_for_each_entry(t, &cpufreq_governor_list, governor_list)
375 if (!strnicmp(str_governor, t->name, CPUFREQ_NAME_LEN)) 375 if (!strnicmp(str_governor, t->name, CPUFREQ_NAME_LEN))
376 return t; 376 return t;
377 377
378 return NULL; 378 return NULL;
379 } 379 }
380 380
381 /** 381 /**
382 * cpufreq_parse_governor - parse a governor string 382 * cpufreq_parse_governor - parse a governor string
383 */ 383 */
384 static int cpufreq_parse_governor(char *str_governor, unsigned int *policy, 384 static int cpufreq_parse_governor(char *str_governor, unsigned int *policy,
385 struct cpufreq_governor **governor) 385 struct cpufreq_governor **governor)
386 { 386 {
387 int err = -EINVAL; 387 int err = -EINVAL;
388 388
389 if (!cpufreq_driver) 389 if (!cpufreq_driver)
390 goto out; 390 goto out;
391 391
392 if (cpufreq_driver->setpolicy) { 392 if (cpufreq_driver->setpolicy) {
393 if (!strnicmp(str_governor, "performance", CPUFREQ_NAME_LEN)) { 393 if (!strnicmp(str_governor, "performance", CPUFREQ_NAME_LEN)) {
394 *policy = CPUFREQ_POLICY_PERFORMANCE; 394 *policy = CPUFREQ_POLICY_PERFORMANCE;
395 err = 0; 395 err = 0;
396 } else if (!strnicmp(str_governor, "powersave", 396 } else if (!strnicmp(str_governor, "powersave",
397 CPUFREQ_NAME_LEN)) { 397 CPUFREQ_NAME_LEN)) {
398 *policy = CPUFREQ_POLICY_POWERSAVE; 398 *policy = CPUFREQ_POLICY_POWERSAVE;
399 err = 0; 399 err = 0;
400 } 400 }
401 } else if (cpufreq_driver->target) { 401 } else if (cpufreq_driver->target) {
402 struct cpufreq_governor *t; 402 struct cpufreq_governor *t;
403 403
404 mutex_lock(&cpufreq_governor_mutex); 404 mutex_lock(&cpufreq_governor_mutex);
405 405
406 t = __find_governor(str_governor); 406 t = __find_governor(str_governor);
407 407
408 if (t == NULL) { 408 if (t == NULL) {
409 char *name = kasprintf(GFP_KERNEL, "cpufreq_%s", 409 char *name = kasprintf(GFP_KERNEL, "cpufreq_%s",
410 str_governor); 410 str_governor);
411 411
412 if (name) { 412 if (name) {
413 int ret; 413 int ret;
414 414
415 mutex_unlock(&cpufreq_governor_mutex); 415 mutex_unlock(&cpufreq_governor_mutex);
416 ret = request_module("%s", name); 416 ret = request_module("%s", name);
417 mutex_lock(&cpufreq_governor_mutex); 417 mutex_lock(&cpufreq_governor_mutex);
418 418
419 if (ret == 0) 419 if (ret == 0)
420 t = __find_governor(str_governor); 420 t = __find_governor(str_governor);
421 } 421 }
422 422
423 kfree(name); 423 kfree(name);
424 } 424 }
425 425
426 if (t != NULL) { 426 if (t != NULL) {
427 *governor = t; 427 *governor = t;
428 err = 0; 428 err = 0;
429 } 429 }
430 430
431 mutex_unlock(&cpufreq_governor_mutex); 431 mutex_unlock(&cpufreq_governor_mutex);
432 } 432 }
433 out: 433 out:
434 return err; 434 return err;
435 } 435 }
436 436
437 437
438 /** 438 /**
439 * cpufreq_per_cpu_attr_read() / show_##file_name() - 439 * cpufreq_per_cpu_attr_read() / show_##file_name() -
440 * print out cpufreq information 440 * print out cpufreq information
441 * 441 *
442 * Write out information from cpufreq_driver->policy[cpu]; object must be 442 * Write out information from cpufreq_driver->policy[cpu]; object must be
443 * "unsigned int". 443 * "unsigned int".
444 */ 444 */
445 445
446 #define show_one(file_name, object) \ 446 #define show_one(file_name, object) \
447 static ssize_t show_##file_name \ 447 static ssize_t show_##file_name \
448 (struct cpufreq_policy *policy, char *buf) \ 448 (struct cpufreq_policy *policy, char *buf) \
449 { \ 449 { \
450 return sprintf(buf, "%u\n", policy->object); \ 450 return sprintf(buf, "%u\n", policy->object); \
451 } 451 }
452 452
453 show_one(cpuinfo_min_freq, cpuinfo.min_freq); 453 show_one(cpuinfo_min_freq, cpuinfo.min_freq);
454 show_one(cpuinfo_max_freq, cpuinfo.max_freq); 454 show_one(cpuinfo_max_freq, cpuinfo.max_freq);
455 show_one(cpuinfo_transition_latency, cpuinfo.transition_latency); 455 show_one(cpuinfo_transition_latency, cpuinfo.transition_latency);
456 show_one(scaling_min_freq, min); 456 show_one(scaling_min_freq, min);
457 show_one(scaling_max_freq, max); 457 show_one(scaling_max_freq, max);
458 show_one(scaling_cur_freq, cur); 458 show_one(scaling_cur_freq, cur);
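/*
 * For reference, what one of the show_one() instances above expands to,
 * written out by hand; this is just the macro with its arguments
 * substituted, not additional code:
 *
 *	static ssize_t show_scaling_min_freq(struct cpufreq_policy *policy,
 *					     char *buf)
 *	{
 *		return sprintf(buf, "%u\n", policy->min);
 *	}
 */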
459 459
460 static int __cpufreq_set_policy(struct cpufreq_policy *data, 460 static int __cpufreq_set_policy(struct cpufreq_policy *data,
461 struct cpufreq_policy *policy); 461 struct cpufreq_policy *policy);
462 462
463 /** 463 /**
464 * cpufreq_per_cpu_attr_write() / store_##file_name() - sysfs write access 464 * cpufreq_per_cpu_attr_write() / store_##file_name() - sysfs write access
465 */ 465 */
466 #define store_one(file_name, object) \ 466 #define store_one(file_name, object) \
467 static ssize_t store_##file_name \ 467 static ssize_t store_##file_name \
468 (struct cpufreq_policy *policy, const char *buf, size_t count) \ 468 (struct cpufreq_policy *policy, const char *buf, size_t count) \
469 { \ 469 { \
470 unsigned int ret = -EINVAL; \ 470 unsigned int ret = -EINVAL; \
471 struct cpufreq_policy new_policy; \ 471 struct cpufreq_policy new_policy; \
472 \ 472 \
473 ret = cpufreq_get_policy(&new_policy, policy->cpu); \ 473 ret = cpufreq_get_policy(&new_policy, policy->cpu); \
474 if (ret) \ 474 if (ret) \
475 return -EINVAL; \ 475 return -EINVAL; \
476 \ 476 \
477 ret = sscanf(buf, "%u", &new_policy.object); \ 477 ret = sscanf(buf, "%u", &new_policy.object); \
478 if (ret != 1) \ 478 if (ret != 1) \
479 return -EINVAL; \ 479 return -EINVAL; \
480 \ 480 \
481 ret = __cpufreq_set_policy(policy, &new_policy); \ 481 ret = __cpufreq_set_policy(policy, &new_policy); \
482 policy->user_policy.object = policy->object; \ 482 policy->user_policy.object = policy->object; \
483 \ 483 \
484 return ret ? ret : count; \ 484 return ret ? ret : count; \
485 } 485 }
486 486
487 store_one(scaling_min_freq, min); 487 store_one(scaling_min_freq, min);
488 store_one(scaling_max_freq, max); 488 store_one(scaling_max_freq, max);
489 489
490 /** 490 /**
491 * show_cpuinfo_cur_freq - current CPU frequency as detected by hardware 491 * show_cpuinfo_cur_freq - current CPU frequency as detected by hardware
492 */ 492 */
493 static ssize_t show_cpuinfo_cur_freq(struct cpufreq_policy *policy, 493 static ssize_t show_cpuinfo_cur_freq(struct cpufreq_policy *policy,
494 char *buf) 494 char *buf)
495 { 495 {
496 unsigned int cur_freq = __cpufreq_get(policy->cpu); 496 unsigned int cur_freq = __cpufreq_get(policy->cpu);
497 if (!cur_freq) 497 if (!cur_freq)
498 return sprintf(buf, "<unknown>"); 498 return sprintf(buf, "<unknown>");
499 return sprintf(buf, "%u\n", cur_freq); 499 return sprintf(buf, "%u\n", cur_freq);
500 } 500 }
501 501
502 502
503 /** 503 /**
504 * show_scaling_governor - show the current policy for the specified CPU 504 * show_scaling_governor - show the current policy for the specified CPU
505 */ 505 */
506 static ssize_t show_scaling_governor(struct cpufreq_policy *policy, char *buf) 506 static ssize_t show_scaling_governor(struct cpufreq_policy *policy, char *buf)
507 { 507 {
508 if (policy->policy == CPUFREQ_POLICY_POWERSAVE) 508 if (policy->policy == CPUFREQ_POLICY_POWERSAVE)
509 return sprintf(buf, "powersave\n"); 509 return sprintf(buf, "powersave\n");
510 else if (policy->policy == CPUFREQ_POLICY_PERFORMANCE) 510 else if (policy->policy == CPUFREQ_POLICY_PERFORMANCE)
511 return sprintf(buf, "performance\n"); 511 return sprintf(buf, "performance\n");
512 else if (policy->governor) 512 else if (policy->governor)
513 return scnprintf(buf, CPUFREQ_NAME_LEN, "%s\n", 513 return scnprintf(buf, CPUFREQ_NAME_LEN, "%s\n",
514 policy->governor->name); 514 policy->governor->name);
515 return -EINVAL; 515 return -EINVAL;
516 } 516 }
517 517
518 518
519 /** 519 /**
520 * store_scaling_governor - store policy for the specified CPU 520 * store_scaling_governor - store policy for the specified CPU
521 */ 521 */
522 static ssize_t store_scaling_governor(struct cpufreq_policy *policy, 522 static ssize_t store_scaling_governor(struct cpufreq_policy *policy,
523 const char *buf, size_t count) 523 const char *buf, size_t count)
524 { 524 {
525 unsigned int ret = -EINVAL; 525 unsigned int ret = -EINVAL;
526 char str_governor[16]; 526 char str_governor[16];
527 struct cpufreq_policy new_policy; 527 struct cpufreq_policy new_policy;
528 528
529 ret = cpufreq_get_policy(&new_policy, policy->cpu); 529 ret = cpufreq_get_policy(&new_policy, policy->cpu);
530 if (ret) 530 if (ret)
531 return ret; 531 return ret;
532 532
533 ret = sscanf(buf, "%15s", str_governor); 533 ret = sscanf(buf, "%15s", str_governor);
534 if (ret != 1) 534 if (ret != 1)
535 return -EINVAL; 535 return -EINVAL;
536 536
537 if (cpufreq_parse_governor(str_governor, &new_policy.policy, 537 if (cpufreq_parse_governor(str_governor, &new_policy.policy,
538 &new_policy.governor)) 538 &new_policy.governor))
539 return -EINVAL; 539 return -EINVAL;
540 540
541 /* Do not use cpufreq_set_policy here or the user_policy.max 541 /* Do not use cpufreq_set_policy here or the user_policy.max
542 will be wrongly overridden */ 542 will be wrongly overridden */
543 ret = __cpufreq_set_policy(policy, &new_policy); 543 ret = __cpufreq_set_policy(policy, &new_policy);
544 544
545 policy->user_policy.policy = policy->policy; 545 policy->user_policy.policy = policy->policy;
546 policy->user_policy.governor = policy->governor; 546 policy->user_policy.governor = policy->governor;
547 547
548 if (ret) 548 if (ret)
549 return ret; 549 return ret;
550 else 550 else
551 return count; 551 return count;
552 } 552 }
553 553
554 /** 554 /**
555 * show_scaling_driver - show the cpufreq driver currently loaded 555 * show_scaling_driver - show the cpufreq driver currently loaded
556 */ 556 */
557 static ssize_t show_scaling_driver(struct cpufreq_policy *policy, char *buf) 557 static ssize_t show_scaling_driver(struct cpufreq_policy *policy, char *buf)
558 { 558 {
559 return scnprintf(buf, CPUFREQ_NAME_LEN, "%s\n", cpufreq_driver->name); 559 return scnprintf(buf, CPUFREQ_NAME_LEN, "%s\n", cpufreq_driver->name);
560 } 560 }
561 561
562 /** 562 /**
563 * show_scaling_available_governors - show the available CPUfreq governors 563 * show_scaling_available_governors - show the available CPUfreq governors
564 */ 564 */
565 static ssize_t show_scaling_available_governors(struct cpufreq_policy *policy, 565 static ssize_t show_scaling_available_governors(struct cpufreq_policy *policy,
566 char *buf) 566 char *buf)
567 { 567 {
568 ssize_t i = 0; 568 ssize_t i = 0;
569 struct cpufreq_governor *t; 569 struct cpufreq_governor *t;
570 570
571 if (!cpufreq_driver->target) { 571 if (!cpufreq_driver->target) {
572 i += sprintf(buf, "performance powersave"); 572 i += sprintf(buf, "performance powersave");
573 goto out; 573 goto out;
574 } 574 }
575 575
576 list_for_each_entry(t, &cpufreq_governor_list, governor_list) { 576 list_for_each_entry(t, &cpufreq_governor_list, governor_list) {
577 if (i >= (ssize_t) ((PAGE_SIZE / sizeof(char)) 577 if (i >= (ssize_t) ((PAGE_SIZE / sizeof(char))
578 - (CPUFREQ_NAME_LEN + 2))) 578 - (CPUFREQ_NAME_LEN + 2)))
579 goto out; 579 goto out;
580 i += scnprintf(&buf[i], CPUFREQ_NAME_LEN, "%s ", t->name); 580 i += scnprintf(&buf[i], CPUFREQ_NAME_LEN, "%s ", t->name);
581 } 581 }
582 out: 582 out:
583 i += sprintf(&buf[i], "\n"); 583 i += sprintf(&buf[i], "\n");
584 return i; 584 return i;
585 } 585 }
586 586
587 static ssize_t show_cpus(const struct cpumask *mask, char *buf) 587 static ssize_t show_cpus(const struct cpumask *mask, char *buf)
588 { 588 {
589 ssize_t i = 0; 589 ssize_t i = 0;
590 unsigned int cpu; 590 unsigned int cpu;
591 591
592 for_each_cpu(cpu, mask) { 592 for_each_cpu(cpu, mask) {
593 if (i) 593 if (i)
594 i += scnprintf(&buf[i], (PAGE_SIZE - i - 2), " "); 594 i += scnprintf(&buf[i], (PAGE_SIZE - i - 2), " ");
595 i += scnprintf(&buf[i], (PAGE_SIZE - i - 2), "%u", cpu); 595 i += scnprintf(&buf[i], (PAGE_SIZE - i - 2), "%u", cpu);
596 if (i >= (PAGE_SIZE - 5)) 596 if (i >= (PAGE_SIZE - 5))
597 break; 597 break;
598 } 598 }
599 i += sprintf(&buf[i], "\n"); 599 i += sprintf(&buf[i], "\n");
600 return i; 600 return i;
601 } 601 }
602 602
603 /** 603 /**
604 * show_related_cpus - show the CPUs affected by each transition even if 604 * show_related_cpus - show the CPUs affected by each transition even if
605 * hw coordination is in use 605 * hw coordination is in use
606 */ 606 */
607 static ssize_t show_related_cpus(struct cpufreq_policy *policy, char *buf) 607 static ssize_t show_related_cpus(struct cpufreq_policy *policy, char *buf)
608 { 608 {
609 if (cpumask_empty(policy->related_cpus)) 609 if (cpumask_empty(policy->related_cpus))
610 return show_cpus(policy->cpus, buf); 610 return show_cpus(policy->cpus, buf);
611 return show_cpus(policy->related_cpus, buf); 611 return show_cpus(policy->related_cpus, buf);
612 } 612 }
613 613
614 /** 614 /**
615 * show_affected_cpus - show the CPUs affected by each transition 615 * show_affected_cpus - show the CPUs affected by each transition
616 */ 616 */
617 static ssize_t show_affected_cpus(struct cpufreq_policy *policy, char *buf) 617 static ssize_t show_affected_cpus(struct cpufreq_policy *policy, char *buf)
618 { 618 {
619 return show_cpus(policy->cpus, buf); 619 return show_cpus(policy->cpus, buf);
620 } 620 }
621 621
622 static ssize_t store_scaling_setspeed(struct cpufreq_policy *policy, 622 static ssize_t store_scaling_setspeed(struct cpufreq_policy *policy,
623 const char *buf, size_t count) 623 const char *buf, size_t count)
624 { 624 {
625 unsigned int freq = 0; 625 unsigned int freq = 0;
626 unsigned int ret; 626 unsigned int ret;
627 627
628 if (!policy->governor || !policy->governor->store_setspeed) 628 if (!policy->governor || !policy->governor->store_setspeed)
629 return -EINVAL; 629 return -EINVAL;
630 630
631 ret = sscanf(buf, "%u", &freq); 631 ret = sscanf(buf, "%u", &freq);
632 if (ret != 1) 632 if (ret != 1)
633 return -EINVAL; 633 return -EINVAL;
634 634
635 policy->governor->store_setspeed(policy, freq); 635 policy->governor->store_setspeed(policy, freq);
636 636
637 return count; 637 return count;
638 } 638 }
639 639
640 static ssize_t show_scaling_setspeed(struct cpufreq_policy *policy, char *buf) 640 static ssize_t show_scaling_setspeed(struct cpufreq_policy *policy, char *buf)
641 { 641 {
642 if (!policy->governor || !policy->governor->show_setspeed) 642 if (!policy->governor || !policy->governor->show_setspeed)
643 return sprintf(buf, "<unsupported>\n"); 643 return sprintf(buf, "<unsupported>\n");
644 644
645 return policy->governor->show_setspeed(policy, buf); 645 return policy->governor->show_setspeed(policy, buf);
646 } 646 }
647 647
648 #define define_one_ro(_name) \ 648 #define define_one_ro(_name) \
649 static struct freq_attr _name = \ 649 static struct freq_attr _name = \
650 __ATTR(_name, 0444, show_##_name, NULL) 650 __ATTR(_name, 0444, show_##_name, NULL)
651 651
652 #define define_one_ro0400(_name) \ 652 #define define_one_ro0400(_name) \
653 static struct freq_attr _name = \ 653 static struct freq_attr _name = \
654 __ATTR(_name, 0400, show_##_name, NULL) 654 __ATTR(_name, 0400, show_##_name, NULL)
655 655
656 #define define_one_rw(_name) \ 656 #define define_one_rw(_name) \
657 static struct freq_attr _name = \ 657 static struct freq_attr _name = \
658 __ATTR(_name, 0644, show_##_name, store_##_name) 658 __ATTR(_name, 0644, show_##_name, store_##_name)
659 659
660 define_one_ro0400(cpuinfo_cur_freq); 660 define_one_ro0400(cpuinfo_cur_freq);
661 define_one_ro(cpuinfo_min_freq); 661 define_one_ro(cpuinfo_min_freq);
662 define_one_ro(cpuinfo_max_freq); 662 define_one_ro(cpuinfo_max_freq);
663 define_one_ro(cpuinfo_transition_latency); 663 define_one_ro(cpuinfo_transition_latency);
664 define_one_ro(scaling_available_governors); 664 define_one_ro(scaling_available_governors);
665 define_one_ro(scaling_driver); 665 define_one_ro(scaling_driver);
666 define_one_ro(scaling_cur_freq); 666 define_one_ro(scaling_cur_freq);
667 define_one_ro(related_cpus); 667 define_one_ro(related_cpus);
668 define_one_ro(affected_cpus); 668 define_one_ro(affected_cpus);
669 define_one_rw(scaling_min_freq); 669 define_one_rw(scaling_min_freq);
670 define_one_rw(scaling_max_freq); 670 define_one_rw(scaling_max_freq);
671 define_one_rw(scaling_governor); 671 define_one_rw(scaling_governor);
672 define_one_rw(scaling_setspeed); 672 define_one_rw(scaling_setspeed);
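/*
 * Likewise, define_one_rw(scaling_max_freq) above is shorthand for
 * roughly:
 *
 *	static struct freq_attr scaling_max_freq =
 *		__ATTR(scaling_max_freq, 0644, show_scaling_max_freq,
 *		       store_scaling_max_freq);
 *
 * i.e. a sysfs attribute wired to the show_/store_ handlers generated
 * by the macros earlier in this file.
 */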
673 673
674 static struct attribute *default_attrs[] = { 674 static struct attribute *default_attrs[] = {
675 &cpuinfo_min_freq.attr, 675 &cpuinfo_min_freq.attr,
676 &cpuinfo_max_freq.attr, 676 &cpuinfo_max_freq.attr,
677 &cpuinfo_transition_latency.attr, 677 &cpuinfo_transition_latency.attr,
678 &scaling_min_freq.attr, 678 &scaling_min_freq.attr,
679 &scaling_max_freq.attr, 679 &scaling_max_freq.attr,
680 &affected_cpus.attr, 680 &affected_cpus.attr,
681 &related_cpus.attr, 681 &related_cpus.attr,
682 &scaling_governor.attr, 682 &scaling_governor.attr,
683 &scaling_driver.attr, 683 &scaling_driver.attr,
684 &scaling_available_governors.attr, 684 &scaling_available_governors.attr,
685 &scaling_setspeed.attr, 685 &scaling_setspeed.attr,
686 NULL 686 NULL
687 }; 687 };
688 688
689 #define to_policy(k) container_of(k, struct cpufreq_policy, kobj) 689 #define to_policy(k) container_of(k, struct cpufreq_policy, kobj)
690 #define to_attr(a) container_of(a, struct freq_attr, attr) 690 #define to_attr(a) container_of(a, struct freq_attr, attr)
691 691
692 static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) 692 static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
693 { 693 {
694 struct cpufreq_policy *policy = to_policy(kobj); 694 struct cpufreq_policy *policy = to_policy(kobj);
695 struct freq_attr *fattr = to_attr(attr); 695 struct freq_attr *fattr = to_attr(attr);
696 ssize_t ret = -EINVAL; 696 ssize_t ret = -EINVAL;
697 policy = cpufreq_cpu_get(policy->cpu); 697 policy = cpufreq_cpu_get(policy->cpu);
698 if (!policy) 698 if (!policy)
699 goto no_policy; 699 goto no_policy;
700 700
701 if (lock_policy_rwsem_read(policy->cpu) < 0) 701 if (lock_policy_rwsem_read(policy->cpu) < 0)
702 goto fail; 702 goto fail;
703 703
704 if (fattr->show) 704 if (fattr->show)
705 ret = fattr->show(policy, buf); 705 ret = fattr->show(policy, buf);
706 else 706 else
707 ret = -EIO; 707 ret = -EIO;
708 708
709 unlock_policy_rwsem_read(policy->cpu); 709 unlock_policy_rwsem_read(policy->cpu);
710 fail: 710 fail:
711 cpufreq_cpu_put(policy); 711 cpufreq_cpu_put(policy);
712 no_policy: 712 no_policy:
713 return ret; 713 return ret;
714 } 714 }
715 715
716 static ssize_t store(struct kobject *kobj, struct attribute *attr, 716 static ssize_t store(struct kobject *kobj, struct attribute *attr,
717 const char *buf, size_t count) 717 const char *buf, size_t count)
718 { 718 {
719 struct cpufreq_policy *policy = to_policy(kobj); 719 struct cpufreq_policy *policy = to_policy(kobj);
720 struct freq_attr *fattr = to_attr(attr); 720 struct freq_attr *fattr = to_attr(attr);
721 ssize_t ret = -EINVAL; 721 ssize_t ret = -EINVAL;
722 policy = cpufreq_cpu_get(policy->cpu); 722 policy = cpufreq_cpu_get(policy->cpu);
723 if (!policy) 723 if (!policy)
724 goto no_policy; 724 goto no_policy;
725 725
726 if (lock_policy_rwsem_write(policy->cpu) < 0) 726 if (lock_policy_rwsem_write(policy->cpu) < 0)
727 goto fail; 727 goto fail;
728 728
729 if (fattr->store) 729 if (fattr->store)
730 ret = fattr->store(policy, buf, count); 730 ret = fattr->store(policy, buf, count);
731 else 731 else
732 ret = -EIO; 732 ret = -EIO;
733 733
734 unlock_policy_rwsem_write(policy->cpu); 734 unlock_policy_rwsem_write(policy->cpu);
735 fail: 735 fail:
736 cpufreq_cpu_put(policy); 736 cpufreq_cpu_put(policy);
737 no_policy: 737 no_policy:
738 return ret; 738 return ret;
739 } 739 }
740 740
741 static void cpufreq_sysfs_release(struct kobject *kobj) 741 static void cpufreq_sysfs_release(struct kobject *kobj)
742 { 742 {
743 struct cpufreq_policy *policy = to_policy(kobj); 743 struct cpufreq_policy *policy = to_policy(kobj);
744 dprintk("last reference is dropped\n"); 744 dprintk("last reference is dropped\n");
745 complete(&policy->kobj_unregister); 745 complete(&policy->kobj_unregister);
746 } 746 }
747 747
748 static struct sysfs_ops sysfs_ops = { 748 static struct sysfs_ops sysfs_ops = {
749 .show = show, 749 .show = show,
750 .store = store, 750 .store = store,
751 }; 751 };
752 752
753 static struct kobj_type ktype_cpufreq = { 753 static struct kobj_type ktype_cpufreq = {
754 .sysfs_ops = &sysfs_ops, 754 .sysfs_ops = &sysfs_ops,
755 .default_attrs = default_attrs, 755 .default_attrs = default_attrs,
756 .release = cpufreq_sysfs_release, 756 .release = cpufreq_sysfs_release,
757 }; 757 };
758 758
759 759
760 /** 760 /**
761 * cpufreq_add_dev - add a CPU device 761 * cpufreq_add_dev - add a CPU device
762 * 762 *
763 * Adds the cpufreq interface for a CPU device. 763 * Adds the cpufreq interface for a CPU device.
764 */ 764 */
765 static int cpufreq_add_dev(struct sys_device *sys_dev) 765 static int cpufreq_add_dev(struct sys_device *sys_dev)
766 { 766 {
767 unsigned int cpu = sys_dev->id; 767 unsigned int cpu = sys_dev->id;
768 int ret = 0; 768 int ret = 0;
769 struct cpufreq_policy new_policy; 769 struct cpufreq_policy new_policy;
770 struct cpufreq_policy *policy; 770 struct cpufreq_policy *policy;
771 struct freq_attr **drv_attr; 771 struct freq_attr **drv_attr;
772 struct sys_device *cpu_sys_dev; 772 struct sys_device *cpu_sys_dev;
773 unsigned long flags; 773 unsigned long flags;
774 unsigned int j; 774 unsigned int j;
775 #ifdef CONFIG_SMP 775 #ifdef CONFIG_SMP
776 struct cpufreq_policy *managed_policy; 776 struct cpufreq_policy *managed_policy;
777 #endif 777 #endif
778 778
779 if (cpu_is_offline(cpu)) 779 if (cpu_is_offline(cpu))
780 return 0; 780 return 0;
781 781
782 cpufreq_debug_disable_ratelimit(); 782 cpufreq_debug_disable_ratelimit();
783 dprintk("adding CPU %u\n", cpu); 783 dprintk("adding CPU %u\n", cpu);
784 784
785 #ifdef CONFIG_SMP 785 #ifdef CONFIG_SMP
786 /* check whether a different CPU already registered this 786 /* check whether a different CPU already registered this
787 * CPU because it is in the same boat. */ 787 * CPU because it is in the same boat. */
788 policy = cpufreq_cpu_get(cpu); 788 policy = cpufreq_cpu_get(cpu);
789 if (unlikely(policy)) { 789 if (unlikely(policy)) {
790 cpufreq_cpu_put(policy); 790 cpufreq_cpu_put(policy);
791 cpufreq_debug_enable_ratelimit(); 791 cpufreq_debug_enable_ratelimit();
792 return 0; 792 return 0;
793 } 793 }
794 #endif 794 #endif
795 795
796 if (!try_module_get(cpufreq_driver->owner)) { 796 if (!try_module_get(cpufreq_driver->owner)) {
797 ret = -EINVAL; 797 ret = -EINVAL;
798 goto module_out; 798 goto module_out;
799 } 799 }
800 800
801 policy = kzalloc(sizeof(struct cpufreq_policy), GFP_KERNEL); 801 policy = kzalloc(sizeof(struct cpufreq_policy), GFP_KERNEL);
802 if (!policy) { 802 if (!policy) {
803 ret = -ENOMEM; 803 ret = -ENOMEM;
804 goto nomem_out; 804 goto nomem_out;
805 } 805 }
806 if (!alloc_cpumask_var(&policy->cpus, GFP_KERNEL)) { 806 if (!alloc_cpumask_var(&policy->cpus, GFP_KERNEL)) {
807 kfree(policy); 807 kfree(policy);
808 ret = -ENOMEM; 808 ret = -ENOMEM;
809 goto nomem_out; 809 goto nomem_out;
810 } 810 }
811 if (!alloc_cpumask_var(&policy->related_cpus, GFP_KERNEL)) { 811 if (!zalloc_cpumask_var(&policy->related_cpus, GFP_KERNEL)) {
812 free_cpumask_var(policy->cpus); 812 free_cpumask_var(policy->cpus);
813 kfree(policy); 813 kfree(policy);
814 ret = -ENOMEM; 814 ret = -ENOMEM;
815 goto nomem_out; 815 goto nomem_out;
816 } 816 }
817 817
818 policy->cpu = cpu; 818 policy->cpu = cpu;
819 cpumask_copy(policy->cpus, cpumask_of(cpu)); 819 cpumask_copy(policy->cpus, cpumask_of(cpu));
820 820
821 /* Initially set CPU itself as the policy_cpu */ 821 /* Initially set CPU itself as the policy_cpu */
822 per_cpu(policy_cpu, cpu) = cpu; 822 per_cpu(policy_cpu, cpu) = cpu;
823 lock_policy_rwsem_write(cpu); 823 lock_policy_rwsem_write(cpu);
824 824
825 init_completion(&policy->kobj_unregister); 825 init_completion(&policy->kobj_unregister);
826 INIT_WORK(&policy->update, handle_update); 826 INIT_WORK(&policy->update, handle_update);
827 827
828 /* Set governor before ->init, so that driver could check it */ 828 /* Set governor before ->init, so that driver could check it */
829 policy->governor = CPUFREQ_DEFAULT_GOVERNOR; 829 policy->governor = CPUFREQ_DEFAULT_GOVERNOR;
830 /* call driver. From then on the cpufreq must be able 830 /* call driver. From then on the cpufreq must be able
831 * to accept all calls to ->verify and ->setpolicy for this CPU 831 * to accept all calls to ->verify and ->setpolicy for this CPU
832 */ 832 */
833 ret = cpufreq_driver->init(policy); 833 ret = cpufreq_driver->init(policy);
834 if (ret) { 834 if (ret) {
835 dprintk("initialization failed\n"); 835 dprintk("initialization failed\n");
836 goto err_out; 836 goto err_out;
837 } 837 }
838 policy->user_policy.min = policy->min; 838 policy->user_policy.min = policy->min;
839 policy->user_policy.max = policy->max; 839 policy->user_policy.max = policy->max;
840 840
841 blocking_notifier_call_chain(&cpufreq_policy_notifier_list, 841 blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
842 CPUFREQ_START, policy); 842 CPUFREQ_START, policy);
843 843
844 #ifdef CONFIG_SMP 844 #ifdef CONFIG_SMP
845 845
846 #ifdef CONFIG_HOTPLUG_CPU 846 #ifdef CONFIG_HOTPLUG_CPU
847 if (per_cpu(cpufreq_cpu_governor, cpu)) { 847 if (per_cpu(cpufreq_cpu_governor, cpu)) {
848 policy->governor = per_cpu(cpufreq_cpu_governor, cpu); 848 policy->governor = per_cpu(cpufreq_cpu_governor, cpu);
849 dprintk("Restoring governor %s for cpu %d\n", 849 dprintk("Restoring governor %s for cpu %d\n",
850 policy->governor->name, cpu); 850 policy->governor->name, cpu);
851 } 851 }
852 #endif 852 #endif
853 853
854 for_each_cpu(j, policy->cpus) { 854 for_each_cpu(j, policy->cpus) {
855 if (cpu == j) 855 if (cpu == j)
856 continue; 856 continue;
857 857
858 /* Check for existing affected CPUs. 858 /* Check for existing affected CPUs.
859 * They may not be aware of it due to CPU Hotplug. 859 * They may not be aware of it due to CPU Hotplug.
860 */ 860 */
861 managed_policy = cpufreq_cpu_get(j); /* FIXME: Where is this released? What about error paths? */ 861 managed_policy = cpufreq_cpu_get(j); /* FIXME: Where is this released? What about error paths? */
862 if (unlikely(managed_policy)) { 862 if (unlikely(managed_policy)) {
863 863
864 /* Set proper policy_cpu */ 864 /* Set proper policy_cpu */
865 unlock_policy_rwsem_write(cpu); 865 unlock_policy_rwsem_write(cpu);
866 per_cpu(policy_cpu, cpu) = managed_policy->cpu; 866 per_cpu(policy_cpu, cpu) = managed_policy->cpu;
867 867
868 if (lock_policy_rwsem_write(cpu) < 0) 868 if (lock_policy_rwsem_write(cpu) < 0)
869 goto err_out_driver_exit; 869 goto err_out_driver_exit;
870 870
871 spin_lock_irqsave(&cpufreq_driver_lock, flags); 871 spin_lock_irqsave(&cpufreq_driver_lock, flags);
872 cpumask_copy(managed_policy->cpus, policy->cpus); 872 cpumask_copy(managed_policy->cpus, policy->cpus);
873 per_cpu(cpufreq_cpu_data, cpu) = managed_policy; 873 per_cpu(cpufreq_cpu_data, cpu) = managed_policy;
874 spin_unlock_irqrestore(&cpufreq_driver_lock, flags); 874 spin_unlock_irqrestore(&cpufreq_driver_lock, flags);
875 875
876 dprintk("CPU already managed, adding link\n"); 876 dprintk("CPU already managed, adding link\n");
877 ret = sysfs_create_link(&sys_dev->kobj, 877 ret = sysfs_create_link(&sys_dev->kobj,
878 &managed_policy->kobj, 878 &managed_policy->kobj,
879 "cpufreq"); 879 "cpufreq");
880 if (ret) 880 if (ret)
881 goto err_out_driver_exit; 881 goto err_out_driver_exit;
882 882
883 cpufreq_debug_enable_ratelimit(); 883 cpufreq_debug_enable_ratelimit();
884 ret = 0; 884 ret = 0;
885 goto err_out_driver_exit; /* call driver->exit() */ 885 goto err_out_driver_exit; /* call driver->exit() */
886 } 886 }
887 } 887 }
888 #endif 888 #endif
889 memcpy(&new_policy, policy, sizeof(struct cpufreq_policy)); 889 memcpy(&new_policy, policy, sizeof(struct cpufreq_policy));
890 890
891 /* prepare interface data */ 891 /* prepare interface data */
892 ret = kobject_init_and_add(&policy->kobj, &ktype_cpufreq, &sys_dev->kobj, 892 ret = kobject_init_and_add(&policy->kobj, &ktype_cpufreq, &sys_dev->kobj,
893 "cpufreq"); 893 "cpufreq");
894 if (ret) 894 if (ret)
895 goto err_out_driver_exit; 895 goto err_out_driver_exit;
896 896
897 /* set up files for this cpu device */ 897 /* set up files for this cpu device */
898 drv_attr = cpufreq_driver->attr; 898 drv_attr = cpufreq_driver->attr;
899 while ((drv_attr) && (*drv_attr)) { 899 while ((drv_attr) && (*drv_attr)) {
900 ret = sysfs_create_file(&policy->kobj, &((*drv_attr)->attr)); 900 ret = sysfs_create_file(&policy->kobj, &((*drv_attr)->attr));
901 if (ret) 901 if (ret)
902 goto err_out_driver_exit; 902 goto err_out_driver_exit;
903 drv_attr++; 903 drv_attr++;
904 } 904 }
905 if (cpufreq_driver->get) { 905 if (cpufreq_driver->get) {
906 ret = sysfs_create_file(&policy->kobj, &cpuinfo_cur_freq.attr); 906 ret = sysfs_create_file(&policy->kobj, &cpuinfo_cur_freq.attr);
907 if (ret) 907 if (ret)
908 goto err_out_driver_exit; 908 goto err_out_driver_exit;
909 } 909 }
910 if (cpufreq_driver->target) { 910 if (cpufreq_driver->target) {
911 ret = sysfs_create_file(&policy->kobj, &scaling_cur_freq.attr); 911 ret = sysfs_create_file(&policy->kobj, &scaling_cur_freq.attr);
912 if (ret) 912 if (ret)
913 goto err_out_driver_exit; 913 goto err_out_driver_exit;
914 } 914 }
915 915
916 spin_lock_irqsave(&cpufreq_driver_lock, flags); 916 spin_lock_irqsave(&cpufreq_driver_lock, flags);
917 for_each_cpu(j, policy->cpus) { 917 for_each_cpu(j, policy->cpus) {
918 per_cpu(cpufreq_cpu_data, j) = policy; 918 per_cpu(cpufreq_cpu_data, j) = policy;
919 per_cpu(policy_cpu, j) = policy->cpu; 919 per_cpu(policy_cpu, j) = policy->cpu;
920 } 920 }
921 spin_unlock_irqrestore(&cpufreq_driver_lock, flags); 921 spin_unlock_irqrestore(&cpufreq_driver_lock, flags);
922 922
923 /* symlink affected CPUs */ 923 /* symlink affected CPUs */
924 for_each_cpu(j, policy->cpus) { 924 for_each_cpu(j, policy->cpus) {
925 if (j == cpu) 925 if (j == cpu)
926 continue; 926 continue;
927 if (!cpu_online(j)) 927 if (!cpu_online(j))
928 continue; 928 continue;
929 929
930 dprintk("CPU %u already managed, adding link\n", j); 930 dprintk("CPU %u already managed, adding link\n", j);
931 cpufreq_cpu_get(cpu); 931 cpufreq_cpu_get(cpu);
932 cpu_sys_dev = get_cpu_sysdev(j); 932 cpu_sys_dev = get_cpu_sysdev(j);
933 ret = sysfs_create_link(&cpu_sys_dev->kobj, &policy->kobj, 933 ret = sysfs_create_link(&cpu_sys_dev->kobj, &policy->kobj,
934 "cpufreq"); 934 "cpufreq");
935 if (ret) 935 if (ret)
936 goto err_out_unregister; 936 goto err_out_unregister;
937 } 937 }
938 938
939 policy->governor = NULL; /* to assure that the starting sequence is 939 policy->governor = NULL; /* to assure that the starting sequence is
940 * run in cpufreq_set_policy */ 940 * run in cpufreq_set_policy */
941 941
942 /* set default policy */ 942 /* set default policy */
943 ret = __cpufreq_set_policy(policy, &new_policy); 943 ret = __cpufreq_set_policy(policy, &new_policy);
944 policy->user_policy.policy = policy->policy; 944 policy->user_policy.policy = policy->policy;
945 policy->user_policy.governor = policy->governor; 945 policy->user_policy.governor = policy->governor;
946 946
947 if (ret) { 947 if (ret) {
948 dprintk("setting policy failed\n"); 948 dprintk("setting policy failed\n");
949 goto err_out_unregister; 949 goto err_out_unregister;
950 } 950 }
951 951
952 unlock_policy_rwsem_write(cpu); 952 unlock_policy_rwsem_write(cpu);
953 953
954 kobject_uevent(&policy->kobj, KOBJ_ADD); 954 kobject_uevent(&policy->kobj, KOBJ_ADD);
955 module_put(cpufreq_driver->owner); 955 module_put(cpufreq_driver->owner);
956 dprintk("initialization complete\n"); 956 dprintk("initialization complete\n");
957 cpufreq_debug_enable_ratelimit(); 957 cpufreq_debug_enable_ratelimit();
958 958
959 return 0; 959 return 0;
960 960
961 961
962 err_out_unregister: 962 err_out_unregister:
963 spin_lock_irqsave(&cpufreq_driver_lock, flags); 963 spin_lock_irqsave(&cpufreq_driver_lock, flags);
964 for_each_cpu(j, policy->cpus) 964 for_each_cpu(j, policy->cpus)
965 per_cpu(cpufreq_cpu_data, j) = NULL; 965 per_cpu(cpufreq_cpu_data, j) = NULL;
966 spin_unlock_irqrestore(&cpufreq_driver_lock, flags); 966 spin_unlock_irqrestore(&cpufreq_driver_lock, flags);
967 967
968 kobject_put(&policy->kobj); 968 kobject_put(&policy->kobj);
969 wait_for_completion(&policy->kobj_unregister); 969 wait_for_completion(&policy->kobj_unregister);
970 970
971 err_out_driver_exit: 971 err_out_driver_exit:
972 if (cpufreq_driver->exit) 972 if (cpufreq_driver->exit)
973 cpufreq_driver->exit(policy); 973 cpufreq_driver->exit(policy);
974 974
975 err_out: 975 err_out:
976 unlock_policy_rwsem_write(cpu); 976 unlock_policy_rwsem_write(cpu);
977 kfree(policy); 977 kfree(policy);
978 978
979 nomem_out: 979 nomem_out:
980 module_put(cpufreq_driver->owner); 980 module_put(cpufreq_driver->owner);
981 module_out: 981 module_out:
982 cpufreq_debug_enable_ratelimit(); 982 cpufreq_debug_enable_ratelimit();
983 return ret; 983 return ret;
984 } 984 }
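/*
 * A minimal sketch of why the hunk above switches policy->related_cpus
 * from alloc_cpumask_var() to zalloc_cpumask_var().  With
 * CONFIG_CPUMASK_OFFSTACK (MAXSMP) the mask lives in a separate
 * kmalloc() allocation, so alloc_cpumask_var() hands back undefined
 * bits, while zalloc_cpumask_var() returns a cleared mask that the
 * cpumask_empty() check in show_related_cpus() can rely on.  The
 * function below exists only for illustration.
 */
static int example_mask_init(void)
{
	cpumask_var_t uncleared, cleared;

	if (!alloc_cpumask_var(&uncleared, GFP_KERNEL))
		return -ENOMEM;
	cpumask_clear(uncleared);	/* required before testing any bits */

	if (!zalloc_cpumask_var(&cleared, GFP_KERNEL)) {
		free_cpumask_var(uncleared);
		return -ENOMEM;
	}
	WARN_ON(!cpumask_empty(cleared));	/* already zeroed */

	free_cpumask_var(cleared);
	free_cpumask_var(uncleared);
	return 0;
}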
985 985
986 986
987 /** 987 /**
988 * __cpufreq_remove_dev - remove a CPU device 988 * __cpufreq_remove_dev - remove a CPU device
989 * 989 *
990 * Removes the cpufreq interface for a CPU device. 990 * Removes the cpufreq interface for a CPU device.
991 * Caller should already have policy_rwsem in write mode for this CPU. 991 * Caller should already have policy_rwsem in write mode for this CPU.
992 * This routine frees the rwsem before returning. 992 * This routine frees the rwsem before returning.
993 */ 993 */
994 static int __cpufreq_remove_dev(struct sys_device *sys_dev) 994 static int __cpufreq_remove_dev(struct sys_device *sys_dev)
995 { 995 {
996 unsigned int cpu = sys_dev->id; 996 unsigned int cpu = sys_dev->id;
997 unsigned long flags; 997 unsigned long flags;
998 struct cpufreq_policy *data; 998 struct cpufreq_policy *data;
999 #ifdef CONFIG_SMP 999 #ifdef CONFIG_SMP
1000 struct sys_device *cpu_sys_dev; 1000 struct sys_device *cpu_sys_dev;
1001 unsigned int j; 1001 unsigned int j;
1002 #endif 1002 #endif
1003 1003
1004 cpufreq_debug_disable_ratelimit(); 1004 cpufreq_debug_disable_ratelimit();
1005 dprintk("unregistering CPU %u\n", cpu); 1005 dprintk("unregistering CPU %u\n", cpu);
1006 1006
1007 spin_lock_irqsave(&cpufreq_driver_lock, flags); 1007 spin_lock_irqsave(&cpufreq_driver_lock, flags);
1008 data = per_cpu(cpufreq_cpu_data, cpu); 1008 data = per_cpu(cpufreq_cpu_data, cpu);
1009 1009
1010 if (!data) { 1010 if (!data) {
1011 spin_unlock_irqrestore(&cpufreq_driver_lock, flags); 1011 spin_unlock_irqrestore(&cpufreq_driver_lock, flags);
1012 cpufreq_debug_enable_ratelimit(); 1012 cpufreq_debug_enable_ratelimit();
1013 unlock_policy_rwsem_write(cpu); 1013 unlock_policy_rwsem_write(cpu);
1014 return -EINVAL; 1014 return -EINVAL;
1015 } 1015 }
1016 per_cpu(cpufreq_cpu_data, cpu) = NULL; 1016 per_cpu(cpufreq_cpu_data, cpu) = NULL;
1017 1017
1018 1018
1019 #ifdef CONFIG_SMP 1019 #ifdef CONFIG_SMP
1020 /* if this isn't the CPU which is the parent of the kobj, we 1020 /* if this isn't the CPU which is the parent of the kobj, we
1021 * only need to unlink, put and exit 1021 * only need to unlink, put and exit
1022 */ 1022 */
1023 if (unlikely(cpu != data->cpu)) { 1023 if (unlikely(cpu != data->cpu)) {
1024 dprintk("removing link\n"); 1024 dprintk("removing link\n");
1025 cpumask_clear_cpu(cpu, data->cpus); 1025 cpumask_clear_cpu(cpu, data->cpus);
1026 spin_unlock_irqrestore(&cpufreq_driver_lock, flags); 1026 spin_unlock_irqrestore(&cpufreq_driver_lock, flags);
1027 sysfs_remove_link(&sys_dev->kobj, "cpufreq"); 1027 sysfs_remove_link(&sys_dev->kobj, "cpufreq");
1028 cpufreq_cpu_put(data); 1028 cpufreq_cpu_put(data);
1029 cpufreq_debug_enable_ratelimit(); 1029 cpufreq_debug_enable_ratelimit();
1030 unlock_policy_rwsem_write(cpu); 1030 unlock_policy_rwsem_write(cpu);
1031 return 0; 1031 return 0;
1032 } 1032 }
1033 #endif 1033 #endif
1034 1034
1035 #ifdef CONFIG_SMP 1035 #ifdef CONFIG_SMP
1036 1036
1037 #ifdef CONFIG_HOTPLUG_CPU 1037 #ifdef CONFIG_HOTPLUG_CPU
1038 per_cpu(cpufreq_cpu_governor, cpu) = data->governor; 1038 per_cpu(cpufreq_cpu_governor, cpu) = data->governor;
1039 #endif 1039 #endif
1040 1040
1041 /* if we have other CPUs still registered, we need to unlink them, 1041 /* if we have other CPUs still registered, we need to unlink them,
1042 * or else wait_for_completion below will lock up. Clean the 1042 * or else wait_for_completion below will lock up. Clean the
1043 * per_cpu(cpufreq_cpu_data) while holding the lock, and remove 1043 * per_cpu(cpufreq_cpu_data) while holding the lock, and remove
1044 * the sysfs links afterwards. 1044 * the sysfs links afterwards.
1045 */ 1045 */
1046 if (unlikely(cpumask_weight(data->cpus) > 1)) { 1046 if (unlikely(cpumask_weight(data->cpus) > 1)) {
1047 for_each_cpu(j, data->cpus) { 1047 for_each_cpu(j, data->cpus) {
1048 if (j == cpu) 1048 if (j == cpu)
1049 continue; 1049 continue;
1050 per_cpu(cpufreq_cpu_data, j) = NULL; 1050 per_cpu(cpufreq_cpu_data, j) = NULL;
1051 } 1051 }
1052 } 1052 }
1053 1053
1054 spin_unlock_irqrestore(&cpufreq_driver_lock, flags); 1054 spin_unlock_irqrestore(&cpufreq_driver_lock, flags);
1055 1055
1056 if (unlikely(cpumask_weight(data->cpus) > 1)) { 1056 if (unlikely(cpumask_weight(data->cpus) > 1)) {
1057 for_each_cpu(j, data->cpus) { 1057 for_each_cpu(j, data->cpus) {
1058 if (j == cpu) 1058 if (j == cpu)
1059 continue; 1059 continue;
1060 dprintk("removing link for cpu %u\n", j); 1060 dprintk("removing link for cpu %u\n", j);
1061 #ifdef CONFIG_HOTPLUG_CPU 1061 #ifdef CONFIG_HOTPLUG_CPU
1062 per_cpu(cpufreq_cpu_governor, j) = data->governor; 1062 per_cpu(cpufreq_cpu_governor, j) = data->governor;
1063 #endif 1063 #endif
1064 cpu_sys_dev = get_cpu_sysdev(j); 1064 cpu_sys_dev = get_cpu_sysdev(j);
1065 sysfs_remove_link(&cpu_sys_dev->kobj, "cpufreq"); 1065 sysfs_remove_link(&cpu_sys_dev->kobj, "cpufreq");
1066 cpufreq_cpu_put(data); 1066 cpufreq_cpu_put(data);
1067 } 1067 }
1068 } 1068 }
1069 #else 1069 #else
1070 spin_unlock_irqrestore(&cpufreq_driver_lock, flags); 1070 spin_unlock_irqrestore(&cpufreq_driver_lock, flags);
1071 #endif 1071 #endif
1072 1072
1073 unlock_policy_rwsem_write(cpu); 1073 unlock_policy_rwsem_write(cpu);
1074 1074
1075 if (cpufreq_driver->target) 1075 if (cpufreq_driver->target)
1076 __cpufreq_governor(data, CPUFREQ_GOV_STOP); 1076 __cpufreq_governor(data, CPUFREQ_GOV_STOP);
1077 1077
1078 kobject_put(&data->kobj); 1078 kobject_put(&data->kobj);
1079 1079
1080 /* we need to make sure that the underlying kobj is actually 1080 /* we need to make sure that the underlying kobj is actually
1081 * not referenced anymore by anybody before we proceed with 1081 * not referenced anymore by anybody before we proceed with
1082 * unloading. 1082 * unloading.
1083 */ 1083 */
1084 dprintk("waiting for dropping of refcount\n"); 1084 dprintk("waiting for dropping of refcount\n");
1085 wait_for_completion(&data->kobj_unregister); 1085 wait_for_completion(&data->kobj_unregister);
1086 dprintk("wait complete\n"); 1086 dprintk("wait complete\n");
1087 1087
1088 if (cpufreq_driver->exit) 1088 if (cpufreq_driver->exit)
1089 cpufreq_driver->exit(data); 1089 cpufreq_driver->exit(data);
1090 1090
1091 free_cpumask_var(data->related_cpus); 1091 free_cpumask_var(data->related_cpus);
1092 free_cpumask_var(data->cpus); 1092 free_cpumask_var(data->cpus);
1093 kfree(data); 1093 kfree(data);
1094 per_cpu(cpufreq_cpu_data, cpu) = NULL; 1094 per_cpu(cpufreq_cpu_data, cpu) = NULL;
1095 1095
1096 cpufreq_debug_enable_ratelimit(); 1096 cpufreq_debug_enable_ratelimit();
1097 return 0; 1097 return 0;
1098 } 1098 }
1099 1099
1100 1100
1101 static int cpufreq_remove_dev(struct sys_device *sys_dev) 1101 static int cpufreq_remove_dev(struct sys_device *sys_dev)
1102 { 1102 {
1103 unsigned int cpu = sys_dev->id; 1103 unsigned int cpu = sys_dev->id;
1104 int retval; 1104 int retval;
1105 1105
1106 if (cpu_is_offline(cpu)) 1106 if (cpu_is_offline(cpu))
1107 return 0; 1107 return 0;
1108 1108
1109 if (unlikely(lock_policy_rwsem_write(cpu))) 1109 if (unlikely(lock_policy_rwsem_write(cpu)))
1110 BUG(); 1110 BUG();
1111 1111
1112 retval = __cpufreq_remove_dev(sys_dev); 1112 retval = __cpufreq_remove_dev(sys_dev);
1113 return retval; 1113 return retval;
1114 } 1114 }
1115 1115
1116 1116
1117 static void handle_update(struct work_struct *work) 1117 static void handle_update(struct work_struct *work)
1118 { 1118 {
1119 struct cpufreq_policy *policy = 1119 struct cpufreq_policy *policy =
1120 container_of(work, struct cpufreq_policy, update); 1120 container_of(work, struct cpufreq_policy, update);
1121 unsigned int cpu = policy->cpu; 1121 unsigned int cpu = policy->cpu;
1122 dprintk("handle_update for cpu %u called\n", cpu); 1122 dprintk("handle_update for cpu %u called\n", cpu);
1123 cpufreq_update_policy(cpu); 1123 cpufreq_update_policy(cpu);
1124 } 1124 }
1125 1125
1126 /** 1126 /**
1127 * cpufreq_out_of_sync - If actual and saved CPU frequency differs, we're in deep trouble. 1127 * cpufreq_out_of_sync - If actual and saved CPU frequency differs, we're in deep trouble.
1128 * @cpu: cpu number 1128 * @cpu: cpu number
1129 * @old_freq: CPU frequency the kernel thinks the CPU runs at 1129 * @old_freq: CPU frequency the kernel thinks the CPU runs at
1130 * @new_freq: CPU frequency the CPU actually runs at 1130 * @new_freq: CPU frequency the CPU actually runs at
1131 * 1131 *
1132 * We adjust to current frequency first, and need to clean up later. 1132 * We adjust to current frequency first, and need to clean up later.
1133 * So either call to cpufreq_update_policy() or schedule handle_update()). 1133 * So either call to cpufreq_update_policy() or schedule handle_update()).
1134 */ 1134 */
1135 static void cpufreq_out_of_sync(unsigned int cpu, unsigned int old_freq, 1135 static void cpufreq_out_of_sync(unsigned int cpu, unsigned int old_freq,
1136 unsigned int new_freq) 1136 unsigned int new_freq)
1137 { 1137 {
1138 struct cpufreq_freqs freqs; 1138 struct cpufreq_freqs freqs;
1139 1139
1140 dprintk("Warning: CPU frequency out of sync: cpufreq and timing " 1140 dprintk("Warning: CPU frequency out of sync: cpufreq and timing "
1141 "core thinks of %u, is %u kHz.\n", old_freq, new_freq); 1141 "core thinks of %u, is %u kHz.\n", old_freq, new_freq);
1142 1142
1143 freqs.cpu = cpu; 1143 freqs.cpu = cpu;
1144 freqs.old = old_freq; 1144 freqs.old = old_freq;
1145 freqs.new = new_freq; 1145 freqs.new = new_freq;
1146 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 1146 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
1147 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 1147 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
1148 } 1148 }
1149 1149
1150 1150
1151 /** 1151 /**
1152 * cpufreq_quick_get - get the CPU frequency (in kHz) from policy->cur 1152 * cpufreq_quick_get - get the CPU frequency (in kHz) from policy->cur
1153 * @cpu: CPU number 1153 * @cpu: CPU number
1154 * 1154 *
1155 * This is the last known freq, without actually getting it from the driver. 1155 * This is the last known freq, without actually getting it from the driver.
1156 * Return value will be same as what is shown in scaling_cur_freq in sysfs. 1156 * Return value will be same as what is shown in scaling_cur_freq in sysfs.
1157 */ 1157 */
1158 unsigned int cpufreq_quick_get(unsigned int cpu) 1158 unsigned int cpufreq_quick_get(unsigned int cpu)
1159 { 1159 {
1160 struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); 1160 struct cpufreq_policy *policy = cpufreq_cpu_get(cpu);
1161 unsigned int ret_freq = 0; 1161 unsigned int ret_freq = 0;
1162 1162
1163 if (policy) { 1163 if (policy) {
1164 ret_freq = policy->cur; 1164 ret_freq = policy->cur;
1165 cpufreq_cpu_put(policy); 1165 cpufreq_cpu_put(policy);
1166 } 1166 }
1167 1167
1168 return ret_freq; 1168 return ret_freq;
1169 } 1169 }
1170 EXPORT_SYMBOL(cpufreq_quick_get); 1170 EXPORT_SYMBOL(cpufreq_quick_get);
1171 1171
1172 1172
1173 static unsigned int __cpufreq_get(unsigned int cpu) 1173 static unsigned int __cpufreq_get(unsigned int cpu)
1174 { 1174 {
1175 struct cpufreq_policy *policy = per_cpu(cpufreq_cpu_data, cpu); 1175 struct cpufreq_policy *policy = per_cpu(cpufreq_cpu_data, cpu);
1176 unsigned int ret_freq = 0; 1176 unsigned int ret_freq = 0;
1177 1177
1178 if (!cpufreq_driver->get) 1178 if (!cpufreq_driver->get)
1179 return ret_freq; 1179 return ret_freq;
1180 1180
1181 ret_freq = cpufreq_driver->get(cpu); 1181 ret_freq = cpufreq_driver->get(cpu);
1182 1182
1183 if (ret_freq && policy->cur && 1183 if (ret_freq && policy->cur &&
1184 !(cpufreq_driver->flags & CPUFREQ_CONST_LOOPS)) { 1184 !(cpufreq_driver->flags & CPUFREQ_CONST_LOOPS)) {
1185 /* verify no discrepancy between actual and 1185 /* verify no discrepancy between actual and
1186 saved value exists */ 1186 saved value exists */
1187 if (unlikely(ret_freq != policy->cur)) { 1187 if (unlikely(ret_freq != policy->cur)) {
1188 cpufreq_out_of_sync(cpu, policy->cur, ret_freq); 1188 cpufreq_out_of_sync(cpu, policy->cur, ret_freq);
1189 schedule_work(&policy->update); 1189 schedule_work(&policy->update);
1190 } 1190 }
1191 } 1191 }
1192 1192
1193 return ret_freq; 1193 return ret_freq;
1194 } 1194 }
1195 1195
1196 /** 1196 /**
1197 * cpufreq_get - get the current CPU frequency (in kHz) 1197 * cpufreq_get - get the current CPU frequency (in kHz)
1198 * @cpu: CPU number 1198 * @cpu: CPU number
1199 * 1199 *
1200 * Get the current (static) CPU frequency 1200 * Get the current (static) CPU frequency
1201 */ 1201 */
1202 unsigned int cpufreq_get(unsigned int cpu) 1202 unsigned int cpufreq_get(unsigned int cpu)
1203 { 1203 {
1204 unsigned int ret_freq = 0; 1204 unsigned int ret_freq = 0;
1205 struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); 1205 struct cpufreq_policy *policy = cpufreq_cpu_get(cpu);
1206 1206
1207 if (!policy) 1207 if (!policy)
1208 goto out; 1208 goto out;
1209 1209
1210 if (unlikely(lock_policy_rwsem_read(cpu))) 1210 if (unlikely(lock_policy_rwsem_read(cpu)))
1211 goto out_policy; 1211 goto out_policy;
1212 1212
1213 ret_freq = __cpufreq_get(cpu); 1213 ret_freq = __cpufreq_get(cpu);
1214 1214
1215 unlock_policy_rwsem_read(cpu); 1215 unlock_policy_rwsem_read(cpu);
1216 1216
1217 out_policy: 1217 out_policy:
1218 cpufreq_cpu_put(policy); 1218 cpufreq_cpu_put(policy);
1219 out: 1219 out:
1220 return ret_freq; 1220 return ret_freq;
1221 } 1221 }
1222 EXPORT_SYMBOL(cpufreq_get); 1222 EXPORT_SYMBOL(cpufreq_get);
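/*
 * Illustrative sketch (assumed consumer code, not part of this diff): reading
 * a CPU's frequency from another subsystem.  cpufreq_quick_get() returns the
 * cached policy->cur without querying the driver; cpufreq_get() takes the
 * policy rwsem, asks the driver, and resynchronizes if the two disagree.
 * example_report_freq() is a hypothetical helper.
 */
#include <linux/cpufreq.h>
#include <linux/kernel.h>

static void example_report_freq(unsigned int cpu)
{
	unsigned int cached = cpufreq_quick_get(cpu);	/* last known value, 0 if no policy */
	unsigned int actual = cpufreq_get(cpu);		/* queried from the driver, may sleep */

	pr_info("cpu%u: cached %u kHz, driver reports %u kHz\n",
		cpu, cached, actual);
}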
1223 1223
1224 1224
1225 /** 1225 /**
1226 * cpufreq_suspend - let the low level driver prepare for suspend 1226 * cpufreq_suspend - let the low level driver prepare for suspend
1227 */ 1227 */
1228 1228
1229 static int cpufreq_suspend(struct sys_device *sysdev, pm_message_t pmsg) 1229 static int cpufreq_suspend(struct sys_device *sysdev, pm_message_t pmsg)
1230 { 1230 {
1231 int cpu = sysdev->id; 1231 int cpu = sysdev->id;
1232 int ret = 0; 1232 int ret = 0;
1233 unsigned int cur_freq = 0; 1233 unsigned int cur_freq = 0;
1234 struct cpufreq_policy *cpu_policy; 1234 struct cpufreq_policy *cpu_policy;
1235 1235
1236 dprintk("suspending cpu %u\n", cpu); 1236 dprintk("suspending cpu %u\n", cpu);
1237 1237
1238 if (!cpu_online(cpu)) 1238 if (!cpu_online(cpu))
1239 return 0; 1239 return 0;
1240 1240
1241 /* we may be lax here as interrupts are off. Nonetheless 1241 /* we may be lax here as interrupts are off. Nonetheless
1242 * we need to grab the correct cpu policy, as to check 1242 * we need to grab the correct cpu policy, as to check
1243 * whether we really run on this CPU. 1243 * whether we really run on this CPU.
1244 */ 1244 */
1245 1245
1246 cpu_policy = cpufreq_cpu_get(cpu); 1246 cpu_policy = cpufreq_cpu_get(cpu);
1247 if (!cpu_policy) 1247 if (!cpu_policy)
1248 return -EINVAL; 1248 return -EINVAL;
1249 1249
1250 /* only handle each CPU group once */ 1250 /* only handle each CPU group once */
1251 if (unlikely(cpu_policy->cpu != cpu)) 1251 if (unlikely(cpu_policy->cpu != cpu))
1252 goto out; 1252 goto out;
1253 1253
1254 if (cpufreq_driver->suspend) { 1254 if (cpufreq_driver->suspend) {
1255 ret = cpufreq_driver->suspend(cpu_policy, pmsg); 1255 ret = cpufreq_driver->suspend(cpu_policy, pmsg);
1256 if (ret) { 1256 if (ret) {
1257 printk(KERN_ERR "cpufreq: suspend failed in ->suspend " 1257 printk(KERN_ERR "cpufreq: suspend failed in ->suspend "
1258 "step on CPU %u\n", cpu_policy->cpu); 1258 "step on CPU %u\n", cpu_policy->cpu);
1259 goto out; 1259 goto out;
1260 } 1260 }
1261 } 1261 }
1262 1262
1263 if (cpufreq_driver->flags & CPUFREQ_CONST_LOOPS) 1263 if (cpufreq_driver->flags & CPUFREQ_CONST_LOOPS)
1264 goto out; 1264 goto out;
1265 1265
1266 if (cpufreq_driver->get) 1266 if (cpufreq_driver->get)
1267 cur_freq = cpufreq_driver->get(cpu_policy->cpu); 1267 cur_freq = cpufreq_driver->get(cpu_policy->cpu);
1268 1268
1269 if (!cur_freq || !cpu_policy->cur) { 1269 if (!cur_freq || !cpu_policy->cur) {
1270 printk(KERN_ERR "cpufreq: suspend failed to assert current " 1270 printk(KERN_ERR "cpufreq: suspend failed to assert current "
1271 "frequency is what timing core thinks it is.\n"); 1271 "frequency is what timing core thinks it is.\n");
1272 goto out; 1272 goto out;
1273 } 1273 }
1274 1274
1275 if (unlikely(cur_freq != cpu_policy->cur)) { 1275 if (unlikely(cur_freq != cpu_policy->cur)) {
1276 struct cpufreq_freqs freqs; 1276 struct cpufreq_freqs freqs;
1277 1277
1278 if (!(cpufreq_driver->flags & CPUFREQ_PM_NO_WARN)) 1278 if (!(cpufreq_driver->flags & CPUFREQ_PM_NO_WARN))
1279 dprintk("Warning: CPU frequency is %u, " 1279 dprintk("Warning: CPU frequency is %u, "
1280 "cpufreq assumed %u kHz.\n", 1280 "cpufreq assumed %u kHz.\n",
1281 cur_freq, cpu_policy->cur); 1281 cur_freq, cpu_policy->cur);
1282 1282
1283 freqs.cpu = cpu; 1283 freqs.cpu = cpu;
1284 freqs.old = cpu_policy->cur; 1284 freqs.old = cpu_policy->cur;
1285 freqs.new = cur_freq; 1285 freqs.new = cur_freq;
1286 1286
1287 srcu_notifier_call_chain(&cpufreq_transition_notifier_list, 1287 srcu_notifier_call_chain(&cpufreq_transition_notifier_list,
1288 CPUFREQ_SUSPENDCHANGE, &freqs); 1288 CPUFREQ_SUSPENDCHANGE, &freqs);
1289 adjust_jiffies(CPUFREQ_SUSPENDCHANGE, &freqs); 1289 adjust_jiffies(CPUFREQ_SUSPENDCHANGE, &freqs);
1290 1290
1291 cpu_policy->cur = cur_freq; 1291 cpu_policy->cur = cur_freq;
1292 } 1292 }
1293 1293
1294 out: 1294 out:
1295 cpufreq_cpu_put(cpu_policy); 1295 cpufreq_cpu_put(cpu_policy);
1296 return ret; 1296 return ret;
1297 } 1297 }
1298 1298
1299 /** 1299 /**
1300 * cpufreq_resume - restore proper CPU frequency handling after resume 1300 * cpufreq_resume - restore proper CPU frequency handling after resume
1301 * 1301 *
1302 * 1.) resume CPUfreq hardware support (cpufreq_driver->resume()) 1302 * 1.) resume CPUfreq hardware support (cpufreq_driver->resume())
1303 * 2.) if ->target and !CPUFREQ_CONST_LOOPS: verify we're in sync 1303 * 2.) if ->target and !CPUFREQ_CONST_LOOPS: verify we're in sync
1304 * 3.) schedule call cpufreq_update_policy() ASAP as interrupts are 1304 * 3.) schedule call cpufreq_update_policy() ASAP as interrupts are
1305 * restored. 1305 * restored.
1306 */ 1306 */
1307 static int cpufreq_resume(struct sys_device *sysdev) 1307 static int cpufreq_resume(struct sys_device *sysdev)
1308 { 1308 {
1309 int cpu = sysdev->id; 1309 int cpu = sysdev->id;
1310 int ret = 0; 1310 int ret = 0;
1311 struct cpufreq_policy *cpu_policy; 1311 struct cpufreq_policy *cpu_policy;
1312 1312
1313 dprintk("resuming cpu %u\n", cpu); 1313 dprintk("resuming cpu %u\n", cpu);
1314 1314
1315 if (!cpu_online(cpu)) 1315 if (!cpu_online(cpu))
1316 return 0; 1316 return 0;
1317 1317
1318 /* we may be lax here as interrupts are off. Nonetheless 1318 /* we may be lax here as interrupts are off. Nonetheless
1319 * we need to grab the correct cpu policy, as to check 1319 * we need to grab the correct cpu policy, as to check
1320 * whether we really run on this CPU. 1320 * whether we really run on this CPU.
1321 */ 1321 */
1322 1322
1323 cpu_policy = cpufreq_cpu_get(cpu); 1323 cpu_policy = cpufreq_cpu_get(cpu);
1324 if (!cpu_policy) 1324 if (!cpu_policy)
1325 return -EINVAL; 1325 return -EINVAL;
1326 1326
1327 /* only handle each CPU group once */ 1327 /* only handle each CPU group once */
1328 if (unlikely(cpu_policy->cpu != cpu)) 1328 if (unlikely(cpu_policy->cpu != cpu))
1329 goto fail; 1329 goto fail;
1330 1330
1331 if (cpufreq_driver->resume) { 1331 if (cpufreq_driver->resume) {
1332 ret = cpufreq_driver->resume(cpu_policy); 1332 ret = cpufreq_driver->resume(cpu_policy);
1333 if (ret) { 1333 if (ret) {
1334 printk(KERN_ERR "cpufreq: resume failed in ->resume " 1334 printk(KERN_ERR "cpufreq: resume failed in ->resume "
1335 "step on CPU %u\n", cpu_policy->cpu); 1335 "step on CPU %u\n", cpu_policy->cpu);
1336 goto fail; 1336 goto fail;
1337 } 1337 }
1338 } 1338 }
1339 1339
1340 if (!(cpufreq_driver->flags & CPUFREQ_CONST_LOOPS)) { 1340 if (!(cpufreq_driver->flags & CPUFREQ_CONST_LOOPS)) {
1341 unsigned int cur_freq = 0; 1341 unsigned int cur_freq = 0;
1342 1342
1343 if (cpufreq_driver->get) 1343 if (cpufreq_driver->get)
1344 cur_freq = cpufreq_driver->get(cpu_policy->cpu); 1344 cur_freq = cpufreq_driver->get(cpu_policy->cpu);
1345 1345
1346 if (!cur_freq || !cpu_policy->cur) { 1346 if (!cur_freq || !cpu_policy->cur) {
1347 printk(KERN_ERR "cpufreq: resume failed to assert " 1347 printk(KERN_ERR "cpufreq: resume failed to assert "
1348 "current frequency is what timing core " 1348 "current frequency is what timing core "
1349 "thinks it is.\n"); 1349 "thinks it is.\n");
1350 goto out; 1350 goto out;
1351 } 1351 }
1352 1352
1353 if (unlikely(cur_freq != cpu_policy->cur)) { 1353 if (unlikely(cur_freq != cpu_policy->cur)) {
1354 struct cpufreq_freqs freqs; 1354 struct cpufreq_freqs freqs;
1355 1355
1356 if (!(cpufreq_driver->flags & CPUFREQ_PM_NO_WARN)) 1356 if (!(cpufreq_driver->flags & CPUFREQ_PM_NO_WARN))
1357 dprintk("Warning: CPU frequency " 1357 dprintk("Warning: CPU frequency "
1358 "is %u, cpufreq assumed %u kHz.\n", 1358 "is %u, cpufreq assumed %u kHz.\n",
1359 cur_freq, cpu_policy->cur); 1359 cur_freq, cpu_policy->cur);
1360 1360
1361 freqs.cpu = cpu; 1361 freqs.cpu = cpu;
1362 freqs.old = cpu_policy->cur; 1362 freqs.old = cpu_policy->cur;
1363 freqs.new = cur_freq; 1363 freqs.new = cur_freq;
1364 1364
1365 srcu_notifier_call_chain( 1365 srcu_notifier_call_chain(
1366 &cpufreq_transition_notifier_list, 1366 &cpufreq_transition_notifier_list,
1367 CPUFREQ_RESUMECHANGE, &freqs); 1367 CPUFREQ_RESUMECHANGE, &freqs);
1368 adjust_jiffies(CPUFREQ_RESUMECHANGE, &freqs); 1368 adjust_jiffies(CPUFREQ_RESUMECHANGE, &freqs);
1369 1369
1370 cpu_policy->cur = cur_freq; 1370 cpu_policy->cur = cur_freq;
1371 } 1371 }
1372 } 1372 }
1373 1373
1374 out: 1374 out:
1375 schedule_work(&cpu_policy->update); 1375 schedule_work(&cpu_policy->update);
1376 fail: 1376 fail:
1377 cpufreq_cpu_put(cpu_policy); 1377 cpufreq_cpu_put(cpu_policy);
1378 return ret; 1378 return ret;
1379 } 1379 }
1380 1380
1381 static struct sysdev_driver cpufreq_sysdev_driver = { 1381 static struct sysdev_driver cpufreq_sysdev_driver = {
1382 .add = cpufreq_add_dev, 1382 .add = cpufreq_add_dev,
1383 .remove = cpufreq_remove_dev, 1383 .remove = cpufreq_remove_dev,
1384 .suspend = cpufreq_suspend, 1384 .suspend = cpufreq_suspend,
1385 .resume = cpufreq_resume, 1385 .resume = cpufreq_resume,
1386 }; 1386 };
1387 1387
1388 1388
1389 /********************************************************************* 1389 /*********************************************************************
1390 * NOTIFIER LISTS INTERFACE * 1390 * NOTIFIER LISTS INTERFACE *
1391 *********************************************************************/ 1391 *********************************************************************/
1392 1392
1393 /** 1393 /**
1394 * cpufreq_register_notifier - register a driver with cpufreq 1394 * cpufreq_register_notifier - register a driver with cpufreq
1395 * @nb: notifier function to register 1395 * @nb: notifier function to register
1396 * @list: CPUFREQ_TRANSITION_NOTIFIER or CPUFREQ_POLICY_NOTIFIER 1396 * @list: CPUFREQ_TRANSITION_NOTIFIER or CPUFREQ_POLICY_NOTIFIER
1397 * 1397 *
1398 * Add a driver to one of two lists: either a list of drivers that 1398 * Add a driver to one of two lists: either a list of drivers that
1399 * are notified about clock rate changes (once before and once after 1399 * are notified about clock rate changes (once before and once after
1400 * the transition), or a list of drivers that are notified about 1400 * the transition), or a list of drivers that are notified about
1401 * changes in cpufreq policy. 1401 * changes in cpufreq policy.
1402 * 1402 *
1403 * This function may sleep, and has the same return conditions as 1403 * This function may sleep, and has the same return conditions as
1404 * blocking_notifier_chain_register. 1404 * blocking_notifier_chain_register.
1405 */ 1405 */
1406 int cpufreq_register_notifier(struct notifier_block *nb, unsigned int list) 1406 int cpufreq_register_notifier(struct notifier_block *nb, unsigned int list)
1407 { 1407 {
1408 int ret; 1408 int ret;
1409 1409
1410 WARN_ON(!init_cpufreq_transition_notifier_list_called); 1410 WARN_ON(!init_cpufreq_transition_notifier_list_called);
1411 1411
1412 switch (list) { 1412 switch (list) {
1413 case CPUFREQ_TRANSITION_NOTIFIER: 1413 case CPUFREQ_TRANSITION_NOTIFIER:
1414 ret = srcu_notifier_chain_register( 1414 ret = srcu_notifier_chain_register(
1415 &cpufreq_transition_notifier_list, nb); 1415 &cpufreq_transition_notifier_list, nb);
1416 break; 1416 break;
1417 case CPUFREQ_POLICY_NOTIFIER: 1417 case CPUFREQ_POLICY_NOTIFIER:
1418 ret = blocking_notifier_chain_register( 1418 ret = blocking_notifier_chain_register(
1419 &cpufreq_policy_notifier_list, nb); 1419 &cpufreq_policy_notifier_list, nb);
1420 break; 1420 break;
1421 default: 1421 default:
1422 ret = -EINVAL; 1422 ret = -EINVAL;
1423 } 1423 }
1424 1424
1425 return ret; 1425 return ret;
1426 } 1426 }
1427 EXPORT_SYMBOL(cpufreq_register_notifier); 1427 EXPORT_SYMBOL(cpufreq_register_notifier);
1428 1428
1429 1429
1430 /** 1430 /**
1431 * cpufreq_unregister_notifier - unregister a driver with cpufreq 1431 * cpufreq_unregister_notifier - unregister a driver with cpufreq
1432 * @nb: notifier block to be unregistered 1432 * @nb: notifier block to be unregistered
1433 * @list: CPUFREQ_TRANSITION_NOTIFIER or CPUFREQ_POLICY_NOTIFIER 1433 * @list: CPUFREQ_TRANSITION_NOTIFIER or CPUFREQ_POLICY_NOTIFIER
1434 * 1434 *
1435 * Remove a driver from the CPU frequency notifier list. 1435 * Remove a driver from the CPU frequency notifier list.
1436 * 1436 *
1437 * This function may sleep, and has the same return conditions as 1437 * This function may sleep, and has the same return conditions as
1438 * blocking_notifier_chain_unregister. 1438 * blocking_notifier_chain_unregister.
1439 */ 1439 */
1440 int cpufreq_unregister_notifier(struct notifier_block *nb, unsigned int list) 1440 int cpufreq_unregister_notifier(struct notifier_block *nb, unsigned int list)
1441 { 1441 {
1442 int ret; 1442 int ret;
1443 1443
1444 switch (list) { 1444 switch (list) {
1445 case CPUFREQ_TRANSITION_NOTIFIER: 1445 case CPUFREQ_TRANSITION_NOTIFIER:
1446 ret = srcu_notifier_chain_unregister( 1446 ret = srcu_notifier_chain_unregister(
1447 &cpufreq_transition_notifier_list, nb); 1447 &cpufreq_transition_notifier_list, nb);
1448 break; 1448 break;
1449 case CPUFREQ_POLICY_NOTIFIER: 1449 case CPUFREQ_POLICY_NOTIFIER:
1450 ret = blocking_notifier_chain_unregister( 1450 ret = blocking_notifier_chain_unregister(
1451 &cpufreq_policy_notifier_list, nb); 1451 &cpufreq_policy_notifier_list, nb);
1452 break; 1452 break;
1453 default: 1453 default:
1454 ret = -EINVAL; 1454 ret = -EINVAL;
1455 } 1455 }
1456 1456
1457 return ret; 1457 return ret;
1458 } 1458 }
1459 EXPORT_SYMBOL(cpufreq_unregister_notifier); 1459 EXPORT_SYMBOL(cpufreq_unregister_notifier);
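/*
 * Illustrative sketch (hypothetical module, not part of this diff): hooking
 * into the transition chain registered above.  The callback runs once with
 * CPUFREQ_PRECHANGE and once with CPUFREQ_POSTCHANGE around each transition.
 */
#include <linux/cpufreq.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/notifier.h>

static int example_trans_notify(struct notifier_block *nb,
				unsigned long val, void *data)
{
	struct cpufreq_freqs *freqs = data;

	if (val == CPUFREQ_POSTCHANGE)
		pr_info("cpu%u: %u -> %u kHz\n",
			freqs->cpu, freqs->old, freqs->new);
	return NOTIFY_OK;
}

static struct notifier_block example_trans_nb = {
	.notifier_call = example_trans_notify,
};

static int __init example_notifier_init(void)
{
	return cpufreq_register_notifier(&example_trans_nb,
					 CPUFREQ_TRANSITION_NOTIFIER);
}

static void __exit example_notifier_exit(void)
{
	cpufreq_unregister_notifier(&example_trans_nb,
				    CPUFREQ_TRANSITION_NOTIFIER);
}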
1460 1460
1461 1461
1462 /********************************************************************* 1462 /*********************************************************************
1463 * GOVERNORS * 1463 * GOVERNORS *
1464 *********************************************************************/ 1464 *********************************************************************/
1465 1465
1466 1466
1467 int __cpufreq_driver_target(struct cpufreq_policy *policy, 1467 int __cpufreq_driver_target(struct cpufreq_policy *policy,
1468 unsigned int target_freq, 1468 unsigned int target_freq,
1469 unsigned int relation) 1469 unsigned int relation)
1470 { 1470 {
1471 int retval = -EINVAL; 1471 int retval = -EINVAL;
1472 1472
1473 dprintk("target for CPU %u: %u kHz, relation %u\n", policy->cpu, 1473 dprintk("target for CPU %u: %u kHz, relation %u\n", policy->cpu,
1474 target_freq, relation); 1474 target_freq, relation);
1475 if (cpu_online(policy->cpu) && cpufreq_driver->target) 1475 if (cpu_online(policy->cpu) && cpufreq_driver->target)
1476 retval = cpufreq_driver->target(policy, target_freq, relation); 1476 retval = cpufreq_driver->target(policy, target_freq, relation);
1477 1477
1478 return retval; 1478 return retval;
1479 } 1479 }
1480 EXPORT_SYMBOL_GPL(__cpufreq_driver_target); 1480 EXPORT_SYMBOL_GPL(__cpufreq_driver_target);
1481 1481
1482 int cpufreq_driver_target(struct cpufreq_policy *policy, 1482 int cpufreq_driver_target(struct cpufreq_policy *policy,
1483 unsigned int target_freq, 1483 unsigned int target_freq,
1484 unsigned int relation) 1484 unsigned int relation)
1485 { 1485 {
1486 int ret = -EINVAL; 1486 int ret = -EINVAL;
1487 1487
1488 policy = cpufreq_cpu_get(policy->cpu); 1488 policy = cpufreq_cpu_get(policy->cpu);
1489 if (!policy) 1489 if (!policy)
1490 goto no_policy; 1490 goto no_policy;
1491 1491
1492 if (unlikely(lock_policy_rwsem_write(policy->cpu))) 1492 if (unlikely(lock_policy_rwsem_write(policy->cpu)))
1493 goto fail; 1493 goto fail;
1494 1494
1495 ret = __cpufreq_driver_target(policy, target_freq, relation); 1495 ret = __cpufreq_driver_target(policy, target_freq, relation);
1496 1496
1497 unlock_policy_rwsem_write(policy->cpu); 1497 unlock_policy_rwsem_write(policy->cpu);
1498 1498
1499 fail: 1499 fail:
1500 cpufreq_cpu_put(policy); 1500 cpufreq_cpu_put(policy);
1501 no_policy: 1501 no_policy:
1502 return ret; 1502 return ret;
1503 } 1503 }
1504 EXPORT_SYMBOL_GPL(cpufreq_driver_target); 1504 EXPORT_SYMBOL_GPL(cpufreq_driver_target);
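/*
 * Illustrative sketch (hypothetical governor helper, not part of this diff):
 * requesting a frequency change.  Governors call __cpufreq_driver_target()
 * with the policy rwsem already held; other callers use
 * cpufreq_driver_target(), which takes the write lock itself as shown above.
 */
static void example_push_to_max(struct cpufreq_policy *policy)
{
	__cpufreq_driver_target(policy, policy->max, CPUFREQ_RELATION_H);
}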
1505 1505
1506 int __cpufreq_driver_getavg(struct cpufreq_policy *policy, unsigned int cpu) 1506 int __cpufreq_driver_getavg(struct cpufreq_policy *policy, unsigned int cpu)
1507 { 1507 {
1508 int ret = 0; 1508 int ret = 0;
1509 1509
1510 policy = cpufreq_cpu_get(policy->cpu); 1510 policy = cpufreq_cpu_get(policy->cpu);
1511 if (!policy) 1511 if (!policy)
1512 return -EINVAL; 1512 return -EINVAL;
1513 1513
1514 if (cpu_online(cpu) && cpufreq_driver->getavg) 1514 if (cpu_online(cpu) && cpufreq_driver->getavg)
1515 ret = cpufreq_driver->getavg(policy, cpu); 1515 ret = cpufreq_driver->getavg(policy, cpu);
1516 1516
1517 cpufreq_cpu_put(policy); 1517 cpufreq_cpu_put(policy);
1518 return ret; 1518 return ret;
1519 } 1519 }
1520 EXPORT_SYMBOL_GPL(__cpufreq_driver_getavg); 1520 EXPORT_SYMBOL_GPL(__cpufreq_driver_getavg);
1521 1521
1522 /* 1522 /*
1523 * when "event" is CPUFREQ_GOV_LIMITS 1523 * when "event" is CPUFREQ_GOV_LIMITS
1524 */ 1524 */
1525 1525
1526 static int __cpufreq_governor(struct cpufreq_policy *policy, 1526 static int __cpufreq_governor(struct cpufreq_policy *policy,
1527 unsigned int event) 1527 unsigned int event)
1528 { 1528 {
1529 int ret; 1529 int ret;
1530 1530
1531 /* Only must be defined when default governor is known to have latency 1531 /* Only must be defined when default governor is known to have latency
1532 restrictions, like e.g. conservative or ondemand. 1532 restrictions, like e.g. conservative or ondemand.
1533 That this is the case is already ensured in Kconfig 1533 That this is the case is already ensured in Kconfig
1534 */ 1534 */
1535 #ifdef CONFIG_CPU_FREQ_GOV_PERFORMANCE 1535 #ifdef CONFIG_CPU_FREQ_GOV_PERFORMANCE
1536 struct cpufreq_governor *gov = &cpufreq_gov_performance; 1536 struct cpufreq_governor *gov = &cpufreq_gov_performance;
1537 #else 1537 #else
1538 struct cpufreq_governor *gov = NULL; 1538 struct cpufreq_governor *gov = NULL;
1539 #endif 1539 #endif
1540 1540
1541 if (policy->governor->max_transition_latency && 1541 if (policy->governor->max_transition_latency &&
1542 policy->cpuinfo.transition_latency > 1542 policy->cpuinfo.transition_latency >
1543 policy->governor->max_transition_latency) { 1543 policy->governor->max_transition_latency) {
1544 if (!gov) 1544 if (!gov)
1545 return -EINVAL; 1545 return -EINVAL;
1546 else { 1546 else {
1547 printk(KERN_WARNING "%s governor failed, too long" 1547 printk(KERN_WARNING "%s governor failed, too long"
1548 " transition latency of HW, fallback" 1548 " transition latency of HW, fallback"
1549 " to %s governor\n", 1549 " to %s governor\n",
1550 policy->governor->name, 1550 policy->governor->name,
1551 gov->name); 1551 gov->name);
1552 policy->governor = gov; 1552 policy->governor = gov;
1553 } 1553 }
1554 } 1554 }
1555 1555
1556 if (!try_module_get(policy->governor->owner)) 1556 if (!try_module_get(policy->governor->owner))
1557 return -EINVAL; 1557 return -EINVAL;
1558 1558
1559 dprintk("__cpufreq_governor for CPU %u, event %u\n", 1559 dprintk("__cpufreq_governor for CPU %u, event %u\n",
1560 policy->cpu, event); 1560 policy->cpu, event);
1561 ret = policy->governor->governor(policy, event); 1561 ret = policy->governor->governor(policy, event);
1562 1562
1563 /* we keep one module reference alive for 1563 /* we keep one module reference alive for
1564 each CPU governed by this CPU */ 1564 each CPU governed by this CPU */
1565 if ((event != CPUFREQ_GOV_START) || ret) 1565 if ((event != CPUFREQ_GOV_START) || ret)
1566 module_put(policy->governor->owner); 1566 module_put(policy->governor->owner);
1567 if ((event == CPUFREQ_GOV_STOP) && !ret) 1567 if ((event == CPUFREQ_GOV_STOP) && !ret)
1568 module_put(policy->governor->owner); 1568 module_put(policy->governor->owner);
1569 1569
1570 return ret; 1570 return ret;
1571 } 1571 }
1572 1572
1573 1573
1574 int cpufreq_register_governor(struct cpufreq_governor *governor) 1574 int cpufreq_register_governor(struct cpufreq_governor *governor)
1575 { 1575 {
1576 int err; 1576 int err;
1577 1577
1578 if (!governor) 1578 if (!governor)
1579 return -EINVAL; 1579 return -EINVAL;
1580 1580
1581 mutex_lock(&cpufreq_governor_mutex); 1581 mutex_lock(&cpufreq_governor_mutex);
1582 1582
1583 err = -EBUSY; 1583 err = -EBUSY;
1584 if (__find_governor(governor->name) == NULL) { 1584 if (__find_governor(governor->name) == NULL) {
1585 err = 0; 1585 err = 0;
1586 list_add(&governor->governor_list, &cpufreq_governor_list); 1586 list_add(&governor->governor_list, &cpufreq_governor_list);
1587 } 1587 }
1588 1588
1589 mutex_unlock(&cpufreq_governor_mutex); 1589 mutex_unlock(&cpufreq_governor_mutex);
1590 return err; 1590 return err;
1591 } 1591 }
1592 EXPORT_SYMBOL_GPL(cpufreq_register_governor); 1592 EXPORT_SYMBOL_GPL(cpufreq_register_governor);
1593 1593
1594 1594
1595 void cpufreq_unregister_governor(struct cpufreq_governor *governor) 1595 void cpufreq_unregister_governor(struct cpufreq_governor *governor)
1596 { 1596 {
1597 if (!governor) 1597 if (!governor)
1598 return; 1598 return;
1599 1599
1600 mutex_lock(&cpufreq_governor_mutex); 1600 mutex_lock(&cpufreq_governor_mutex);
1601 list_del(&governor->governor_list); 1601 list_del(&governor->governor_list);
1602 mutex_unlock(&cpufreq_governor_mutex); 1602 mutex_unlock(&cpufreq_governor_mutex);
1603 return; 1603 return;
1604 } 1604 }
1605 EXPORT_SYMBOL_GPL(cpufreq_unregister_governor); 1605 EXPORT_SYMBOL_GPL(cpufreq_unregister_governor);
1606 1606
1607 1607
1608 1608
1609 /********************************************************************* 1609 /*********************************************************************
1610 * POLICY INTERFACE * 1610 * POLICY INTERFACE *
1611 *********************************************************************/ 1611 *********************************************************************/
1612 1612
1613 /** 1613 /**
1614 * cpufreq_get_policy - get the current cpufreq_policy 1614 * cpufreq_get_policy - get the current cpufreq_policy
1615 * @policy: struct cpufreq_policy into which the current cpufreq_policy 1615 * @policy: struct cpufreq_policy into which the current cpufreq_policy
1616 * is written 1616 * is written
1617 * 1617 *
1618 * Reads the current cpufreq policy. 1618 * Reads the current cpufreq policy.
1619 */ 1619 */
1620 int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu) 1620 int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu)
1621 { 1621 {
1622 struct cpufreq_policy *cpu_policy; 1622 struct cpufreq_policy *cpu_policy;
1623 if (!policy) 1623 if (!policy)
1624 return -EINVAL; 1624 return -EINVAL;
1625 1625
1626 cpu_policy = cpufreq_cpu_get(cpu); 1626 cpu_policy = cpufreq_cpu_get(cpu);
1627 if (!cpu_policy) 1627 if (!cpu_policy)
1628 return -EINVAL; 1628 return -EINVAL;
1629 1629
1630 memcpy(policy, cpu_policy, sizeof(struct cpufreq_policy)); 1630 memcpy(policy, cpu_policy, sizeof(struct cpufreq_policy));
1631 1631
1632 cpufreq_cpu_put(cpu_policy); 1632 cpufreq_cpu_put(cpu_policy);
1633 return 0; 1633 return 0;
1634 } 1634 }
1635 EXPORT_SYMBOL(cpufreq_get_policy); 1635 EXPORT_SYMBOL(cpufreq_get_policy);
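/*
 * Illustrative sketch (hypothetical consumer, not part of this diff): taking
 * a snapshot of a CPU's policy.  cpufreq_get_policy() memcpy()s the live
 * policy into the caller's buffer, so the copy can be inspected without
 * holding any cpufreq locks.
 */
static void example_show_limits(unsigned int cpu)
{
	struct cpufreq_policy policy;

	if (cpufreq_get_policy(&policy, cpu))
		return;		/* no cpufreq policy for this CPU */

	pr_info("cpu%u: limits %u - %u kHz\n", cpu, policy.min, policy.max);
}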
1636 1636
1637 1637
1638 /* 1638 /*
1639 * data : current policy. 1639 * data : current policy.
1640 * policy : policy to be set. 1640 * policy : policy to be set.
1641 */ 1641 */
1642 static int __cpufreq_set_policy(struct cpufreq_policy *data, 1642 static int __cpufreq_set_policy(struct cpufreq_policy *data,
1643 struct cpufreq_policy *policy) 1643 struct cpufreq_policy *policy)
1644 { 1644 {
1645 int ret = 0; 1645 int ret = 0;
1646 1646
1647 cpufreq_debug_disable_ratelimit(); 1647 cpufreq_debug_disable_ratelimit();
1648 dprintk("setting new policy for CPU %u: %u - %u kHz\n", policy->cpu, 1648 dprintk("setting new policy for CPU %u: %u - %u kHz\n", policy->cpu,
1649 policy->min, policy->max); 1649 policy->min, policy->max);
1650 1650
1651 memcpy(&policy->cpuinfo, &data->cpuinfo, 1651 memcpy(&policy->cpuinfo, &data->cpuinfo,
1652 sizeof(struct cpufreq_cpuinfo)); 1652 sizeof(struct cpufreq_cpuinfo));
1653 1653
1654 if (policy->min > data->max || policy->max < data->min) { 1654 if (policy->min > data->max || policy->max < data->min) {
1655 ret = -EINVAL; 1655 ret = -EINVAL;
1656 goto error_out; 1656 goto error_out;
1657 } 1657 }
1658 1658
1659 /* verify the cpu speed can be set within this limit */ 1659 /* verify the cpu speed can be set within this limit */
1660 ret = cpufreq_driver->verify(policy); 1660 ret = cpufreq_driver->verify(policy);
1661 if (ret) 1661 if (ret)
1662 goto error_out; 1662 goto error_out;
1663 1663
1664 /* adjust if necessary - all reasons */ 1664 /* adjust if necessary - all reasons */
1665 blocking_notifier_call_chain(&cpufreq_policy_notifier_list, 1665 blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
1666 CPUFREQ_ADJUST, policy); 1666 CPUFREQ_ADJUST, policy);
1667 1667
1668 /* adjust if necessary - hardware incompatibility*/ 1668 /* adjust if necessary - hardware incompatibility*/
1669 blocking_notifier_call_chain(&cpufreq_policy_notifier_list, 1669 blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
1670 CPUFREQ_INCOMPATIBLE, policy); 1670 CPUFREQ_INCOMPATIBLE, policy);
1671 1671
1672 /* verify the cpu speed can be set within this limit, 1672 /* verify the cpu speed can be set within this limit,
1673 which might be different to the first one */ 1673 which might be different to the first one */
1674 ret = cpufreq_driver->verify(policy); 1674 ret = cpufreq_driver->verify(policy);
1675 if (ret) 1675 if (ret)
1676 goto error_out; 1676 goto error_out;
1677 1677
1678 /* notification of the new policy */ 1678 /* notification of the new policy */
1679 blocking_notifier_call_chain(&cpufreq_policy_notifier_list, 1679 blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
1680 CPUFREQ_NOTIFY, policy); 1680 CPUFREQ_NOTIFY, policy);
1681 1681
1682 data->min = policy->min; 1682 data->min = policy->min;
1683 data->max = policy->max; 1683 data->max = policy->max;
1684 1684
1685 dprintk("new min and max freqs are %u - %u kHz\n", 1685 dprintk("new min and max freqs are %u - %u kHz\n",
1686 data->min, data->max); 1686 data->min, data->max);
1687 1687
1688 if (cpufreq_driver->setpolicy) { 1688 if (cpufreq_driver->setpolicy) {
1689 data->policy = policy->policy; 1689 data->policy = policy->policy;
1690 dprintk("setting range\n"); 1690 dprintk("setting range\n");
1691 ret = cpufreq_driver->setpolicy(policy); 1691 ret = cpufreq_driver->setpolicy(policy);
1692 } else { 1692 } else {
1693 if (policy->governor != data->governor) { 1693 if (policy->governor != data->governor) {
1694 /* save old, working values */ 1694 /* save old, working values */
1695 struct cpufreq_governor *old_gov = data->governor; 1695 struct cpufreq_governor *old_gov = data->governor;
1696 1696
1697 dprintk("governor switch\n"); 1697 dprintk("governor switch\n");
1698 1698
1699 /* end old governor */ 1699 /* end old governor */
1700 if (data->governor) 1700 if (data->governor)
1701 __cpufreq_governor(data, CPUFREQ_GOV_STOP); 1701 __cpufreq_governor(data, CPUFREQ_GOV_STOP);
1702 1702
1703 /* start new governor */ 1703 /* start new governor */
1704 data->governor = policy->governor; 1704 data->governor = policy->governor;
1705 if (__cpufreq_governor(data, CPUFREQ_GOV_START)) { 1705 if (__cpufreq_governor(data, CPUFREQ_GOV_START)) {
1706 /* new governor failed, so re-start old one */ 1706 /* new governor failed, so re-start old one */
1707 dprintk("starting governor %s failed\n", 1707 dprintk("starting governor %s failed\n",
1708 data->governor->name); 1708 data->governor->name);
1709 if (old_gov) { 1709 if (old_gov) {
1710 data->governor = old_gov; 1710 data->governor = old_gov;
1711 __cpufreq_governor(data, 1711 __cpufreq_governor(data,
1712 CPUFREQ_GOV_START); 1712 CPUFREQ_GOV_START);
1713 } 1713 }
1714 ret = -EINVAL; 1714 ret = -EINVAL;
1715 goto error_out; 1715 goto error_out;
1716 } 1716 }
1717 /* might be a policy change, too, so fall through */ 1717 /* might be a policy change, too, so fall through */
1718 } 1718 }
1719 dprintk("governor: change or update limits\n"); 1719 dprintk("governor: change or update limits\n");
1720 __cpufreq_governor(data, CPUFREQ_GOV_LIMITS); 1720 __cpufreq_governor(data, CPUFREQ_GOV_LIMITS);
1721 } 1721 }
1722 1722
1723 error_out: 1723 error_out:
1724 cpufreq_debug_enable_ratelimit(); 1724 cpufreq_debug_enable_ratelimit();
1725 return ret; 1725 return ret;
1726 } 1726 }
1727 1727
1728 /** 1728 /**
1729 * cpufreq_update_policy - re-evaluate an existing cpufreq policy 1729 * cpufreq_update_policy - re-evaluate an existing cpufreq policy
1730 * @cpu: CPU which shall be re-evaluated 1730 * @cpu: CPU which shall be re-evaluated
1731 * 1731 *
1732 * Useful for policy notifiers which have different necessities 1732 * Useful for policy notifiers which have different necessities
1733 * at different times. 1733 * at different times.
1734 */ 1734 */
1735 int cpufreq_update_policy(unsigned int cpu) 1735 int cpufreq_update_policy(unsigned int cpu)
1736 { 1736 {
1737 struct cpufreq_policy *data = cpufreq_cpu_get(cpu); 1737 struct cpufreq_policy *data = cpufreq_cpu_get(cpu);
1738 struct cpufreq_policy policy; 1738 struct cpufreq_policy policy;
1739 int ret; 1739 int ret;
1740 1740
1741 if (!data) { 1741 if (!data) {
1742 ret = -ENODEV; 1742 ret = -ENODEV;
1743 goto no_policy; 1743 goto no_policy;
1744 } 1744 }
1745 1745
1746 if (unlikely(lock_policy_rwsem_write(cpu))) { 1746 if (unlikely(lock_policy_rwsem_write(cpu))) {
1747 ret = -EINVAL; 1747 ret = -EINVAL;
1748 goto fail; 1748 goto fail;
1749 } 1749 }
1750 1750
1751 dprintk("updating policy for CPU %u\n", cpu); 1751 dprintk("updating policy for CPU %u\n", cpu);
1752 memcpy(&policy, data, sizeof(struct cpufreq_policy)); 1752 memcpy(&policy, data, sizeof(struct cpufreq_policy));
1753 policy.min = data->user_policy.min; 1753 policy.min = data->user_policy.min;
1754 policy.max = data->user_policy.max; 1754 policy.max = data->user_policy.max;
1755 policy.policy = data->user_policy.policy; 1755 policy.policy = data->user_policy.policy;
1756 policy.governor = data->user_policy.governor; 1756 policy.governor = data->user_policy.governor;
1757 1757
1758 /* BIOS might change freq behind our back 1758 /* BIOS might change freq behind our back
1759 -> ask driver for current freq and notify governors about a change */ 1759 -> ask driver for current freq and notify governors about a change */
1760 if (cpufreq_driver->get) { 1760 if (cpufreq_driver->get) {
1761 policy.cur = cpufreq_driver->get(cpu); 1761 policy.cur = cpufreq_driver->get(cpu);
1762 if (!data->cur) { 1762 if (!data->cur) {
1763 dprintk("Driver did not initialize current freq"); 1763 dprintk("Driver did not initialize current freq");
1764 data->cur = policy.cur; 1764 data->cur = policy.cur;
1765 } else { 1765 } else {
1766 if (data->cur != policy.cur) 1766 if (data->cur != policy.cur)
1767 cpufreq_out_of_sync(cpu, data->cur, 1767 cpufreq_out_of_sync(cpu, data->cur,
1768 policy.cur); 1768 policy.cur);
1769 } 1769 }
1770 } 1770 }
1771 1771
1772 ret = __cpufreq_set_policy(data, &policy); 1772 ret = __cpufreq_set_policy(data, &policy);
1773 1773
1774 unlock_policy_rwsem_write(cpu); 1774 unlock_policy_rwsem_write(cpu);
1775 1775
1776 fail: 1776 fail:
1777 cpufreq_cpu_put(data); 1777 cpufreq_cpu_put(data);
1778 no_policy: 1778 no_policy:
1779 return ret; 1779 return ret;
1780 } 1780 }
1781 EXPORT_SYMBOL(cpufreq_update_policy); 1781 EXPORT_SYMBOL(cpufreq_update_policy);
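/*
 * Illustrative sketch (assumed platform hook, not part of this diff): when
 * firmware changes the allowed limits behind the kernel's back (e.g. an ACPI
 * _PPC notification), platform code re-evaluates the policy, which re-runs
 * the CPUFREQ_ADJUST/CPUFREQ_NOTIFY chains seen above.
 */
static void example_limits_changed(unsigned int cpu)
{
	cpufreq_update_policy(cpu);
}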
1782 1782
1783 static int __cpuinit cpufreq_cpu_callback(struct notifier_block *nfb, 1783 static int __cpuinit cpufreq_cpu_callback(struct notifier_block *nfb,
1784 unsigned long action, void *hcpu) 1784 unsigned long action, void *hcpu)
1785 { 1785 {
1786 unsigned int cpu = (unsigned long)hcpu; 1786 unsigned int cpu = (unsigned long)hcpu;
1787 struct sys_device *sys_dev; 1787 struct sys_device *sys_dev;
1788 1788
1789 sys_dev = get_cpu_sysdev(cpu); 1789 sys_dev = get_cpu_sysdev(cpu);
1790 if (sys_dev) { 1790 if (sys_dev) {
1791 switch (action) { 1791 switch (action) {
1792 case CPU_ONLINE: 1792 case CPU_ONLINE:
1793 case CPU_ONLINE_FROZEN: 1793 case CPU_ONLINE_FROZEN:
1794 cpufreq_add_dev(sys_dev); 1794 cpufreq_add_dev(sys_dev);
1795 break; 1795 break;
1796 case CPU_DOWN_PREPARE: 1796 case CPU_DOWN_PREPARE:
1797 case CPU_DOWN_PREPARE_FROZEN: 1797 case CPU_DOWN_PREPARE_FROZEN:
1798 if (unlikely(lock_policy_rwsem_write(cpu))) 1798 if (unlikely(lock_policy_rwsem_write(cpu)))
1799 BUG(); 1799 BUG();
1800 1800
1801 __cpufreq_remove_dev(sys_dev); 1801 __cpufreq_remove_dev(sys_dev);
1802 break; 1802 break;
1803 case CPU_DOWN_FAILED: 1803 case CPU_DOWN_FAILED:
1804 case CPU_DOWN_FAILED_FROZEN: 1804 case CPU_DOWN_FAILED_FROZEN:
1805 cpufreq_add_dev(sys_dev); 1805 cpufreq_add_dev(sys_dev);
1806 break; 1806 break;
1807 } 1807 }
1808 } 1808 }
1809 return NOTIFY_OK; 1809 return NOTIFY_OK;
1810 } 1810 }
1811 1811
1812 static struct notifier_block __refdata cpufreq_cpu_notifier = 1812 static struct notifier_block __refdata cpufreq_cpu_notifier =
1813 { 1813 {
1814 .notifier_call = cpufreq_cpu_callback, 1814 .notifier_call = cpufreq_cpu_callback,
1815 }; 1815 };
1816 1816
1817 /********************************************************************* 1817 /*********************************************************************
1818 * REGISTER / UNREGISTER CPUFREQ DRIVER * 1818 * REGISTER / UNREGISTER CPUFREQ DRIVER *
1819 *********************************************************************/ 1819 *********************************************************************/
1820 1820
1821 /** 1821 /**
1822 * cpufreq_register_driver - register a CPU Frequency driver 1822 * cpufreq_register_driver - register a CPU Frequency driver
1823 * @driver_data: A struct cpufreq_driver containing the values 1823 * @driver_data: A struct cpufreq_driver containing the values
1824 * submitted by the CPU Frequency driver. 1824 * submitted by the CPU Frequency driver.
1825 * 1825 *
1826 * Registers a CPU Frequency driver to this core code. This code 1826 * Registers a CPU Frequency driver to this core code. This code
1827 * returns zero on success, -EBUSY when another driver got here first 1827 * returns zero on success, -EBUSY when another driver got here first
1828 * (and isn't unregistered in the meantime). 1828 * (and isn't unregistered in the meantime).
1829 * 1829 *
1830 */ 1830 */
1831 int cpufreq_register_driver(struct cpufreq_driver *driver_data) 1831 int cpufreq_register_driver(struct cpufreq_driver *driver_data)
1832 { 1832 {
1833 unsigned long flags; 1833 unsigned long flags;
1834 int ret; 1834 int ret;
1835 1835
1836 if (!driver_data || !driver_data->verify || !driver_data->init || 1836 if (!driver_data || !driver_data->verify || !driver_data->init ||
1837 ((!driver_data->setpolicy) && (!driver_data->target))) 1837 ((!driver_data->setpolicy) && (!driver_data->target)))
1838 return -EINVAL; 1838 return -EINVAL;
1839 1839
1840 dprintk("trying to register driver %s\n", driver_data->name); 1840 dprintk("trying to register driver %s\n", driver_data->name);
1841 1841
1842 if (driver_data->setpolicy) 1842 if (driver_data->setpolicy)
1843 driver_data->flags |= CPUFREQ_CONST_LOOPS; 1843 driver_data->flags |= CPUFREQ_CONST_LOOPS;
1844 1844
1845 spin_lock_irqsave(&cpufreq_driver_lock, flags); 1845 spin_lock_irqsave(&cpufreq_driver_lock, flags);
1846 if (cpufreq_driver) { 1846 if (cpufreq_driver) {
1847 spin_unlock_irqrestore(&cpufreq_driver_lock, flags); 1847 spin_unlock_irqrestore(&cpufreq_driver_lock, flags);
1848 return -EBUSY; 1848 return -EBUSY;
1849 } 1849 }
1850 cpufreq_driver = driver_data; 1850 cpufreq_driver = driver_data;
1851 spin_unlock_irqrestore(&cpufreq_driver_lock, flags); 1851 spin_unlock_irqrestore(&cpufreq_driver_lock, flags);
1852 1852
1853 ret = sysdev_driver_register(&cpu_sysdev_class, 1853 ret = sysdev_driver_register(&cpu_sysdev_class,
1854 &cpufreq_sysdev_driver); 1854 &cpufreq_sysdev_driver);
1855 1855
1856 if ((!ret) && !(cpufreq_driver->flags & CPUFREQ_STICKY)) { 1856 if ((!ret) && !(cpufreq_driver->flags & CPUFREQ_STICKY)) {
1857 int i; 1857 int i;
1858 ret = -ENODEV; 1858 ret = -ENODEV;
1859 1859
1860 /* check for at least one working CPU */ 1860 /* check for at least one working CPU */
1861 for (i = 0; i < nr_cpu_ids; i++) 1861 for (i = 0; i < nr_cpu_ids; i++)
1862 if (cpu_possible(i) && per_cpu(cpufreq_cpu_data, i)) { 1862 if (cpu_possible(i) && per_cpu(cpufreq_cpu_data, i)) {
1863 ret = 0; 1863 ret = 0;
1864 break; 1864 break;
1865 } 1865 }
1866 1866
1867 /* if all ->init() calls failed, unregister */ 1867 /* if all ->init() calls failed, unregister */
1868 if (ret) { 1868 if (ret) {
1869 dprintk("no CPU initialized for driver %s\n", 1869 dprintk("no CPU initialized for driver %s\n",
1870 driver_data->name); 1870 driver_data->name);
1871 sysdev_driver_unregister(&cpu_sysdev_class, 1871 sysdev_driver_unregister(&cpu_sysdev_class,
1872 &cpufreq_sysdev_driver); 1872 &cpufreq_sysdev_driver);
1873 1873
1874 spin_lock_irqsave(&cpufreq_driver_lock, flags); 1874 spin_lock_irqsave(&cpufreq_driver_lock, flags);
1875 cpufreq_driver = NULL; 1875 cpufreq_driver = NULL;
1876 spin_unlock_irqrestore(&cpufreq_driver_lock, flags); 1876 spin_unlock_irqrestore(&cpufreq_driver_lock, flags);
1877 } 1877 }
1878 } 1878 }
1879 1879
1880 if (!ret) { 1880 if (!ret) {
1881 register_hotcpu_notifier(&cpufreq_cpu_notifier); 1881 register_hotcpu_notifier(&cpufreq_cpu_notifier);
1882 dprintk("driver %s up and running\n", driver_data->name); 1882 dprintk("driver %s up and running\n", driver_data->name);
1883 cpufreq_debug_enable_ratelimit(); 1883 cpufreq_debug_enable_ratelimit();
1884 } 1884 }
1885 1885
1886 return ret; 1886 return ret;
1887 } 1887 }
1888 EXPORT_SYMBOL_GPL(cpufreq_register_driver); 1888 EXPORT_SYMBOL_GPL(cpufreq_register_driver);
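/*
 * Illustrative sketch (hypothetical driver skeleton, not part of this diff):
 * the minimum a ->target style driver supplies before calling
 * cpufreq_register_driver().  Frequencies and latency are made-up values.
 */
static int example_verify(struct cpufreq_policy *policy)
{
	cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq,
				     policy->cpuinfo.max_freq);
	return 0;
}

static int example_target(struct cpufreq_policy *policy,
			  unsigned int target_freq, unsigned int relation)
{
	/* program the hardware here, posting PRE/POSTCHANGE notifications */
	return 0;
}

static int example_cpu_init(struct cpufreq_policy *policy)
{
	policy->cpuinfo.min_freq = 800000;		/* kHz, assumed */
	policy->cpuinfo.max_freq = 2000000;		/* kHz, assumed */
	policy->cpuinfo.transition_latency = 100000;	/* ns, assumed */
	policy->min = policy->cpuinfo.min_freq;
	policy->max = policy->cpuinfo.max_freq;
	policy->cur = policy->cpuinfo.max_freq;
	return 0;
}

static struct cpufreq_driver example_driver = {
	.name	= "example",
	.verify	= example_verify,
	.target	= example_target,
	.init	= example_cpu_init,
};

static int __init example_driver_init(void)
{
	return cpufreq_register_driver(&example_driver);
}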
1889 1889
1890 1890
1891 /** 1891 /**
1892 * cpufreq_unregister_driver - unregister the current CPUFreq driver 1892 * cpufreq_unregister_driver - unregister the current CPUFreq driver
1893 * 1893 *
1894 * Unregister the current CPUFreq driver. Only call this if you have 1894 * Unregister the current CPUFreq driver. Only call this if you have
1895 * the right to do so, i.e. if you have succeeded in initialising before! 1895 * the right to do so, i.e. if you have succeeded in initialising before!
1896 * Returns zero if successful, and -EINVAL if the cpufreq_driver is 1896 * Returns zero if successful, and -EINVAL if the cpufreq_driver is
1897 * currently not initialised. 1897 * currently not initialised.
1898 */ 1898 */
1899 int cpufreq_unregister_driver(struct cpufreq_driver *driver) 1899 int cpufreq_unregister_driver(struct cpufreq_driver *driver)
1900 { 1900 {
1901 unsigned long flags; 1901 unsigned long flags;
1902 1902
1903 cpufreq_debug_disable_ratelimit(); 1903 cpufreq_debug_disable_ratelimit();
1904 1904
1905 if (!cpufreq_driver || (driver != cpufreq_driver)) { 1905 if (!cpufreq_driver || (driver != cpufreq_driver)) {
1906 cpufreq_debug_enable_ratelimit(); 1906 cpufreq_debug_enable_ratelimit();
1907 return -EINVAL; 1907 return -EINVAL;
1908 } 1908 }
1909 1909
1910 dprintk("unregistering driver %s\n", driver->name); 1910 dprintk("unregistering driver %s\n", driver->name);
1911 1911
1912 sysdev_driver_unregister(&cpu_sysdev_class, &cpufreq_sysdev_driver); 1912 sysdev_driver_unregister(&cpu_sysdev_class, &cpufreq_sysdev_driver);
1913 unregister_hotcpu_notifier(&cpufreq_cpu_notifier); 1913 unregister_hotcpu_notifier(&cpufreq_cpu_notifier);
1914 1914
1915 spin_lock_irqsave(&cpufreq_driver_lock, flags); 1915 spin_lock_irqsave(&cpufreq_driver_lock, flags);
1916 cpufreq_driver = NULL; 1916 cpufreq_driver = NULL;
1917 spin_unlock_irqrestore(&cpufreq_driver_lock, flags); 1917 spin_unlock_irqrestore(&cpufreq_driver_lock, flags);
1918 1918
1919 return 0; 1919 return 0;
1920 } 1920 }
1921 EXPORT_SYMBOL_GPL(cpufreq_unregister_driver); 1921 EXPORT_SYMBOL_GPL(cpufreq_unregister_driver);
1922 1922
1923 static int __init cpufreq_core_init(void) 1923 static int __init cpufreq_core_init(void)
1924 { 1924 {
1925 int cpu; 1925 int cpu;
1926 1926
1927 for_each_possible_cpu(cpu) { 1927 for_each_possible_cpu(cpu) {
1928 per_cpu(policy_cpu, cpu) = -1; 1928 per_cpu(policy_cpu, cpu) = -1;
1929 init_rwsem(&per_cpu(cpu_policy_rwsem, cpu)); 1929 init_rwsem(&per_cpu(cpu_policy_rwsem, cpu));
1930 } 1930 }
1931 return 0; 1931 return 0;
1932 } 1932 }
1933 1933
1934 core_initcall(cpufreq_core_init); 1934 core_initcall(cpufreq_core_init);
1935 1935
kernel/sched_cpupri.c
1 /* 1 /*
2 * kernel/sched_cpupri.c 2 * kernel/sched_cpupri.c
3 * 3 *
4 * CPU priority management 4 * CPU priority management
5 * 5 *
6 * Copyright (C) 2007-2008 Novell 6 * Copyright (C) 2007-2008 Novell
7 * 7 *
8 * Author: Gregory Haskins <ghaskins@novell.com> 8 * Author: Gregory Haskins <ghaskins@novell.com>
9 * 9 *
10 * This code tracks the priority of each CPU so that global migration 10 * This code tracks the priority of each CPU so that global migration
11 * decisions are easy to calculate. Each CPU can be in a state as follows: 11 * decisions are easy to calculate. Each CPU can be in a state as follows:
12 * 12 *
13 * (INVALID), IDLE, NORMAL, RT1, ... RT99 13 * (INVALID), IDLE, NORMAL, RT1, ... RT99
14 * 14 *
15 * going from the lowest priority to the highest. CPUs in the INVALID state 15 * going from the lowest priority to the highest. CPUs in the INVALID state
16 * are not eligible for routing. The system maintains this state with 16 * are not eligible for routing. The system maintains this state with
17 * a 2 dimensional bitmap (the first for priority class, the second for cpus 17 * a 2 dimensional bitmap (the first for priority class, the second for cpus
18 * in that class). Therefore a typical application without affinity 18 * in that class). Therefore a typical application without affinity
19 * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit 19 * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit
20 * searches). For tasks with affinity restrictions, the algorithm has a 20 * searches). For tasks with affinity restrictions, the algorithm has a
21 * worst case complexity of O(min(102, nr_domcpus)), though the scenario that 21 * worst case complexity of O(min(102, nr_domcpus)), though the scenario that
22 * yields the worst case search is fairly contrived. 22 * yields the worst case search is fairly contrived.
23 * 23 *
24 * This program is free software; you can redistribute it and/or 24 * This program is free software; you can redistribute it and/or
25 * modify it under the terms of the GNU General Public License 25 * modify it under the terms of the GNU General Public License
26 * as published by the Free Software Foundation; version 2 26 * as published by the Free Software Foundation; version 2
27 * of the License. 27 * of the License.
28 */ 28 */
29 29
30 #include "sched_cpupri.h" 30 #include "sched_cpupri.h"
31 31
32 /* Convert between a 140 based task->prio, and our 102 based cpupri */ 32 /* Convert between a 140 based task->prio, and our 102 based cpupri */
33 static int convert_prio(int prio) 33 static int convert_prio(int prio)
34 { 34 {
35 int cpupri; 35 int cpupri;
36 36
37 if (prio == CPUPRI_INVALID) 37 if (prio == CPUPRI_INVALID)
38 cpupri = CPUPRI_INVALID; 38 cpupri = CPUPRI_INVALID;
39 else if (prio == MAX_PRIO) 39 else if (prio == MAX_PRIO)
40 cpupri = CPUPRI_IDLE; 40 cpupri = CPUPRI_IDLE;
41 else if (prio >= MAX_RT_PRIO) 41 else if (prio >= MAX_RT_PRIO)
42 cpupri = CPUPRI_NORMAL; 42 cpupri = CPUPRI_NORMAL;
43 else 43 else
44 cpupri = MAX_RT_PRIO - prio + 1; 44 cpupri = MAX_RT_PRIO - prio + 1;
45 45
46 return cpupri; 46 return cpupri;
47 } 47 }
48 48
49 #define for_each_cpupri_active(array, idx) \ 49 #define for_each_cpupri_active(array, idx) \
50 for (idx = find_first_bit(array, CPUPRI_NR_PRIORITIES); \ 50 for (idx = find_first_bit(array, CPUPRI_NR_PRIORITIES); \
51 idx < CPUPRI_NR_PRIORITIES; \ 51 idx < CPUPRI_NR_PRIORITIES; \
52 idx = find_next_bit(array, CPUPRI_NR_PRIORITIES, idx+1)) 52 idx = find_next_bit(array, CPUPRI_NR_PRIORITIES, idx+1))
53 53
54 /** 54 /**
55 * cpupri_find - find the best (lowest-pri) CPU in the system 55 * cpupri_find - find the best (lowest-pri) CPU in the system
56 * @cp: The cpupri context 56 * @cp: The cpupri context
57 * @p: The task 57 * @p: The task
58 * @lowest_mask: A mask to fill in with selected CPUs (or NULL) 58 * @lowest_mask: A mask to fill in with selected CPUs (or NULL)
59 * 59 *
60 * Note: This function returns the recommended CPUs as calculated during the 60 * Note: This function returns the recommended CPUs as calculated during the
61 * current invocation. By the time the call returns, the CPUs may have in 61 * current invocation. By the time the call returns, the CPUs may have in
62 * fact changed priorities any number of times. While not ideal, it is not 62 * fact changed priorities any number of times. While not ideal, it is not
63 * an issue of correctness since the normal rebalancer logic will correct 63 * an issue of correctness since the normal rebalancer logic will correct
64 * any discrepancies created by racing against the uncertainty of the current 64 * any discrepancies created by racing against the uncertainty of the current
65 * priority configuration. 65 * priority configuration.
66 * 66 *
67 * Returns: (int)bool - CPUs were found 67 * Returns: (int)bool - CPUs were found
68 */ 68 */
69 int cpupri_find(struct cpupri *cp, struct task_struct *p, 69 int cpupri_find(struct cpupri *cp, struct task_struct *p,
70 struct cpumask *lowest_mask) 70 struct cpumask *lowest_mask)
71 { 71 {
72 int idx = 0; 72 int idx = 0;
73 int task_pri = convert_prio(p->prio); 73 int task_pri = convert_prio(p->prio);
74 74
75 for_each_cpupri_active(cp->pri_active, idx) { 75 for_each_cpupri_active(cp->pri_active, idx) {
76 struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; 76 struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
77 77
78 if (idx >= task_pri) 78 if (idx >= task_pri)
79 break; 79 break;
80 80
81 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) 81 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
82 continue; 82 continue;
83 83
84 if (lowest_mask) 84 if (lowest_mask)
85 cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask); 85 cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
86 return 1; 86 return 1;
87 } 87 }
88 88
89 return 0; 89 return 0;
90 } 90 }
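/*
 * Illustrative sketch (assumed caller, not part of this diff): this mirrors
 * how the RT scheduler's find_lowest_rq() path is expected to use
 * cpupri_find(), assuming root_domain carries a struct cpupri as in
 * kernel/sched.c and that lowest_mask is a preallocated scratch cpumask.
 */
static int example_pick_cpu(struct task_struct *p, struct cpumask *lowest_mask)
{
	struct cpupri *cp = &task_rq(p)->rd->cpupri;

	if (!cpupri_find(cp, p, lowest_mask))
		return -1;	/* every eligible CPU runs at >= p's priority */

	/* keep the task where it is if its CPU is already lowest priority */
	if (cpumask_test_cpu(task_cpu(p), lowest_mask))
		return task_cpu(p);

	return cpumask_any(lowest_mask);
}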
91 91
92 /** 92 /**
93 * cpupri_set - update the cpu priority setting 93 * cpupri_set - update the cpu priority setting
94 * @cp: The cpupri context 94 * @cp: The cpupri context
95 * @cpu: The target cpu 95 * @cpu: The target cpu
96 * @pri: The priority (INVALID-RT99) to assign to this CPU 96 * @pri: The priority (INVALID-RT99) to assign to this CPU
97 * 97 *
98 * Note: Assumes cpu_rq(cpu)->lock is locked 98 * Note: Assumes cpu_rq(cpu)->lock is locked
99 * 99 *
100 * Returns: (void) 100 * Returns: (void)
101 */ 101 */
102 void cpupri_set(struct cpupri *cp, int cpu, int newpri) 102 void cpupri_set(struct cpupri *cp, int cpu, int newpri)
103 { 103 {
104 int *currpri = &cp->cpu_to_pri[cpu]; 104 int *currpri = &cp->cpu_to_pri[cpu];
105 int oldpri = *currpri; 105 int oldpri = *currpri;
106 unsigned long flags; 106 unsigned long flags;
107 107
108 newpri = convert_prio(newpri); 108 newpri = convert_prio(newpri);
109 109
110 BUG_ON(newpri >= CPUPRI_NR_PRIORITIES); 110 BUG_ON(newpri >= CPUPRI_NR_PRIORITIES);
111 111
112 if (newpri == oldpri) 112 if (newpri == oldpri)
113 return; 113 return;
114 114
115 /* 115 /*
116 * If the cpu was currently mapped to a different value, we 116 * If the cpu was currently mapped to a different value, we
117 * first need to unmap the old value 117 * first need to unmap the old value
118 */ 118 */
119 if (likely(oldpri != CPUPRI_INVALID)) { 119 if (likely(oldpri != CPUPRI_INVALID)) {
120 struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri]; 120 struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
121 121
122 spin_lock_irqsave(&vec->lock, flags); 122 spin_lock_irqsave(&vec->lock, flags);
123 123
124 vec->count--; 124 vec->count--;
125 if (!vec->count) 125 if (!vec->count)
126 clear_bit(oldpri, cp->pri_active); 126 clear_bit(oldpri, cp->pri_active);
127 cpumask_clear_cpu(cpu, vec->mask); 127 cpumask_clear_cpu(cpu, vec->mask);
128 128
129 spin_unlock_irqrestore(&vec->lock, flags); 129 spin_unlock_irqrestore(&vec->lock, flags);
130 } 130 }
131 131
132 if (likely(newpri != CPUPRI_INVALID)) { 132 if (likely(newpri != CPUPRI_INVALID)) {
133 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; 133 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
134 134
135 spin_lock_irqsave(&vec->lock, flags); 135 spin_lock_irqsave(&vec->lock, flags);
136 136
137 cpumask_set_cpu(cpu, vec->mask); 137 cpumask_set_cpu(cpu, vec->mask);
138 vec->count++; 138 vec->count++;
139 if (vec->count == 1) 139 if (vec->count == 1)
140 set_bit(newpri, cp->pri_active); 140 set_bit(newpri, cp->pri_active);
141 141
142 spin_unlock_irqrestore(&vec->lock, flags); 142 spin_unlock_irqrestore(&vec->lock, flags);
143 } 143 }
144 144
145 *currpri = newpri; 145 *currpri = newpri;
146 } 146 }
147 147
148 /** 148 /**
149 * cpupri_init - initialize the cpupri structure 149 * cpupri_init - initialize the cpupri structure
150 * @cp: The cpupri context 150 * @cp: The cpupri context
151 * @bootmem: true if allocations need to use bootmem 151 * @bootmem: true if allocations need to use bootmem
152 * 152 *
153 * Returns: -ENOMEM if memory fails. 153 * Returns: -ENOMEM if memory fails.
154 */ 154 */
155 int __init_refok cpupri_init(struct cpupri *cp, bool bootmem) 155 int __init_refok cpupri_init(struct cpupri *cp, bool bootmem)
156 { 156 {
157 int i; 157 int i;
158 158
159 memset(cp, 0, sizeof(*cp)); 159 memset(cp, 0, sizeof(*cp));
160 160
161 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { 161 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
162 struct cpupri_vec *vec = &cp->pri_to_cpu[i]; 162 struct cpupri_vec *vec = &cp->pri_to_cpu[i];
163 163
164 spin_lock_init(&vec->lock); 164 spin_lock_init(&vec->lock);
165 vec->count = 0; 165 vec->count = 0;
166 if (bootmem) 166 if (bootmem)
167 alloc_bootmem_cpumask_var(&vec->mask); 167 alloc_bootmem_cpumask_var(&vec->mask);
168 else if (!alloc_cpumask_var(&vec->mask, GFP_KERNEL)) 168 else if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL))
169 goto cleanup; 169 goto cleanup;
170 } 170 }
171 171
172 for_each_possible_cpu(i) 172 for_each_possible_cpu(i)
173 cp->cpu_to_pri[i] = CPUPRI_INVALID; 173 cp->cpu_to_pri[i] = CPUPRI_INVALID;
174 return 0; 174 return 0;
175 175
176 cleanup: 176 cleanup:
177 for (i--; i >= 0; i--) 177 for (i--; i >= 0; i--)
178 free_cpumask_var(cp->pri_to_cpu[i].mask); 178 free_cpumask_var(cp->pri_to_cpu[i].mask);
179 return -ENOMEM; 179 return -ENOMEM;
180 } 180 }
181 181
182 /** 182 /**
183 * cpupri_cleanup - clean up the cpupri structure 183 * cpupri_cleanup - clean up the cpupri structure
184 * @cp: The cpupri context 184 * @cp: The cpupri context
185 */ 185 */
186 void cpupri_cleanup(struct cpupri *cp) 186 void cpupri_cleanup(struct cpupri *cp)
187 { 187 {
188 int i; 188 int i;
189 189
190 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) 190 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++)
191 free_cpumask_var(cp->pri_to_cpu[i].mask); 191 free_cpumask_var(cp->pri_to_cpu[i].mask);
192 } 192 }
193 193
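/*
 * Illustrative sketch (not part of this diff): the allocation pattern the
 * one-character change above is guarding.  With CONFIG_CPUMASK_OFFSTACK
 * (selected by MAXSMP), cpumask_var_t is a real pointer and
 * alloc_cpumask_var() returns uninitialized memory; without it the mask is
 * embedded in the variable and static storage is already zeroed by the
 * loader.  zalloc_cpumask_var() gives a cleared mask in both configurations.
 */
#include <linux/cpumask.h>

static cpumask_var_t example_mask;

static int example_setup(void)
{
	if (!zalloc_cpumask_var(&example_mask, GFP_KERNEL))
		return -ENOMEM;		/* allocated and cleared either way */
	return 0;
}

static void example_teardown(void)
{
	free_cpumask_var(example_mask);
}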
kernel/sched_rt.c
1 /* 1 /*
2 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR 2 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
3 * policies) 3 * policies)
4 */ 4 */
5 5
6 static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) 6 static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
7 { 7 {
8 return container_of(rt_se, struct task_struct, rt); 8 return container_of(rt_se, struct task_struct, rt);
9 } 9 }
10 10
11 #ifdef CONFIG_RT_GROUP_SCHED 11 #ifdef CONFIG_RT_GROUP_SCHED
12 12
13 static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) 13 static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
14 { 14 {
15 return rt_rq->rq; 15 return rt_rq->rq;
16 } 16 }
17 17
18 static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) 18 static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
19 { 19 {
20 return rt_se->rt_rq; 20 return rt_se->rt_rq;
21 } 21 }
22 22
23 #else /* CONFIG_RT_GROUP_SCHED */ 23 #else /* CONFIG_RT_GROUP_SCHED */
24 24
25 static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) 25 static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
26 { 26 {
27 return container_of(rt_rq, struct rq, rt); 27 return container_of(rt_rq, struct rq, rt);
28 } 28 }
29 29
30 static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) 30 static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
31 { 31 {
32 struct task_struct *p = rt_task_of(rt_se); 32 struct task_struct *p = rt_task_of(rt_se);
33 struct rq *rq = task_rq(p); 33 struct rq *rq = task_rq(p);
34 34
35 return &rq->rt; 35 return &rq->rt;
36 } 36 }
37 37
38 #endif /* CONFIG_RT_GROUP_SCHED */ 38 #endif /* CONFIG_RT_GROUP_SCHED */
39 39
40 #ifdef CONFIG_SMP 40 #ifdef CONFIG_SMP
41 41
42 static inline int rt_overloaded(struct rq *rq) 42 static inline int rt_overloaded(struct rq *rq)
43 { 43 {
44 return atomic_read(&rq->rd->rto_count); 44 return atomic_read(&rq->rd->rto_count);
45 } 45 }
46 46
47 static inline void rt_set_overload(struct rq *rq) 47 static inline void rt_set_overload(struct rq *rq)
48 { 48 {
49 if (!rq->online) 49 if (!rq->online)
50 return; 50 return;
51 51
52 cpumask_set_cpu(rq->cpu, rq->rd->rto_mask); 52 cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
53 /* 53 /*
54 * Make sure the mask is visible before we set 54 * Make sure the mask is visible before we set
55 * the overload count. That is checked to determine 55 * the overload count. That is checked to determine
56 * if we should look at the mask. It would be a shame 56 * if we should look at the mask. It would be a shame
57 * if we looked at the mask, but the mask was not 57 * if we looked at the mask, but the mask was not
58 * updated yet. 58 * updated yet.
59 */ 59 */
60 wmb(); 60 wmb();
61 atomic_inc(&rq->rd->rto_count); 61 atomic_inc(&rq->rd->rto_count);
62 } 62 }
63 63
64 static inline void rt_clear_overload(struct rq *rq) 64 static inline void rt_clear_overload(struct rq *rq)
65 { 65 {
66 if (!rq->online) 66 if (!rq->online)
67 return; 67 return;
68 68
69 /* the order here really doesn't matter */ 69 /* the order here really doesn't matter */
70 atomic_dec(&rq->rd->rto_count); 70 atomic_dec(&rq->rd->rto_count);
71 cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask); 71 cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
72 } 72 }
73 73
74 static void update_rt_migration(struct rt_rq *rt_rq) 74 static void update_rt_migration(struct rt_rq *rt_rq)
75 { 75 {
76 if (rt_rq->rt_nr_migratory && (rt_rq->rt_nr_running > 1)) { 76 if (rt_rq->rt_nr_migratory && (rt_rq->rt_nr_running > 1)) {
77 if (!rt_rq->overloaded) { 77 if (!rt_rq->overloaded) {
78 rt_set_overload(rq_of_rt_rq(rt_rq)); 78 rt_set_overload(rq_of_rt_rq(rt_rq));
79 rt_rq->overloaded = 1; 79 rt_rq->overloaded = 1;
80 } 80 }
81 } else if (rt_rq->overloaded) { 81 } else if (rt_rq->overloaded) {
82 rt_clear_overload(rq_of_rt_rq(rt_rq)); 82 rt_clear_overload(rq_of_rt_rq(rt_rq));
83 rt_rq->overloaded = 0; 83 rt_rq->overloaded = 0;
84 } 84 }
85 } 85 }
86 86
87 static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 87 static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
88 { 88 {
89 if (rt_se->nr_cpus_allowed > 1) 89 if (rt_se->nr_cpus_allowed > 1)
90 rt_rq->rt_nr_migratory++; 90 rt_rq->rt_nr_migratory++;
91 91
92 update_rt_migration(rt_rq); 92 update_rt_migration(rt_rq);
93 } 93 }
94 94
95 static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 95 static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
96 { 96 {
97 if (rt_se->nr_cpus_allowed > 1) 97 if (rt_se->nr_cpus_allowed > 1)
98 rt_rq->rt_nr_migratory--; 98 rt_rq->rt_nr_migratory--;
99 99
100 update_rt_migration(rt_rq); 100 update_rt_migration(rt_rq);
101 } 101 }
102 102
103 static void enqueue_pushable_task(struct rq *rq, struct task_struct *p) 103 static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
104 { 104 {
105 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); 105 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
106 plist_node_init(&p->pushable_tasks, p->prio); 106 plist_node_init(&p->pushable_tasks, p->prio);
107 plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks); 107 plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
108 } 108 }
109 109
110 static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) 110 static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
111 { 111 {
112 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); 112 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
113 } 113 }
114 114
115 #else 115 #else
116 116
117 static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p) 117 static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
118 { 118 {
119 } 119 }
120 120
121 static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p) 121 static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
122 { 122 {
123 } 123 }
124 124
125 static inline 125 static inline
126 void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 126 void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
127 { 127 {
128 } 128 }
129 129
130 static inline 130 static inline
131 void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 131 void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
132 { 132 {
133 } 133 }
134 134
135 #endif /* CONFIG_SMP */ 135 #endif /* CONFIG_SMP */
136 136
137 static inline int on_rt_rq(struct sched_rt_entity *rt_se) 137 static inline int on_rt_rq(struct sched_rt_entity *rt_se)
138 { 138 {
139 return !list_empty(&rt_se->run_list); 139 return !list_empty(&rt_se->run_list);
140 } 140 }
141 141
142 #ifdef CONFIG_RT_GROUP_SCHED 142 #ifdef CONFIG_RT_GROUP_SCHED
143 143
144 static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) 144 static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
145 { 145 {
146 if (!rt_rq->tg) 146 if (!rt_rq->tg)
147 return RUNTIME_INF; 147 return RUNTIME_INF;
148 148
149 return rt_rq->rt_runtime; 149 return rt_rq->rt_runtime;
150 } 150 }
151 151
152 static inline u64 sched_rt_period(struct rt_rq *rt_rq) 152 static inline u64 sched_rt_period(struct rt_rq *rt_rq)
153 { 153 {
154 return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period); 154 return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
155 } 155 }
156 156
157 #define for_each_leaf_rt_rq(rt_rq, rq) \ 157 #define for_each_leaf_rt_rq(rt_rq, rq) \
158 list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) 158 list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
159 159
160 #define for_each_sched_rt_entity(rt_se) \ 160 #define for_each_sched_rt_entity(rt_se) \
161 for (; rt_se; rt_se = rt_se->parent) 161 for (; rt_se; rt_se = rt_se->parent)
162 162
163 static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) 163 static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
164 { 164 {
165 return rt_se->my_q; 165 return rt_se->my_q;
166 } 166 }
167 167
168 static void enqueue_rt_entity(struct sched_rt_entity *rt_se); 168 static void enqueue_rt_entity(struct sched_rt_entity *rt_se);
169 static void dequeue_rt_entity(struct sched_rt_entity *rt_se); 169 static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
170 170
171 static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) 171 static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
172 { 172 {
173 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; 173 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
174 struct sched_rt_entity *rt_se = rt_rq->rt_se; 174 struct sched_rt_entity *rt_se = rt_rq->rt_se;
175 175
176 if (rt_rq->rt_nr_running) { 176 if (rt_rq->rt_nr_running) {
177 if (rt_se && !on_rt_rq(rt_se)) 177 if (rt_se && !on_rt_rq(rt_se))
178 enqueue_rt_entity(rt_se); 178 enqueue_rt_entity(rt_se);
179 if (rt_rq->highest_prio.curr < curr->prio) 179 if (rt_rq->highest_prio.curr < curr->prio)
180 resched_task(curr); 180 resched_task(curr);
181 } 181 }
182 } 182 }
183 183
184 static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) 184 static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
185 { 185 {
186 struct sched_rt_entity *rt_se = rt_rq->rt_se; 186 struct sched_rt_entity *rt_se = rt_rq->rt_se;
187 187
188 if (rt_se && on_rt_rq(rt_se)) 188 if (rt_se && on_rt_rq(rt_se))
189 dequeue_rt_entity(rt_se); 189 dequeue_rt_entity(rt_se);
190 } 190 }
191 191
192 static inline int rt_rq_throttled(struct rt_rq *rt_rq) 192 static inline int rt_rq_throttled(struct rt_rq *rt_rq)
193 { 193 {
194 return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted; 194 return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
195 } 195 }
196 196
197 static int rt_se_boosted(struct sched_rt_entity *rt_se) 197 static int rt_se_boosted(struct sched_rt_entity *rt_se)
198 { 198 {
199 struct rt_rq *rt_rq = group_rt_rq(rt_se); 199 struct rt_rq *rt_rq = group_rt_rq(rt_se);
200 struct task_struct *p; 200 struct task_struct *p;
201 201
202 if (rt_rq) 202 if (rt_rq)
203 return !!rt_rq->rt_nr_boosted; 203 return !!rt_rq->rt_nr_boosted;
204 204
205 p = rt_task_of(rt_se); 205 p = rt_task_of(rt_se);
206 return p->prio != p->normal_prio; 206 return p->prio != p->normal_prio;
207 } 207 }
208 208
209 #ifdef CONFIG_SMP 209 #ifdef CONFIG_SMP
210 static inline const struct cpumask *sched_rt_period_mask(void) 210 static inline const struct cpumask *sched_rt_period_mask(void)
211 { 211 {
212 return cpu_rq(smp_processor_id())->rd->span; 212 return cpu_rq(smp_processor_id())->rd->span;
213 } 213 }
214 #else 214 #else
215 static inline const struct cpumask *sched_rt_period_mask(void) 215 static inline const struct cpumask *sched_rt_period_mask(void)
216 { 216 {
217 return cpu_online_mask; 217 return cpu_online_mask;
218 } 218 }
219 #endif 219 #endif
220 220
221 static inline 221 static inline
222 struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) 222 struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
223 { 223 {
224 return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu]; 224 return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu];
225 } 225 }
226 226
227 static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) 227 static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
228 { 228 {
229 return &rt_rq->tg->rt_bandwidth; 229 return &rt_rq->tg->rt_bandwidth;
230 } 230 }
231 231
232 #else /* !CONFIG_RT_GROUP_SCHED */ 232 #else /* !CONFIG_RT_GROUP_SCHED */
233 233
234 static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) 234 static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
235 { 235 {
236 return rt_rq->rt_runtime; 236 return rt_rq->rt_runtime;
237 } 237 }
238 238
239 static inline u64 sched_rt_period(struct rt_rq *rt_rq) 239 static inline u64 sched_rt_period(struct rt_rq *rt_rq)
240 { 240 {
241 return ktime_to_ns(def_rt_bandwidth.rt_period); 241 return ktime_to_ns(def_rt_bandwidth.rt_period);
242 } 242 }
243 243
244 #define for_each_leaf_rt_rq(rt_rq, rq) \ 244 #define for_each_leaf_rt_rq(rt_rq, rq) \
245 for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL) 245 for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
246 246
247 #define for_each_sched_rt_entity(rt_se) \ 247 #define for_each_sched_rt_entity(rt_se) \
248 for (; rt_se; rt_se = NULL) 248 for (; rt_se; rt_se = NULL)
249 249
250 static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) 250 static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
251 { 251 {
252 return NULL; 252 return NULL;
253 } 253 }
254 254
255 static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) 255 static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
256 { 256 {
257 if (rt_rq->rt_nr_running) 257 if (rt_rq->rt_nr_running)
258 resched_task(rq_of_rt_rq(rt_rq)->curr); 258 resched_task(rq_of_rt_rq(rt_rq)->curr);
259 } 259 }
260 260
261 static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) 261 static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
262 { 262 {
263 } 263 }
264 264
265 static inline int rt_rq_throttled(struct rt_rq *rt_rq) 265 static inline int rt_rq_throttled(struct rt_rq *rt_rq)
266 { 266 {
267 return rt_rq->rt_throttled; 267 return rt_rq->rt_throttled;
268 } 268 }
269 269
270 static inline const struct cpumask *sched_rt_period_mask(void) 270 static inline const struct cpumask *sched_rt_period_mask(void)
271 { 271 {
272 return cpu_online_mask; 272 return cpu_online_mask;
273 } 273 }
274 274
275 static inline 275 static inline
276 struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) 276 struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
277 { 277 {
278 return &cpu_rq(cpu)->rt; 278 return &cpu_rq(cpu)->rt;
279 } 279 }
280 280
281 static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) 281 static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
282 { 282 {
283 return &def_rt_bandwidth; 283 return &def_rt_bandwidth;
284 } 284 }
285 285
286 #endif /* CONFIG_RT_GROUP_SCHED */ 286 #endif /* CONFIG_RT_GROUP_SCHED */
287 287
288 #ifdef CONFIG_SMP 288 #ifdef CONFIG_SMP
289 /* 289 /*
290 * We ran out of runtime, see if we can borrow some from our neighbours. 290 * We ran out of runtime, see if we can borrow some from our neighbours.
291 */ 291 */
292 static int do_balance_runtime(struct rt_rq *rt_rq) 292 static int do_balance_runtime(struct rt_rq *rt_rq)
293 { 293 {
294 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 294 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
295 struct root_domain *rd = cpu_rq(smp_processor_id())->rd; 295 struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
296 int i, weight, more = 0; 296 int i, weight, more = 0;
297 u64 rt_period; 297 u64 rt_period;
298 298
299 weight = cpumask_weight(rd->span); 299 weight = cpumask_weight(rd->span);
300 300
301 spin_lock(&rt_b->rt_runtime_lock); 301 spin_lock(&rt_b->rt_runtime_lock);
302 rt_period = ktime_to_ns(rt_b->rt_period); 302 rt_period = ktime_to_ns(rt_b->rt_period);
303 for_each_cpu(i, rd->span) { 303 for_each_cpu(i, rd->span) {
304 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); 304 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
305 s64 diff; 305 s64 diff;
306 306
307 if (iter == rt_rq) 307 if (iter == rt_rq)
308 continue; 308 continue;
309 309
310 spin_lock(&iter->rt_runtime_lock); 310 spin_lock(&iter->rt_runtime_lock);
311 /* 311 /*
312 * Either all rqs have inf runtime and there's nothing to steal 312 * Either all rqs have inf runtime and there's nothing to steal
313 * or __disable_runtime() below sets a specific rq to inf to 313 * or __disable_runtime() below sets a specific rq to inf to
314 * indicate it's been disabled and disallow stealing. 314 * indicate it's been disabled and disallow stealing.
315 */ 315 */
316 if (iter->rt_runtime == RUNTIME_INF) 316 if (iter->rt_runtime == RUNTIME_INF)
317 goto next; 317 goto next;
318 318
319 /* 319 /*
320 * From runqueues with spare time, take 1/n part of their 320 * From runqueues with spare time, take 1/n part of their
321 * spare time, but no more than our period. 321 * spare time, but no more than our period.
322 */ 322 */
323 diff = iter->rt_runtime - iter->rt_time; 323 diff = iter->rt_runtime - iter->rt_time;
324 if (diff > 0) { 324 if (diff > 0) {
325 diff = div_u64((u64)diff, weight); 325 diff = div_u64((u64)diff, weight);
326 if (rt_rq->rt_runtime + diff > rt_period) 326 if (rt_rq->rt_runtime + diff > rt_period)
327 diff = rt_period - rt_rq->rt_runtime; 327 diff = rt_period - rt_rq->rt_runtime;
328 iter->rt_runtime -= diff; 328 iter->rt_runtime -= diff;
329 rt_rq->rt_runtime += diff; 329 rt_rq->rt_runtime += diff;
330 more = 1; 330 more = 1;
331 if (rt_rq->rt_runtime == rt_period) { 331 if (rt_rq->rt_runtime == rt_period) {
332 spin_unlock(&iter->rt_runtime_lock); 332 spin_unlock(&iter->rt_runtime_lock);
333 break; 333 break;
334 } 334 }
335 } 335 }
336 next: 336 next:
337 spin_unlock(&iter->rt_runtime_lock); 337 spin_unlock(&iter->rt_runtime_lock);
338 } 338 }
339 spin_unlock(&rt_b->rt_runtime_lock); 339 spin_unlock(&rt_b->rt_runtime_lock);
340 340
341 return more; 341 return more;
342 } 342 }
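A worked example of the 1/n borrowing arithmetic above, with made-up numbers and a 1 s period:

/*
 * Hypothetical values (rt_period = 1000 ms, weight = 4 CPUs in rd->span):
 *
 *	iter->rt_runtime = 950 ms, iter->rt_time = 150 ms
 *	diff = 950 - 150 = 800 ms spare on that neighbour
 *	diff = div_u64(800, 4) = 200 ms actually taken
 *
 * The borrower gains 200 ms (clamped so rt_rq->rt_runtime never exceeds
 * rt_period), the neighbour keeps the rest of its spare time, and the
 * loop stops early once the borrower holds a full period of runtime.
 */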
343 343
344 /* 344 /*
345 * Ensure this RQ takes back all the runtime it lent to its neighbours. 345 * Ensure this RQ takes back all the runtime it lent to its neighbours.
346 */ 346 */
347 static void __disable_runtime(struct rq *rq) 347 static void __disable_runtime(struct rq *rq)
348 { 348 {
349 struct root_domain *rd = rq->rd; 349 struct root_domain *rd = rq->rd;
350 struct rt_rq *rt_rq; 350 struct rt_rq *rt_rq;
351 351
352 if (unlikely(!scheduler_running)) 352 if (unlikely(!scheduler_running))
353 return; 353 return;
354 354
355 for_each_leaf_rt_rq(rt_rq, rq) { 355 for_each_leaf_rt_rq(rt_rq, rq) {
356 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 356 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
357 s64 want; 357 s64 want;
358 int i; 358 int i;
359 359
360 spin_lock(&rt_b->rt_runtime_lock); 360 spin_lock(&rt_b->rt_runtime_lock);
361 spin_lock(&rt_rq->rt_runtime_lock); 361 spin_lock(&rt_rq->rt_runtime_lock);
362 /* 362 /*
363 * Either we're all inf and nobody needs to borrow, or we're 363 * Either we're all inf and nobody needs to borrow, or we're
364 * already disabled and thus have nothing to do, or we have 364 * already disabled and thus have nothing to do, or we have
365 * exactly the right amount of runtime to take out. 365 * exactly the right amount of runtime to take out.
366 */ 366 */
367 if (rt_rq->rt_runtime == RUNTIME_INF || 367 if (rt_rq->rt_runtime == RUNTIME_INF ||
368 rt_rq->rt_runtime == rt_b->rt_runtime) 368 rt_rq->rt_runtime == rt_b->rt_runtime)
369 goto balanced; 369 goto balanced;
370 spin_unlock(&rt_rq->rt_runtime_lock); 370 spin_unlock(&rt_rq->rt_runtime_lock);
371 371
372 /* 372 /*
373 * Calculate the difference between what we started out with 373 * Calculate the difference between what we started out with
374 * and what we currently have, that's the amount of runtime 374 * and what we currently have, that's the amount of runtime
375 * we lend and now have to reclaim. 375 * we lend and now have to reclaim.
376 */ 376 */
377 want = rt_b->rt_runtime - rt_rq->rt_runtime; 377 want = rt_b->rt_runtime - rt_rq->rt_runtime;
378 378
379 /* 379 /*
380 * Greedy reclaim, take back as much as we can. 380 * Greedy reclaim, take back as much as we can.
381 */ 381 */
382 for_each_cpu(i, rd->span) { 382 for_each_cpu(i, rd->span) {
383 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); 383 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
384 s64 diff; 384 s64 diff;
385 385
386 /* 386 /*
387 * Can't reclaim from ourselves or disabled runqueues. 387 * Can't reclaim from ourselves or disabled runqueues.
388 */ 388 */
389 if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF) 389 if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
390 continue; 390 continue;
391 391
392 spin_lock(&iter->rt_runtime_lock); 392 spin_lock(&iter->rt_runtime_lock);
393 if (want > 0) { 393 if (want > 0) {
394 diff = min_t(s64, iter->rt_runtime, want); 394 diff = min_t(s64, iter->rt_runtime, want);
395 iter->rt_runtime -= diff; 395 iter->rt_runtime -= diff;
396 want -= diff; 396 want -= diff;
397 } else { 397 } else {
398 iter->rt_runtime -= want; 398 iter->rt_runtime -= want;
399 want -= want; 399 want -= want;
400 } 400 }
401 spin_unlock(&iter->rt_runtime_lock); 401 spin_unlock(&iter->rt_runtime_lock);
402 402
403 if (!want) 403 if (!want)
404 break; 404 break;
405 } 405 }
406 406
407 spin_lock(&rt_rq->rt_runtime_lock); 407 spin_lock(&rt_rq->rt_runtime_lock);
408 /* 408 /*
409 * We cannot be left wanting - that would mean some runtime 409 * We cannot be left wanting - that would mean some runtime
410 * leaked out of the system. 410 * leaked out of the system.
411 */ 411 */
412 BUG_ON(want); 412 BUG_ON(want);
413 balanced: 413 balanced:
414 /* 414 /*
415 * Disable all the borrow logic by pretending we have inf 415 * Disable all the borrow logic by pretending we have inf
416 * runtime - in which case borrowing doesn't make sense. 416 * runtime - in which case borrowing doesn't make sense.
417 */ 417 */
418 rt_rq->rt_runtime = RUNTIME_INF; 418 rt_rq->rt_runtime = RUNTIME_INF;
419 spin_unlock(&rt_rq->rt_runtime_lock); 419 spin_unlock(&rt_rq->rt_runtime_lock);
420 spin_unlock(&rt_b->rt_runtime_lock); 420 spin_unlock(&rt_b->rt_runtime_lock);
421 } 421 }
422 } 422 }
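The reclaim loop above takes back exactly the amount this runqueue is short of its configured share. A worked example with hypothetical numbers:

/*
 * Hypothetical reclaim (rt_b->rt_runtime = 950 ms):
 *
 *	rt_rq->rt_runtime = 750 ms  ->  want = 950 - 750 = 200 ms
 *	neighbour A: rt_runtime = 150 ms  ->  take min(150, 200) = 150, want = 50
 *	neighbour B: rt_runtime = 400 ms  ->  take min(400, 50)  =  50, want = 0
 *
 * If want is still non-zero after visiting every CPU in rd->span, runtime
 * has leaked out of the system, which is what BUG_ON(want) asserts.
 */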
423 423
424 static void disable_runtime(struct rq *rq) 424 static void disable_runtime(struct rq *rq)
425 { 425 {
426 unsigned long flags; 426 unsigned long flags;
427 427
428 spin_lock_irqsave(&rq->lock, flags); 428 spin_lock_irqsave(&rq->lock, flags);
429 __disable_runtime(rq); 429 __disable_runtime(rq);
430 spin_unlock_irqrestore(&rq->lock, flags); 430 spin_unlock_irqrestore(&rq->lock, flags);
431 } 431 }
432 432
433 static void __enable_runtime(struct rq *rq) 433 static void __enable_runtime(struct rq *rq)
434 { 434 {
435 struct rt_rq *rt_rq; 435 struct rt_rq *rt_rq;
436 436
437 if (unlikely(!scheduler_running)) 437 if (unlikely(!scheduler_running))
438 return; 438 return;
439 439
440 /* 440 /*
441 * Reset each runqueue's bandwidth settings 441 * Reset each runqueue's bandwidth settings
442 */ 442 */
443 for_each_leaf_rt_rq(rt_rq, rq) { 443 for_each_leaf_rt_rq(rt_rq, rq) {
444 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 444 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
445 445
446 spin_lock(&rt_b->rt_runtime_lock); 446 spin_lock(&rt_b->rt_runtime_lock);
447 spin_lock(&rt_rq->rt_runtime_lock); 447 spin_lock(&rt_rq->rt_runtime_lock);
448 rt_rq->rt_runtime = rt_b->rt_runtime; 448 rt_rq->rt_runtime = rt_b->rt_runtime;
449 rt_rq->rt_time = 0; 449 rt_rq->rt_time = 0;
450 rt_rq->rt_throttled = 0; 450 rt_rq->rt_throttled = 0;
451 spin_unlock(&rt_rq->rt_runtime_lock); 451 spin_unlock(&rt_rq->rt_runtime_lock);
452 spin_unlock(&rt_b->rt_runtime_lock); 452 spin_unlock(&rt_b->rt_runtime_lock);
453 } 453 }
454 } 454 }
455 455
456 static void enable_runtime(struct rq *rq) 456 static void enable_runtime(struct rq *rq)
457 { 457 {
458 unsigned long flags; 458 unsigned long flags;
459 459
460 spin_lock_irqsave(&rq->lock, flags); 460 spin_lock_irqsave(&rq->lock, flags);
461 __enable_runtime(rq); 461 __enable_runtime(rq);
462 spin_unlock_irqrestore(&rq->lock, flags); 462 spin_unlock_irqrestore(&rq->lock, flags);
463 } 463 }
464 464
465 static int balance_runtime(struct rt_rq *rt_rq) 465 static int balance_runtime(struct rt_rq *rt_rq)
466 { 466 {
467 int more = 0; 467 int more = 0;
468 468
469 if (rt_rq->rt_time > rt_rq->rt_runtime) { 469 if (rt_rq->rt_time > rt_rq->rt_runtime) {
470 spin_unlock(&rt_rq->rt_runtime_lock); 470 spin_unlock(&rt_rq->rt_runtime_lock);
471 more = do_balance_runtime(rt_rq); 471 more = do_balance_runtime(rt_rq);
472 spin_lock(&rt_rq->rt_runtime_lock); 472 spin_lock(&rt_rq->rt_runtime_lock);
473 } 473 }
474 474
475 return more; 475 return more;
476 } 476 }
477 #else /* !CONFIG_SMP */ 477 #else /* !CONFIG_SMP */
478 static inline int balance_runtime(struct rt_rq *rt_rq) 478 static inline int balance_runtime(struct rt_rq *rt_rq)
479 { 479 {
480 return 0; 480 return 0;
481 } 481 }
482 #endif /* CONFIG_SMP */ 482 #endif /* CONFIG_SMP */
483 483
484 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) 484 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
485 { 485 {
486 int i, idle = 1; 486 int i, idle = 1;
487 const struct cpumask *span; 487 const struct cpumask *span;
488 488
489 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) 489 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
490 return 1; 490 return 1;
491 491
492 span = sched_rt_period_mask(); 492 span = sched_rt_period_mask();
493 for_each_cpu(i, span) { 493 for_each_cpu(i, span) {
494 int enqueue = 0; 494 int enqueue = 0;
495 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); 495 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
496 struct rq *rq = rq_of_rt_rq(rt_rq); 496 struct rq *rq = rq_of_rt_rq(rt_rq);
497 497
498 spin_lock(&rq->lock); 498 spin_lock(&rq->lock);
499 if (rt_rq->rt_time) { 499 if (rt_rq->rt_time) {
500 u64 runtime; 500 u64 runtime;
501 501
502 spin_lock(&rt_rq->rt_runtime_lock); 502 spin_lock(&rt_rq->rt_runtime_lock);
503 if (rt_rq->rt_throttled) 503 if (rt_rq->rt_throttled)
504 balance_runtime(rt_rq); 504 balance_runtime(rt_rq);
505 runtime = rt_rq->rt_runtime; 505 runtime = rt_rq->rt_runtime;
506 rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime); 506 rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
507 if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { 507 if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
508 rt_rq->rt_throttled = 0; 508 rt_rq->rt_throttled = 0;
509 enqueue = 1; 509 enqueue = 1;
510 } 510 }
511 if (rt_rq->rt_time || rt_rq->rt_nr_running) 511 if (rt_rq->rt_time || rt_rq->rt_nr_running)
512 idle = 0; 512 idle = 0;
513 spin_unlock(&rt_rq->rt_runtime_lock); 513 spin_unlock(&rt_rq->rt_runtime_lock);
514 } else if (rt_rq->rt_nr_running) 514 } else if (rt_rq->rt_nr_running)
515 idle = 0; 515 idle = 0;
516 516
517 if (enqueue) 517 if (enqueue)
518 sched_rt_rq_enqueue(rt_rq); 518 sched_rt_rq_enqueue(rt_rq);
519 spin_unlock(&rq->lock); 519 spin_unlock(&rq->lock);
520 } 520 }
521 521
522 return idle; 522 return idle;
523 } 523 }
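A worked example of the per-period refresh above, again with hypothetical values:

/*
 * Hypothetical period expiry (overrun = 1, runtime = 950 ms):
 *
 *	rt_rq->rt_time = 1200 ms  ->  rt_time -= min(1200, 1 * 950) = 250 ms left
 *	250 ms < 950 ms, so a throttled rt_rq is unthrottled and re-enqueued
 *
 * With overrun = 2 (the timer fell behind) two periods' worth of runtime
 * is refunded at once. idle is only returned when no rt_rq in the span
 * still has leftover rt_time or runnable tasks.
 */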
524 524
525 static inline int rt_se_prio(struct sched_rt_entity *rt_se) 525 static inline int rt_se_prio(struct sched_rt_entity *rt_se)
526 { 526 {
527 #ifdef CONFIG_RT_GROUP_SCHED 527 #ifdef CONFIG_RT_GROUP_SCHED
528 struct rt_rq *rt_rq = group_rt_rq(rt_se); 528 struct rt_rq *rt_rq = group_rt_rq(rt_se);
529 529
530 if (rt_rq) 530 if (rt_rq)
531 return rt_rq->highest_prio.curr; 531 return rt_rq->highest_prio.curr;
532 #endif 532 #endif
533 533
534 return rt_task_of(rt_se)->prio; 534 return rt_task_of(rt_se)->prio;
535 } 535 }
536 536
537 static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) 537 static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
538 { 538 {
539 u64 runtime = sched_rt_runtime(rt_rq); 539 u64 runtime = sched_rt_runtime(rt_rq);
540 540
541 if (rt_rq->rt_throttled) 541 if (rt_rq->rt_throttled)
542 return rt_rq_throttled(rt_rq); 542 return rt_rq_throttled(rt_rq);
543 543
544 if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq)) 544 if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq))
545 return 0; 545 return 0;
546 546
547 balance_runtime(rt_rq); 547 balance_runtime(rt_rq);
548 runtime = sched_rt_runtime(rt_rq); 548 runtime = sched_rt_runtime(rt_rq);
549 if (runtime == RUNTIME_INF) 549 if (runtime == RUNTIME_INF)
550 return 0; 550 return 0;
551 551
552 if (rt_rq->rt_time > runtime) { 552 if (rt_rq->rt_time > runtime) {
553 rt_rq->rt_throttled = 1; 553 rt_rq->rt_throttled = 1;
554 if (rt_rq_throttled(rt_rq)) { 554 if (rt_rq_throttled(rt_rq)) {
555 sched_rt_rq_dequeue(rt_rq); 555 sched_rt_rq_dequeue(rt_rq);
556 return 1; 556 return 1;
557 } 557 }
558 } 558 }
559 559
560 return 0; 560 return 0;
561 } 561 }
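The throttle decision above compares accumulated execution time with the, possibly just rebalanced, runtime budget. A small worked example:

/*
 * Hypothetical check (runtime = 950 ms out of a 1000 ms period):
 *
 *	rt_rq->rt_time = 960 ms > 950 ms  ->  rt_throttled = 1, dequeue, return 1
 *	rt_rq->rt_time = 900 ms           ->  return 0, keep running
 *
 * If the budget is RUNTIME_INF, or at least a whole period, the group can
 * never be throttled, which is why those cases bail out before accounting.
 */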
562 562
563 /* 563 /*
564 * Update the current task's runtime statistics. Skip current tasks that 564 * Update the current task's runtime statistics. Skip current tasks that
565 * are not in our scheduling class. 565 * are not in our scheduling class.
566 */ 566 */
567 static void update_curr_rt(struct rq *rq) 567 static void update_curr_rt(struct rq *rq)
568 { 568 {
569 struct task_struct *curr = rq->curr; 569 struct task_struct *curr = rq->curr;
570 struct sched_rt_entity *rt_se = &curr->rt; 570 struct sched_rt_entity *rt_se = &curr->rt;
571 struct rt_rq *rt_rq = rt_rq_of_se(rt_se); 571 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
572 u64 delta_exec; 572 u64 delta_exec;
573 573
574 if (!task_has_rt_policy(curr)) 574 if (!task_has_rt_policy(curr))
575 return; 575 return;
576 576
577 delta_exec = rq->clock - curr->se.exec_start; 577 delta_exec = rq->clock - curr->se.exec_start;
578 if (unlikely((s64)delta_exec < 0)) 578 if (unlikely((s64)delta_exec < 0))
579 delta_exec = 0; 579 delta_exec = 0;
580 580
581 schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec)); 581 schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec));
582 582
583 curr->se.sum_exec_runtime += delta_exec; 583 curr->se.sum_exec_runtime += delta_exec;
584 account_group_exec_runtime(curr, delta_exec); 584 account_group_exec_runtime(curr, delta_exec);
585 585
586 curr->se.exec_start = rq->clock; 586 curr->se.exec_start = rq->clock;
587 cpuacct_charge(curr, delta_exec); 587 cpuacct_charge(curr, delta_exec);
588 588
589 if (!rt_bandwidth_enabled()) 589 if (!rt_bandwidth_enabled())
590 return; 590 return;
591 591
592 for_each_sched_rt_entity(rt_se) { 592 for_each_sched_rt_entity(rt_se) {
593 rt_rq = rt_rq_of_se(rt_se); 593 rt_rq = rt_rq_of_se(rt_se);
594 594
595 if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { 595 if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
596 spin_lock(&rt_rq->rt_runtime_lock); 596 spin_lock(&rt_rq->rt_runtime_lock);
597 rt_rq->rt_time += delta_exec; 597 rt_rq->rt_time += delta_exec;
598 if (sched_rt_runtime_exceeded(rt_rq)) 598 if (sched_rt_runtime_exceeded(rt_rq))
599 resched_task(curr); 599 resched_task(curr);
600 spin_unlock(&rt_rq->rt_runtime_lock); 600 spin_unlock(&rt_rq->rt_runtime_lock);
601 } 601 }
602 } 602 }
603 } 603 }
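A hedged sketch of the accounting flow above from a caller's point of view; the sequence is illustrative, not a verbatim call site:

/*
 * p->se.exec_start = rq->clock;   stamped when the task was put on the CPU
 * ... the task runs ...
 * update_curr_rt(rq);             delta_exec = rq->clock - exec_start,
 *                                 charged to every rt_rq up the hierarchy;
 *                                 the task is rescheduled as soon as one
 *                                 of those budgets is exceeded.
 */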
604 604
605 #if defined CONFIG_SMP 605 #if defined CONFIG_SMP
606 606
607 static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu); 607 static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu);
608 608
609 static inline int next_prio(struct rq *rq) 609 static inline int next_prio(struct rq *rq)
610 { 610 {
611 struct task_struct *next = pick_next_highest_task_rt(rq, rq->cpu); 611 struct task_struct *next = pick_next_highest_task_rt(rq, rq->cpu);
612 612
613 if (next && rt_prio(next->prio)) 613 if (next && rt_prio(next->prio))
614 return next->prio; 614 return next->prio;
615 else 615 else
616 return MAX_RT_PRIO; 616 return MAX_RT_PRIO;
617 } 617 }
618 618
619 static void 619 static void
620 inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) 620 inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
621 { 621 {
622 struct rq *rq = rq_of_rt_rq(rt_rq); 622 struct rq *rq = rq_of_rt_rq(rt_rq);
623 623
624 if (prio < prev_prio) { 624 if (prio < prev_prio) {
625 625
626 /* 626 /*
627 * If the new task is higher in priority than anything on the 627 * If the new task is higher in priority than anything on the
628 * run-queue, we know that the previous high becomes our 628 * run-queue, we know that the previous high becomes our
629 * next-highest. 629 * next-highest.
630 */ 630 */
631 rt_rq->highest_prio.next = prev_prio; 631 rt_rq->highest_prio.next = prev_prio;
632 632
633 if (rq->online) 633 if (rq->online)
634 cpupri_set(&rq->rd->cpupri, rq->cpu, prio); 634 cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
635 635
636 } else if (prio == rt_rq->highest_prio.curr) 636 } else if (prio == rt_rq->highest_prio.curr)
637 /* 637 /*
638 * If the next task is equal in priority to the highest on 638 * If the next task is equal in priority to the highest on
639 * the run-queue, then we implicitly know that the next highest 639 * the run-queue, then we implicitly know that the next highest
640 * task cannot be any lower than current 640 * task cannot be any lower than current
641 */ 641 */
642 rt_rq->highest_prio.next = prio; 642 rt_rq->highest_prio.next = prio;
643 else if (prio < rt_rq->highest_prio.next) 643 else if (prio < rt_rq->highest_prio.next)
644 /* 644 /*
645 * Otherwise, we need to recompute next-highest 645 * Otherwise, we need to recompute next-highest
646 */ 646 */
647 rt_rq->highest_prio.next = next_prio(rq); 647 rt_rq->highest_prio.next = next_prio(rq);
648 } 648 }
649 649
650 static void 650 static void
651 dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) 651 dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
652 { 652 {
653 struct rq *rq = rq_of_rt_rq(rt_rq); 653 struct rq *rq = rq_of_rt_rq(rt_rq);
654 654
655 if (rt_rq->rt_nr_running && (prio <= rt_rq->highest_prio.next)) 655 if (rt_rq->rt_nr_running && (prio <= rt_rq->highest_prio.next))
656 rt_rq->highest_prio.next = next_prio(rq); 656 rt_rq->highest_prio.next = next_prio(rq);
657 657
658 if (rq->online && rt_rq->highest_prio.curr != prev_prio) 658 if (rq->online && rt_rq->highest_prio.curr != prev_prio)
659 cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr); 659 cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
660 } 660 }
661 661
662 #else /* CONFIG_SMP */ 662 #else /* CONFIG_SMP */
663 663
664 static inline 664 static inline
665 void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {} 665 void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
666 static inline 666 static inline
667 void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {} 667 void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
668 668
669 #endif /* CONFIG_SMP */ 669 #endif /* CONFIG_SMP */
670 670
671 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 671 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
672 static void 672 static void
673 inc_rt_prio(struct rt_rq *rt_rq, int prio) 673 inc_rt_prio(struct rt_rq *rt_rq, int prio)
674 { 674 {
675 int prev_prio = rt_rq->highest_prio.curr; 675 int prev_prio = rt_rq->highest_prio.curr;
676 676
677 if (prio < prev_prio) 677 if (prio < prev_prio)
678 rt_rq->highest_prio.curr = prio; 678 rt_rq->highest_prio.curr = prio;
679 679
680 inc_rt_prio_smp(rt_rq, prio, prev_prio); 680 inc_rt_prio_smp(rt_rq, prio, prev_prio);
681 } 681 }
682 682
683 static void 683 static void
684 dec_rt_prio(struct rt_rq *rt_rq, int prio) 684 dec_rt_prio(struct rt_rq *rt_rq, int prio)
685 { 685 {
686 int prev_prio = rt_rq->highest_prio.curr; 686 int prev_prio = rt_rq->highest_prio.curr;
687 687
688 if (rt_rq->rt_nr_running) { 688 if (rt_rq->rt_nr_running) {
689 689
690 WARN_ON(prio < prev_prio); 690 WARN_ON(prio < prev_prio);
691 691
692 /* 692 /*
693 * This may have been our highest task, and therefore 693 * This may have been our highest task, and therefore
694 * we may have some recomputation to do 694 * we may have some recomputation to do
695 */ 695 */
696 if (prio == prev_prio) { 696 if (prio == prev_prio) {
697 struct rt_prio_array *array = &rt_rq->active; 697 struct rt_prio_array *array = &rt_rq->active;
698 698
699 rt_rq->highest_prio.curr = 699 rt_rq->highest_prio.curr =
700 sched_find_first_bit(array->bitmap); 700 sched_find_first_bit(array->bitmap);
701 } 701 }
702 702
703 } else 703 } else
704 rt_rq->highest_prio.curr = MAX_RT_PRIO; 704 rt_rq->highest_prio.curr = MAX_RT_PRIO;
705 705
706 dec_rt_prio_smp(rt_rq, prio, prev_prio); 706 dec_rt_prio_smp(rt_rq, prio, prev_prio);
707 } 707 }
708 708
709 #else 709 #else
710 710
711 static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {} 711 static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
712 static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {} 712 static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
713 713
714 #endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */ 714 #endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
715 715
716 #ifdef CONFIG_RT_GROUP_SCHED 716 #ifdef CONFIG_RT_GROUP_SCHED
717 717
718 static void 718 static void
719 inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 719 inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
720 { 720 {
721 if (rt_se_boosted(rt_se)) 721 if (rt_se_boosted(rt_se))
722 rt_rq->rt_nr_boosted++; 722 rt_rq->rt_nr_boosted++;
723 723
724 if (rt_rq->tg) 724 if (rt_rq->tg)
725 start_rt_bandwidth(&rt_rq->tg->rt_bandwidth); 725 start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
726 } 726 }
727 727
728 static void 728 static void
729 dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 729 dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
730 { 730 {
731 if (rt_se_boosted(rt_se)) 731 if (rt_se_boosted(rt_se))
732 rt_rq->rt_nr_boosted--; 732 rt_rq->rt_nr_boosted--;
733 733
734 WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted); 734 WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
735 } 735 }
736 736
737 #else /* CONFIG_RT_GROUP_SCHED */ 737 #else /* CONFIG_RT_GROUP_SCHED */
738 738
739 static void 739 static void
740 inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 740 inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
741 { 741 {
742 start_rt_bandwidth(&def_rt_bandwidth); 742 start_rt_bandwidth(&def_rt_bandwidth);
743 } 743 }
744 744
745 static inline 745 static inline
746 void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {} 746 void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
747 747
748 #endif /* CONFIG_RT_GROUP_SCHED */ 748 #endif /* CONFIG_RT_GROUP_SCHED */
749 749
750 static inline 750 static inline
751 void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 751 void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
752 { 752 {
753 int prio = rt_se_prio(rt_se); 753 int prio = rt_se_prio(rt_se);
754 754
755 WARN_ON(!rt_prio(prio)); 755 WARN_ON(!rt_prio(prio));
756 rt_rq->rt_nr_running++; 756 rt_rq->rt_nr_running++;
757 757
758 inc_rt_prio(rt_rq, prio); 758 inc_rt_prio(rt_rq, prio);
759 inc_rt_migration(rt_se, rt_rq); 759 inc_rt_migration(rt_se, rt_rq);
760 inc_rt_group(rt_se, rt_rq); 760 inc_rt_group(rt_se, rt_rq);
761 } 761 }
762 762
763 static inline 763 static inline
764 void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 764 void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
765 { 765 {
766 WARN_ON(!rt_prio(rt_se_prio(rt_se))); 766 WARN_ON(!rt_prio(rt_se_prio(rt_se)));
767 WARN_ON(!rt_rq->rt_nr_running); 767 WARN_ON(!rt_rq->rt_nr_running);
768 rt_rq->rt_nr_running--; 768 rt_rq->rt_nr_running--;
769 769
770 dec_rt_prio(rt_rq, rt_se_prio(rt_se)); 770 dec_rt_prio(rt_rq, rt_se_prio(rt_se));
771 dec_rt_migration(rt_se, rt_rq); 771 dec_rt_migration(rt_se, rt_rq);
772 dec_rt_group(rt_se, rt_rq); 772 dec_rt_group(rt_se, rt_rq);
773 } 773 }
774 774
775 static void __enqueue_rt_entity(struct sched_rt_entity *rt_se) 775 static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
776 { 776 {
777 struct rt_rq *rt_rq = rt_rq_of_se(rt_se); 777 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
778 struct rt_prio_array *array = &rt_rq->active; 778 struct rt_prio_array *array = &rt_rq->active;
779 struct rt_rq *group_rq = group_rt_rq(rt_se); 779 struct rt_rq *group_rq = group_rt_rq(rt_se);
780 struct list_head *queue = array->queue + rt_se_prio(rt_se); 780 struct list_head *queue = array->queue + rt_se_prio(rt_se);
781 781
782 /* 782 /*
783 * Don't enqueue the group if it's throttled, or when empty. 783 * Don't enqueue the group if it's throttled, or when empty.
784 * The latter is a consequence of the former when a child group 784 * The latter is a consequence of the former when a child group
785 * gets throttled and the current group doesn't have any other 785 * gets throttled and the current group doesn't have any other
786 * active members. 786 * active members.
787 */ 787 */
788 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) 788 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
789 return; 789 return;
790 790
791 list_add_tail(&rt_se->run_list, queue); 791 list_add_tail(&rt_se->run_list, queue);
792 __set_bit(rt_se_prio(rt_se), array->bitmap); 792 __set_bit(rt_se_prio(rt_se), array->bitmap);
793 793
794 inc_rt_tasks(rt_se, rt_rq); 794 inc_rt_tasks(rt_se, rt_rq);
795 } 795 }
796 796
797 static void __dequeue_rt_entity(struct sched_rt_entity *rt_se) 797 static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
798 { 798 {
799 struct rt_rq *rt_rq = rt_rq_of_se(rt_se); 799 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
800 struct rt_prio_array *array = &rt_rq->active; 800 struct rt_prio_array *array = &rt_rq->active;
801 801
802 list_del_init(&rt_se->run_list); 802 list_del_init(&rt_se->run_list);
803 if (list_empty(array->queue + rt_se_prio(rt_se))) 803 if (list_empty(array->queue + rt_se_prio(rt_se)))
804 __clear_bit(rt_se_prio(rt_se), array->bitmap); 804 __clear_bit(rt_se_prio(rt_se), array->bitmap);
805 805
806 dec_rt_tasks(rt_se, rt_rq); 806 dec_rt_tasks(rt_se, rt_rq);
807 } 807 }
808 808
809 /* 809 /*
810 * Because the prio of an upper entry depends on the lower 810 * Because the prio of an upper entry depends on the lower
811 * entries, we must remove entries top-down. 811 * entries, we must remove entries top-down.
812 */ 812 */
813 static void dequeue_rt_stack(struct sched_rt_entity *rt_se) 813 static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
814 { 814 {
815 struct sched_rt_entity *back = NULL; 815 struct sched_rt_entity *back = NULL;
816 816
817 for_each_sched_rt_entity(rt_se) { 817 for_each_sched_rt_entity(rt_se) {
818 rt_se->back = back; 818 rt_se->back = back;
819 back = rt_se; 819 back = rt_se;
820 } 820 }
821 821
822 for (rt_se = back; rt_se; rt_se = rt_se->back) { 822 for (rt_se = back; rt_se; rt_se = rt_se->back) {
823 if (on_rt_rq(rt_se)) 823 if (on_rt_rq(rt_se))
824 __dequeue_rt_entity(rt_se); 824 __dequeue_rt_entity(rt_se);
825 } 825 }
826 } 826 }
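A hedged illustration of the back-pointer walk above for a two-level hierarchy (hypothetical task p inside group A):

/*
 * Hierarchy: p's rt_se is queued on A's rt_rq, A's rt_se on the root rt_rq.
 *
 *	first loop:   p->rt.back = NULL, then A's rt_se->back = &p->rt;
 *	              'back' ends up pointing at the topmost entity (A)
 *	second loop:  dequeue A's rt_se first, then p's rt_se
 *
 * Tearing down top-down matters because an upper entity's priority is
 * derived from whatever is still queued below it.
 */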
827 827
828 static void enqueue_rt_entity(struct sched_rt_entity *rt_se) 828 static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
829 { 829 {
830 dequeue_rt_stack(rt_se); 830 dequeue_rt_stack(rt_se);
831 for_each_sched_rt_entity(rt_se) 831 for_each_sched_rt_entity(rt_se)
832 __enqueue_rt_entity(rt_se); 832 __enqueue_rt_entity(rt_se);
833 } 833 }
834 834
835 static void dequeue_rt_entity(struct sched_rt_entity *rt_se) 835 static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
836 { 836 {
837 dequeue_rt_stack(rt_se); 837 dequeue_rt_stack(rt_se);
838 838
839 for_each_sched_rt_entity(rt_se) { 839 for_each_sched_rt_entity(rt_se) {
840 struct rt_rq *rt_rq = group_rt_rq(rt_se); 840 struct rt_rq *rt_rq = group_rt_rq(rt_se);
841 841
842 if (rt_rq && rt_rq->rt_nr_running) 842 if (rt_rq && rt_rq->rt_nr_running)
843 __enqueue_rt_entity(rt_se); 843 __enqueue_rt_entity(rt_se);
844 } 844 }
845 } 845 }
846 846
847 /* 847 /*
848 * Adding/removing a task to/from a priority array: 848 * Adding/removing a task to/from a priority array:
849 */ 849 */
850 static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) 850 static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
851 { 851 {
852 struct sched_rt_entity *rt_se = &p->rt; 852 struct sched_rt_entity *rt_se = &p->rt;
853 853
854 if (wakeup) 854 if (wakeup)
855 rt_se->timeout = 0; 855 rt_se->timeout = 0;
856 856
857 enqueue_rt_entity(rt_se); 857 enqueue_rt_entity(rt_se);
858 858
859 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) 859 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
860 enqueue_pushable_task(rq, p); 860 enqueue_pushable_task(rq, p);
861 861
862 inc_cpu_load(rq, p->se.load.weight); 862 inc_cpu_load(rq, p->se.load.weight);
863 } 863 }
864 864
865 static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) 865 static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
866 { 866 {
867 struct sched_rt_entity *rt_se = &p->rt; 867 struct sched_rt_entity *rt_se = &p->rt;
868 868
869 update_curr_rt(rq); 869 update_curr_rt(rq);
870 dequeue_rt_entity(rt_se); 870 dequeue_rt_entity(rt_se);
871 871
872 dequeue_pushable_task(rq, p); 872 dequeue_pushable_task(rq, p);
873 873
874 dec_cpu_load(rq, p->se.load.weight); 874 dec_cpu_load(rq, p->se.load.weight);
875 } 875 }
876 876
877 /* 877 /*
878 * Put task to the end of the run list without the overhead of dequeue 878 * Put task to the end of the run list without the overhead of dequeue
879 * followed by enqueue. 879 * followed by enqueue.
880 */ 880 */
881 static void 881 static void
882 requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head) 882 requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
883 { 883 {
884 if (on_rt_rq(rt_se)) { 884 if (on_rt_rq(rt_se)) {
885 struct rt_prio_array *array = &rt_rq->active; 885 struct rt_prio_array *array = &rt_rq->active;
886 struct list_head *queue = array->queue + rt_se_prio(rt_se); 886 struct list_head *queue = array->queue + rt_se_prio(rt_se);
887 887
888 if (head) 888 if (head)
889 list_move(&rt_se->run_list, queue); 889 list_move(&rt_se->run_list, queue);
890 else 890 else
891 list_move_tail(&rt_se->run_list, queue); 891 list_move_tail(&rt_se->run_list, queue);
892 } 892 }
893 } 893 }
894 894
895 static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head) 895 static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
896 { 896 {
897 struct sched_rt_entity *rt_se = &p->rt; 897 struct sched_rt_entity *rt_se = &p->rt;
898 struct rt_rq *rt_rq; 898 struct rt_rq *rt_rq;
899 899
900 for_each_sched_rt_entity(rt_se) { 900 for_each_sched_rt_entity(rt_se) {
901 rt_rq = rt_rq_of_se(rt_se); 901 rt_rq = rt_rq_of_se(rt_se);
902 requeue_rt_entity(rt_rq, rt_se, head); 902 requeue_rt_entity(rt_rq, rt_se, head);
903 } 903 }
904 } 904 }
905 905
906 static void yield_task_rt(struct rq *rq) 906 static void yield_task_rt(struct rq *rq)
907 { 907 {
908 requeue_task_rt(rq, rq->curr, 0); 908 requeue_task_rt(rq, rq->curr, 0);
909 } 909 }
910 910
911 #ifdef CONFIG_SMP 911 #ifdef CONFIG_SMP
912 static int find_lowest_rq(struct task_struct *task); 912 static int find_lowest_rq(struct task_struct *task);
913 913
914 static int select_task_rq_rt(struct task_struct *p, int sync) 914 static int select_task_rq_rt(struct task_struct *p, int sync)
915 { 915 {
916 struct rq *rq = task_rq(p); 916 struct rq *rq = task_rq(p);
917 917
918 /* 918 /*
919 * If the current task is an RT task, then 919 * If the current task is an RT task, then
920 * try to see if we can wake this RT task up on another 920 * try to see if we can wake this RT task up on another
921 * runqueue. Otherwise simply start this RT task 921 * runqueue. Otherwise simply start this RT task
922 * on its current runqueue. 922 * on its current runqueue.
923 * 923 *
924 * We want to avoid overloading runqueues, even if 924 * We want to avoid overloading runqueues, even if
925 * the RT task is of higher priority than the current RT task. 925 * the RT task is of higher priority than the current RT task.
926 * RT tasks behave differently than other tasks. If 926 * RT tasks behave differently than other tasks. If
927 * one gets preempted, we try to push it off to another queue. 927 * one gets preempted, we try to push it off to another queue.
928 * So trying to keep a preempting RT task on the same 928 * So trying to keep a preempting RT task on the same
929 * cache hot CPU will force the running RT task to 929 * cache hot CPU will force the running RT task to
930 * a cold CPU. So we waste all the cache for the lower 930 * a cold CPU. So we waste all the cache for the lower
931 * RT task in hopes of saving some of an RT task 931 * RT task in hopes of saving some of an RT task
932 * that is just being woken and probably will have 932 * that is just being woken and probably will have
933 * cold cache anyway. 933 * cold cache anyway.
934 */ 934 */
935 if (unlikely(rt_task(rq->curr)) && 935 if (unlikely(rt_task(rq->curr)) &&
936 (p->rt.nr_cpus_allowed > 1)) { 936 (p->rt.nr_cpus_allowed > 1)) {
937 int cpu = find_lowest_rq(p); 937 int cpu = find_lowest_rq(p);
938 938
939 return (cpu == -1) ? task_cpu(p) : cpu; 939 return (cpu == -1) ? task_cpu(p) : cpu;
940 } 940 }
941 941
942 /* 942 /*
943 * Otherwise, just let it ride on the affined RQ and the 943 * Otherwise, just let it ride on the affined RQ and the
944 * post-schedule router will push the preempted task away 944 * post-schedule router will push the preempted task away
945 */ 945 */
946 return task_cpu(p); 946 return task_cpu(p);
947 } 947 }
948 948
949 static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) 949 static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
950 { 950 {
951 if (rq->curr->rt.nr_cpus_allowed == 1) 951 if (rq->curr->rt.nr_cpus_allowed == 1)
952 return; 952 return;
953 953
954 if (p->rt.nr_cpus_allowed != 1 954 if (p->rt.nr_cpus_allowed != 1
955 && cpupri_find(&rq->rd->cpupri, p, NULL)) 955 && cpupri_find(&rq->rd->cpupri, p, NULL))
956 return; 956 return;
957 957
958 if (!cpupri_find(&rq->rd->cpupri, rq->curr, NULL)) 958 if (!cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
959 return; 959 return;
960 960
961 /* 961 /*
962 * There appear to be other cpus that can accept 962 * There appear to be other cpus that can accept
963 * current and none to run 'p', so let's reschedule 963 * current and none to run 'p', so let's reschedule
964 * to try and push current away: 964 * to try and push current away:
965 */ 965 */
966 requeue_task_rt(rq, p, 1); 966 requeue_task_rt(rq, p, 1);
967 resched_task(rq->curr); 967 resched_task(rq->curr);
968 } 968 }
969 969
970 #endif /* CONFIG_SMP */ 970 #endif /* CONFIG_SMP */
971 971
972 /* 972 /*
973 * Preempt the current task with a newly woken task if needed: 973 * Preempt the current task with a newly woken task if needed:
974 */ 974 */
975 static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int sync) 975 static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int sync)
976 { 976 {
977 if (p->prio < rq->curr->prio) { 977 if (p->prio < rq->curr->prio) {
978 resched_task(rq->curr); 978 resched_task(rq->curr);
979 return; 979 return;
980 } 980 }
981 981
982 #ifdef CONFIG_SMP 982 #ifdef CONFIG_SMP
983 /* 983 /*
984 * If: 984 * If:
985 * 985 *
986 * - the newly woken task is of equal priority to the current task 986 * - the newly woken task is of equal priority to the current task
987 * - the newly woken task is non-migratable while current is migratable 987 * - the newly woken task is non-migratable while current is migratable
988 * - current will be preempted on the next reschedule 988 * - current will be preempted on the next reschedule
989 * 989 *
990 * we should check to see if current can readily move to a different 990 * we should check to see if current can readily move to a different
991 * cpu. If so, we will reschedule to allow the push logic to try 991 * cpu. If so, we will reschedule to allow the push logic to try
992 * to move current somewhere else, making room for our non-migratable 992 * to move current somewhere else, making room for our non-migratable
993 * task. 993 * task.
994 */ 994 */
995 if (p->prio == rq->curr->prio && !need_resched()) 995 if (p->prio == rq->curr->prio && !need_resched())
996 check_preempt_equal_prio(rq, p); 996 check_preempt_equal_prio(rq, p);
997 #endif 997 #endif
998 } 998 }
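A hedged scenario for the equal-priority case handled above:

/*
 * Hypothetical situation on CPU 0:
 *
 *	curr: prio 10, may run on CPUs 0-3, and cpupri says some other CPU
 *	      is currently running something of lower priority
 *	p:    prio 10, pinned to CPU 0
 *
 * p cannot go anywhere else but curr can, so check_preempt_equal_prio()
 * requeues p at the head of its list and reschedules; the push logic can
 * then move curr away and p gets CPU 0 without waiting out a full slice.
 */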
999 999
1000 static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, 1000 static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
1001 struct rt_rq *rt_rq) 1001 struct rt_rq *rt_rq)
1002 { 1002 {
1003 struct rt_prio_array *array = &rt_rq->active; 1003 struct rt_prio_array *array = &rt_rq->active;
1004 struct sched_rt_entity *next = NULL; 1004 struct sched_rt_entity *next = NULL;
1005 struct list_head *queue; 1005 struct list_head *queue;
1006 int idx; 1006 int idx;
1007 1007
1008 idx = sched_find_first_bit(array->bitmap); 1008 idx = sched_find_first_bit(array->bitmap);
1009 BUG_ON(idx >= MAX_RT_PRIO); 1009 BUG_ON(idx >= MAX_RT_PRIO);
1010 1010
1011 queue = array->queue + idx; 1011 queue = array->queue + idx;
1012 next = list_entry(queue->next, struct sched_rt_entity, run_list); 1012 next = list_entry(queue->next, struct sched_rt_entity, run_list);
1013 1013
1014 return next; 1014 return next;
1015 } 1015 }
1016 1016
1017 static struct task_struct *_pick_next_task_rt(struct rq *rq) 1017 static struct task_struct *_pick_next_task_rt(struct rq *rq)
1018 { 1018 {
1019 struct sched_rt_entity *rt_se; 1019 struct sched_rt_entity *rt_se;
1020 struct task_struct *p; 1020 struct task_struct *p;
1021 struct rt_rq *rt_rq; 1021 struct rt_rq *rt_rq;
1022 1022
1023 rt_rq = &rq->rt; 1023 rt_rq = &rq->rt;
1024 1024
1025 if (unlikely(!rt_rq->rt_nr_running)) 1025 if (unlikely(!rt_rq->rt_nr_running))
1026 return NULL; 1026 return NULL;
1027 1027
1028 if (rt_rq_throttled(rt_rq)) 1028 if (rt_rq_throttled(rt_rq))
1029 return NULL; 1029 return NULL;
1030 1030
1031 do { 1031 do {
1032 rt_se = pick_next_rt_entity(rq, rt_rq); 1032 rt_se = pick_next_rt_entity(rq, rt_rq);
1033 BUG_ON(!rt_se); 1033 BUG_ON(!rt_se);
1034 rt_rq = group_rt_rq(rt_se); 1034 rt_rq = group_rt_rq(rt_se);
1035 } while (rt_rq); 1035 } while (rt_rq);
1036 1036
1037 p = rt_task_of(rt_se); 1037 p = rt_task_of(rt_se);
1038 p->se.exec_start = rq->clock; 1038 p->se.exec_start = rq->clock;
1039 1039
1040 return p; 1040 return p;
1041 } 1041 }
1042 1042
1043 static struct task_struct *pick_next_task_rt(struct rq *rq) 1043 static struct task_struct *pick_next_task_rt(struct rq *rq)
1044 { 1044 {
1045 struct task_struct *p = _pick_next_task_rt(rq); 1045 struct task_struct *p = _pick_next_task_rt(rq);
1046 1046
1047 /* The running task is never eligible for pushing */ 1047 /* The running task is never eligible for pushing */
1048 if (p) 1048 if (p)
1049 dequeue_pushable_task(rq, p); 1049 dequeue_pushable_task(rq, p);
1050 1050
1051 return p; 1051 return p;
1052 } 1052 }
1053 1053
1054 static void put_prev_task_rt(struct rq *rq, struct task_struct *p) 1054 static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1055 { 1055 {
1056 update_curr_rt(rq); 1056 update_curr_rt(rq);
1057 p->se.exec_start = 0; 1057 p->se.exec_start = 0;
1058 1058
1059 /* 1059 /*
1060 * The previous task needs to be made eligible for pushing 1060 * The previous task needs to be made eligible for pushing
1061 * if it is still active 1061 * if it is still active
1062 */ 1062 */
1063 if (p->se.on_rq && p->rt.nr_cpus_allowed > 1) 1063 if (p->se.on_rq && p->rt.nr_cpus_allowed > 1)
1064 enqueue_pushable_task(rq, p); 1064 enqueue_pushable_task(rq, p);
1065 } 1065 }
1066 1066
1067 #ifdef CONFIG_SMP 1067 #ifdef CONFIG_SMP
1068 1068
1069 /* Only try algorithms three times */ 1069 /* Only try algorithms three times */
1070 #define RT_MAX_TRIES 3 1070 #define RT_MAX_TRIES 3
1071 1071
1072 static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep); 1072 static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
1073 1073
1074 static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) 1074 static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1075 { 1075 {
1076 if (!task_running(rq, p) && 1076 if (!task_running(rq, p) &&
1077 (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) && 1077 (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) &&
1078 (p->rt.nr_cpus_allowed > 1)) 1078 (p->rt.nr_cpus_allowed > 1))
1079 return 1; 1079 return 1;
1080 return 0; 1080 return 0;
1081 } 1081 }
1082 1082
1083 /* Return the second highest RT task, NULL otherwise */ 1083 /* Return the second highest RT task, NULL otherwise */
1084 static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) 1084 static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
1085 { 1085 {
1086 struct task_struct *next = NULL; 1086 struct task_struct *next = NULL;
1087 struct sched_rt_entity *rt_se; 1087 struct sched_rt_entity *rt_se;
1088 struct rt_prio_array *array; 1088 struct rt_prio_array *array;
1089 struct rt_rq *rt_rq; 1089 struct rt_rq *rt_rq;
1090 int idx; 1090 int idx;
1091 1091
1092 for_each_leaf_rt_rq(rt_rq, rq) { 1092 for_each_leaf_rt_rq(rt_rq, rq) {
1093 array = &rt_rq->active; 1093 array = &rt_rq->active;
1094 idx = sched_find_first_bit(array->bitmap); 1094 idx = sched_find_first_bit(array->bitmap);
1095 next_idx: 1095 next_idx:
1096 if (idx >= MAX_RT_PRIO) 1096 if (idx >= MAX_RT_PRIO)
1097 continue; 1097 continue;
1098 if (next && next->prio < idx) 1098 if (next && next->prio < idx)
1099 continue; 1099 continue;
1100 list_for_each_entry(rt_se, array->queue + idx, run_list) { 1100 list_for_each_entry(rt_se, array->queue + idx, run_list) {
1101 struct task_struct *p = rt_task_of(rt_se); 1101 struct task_struct *p = rt_task_of(rt_se);
1102 if (pick_rt_task(rq, p, cpu)) { 1102 if (pick_rt_task(rq, p, cpu)) {
1103 next = p; 1103 next = p;
1104 break; 1104 break;
1105 } 1105 }
1106 } 1106 }
1107 if (!next) { 1107 if (!next) {
1108 idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1); 1108 idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
1109 goto next_idx; 1109 goto next_idx;
1110 } 1110 }
1111 } 1111 }
1112 1112
1113 return next; 1113 return next;
1114 } 1114 }
1115 1115
1116 static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask); 1116 static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
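local_cpu_mask is the per-cpu scratch mask consumed by find_lowest_rq() below; with an off-stack cpumask configuration it is only a pointer until backing storage is allocated. That allocation is not part of this hunk, so the following is a hedged sketch of a typical init-time loop; the function name is made up and the in-tree code may well use a NUMA-node-aware allocation variant:

static __init void alloc_local_cpu_masks_sketch(void)
{
	int i;

	for_each_possible_cpu(i)
		zalloc_cpumask_var(&per_cpu(local_cpu_mask, i), GFP_KERNEL);
}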
1117 1117
1118 static inline int pick_optimal_cpu(int this_cpu, 1118 static inline int pick_optimal_cpu(int this_cpu,
1119 const struct cpumask *mask) 1119 const struct cpumask *mask)
1120 { 1120 {
1121 int first; 1121 int first;
1122 1122
1123 /* "this_cpu" is cheaper to preempt than a remote processor */ 1123 /* "this_cpu" is cheaper to preempt than a remote processor */
1124 if ((this_cpu != -1) && cpumask_test_cpu(this_cpu, mask)) 1124 if ((this_cpu != -1) && cpumask_test_cpu(this_cpu, mask))
1125 return this_cpu; 1125 return this_cpu;
1126 1126
1127 first = cpumask_first(mask); 1127 first = cpumask_first(mask);
1128 if (first < nr_cpu_ids) 1128 if (first < nr_cpu_ids)
1129 return first; 1129 return first;
1130 1130
1131 return -1; 1131 return -1;
1132 } 1132 }
1133 1133
1134 static int find_lowest_rq(struct task_struct *task) 1134 static int find_lowest_rq(struct task_struct *task)
1135 { 1135 {
1136 struct sched_domain *sd; 1136 struct sched_domain *sd;
1137 struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask); 1137 struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask);
1138 int this_cpu = smp_processor_id(); 1138 int this_cpu = smp_processor_id();
1139 int cpu = task_cpu(task); 1139 int cpu = task_cpu(task);
1140 cpumask_var_t domain_mask; 1140 cpumask_var_t domain_mask;
1141 1141
1142 if (task->rt.nr_cpus_allowed == 1) 1142 if (task->rt.nr_cpus_allowed == 1)
1143 return -1; /* No other targets possible */ 1143 return -1; /* No other targets possible */
1144 1144
1145 if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) 1145 if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
1146 return -1; /* No targets found */ 1146 return -1; /* No targets found */
1147 1147
1148 /* 1148 /*
1149 * Only consider CPUs that are usable for migration. 1149 * Only consider CPUs that are usable for migration.
1150 * I guess we might want to change cpupri_find() to ignore those 1150 * I guess we might want to change cpupri_find() to ignore those
1151 * in the first place. 1151 * in the first place.
1152 */ 1152 */
1153 cpumask_and(lowest_mask, lowest_mask, cpu_active_mask); 1153 cpumask_and(lowest_mask, lowest_mask, cpu_active_mask);
1154 1154
1155 /* 1155 /*
1156 * At this point we have built a mask of cpus representing the 1156 * At this point we have built a mask of cpus representing the
1157 * lowest priority tasks in the system. Now we want to elect 1157 * lowest priority tasks in the system. Now we want to elect
1158 * the best one based on our affinity and topology. 1158 * the best one based on our affinity and topology.
1159 * 1159 *
1160 * We prioritize the last cpu that the task executed on since 1160 * We prioritize the last cpu that the task executed on since
1161 * it is most likely cache-hot in that location. 1161 * it is most likely cache-hot in that location.
1162 */ 1162 */
1163 if (cpumask_test_cpu(cpu, lowest_mask)) 1163 if (cpumask_test_cpu(cpu, lowest_mask))
1164 return cpu; 1164 return cpu;
1165 1165
1166 /* 1166 /*
1167 * Otherwise, we consult the sched_domains span maps to figure 1167 * Otherwise, we consult the sched_domains span maps to figure
1168 * out which cpu is logically closest to our hot cache data. 1168 * out which cpu is logically closest to our hot cache data.
1169 */ 1169 */
1170 if (this_cpu == cpu) 1170 if (this_cpu == cpu)
1171 this_cpu = -1; /* Skip this_cpu opt if the same */ 1171 this_cpu = -1; /* Skip this_cpu opt if the same */
1172 1172
1173 if (alloc_cpumask_var(&domain_mask, GFP_ATOMIC)) { 1173 if (alloc_cpumask_var(&domain_mask, GFP_ATOMIC)) {
1174 for_each_domain(cpu, sd) { 1174 for_each_domain(cpu, sd) {
1175 if (sd->flags & SD_WAKE_AFFINE) { 1175 if (sd->flags & SD_WAKE_AFFINE) {
1176 int best_cpu; 1176 int best_cpu;
1177 1177
1178 cpumask_and(domain_mask, 1178 cpumask_and(domain_mask,
1179 sched_domain_span(sd), 1179 sched_domain_span(sd),
1180 lowest_mask); 1180 lowest_mask);
1181 1181
1182 best_cpu = pick_optimal_cpu(this_cpu, 1182 best_cpu = pick_optimal_cpu(this_cpu,
1183 domain_mask); 1183 domain_mask);
1184 1184
1185 if (best_cpu != -1) { 1185 if (best_cpu != -1) {
1186 free_cpumask_var(domain_mask); 1186 free_cpumask_var(domain_mask);
1187 return best_cpu; 1187 return best_cpu;
1188 } 1188 }
1189 } 1189 }
1190 } 1190 }
1191 free_cpumask_var(domain_mask); 1191 free_cpumask_var(domain_mask);
1192 } 1192 }
1193 1193
1194 /* 1194 /*
1195 * And finally, if there were no matches within the domains 1195 * And finally, if there were no matches within the domains
1196 * just give the caller *something* to work with from the compatible 1196 * just give the caller *something* to work with from the compatible
1197 * locations. 1197 * locations.
1198 */ 1198 */
1199 return pick_optimal_cpu(this_cpu, lowest_mask); 1199 return pick_optimal_cpu(this_cpu, lowest_mask);
1200 } 1200 }
1201 1201
1202 /* Will lock the rq it finds */ 1202 /* Will lock the rq it finds */
1203 static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) 1203 static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1204 { 1204 {
1205 struct rq *lowest_rq = NULL; 1205 struct rq *lowest_rq = NULL;
1206 int tries; 1206 int tries;
1207 int cpu; 1207 int cpu;
1208 1208
1209 for (tries = 0; tries < RT_MAX_TRIES; tries++) { 1209 for (tries = 0; tries < RT_MAX_TRIES; tries++) {
1210 cpu = find_lowest_rq(task); 1210 cpu = find_lowest_rq(task);
1211 1211
1212 if ((cpu == -1) || (cpu == rq->cpu)) 1212 if ((cpu == -1) || (cpu == rq->cpu))
1213 break; 1213 break;
1214 1214
1215 lowest_rq = cpu_rq(cpu); 1215 lowest_rq = cpu_rq(cpu);
1216 1216
1217 /* if the prio of this runqueue changed, try again */ 1217 /* if the prio of this runqueue changed, try again */
1218 if (double_lock_balance(rq, lowest_rq)) { 1218 if (double_lock_balance(rq, lowest_rq)) {
1219 /* 1219 /*
1220 * We had to unlock the run queue. In 1220 * We had to unlock the run queue. In
1221 * the mean time, task could have 1221 * the mean time, task could have
1222 * migrated already or had its affinity changed. 1222 * migrated already or had its affinity changed.
1223 * Also make sure that it wasn't scheduled on its rq. 1223 * Also make sure that it wasn't scheduled on its rq.
1224 */ 1224 */
1225 if (unlikely(task_rq(task) != rq || 1225 if (unlikely(task_rq(task) != rq ||
1226 !cpumask_test_cpu(lowest_rq->cpu, 1226 !cpumask_test_cpu(lowest_rq->cpu,
1227 &task->cpus_allowed) || 1227 &task->cpus_allowed) ||
1228 task_running(rq, task) || 1228 task_running(rq, task) ||
1229 !task->se.on_rq)) { 1229 !task->se.on_rq)) {
1230 1230
1231 spin_unlock(&lowest_rq->lock); 1231 spin_unlock(&lowest_rq->lock);
1232 lowest_rq = NULL; 1232 lowest_rq = NULL;
1233 break; 1233 break;
1234 } 1234 }
1235 } 1235 }
1236 1236
1237 /* If this rq is still suitable use it. */ 1237 /* If this rq is still suitable use it. */
1238 if (lowest_rq->rt.highest_prio.curr > task->prio) 1238 if (lowest_rq->rt.highest_prio.curr > task->prio)
1239 break; 1239 break;
1240 1240
1241 /* try again */ 1241 /* try again */
1242 double_unlock_balance(rq, lowest_rq); 1242 double_unlock_balance(rq, lowest_rq);
1243 lowest_rq = NULL; 1243 lowest_rq = NULL;
1244 } 1244 }
1245 1245
1246 return lowest_rq; 1246 return lowest_rq;
1247 } 1247 }
1248 1248
1249 static inline int has_pushable_tasks(struct rq *rq) 1249 static inline int has_pushable_tasks(struct rq *rq)
1250 { 1250 {
1251 return !plist_head_empty(&rq->rt.pushable_tasks); 1251 return !plist_head_empty(&rq->rt.pushable_tasks);
1252 } 1252 }
1253 1253
1254 static struct task_struct *pick_next_pushable_task(struct rq *rq) 1254 static struct task_struct *pick_next_pushable_task(struct rq *rq)
1255 { 1255 {
1256 struct task_struct *p; 1256 struct task_struct *p;
1257 1257
1258 if (!has_pushable_tasks(rq)) 1258 if (!has_pushable_tasks(rq))
1259 return NULL; 1259 return NULL;
1260 1260
1261 p = plist_first_entry(&rq->rt.pushable_tasks, 1261 p = plist_first_entry(&rq->rt.pushable_tasks,
1262 struct task_struct, pushable_tasks); 1262 struct task_struct, pushable_tasks);
1263 1263
1264 BUG_ON(rq->cpu != task_cpu(p)); 1264 BUG_ON(rq->cpu != task_cpu(p));
1265 BUG_ON(task_current(rq, p)); 1265 BUG_ON(task_current(rq, p));
1266 BUG_ON(p->rt.nr_cpus_allowed <= 1); 1266 BUG_ON(p->rt.nr_cpus_allowed <= 1);
1267 1267
1268 BUG_ON(!p->se.on_rq); 1268 BUG_ON(!p->se.on_rq);
1269 BUG_ON(!rt_task(p)); 1269 BUG_ON(!rt_task(p));
1270 1270
1271 return p; 1271 return p;
1272 } 1272 }
1273 1273
1274 /* 1274 /*
1275 * If the current CPU has more than one RT task, see if the non 1275 * If the current CPU has more than one RT task, see if the non
1276 * running task can migrate over to a CPU that is running a task 1276 * running task can migrate over to a CPU that is running a task
1277 * of lesser priority. 1277 * of lesser priority.
1278 */ 1278 */
1279 static int push_rt_task(struct rq *rq) 1279 static int push_rt_task(struct rq *rq)
1280 { 1280 {
1281 struct task_struct *next_task; 1281 struct task_struct *next_task;
1282 struct rq *lowest_rq; 1282 struct rq *lowest_rq;
1283 1283
1284 if (!rq->rt.overloaded) 1284 if (!rq->rt.overloaded)
1285 return 0; 1285 return 0;
1286 1286
1287 next_task = pick_next_pushable_task(rq); 1287 next_task = pick_next_pushable_task(rq);
1288 if (!next_task) 1288 if (!next_task)
1289 return 0; 1289 return 0;
1290 1290
1291 retry: 1291 retry:
1292 if (unlikely(next_task == rq->curr)) { 1292 if (unlikely(next_task == rq->curr)) {
1293 WARN_ON(1); 1293 WARN_ON(1);
1294 return 0; 1294 return 0;
1295 } 1295 }
1296 1296
1297 /* 1297 /*
1298 * It's possible that the next_task slipped in with a 1298 * It's possible that the next_task slipped in with a
1299 * higher priority than current. If that's the case 1299 * higher priority than current. If that's the case
1300 * just reschedule current. 1300 * just reschedule current.
1301 */ 1301 */
1302 if (unlikely(next_task->prio < rq->curr->prio)) { 1302 if (unlikely(next_task->prio < rq->curr->prio)) {
1303 resched_task(rq->curr); 1303 resched_task(rq->curr);
1304 return 0; 1304 return 0;
1305 } 1305 }
1306 1306
1307 /* We might release rq lock */ 1307 /* We might release rq lock */
1308 get_task_struct(next_task); 1308 get_task_struct(next_task);
1309 1309
1310 /* find_lock_lowest_rq locks the rq if found */ 1310 /* find_lock_lowest_rq locks the rq if found */
1311 lowest_rq = find_lock_lowest_rq(next_task, rq); 1311 lowest_rq = find_lock_lowest_rq(next_task, rq);
1312 if (!lowest_rq) { 1312 if (!lowest_rq) {
1313 struct task_struct *task; 1313 struct task_struct *task;
1314 /* 1314 /*
1315 * find_lock_lowest_rq releases rq->lock 1315 * find_lock_lowest_rq releases rq->lock
1316 * so it is possible that next_task has migrated. 1316 * so it is possible that next_task has migrated.
1317 * 1317 *
1318 * We need to make sure that the task is still on the same 1318 * We need to make sure that the task is still on the same
1319 * run-queue and is also still the next task eligible for 1319 * run-queue and is also still the next task eligible for
1320 * pushing. 1320 * pushing.
1321 */ 1321 */
1322 task = pick_next_pushable_task(rq); 1322 task = pick_next_pushable_task(rq);
1323 if (task_cpu(next_task) == rq->cpu && task == next_task) { 1323 if (task_cpu(next_task) == rq->cpu && task == next_task) {
1324 /* 1324 /*
1325 * If we get here, the task hasn't moved at all, but 1325 * If we get here, the task hasn't moved at all, but
1326 * it has failed to push. We will not try again, 1326 * it has failed to push. We will not try again,
1327 * since the other cpus will pull from us when they 1327 * since the other cpus will pull from us when they
1328 * are ready. 1328 * are ready.
1329 */ 1329 */
1330 dequeue_pushable_task(rq, next_task); 1330 dequeue_pushable_task(rq, next_task);
1331 goto out; 1331 goto out;
1332 } 1332 }
1333 1333
1334 if (!task) 1334 if (!task)
1335 /* No more tasks, just exit */ 1335 /* No more tasks, just exit */
1336 goto out; 1336 goto out;
1337 1337
1338 /* 1338 /*
1339 * Something has shifted, try again. 1339 * Something has shifted, try again.
1340 */ 1340 */
1341 put_task_struct(next_task); 1341 put_task_struct(next_task);
1342 next_task = task; 1342 next_task = task;
1343 goto retry; 1343 goto retry;
1344 } 1344 }
1345 1345
1346 deactivate_task(rq, next_task, 0); 1346 deactivate_task(rq, next_task, 0);
1347 set_task_cpu(next_task, lowest_rq->cpu); 1347 set_task_cpu(next_task, lowest_rq->cpu);
1348 activate_task(lowest_rq, next_task, 0); 1348 activate_task(lowest_rq, next_task, 0);
1349 1349
1350 resched_task(lowest_rq->curr); 1350 resched_task(lowest_rq->curr);
1351 1351
1352 double_unlock_balance(rq, lowest_rq); 1352 double_unlock_balance(rq, lowest_rq);
1353 1353
1354 out: 1354 out:
1355 put_task_struct(next_task); 1355 put_task_struct(next_task);
1356 1356
1357 return 1; 1357 return 1;
1358 } 1358 }
1359 1359
1360 static void push_rt_tasks(struct rq *rq) 1360 static void push_rt_tasks(struct rq *rq)
1361 { 1361 {
1362 /* push_rt_task will return true if it moved an RT */ 1362 /* push_rt_task will return true if it moved an RT */
1363 while (push_rt_task(rq)) 1363 while (push_rt_task(rq))
1364 ; 1364 ;
1365 } 1365 }
1366 1366
1367 static int pull_rt_task(struct rq *this_rq) 1367 static int pull_rt_task(struct rq *this_rq)
1368 { 1368 {
1369 int this_cpu = this_rq->cpu, ret = 0, cpu; 1369 int this_cpu = this_rq->cpu, ret = 0, cpu;
1370 struct task_struct *p; 1370 struct task_struct *p;
1371 struct rq *src_rq; 1371 struct rq *src_rq;
1372 1372
1373 if (likely(!rt_overloaded(this_rq))) 1373 if (likely(!rt_overloaded(this_rq)))
1374 return 0; 1374 return 0;
1375 1375
1376 for_each_cpu(cpu, this_rq->rd->rto_mask) { 1376 for_each_cpu(cpu, this_rq->rd->rto_mask) {
1377 if (this_cpu == cpu) 1377 if (this_cpu == cpu)
1378 continue; 1378 continue;
1379 1379
1380 src_rq = cpu_rq(cpu); 1380 src_rq = cpu_rq(cpu);
1381 1381
1382 /* 1382 /*
1383 * Don't bother taking the src_rq->lock if the next highest 1383 * Don't bother taking the src_rq->lock if the next highest
1384 * task is known to be lower-priority than our current task. 1384 * task is known to be lower-priority than our current task.
1385 * This may look racy, but if this value is about to go 1385 * This may look racy, but if this value is about to go
1386 * logically higher, the src_rq will push this task away. 1386 * logically higher, the src_rq will push this task away.
1387 * And if it's going logically lower, we do not care 1387 * And if it's going logically lower, we do not care
1388 */ 1388 */
1389 if (src_rq->rt.highest_prio.next >= 1389 if (src_rq->rt.highest_prio.next >=
1390 this_rq->rt.highest_prio.curr) 1390 this_rq->rt.highest_prio.curr)
1391 continue; 1391 continue;
1392 1392
1393 /* 1393 /*
1394 * We can potentially drop this_rq's lock in 1394 * We can potentially drop this_rq's lock in
1395 * double_lock_balance, and another CPU could 1395 * double_lock_balance, and another CPU could
1396 * alter this_rq 1396 * alter this_rq
1397 */ 1397 */
1398 double_lock_balance(this_rq, src_rq); 1398 double_lock_balance(this_rq, src_rq);
1399 1399
1400 /* 1400 /*
1401 * Are there still pullable RT tasks? 1401 * Are there still pullable RT tasks?
1402 */ 1402 */
1403 if (src_rq->rt.rt_nr_running <= 1) 1403 if (src_rq->rt.rt_nr_running <= 1)
1404 goto skip; 1404 goto skip;
1405 1405
1406 p = pick_next_highest_task_rt(src_rq, this_cpu); 1406 p = pick_next_highest_task_rt(src_rq, this_cpu);
1407 1407
1408 /* 1408 /*
1409 * Do we have an RT task that preempts 1409 * Do we have an RT task that preempts
1410 * the to-be-scheduled task? 1410 * the to-be-scheduled task?
1411 */ 1411 */
1412 if (p && (p->prio < this_rq->rt.highest_prio.curr)) { 1412 if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
1413 WARN_ON(p == src_rq->curr); 1413 WARN_ON(p == src_rq->curr);
1414 WARN_ON(!p->se.on_rq); 1414 WARN_ON(!p->se.on_rq);
1415 1415
1416 /* 1416 /*
1417 * There's a chance that p is higher in priority 1417 * There's a chance that p is higher in priority
1418 * than what's currently running on its cpu. 1418 * than what's currently running on its cpu.
1419 * This is just that p is waking up and hasn't 1419 * This is just that p is waking up and hasn't
1420 * had a chance to schedule. We only pull 1420 * had a chance to schedule. We only pull
1421 * p if it is lower in priority than the 1421 * p if it is lower in priority than the
1422 * current task on the run queue 1422 * current task on the run queue
1423 */ 1423 */
1424 if (p->prio < src_rq->curr->prio) 1424 if (p->prio < src_rq->curr->prio)
1425 goto skip; 1425 goto skip;
1426 1426
1427 ret = 1; 1427 ret = 1;
1428 1428
1429 deactivate_task(src_rq, p, 0); 1429 deactivate_task(src_rq, p, 0);
1430 set_task_cpu(p, this_cpu); 1430 set_task_cpu(p, this_cpu);
1431 activate_task(this_rq, p, 0); 1431 activate_task(this_rq, p, 0);
1432 /* 1432 /*
1433 * We continue with the search, just in 1433 * We continue with the search, just in
1434 * case there's an even higher prio task 1434 * case there's an even higher prio task
1435 * in another runqueue. (low likelihood 1435 * in another runqueue. (low likelihood
1436 * but possible) 1436 * but possible)
1437 */ 1437 */
1438 } 1438 }
1439 skip: 1439 skip:
1440 double_unlock_balance(this_rq, src_rq); 1440 double_unlock_balance(this_rq, src_rq);
1441 } 1441 }
1442 1442
1443 return ret; 1443 return ret;
1444 } 1444 }
1445 1445
1446 static void pre_schedule_rt(struct rq *rq, struct task_struct *prev) 1446 static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
1447 { 1447 {
1448 /* Try to pull RT tasks here if we lower this rq's prio */ 1448 /* Try to pull RT tasks here if we lower this rq's prio */
1449 if (unlikely(rt_task(prev)) && rq->rt.highest_prio.curr > prev->prio) 1449 if (unlikely(rt_task(prev)) && rq->rt.highest_prio.curr > prev->prio)
1450 pull_rt_task(rq); 1450 pull_rt_task(rq);
1451 } 1451 }
1452 1452
1453 /* 1453 /*
1454 * assumes rq->lock is held 1454 * assumes rq->lock is held
1455 */ 1455 */
1456 static int needs_post_schedule_rt(struct rq *rq) 1456 static int needs_post_schedule_rt(struct rq *rq)
1457 { 1457 {
1458 return has_pushable_tasks(rq); 1458 return has_pushable_tasks(rq);
1459 } 1459 }
1460 1460
1461 static void post_schedule_rt(struct rq *rq) 1461 static void post_schedule_rt(struct rq *rq)
1462 { 1462 {
1463 /* 1463 /*
1464 * This is only called if needs_post_schedule_rt() indicates that 1464 * This is only called if needs_post_schedule_rt() indicates that
1465 * we need to push tasks away 1465 * we need to push tasks away
1466 */ 1466 */
1467 spin_lock_irq(&rq->lock); 1467 spin_lock_irq(&rq->lock);
1468 push_rt_tasks(rq); 1468 push_rt_tasks(rq);
1469 spin_unlock_irq(&rq->lock); 1469 spin_unlock_irq(&rq->lock);
1470 } 1470 }
1471 1471
1472 /* 1472 /*
1473 * If we are not running and we are not going to reschedule soon, we should 1473 * If we are not running and we are not going to reschedule soon, we should
1474 * try to push tasks away now 1474 * try to push tasks away now
1475 */ 1475 */
1476 static void task_wake_up_rt(struct rq *rq, struct task_struct *p) 1476 static void task_wake_up_rt(struct rq *rq, struct task_struct *p)
1477 { 1477 {
1478 if (!task_running(rq, p) && 1478 if (!task_running(rq, p) &&
1479 !test_tsk_need_resched(rq->curr) && 1479 !test_tsk_need_resched(rq->curr) &&
1480 has_pushable_tasks(rq) && 1480 has_pushable_tasks(rq) &&
1481 p->rt.nr_cpus_allowed > 1) 1481 p->rt.nr_cpus_allowed > 1)
1482 push_rt_tasks(rq); 1482 push_rt_tasks(rq);
1483 } 1483 }
1484 1484
1485 static unsigned long 1485 static unsigned long
1486 load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, 1486 load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
1487 unsigned long max_load_move, 1487 unsigned long max_load_move,
1488 struct sched_domain *sd, enum cpu_idle_type idle, 1488 struct sched_domain *sd, enum cpu_idle_type idle,
1489 int *all_pinned, int *this_best_prio) 1489 int *all_pinned, int *this_best_prio)
1490 { 1490 {
1491 /* don't touch RT tasks */ 1491 /* don't touch RT tasks */
1492 return 0; 1492 return 0;
1493 } 1493 }
1494 1494
1495 static int 1495 static int
1496 move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, 1496 move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
1497 struct sched_domain *sd, enum cpu_idle_type idle) 1497 struct sched_domain *sd, enum cpu_idle_type idle)
1498 { 1498 {
1499 /* don't touch RT tasks */ 1499 /* don't touch RT tasks */
1500 return 0; 1500 return 0;
1501 } 1501 }
1502 1502
1503 static void set_cpus_allowed_rt(struct task_struct *p, 1503 static void set_cpus_allowed_rt(struct task_struct *p,
1504 const struct cpumask *new_mask) 1504 const struct cpumask *new_mask)
1505 { 1505 {
1506 int weight = cpumask_weight(new_mask); 1506 int weight = cpumask_weight(new_mask);
1507 1507
1508 BUG_ON(!rt_task(p)); 1508 BUG_ON(!rt_task(p));
1509 1509
1510 /* 1510 /*
1511 * Update the migration status of the RQ if we have an RT task 1511 * Update the migration status of the RQ if we have an RT task
1512 * which is running AND changing its weight value. 1512 * which is running AND changing its weight value.
1513 */ 1513 */
1514 if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) { 1514 if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) {
1515 struct rq *rq = task_rq(p); 1515 struct rq *rq = task_rq(p);
1516 1516
1517 if (!task_current(rq, p)) { 1517 if (!task_current(rq, p)) {
1518 /* 1518 /*
1519 * Make sure we dequeue this task from the pushable list 1519 * Make sure we dequeue this task from the pushable list
1520 * before going further. It will either remain off of 1520 * before going further. It will either remain off of
1521 * the list because we are no longer pushable, or it 1521 * the list because we are no longer pushable, or it
1522 * will be requeued. 1522 * will be requeued.
1523 */ 1523 */
1524 if (p->rt.nr_cpus_allowed > 1) 1524 if (p->rt.nr_cpus_allowed > 1)
1525 dequeue_pushable_task(rq, p); 1525 dequeue_pushable_task(rq, p);
1526 1526
1527 /* 1527 /*
1528 * Requeue if our weight is changing and still > 1 1528 * Requeue if our weight is changing and still > 1
1529 */ 1529 */
1530 if (weight > 1) 1530 if (weight > 1)
1531 enqueue_pushable_task(rq, p); 1531 enqueue_pushable_task(rq, p);
1532 1532
1533 } 1533 }
1534 1534
1535 if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) { 1535 if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) {
1536 rq->rt.rt_nr_migratory++; 1536 rq->rt.rt_nr_migratory++;
1537 } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) { 1537 } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) {
1538 BUG_ON(!rq->rt.rt_nr_migratory); 1538 BUG_ON(!rq->rt.rt_nr_migratory);
1539 rq->rt.rt_nr_migratory--; 1539 rq->rt.rt_nr_migratory--;
1540 } 1540 }
1541 1541
1542 update_rt_migration(&rq->rt); 1542 update_rt_migration(&rq->rt);
1543 } 1543 }
1544 1544
1545 cpumask_copy(&p->cpus_allowed, new_mask); 1545 cpumask_copy(&p->cpus_allowed, new_mask);
1546 p->rt.nr_cpus_allowed = weight; 1546 p->rt.nr_cpus_allowed = weight;
1547 } 1547 }
1548 1548
1549 /* Assumes rq->lock is held */ 1549 /* Assumes rq->lock is held */
1550 static void rq_online_rt(struct rq *rq) 1550 static void rq_online_rt(struct rq *rq)
1551 { 1551 {
1552 if (rq->rt.overloaded) 1552 if (rq->rt.overloaded)
1553 rt_set_overload(rq); 1553 rt_set_overload(rq);
1554 1554
1555 __enable_runtime(rq); 1555 __enable_runtime(rq);
1556 1556
1557 cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr); 1557 cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
1558 } 1558 }
1559 1559
1560 /* Assumes rq->lock is held */ 1560 /* Assumes rq->lock is held */
1561 static void rq_offline_rt(struct rq *rq) 1561 static void rq_offline_rt(struct rq *rq)
1562 { 1562 {
1563 if (rq->rt.overloaded) 1563 if (rq->rt.overloaded)
1564 rt_clear_overload(rq); 1564 rt_clear_overload(rq);
1565 1565
1566 __disable_runtime(rq); 1566 __disable_runtime(rq);
1567 1567
1568 cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID); 1568 cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
1569 } 1569 }
1570 1570
1571 /* 1571 /*
1572 * When switching from the rt queue, we bring ourselves to a position 1572 * When switching from the rt queue, we bring ourselves to a position
1573 * where we might want to pull RT tasks from other runqueues. 1573 * where we might want to pull RT tasks from other runqueues.
1574 */ 1574 */
1575 static void switched_from_rt(struct rq *rq, struct task_struct *p, 1575 static void switched_from_rt(struct rq *rq, struct task_struct *p,
1576 int running) 1576 int running)
1577 { 1577 {
1578 /* 1578 /*
1579 * If there are other RT tasks then we will reschedule 1579 * If there are other RT tasks then we will reschedule
1580 * and the scheduling of the other RT tasks will handle 1580 * and the scheduling of the other RT tasks will handle
1581 * the balancing. But if we are the last RT task 1581 * the balancing. But if we are the last RT task
1582 * we may need to handle the pulling of RT tasks 1582 * we may need to handle the pulling of RT tasks
1583 * now. 1583 * now.
1584 */ 1584 */
1585 if (!rq->rt.rt_nr_running) 1585 if (!rq->rt.rt_nr_running)
1586 pull_rt_task(rq); 1586 pull_rt_task(rq);
1587 } 1587 }
1588 1588
1589 static inline void init_sched_rt_class(void) 1589 static inline void init_sched_rt_class(void)
1590 { 1590 {
1591 unsigned int i; 1591 unsigned int i;
1592 1592
1593 for_each_possible_cpu(i) 1593 for_each_possible_cpu(i)
1594 alloc_cpumask_var_node(&per_cpu(local_cpu_mask, i), 1594 zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
1595 GFP_KERNEL, cpu_to_node(i)); 1595 GFP_KERNEL, cpu_to_node(i));
1596 } 1596 }
1597 #endif /* CONFIG_SMP */ 1597 #endif /* CONFIG_SMP */
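
This is the hunk the commit touches in the RT scheduling class: local_cpu_mask is a static per-cpu cpumask_var_t, so it is already zeroed when MAXSMP is off, but with CONFIG_CPUMASK_OFFSTACK (MAXSMP) alloc_cpumask_var_node() hands back uninitialized memory. Switching to zalloc_cpumask_var_node() makes both configurations start from an empty mask. A minimal sketch of the same idea outside the scheduler, with the mask and setup function invented for illustration:

#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/gfp.h>

/* hypothetical example, mirrors the per-cpu local_cpu_mask above */
static cpumask_var_t example_mask;

static int example_setup(void)
{
	/*
	 * With plain alloc_cpumask_var() an explicit cpumask_clear() would be
	 * needed here when CONFIG_CPUMASK_OFFSTACK (MAXSMP) is enabled; the
	 * zalloc variant returns the mask already empty in both configurations.
	 */
	if (!zalloc_cpumask_var(&example_mask, GFP_KERNEL))
		return -ENOMEM;

	return 0;
}
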
1598 1598
1599 /* 1599 /*
1600 * When switching a task to RT, we may overload the runqueue 1600 * When switching a task to RT, we may overload the runqueue
1601 * with RT tasks. In this case we try to push them off to 1601 * with RT tasks. In this case we try to push them off to
1602 * other runqueues. 1602 * other runqueues.
1603 */ 1603 */
1604 static void switched_to_rt(struct rq *rq, struct task_struct *p, 1604 static void switched_to_rt(struct rq *rq, struct task_struct *p,
1605 int running) 1605 int running)
1606 { 1606 {
1607 int check_resched = 1; 1607 int check_resched = 1;
1608 1608
1609 /* 1609 /*
1610 * If we are already running, then there's nothing 1610 * If we are already running, then there's nothing
1611 * that needs to be done. But if we are not running 1611 * that needs to be done. But if we are not running
1612 * we may need to preempt the current running task. 1612 * we may need to preempt the current running task.
1613 * If that current running task is also an RT task 1613 * If that current running task is also an RT task
1614 * then see if we can move to another run queue. 1614 * then see if we can move to another run queue.
1615 */ 1615 */
1616 if (!running) { 1616 if (!running) {
1617 #ifdef CONFIG_SMP 1617 #ifdef CONFIG_SMP
1618 if (rq->rt.overloaded && push_rt_task(rq) && 1618 if (rq->rt.overloaded && push_rt_task(rq) &&
1619 /* Don't resched if we changed runqueues */ 1619 /* Don't resched if we changed runqueues */
1620 rq != task_rq(p)) 1620 rq != task_rq(p))
1621 check_resched = 0; 1621 check_resched = 0;
1622 #endif /* CONFIG_SMP */ 1622 #endif /* CONFIG_SMP */
1623 if (check_resched && p->prio < rq->curr->prio) 1623 if (check_resched && p->prio < rq->curr->prio)
1624 resched_task(rq->curr); 1624 resched_task(rq->curr);
1625 } 1625 }
1626 } 1626 }
1627 1627
1628 /* 1628 /*
1629 * Priority of the task has changed. This may cause 1629 * Priority of the task has changed. This may cause
1630 * us to initiate a push or pull. 1630 * us to initiate a push or pull.
1631 */ 1631 */
1632 static void prio_changed_rt(struct rq *rq, struct task_struct *p, 1632 static void prio_changed_rt(struct rq *rq, struct task_struct *p,
1633 int oldprio, int running) 1633 int oldprio, int running)
1634 { 1634 {
1635 if (running) { 1635 if (running) {
1636 #ifdef CONFIG_SMP 1636 #ifdef CONFIG_SMP
1637 /* 1637 /*
1638 * If our priority decreases while running, we 1638 * If our priority decreases while running, we
1639 * may need to pull tasks to this runqueue. 1639 * may need to pull tasks to this runqueue.
1640 */ 1640 */
1641 if (oldprio < p->prio) 1641 if (oldprio < p->prio)
1642 pull_rt_task(rq); 1642 pull_rt_task(rq);
1643 /* 1643 /*
1644 * If there's a higher priority task waiting to run 1644 * If there's a higher priority task waiting to run
1645 * then reschedule. Note, the above pull_rt_task 1645 * then reschedule. Note, the above pull_rt_task
1646 * can release the rq lock and p could migrate. 1646 * can release the rq lock and p could migrate.
1647 * Only reschedule if p is still on the same runqueue. 1647 * Only reschedule if p is still on the same runqueue.
1648 */ 1648 */
1649 if (p->prio > rq->rt.highest_prio.curr && rq->curr == p) 1649 if (p->prio > rq->rt.highest_prio.curr && rq->curr == p)
1650 resched_task(p); 1650 resched_task(p);
1651 #else 1651 #else
1652 /* For UP simply resched on drop of prio */ 1652 /* For UP simply resched on drop of prio */
1653 if (oldprio < p->prio) 1653 if (oldprio < p->prio)
1654 resched_task(p); 1654 resched_task(p);
1655 #endif /* CONFIG_SMP */ 1655 #endif /* CONFIG_SMP */
1656 } else { 1656 } else {
1657 /* 1657 /*
1658 * This task is not running, but if it is 1658 * This task is not running, but if it is
1659 * greater than the current running task 1659 * greater than the current running task
1660 * then reschedule. 1660 * then reschedule.
1661 */ 1661 */
1662 if (p->prio < rq->curr->prio) 1662 if (p->prio < rq->curr->prio)
1663 resched_task(rq->curr); 1663 resched_task(rq->curr);
1664 } 1664 }
1665 } 1665 }
1666 1666
1667 static void watchdog(struct rq *rq, struct task_struct *p) 1667 static void watchdog(struct rq *rq, struct task_struct *p)
1668 { 1668 {
1669 unsigned long soft, hard; 1669 unsigned long soft, hard;
1670 1670
1671 if (!p->signal) 1671 if (!p->signal)
1672 return; 1672 return;
1673 1673
1674 soft = p->signal->rlim[RLIMIT_RTTIME].rlim_cur; 1674 soft = p->signal->rlim[RLIMIT_RTTIME].rlim_cur;
1675 hard = p->signal->rlim[RLIMIT_RTTIME].rlim_max; 1675 hard = p->signal->rlim[RLIMIT_RTTIME].rlim_max;
1676 1676
1677 if (soft != RLIM_INFINITY) { 1677 if (soft != RLIM_INFINITY) {
1678 unsigned long next; 1678 unsigned long next;
1679 1679
1680 p->rt.timeout++; 1680 p->rt.timeout++;
1681 next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ); 1681 next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
1682 if (p->rt.timeout > next) 1682 if (p->rt.timeout > next)
1683 p->cputime_expires.sched_exp = p->se.sum_exec_runtime; 1683 p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
1684 } 1684 }
1685 } 1685 }
1686 1686
1687 static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) 1687 static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
1688 { 1688 {
1689 update_curr_rt(rq); 1689 update_curr_rt(rq);
1690 1690
1691 watchdog(rq, p); 1691 watchdog(rq, p);
1692 1692
1693 /* 1693 /*
1694 * RR tasks need a special form of timeslice management. 1694 * RR tasks need a special form of timeslice management.
1695 * FIFO tasks have no timeslices. 1695 * FIFO tasks have no timeslices.
1696 */ 1696 */
1697 if (p->policy != SCHED_RR) 1697 if (p->policy != SCHED_RR)
1698 return; 1698 return;
1699 1699
1700 if (--p->rt.time_slice) 1700 if (--p->rt.time_slice)
1701 return; 1701 return;
1702 1702
1703 p->rt.time_slice = DEF_TIMESLICE; 1703 p->rt.time_slice = DEF_TIMESLICE;
1704 1704
1705 /* 1705 /*
1706 * Requeue to the end of queue if we are not the only element 1706 * Requeue to the end of queue if we are not the only element
1707 * on the queue: 1707 * on the queue:
1708 */ 1708 */
1709 if (p->rt.run_list.prev != p->rt.run_list.next) { 1709 if (p->rt.run_list.prev != p->rt.run_list.next) {
1710 requeue_task_rt(rq, p, 0); 1710 requeue_task_rt(rq, p, 0);
1711 set_tsk_need_resched(p); 1711 set_tsk_need_resched(p);
1712 } 1712 }
1713 } 1713 }
1714 1714
1715 static void set_curr_task_rt(struct rq *rq) 1715 static void set_curr_task_rt(struct rq *rq)
1716 { 1716 {
1717 struct task_struct *p = rq->curr; 1717 struct task_struct *p = rq->curr;
1718 1718
1719 p->se.exec_start = rq->clock; 1719 p->se.exec_start = rq->clock;
1720 1720
1721 /* The running task is never eligible for pushing */ 1721 /* The running task is never eligible for pushing */
1722 dequeue_pushable_task(rq, p); 1722 dequeue_pushable_task(rq, p);
1723 } 1723 }
1724 1724
1725 static const struct sched_class rt_sched_class = { 1725 static const struct sched_class rt_sched_class = {
1726 .next = &fair_sched_class, 1726 .next = &fair_sched_class,
1727 .enqueue_task = enqueue_task_rt, 1727 .enqueue_task = enqueue_task_rt,
1728 .dequeue_task = dequeue_task_rt, 1728 .dequeue_task = dequeue_task_rt,
1729 .yield_task = yield_task_rt, 1729 .yield_task = yield_task_rt,
1730 1730
1731 .check_preempt_curr = check_preempt_curr_rt, 1731 .check_preempt_curr = check_preempt_curr_rt,
1732 1732
1733 .pick_next_task = pick_next_task_rt, 1733 .pick_next_task = pick_next_task_rt,
1734 .put_prev_task = put_prev_task_rt, 1734 .put_prev_task = put_prev_task_rt,
1735 1735
1736 #ifdef CONFIG_SMP 1736 #ifdef CONFIG_SMP
1737 .select_task_rq = select_task_rq_rt, 1737 .select_task_rq = select_task_rq_rt,
1738 1738
1739 .load_balance = load_balance_rt, 1739 .load_balance = load_balance_rt,
1740 .move_one_task = move_one_task_rt, 1740 .move_one_task = move_one_task_rt,
1741 .set_cpus_allowed = set_cpus_allowed_rt, 1741 .set_cpus_allowed = set_cpus_allowed_rt,
1742 .rq_online = rq_online_rt, 1742 .rq_online = rq_online_rt,
1743 .rq_offline = rq_offline_rt, 1743 .rq_offline = rq_offline_rt,
1744 .pre_schedule = pre_schedule_rt, 1744 .pre_schedule = pre_schedule_rt,
1745 .needs_post_schedule = needs_post_schedule_rt, 1745 .needs_post_schedule = needs_post_schedule_rt,
1746 .post_schedule = post_schedule_rt, 1746 .post_schedule = post_schedule_rt,
1747 .task_wake_up = task_wake_up_rt, 1747 .task_wake_up = task_wake_up_rt,
1748 .switched_from = switched_from_rt, 1748 .switched_from = switched_from_rt,
1749 #endif 1749 #endif
1750 1750
1751 .set_curr_task = set_curr_task_rt, 1751 .set_curr_task = set_curr_task_rt,
1752 .task_tick = task_tick_rt, 1752 .task_tick = task_tick_rt,
1753 1753
1754 .prio_changed = prio_changed_rt, 1754 .prio_changed = prio_changed_rt,
1755 .switched_to = switched_to_rt, 1755 .switched_to = switched_to_rt,
1756 }; 1756 };
1757 1757
1758 #ifdef CONFIG_SCHED_DEBUG 1758 #ifdef CONFIG_SCHED_DEBUG
1759 extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); 1759 extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
1760 1760
1761 static void print_rt_stats(struct seq_file *m, int cpu) 1761 static void print_rt_stats(struct seq_file *m, int cpu)
1762 { 1762 {
1763 struct rt_rq *rt_rq; 1763 struct rt_rq *rt_rq;
1764 1764
1765 rcu_read_lock(); 1765 rcu_read_lock();
1766 for_each_leaf_rt_rq(rt_rq, cpu_rq(cpu)) 1766 for_each_leaf_rt_rq(rt_rq, cpu_rq(cpu))
1767 print_rt_rq(m, cpu, rt_rq); 1767 print_rt_rq(m, cpu, rt_rq);
1768 rcu_read_unlock(); 1768 rcu_read_unlock();
1769 } 1769 }
1770 #endif /* CONFIG_SCHED_DEBUG */ 1770 #endif /* CONFIG_SCHED_DEBUG */
1771 1771
1772 1772
kernel/smp.c
1 /* 1 /*
2 * Generic helpers for smp ipi calls 2 * Generic helpers for smp ipi calls
3 * 3 *
4 * (C) Jens Axboe <jens.axboe@oracle.com> 2008 4 * (C) Jens Axboe <jens.axboe@oracle.com> 2008
5 */ 5 */
6 #include <linux/rcupdate.h> 6 #include <linux/rcupdate.h>
7 #include <linux/rculist.h> 7 #include <linux/rculist.h>
8 #include <linux/kernel.h> 8 #include <linux/kernel.h>
9 #include <linux/module.h> 9 #include <linux/module.h>
10 #include <linux/percpu.h> 10 #include <linux/percpu.h>
11 #include <linux/init.h> 11 #include <linux/init.h>
12 #include <linux/smp.h> 12 #include <linux/smp.h>
13 #include <linux/cpu.h> 13 #include <linux/cpu.h>
14 14
15 static DEFINE_PER_CPU(struct call_single_queue, call_single_queue); 15 static DEFINE_PER_CPU(struct call_single_queue, call_single_queue);
16 16
17 static struct { 17 static struct {
18 struct list_head queue; 18 struct list_head queue;
19 spinlock_t lock; 19 spinlock_t lock;
20 } call_function __cacheline_aligned_in_smp = 20 } call_function __cacheline_aligned_in_smp =
21 { 21 {
22 .queue = LIST_HEAD_INIT(call_function.queue), 22 .queue = LIST_HEAD_INIT(call_function.queue),
23 .lock = __SPIN_LOCK_UNLOCKED(call_function.lock), 23 .lock = __SPIN_LOCK_UNLOCKED(call_function.lock),
24 }; 24 };
25 25
26 enum { 26 enum {
27 CSD_FLAG_LOCK = 0x01, 27 CSD_FLAG_LOCK = 0x01,
28 }; 28 };
29 29
30 struct call_function_data { 30 struct call_function_data {
31 struct call_single_data csd; 31 struct call_single_data csd;
32 spinlock_t lock; 32 spinlock_t lock;
33 unsigned int refs; 33 unsigned int refs;
34 cpumask_var_t cpumask; 34 cpumask_var_t cpumask;
35 }; 35 };
36 36
37 struct call_single_queue { 37 struct call_single_queue {
38 struct list_head list; 38 struct list_head list;
39 spinlock_t lock; 39 spinlock_t lock;
40 }; 40 };
41 41
42 static DEFINE_PER_CPU(struct call_function_data, cfd_data) = { 42 static DEFINE_PER_CPU(struct call_function_data, cfd_data) = {
43 .lock = __SPIN_LOCK_UNLOCKED(cfd_data.lock), 43 .lock = __SPIN_LOCK_UNLOCKED(cfd_data.lock),
44 }; 44 };
45 45
46 static int 46 static int
47 hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) 47 hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
48 { 48 {
49 long cpu = (long)hcpu; 49 long cpu = (long)hcpu;
50 struct call_function_data *cfd = &per_cpu(cfd_data, cpu); 50 struct call_function_data *cfd = &per_cpu(cfd_data, cpu);
51 51
52 switch (action) { 52 switch (action) {
53 case CPU_UP_PREPARE: 53 case CPU_UP_PREPARE:
54 case CPU_UP_PREPARE_FROZEN: 54 case CPU_UP_PREPARE_FROZEN:
55 if (!alloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, 55 if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
56 cpu_to_node(cpu))) 56 cpu_to_node(cpu)))
57 return NOTIFY_BAD; 57 return NOTIFY_BAD;
58 break; 58 break;
59 59
60 #ifdef CONFIG_CPU_HOTPLUG 60 #ifdef CONFIG_CPU_HOTPLUG
61 case CPU_UP_CANCELED: 61 case CPU_UP_CANCELED:
62 case CPU_UP_CANCELED_FROZEN: 62 case CPU_UP_CANCELED_FROZEN:
63 63
64 case CPU_DEAD: 64 case CPU_DEAD:
65 case CPU_DEAD_FROZEN: 65 case CPU_DEAD_FROZEN:
66 free_cpumask_var(cfd->cpumask); 66 free_cpumask_var(cfd->cpumask);
67 break; 67 break;
68 #endif 68 #endif
69 }; 69 };
70 70
71 return NOTIFY_OK; 71 return NOTIFY_OK;
72 } 72 }
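
Same pattern in the generic SMP code: cfd_data is a static per-cpu structure, so its cpumask starts out zeroed unless MAXSMP turns cpumask_var_t into a real allocation; using the zalloc variant on the CPU_UP_PREPARE path keeps generic_smp_call_function_interrupt() from ever seeing stale bits before the first caller fills the mask. As a rough, functionally equivalent sketch (illustration only, not the in-tree helper), zalloc_cpumask_var_node() behaves like the plain allocation followed by a clear:

#include <linux/cpumask.h>
#include <linux/types.h>

/* illustration of equivalent behaviour, not the real kernel implementation */
static bool example_zalloc_cpumask_node(cpumask_var_t *mask, gfp_t flags, int node)
{
	if (!alloc_cpumask_var_node(mask, flags, node))
		return false;

	cpumask_clear(*mask);	/* start with no CPUs set, like a static mask */
	return true;
}
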
73 73
74 static struct notifier_block __cpuinitdata hotplug_cfd_notifier = { 74 static struct notifier_block __cpuinitdata hotplug_cfd_notifier = {
75 .notifier_call = hotplug_cfd, 75 .notifier_call = hotplug_cfd,
76 }; 76 };
77 77
78 static int __cpuinit init_call_single_data(void) 78 static int __cpuinit init_call_single_data(void)
79 { 79 {
80 void *cpu = (void *)(long)smp_processor_id(); 80 void *cpu = (void *)(long)smp_processor_id();
81 int i; 81 int i;
82 82
83 for_each_possible_cpu(i) { 83 for_each_possible_cpu(i) {
84 struct call_single_queue *q = &per_cpu(call_single_queue, i); 84 struct call_single_queue *q = &per_cpu(call_single_queue, i);
85 85
86 spin_lock_init(&q->lock); 86 spin_lock_init(&q->lock);
87 INIT_LIST_HEAD(&q->list); 87 INIT_LIST_HEAD(&q->list);
88 } 88 }
89 89
90 hotplug_cfd(&hotplug_cfd_notifier, CPU_UP_PREPARE, cpu); 90 hotplug_cfd(&hotplug_cfd_notifier, CPU_UP_PREPARE, cpu);
91 register_cpu_notifier(&hotplug_cfd_notifier); 91 register_cpu_notifier(&hotplug_cfd_notifier);
92 92
93 return 0; 93 return 0;
94 } 94 }
95 early_initcall(init_call_single_data); 95 early_initcall(init_call_single_data);
96 96
97 /* 97 /*
98 * csd_lock/csd_unlock used to serialize access to per-cpu csd resources 98 * csd_lock/csd_unlock used to serialize access to per-cpu csd resources
99 * 99 *
100 * For non-synchronous ipi calls the csd can still be in use by the 100 * For non-synchronous ipi calls the csd can still be in use by the
101 * previous function call. For multi-cpu calls it's even more interesting 101 * previous function call. For multi-cpu calls it's even more interesting
102 * as we'll have to ensure no other cpu is observing our csd. 102 * as we'll have to ensure no other cpu is observing our csd.
103 */ 103 */
104 static void csd_lock_wait(struct call_single_data *data) 104 static void csd_lock_wait(struct call_single_data *data)
105 { 105 {
106 while (data->flags & CSD_FLAG_LOCK) 106 while (data->flags & CSD_FLAG_LOCK)
107 cpu_relax(); 107 cpu_relax();
108 } 108 }
109 109
110 static void csd_lock(struct call_single_data *data) 110 static void csd_lock(struct call_single_data *data)
111 { 111 {
112 csd_lock_wait(data); 112 csd_lock_wait(data);
113 data->flags = CSD_FLAG_LOCK; 113 data->flags = CSD_FLAG_LOCK;
114 114
115 /* 115 /*
116 * prevent CPU from reordering the above assignment 116 * prevent CPU from reordering the above assignment
117 * to ->flags with any subsequent assignments to other 117 * to ->flags with any subsequent assignments to other
118 * fields of the specified call_single_data structure: 118 * fields of the specified call_single_data structure:
119 */ 119 */
120 smp_mb(); 120 smp_mb();
121 } 121 }
122 122
123 static void csd_unlock(struct call_single_data *data) 123 static void csd_unlock(struct call_single_data *data)
124 { 124 {
125 WARN_ON(!(data->flags & CSD_FLAG_LOCK)); 125 WARN_ON(!(data->flags & CSD_FLAG_LOCK));
126 126
127 /* 127 /*
128 * ensure we're all done before releasing data: 128 * ensure we're all done before releasing data:
129 */ 129 */
130 smp_mb(); 130 smp_mb();
131 131
132 data->flags &= ~CSD_FLAG_LOCK; 132 data->flags &= ~CSD_FLAG_LOCK;
133 } 133 }
134 134
135 /* 135 /*
136 * Insert a previously allocated call_single_data element 136 * Insert a previously allocated call_single_data element
137 * for execution on the given CPU. data must already have 137 * for execution on the given CPU. data must already have
138 * ->func, ->info, and ->flags set. 138 * ->func, ->info, and ->flags set.
139 */ 139 */
140 static 140 static
141 void generic_exec_single(int cpu, struct call_single_data *data, int wait) 141 void generic_exec_single(int cpu, struct call_single_data *data, int wait)
142 { 142 {
143 struct call_single_queue *dst = &per_cpu(call_single_queue, cpu); 143 struct call_single_queue *dst = &per_cpu(call_single_queue, cpu);
144 unsigned long flags; 144 unsigned long flags;
145 int ipi; 145 int ipi;
146 146
147 spin_lock_irqsave(&dst->lock, flags); 147 spin_lock_irqsave(&dst->lock, flags);
148 ipi = list_empty(&dst->list); 148 ipi = list_empty(&dst->list);
149 list_add_tail(&data->list, &dst->list); 149 list_add_tail(&data->list, &dst->list);
150 spin_unlock_irqrestore(&dst->lock, flags); 150 spin_unlock_irqrestore(&dst->lock, flags);
151 151
152 /* 152 /*
153 * The list addition should be visible before sending the IPI 153 * The list addition should be visible before sending the IPI
154 * handler locks the list to pull the entry off it because of 154 * handler locks the list to pull the entry off it because of
155 * normal cache coherency rules implied by spinlocks. 155 * normal cache coherency rules implied by spinlocks.
156 * 156 *
157 * If IPIs can go out of order to the cache coherency protocol 157 * If IPIs can go out of order to the cache coherency protocol
158 * in an architecture, sufficient synchronisation should be added 158 * in an architecture, sufficient synchronisation should be added
159 * to arch code to make it appear to obey cache coherency WRT 159 * to arch code to make it appear to obey cache coherency WRT
160 * locking and barrier primitives. Generic code isn't really 160 * locking and barrier primitives. Generic code isn't really
161 * equipped to do the right thing... 161 * equipped to do the right thing...
162 */ 162 */
163 if (ipi) 163 if (ipi)
164 arch_send_call_function_single_ipi(cpu); 164 arch_send_call_function_single_ipi(cpu);
165 165
166 if (wait) 166 if (wait)
167 csd_lock_wait(data); 167 csd_lock_wait(data);
168 } 168 }
169 169
170 /* 170 /*
171 * Invoked by arch to handle an IPI for call function. Must be called with 171 * Invoked by arch to handle an IPI for call function. Must be called with
172 * interrupts disabled. 172 * interrupts disabled.
173 */ 173 */
174 void generic_smp_call_function_interrupt(void) 174 void generic_smp_call_function_interrupt(void)
175 { 175 {
176 struct call_function_data *data; 176 struct call_function_data *data;
177 int cpu = get_cpu(); 177 int cpu = get_cpu();
178 178
179 /* 179 /*
180 * Ensure entry is visible on call_function_queue after we have 180 * Ensure entry is visible on call_function_queue after we have
181 * entered the IPI. See comment in smp_call_function_many. 181 * entered the IPI. See comment in smp_call_function_many.
182 * If we don't have this, then we may miss an entry on the list 182 * If we don't have this, then we may miss an entry on the list
183 * and never get another IPI to process it. 183 * and never get another IPI to process it.
184 */ 184 */
185 smp_mb(); 185 smp_mb();
186 186
187 /* 187 /*
188 * It's ok to use list_for_each_rcu() here even though we may 188 * It's ok to use list_for_each_rcu() here even though we may
189 * delete 'pos', since list_del_rcu() doesn't clear ->next 189 * delete 'pos', since list_del_rcu() doesn't clear ->next
190 */ 190 */
191 list_for_each_entry_rcu(data, &call_function.queue, csd.list) { 191 list_for_each_entry_rcu(data, &call_function.queue, csd.list) {
192 int refs; 192 int refs;
193 193
194 spin_lock(&data->lock); 194 spin_lock(&data->lock);
195 if (!cpumask_test_cpu(cpu, data->cpumask)) { 195 if (!cpumask_test_cpu(cpu, data->cpumask)) {
196 spin_unlock(&data->lock); 196 spin_unlock(&data->lock);
197 continue; 197 continue;
198 } 198 }
199 cpumask_clear_cpu(cpu, data->cpumask); 199 cpumask_clear_cpu(cpu, data->cpumask);
200 spin_unlock(&data->lock); 200 spin_unlock(&data->lock);
201 201
202 data->csd.func(data->csd.info); 202 data->csd.func(data->csd.info);
203 203
204 spin_lock(&data->lock); 204 spin_lock(&data->lock);
205 WARN_ON(data->refs == 0); 205 WARN_ON(data->refs == 0);
206 refs = --data->refs; 206 refs = --data->refs;
207 if (!refs) { 207 if (!refs) {
208 spin_lock(&call_function.lock); 208 spin_lock(&call_function.lock);
209 list_del_rcu(&data->csd.list); 209 list_del_rcu(&data->csd.list);
210 spin_unlock(&call_function.lock); 210 spin_unlock(&call_function.lock);
211 } 211 }
212 spin_unlock(&data->lock); 212 spin_unlock(&data->lock);
213 213
214 if (refs) 214 if (refs)
215 continue; 215 continue;
216 216
217 csd_unlock(&data->csd); 217 csd_unlock(&data->csd);
218 } 218 }
219 219
220 put_cpu(); 220 put_cpu();
221 } 221 }
222 222
223 /* 223 /*
224 * Invoked by arch to handle an IPI for call function single. Must be 224 * Invoked by arch to handle an IPI for call function single. Must be
225 * called from the arch with interrupts disabled. 225 * called from the arch with interrupts disabled.
226 */ 226 */
227 void generic_smp_call_function_single_interrupt(void) 227 void generic_smp_call_function_single_interrupt(void)
228 { 228 {
229 struct call_single_queue *q = &__get_cpu_var(call_single_queue); 229 struct call_single_queue *q = &__get_cpu_var(call_single_queue);
230 unsigned int data_flags; 230 unsigned int data_flags;
231 LIST_HEAD(list); 231 LIST_HEAD(list);
232 232
233 spin_lock(&q->lock); 233 spin_lock(&q->lock);
234 list_replace_init(&q->list, &list); 234 list_replace_init(&q->list, &list);
235 spin_unlock(&q->lock); 235 spin_unlock(&q->lock);
236 236
237 while (!list_empty(&list)) { 237 while (!list_empty(&list)) {
238 struct call_single_data *data; 238 struct call_single_data *data;
239 239
240 data = list_entry(list.next, struct call_single_data, list); 240 data = list_entry(list.next, struct call_single_data, list);
241 list_del(&data->list); 241 list_del(&data->list);
242 242
243 /* 243 /*
244 * 'data' can be invalid after this call if flags == 0 244 * 'data' can be invalid after this call if flags == 0
245 * (when called through generic_exec_single()), 245 * (when called through generic_exec_single()),
246 * so save them away before making the call: 246 * so save them away before making the call:
247 */ 247 */
248 data_flags = data->flags; 248 data_flags = data->flags;
249 249
250 data->func(data->info); 250 data->func(data->info);
251 251
252 /* 252 /*
253 * Unlocked CSDs are valid through generic_exec_single(): 253 * Unlocked CSDs are valid through generic_exec_single():
254 */ 254 */
255 if (data_flags & CSD_FLAG_LOCK) 255 if (data_flags & CSD_FLAG_LOCK)
256 csd_unlock(data); 256 csd_unlock(data);
257 } 257 }
258 } 258 }
259 259
260 static DEFINE_PER_CPU(struct call_single_data, csd_data); 260 static DEFINE_PER_CPU(struct call_single_data, csd_data);
261 261
262 /* 262 /*
263 * smp_call_function_single - Run a function on a specific CPU 263 * smp_call_function_single - Run a function on a specific CPU
264 * @func: The function to run. This must be fast and non-blocking. 264 * @func: The function to run. This must be fast and non-blocking.
265 * @info: An arbitrary pointer to pass to the function. 265 * @info: An arbitrary pointer to pass to the function.
266 * @wait: If true, wait until function has completed on other CPUs. 266 * @wait: If true, wait until function has completed on other CPUs.
267 * 267 *
268 * Returns 0 on success, else a negative status code. Note that @wait 268 * Returns 0 on success, else a negative status code. Note that @wait
269 * will be implicitly turned on in case of allocation failures, since 269 * will be implicitly turned on in case of allocation failures, since
270 * we fall back to on-stack allocation. 270 * we fall back to on-stack allocation.
271 */ 271 */
272 int smp_call_function_single(int cpu, void (*func) (void *info), void *info, 272 int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
273 int wait) 273 int wait)
274 { 274 {
275 struct call_single_data d = { 275 struct call_single_data d = {
276 .flags = 0, 276 .flags = 0,
277 }; 277 };
278 unsigned long flags; 278 unsigned long flags;
279 int this_cpu; 279 int this_cpu;
280 int err = 0; 280 int err = 0;
281 281
282 /* 282 /*
283 * prevent preemption and reschedule on another processor, 283 * prevent preemption and reschedule on another processor,
284 * as well as CPU removal 284 * as well as CPU removal
285 */ 285 */
286 this_cpu = get_cpu(); 286 this_cpu = get_cpu();
287 287
288 /* Can deadlock when called with interrupts disabled */ 288 /* Can deadlock when called with interrupts disabled */
289 WARN_ON_ONCE(irqs_disabled() && !oops_in_progress); 289 WARN_ON_ONCE(irqs_disabled() && !oops_in_progress);
290 290
291 if (cpu == this_cpu) { 291 if (cpu == this_cpu) {
292 local_irq_save(flags); 292 local_irq_save(flags);
293 func(info); 293 func(info);
294 local_irq_restore(flags); 294 local_irq_restore(flags);
295 } else { 295 } else {
296 if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) { 296 if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) {
297 struct call_single_data *data = &d; 297 struct call_single_data *data = &d;
298 298
299 if (!wait) 299 if (!wait)
300 data = &__get_cpu_var(csd_data); 300 data = &__get_cpu_var(csd_data);
301 301
302 csd_lock(data); 302 csd_lock(data);
303 303
304 data->func = func; 304 data->func = func;
305 data->info = info; 305 data->info = info;
306 generic_exec_single(cpu, data, wait); 306 generic_exec_single(cpu, data, wait);
307 } else { 307 } else {
308 err = -ENXIO; /* CPU not online */ 308 err = -ENXIO; /* CPU not online */
309 } 309 }
310 } 310 }
311 311
312 put_cpu(); 312 put_cpu();
313 313
314 return err; 314 return err;
315 } 315 }
316 EXPORT_SYMBOL(smp_call_function_single); 316 EXPORT_SYMBOL(smp_call_function_single);
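
A brief usage sketch of the API documented above; the helper names and the per-CPU work are hypothetical, and wait=1 simply means the call returns only once the function has run on the target CPU:

#include <linux/smp.h>

/* runs on @cpu in IPI context: must be fast and non-blocking */
static void example_read_cpu_id(void *info)
{
	int *out = info;

	*out = smp_processor_id();	/* stand-in for real per-CPU work */
}

static int example_query_cpu(int cpu)
{
	int value = -1;
	int err;

	/* wait=1: only return after example_read_cpu_id() has run on @cpu */
	err = smp_call_function_single(cpu, example_read_cpu_id, &value, 1);
	return err ? err : value;
}
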
317 317
318 /** 318 /**
319 * __smp_call_function_single(): Run a function on another CPU 319 * __smp_call_function_single(): Run a function on another CPU
320 * @cpu: The CPU to run on. 320 * @cpu: The CPU to run on.
321 * @data: Pre-allocated and setup data structure 321 * @data: Pre-allocated and setup data structure
322 * 322 *
323 * Like smp_call_function_single(), but allow caller to pass in a 323 * Like smp_call_function_single(), but allow caller to pass in a
324 * pre-allocated data structure. Useful for embedding @data inside 324 * pre-allocated data structure. Useful for embedding @data inside
325 * other structures, for instance. 325 * other structures, for instance.
326 */ 326 */
327 void __smp_call_function_single(int cpu, struct call_single_data *data, 327 void __smp_call_function_single(int cpu, struct call_single_data *data,
328 int wait) 328 int wait)
329 { 329 {
330 csd_lock(data); 330 csd_lock(data);
331 331
332 /* Can deadlock when called with interrupts disabled */ 332 /* Can deadlock when called with interrupts disabled */
333 WARN_ON_ONCE(wait && irqs_disabled() && !oops_in_progress); 333 WARN_ON_ONCE(wait && irqs_disabled() && !oops_in_progress);
334 334
335 generic_exec_single(cpu, data, wait); 335 generic_exec_single(cpu, data, wait);
336 } 336 }
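
The comment above suggests embedding the pre-allocated call_single_data in a larger object; a hedged sketch of that pattern, with struct example_dev and its fields invented for illustration:

#include <linux/smp.h>

struct example_dev {
	struct call_single_data csd;	/* must stay alive until the call completes */
	int pending;
};

/* executed on the target CPU in IPI context */
static void example_dev_ipi(void *info)
{
	struct example_dev *dev = info;

	dev->pending = 0;		/* remote-CPU work goes here */
}

static void example_dev_kick(struct example_dev *dev, int cpu)
{
	/* dev->csd is assumed zero-initialized (static or kzalloc'd);
	 * csd_lock() inside __smp_call_function_single() serializes reuse. */
	dev->csd.func = example_dev_ipi;
	dev->csd.info = dev;
	__smp_call_function_single(cpu, &dev->csd, 0);
}
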
337 337
338 /* Deprecated: shim for archs using old arch_send_call_function_ipi API. */ 338 /* Deprecated: shim for archs using old arch_send_call_function_ipi API. */
339 339
340 #ifndef arch_send_call_function_ipi_mask 340 #ifndef arch_send_call_function_ipi_mask
341 # define arch_send_call_function_ipi_mask(maskp) \ 341 # define arch_send_call_function_ipi_mask(maskp) \
342 arch_send_call_function_ipi(*(maskp)) 342 arch_send_call_function_ipi(*(maskp))
343 #endif 343 #endif
344 344
345 /** 345 /**
346 * smp_call_function_many(): Run a function on a set of other CPUs. 346 * smp_call_function_many(): Run a function on a set of other CPUs.
347 * @mask: The set of cpus to run on (only runs on online subset). 347 * @mask: The set of cpus to run on (only runs on online subset).
348 * @func: The function to run. This must be fast and non-blocking. 348 * @func: The function to run. This must be fast and non-blocking.
349 * @info: An arbitrary pointer to pass to the function. 349 * @info: An arbitrary pointer to pass to the function.
350 * @wait: If true, wait (atomically) until function has completed 350 * @wait: If true, wait (atomically) until function has completed
351 * on other CPUs. 351 * on other CPUs.
352 * 352 *
353 * If @wait is true, then returns once @func has returned. Note that @wait 353 * If @wait is true, then returns once @func has returned. Note that @wait
354 * will be implicitly turned on in case of allocation failures, since 354 * will be implicitly turned on in case of allocation failures, since
355 * we fall back to on-stack allocation. 355 * we fall back to on-stack allocation.
356 * 356 *
357 * You must not call this function with disabled interrupts or from a 357 * You must not call this function with disabled interrupts or from a
358 * hardware interrupt handler or from a bottom half handler. Preemption 358 * hardware interrupt handler or from a bottom half handler. Preemption
359 * must be disabled when calling this function. 359 * must be disabled when calling this function.
360 */ 360 */
361 void smp_call_function_many(const struct cpumask *mask, 361 void smp_call_function_many(const struct cpumask *mask,
362 void (*func)(void *), void *info, bool wait) 362 void (*func)(void *), void *info, bool wait)
363 { 363 {
364 struct call_function_data *data; 364 struct call_function_data *data;
365 unsigned long flags; 365 unsigned long flags;
366 int cpu, next_cpu, this_cpu = smp_processor_id(); 366 int cpu, next_cpu, this_cpu = smp_processor_id();
367 367
368 /* Can deadlock when called with interrupts disabled */ 368 /* Can deadlock when called with interrupts disabled */
369 WARN_ON_ONCE(irqs_disabled() && !oops_in_progress); 369 WARN_ON_ONCE(irqs_disabled() && !oops_in_progress);
370 370
371 /* So, what's a CPU they want? Ignoring this one. */ 371 /* So, what's a CPU they want? Ignoring this one. */
372 cpu = cpumask_first_and(mask, cpu_online_mask); 372 cpu = cpumask_first_and(mask, cpu_online_mask);
373 if (cpu == this_cpu) 373 if (cpu == this_cpu)
374 cpu = cpumask_next_and(cpu, mask, cpu_online_mask); 374 cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
375 375
376 /* No online cpus? We're done. */ 376 /* No online cpus? We're done. */
377 if (cpu >= nr_cpu_ids) 377 if (cpu >= nr_cpu_ids)
378 return; 378 return;
379 379
380 /* Do we have another CPU which isn't us? */ 380 /* Do we have another CPU which isn't us? */
381 next_cpu = cpumask_next_and(cpu, mask, cpu_online_mask); 381 next_cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
382 if (next_cpu == this_cpu) 382 if (next_cpu == this_cpu)
383 next_cpu = cpumask_next_and(next_cpu, mask, cpu_online_mask); 383 next_cpu = cpumask_next_and(next_cpu, mask, cpu_online_mask);
384 384
385 /* Fastpath: do that cpu by itself. */ 385 /* Fastpath: do that cpu by itself. */
386 if (next_cpu >= nr_cpu_ids) { 386 if (next_cpu >= nr_cpu_ids) {
387 smp_call_function_single(cpu, func, info, wait); 387 smp_call_function_single(cpu, func, info, wait);
388 return; 388 return;
389 } 389 }
390 390
391 data = &__get_cpu_var(cfd_data); 391 data = &__get_cpu_var(cfd_data);
392 csd_lock(&data->csd); 392 csd_lock(&data->csd);
393 393
394 spin_lock_irqsave(&data->lock, flags); 394 spin_lock_irqsave(&data->lock, flags);
395 data->csd.func = func; 395 data->csd.func = func;
396 data->csd.info = info; 396 data->csd.info = info;
397 cpumask_and(data->cpumask, mask, cpu_online_mask); 397 cpumask_and(data->cpumask, mask, cpu_online_mask);
398 cpumask_clear_cpu(this_cpu, data->cpumask); 398 cpumask_clear_cpu(this_cpu, data->cpumask);
399 data->refs = cpumask_weight(data->cpumask); 399 data->refs = cpumask_weight(data->cpumask);
400 400
401 spin_lock(&call_function.lock); 401 spin_lock(&call_function.lock);
402 /* 402 /*
403 * Place entry at the _HEAD_ of the list, so that any cpu still 403 * Place entry at the _HEAD_ of the list, so that any cpu still
404 * observing the entry in generic_smp_call_function_interrupt() 404 * observing the entry in generic_smp_call_function_interrupt()
405 * will not miss any other list entries: 405 * will not miss any other list entries:
406 */ 406 */
407 list_add_rcu(&data->csd.list, &call_function.queue); 407 list_add_rcu(&data->csd.list, &call_function.queue);
408 spin_unlock(&call_function.lock); 408 spin_unlock(&call_function.lock);
409 409
410 spin_unlock_irqrestore(&data->lock, flags); 410 spin_unlock_irqrestore(&data->lock, flags);
411 411
412 /* 412 /*
413 * Make the list addition visible before sending the ipi. 413 * Make the list addition visible before sending the ipi.
414 * (IPIs must obey or appear to obey normal Linux cache 414 * (IPIs must obey or appear to obey normal Linux cache
415 * coherency rules -- see comment in generic_exec_single). 415 * coherency rules -- see comment in generic_exec_single).
416 */ 416 */
417 smp_mb(); 417 smp_mb();
418 418
419 /* Send a message to all CPUs in the map */ 419 /* Send a message to all CPUs in the map */
420 arch_send_call_function_ipi_mask(data->cpumask); 420 arch_send_call_function_ipi_mask(data->cpumask);
421 421
422 /* Optionally wait for the CPUs to complete */ 422 /* Optionally wait for the CPUs to complete */
423 if (wait) 423 if (wait)
424 csd_lock_wait(&data->csd); 424 csd_lock_wait(&data->csd);
425 } 425 }
426 EXPORT_SYMBOL(smp_call_function_many); 426 EXPORT_SYMBOL(smp_call_function_many);
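
And a corresponding sketch for the many-CPU variant documented above, again with hypothetical names; note the documented requirements that preemption is disabled around the call and that the callback must not block:

#include <linux/preempt.h>
#include <linux/smp.h>

/* executed on every online CPU in the mask except the calling one */
static void example_poke(void *unused)
{
	/* fast, non-blocking per-CPU work goes here */
}

static void example_poke_mask(const struct cpumask *example_mask)
{
	preempt_disable();		/* required by smp_call_function_many() */
	smp_call_function_many(example_mask, example_poke, NULL, true);
	preempt_enable();
}
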
427 427
428 /** 428 /**
429 * smp_call_function(): Run a function on all other CPUs. 429 * smp_call_function(): Run a function on all other CPUs.
430 * @func: The function to run. This must be fast and non-blocking. 430 * @func: The function to run. This must be fast and non-blocking.
431 * @info: An arbitrary pointer to pass to the function. 431 * @info: An arbitrary pointer to pass to the function.
432 * @wait: If true, wait (atomically) until function has completed 432 * @wait: If true, wait (atomically) until function has completed
433 * on other CPUs. 433 * on other CPUs.
434 * 434 *
435 * Returns 0. 435 * Returns 0.
436 * 436 *
437 * If @wait is true, then returns once @func has returned; otherwise 437 * If @wait is true, then returns once @func has returned; otherwise
438 * it returns just before the target cpu calls @func. In case of allocation 438 * it returns just before the target cpu calls @func. In case of allocation
439 * failure, @wait will be implicitly turned on. 439 * failure, @wait will be implicitly turned on.
440 * 440 *
441 * You must not call this function with disabled interrupts or from a 441 * You must not call this function with disabled interrupts or from a
442 * hardware interrupt handler or from a bottom half handler. 442 * hardware interrupt handler or from a bottom half handler.
443 */ 443 */
444 int smp_call_function(void (*func)(void *), void *info, int wait) 444 int smp_call_function(void (*func)(void *), void *info, int wait)
445 { 445 {
446 preempt_disable(); 446 preempt_disable();
447 smp_call_function_many(cpu_online_mask, func, info, wait); 447 smp_call_function_many(cpu_online_mask, func, info, wait);
448 preempt_enable(); 448 preempt_enable();
449 449
450 return 0; 450 return 0;
451 } 451 }
452 EXPORT_SYMBOL(smp_call_function); 452 EXPORT_SYMBOL(smp_call_function);
453 453
454 void ipi_call_lock(void) 454 void ipi_call_lock(void)
455 { 455 {
456 spin_lock(&call_function.lock); 456 spin_lock(&call_function.lock);
457 } 457 }
458 458
459 void ipi_call_unlock(void) 459 void ipi_call_unlock(void)
460 { 460 {
461 spin_unlock(&call_function.lock); 461 spin_unlock(&call_function.lock);
462 } 462 }
463 463
464 void ipi_call_lock_irq(void) 464 void ipi_call_lock_irq(void)
465 { 465 {
466 spin_lock_irq(&call_function.lock); 466 spin_lock_irq(&call_function.lock);
467 } 467 }
468 468
469 void ipi_call_unlock_irq(void) 469 void ipi_call_unlock_irq(void)
470 { 470 {
471 spin_unlock_irq(&call_function.lock); 471 spin_unlock_irq(&call_function.lock);
472 } 472 }
473 473