m68k: merge and clean up delay.h files

The real difference between the mmu and non-mmu variants of the delay.h files has nothing to do with having an mmu or not. It is processor family differences that mean slightly different code. Merge the delay_mm.h and delay_no.h files back into a single file. The primary difference we need to deal with is whether the processor supports a 32bit * 32bit -> 64bit multiply. Without it we need to do some shift scaling as well as use a 32bit * 32bit -> 32bit multiply. If building for a multi-CPU type kernel then we must use the simpler mult/shift scaling. This version of delay code allows the CPU32 family to use a 64bit mul, since it supports this instruction; the old code did not. The changes use macros where appropriate to try and optimize constant sized udelay times. And it removes the use of a fixed lib function for the non-mmu case. Code size on typical kernel configurations is similar, or only larger by a few tens of bytes. Also removed the unused muldiv() code from delay_mm.h. Build and run tested on ColdFire and ARAnyM. Build tested only on 68328 and 68360 (CPU32). Signed-off-by: Greg Ungerer <gerg@uclinux.org>

m68k: merge and clean up delay.h files
The real difference between the mmu and non-mmu variants of the delay.h files has nothing to do with having an mmu or not. It is processor family differences that mean slightly different code. Merge the delay_mm.h and delay_no.h files back into a single file. The primary difference we need to deal with is whether the processor supports a 32bit * 32bit -> 64bit multiply. Without it we need to do some shift scaling as well as use a 32bit * 32bit -> 32bit multiply. If building for a multi-CPU type kernel then we must use the simpler mult/shift scaling. This version of delay code allows the CPU32 family to use a 64bit mul, since it supports this instruction; the old code did not. The changes use macros where appropriate to try and optimize constant sized udelay times. And it removes the use of a fixed lib function for the non-mmu case. Code size on typical kernel configurations is similar, or only larger by a few tens of bytes. Also removed the unused muldiv() code from delay_mm.h. Build and run tested on ColdFire and ARAnyM. Build tested only on 68328 and 68360 (CPU32). Signed-off-by: Greg Ungerer <gerg@uclinux.org>
Greg Ungerer
1 parent 622e9472dd
Showing 5 changed files with 95 additions and 158 deletions Side-by-side Diff
arch/m68k/include/asm/delay.h
arch/m68k/include/asm/delay_mm.h
arch/m68k/include/asm/delay_no.h
arch/m68k/lib/Makefile
arch/m68k/lib/delay.c
-#ifdef __uClinux__
-#include "delay_no.h"
+#ifndef _M68K_DELAY_H
+#define _M68K_DELAY_H
+
+#include <asm/param.h>
+
+/*
+ * Copyright (C) 1994 Hamish Macdonald
+ * Copyright (C) 2004 Greg Ungerer <gerg@uclinux.com>
+ *
+ * Delay routines, using a pre-computed "loops_per_jiffy" value.
+ */
+
+#if defined(CONFIG_COLDFIRE)
+/*
+ * The ColdFire runs the delay loop at significantly different speeds
+ * depending upon long word alignment or not.  We'll pad it to
+ * long word alignment which is the faster version.
+ * The 0x4a8e is of course a 'tstl %fp' instruction.  This is better
+ * than using a NOP (0x4e71) instruction because it executes in one
+ * cycle not three and doesn't allow for an arbitrary delay waiting
+ * for bus cycles to finish.  Also fp/a6 isn't likely to cause a
+ * stall waiting for the register to become valid if such is added
+ * to the coldfire at some stage.
+ */
+#define	DELAY_ALIGN	".balignw 4, 0x4a8e\n\t"
 #else
-#include "delay_mm.h"
+/*
+ * No instruction alignment required for other m68k types.
+ */
+#define	DELAY_ALIGN
 #endif
+
+static inline void __delay(unsigned long loops)
+{
+	__asm__ __volatile__ (
+		DELAY_ALIGN
+		"1: subql #1,%0\n\t"
+		"jcc 1b"
+		: "=d" (loops)
+		: "0" (loops));
+}
+
+extern void __bad_udelay(void);
+
+
+#if defined(CONFIG_M68000) || defined(CONFIG_COLDFIRE)
+/*
+ * The simpler m68k and ColdFire processors do not have a 32*32->64
+ * multiply instruction. So we need to handle them a little differently.
+ * We use a bit of shifting and a single 32*32->32 multiply to get close.
+ * This is a macro so that the const version can factor out the first
+ * multiply and shift.
+ */
+#define	HZSCALE		(268435456 / (1000000 / HZ))
+
+#define	__const_udelay(u) \
+	__delay(((((u) * HZSCALE) >> 11) * (loops_per_jiffy >> 11)) >> 6)
+
+#else
+
+static inline void __xdelay(unsigned long xloops)
+{
+	unsigned long tmp;
+
+	__asm__ ("mulul %2,%0:%1"
+		: "=d" (xloops), "=d" (tmp)
+		: "d" (xloops), "1" (loops_per_jiffy));
+	__delay(xloops * HZ);
+}
+
+/*
+ * The definition of __const_udelay is specifically made a macro so that
+ * the const factor (4295 = 2**32 / 1000000) can be optimized out when
+ * the delay is a const.
+ */
+#define	__const_udelay(n)	(__xdelay((n) * 4295))
+
+#endif
+
+static inline void __udelay(unsigned long usecs)
+{
+	__const_udelay(usecs);
+}
+
+/*
+ * Use only for very small delays ( < 1 msec).  Should probably use a
+ * lookup table, really, as the multiplications take much too long with
+ * short delays.  This is a "reasonable" implementation, though (and the
+ * first constant multiplications gets optimized away if the delay is
+ * a constant)
+ */
+#define udelay(n) (__builtin_constant_p(n) ? \
+	((n) > 20000 ? __bad_udelay() : __const_udelay(n)) : __udelay(n))
+
+
+#endif /* defined(_M68K_DELAY_H) */
-#ifndef _M68K_DELAY_H
-#define _M68K_DELAY_H
-
-#include <asm/param.h>
-
-/*
- * Copyright (C) 1994 Hamish Macdonald
- *
- * Delay routines, using a pre-computed "loops_per_jiffy" value.
- */
-
-static inline void __delay(unsigned long loops)
-{
-	__asm__ __volatile__ ("1: subql #1,%0; jcc 1b"
-		: "=d" (loops) : "0" (loops));
-}
-
-extern void __bad_udelay(void);
-
-/*
- * Use only for very small delays ( < 1 msec).  Should probably use a
- * lookup table, really, as the multiplications take much too long with
- * short delays.  This is a "reasonable" implementation, though (and the
- * first constant multiplications gets optimized away if the delay is
- * a constant)
- */
-static inline void __const_udelay(unsigned long xloops)
-{
-	unsigned long tmp;
-
-	__asm__ ("mulul %2,%0:%1"
-		: "=d" (xloops), "=d" (tmp)
-		: "d" (xloops), "1" (loops_per_jiffy));
-	__delay(xloops * HZ);
-}
-
-static inline void __udelay(unsigned long usecs)
-{
-	__const_udelay(usecs * 4295);	/* 2**32 / 1000000 */
-}
-
-#define udelay(n) (__builtin_constant_p(n) ? \
-	((n) > 20000 ? __bad_udelay() : __const_udelay((n) * 4295)) : \
-	__udelay(n))
-
-static inline unsigned long muldiv(unsigned long a, unsigned long b,
-				   unsigned long c)
-{
-	unsigned long tmp;
-
-	__asm__ ("mulul %2,%0:%1; divul %3,%0:%1"
-		: "=d" (tmp), "=d" (a)
-		: "d" (b), "d" (c), "1" (a));
-	return a;
-}
-
-#endif /* defined(_M68K_DELAY_H) */
-#ifndef _M68KNOMMU_DELAY_H
-#define _M68KNOMMU_DELAY_H
-
-/*
- * Copyright (C) 1994 Hamish Macdonald
- * Copyright (C) 2004 Greg Ungerer <gerg@snapgear.com>
- */
-
-#include <asm/param.h>
-
-static inline void __delay(unsigned long loops)
-{
-#if defined(CONFIG_COLDFIRE)
-	/* The coldfire runs this loop at significantly different speeds
-	 * depending upon long word alignment or not.  We'll pad it to
-	 * long word alignment which is the faster version.
-	 * The 0x4a8e is of course a 'tstl %fp' instruction.  This is better
-	 * than using a NOP (0x4e71) instruction because it executes in one
-	 * cycle not three and doesn't allow for an arbitrary delay waiting
-	 * for bus cycles to finish.  Also fp/a6 isn't likely to cause a
-	 * stall waiting for the register to become valid if such is added
-	 * to the coldfire at some stage.
-	 */
-	__asm__ __volatile__ (	".balignw 4, 0x4a8e\n\t"
-				"1: subql #1, %0\n\t"
-				"jcc 1b"
-		: "=d" (loops) : "0" (loops));
-#else
-	__asm__ __volatile__ (	"1: subql #1, %0\n\t"
-				"jcc 1b"
-		: "=d" (loops) : "0" (loops));
-#endif
-}
-
-/*
- *	Ideally we use a 32*32->64 multiply to calculate the number of
- *	loop iterations, but the older standard 68k and ColdFire do not
- *	have this instruction. So for them we have a clsoe approximation
- *	loop using 32*32->32 multiplies only. This calculation based on
- *	the ARM version of delay.
- *
- *	We want to implement:
- *
- *	loops = (usecs * 0x10c6 * HZ * loops_per_jiffy) / 2^32
- */
-
-#define	HZSCALE		(268435456 / (1000000/HZ))
-
-extern unsigned long loops_per_jiffy;
-
-static inline void _udelay(unsigned long usecs)
-{
-#if defined(CONFIG_M68328) || defined(CONFIG_M68EZ328) || \
-    defined(CONFIG_M68VZ328) || defined(CONFIG_M68360) || \
-    defined(CONFIG_COLDFIRE)
-	__delay((((usecs * HZSCALE) >> 11) * (loops_per_jiffy >> 11)) >> 6);
-#else
-	unsigned long tmp;
-
-	usecs *= 4295;		/* 2**32 / 1000000 */
-	__asm__ ("mulul %2,%0:%1"
-		: "=d" (usecs), "=d" (tmp)
-		: "d" (usecs), "1" (loops_per_jiffy*HZ));
-	__delay(usecs);
-#endif
-}
-
-/*
- *	Moved the udelay() function into library code, no longer inlined.
- *	I had to change the algorithm because we are overflowing now on
- *	the faster ColdFire parts. The code is a little bigger, so it makes
- *	sense to library it.
- */
-extern void udelay(unsigned long usecs);
-
-#endif /* defined(_M68KNOMMU_DELAY_H) */
@@ -9,6 +9,6 @@
 ifdef CONFIG_MMU
 lib-y	+= string.o uaccess.o checksum_mm.o
 else
-lib-y	+= mulsi3.o divsi3.o udivsi3.o modsi3.o umodsi3.o delay.o checksum_no.o
+lib-y	+= mulsi3.o divsi3.o udivsi3.o modsi3.o umodsi3.o checksum_no.o
 endif
-/*
- *	arch/m68knommu/lib/delay.c
- *
- *	(C) Copyright 2004, Greg Ungerer <gerg@snapgear.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/module.h>
-#include <asm/param.h>
-#include <asm/delay.h>
-
-EXPORT_SYMBOL(udelay);
-
-void udelay(unsigned long usecs)
-{
-	_udelay(usecs);
-}
1		-#ifdef __uClinux__
2		-#include "delay_no.h"
	1	+#ifndef _M68K_DELAY_H
	2	+#define _M68K_DELAY_H
	3	+
	4	+#include <asm/param.h>
	5	+
	6	+/*
	7	+ * Copyright (C) 1994 Hamish Macdonald
	8	+ * Copyright (C) 2004 Greg Ungerer <gerg@uclinux.com>
	9	+ *
	10	+ * Delay routines, using a pre-computed "loops_per_jiffy" value.
	11	+ */
	12	+
	13	+#if defined(CONFIG_COLDFIRE)
	14	+/*
	15	+ * The ColdFire runs the delay loop at significantly different speeds
	16	+ * depending upon long word alignment or not. We'll pad it to
	17	+ * long word alignment which is the faster version.
	18	+ * The 0x4a8e is of course a 'tstl %fp' instruction. This is better
	19	+ * than using a NOP (0x4e71) instruction because it executes in one
	20	+ * cycle not three and doesn't allow for an arbitrary delay waiting
	21	+ * for bus cycles to finish. Also fp/a6 isn't likely to cause a
	22	+ * stall waiting for the register to become valid if such is added
	23	+ * to the coldfire at some stage.
	24	+ */
	25	+#define DELAY_ALIGN ".balignw 4, 0x4a8e\n\t"
3	26	#else
4		-#include "delay_mm.h"
	27	+/*
	28	+ * No instruction alignment required for other m68k types.
	29	+ */
	30	+#define DELAY_ALIGN
5	31	#endif
	32	+
	33	+static inline void __delay(unsigned long loops)
	34	+{
	35	+ __asm__ __volatile__ (
	36	+ DELAY_ALIGN
	37	+ "1: subql #1,%0\n\t"
	38	+ "jcc 1b"
	39	+ : "=d" (loops)
	40	+ : "0" (loops));
	41	+}
	42	+
	43	+extern void __bad_udelay(void);
	44	+
	45	+
	46	+#if defined(CONFIG_M68000) || defined(CONFIG_COLDFIRE)
	47	+/*
	48	+ * The simpler m68k and ColdFire processors do not have a 32*32->64
	49	+ * multiply instruction. So we need to handle them a little differently.
	50	+ * We use a bit of shifting and a single 32*32->32 multiply to get close.
	51	+ * This is a macro so that the const version can factor out the first
	52	+ * multiply and shift.
	53	+ */
	54	+#define HZSCALE (268435456 / (1000000 / HZ))
	55	+
	56	+#define __const_udelay(u) \
	57	+ __delay(((((u) * HZSCALE) >> 11) * (loops_per_jiffy >> 11)) >> 6)
	58	+
	59	+#else
	60	+
	61	+static inline void __xdelay(unsigned long xloops)
	62	+{
	63	+ unsigned long tmp;
	64	+
	65	+ __asm__ ("mulul %2,%0:%1"
	66	+ : "=d" (xloops), "=d" (tmp)
	67	+ : "d" (xloops), "1" (loops_per_jiffy));
	68	+ __delay(xloops * HZ);
	69	+}
	70	+
	71	+/*
	72	+ * The definition of __const_udelay is specifically made a macro so that
	73	+ * the const factor (4295 = 2**32 / 1000000) can be optimized out when
	74	+ * the delay is a const.
	75	+ */
	76	+#define __const_udelay(n) (__xdelay((n) * 4295))
	77	+
	78	+#endif
	79	+
	80	+static inline void __udelay(unsigned long usecs)
	81	+{
	82	+ __const_udelay(usecs);
	83	+}
	84	+
	85	+/*
	86	+ * Use only for very small delays ( < 1 msec). Should probably use a
	87	+ * lookup table, really, as the multiplications take much too long with
	88	+ * short delays. This is a "reasonable" implementation, though (and the
	89	+ * first constant multiplications gets optimized away if the delay is
	90	+ * a constant)
	91	+ */
	92	+#define udelay(n) (__builtin_constant_p(n) ? \
	93	+ ((n) > 20000 ? __bad_udelay() : __const_udelay(n)) : __udelay(n))
	94	+
	95	+
	96	+#endif /* defined(_M68K_DELAY_H) */
1		-#ifndef _M68K_DELAY_H
2		-#define _M68K_DELAY_H
3		-
4		-#include <asm/param.h>
5		-
6		-/*
7		- * Copyright (C) 1994 Hamish Macdonald
8		- *
9		- * Delay routines, using a pre-computed "loops_per_jiffy" value.
10		- */
11		-
12		-static inline void __delay(unsigned long loops)
13		-{
14		- __asm__ __volatile__ ("1: subql #1,%0; jcc 1b"
15		- : "=d" (loops) : "0" (loops));
16		-}
17		-
18		-extern void __bad_udelay(void);
19		-
20		-/*
21		- * Use only for very small delays ( < 1 msec). Should probably use a
22		- * lookup table, really, as the multiplications take much too long with
23		- * short delays. This is a "reasonable" implementation, though (and the
24		- * first constant multiplications gets optimized away if the delay is
25		- * a constant)
26		- */
27		-static inline void __const_udelay(unsigned long xloops)
28		-{
29		- unsigned long tmp;
30		-
31		- __asm__ ("mulul %2,%0:%1"
32		- : "=d" (xloops), "=d" (tmp)
33		- : "d" (xloops), "1" (loops_per_jiffy));
34		- __delay(xloops * HZ);
35		-}
36		-
37		-static inline void __udelay(unsigned long usecs)
38		-{
39		- __const_udelay(usecs * 4295); /* 2**32 / 1000000 */
40		-}
41		-
42		-#define udelay(n) (__builtin_constant_p(n) ? \
43		- ((n) > 20000 ? __bad_udelay() : __const_udelay((n) * 4295)) : \
44		- __udelay(n))
45		-
46		-static inline unsigned long muldiv(unsigned long a, unsigned long b,
47		- unsigned long c)
48		-{
49		- unsigned long tmp;
50		-
51		- __asm__ ("mulul %2,%0:%1; divul %3,%0:%1"
52		- : "=d" (tmp), "=d" (a)
53		- : "d" (b), "d" (c), "1" (a));
54		- return a;
55		-}
56		-
57		-#endif /* defined(_M68K_DELAY_H) */
1		-#ifndef _M68KNOMMU_DELAY_H
2		-#define _M68KNOMMU_DELAY_H
3		-
4		-/*
5		- * Copyright (C) 1994 Hamish Macdonald
6		- * Copyright (C) 2004 Greg Ungerer <gerg@snapgear.com>
7		- */
8		-
9		-#include <asm/param.h>
10		-
11		-static inline void __delay(unsigned long loops)
12		-{
13		-#if defined(CONFIG_COLDFIRE)
14		- /* The coldfire runs this loop at significantly different speeds
15		- * depending upon long word alignment or not. We'll pad it to
16		- * long word alignment which is the faster version.
17		- * The 0x4a8e is of course a 'tstl %fp' instruction. This is better
18		- * than using a NOP (0x4e71) instruction because it executes in one
19		- * cycle not three and doesn't allow for an arbitrary delay waiting
20		- * for bus cycles to finish. Also fp/a6 isn't likely to cause a
21		- * stall waiting for the register to become valid if such is added
22		- * to the coldfire at some stage.
23		- */
24		- __asm__ __volatile__ ( ".balignw 4, 0x4a8e\n\t"
25		- "1: subql #1, %0\n\t"
26		- "jcc 1b"
27		- : "=d" (loops) : "0" (loops));
28		-#else
29		- __asm__ __volatile__ ( "1: subql #1, %0\n\t"
30		- "jcc 1b"
31		- : "=d" (loops) : "0" (loops));
32		-#endif
33		-}
34		-
35		-/*
36		- * Ideally we use a 32*32->64 multiply to calculate the number of
37		- * loop iterations, but the older standard 68k and ColdFire do not
38		- * have this instruction. So for them we have a clsoe approximation
39		- * loop using 32*32->32 multiplies only. This calculation based on
40		- * the ARM version of delay.
41		- *
42		- * We want to implement:
43		- *
44		- * loops = (usecs * 0x10c6 * HZ * loops_per_jiffy) / 2^32
45		- */
46		-
47		-#define HZSCALE (268435456 / (1000000/HZ))
48		-
49		-extern unsigned long loops_per_jiffy;
50		-
51		-static inline void _udelay(unsigned long usecs)
52		-{
53		-#if defined(CONFIG_M68328) || defined(CONFIG_M68EZ328) || \
54		- defined(CONFIG_M68VZ328) || defined(CONFIG_M68360) || \
55		- defined(CONFIG_COLDFIRE)
56		- __delay((((usecs * HZSCALE) >> 11) * (loops_per_jiffy >> 11)) >> 6);
57		-#else
58		- unsigned long tmp;
59		-
60		- usecs *= 4295; /* 2**32 / 1000000 */
61		- __asm__ ("mulul %2,%0:%1"
62		- : "=d" (usecs), "=d" (tmp)
63		- : "d" (usecs), "1" (loops_per_jiffy*HZ));
64		- __delay(usecs);
65		-#endif
66		-}
67		-
68		-/*
69		- * Moved the udelay() function into library code, no longer inlined.
70		- * I had to change the algorithm because we are overflowing now on
71		- * the faster ColdFire parts. The code is a little bigger, so it makes
72		- * sense to library it.
73		- */
74		-extern void udelay(unsigned long usecs);
75		-
76		-#endif /* defined(_M68KNOMMU_DELAY_H) */
...	...	@@ -9,6 +9,6 @@
9	9	ifdef CONFIG_MMU
10	10	lib-y += string.o uaccess.o checksum_mm.o
11	11	else
12		-lib-y += mulsi3.o divsi3.o udivsi3.o modsi3.o umodsi3.o delay.o checksum_no.o
	12	+lib-y += mulsi3.o divsi3.o udivsi3.o modsi3.o umodsi3.o checksum_no.o
13	13	endif
1		-/*
2		- * arch/m68knommu/lib/delay.c
3		- *
4		- * (C) Copyright 2004, Greg Ungerer <gerg@snapgear.com>
5		- *
6		- * This program is free software; you can redistribute it and/or modify
7		- * it under the terms of the GNU General Public License version 2 as
8		- * published by the Free Software Foundation.
9		- */
10		-
11		-#include <linux/module.h>
12		-#include <asm/param.h>
13		-#include <asm/delay.h>
14		-
15		-EXPORT_SYMBOL(udelay);
16		-
17		-void udelay(unsigned long usecs)
18		-{
19		- _udelay(usecs);
20		-}