Commit d61931d89be506372d01a90d1755f6d0a9fafe2d

Authored by Borislav Petkov
Committed by H. Peter Anvin
1 parent 1527bc8b92

x86: Add optimized popcnt variants

Add support for the hardware version of the Hamming weight function,
popcnt, present in CPUs which advertise it under CPUID, Function
0x0000_0001_ECX[23]. On CPUs which don't support it, we fall back to
the default software versions in lib/hweight.c.

A synthetic benchmark comparing popcnt with __sw_hweight64 showed an
almost 3x speedup on an F10h (AMD Family 10h) machine.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
LKML-Reference: <20100318112015.GC11152@aftab>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
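
For reference, the CPUID bit named above can be checked from user space;
a minimal sketch using GCC's <cpuid.h> helper (a hypothetical test
program, not part of this patch):

    #include <stdio.h>
    #include <cpuid.h>

    int main(void)
    {
            unsigned int eax, ebx, ecx, edx;

            /* CPUID function 0x0000_0001: POPCNT is ECX bit 23 */
            if (__get_cpuid(1, &eax, &ebx, &ecx, &edx) &&
                (ecx & (1u << 23)))
                    printf("popcnt supported in hardware\n");
            else
                    printf("no popcnt, lib/hweight.c fallback applies\n");

            return 0;
    }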

Showing 8 changed files with 108 additions and 18 deletions

arch/x86/Kconfig
... ... @@ -238,6 +238,11 @@
238 238 def_bool y
239 239 depends on X86_32 && !CC_STACKPROTECTOR
240 240  
  241 +config ARCH_HWEIGHT_CFLAGS
  242 + string
  243 + default "-fcall-saved-ecx -fcall-saved-edx" if X86_32
  244 + default "-fcall-saved-rdi -fcall-saved-rsi -fcall-saved-rdx -fcall-saved-rcx -fcall-saved-r8 -fcall-saved-r9 -fcall-saved-r10 -fcall-saved-r11" if X86_64
  245 +
241 246 config KTIME_SCALAR
242 247 def_bool X86_32
243 248 source "init/Kconfig"
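
The -fcall-saved-* strings are needed because the "call __sw_hweightXX"
emitted through the alternatives below lives inside an asm() statement
that declares no clobbers: the compiler assumes every register survives
the statement, so hweight.o has to be built with the normally
call-clobbered registers (except the return register) marked
callee-saved. The net effect on the build is roughly this (a sketch of
the 32-bit case; the string is wired up in the lib/Makefile and
scripts/Makefile.lib hunks below):

    gcc $(KBUILD_CFLAGS) -fcall-saved-ecx -fcall-saved-edx \
        -c -o lib/hweight.o lib/hweight.c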
arch/x86/include/asm/alternative.h
... ... @@ -39,9 +39,6 @@
39 39 #define LOCK_PREFIX ""
40 40 #endif
41 41  
42   -/* This must be included *after* the definition of LOCK_PREFIX */
43   -#include <asm/cpufeature.h>
44   -
45 42 struct alt_instr {
46 43 u8 *instr; /* original instruction */
47 44 u8 *replacement;
... ... @@ -94,6 +91,12 @@
94 91 ".section .altinstr_replacement, \"ax\"\n" \
95 92 "663:\n\t" newinstr "\n664:\n" /* replacement */ \
96 93 ".previous"
  94 +
  95 +/*
  96 + * This must be included *after* the definition of ALTERNATIVE due to
  97 + * <asm/arch_hweight.h>
  98 + */
  99 +#include <asm/cpufeature.h>
97 100  
98 101 /*
99 102 * Alternative instructions for different CPU types or capabilities.
arch/x86/include/asm/arch_hweight.h
  1 +#ifndef _ASM_X86_HWEIGHT_H
  2 +#define _ASM_X86_HWEIGHT_H
  3 +
  4 +#ifdef CONFIG_64BIT
  5 +/* popcnt %rdi, %rax */
  6 +#define POPCNT ".byte 0xf3,0x48,0x0f,0xb8,0xc7"
  7 +#define REG_IN "D"
  8 +#define REG_OUT "a"
  9 +#else
  10 +/* popcnt %eax, %eax */
  11 +#define POPCNT ".byte 0xf3,0x0f,0xb8,0xc0"
  12 +#define REG_IN "a"
  13 +#define REG_OUT "a"
  14 +#endif
  15 +
  16 +/*
  17 + * __sw_hweightXX are called from within the alternatives below
  18 + * and callee-clobbered registers need to be taken care of. See
  19 + * ARCH_HWEIGHT_CFLAGS in <arch/x86/Kconfig> for the respective
  20 + * compiler switches.
  21 + */
  22 +static inline unsigned int __arch_hweight32(unsigned int w)
  23 +{
  24 + unsigned int res = 0;
  25 +
  26 + asm (ALTERNATIVE("call __sw_hweight32", POPCNT, X86_FEATURE_POPCNT)
  27 + : "="REG_OUT (res)
  28 + : REG_IN (w));
  29 +
  30 + return res;
  31 +}
  32 +
  33 +static inline unsigned int __arch_hweight16(unsigned int w)
  34 +{
  35 + return __arch_hweight32(w & 0xffff);
  36 +}
  37 +
  38 +static inline unsigned int __arch_hweight8(unsigned int w)
  39 +{
  40 + return __arch_hweight32(w & 0xff);
  41 +}
  42 +
  43 +static inline unsigned long __arch_hweight64(__u64 w)
  44 +{
  45 + unsigned long res = 0;
  46 +
  47 +#ifdef CONFIG_X86_32
  48 + return __arch_hweight32((u32)w) +
  49 + __arch_hweight32((u32)(w >> 32));
  50 +#else
  51 + asm (ALTERNATIVE("call __sw_hweight64", POPCNT, X86_FEATURE_POPCNT)
  52 + : "="REG_OUT (res)
  53 + : REG_IN (w));
  54 +#endif /* CONFIG_X86_32 */
  55 +
  56 + return res;
  57 +}
  58 +
  59 +#endif
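
At boot, apply_alternatives() leaves the "call __sw_hweight32/64" in
place on CPUs without X86_FEATURE_POPCNT and patches in the hand-encoded
popcnt otherwise, nop-padding where the replacement is shorter than the
5-byte call. The opcode bytes can be sanity-checked from user space; a
hypothetical test of the 32-bit encoding, assuming a popcnt-capable CPU:

    #include <assert.h>

    /* popcnt %eax, %eax -- the same bytes as the 32-bit POPCNT above */
    static unsigned int popcnt32(unsigned int w)
    {
            unsigned int res;

            asm (".byte 0xf3,0x0f,0xb8,0xc0" : "=a" (res) : "a" (w));
            return res;
    }

    int main(void)
    {
            assert(popcnt32(0x00000000) ==  0);
            assert(popcnt32(0x80000001) ==  2);
            assert(popcnt32(0xffffffff) == 32);
            return 0;
    }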
arch/x86/include/asm/bitops.h
... ... @@ -444,7 +444,9 @@
444 444  
445 445 #define ARCH_HAS_FAST_MULTIPLIER 1
446 446  
447   -#include <asm-generic/bitops/hweight.h>
  447 +#include <asm/arch_hweight.h>
  448 +
  449 +#include <asm-generic/bitops/const_hweight.h>
448 450  
449 451 #endif /* __KERNEL__ */
450 452  
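
bitops.h now picks up the x86 arch_hweight.h plus the generic
const_hweight.h added by the parent commit (1527bc8b92); the latter keeps
hweight of compile-time constants foldable. Its definitions have roughly
this shape (paraphrased for context, not part of this patch):

    #define hweight32(w) \
            (__builtin_constant_p(w) ? __const_hweight32(w) : __arch_hweight32(w))

so only non-constant arguments ever reach the alternatives-patched
__arch_hweight32().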
include/asm-generic/bitops/arch_hweight.h
... ... @@ -3,10 +3,24 @@
3 3  
4 4 #include <asm/types.h>
5 5  
6   -extern unsigned int __arch_hweight32(unsigned int w);
7   -extern unsigned int __arch_hweight16(unsigned int w);
8   -extern unsigned int __arch_hweight8(unsigned int w);
9   -extern unsigned long __arch_hweight64(__u64 w);
  6 +inline unsigned int __arch_hweight32(unsigned int w)
  7 +{
  8 + return __sw_hweight32(w);
  9 +}
10 10  
  11 +inline unsigned int __arch_hweight16(unsigned int w)
  12 +{
  13 + return __sw_hweight16(w);
  14 +}
  15 +
  16 +inline unsigned int __arch_hweight8(unsigned int w)
  17 +{
  18 + return __sw_hweight8(w);
  19 +}
  20 +
  21 +inline unsigned long __arch_hweight64(__u64 w)
  22 +{
  23 + return __sw_hweight64(w);
  24 +}
11 25 #endif /* _ASM_GENERIC_BITOPS_HWEIGHT_H_ */
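
Architectures without an optimized implementation keep working through
these wrappers: generic callers such as hweight_long() in
include/linux/bitops.h (unchanged by this patch, quoted roughly for
context) resolve to __arch_hweight*(), which here just forwards to the
renamed software routines:

    static inline unsigned long hweight_long(unsigned long w)
    {
            return sizeof(w) == 4 ? hweight32(w) : hweight64(w);
    }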
lib/Makefile
... ... @@ -39,7 +39,10 @@
39 39 lib-$(CONFIG_GENERIC_FIND_FIRST_BIT) += find_next_bit.o
40 40 lib-$(CONFIG_GENERIC_FIND_NEXT_BIT) += find_next_bit.o
41 41 obj-$(CONFIG_GENERIC_FIND_LAST_BIT) += find_last_bit.o
  42 +
  43 +CFLAGS_hweight.o = $(subst $(quote),,$(CONFIG_ARCH_HWEIGHT_CFLAGS))
42 44 obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o
  45 +
43 46 obj-$(CONFIG_LOCK_KERNEL) += kernel_lock.o
44 47 obj-$(CONFIG_BTREE) += btree.o
45 48 obj-$(CONFIG_DEBUG_PREEMPT) += smp_processor_id.o
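
CFLAGS_hweight.o uses the standard Kbuild per-object flag hook; the
$(subst) is needed because Kconfig string values carry their double
quotes with them. A sketch of the transformation for the X86_32 case:

    # from include/config/auto.conf:
    CONFIG_ARCH_HWEIGHT_CFLAGS="-fcall-saved-ecx -fcall-saved-edx"
    # what hweight.o is actually compiled with after the $(subst):
    CFLAGS_hweight.o = -fcall-saved-ecx -fcall-saved-edx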
lib/hweight.c
... ... @@ -9,7 +9,7 @@
9 9 * The Hamming Weight of a number is the total number of bits set in it.
10 10 */
11 11  
12   -unsigned int __arch_hweight32(unsigned int w)
  12 +unsigned int __sw_hweight32(unsigned int w)
13 13 {
14 14 #ifdef ARCH_HAS_FAST_MULTIPLIER
15 15 w -= (w >> 1) & 0x55555555;
... ... @@ -24,30 +24,30 @@
24 24 return (res + (res >> 16)) & 0x000000FF;
25 25 #endif
26 26 }
27   -EXPORT_SYMBOL(__arch_hweight32);
  27 +EXPORT_SYMBOL(__sw_hweight32);
28 28  
29   -unsigned int __arch_hweight16(unsigned int w)
  29 +unsigned int __sw_hweight16(unsigned int w)
30 30 {
31 31 unsigned int res = w - ((w >> 1) & 0x5555);
32 32 res = (res & 0x3333) + ((res >> 2) & 0x3333);
33 33 res = (res + (res >> 4)) & 0x0F0F;
34 34 return (res + (res >> 8)) & 0x00FF;
35 35 }
36   -EXPORT_SYMBOL(__arch_hweight16);
  36 +EXPORT_SYMBOL(__sw_hweight16);
37 37  
38   -unsigned int __arch_hweight8(unsigned int w)
  38 +unsigned int __sw_hweight8(unsigned int w)
39 39 {
40 40 unsigned int res = w - ((w >> 1) & 0x55);
41 41 res = (res & 0x33) + ((res >> 2) & 0x33);
42 42 return (res + (res >> 4)) & 0x0F;
43 43 }
44   -EXPORT_SYMBOL(__arch_hweight8);
  44 +EXPORT_SYMBOL(__sw_hweight8);
45 45  
46   -unsigned long __arch_hweight64(__u64 w)
  46 +unsigned long __sw_hweight64(__u64 w)
47 47 {
48 48 #if BITS_PER_LONG == 32
49   - return __arch_hweight32((unsigned int)(w >> 32)) +
50   - __arch_hweight32((unsigned int)w);
  49 + return __sw_hweight32((unsigned int)(w >> 32)) +
  50 + __sw_hweight32((unsigned int)w);
51 51 #elif BITS_PER_LONG == 64
52 52 #ifdef ARCH_HAS_FAST_MULTIPLIER
53 53 w -= (w >> 1) & 0x5555555555555555ul;
... ... @@ -64,5 +64,5 @@
64 64 #endif
65 65 #endif
66 66 }
67   -EXPORT_SYMBOL(__arch_hweight64);
  67 +EXPORT_SYMBOL(__sw_hweight64);
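
The software routines themselves are only renamed: still the classic
parallel bit count, halving the problem at each step. A worked pass of
__sw_hweight8 on w = 0xA5 (1010_0101, four set bits), written up as a
hypothetical user-space check:

    #include <assert.h>

    static unsigned int sw_hweight8(unsigned int w)
    {
            /* count bits per 2-bit pair: 0xA5 - 0x50 = 0x55 */
            unsigned int res = w - ((w >> 1) & 0x55);
            /* sum adjacent pairs into nibbles: 0x11 + 0x11 = 0x22 */
            res = (res & 0x33) + ((res >> 2) & 0x33);
            /* add the two nibbles: (0x22 + 0x02) & 0x0F = 4 */
            return (res + (res >> 4)) & 0x0F;
    }

    int main(void)
    {
            assert(sw_hweight8(0xA5) == 4);
            return 0;
    }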
scripts/Makefile.lib
... ... @@ -245,4 +245,8 @@
245 245 cmd_lzo = (cat $(filter-out FORCE,$^) | \
246 246 lzop -9 && $(call size_append, $(filter-out FORCE,$^))) > $@ || \
247 247 (rm -f $@ ; false)
  248 +
  249 +# misc stuff
  250 +# ---------------------------------------------------------------------------
  251 +quote:="
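
quote is just a make variable holding a single literal double-quote
character, giving makefiles a clean way to strip the quoting Kconfig
wraps around string values, as lib/Makefile does above. A hypothetical
one-liner to illustrate:

    $(subst $(quote),,"-fcall-saved-ecx")  ->  -fcall-saved-ecx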