arch/x86/lib/memcpy_64.S

/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>

#include <asm/cpufeature.h>
#include <asm/dwarf2.h>
#include <asm/alternative-asm.h>

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 * rax original destination
 */
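
/*
 * For reference: in the SysV AMD64 ABI this is the usual C prototype
 *
 *	void *memcpy(void *dest, const void *src, size_t n);
 *
 * dest arrives in %rdi, src in %rsi and n in %rdx, and the original
 * dest is returned in %rax.
 */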

/*
 * memcpy_c() - fast string ops (REP MOVSQ) based variant.
 *
 * This gets patched over the unrolled variant (below) via the
 * alternative instructions framework:
 */
	.section .altinstr_replacement, "ax", @progbits
.Lmemcpy_c:
	movq %rdi, %rax

	movl %edx, %ecx
	shrl $3, %ecx
	andl $7, %edx
	rep movsq
	movl %edx, %ecx
	rep movsb
	ret
.Lmemcpy_e:
	.previous
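
/*
 * Roughly equivalent C for the length split above (an illustrative
 * sketch with a made-up name, not part of this file): the quadword loop
 * is what REP MOVSQ does with %ecx = n >> 3, and the byte loop is what
 * the trailing REP MOVSB does with %ecx = n & 7.
 *
 *	void *memcpy_c_sketch(void *dest, const void *src, size_t n)
 *	{
 *		unsigned long long *d8 = dest;
 *		const unsigned long long *s8 = src;
 *		const unsigned char *s1;
 *		unsigned char *d1;
 *		size_t quads = n >> 3;
 *		size_t tail = n & 7;
 *
 *		while (quads--)
 *			*d8++ = *s8++;
 *		d1 = (unsigned char *)d8;
 *		s1 = (const unsigned char *)s8;
 *		while (tail--)
 *			*d1++ = *s1++;
 *		return dest;
 *	}
 */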

/*
 * memcpy_c_e() - enhanced fast string memcpy. This is faster and simpler than
 * memcpy_c. Use memcpy_c_e when possible.
 *
 * This gets patched over the unrolled variant (below) via the
 * alternative instructions framework:
 */
	.section .altinstr_replacement, "ax", @progbits
.Lmemcpy_c_e:
	movq %rdi, %rax

	movl %edx, %ecx
	rep movsb
	ret
.Lmemcpy_e_e:
	.previous
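
/*
 * Note on ERMS: CPUs that advertise the enhanced REP MOVSB/STOSB
 * feature (X86_FEATURE_ERMS, reported via CPUID) implement REP MOVSB
 * efficiently in microcode for arbitrary lengths and alignments, so a
 * single REP MOVSB with %ecx = n is enough and the quadword/byte split
 * used by memcpy_c() above becomes unnecessary.
 */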

ENTRY(__memcpy)
ENTRY(memcpy)
	CFI_STARTPROC
	movq %rdi, %rax

	/*
	 * Use a 32-bit CMP here to avoid long NOP padding.
	 */
	cmp  $0x20, %edx
	jb .Lhandle_tail
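
	/*
	 * Copies shorter than 0x20 (32) bytes skip the unrolled loops
	 * entirely: 32 bytes is exactly one iteration of the 4x8-byte
	 * block moves below, so anything smaller goes straight to the
	 * tail code at .Lhandle_tail.
	 */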

	/*
	 * Check whether a memory false dependence could occur,
	 * then jump to the corresponding copy mode.
	 */
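	/*
	 * The CMP below only looks at the low 8 bits of the source and
	 * destination addresses, which is a cheap heuristic: if the
	 * source appears to sit below the destination, later loads of a
	 * forward copy could hit, or alias in their low address bits,
	 * data that was just stored, creating false store-to-load
	 * dependences; copying backward from the tail avoids that
	 * access pattern.
	 */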
	cmp  %dil, %sil
	jl .Lcopy_backward
	subl $0x20, %edx
.Lcopy_forward_loop:
	subq $0x20,	%rdx

	/*
	 * Move in blocks of 4x8 bytes:
	 */
	movq 0*8(%rsi),	%r8
	movq 1*8(%rsi),	%r9
	movq 2*8(%rsi),	%r10
	movq 3*8(%rsi),	%r11
	leaq 4*8(%rsi),	%rsi

	movq %r8,	0*8(%rdi)
	movq %r9,	1*8(%rdi)
	movq %r10,	2*8(%rdi)
	movq %r11,	3*8(%rdi)
	leaq 4*8(%rdi),	%rdi
	jae  .Lcopy_forward_loop
	addq $0x20,	%rdx
	jmp  .Lhandle_tail
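
	/*
	 * Note on the loop structure: the SUBQ at the top is the only
	 * instruction in the loop that writes the flags; MOVQ and LEAQ
	 * leave them untouched, so the JAE at the bottom still tests
	 * whether that SUBQ borrowed, i.e. whether at least another
	 * 0x20 bytes remain after the block just copied. Using four
	 * scratch registers lets the loads and stores overlap in the
	 * pipeline.
	 */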
  
.Lcopy_backward:
	/*
	 * Calculate copy position to tail.
	 */
	addq %rdx,	%rsi
	addq %rdx,	%rdi
	subq $0x20,	%rdx
	/*
	 * At most 3 ALU operations in one cycle,
	 * so append NOPs in the same 16-byte chunk.
	 */
	.p2align 4
.Lcopy_backward_loop:
	subq $0x20,	%rdx
	movq -1*8(%rsi),	%r8
	movq -2*8(%rsi),	%r9
	movq -3*8(%rsi),	%r10
	movq -4*8(%rsi),	%r11
	leaq -4*8(%rsi),	%rsi
	movq %r8,		-1*8(%rdi)
	movq %r9,		-2*8(%rdi)
	movq %r10,		-3*8(%rdi)
	movq %r11,		-4*8(%rdi)
	leaq -4*8(%rdi),	%rdi
	jae  .Lcopy_backward_loop
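
	/*
	 * This mirrors the forward loop: the same SUBQ/JAE bookkeeping,
	 * but rsi/rdi start past the end of the buffers and walk down
	 * through negative offsets, so every load sits below all of the
	 * stores issued so far.
	 */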

	/*
	 * Calculate copy position to head.
	 */
	addq $0x20,	%rdx
	subq %rdx,	%rsi
	subq %rdx,	%rdi
.Lhandle_tail:
	cmpq $16,	%rdx
	jb   .Lless_16bytes

	/*
	 * Move data from 16 bytes to 31 bytes.
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi),	%r9
	movq -2*8(%rsi, %rdx),	%r10
	movq -1*8(%rsi, %rdx),	%r11
	movq %r8,	0*8(%rdi)
	movq %r9,	1*8(%rdi)
	movq %r10,	-2*8(%rdi, %rdx)
	movq %r11,	-1*8(%rdi, %rdx)
	retq
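
	/*
	 * The 16..31 byte case above (and the 8..15 and 4..7 byte cases
	 * below) use the same overlap trick: load the head and the tail
	 * of the region first, then store both. For lengths below twice
	 * the access size the two pairs overlap, but since all loads
	 * happen before any store the result is still a correct copy,
	 * with no loop and no branch on the exact length.
	 */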
	.p2align 4
.Lless_16bytes:
	cmpq $8,	%rdx
	jb   .Lless_8bytes
	/*
	 * Move data from 8 bytes to 15 bytes.
	 */
	movq 0*8(%rsi),	%r8
	movq -1*8(%rsi, %rdx),	%r9
	movq %r8,	0*8(%rdi)
	movq %r9,	-1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_8bytes:
	cmpq $4,	%rdx
	jb   .Lless_3bytes

	/*
	 * Move data from 4 bytes to 7 bytes.
	 */
	movl (%rsi), %ecx
	movl -4(%rsi, %rdx), %r8d
	movl %ecx, (%rdi)
	movl %r8d, -4(%rdi, %rdx)
	retq
	.p2align 4
.Lless_3bytes:
	cmpl $0, %edx
	je .Lend
	/*
	 * Move data from 1 byte to 3 bytes.
	 */
.Lloop_1:
	movb (%rsi), %r8b
	movb %r8b, (%rdi)
	incq %rdi
	incq %rsi
	decl %edx
	jnz .Lloop_1
.Lend:
	retq
	CFI_ENDPROC
ENDPROC(memcpy)
ENDPROC(__memcpy)

	/*
	 * Some CPUs support the enhanced REP MOVSB/STOSB (ERMS) feature.
	 * If that feature is supported, memcpy_c_e() is the first choice.
	 * If enhanced REP MOVSB copy is not available, use the fast string
	 * copy memcpy_c() when possible; it is faster and its code is
	 * simpler than the original memcpy().
	 * Otherwise, the original unrolled memcpy() is used.
	 * In the .altinstructions section, the ERMS entry is placed after
	 * the REP_GOOD entry to get the right patch order.
	 *
	 * Replace only the beginning: memcpy() is itself used while the
	 * alternatives are applied, so it would be silly to overwrite
	 * itself with NOPs - a reboot would be the only outcome...
	 */
	.section .altinstructions, "a"
	altinstruction_entry memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\
			     .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c
	altinstruction_entry memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \
			     .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e
	.previous
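
/*
 * How the patching works, roughly: each altinstruction_entry above
 * records where the original code lives (memcpy), where the replacement
 * lives (.Lmemcpy_c or .Lmemcpy_c_e), which CPU feature selects it, and
 * the sizes involved. At boot, apply_alternatives() walks these entries
 * in order and, for every feature bit the CPU has set, copies the
 * replacement over the start of memcpy(); because the entries are
 * applied in order, the ERMS entry (listed last) wins on CPUs that have
 * both REP_GOOD and ERMS. Both length arguments here are the size of
 * the replacement, so only that many bytes at the start of memcpy() are
 * touched and the rest of the unrolled body simply becomes unreachable.
 */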