Commit ae2c6ca64118b934ef85f66adb03d5bbfdd57201
Parent da20116166
sparc64: Add SPARC-T4 optimized memcpy.
                                Before           After
                                --------------   --------------
    bw_tcp:                     1288.53 MB/sec   1637.77 MB/sec
    bw_pipe:                    1517.18 MB/sec   2107.61 MB/sec
    bw_unix:                    1838.38 MB/sec   2640.91 MB/sec
    make -s -j128 allmodconfig  5min 49sec       5min 31sec

Signed-off-by: David S. Miller <davem@davemloft.net>
Showing 8 changed files with 546 additions and 2 deletions
arch/sparc/kernel/head_64.S
@@ -559,10 +559,10 @@
 	be,pt	%xcc, niagara2_patch
 	nop
 	cmp	%g1, SUN4V_CHIP_NIAGARA4
-	be,pt	%xcc, niagara2_patch
+	be,pt	%xcc, niagara4_patch
 	nop
 	cmp	%g1, SUN4V_CHIP_NIAGARA5
-	be,pt	%xcc, niagara2_patch
+	be,pt	%xcc, niagara4_patch
 	nop
 
 	call	generic_patch_copyops
@@ -573,6 +573,16 @@
 	nop
 
 	ba,a,pt	%xcc, 80f
+niagara4_patch:
+	call	niagara4_patch_copyops
+	nop
+	call	niagara_patch_bzero
+	nop
+	call	niagara4_patch_pageops
+	nop
+
+	ba,a,pt	%xcc, 80f
+
 niagara2_patch:
 	call	niagara2_patch_copyops
 	nop
arch/sparc/lib/Makefile
@@ -32,6 +32,9 @@
 lib-$(CONFIG_SPARC64) += NG2memcpy.o NG2copy_from_user.o NG2copy_to_user.o
 lib-$(CONFIG_SPARC64) += NG2patch.o
 
+lib-$(CONFIG_SPARC64) += NG4memcpy.o NG4copy_from_user.o NG4copy_to_user.o
+lib-$(CONFIG_SPARC64) += NG4patch.o NG4copy_page.o
+
 lib-$(CONFIG_SPARC64) += GENmemcpy.o GENcopy_from_user.o GENcopy_to_user.o
 lib-$(CONFIG_SPARC64) += GENpatch.o GENpage.o GENbzero.o
 
arch/sparc/lib/NG4copy_from_user.S
@@ -0,0 +1,30 @@
+/* NG4copy_from_user.S: Niagara-4 optimized copy from userspace.
+ *
+ * Copyright (C) 2012 David S. Miller (davem@davemloft.net)
+ */
+
+#define EX_LD(x)		\
+98:	x;			\
+	.section __ex_table,"a";\
+	.align 4;		\
+	.word 98b, __retl_one_asi;\
+	.text;			\
+	.align 4;
+
+#ifndef ASI_AIUS
+#define ASI_AIUS	0x11
+#endif
+
+#define FUNC_NAME		NG4copy_from_user
+#define LOAD(type,addr,dest)	type##a [addr] %asi, dest
+#define EX_RETVAL(x)		0
+
+#ifdef __KERNEL__
+#define PREAMBLE					\
+	rd	%asi, %g1;				\
+	cmp	%g1, ASI_AIUS;				\
+	bne,pn	%icc, ___copy_in_user;			\
+	nop
+#endif
+
+#include "NG4memcpy.S"
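The EX_LD wrapper above is what makes these user-space loads recoverable: each wrapped instruction gets a paired record in the __ex_table section, so a fault on the load unwinds to __retl_one_asi instead of oopsing the kernel. A minimal C view of one such record, assuming the two-word sparc64 layout of this era (the comments are illustrative):

	/* One __ex_table record per EX_LD/EX_ST site.  If the instruction
	 * at `insn` faults, the trap handler transfers control to `fixup`
	 * (here __retl_one_asi), which produces the routine's error
	 * return instead of letting the fault propagate. */
	struct exception_table_entry {
		unsigned int insn;	/* address of the faulting load/store */
		unsigned int fixup;	/* address of the recovery stub */
	};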
arch/sparc/lib/NG4copy_page.S
@@ -0,0 +1,57 @@
+/* NG4copy_page.S: Niagara-4 optimized copy page.
+ *
+ * Copyright (C) 2012 (davem@davemloft.net)
+ */
+
+#include <asm/asi.h>
+#include <asm/page.h>
+
+	.text
+	.align	32
+
+	.register	%g2, #scratch
+	.register	%g3, #scratch
+
+	.globl	NG4copy_user_page
+NG4copy_user_page:	/* %o0=dest, %o1=src, %o2=vaddr */
+	prefetch	[%o1 + 0x000], #n_reads_strong
+	prefetch	[%o1 + 0x040], #n_reads_strong
+	prefetch	[%o1 + 0x080], #n_reads_strong
+	prefetch	[%o1 + 0x0c0], #n_reads_strong
+	set	PAGE_SIZE, %g7
+	prefetch	[%o1 + 0x100], #n_reads_strong
+	prefetch	[%o1 + 0x140], #n_reads_strong
+	prefetch	[%o1 + 0x180], #n_reads_strong
+	prefetch	[%o1 + 0x1c0], #n_reads_strong
+1:
+	ldx	[%o1 + 0x00], %o2
+	subcc	%g7, 0x40, %g7
+	ldx	[%o1 + 0x08], %o3
+	ldx	[%o1 + 0x10], %o4
+	ldx	[%o1 + 0x18], %o5
+	ldx	[%o1 + 0x20], %g1
+	stxa	%o2, [%o0] ASI_BLK_INIT_QUAD_LDD_P
+	add	%o0, 0x08, %o0
+	ldx	[%o1 + 0x28], %g2
+	stxa	%o3, [%o0] ASI_BLK_INIT_QUAD_LDD_P
+	add	%o0, 0x08, %o0
+	ldx	[%o1 + 0x30], %g3
+	stxa	%o4, [%o0] ASI_BLK_INIT_QUAD_LDD_P
+	add	%o0, 0x08, %o0
+	ldx	[%o1 + 0x38], %o2
+	add	%o1, 0x40, %o1
+	stxa	%o5, [%o0] ASI_BLK_INIT_QUAD_LDD_P
+	add	%o0, 0x08, %o0
+	stxa	%g1, [%o0] ASI_BLK_INIT_QUAD_LDD_P
+	add	%o0, 0x08, %o0
+	stxa	%g2, [%o0] ASI_BLK_INIT_QUAD_LDD_P
+	add	%o0, 0x08, %o0
+	stxa	%g3, [%o0] ASI_BLK_INIT_QUAD_LDD_P
+	add	%o0, 0x08, %o0
+	stxa	%o2, [%o0] ASI_BLK_INIT_QUAD_LDD_P
+	add	%o0, 0x08, %o0
+	bne,pt	%icc, 1b
+	prefetch	[%o1 + 0x200], #n_reads_strong
+	retl
+	membar	#StoreLoad | #StoreStore
+	.size	NG4copy_user_page,.-NG4copy_user_page
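The structure of the loop above is easier to see in C. This is a minimal sketch, assuming an 8 KB sparc64 base page; the function name is hypothetical and GCC's __builtin_prefetch stands in for the #n_reads_strong prefetches. What plain C cannot express is the ASI_BLK_INIT_QUAD_LDD_P stores, which tell the T4 not to read-for-ownership destination cache lines it is about to fully overwrite:

	#include <stdint.h>

	#define PAGE_SIZE 8192	/* sparc64 base page size */

	/* Warm the first 512 bytes of the source, then move 64 bytes per
	 * iteration while prefetching 512 bytes ahead of the load pointer. */
	void ng4_copy_page_sketch(uint64_t *dst, const uint64_t *src)
	{
		for (int off = 0; off < 0x200; off += 0x40)
			__builtin_prefetch((const char *)src + off, 0, 0);

		for (long left = PAGE_SIZE; left > 0; left -= 0x40) {
			for (int i = 0; i < 8; i++)
				*dst++ = *src++;	/* eight ldx/stxa pairs */
			__builtin_prefetch((const char *)src + 0x200, 0, 0);
		}
	}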
arch/sparc/lib/NG4copy_to_user.S
@@ -0,0 +1,39 @@
+/* NG4copy_to_user.S: Niagara-4 optimized copy to userspace.
+ *
+ * Copyright (C) 2012 David S. Miller (davem@davemloft.net)
+ */
+
+#define EX_ST(x)		\
+98:	x;			\
+	.section __ex_table,"a";\
+	.align 4;		\
+	.word 98b, __retl_one_asi;\
+	.text;			\
+	.align 4;
+
+#ifndef ASI_AIUS
+#define ASI_AIUS	0x11
+#endif
+
+#ifndef ASI_BLK_INIT_QUAD_LDD_AIUS
+#define ASI_BLK_INIT_QUAD_LDD_AIUS 0x23
+#endif
+
+#define FUNC_NAME		NG4copy_to_user
+#define STORE(type,src,addr)	type##a src, [addr] %asi
+#define STORE_ASI		ASI_BLK_INIT_QUAD_LDD_AIUS
+#define EX_RETVAL(x)		0
+
+#ifdef __KERNEL__
+	/* Writing to %asi is _expensive_ so we hardcode it.
+	 * Reading %asi to check for KERNEL_DS is comparatively
+	 * cheap.
+	 */
+#define PREAMBLE					\
+	rd	%asi, %g1;				\
+	cmp	%g1, ASI_AIUS;				\
+	bne,pn	%icc, ___copy_in_user;			\
+	nop
+#endif
+
+#include "NG4memcpy.S"
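The PREAMBLE comment above captures the trade-off: a write to %asi is costly on T4, so the routine hardcodes the user ASIs in its stores and only *reads* %asi to detect callers running under KERNEL_DS. A C-level sketch of that control flow, where read_asi() and the entry-point name are hypothetical stand-ins for the "rd %asi" and the real symbols:

	#include <stddef.h>

	#define ASI_AIUS 0x11	/* user-secondary ASI, matching the file above */

	/* Stand-ins that only exist to make the control flow visible. */
	extern unsigned int read_asi(void);
	extern size_t ___copy_in_user(void *to, const void *from, size_t len);

	size_t ng4_copy_to_user_sketch(void *to, const void *from, size_t len)
	{
		/* PREAMBLE: if %asi is not ASI_AIUS the caller used
		 * KERNEL_DS, so bail out to the generic routine rather
		 * than pay for a "wr %asi". */
		if (read_asi() != ASI_AIUS)
			return ___copy_in_user(to, from, len);
		/* ...otherwise fall into the NG4memcpy body, whose stores
		 * use the hardcoded ASI_BLK_INIT_QUAD_LDD_AIUS. */
		return 0;	/* 0 = all bytes copied, per EX_RETVAL above */
	}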
arch/sparc/lib/NG4memcpy.S
@@ -0,0 +1,360 @@
+/* NG4memcpy.S: Niagara-4 optimized memcpy.
+ *
+ * Copyright (C) 2012 David S. Miller (davem@davemloft.net)
+ */
+
+#ifdef __KERNEL__
+#include <asm/visasm.h>
+#include <asm/asi.h>
+#define GLOBAL_SPARE	%g7
+#else
+#define ASI_BLK_INIT_QUAD_LDD_P 0xe2
+#define FPRS_FEF  0x04
+
+/* On T4 it is very expensive to access ASRs like %fprs and
+ * %asi, avoiding a read or a write can save ~50 cycles.
+ */
+#define FPU_ENTER			\
+	rd	%fprs, %o5;		\
+	andcc	%o5, FPRS_FEF, %g0;	\
+	be,a,pn	%icc, 999f;		\
+	wr	%g0, FPRS_FEF, %fprs;	\
+	999:
+
+#ifdef MEMCPY_DEBUG
+#define VISEntryHalf FPU_ENTER; \
+		     clr %g1; clr %g2; clr %g3; clr %g5; subcc %g0, %g0, %g0;
+#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
+#else
+#define VISEntryHalf FPU_ENTER
+#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
+#endif
+
+#define GLOBAL_SPARE	%g5
+#endif
+
+#ifndef STORE_ASI
+#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
+#define STORE_ASI	ASI_BLK_INIT_QUAD_LDD_P
+#else
+#define STORE_ASI	0x80		/* ASI_P */
+#endif
+#endif
+
+#ifndef EX_LD
+#define EX_LD(x)	x
+#endif
+
+#ifndef EX_ST
+#define EX_ST(x)	x
+#endif
+
+#ifndef EX_RETVAL
+#define EX_RETVAL(x)	x
+#endif
+
+#ifndef LOAD
+#define LOAD(type,addr,dest)	type [addr], dest
+#endif
+
+#ifndef STORE
+#ifndef MEMCPY_DEBUG
+#define STORE(type,src,addr)	type src, [addr]
+#else
+#define STORE(type,src,addr)	type##a src, [addr] %asi
+#endif
+#endif
+
+#ifndef STORE_INIT
+#define STORE_INIT(src,addr)	stxa src, [addr] STORE_ASI
+#endif
+
+#ifndef FUNC_NAME
+#define FUNC_NAME	NG4memcpy
+#endif
+#ifndef PREAMBLE
+#define PREAMBLE
+#endif
+
+#ifndef XCC
+#define XCC xcc
+#endif
+
+	.register	%g2,#scratch
+	.register	%g3,#scratch
+
+	.text
+	.align	64
+
+	.globl	FUNC_NAME
+	.type	FUNC_NAME,#function
+FUNC_NAME:	/* %o0=dst, %o1=src, %o2=len */
+#ifdef MEMCPY_DEBUG
+	wr	%g0, 0x80, %asi
+#endif
+	srlx	%o2, 31, %g2
+	cmp	%g2, 0
+	tne	%XCC, 5
+	PREAMBLE
+	mov	%o0, %o3
+	brz,pn	%o2, .Lexit
+	cmp	%o2, 3
+	ble,pn	%icc, .Ltiny
+	cmp	%o2, 19
+	ble,pn	%icc, .Lsmall
+	or	%o0, %o1, %g2
+	cmp	%o2, 128
+	bl,pn	%icc, .Lmedium
+	nop
+
+.Llarge:/* len >= 0x80 */
+	/* First get dest 8 byte aligned. */
+	sub	%g0, %o0, %g1
+	and	%g1, 0x7, %g1
+	brz,pt	%g1, 51f
+	sub	%o2, %g1, %o2
+
+1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g2))
+	add	%o1, 1, %o1
+	subcc	%g1, 1, %g1
+	add	%o0, 1, %o0
+	bne,pt	%icc, 1b
+	EX_ST(STORE(stb, %g2, %o0 - 0x01))
+
+51:	LOAD(prefetch, %o1 + 0x040, #n_reads_strong)
+	LOAD(prefetch, %o1 + 0x080, #n_reads_strong)
+	LOAD(prefetch, %o1 + 0x0c0, #n_reads_strong)
+	LOAD(prefetch, %o1 + 0x100, #n_reads_strong)
+	LOAD(prefetch, %o1 + 0x140, #n_reads_strong)
+	LOAD(prefetch, %o1 + 0x180, #n_reads_strong)
+	LOAD(prefetch, %o1 + 0x1c0, #n_reads_strong)
+	LOAD(prefetch, %o1 + 0x200, #n_reads_strong)
+
+	/* Check if we can use the straight fully aligned
+	 * loop, or we require the alignaddr/faligndata variant.
+	 */
+	andcc	%o1, 0x7, %o5
+	bne,pn	%icc, .Llarge_src_unaligned
+	sub	%g0, %o0, %g1
+
+	/* Legitimize the use of initializing stores by getting dest
+	 * to be 64-byte aligned.
+	 */
+	and	%g1, 0x3f, %g1
+	brz,pt	%g1, .Llarge_aligned
+	sub	%o2, %g1, %o2
+
+1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g2))
+	add	%o1, 8, %o1
+	subcc	%g1, 8, %g1
+	add	%o0, 8, %o0
+	bne,pt	%icc, 1b
+	EX_ST(STORE(stx, %g2, %o0 - 0x08))
+
+.Llarge_aligned:
+	/* len >= 0x80 && src 8-byte aligned && dest 8-byte aligned */
+	andn	%o2, 0x3f, %o4
+	sub	%o2, %o4, %o2
+
+1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
+	add	%o1, 0x40, %o1
+	EX_LD(LOAD(ldx, %o1 - 0x38, %g2))
+	subcc	%o4, 0x40, %o4
+	EX_LD(LOAD(ldx, %o1 - 0x30, %g3))
+	EX_LD(LOAD(ldx, %o1 - 0x28, GLOBAL_SPARE))
+	EX_LD(LOAD(ldx, %o1 - 0x20, %o5))
+	EX_ST(STORE_INIT(%g1, %o0))
+	add	%o0, 0x08, %o0
+	EX_ST(STORE_INIT(%g2, %o0))
+	add	%o0, 0x08, %o0
+	EX_LD(LOAD(ldx, %o1 - 0x18, %g2))
+	EX_ST(STORE_INIT(%g3, %o0))
+	add	%o0, 0x08, %o0
+	EX_LD(LOAD(ldx, %o1 - 0x10, %g3))
+	EX_ST(STORE_INIT(GLOBAL_SPARE, %o0))
+	add	%o0, 0x08, %o0
+	EX_LD(LOAD(ldx, %o1 - 0x08, GLOBAL_SPARE))
+	EX_ST(STORE_INIT(%o5, %o0))
+	add	%o0, 0x08, %o0
+	EX_ST(STORE_INIT(%g2, %o0))
+	add	%o0, 0x08, %o0
+	EX_ST(STORE_INIT(%g3, %o0))
+	add	%o0, 0x08, %o0
+	EX_ST(STORE_INIT(GLOBAL_SPARE, %o0))
+	add	%o0, 0x08, %o0
+	bne,pt	%icc, 1b
+	LOAD(prefetch, %o1 + 0x200, #n_reads_strong)
+
+	membar	#StoreLoad | #StoreStore
+
+	brz,pn	%o2, .Lexit
+	cmp	%o2, 19
+	ble,pn	%icc, .Lsmall_unaligned
+	nop
+	ba,a,pt	%icc, .Lmedium_noprefetch
+
+.Lexit:	retl
+	mov	EX_RETVAL(%o3), %o0
+
+.Llarge_src_unaligned:
+	andn	%o2, 0x3f, %o4
+	sub	%o2, %o4, %o2
+	VISEntryHalf
+	alignaddr	%o1, %g0, %g1
+	add	%o1, %o4, %o1
+	EX_LD(LOAD(ldd, %g1 + 0x00, %f0))
+1:	EX_LD(LOAD(ldd, %g1 + 0x08, %f2))
+	subcc	%o4, 0x40, %o4
+	EX_LD(LOAD(ldd, %g1 + 0x10, %f4))
+	EX_LD(LOAD(ldd, %g1 + 0x18, %f6))
+	EX_LD(LOAD(ldd, %g1 + 0x20, %f8))
+	EX_LD(LOAD(ldd, %g1 + 0x28, %f10))
+	EX_LD(LOAD(ldd, %g1 + 0x30, %f12))
+	EX_LD(LOAD(ldd, %g1 + 0x38, %f14))
+	faligndata	%f0, %f2, %f16
+	EX_LD(LOAD(ldd, %g1 + 0x40, %f0))
+	faligndata	%f2, %f4, %f18
+	add	%g1, 0x40, %g1
+	faligndata	%f4, %f6, %f20
+	faligndata	%f6, %f8, %f22
+	faligndata	%f8, %f10, %f24
+	faligndata	%f10, %f12, %f26
+	faligndata	%f12, %f14, %f28
+	faligndata	%f14, %f0, %f30
+	EX_ST(STORE(std, %f16, %o0 + 0x00))
+	EX_ST(STORE(std, %f18, %o0 + 0x08))
+	EX_ST(STORE(std, %f20, %o0 + 0x10))
+	EX_ST(STORE(std, %f22, %o0 + 0x18))
+	EX_ST(STORE(std, %f24, %o0 + 0x20))
+	EX_ST(STORE(std, %f26, %o0 + 0x28))
+	EX_ST(STORE(std, %f28, %o0 + 0x30))
+	EX_ST(STORE(std, %f30, %o0 + 0x38))
+	add	%o0, 0x40, %o0
+	bne,pt	%icc, 1b
+	LOAD(prefetch, %g1 + 0x200, #n_reads_strong)
+	VISExitHalf
+
+	brz,pn	%o2, .Lexit
+	cmp	%o2, 19
+	ble,pn	%icc, .Lsmall_unaligned
+	nop
+	ba,a,pt	%icc, .Lmedium_unaligned
+
+.Lmedium:
+	LOAD(prefetch, %o1 + 0x40, #n_reads_strong)
+	andcc	%g2, 0x7, %g0
+	bne,pn	%icc, .Lmedium_unaligned
+	nop
+.Lmedium_noprefetch:
+	andncc	%o2, 0x20 - 1, %o5
+	be,pn	%icc, 2f
+	sub	%o2, %o5, %o2
+1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
+	EX_LD(LOAD(ldx, %o1 + 0x08, %g2))
+	EX_LD(LOAD(ldx, %o1 + 0x10, GLOBAL_SPARE))
+	EX_LD(LOAD(ldx, %o1 + 0x18, %o4))
+	add	%o1, 0x20, %o1
+	subcc	%o5, 0x20, %o5
+	EX_ST(STORE(stx, %g1, %o0 + 0x00))
+	EX_ST(STORE(stx, %g2, %o0 + 0x08))
+	EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x10))
+	EX_ST(STORE(stx, %o4, %o0 + 0x18))
+	bne,pt	%icc, 1b
+	add	%o0, 0x20, %o0
+2:	andcc	%o2, 0x18, %o5
+	be,pt	%icc, 3f
+	sub	%o2, %o5, %o2
+1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
+	add	%o1, 0x08, %o1
+	add	%o0, 0x08, %o0
+	subcc	%o5, 0x08, %o5
+	bne,pt	%icc, 1b
+	EX_ST(STORE(stx, %g1, %o0 - 0x08))
+3:	brz,pt	%o2, .Lexit
+	cmp	%o2, 0x04
+	bl,pn	%icc, .Ltiny
+	nop
+	EX_LD(LOAD(lduw, %o1 + 0x00, %g1))
+	add	%o1, 0x04, %o1
+	add	%o0, 0x04, %o0
+	subcc	%o2, 0x04, %o2
+	bne,pn	%icc, .Ltiny
+	EX_ST(STORE(stw, %g1, %o0 - 0x04))
+	ba,a,pt	%icc, .Lexit
+.Lmedium_unaligned:
+	/* First get dest 8 byte aligned. */
+	sub	%g0, %o0, %g1
+	and	%g1, 0x7, %g1
+	brz,pt	%g1, 2f
+	sub	%o2, %g1, %o2
+
+1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g2))
+	add	%o1, 1, %o1
+	subcc	%g1, 1, %g1
+	add	%o0, 1, %o0
+	bne,pt	%icc, 1b
+	EX_ST(STORE(stb, %g2, %o0 - 0x01))
+2:
+	and	%o1, 0x7, %g1
+	brz,pn	%g1, .Lmedium_noprefetch
+	sll	%g1, 3, %g1
+	mov	64, %g2
+	sub	%g2, %g1, %g2
+	andn	%o1, 0x7, %o1
+	EX_LD(LOAD(ldx, %o1 + 0x00, %o4))
+	sllx	%o4, %g1, %o4
+	andn	%o2, 0x08 - 1, %o5
+	sub	%o2, %o5, %o2
+1:	EX_LD(LOAD(ldx, %o1 + 0x08, %g3))
+	add	%o1, 0x08, %o1
+	subcc	%o5, 0x08, %o5
+	srlx	%g3, %g2, GLOBAL_SPARE
+	or	GLOBAL_SPARE, %o4, GLOBAL_SPARE
+	EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x00))
+	add	%o0, 0x08, %o0
+	bne,pt	%icc, 1b
+	sllx	%g3, %g1, %o4
+	srl	%g1, 3, %g1
+	add	%o1, %g1, %o1
+	brz,pn	%o2, .Lexit
+	nop
+	ba,pt	%icc, .Lsmall_unaligned
+
+.Ltiny:
+	EX_LD(LOAD(ldub, %o1 + 0x00, %g1))
+	subcc	%o2, 1, %o2
+	be,pn	%icc, .Lexit
+	EX_ST(STORE(stb, %g1, %o0 + 0x00))
+	EX_LD(LOAD(ldub, %o1 + 0x01, %g1))
+	subcc	%o2, 1, %o2
+	be,pn	%icc, .Lexit
+	EX_ST(STORE(stb, %g1, %o0 + 0x01))
+	EX_LD(LOAD(ldub, %o1 + 0x02, %g1))
+	ba,pt	%icc, .Lexit
+	EX_ST(STORE(stb, %g1, %o0 + 0x02))
+
+.Lsmall:
+	andcc	%g2, 0x3, %g0
+	bne,pn	%icc, .Lsmall_unaligned
+	andn	%o2, 0x4 - 1, %o5
+	sub	%o2, %o5, %o2
+1:
+	EX_LD(LOAD(lduw, %o1 + 0x00, %g1))
+	add	%o1, 0x04, %o1
+	subcc	%o5, 0x04, %o5
+	add	%o0, 0x04, %o0
+	bne,pt	%icc, 1b
+	EX_ST(STORE(stw, %g1, %o0 - 0x04))
+	brz,pt	%o2, .Lexit
+	nop
+	ba,a,pt	%icc, .Ltiny
+
+.Lsmall_unaligned:
+1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g1))
+	add	%o1, 1, %o1
+	add	%o0, 1, %o0
+	subcc	%o2, 1, %o2
+	bne,pt	%icc, 1b
+	EX_ST(STORE(stb, %g1, %o0 - 0x01))
+	ba,a,pt	%icc, .Lexit
+	.size	FUNC_NAME, .-FUNC_NAME
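The entry-point compares above split every copy into four size classes. A minimal C sketch of that dispatch, using the thresholds visible in the assembly (3, 19, 128); the function name is hypothetical and the simple loops stand in for the unrolled, prefetching assembly bodies:

	#include <stddef.h>
	#include <stdint.h>

	void *ng4_memcpy_sketch(void *dst, const void *src, size_t len)
	{
		unsigned char *d = dst;
		const unsigned char *s = src;

		if (len > 19) {
			/* .Lmedium (< 128) and .Llarge (>= 128) both start by
			 * bringing dst up to an 8-byte boundary, byte by byte. */
			while (len && ((uintptr_t)d & 7)) {
				*d++ = *s++;
				len--;
			}
			if (((uintptr_t)s & 7) == 0) {
				/* Fully aligned: the real code moves 32 bytes per
				 * iteration in .Lmedium and 64 in .Llarge_aligned,
				 * adding prefetch and cache-line-initializing
				 * stores; one 8-byte loop stands in for all that. */
				while (len >= 8) {
					*(uint64_t *)(void *)d =
						*(const uint64_t *)(const void *)s;
					d += 8;
					s += 8;
					len -= 8;
				}
			}
			/* An unaligned src falls through here; the assembly
			 * instead uses shift-and-merge (.Lmedium_unaligned) or
			 * VIS faligndata (.Llarge_src_unaligned). */
		}
		while (len--)	/* .Ltiny / .Lsmall tails */
			*d++ = *s++;
		return dst;
	}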
arch/sparc/lib/NG4patch.S
@@ -0,0 +1,43 @@
+/* NG4patch.S: Patch Ultra-I routines with Niagara-4 variant.
+ *
+ * Copyright (C) 2012 David S. Miller <davem@davemloft.net>
+ */
+
+#define BRANCH_ALWAYS	0x10680000
+#define NOP		0x01000000
+#define NG_DO_PATCH(OLD, NEW)	\
+	sethi	%hi(NEW), %g1; \
+	or	%g1, %lo(NEW), %g1; \
+	sethi	%hi(OLD), %g2; \
+	or	%g2, %lo(OLD), %g2; \
+	sub	%g1, %g2, %g1; \
+	sethi	%hi(BRANCH_ALWAYS), %g3; \
+	sll	%g1, 11, %g1; \
+	srl	%g1, 11 + 2, %g1; \
+	or	%g3, %lo(BRANCH_ALWAYS), %g3; \
+	or	%g3, %g1, %g3; \
+	stw	%g3, [%g2]; \
+	sethi	%hi(NOP), %g3; \
+	or	%g3, %lo(NOP), %g3; \
+	stw	%g3, [%g2 + 0x4]; \
+	flush	%g2;
+
+	.globl	niagara4_patch_copyops
+	.type	niagara4_patch_copyops,#function
+niagara4_patch_copyops:
+	NG_DO_PATCH(memcpy, NG4memcpy)
+	NG_DO_PATCH(___copy_from_user, NG4copy_from_user)
+	NG_DO_PATCH(___copy_to_user, NG4copy_to_user)
+	retl
+	nop
+	.size	niagara4_patch_copyops,.-niagara4_patch_copyops
+
+	.globl	niagara4_patch_pageops
+	.type	niagara4_patch_pageops,#function
+niagara4_patch_pageops:
+	NG_DO_PATCH(copy_user_page, NG4copy_user_page)
+	NG_DO_PATCH(_clear_page, NGclear_page)
+	NG_DO_PATCH(clear_user_page, NGclear_user_page)
+	retl
+	nop
+	.size	niagara4_patch_pageops,.-niagara4_patch_pageops
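NG_DO_PATCH redirects a generic routine by rewriting its first two instructions: the distance to the new routine is packed into the 19-bit word-displacement field of a branch-always instruction, followed by a nop for the delay slot. A C sketch of the arithmetic, assuming 32-bit instruction words (the function name is hypothetical):

	#include <stdint.h>

	#define BRANCH_ALWAYS	0x10680000u	/* "ba,pt %xcc" with empty disp19 */
	#define NOP		0x01000000u

	void ng_do_patch_sketch(uint32_t *old_fn, const uint32_t *new_fn)
	{
		long disp = (char *)new_fn - (char *)old_fn;

		/* "sll %g1, 11; srl %g1, 11 + 2" in the macro: keep the low
		 * 21 bits of the byte offset and turn it into the 19-bit
		 * word displacement field of the branch. */
		uint32_t disp19 = ((uint32_t)disp << 11) >> 13;

		old_fn[0] = BRANCH_ALWAYS | disp19;	/* ba,pt %xcc, new_fn */
		old_fn[1] = NOP;			/* fill the delay slot */
		/* the macro then issues "flush %g2" so the CPU refetches
		 * the rewritten instructions */
	}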