Commit ae2c6ca64118b934ef85f66adb03d5bbfdd57201

Authored by David S. Miller
1 parent da20116166

sparc64: Add SPARC-T4 optimized memcpy.

                Before          After
                --------------  --------------
bw_tcp:         1288.53 MB/sec  1637.77 MB/sec
bw_pipe:        1517.18 MB/sec  2107.61 MB/sec
bw_unix:        1838.38 MB/sec  2640.91 MB/sec

make -s -j128
allmodconfig	5min 49sec	5min 31sec

Signed-off-by: David S. Miller <davem@davemloft.net>

Showing 8 changed files with 546 additions and 2 deletions Side-by-side Diff

arch/sparc/kernel/head_64.S
... ... @@ -559,10 +559,10 @@
559 559 be,pt %xcc, niagara2_patch
560 560 nop
561 561 cmp %g1, SUN4V_CHIP_NIAGARA4
562   - be,pt %xcc, niagara2_patch
  562 + be,pt %xcc, niagara4_patch
563 563 nop
564 564 cmp %g1, SUN4V_CHIP_NIAGARA5
565   - be,pt %xcc, niagara2_patch
  565 + be,pt %xcc, niagara4_patch
566 566 nop
567 567  
568 568 call generic_patch_copyops
... ... @@ -573,6 +573,16 @@
573 573 nop
574 574  
575 575 ba,a,pt %xcc, 80f
  576 +niagara4_patch:
  577 + call niagara4_patch_copyops
  578 + nop
  579 + call niagara_patch_bzero
  580 + nop
  581 + call niagara4_patch_pageops
  582 + nop
  583 +
  584 + ba,a,pt %xcc, 80f
  585 +
576 586 niagara2_patch:
577 587 call niagara2_patch_copyops
578 588 nop
arch/sparc/lib/Makefile
... ... @@ -32,6 +32,9 @@
32 32 lib-$(CONFIG_SPARC64) += NG2memcpy.o NG2copy_from_user.o NG2copy_to_user.o
33 33 lib-$(CONFIG_SPARC64) += NG2patch.o
34 34  
  35 +lib-$(CONFIG_SPARC64) += NG4memcpy.o NG4copy_from_user.o NG4copy_to_user.o
  36 +lib-$(CONFIG_SPARC64) += NG4patch.o NG4copy_page.o
  37 +
35 38 lib-$(CONFIG_SPARC64) += GENmemcpy.o GENcopy_from_user.o GENcopy_to_user.o
36 39 lib-$(CONFIG_SPARC64) += GENpatch.o GENpage.o GENbzero.o
37 40  
arch/sparc/lib/NG4copy_from_user.S
  1 +/* NG4copy_from_user.S: Niagara-4 optimized copy from userspace.
  2 + *
  3 + * Copyright (C) 2012 David S. Miller (davem@davemloft.net)
  4 + */
  5 +
  6 +#define EX_LD(x) \
  7 +98: x; \
  8 + .section __ex_table,"a";\
  9 + .align 4; \
  10 + .word 98b, __retl_one_asi;\
  11 + .text; \
  12 + .align 4;
  13 +
  14 +#ifndef ASI_AIUS
  15 +#define ASI_AIUS 0x11
  16 +#endif
  17 +
  18 +#define FUNC_NAME NG4copy_from_user
  19 +#define LOAD(type,addr,dest) type##a [addr] %asi, dest
  20 +#define EX_RETVAL(x) 0
  21 +
  22 +#ifdef __KERNEL__
  23 +#define PREAMBLE \
  24 + rd %asi, %g1; \
  25 + cmp %g1, ASI_AIUS; \
  26 + bne,pn %icc, ___copy_in_user; \
  27 + nop
  28 +#endif
  29 +
  30 +#include "NG4memcpy.S"
arch/sparc/lib/NG4copy_page.S
  1 +/* NG4copy_page.S: Niagara-4 optimized copy page.
  2 + *
  3 + * Copyright (C) 2012 David S. Miller (davem@davemloft.net)
  4 + */
  5 +
  6 +#include <asm/asi.h>
  7 +#include <asm/page.h>
  8 +
  9 + .text
  10 + .align 32
  11 +
  12 + .register %g2, #scratch
  13 + .register %g3, #scratch
  14 +
  15 + .globl NG4copy_user_page
  16 +NG4copy_user_page: /* %o0=dest, %o1=src, %o2=vaddr */
  17 + prefetch [%o1 + 0x000], #n_reads_strong
  18 + prefetch [%o1 + 0x040], #n_reads_strong
  19 + prefetch [%o1 + 0x080], #n_reads_strong
  20 + prefetch [%o1 + 0x0c0], #n_reads_strong
  21 + set PAGE_SIZE, %g7
  22 + prefetch [%o1 + 0x100], #n_reads_strong
  23 + prefetch [%o1 + 0x140], #n_reads_strong
  24 + prefetch [%o1 + 0x180], #n_reads_strong
  25 + prefetch [%o1 + 0x1c0], #n_reads_strong
  26 +1:
  27 + ldx [%o1 + 0x00], %o2
  28 + subcc %g7, 0x40, %g7
  29 + ldx [%o1 + 0x08], %o3
  30 + ldx [%o1 + 0x10], %o4
  31 + ldx [%o1 + 0x18], %o5
  32 + ldx [%o1 + 0x20], %g1
  33 + stxa %o2, [%o0] ASI_BLK_INIT_QUAD_LDD_P
  34 + add %o0, 0x08, %o0
  35 + ldx [%o1 + 0x28], %g2
  36 + stxa %o3, [%o0] ASI_BLK_INIT_QUAD_LDD_P
  37 + add %o0, 0x08, %o0
  38 + ldx [%o1 + 0x30], %g3
  39 + stxa %o4, [%o0] ASI_BLK_INIT_QUAD_LDD_P
  40 + add %o0, 0x08, %o0
  41 + ldx [%o1 + 0x38], %o2
  42 + add %o1, 0x40, %o1
  43 + stxa %o5, [%o0] ASI_BLK_INIT_QUAD_LDD_P
  44 + add %o0, 0x08, %o0
  45 + stxa %g1, [%o0] ASI_BLK_INIT_QUAD_LDD_P
  46 + add %o0, 0x08, %o0
  47 + stxa %g2, [%o0] ASI_BLK_INIT_QUAD_LDD_P
  48 + add %o0, 0x08, %o0
  49 + stxa %g3, [%o0] ASI_BLK_INIT_QUAD_LDD_P
  50 + add %o0, 0x08, %o0
  51 + stxa %o2, [%o0] ASI_BLK_INIT_QUAD_LDD_P
  52 + add %o0, 0x08, %o0
  53 + bne,pt %icc, 1b
  54 + prefetch [%o1 + 0x200], #n_reads_strong
  55 + retl
  56 + membar #StoreLoad | #StoreStore
  57 + .size NG4copy_user_page,.-NG4copy_user_page
arch/sparc/lib/NG4copy_to_user.S
  1 +/* NG4copy_to_user.S: Niagara-4 optimized copy to userspace.
  2 + *
  3 + * Copyright (C) 2012 David S. Miller (davem@davemloft.net)
  4 + */
  5 +
  6 +#define EX_ST(x) \
  7 +98: x; \
  8 + .section __ex_table,"a";\
  9 + .align 4; \
  10 + .word 98b, __retl_one_asi;\
  11 + .text; \
  12 + .align 4;
  13 +
  14 +#ifndef ASI_AIUS
  15 +#define ASI_AIUS 0x11
  16 +#endif
  17 +
  18 +#ifndef ASI_BLK_INIT_QUAD_LDD_AIUS
  19 +#define ASI_BLK_INIT_QUAD_LDD_AIUS 0x23
  20 +#endif
  21 +
  22 +#define FUNC_NAME NG4copy_to_user
  23 +#define STORE(type,src,addr) type##a src, [addr] %asi
  24 +#define STORE_ASI ASI_BLK_INIT_QUAD_LDD_AIUS
  25 +#define EX_RETVAL(x) 0
  26 +
  27 +#ifdef __KERNEL__
  28 + /* Writing to %asi is _expensive_ so we hardcode it.
  29 + * Reading %asi to check for KERNEL_DS is comparatively
  30 + * cheap.
  31 + */
  32 +#define PREAMBLE \
  33 + rd %asi, %g1; \
  34 + cmp %g1, ASI_AIUS; \
  35 + bne,pn %icc, ___copy_in_user; \
  36 + nop
  37 +#endif
  38 +
  39 +#include "NG4memcpy.S"
arch/sparc/lib/NG4memcpy.S
  1 +/* NG4memcpy.S: Niagara-4 optimized memcpy.
  2 + *
  3 + * Copyright (C) 2012 David S. Miller (davem@davemloft.net)
  4 + */
  5 +
  6 +#ifdef __KERNEL__
  7 +#include <asm/visasm.h>
  8 +#include <asm/asi.h>
  9 +#define GLOBAL_SPARE %g7
  10 +#else
  11 +#define ASI_BLK_INIT_QUAD_LDD_P 0xe2
  12 +#define FPRS_FEF 0x04
  13 +
  14 +/* On T4 it is very expensive to access ASRs like %fprs and
  15 + * %asi; avoiding a read or a write can save ~50 cycles.
  16 + */
  17 +#define FPU_ENTER \
  18 + rd %fprs, %o5; \
  19 + andcc %o5, FPRS_FEF, %g0; \
  20 + be,a,pn %icc, 999f; \
  21 + wr %g0, FPRS_FEF, %fprs; \
  22 + 999:
  23 +
  24 +#ifdef MEMCPY_DEBUG
  25 +#define VISEntryHalf FPU_ENTER; \
  26 + clr %g1; clr %g2; clr %g3; clr %g5; subcc %g0, %g0, %g0;
  27 +#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
  28 +#else
  29 +#define VISEntryHalf FPU_ENTER
  30 +#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
  31 +#endif
  32 +
  33 +#define GLOBAL_SPARE %g5
  34 +#endif
  35 +
  36 +#ifndef STORE_ASI
  37 +#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
  38 +#define STORE_ASI ASI_BLK_INIT_QUAD_LDD_P
  39 +#else
  40 +#define STORE_ASI 0x80 /* ASI_P */
  41 +#endif
  42 +#endif
  43 +
  44 +#ifndef EX_LD
  45 +#define EX_LD(x) x
  46 +#endif
  47 +
  48 +#ifndef EX_ST
  49 +#define EX_ST(x) x
  50 +#endif
  51 +
  52 +#ifndef EX_RETVAL
  53 +#define EX_RETVAL(x) x
  54 +#endif
  55 +
  56 +#ifndef LOAD
  57 +#define LOAD(type,addr,dest) type [addr], dest
  58 +#endif
  59 +
  60 +#ifndef STORE
  61 +#ifndef MEMCPY_DEBUG
  62 +#define STORE(type,src,addr) type src, [addr]
  63 +#else
  64 +#define STORE(type,src,addr) type##a src, [addr] %asi
  65 +#endif
  66 +#endif
  67 +
  68 +#ifndef STORE_INIT
  69 +#define STORE_INIT(src,addr) stxa src, [addr] STORE_ASI
  70 +#endif
  71 +
  72 +#ifndef FUNC_NAME
  73 +#define FUNC_NAME NG4memcpy
  74 +#endif
  75 +#ifndef PREAMBLE
  76 +#define PREAMBLE
  77 +#endif
  78 +
  79 +#ifndef XCC
  80 +#define XCC xcc
  81 +#endif
  82 +
  83 + .register %g2,#scratch
  84 + .register %g3,#scratch
  85 +
  86 + .text
  87 + .align 64
  88 +
  89 + .globl FUNC_NAME
  90 + .type FUNC_NAME,#function
  91 +FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */
  92 +#ifdef MEMCPY_DEBUG
  93 + wr %g0, 0x80, %asi
  94 +#endif
  95 + srlx %o2, 31, %g2
  96 + cmp %g2, 0
  97 + tne %XCC, 5
  98 + PREAMBLE
  99 + mov %o0, %o3
  100 + brz,pn %o2, .Lexit
  101 + cmp %o2, 3
  102 + ble,pn %icc, .Ltiny
  103 + cmp %o2, 19
  104 + ble,pn %icc, .Lsmall
  105 + or %o0, %o1, %g2
  106 + cmp %o2, 128
  107 + bl,pn %icc, .Lmedium
  108 + nop
  109 +
  110 +.Llarge:/* len >= 0x80 */
  111 + /* First get dest 8 byte aligned. */
  112 + sub %g0, %o0, %g1
  113 + and %g1, 0x7, %g1
  114 + brz,pt %g1, 51f
  115 + sub %o2, %g1, %o2
  116 +
  117 +1: EX_LD(LOAD(ldub, %o1 + 0x00, %g2))
  118 + add %o1, 1, %o1
  119 + subcc %g1, 1, %g1
  120 + add %o0, 1, %o0
  121 + bne,pt %icc, 1b
  122 + EX_ST(STORE(stb, %g2, %o0 - 0x01))
  123 +
  124 +51: LOAD(prefetch, %o1 + 0x040, #n_reads_strong)
  125 + LOAD(prefetch, %o1 + 0x080, #n_reads_strong)
  126 + LOAD(prefetch, %o1 + 0x0c0, #n_reads_strong)
  127 + LOAD(prefetch, %o1 + 0x100, #n_reads_strong)
  128 + LOAD(prefetch, %o1 + 0x140, #n_reads_strong)
  129 + LOAD(prefetch, %o1 + 0x180, #n_reads_strong)
  130 + LOAD(prefetch, %o1 + 0x1c0, #n_reads_strong)
  131 + LOAD(prefetch, %o1 + 0x200, #n_reads_strong)
  132 +
  133 + /* Check if we can use the straight fully aligned
  134 + * loop, or we require the alignaddr/faligndata variant.
  135 + */
  136 + andcc %o1, 0x7, %o5
  137 + bne,pn %icc, .Llarge_src_unaligned
  138 + sub %g0, %o0, %g1
  139 +
  140 + /* Legitimize the use of initializing stores by getting dest
  141 + * to be 64-byte aligned.
  142 + */
  143 + and %g1, 0x3f, %g1
  144 + brz,pt %g1, .Llarge_aligned
  145 + sub %o2, %g1, %o2
  146 +
  147 +1: EX_LD(LOAD(ldx, %o1 + 0x00, %g2))
  148 + add %o1, 8, %o1
  149 + subcc %g1, 8, %g1
  150 + add %o0, 8, %o0
  151 + bne,pt %icc, 1b
  152 + EX_ST(STORE(stx, %g2, %o0 - 0x08))
  153 +
  154 +.Llarge_aligned:
  155 + /* len at entry >= 0x80 && src 8-byte aligned && dest 64-byte aligned */
  156 + andn %o2, 0x3f, %o4
  157 + sub %o2, %o4, %o2
  158 +
  159 +1: EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
  160 + add %o1, 0x40, %o1
  161 + EX_LD(LOAD(ldx, %o1 - 0x38, %g2))
  162 + subcc %o4, 0x40, %o4
  163 + EX_LD(LOAD(ldx, %o1 - 0x30, %g3))
  164 + EX_LD(LOAD(ldx, %o1 - 0x28, GLOBAL_SPARE))
  165 + EX_LD(LOAD(ldx, %o1 - 0x20, %o5))
  166 + EX_ST(STORE_INIT(%g1, %o0))
  167 + add %o0, 0x08, %o0
  168 + EX_ST(STORE_INIT(%g2, %o0))
  169 + add %o0, 0x08, %o0
  170 + EX_LD(LOAD(ldx, %o1 - 0x18, %g2))
  171 + EX_ST(STORE_INIT(%g3, %o0))
  172 + add %o0, 0x08, %o0
  173 + EX_LD(LOAD(ldx, %o1 - 0x10, %g3))
  174 + EX_ST(STORE_INIT(GLOBAL_SPARE, %o0))
  175 + add %o0, 0x08, %o0
  176 + EX_LD(LOAD(ldx, %o1 - 0x08, GLOBAL_SPARE))
  177 + EX_ST(STORE_INIT(%o5, %o0))
  178 + add %o0, 0x08, %o0
  179 + EX_ST(STORE_INIT(%g2, %o0))
  180 + add %o0, 0x08, %o0
  181 + EX_ST(STORE_INIT(%g3, %o0))
  182 + add %o0, 0x08, %o0
  183 + EX_ST(STORE_INIT(GLOBAL_SPARE, %o0))
  184 + add %o0, 0x08, %o0
  185 + bne,pt %icc, 1b
  186 + LOAD(prefetch, %o1 + 0x200, #n_reads_strong)
  187 +
  188 + membar #StoreLoad | #StoreStore
  189 +
  190 + brz,pn %o2, .Lexit
  191 + cmp %o2, 19
  192 + ble,pn %icc, .Lsmall_unaligned
  193 + nop
  194 + ba,a,pt %icc, .Lmedium_noprefetch
  195 +
  196 +.Lexit: retl
  197 + mov EX_RETVAL(%o3), %o0
  198 +
  199 +.Llarge_src_unaligned:
  200 + andn %o2, 0x3f, %o4
  201 + sub %o2, %o4, %o2
  202 + VISEntryHalf
  203 + alignaddr %o1, %g0, %g1
  204 + add %o1, %o4, %o1
  205 + EX_LD(LOAD(ldd, %g1 + 0x00, %f0))
  206 +1: EX_LD(LOAD(ldd, %g1 + 0x08, %f2))
  207 + subcc %o4, 0x40, %o4
  208 + EX_LD(LOAD(ldd, %g1 + 0x10, %f4))
  209 + EX_LD(LOAD(ldd, %g1 + 0x18, %f6))
  210 + EX_LD(LOAD(ldd, %g1 + 0x20, %f8))
  211 + EX_LD(LOAD(ldd, %g1 + 0x28, %f10))
  212 + EX_LD(LOAD(ldd, %g1 + 0x30, %f12))
  213 + EX_LD(LOAD(ldd, %g1 + 0x38, %f14))
  214 + faligndata %f0, %f2, %f16
  215 + EX_LD(LOAD(ldd, %g1 + 0x40, %f0))
  216 + faligndata %f2, %f4, %f18
  217 + add %g1, 0x40, %g1
  218 + faligndata %f4, %f6, %f20
  219 + faligndata %f6, %f8, %f22
  220 + faligndata %f8, %f10, %f24
  221 + faligndata %f10, %f12, %f26
  222 + faligndata %f12, %f14, %f28
  223 + faligndata %f14, %f0, %f30
  224 + EX_ST(STORE(std, %f16, %o0 + 0x00))
  225 + EX_ST(STORE(std, %f18, %o0 + 0x08))
  226 + EX_ST(STORE(std, %f20, %o0 + 0x10))
  227 + EX_ST(STORE(std, %f22, %o0 + 0x18))
  228 + EX_ST(STORE(std, %f24, %o0 + 0x20))
  229 + EX_ST(STORE(std, %f26, %o0 + 0x28))
  230 + EX_ST(STORE(std, %f28, %o0 + 0x30))
  231 + EX_ST(STORE(std, %f30, %o0 + 0x38))
  232 + add %o0, 0x40, %o0
  233 + bne,pt %icc, 1b
  234 + LOAD(prefetch, %g1 + 0x200, #n_reads_strong)
  235 + VISExitHalf
  236 +
  237 + brz,pn %o2, .Lexit
  238 + cmp %o2, 19
  239 + ble,pn %icc, .Lsmall_unaligned
  240 + nop
  241 + ba,a,pt %icc, .Lmedium_unaligned
  242 +
  243 +.Lmedium:
  244 + LOAD(prefetch, %o1 + 0x40, #n_reads_strong)
  245 + andcc %g2, 0x7, %g0
  246 + bne,pn %icc, .Lmedium_unaligned
  247 + nop
  248 +.Lmedium_noprefetch:
  249 + andncc %o2, 0x20 - 1, %o5
  250 + be,pn %icc, 2f
  251 + sub %o2, %o5, %o2
  252 +1: EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
  253 + EX_LD(LOAD(ldx, %o1 + 0x08, %g2))
  254 + EX_LD(LOAD(ldx, %o1 + 0x10, GLOBAL_SPARE))
  255 + EX_LD(LOAD(ldx, %o1 + 0x18, %o4))
  256 + add %o1, 0x20, %o1
  257 + subcc %o5, 0x20, %o5
  258 + EX_ST(STORE(stx, %g1, %o0 + 0x00))
  259 + EX_ST(STORE(stx, %g2, %o0 + 0x08))
  260 + EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x10))
  261 + EX_ST(STORE(stx, %o4, %o0 + 0x18))
  262 + bne,pt %icc, 1b
  263 + add %o0, 0x20, %o0
  264 +2: andcc %o2, 0x18, %o5
  265 + be,pt %icc, 3f
  266 + sub %o2, %o5, %o2
  267 +1: EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
  268 + add %o1, 0x08, %o1
  269 + add %o0, 0x08, %o0
  270 + subcc %o5, 0x08, %o5
  271 + bne,pt %icc, 1b
  272 + EX_ST(STORE(stx, %g1, %o0 - 0x08))
  273 +3: brz,pt %o2, .Lexit
  274 + cmp %o2, 0x04
  275 + bl,pn %icc, .Ltiny
  276 + nop
  277 + EX_LD(LOAD(lduw, %o1 + 0x00, %g1))
  278 + add %o1, 0x04, %o1
  279 + add %o0, 0x04, %o0
  280 + subcc %o2, 0x04, %o2
  281 + bne,pn %icc, .Ltiny
  282 + EX_ST(STORE(stw, %g1, %o0 - 0x04))
  283 + ba,a,pt %icc, .Lexit
  284 +.Lmedium_unaligned:
  285 + /* First get dest 8 byte aligned. */
  286 + sub %g0, %o0, %g1
  287 + and %g1, 0x7, %g1
  288 + brz,pt %g1, 2f
  289 + sub %o2, %g1, %o2
  290 +
  291 +1: EX_LD(LOAD(ldub, %o1 + 0x00, %g2))
  292 + add %o1, 1, %o1
  293 + subcc %g1, 1, %g1
  294 + add %o0, 1, %o0
  295 + bne,pt %icc, 1b
  296 + EX_ST(STORE(stb, %g2, %o0 - 0x01))
  297 +2:
  298 + and %o1, 0x7, %g1
  299 + brz,pn %g1, .Lmedium_noprefetch
  300 + sll %g1, 3, %g1
  301 + mov 64, %g2
  302 + sub %g2, %g1, %g2
  303 + andn %o1, 0x7, %o1
  304 + EX_LD(LOAD(ldx, %o1 + 0x00, %o4))
  305 + sllx %o4, %g1, %o4
  306 + andn %o2, 0x08 - 1, %o5
  307 + sub %o2, %o5, %o2
  308 +1: EX_LD(LOAD(ldx, %o1 + 0x08, %g3))
  309 + add %o1, 0x08, %o1
  310 + subcc %o5, 0x08, %o5
  311 + srlx %g3, %g2, GLOBAL_SPARE
  312 + or GLOBAL_SPARE, %o4, GLOBAL_SPARE
  313 + EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x00))
  314 + add %o0, 0x08, %o0
  315 + bne,pt %icc, 1b
  316 + sllx %g3, %g1, %o4
  317 + srl %g1, 3, %g1
  318 + add %o1, %g1, %o1
  319 + brz,pn %o2, .Lexit
  320 + nop
  321 + ba,pt %icc, .Lsmall_unaligned
  322 +
  323 +.Ltiny:
  324 + EX_LD(LOAD(ldub, %o1 + 0x00, %g1))
  325 + subcc %o2, 1, %o2
  326 + be,pn %icc, .Lexit
  327 + EX_ST(STORE(stb, %g1, %o0 + 0x00))
  328 + EX_LD(LOAD(ldub, %o1 + 0x01, %g1))
  329 + subcc %o2, 1, %o2
  330 + be,pn %icc, .Lexit
  331 + EX_ST(STORE(stb, %g1, %o0 + 0x01))
  332 + EX_LD(LOAD(ldub, %o1 + 0x02, %g1))
  333 + ba,pt %icc, .Lexit
  334 + EX_ST(STORE(stb, %g1, %o0 + 0x02))
  335 +
  336 +.Lsmall:
  337 + andcc %g2, 0x3, %g0
  338 + bne,pn %icc, .Lsmall_unaligned
  339 + andn %o2, 0x4 - 1, %o5
  340 + sub %o2, %o5, %o2
  341 +1:
  342 + EX_LD(LOAD(lduw, %o1 + 0x00, %g1))
  343 + add %o1, 0x04, %o1
  344 + subcc %o5, 0x04, %o5
  345 + add %o0, 0x04, %o0
  346 + bne,pt %icc, 1b
  347 + EX_ST(STORE(stw, %g1, %o0 - 0x04))
  348 + brz,pt %o2, .Lexit
  349 + nop
  350 + ba,a,pt %icc, .Ltiny
  351 +
  352 +.Lsmall_unaligned:
  353 +1: EX_LD(LOAD(ldub, %o1 + 0x00, %g1))
  354 + add %o1, 1, %o1
  355 + add %o0, 1, %o0
  356 + subcc %o2, 1, %o2
  357 + bne,pt %icc, 1b
  358 + EX_ST(STORE(stb, %g1, %o0 - 0x01))
  359 + ba,a,pt %icc, .Lexit
  360 + .size FUNC_NAME, .-FUNC_NAME
arch/sparc/lib/NG4patch.S
  1 +/* NG4patch.S: Patch Ultra-I routines with Niagara-4 variant.
  2 + *
  3 + * Copyright (C) 2012 David S. Miller <davem@davemloft.net>
  4 + */
  5 +
  6 +#define BRANCH_ALWAYS 0x10680000
  7 +#define NOP 0x01000000
  8 +#define NG_DO_PATCH(OLD, NEW) \
  9 + sethi %hi(NEW), %g1; \
  10 + or %g1, %lo(NEW), %g1; \
  11 + sethi %hi(OLD), %g2; \
  12 + or %g2, %lo(OLD), %g2; \
  13 + sub %g1, %g2, %g1; \
  14 + sethi %hi(BRANCH_ALWAYS), %g3; \
  15 + sll %g1, 11, %g1; \
  16 + srl %g1, 11 + 2, %g1; \
  17 + or %g3, %lo(BRANCH_ALWAYS), %g3; \
  18 + or %g3, %g1, %g3; \
  19 + stw %g3, [%g2]; \
  20 + sethi %hi(NOP), %g3; \
  21 + or %g3, %lo(NOP), %g3; \
  22 + stw %g3, [%g2 + 0x4]; \
  23 + flush %g2;
  24 +
  25 + .globl niagara4_patch_copyops
  26 + .type niagara4_patch_copyops,#function
  27 +niagara4_patch_copyops:
  28 + NG_DO_PATCH(memcpy, NG4memcpy)
  29 + NG_DO_PATCH(___copy_from_user, NG4copy_from_user)
  30 + NG_DO_PATCH(___copy_to_user, NG4copy_to_user)
  31 + retl
  32 + nop
  33 + .size niagara4_patch_copyops,.-niagara4_patch_copyops
  34 +
  35 + .globl niagara4_patch_pageops
  36 + .type niagara4_patch_pageops,#function
  37 +niagara4_patch_pageops:
  38 + NG_DO_PATCH(copy_user_page, NG4copy_user_page)
  39 + NG_DO_PATCH(_clear_page, NGclear_page)
  40 + NG_DO_PATCH(clear_user_page, NGclear_user_page)
  41 + retl
  42 + nop
  43 + .size niagara4_patch_pageops,.-niagara4_patch_pageops
arch/sparc/lib/NGpage.S
... ... @@ -59,6 +59,8 @@
59 59 restore
60 60  
61 61 .align 32
  62 + .globl NGclear_page
  63 + .globl NGclear_user_page
62 64 NGclear_page: /* %o0=dest */
63 65 NGclear_user_page: /* %o0=dest, %o1=vaddr */
64 66 rd %asi, %g3