Commit b3f271e86e5a440713716bb222e1aa1227994c50

Authored by Anton Blanchard
Committed by Benjamin Herrenschmidt
1 parent bce4b4bd91

powerpc: POWER7 optimised memcpy using VMX and enhanced prefetch

Implement a POWER7 optimised memcpy using VMX and enhanced prefetch
instructions.

This is a copy of the POWER7 optimised copy_to_user/copy_from_user
loop. Detailed implementation and performance details can be found in
commit a66086b8197d (powerpc: POWER7 optimised
copy_to_user/copy_from_user using VMX).
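
As with that commit, the new routine is only used on CPUs that advertise the
CPU_FTR_VMX_COPY feature; the feature section added to memcpy_64.S below
patches in a branch to memcpy_power7 at boot. Conceptually the dispatch
behaves like the following C sketch (the real mechanism is binary patching
rather than a runtime test, and memcpy_generic is just a hypothetical name
for the existing memcpy_64.S fallback path):

	void *memcpy(void *dest, const void *src, size_t n)
	{
		if (cpu_has_feature(CPU_FTR_VMX_COPY))	/* set on POWER7 */
			return memcpy_power7(dest, src, n);
		return memcpy_generic(dest, src, n);	/* original copy loop */
	}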

I noticed memcpy showing up high in the profile of a RAID6 workload:

	.memcpy
	.async_memcpy
	.async_copy_data
	.__raid_run_ops
	.handle_stripe
	.raid5d
	.md_thread

I created a simplified testcase by building a RAID6 array with 4 1GB
ramdisks (booting with brd.rd_size=1048576):

# mdadm -CR -e 1.2 /dev/md0 --level=6 -n4 /dev/ram[0-3]
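
(brd's rd_size module parameter is given in KiB, so rd_size=1048576
corresponds to the 1GB per ramdisk mentioned above.)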

I then timed how long it took to write to the entire array:

# dd if=/dev/zero of=/dev/md0 bs=1M
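
(With 4 members, RAID6 dedicates two disks' worth of space to parity,
leaving roughly 2GB of usable capacity; dd runs until it hits the end of
the device and reports the average write throughput when it stops.)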

Before: 892 MB/s
After:  999 MB/s

A 12% improvement.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

Showing 3 changed files with 656 additions and 1 deletion

arch/powerpc/lib/Makefile
... ... @@ -17,7 +17,8 @@
17 17 obj-$(CONFIG_PPC64) += copypage_64.o copyuser_64.o \
18 18 memcpy_64.o usercopy_64.o mem_64.o string.o \
19 19 checksum_wrappers_64.o hweight_64.o \
20   - copyuser_power7.o string_64.o copypage_power7.o
  20 + copyuser_power7.o string_64.o copypage_power7.o \
  21 + memcpy_power7.o
21 22 obj-$(CONFIG_XMON) += sstep.o ldstfp.o
22 23 obj-$(CONFIG_KPROBES) += sstep.o ldstfp.o
23 24 obj-$(CONFIG_HAVE_HW_BREAKPOINT) += sstep.o ldstfp.o
arch/powerpc/lib/memcpy_64.S
... ... @@ -11,7 +11,11 @@
11 11  
12 12 .align 7
13 13 _GLOBAL(memcpy)
  14 +BEGIN_FTR_SECTION
14 15 std r3,48(r1) /* save destination pointer for return value */
  16 +FTR_SECTION_ELSE
  17 + b memcpy_power7
  18 +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY)
15 19 PPC_MTOCRF(0x01,r5)
16 20 cmpldi cr1,r5,16
17 21 neg r6,r3 # LS 3 bits = # bytes to 8-byte dest bdry
arch/powerpc/lib/memcpy_power7.S
  1 +/*
  2 + * This program is free software; you can redistribute it and/or modify
  3 + * it under the terms of the GNU General Public License as published by
  4 + * the Free Software Foundation; either version 2 of the License, or
  5 + * (at your option) any later version.
  6 + *
  7 + * This program is distributed in the hope that it will be useful,
  8 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  9 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  10 + * GNU General Public License for more details.
  11 + *
  12 + * You should have received a copy of the GNU General Public License
  13 + * along with this program; if not, write to the Free Software
  14 + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  15 + *
  16 + * Copyright (C) IBM Corporation, 2012
  17 + *
  18 + * Author: Anton Blanchard <anton@au.ibm.com>
  19 + */
  20 +#include <asm/ppc_asm.h>
  21 +
  22 +#define STACKFRAMESIZE 256
  23 +#define STK_REG(i) (112 + ((i)-14)*8)
  24 +
  25 +_GLOBAL(memcpy_power7)
  26 +#ifdef CONFIG_ALTIVEC
  27 + cmpldi r5,16
  28 + cmpldi cr1,r5,4096
  29 +
  30 + std r3,48(r1)
  31 +
  32 + blt .Lshort_copy
  33 + bgt cr1,.Lvmx_copy
  34 +#else
  35 + cmpldi r5,16
  36 +
  37 + std r3,48(r1)
  38 +
  39 + blt .Lshort_copy
  40 +#endif
  41 +
  42 +.Lnonvmx_copy:
  43 + /* Get the source 8B aligned */
  44 + neg r6,r4
  45 + mtocrf 0x01,r6
  46 + clrldi r6,r6,(64-3)
  47 +
  48 + bf cr7*4+3,1f
  49 + lbz r0,0(r4)
  50 + addi r4,r4,1
  51 + stb r0,0(r3)
  52 + addi r3,r3,1
  53 +
  54 +1: bf cr7*4+2,2f
  55 + lhz r0,0(r4)
  56 + addi r4,r4,2
  57 + sth r0,0(r3)
  58 + addi r3,r3,2
  59 +
  60 +2: bf cr7*4+1,3f
  61 + lwz r0,0(r4)
  62 + addi r4,r4,4
  63 + stw r0,0(r3)
  64 + addi r3,r3,4
  65 +
  66 +3: sub r5,r5,r6
  67 + cmpldi r5,128
  68 + blt 5f
  69 +
  70 + mflr r0
  71 + stdu r1,-STACKFRAMESIZE(r1)
  72 + std r14,STK_REG(r14)(r1)
  73 + std r15,STK_REG(r15)(r1)
  74 + std r16,STK_REG(r16)(r1)
  75 + std r17,STK_REG(r17)(r1)
  76 + std r18,STK_REG(r18)(r1)
  77 + std r19,STK_REG(r19)(r1)
  78 + std r20,STK_REG(r20)(r1)
  79 + std r21,STK_REG(r21)(r1)
  80 + std r22,STK_REG(r22)(r1)
  81 + std r0,STACKFRAMESIZE+16(r1)
  82 +
  83 + srdi r6,r5,7
  84 + mtctr r6
  85 +
  86 + /* Now do cacheline (128B) sized loads and stores. */
  87 + .align 5
  88 +4:
  89 + ld r0,0(r4)
  90 + ld r6,8(r4)
  91 + ld r7,16(r4)
  92 + ld r8,24(r4)
  93 + ld r9,32(r4)
  94 + ld r10,40(r4)
  95 + ld r11,48(r4)
  96 + ld r12,56(r4)
  97 + ld r14,64(r4)
  98 + ld r15,72(r4)
  99 + ld r16,80(r4)
  100 + ld r17,88(r4)
  101 + ld r18,96(r4)
  102 + ld r19,104(r4)
  103 + ld r20,112(r4)
  104 + ld r21,120(r4)
  105 + addi r4,r4,128
  106 + std r0,0(r3)
  107 + std r6,8(r3)
  108 + std r7,16(r3)
  109 + std r8,24(r3)
  110 + std r9,32(r3)
  111 + std r10,40(r3)
  112 + std r11,48(r3)
  113 + std r12,56(r3)
  114 + std r14,64(r3)
  115 + std r15,72(r3)
  116 + std r16,80(r3)
  117 + std r17,88(r3)
  118 + std r18,96(r3)
  119 + std r19,104(r3)
  120 + std r20,112(r3)
  121 + std r21,120(r3)
  122 + addi r3,r3,128
  123 + bdnz 4b
  124 +
  125 + clrldi r5,r5,(64-7)
  126 +
  127 + ld r14,STK_REG(r14)(r1)
  128 + ld r15,STK_REG(r15)(r1)
  129 + ld r16,STK_REG(r16)(r1)
  130 + ld r17,STK_REG(r17)(r1)
  131 + ld r18,STK_REG(r18)(r1)
  132 + ld r19,STK_REG(r19)(r1)
  133 + ld r20,STK_REG(r20)(r1)
  134 + ld r21,STK_REG(r21)(r1)
  135 + ld r22,STK_REG(r22)(r1)
  136 + addi r1,r1,STACKFRAMESIZE
  137 +
  138 + /* Up to 127B to go */
  139 +5: srdi r6,r5,4
  140 + mtocrf 0x01,r6
  141 +
  142 +6: bf cr7*4+1,7f
  143 + ld r0,0(r4)
  144 + ld r6,8(r4)
  145 + ld r7,16(r4)
  146 + ld r8,24(r4)
  147 + ld r9,32(r4)
  148 + ld r10,40(r4)
  149 + ld r11,48(r4)
  150 + ld r12,56(r4)
  151 + addi r4,r4,64
  152 + std r0,0(r3)
  153 + std r6,8(r3)
  154 + std r7,16(r3)
  155 + std r8,24(r3)
  156 + std r9,32(r3)
  157 + std r10,40(r3)
  158 + std r11,48(r3)
  159 + std r12,56(r3)
  160 + addi r3,r3,64
  161 +
  162 + /* Up to 63B to go */
  163 +7: bf cr7*4+2,8f
  164 + ld r0,0(r4)
  165 + ld r6,8(r4)
  166 + ld r7,16(r4)
  167 + ld r8,24(r4)
  168 + addi r4,r4,32
  169 + std r0,0(r3)
  170 + std r6,8(r3)
  171 + std r7,16(r3)
  172 + std r8,24(r3)
  173 + addi r3,r3,32
  174 +
  175 + /* Up to 31B to go */
  176 +8: bf cr7*4+3,9f
  177 + ld r0,0(r4)
  178 + ld r6,8(r4)
  179 + addi r4,r4,16
  180 + std r0,0(r3)
  181 + std r6,8(r3)
  182 + addi r3,r3,16
  183 +
  184 +9: clrldi r5,r5,(64-4)
  185 +
  186 + /* Up to 15B to go */
  187 +.Lshort_copy:
  188 + mtocrf 0x01,r5
  189 + bf cr7*4+0,12f
  190 + lwz r0,0(r4) /* Less chance of a reject with word ops */
  191 + lwz r6,4(r4)
  192 + addi r4,r4,8
  193 + stw r0,0(r3)
  194 + stw r6,4(r3)
  195 + addi r3,r3,8
  196 +
  197 +12: bf cr7*4+1,13f
  198 + lwz r0,0(r4)
  199 + addi r4,r4,4
  200 + stw r0,0(r3)
  201 + addi r3,r3,4
  202 +
  203 +13: bf cr7*4+2,14f
  204 + lhz r0,0(r4)
  205 + addi r4,r4,2
  206 + sth r0,0(r3)
  207 + addi r3,r3,2
  208 +
  209 +14: bf cr7*4+3,15f
  210 + lbz r0,0(r4)
  211 + stb r0,0(r3)
  212 +
  213 +15: ld r3,48(r1)
  214 + blr
  215 +
  216 +.Lunwind_stack_nonvmx_copy:
  217 + addi r1,r1,STACKFRAMESIZE
  218 + b .Lnonvmx_copy
  219 +
  220 +#ifdef CONFIG_ALTIVEC
  221 +.Lvmx_copy:
  222 + mflr r0
  223 + std r4,56(r1)
  224 + std r5,64(r1)
  225 + std r0,16(r1)
  226 + stdu r1,-STACKFRAMESIZE(r1)
  227 + bl .enter_vmx_copy
  228 + cmpwi r3,0
  229 + ld r0,STACKFRAMESIZE+16(r1)
  230 + ld r3,STACKFRAMESIZE+48(r1)
  231 + ld r4,STACKFRAMESIZE+56(r1)
  232 + ld r5,STACKFRAMESIZE+64(r1)
  233 + mtlr r0
  234 +
  235 + /*
  236 + * We prefetch both the source and destination using enhanced touch
  237 + * instructions. We use a stream ID of 0 for the load side and
  238 + * 1 for the store side.
  239 + */
  240 + clrrdi r6,r4,7
  241 + clrrdi r9,r3,7
  242 + ori r9,r9,1 /* stream=1 */
  243 +
  244 + srdi r7,r5,7 /* length in cachelines, capped at 0x3FF */
  245 + cmpldi cr1,r7,0x3FF
  246 + ble cr1,1f
  247 + li r7,0x3FF
  248 +1: lis r0,0x0E00 /* depth=7 */
  249 + sldi r7,r7,7
  250 + or r7,r7,r0
  251 + ori r10,r7,1 /* stream=1 */
  252 +
  253 + lis r8,0x8000 /* GO=1 */
  254 + clrldi r8,r8,32
  255 +
  256 +.machine push
  257 +.machine "power4"
  258 + dcbt r0,r6,0b01000
  259 + dcbt r0,r7,0b01010
  260 + dcbtst r0,r9,0b01000
  261 + dcbtst r0,r10,0b01010
  262 + eieio
  263 + dcbt r0,r8,0b01010 /* GO */
  264 +.machine pop
  265 +
  266 + beq .Lunwind_stack_nonvmx_copy
  267 +
  268 + /*
  269 + * If source and destination are not relatively aligned we use a
  270 + * slower permute loop.
  271 + */
  272 + xor r6,r4,r3
  273 + rldicl. r6,r6,0,(64-4)
  274 + bne .Lvmx_unaligned_copy
  275 +
  276 + /* Get the destination 16B aligned */
  277 + neg r6,r3
  278 + mtocrf 0x01,r6
  279 + clrldi r6,r6,(64-4)
  280 +
  281 + bf cr7*4+3,1f
  282 + lbz r0,0(r4)
  283 + addi r4,r4,1
  284 + stb r0,0(r3)
  285 + addi r3,r3,1
  286 +
  287 +1: bf cr7*4+2,2f
  288 + lhz r0,0(r4)
  289 + addi r4,r4,2
  290 + sth r0,0(r3)
  291 + addi r3,r3,2
  292 +
  293 +2: bf cr7*4+1,3f
  294 + lwz r0,0(r4)
  295 + addi r4,r4,4
  296 + stw r0,0(r3)
  297 + addi r3,r3,4
  298 +
  299 +3: bf cr7*4+0,4f
  300 + ld r0,0(r4)
  301 + addi r4,r4,8
  302 + std r0,0(r3)
  303 + addi r3,r3,8
  304 +
  305 +4: sub r5,r5,r6
  306 +
  307 + /* Get the destination 128B aligned */
  308 + neg r6,r3
  309 + srdi r7,r6,4
  310 + mtocrf 0x01,r7
  311 + clrldi r6,r6,(64-7)
  312 +
  313 + li r9,16
  314 + li r10,32
  315 + li r11,48
  316 +
  317 + bf cr7*4+3,5f
  318 + lvx vr1,r0,r4
  319 + addi r4,r4,16
  320 + stvx vr1,r0,r3
  321 + addi r3,r3,16
  322 +
  323 +5: bf cr7*4+2,6f
  324 + lvx vr1,r0,r4
  325 + lvx vr0,r4,r9
  326 + addi r4,r4,32
  327 + stvx vr1,r0,r3
  328 + stvx vr0,r3,r9
  329 + addi r3,r3,32
  330 +
  331 +6: bf cr7*4+1,7f
  332 + lvx vr3,r0,r4
  333 + lvx vr2,r4,r9
  334 + lvx vr1,r4,r10
  335 + lvx vr0,r4,r11
  336 + addi r4,r4,64
  337 + stvx vr3,r0,r3
  338 + stvx vr2,r3,r9
  339 + stvx vr1,r3,r10
  340 + stvx vr0,r3,r11
  341 + addi r3,r3,64
  342 +
  343 +7: sub r5,r5,r6
  344 + srdi r6,r5,7
  345 +
  346 + std r14,STK_REG(r14)(r1)
  347 + std r15,STK_REG(r15)(r1)
  348 + std r16,STK_REG(r16)(r1)
  349 +
  350 + li r12,64
  351 + li r14,80
  352 + li r15,96
  353 + li r16,112
  354 +
  355 + mtctr r6
  356 +
  357 + /*
  358 + * Now do cacheline sized loads and stores. By this stage the
  359 + * cacheline stores are also cacheline aligned.
  360 + */
  361 + .align 5
  362 +8:
  363 + lvx vr7,r0,r4
  364 + lvx vr6,r4,r9
  365 + lvx vr5,r4,r10
  366 + lvx vr4,r4,r11
  367 + lvx vr3,r4,r12
  368 + lvx vr2,r4,r14
  369 + lvx vr1,r4,r15
  370 + lvx vr0,r4,r16
  371 + addi r4,r4,128
  372 + stvx vr7,r0,r3
  373 + stvx vr6,r3,r9
  374 + stvx vr5,r3,r10
  375 + stvx vr4,r3,r11
  376 + stvx vr3,r3,r12
  377 + stvx vr2,r3,r14
  378 + stvx vr1,r3,r15
  379 + stvx vr0,r3,r16
  380 + addi r3,r3,128
  381 + bdnz 8b
  382 +
  383 + ld r14,STK_REG(r14)(r1)
  384 + ld r15,STK_REG(r15)(r1)
  385 + ld r16,STK_REG(r16)(r1)
  386 +
  387 + /* Up to 127B to go */
  388 + clrldi r5,r5,(64-7)
  389 + srdi r6,r5,4
  390 + mtocrf 0x01,r6
  391 +
  392 + bf cr7*4+1,9f
  393 + lvx vr3,r0,r4
  394 + lvx vr2,r4,r9
  395 + lvx vr1,r4,r10
  396 + lvx vr0,r4,r11
  397 + addi r4,r4,64
  398 + stvx vr3,r0,r3
  399 + stvx vr2,r3,r9
  400 + stvx vr1,r3,r10
  401 + stvx vr0,r3,r11
  402 + addi r3,r3,64
  403 +
  404 +9: bf cr7*4+2,10f
  405 + lvx vr1,r0,r4
  406 + lvx vr0,r4,r9
  407 + addi r4,r4,32
  408 + stvx vr1,r0,r3
  409 + stvx vr0,r3,r9
  410 + addi r3,r3,32
  411 +
  412 +10: bf cr7*4+3,11f
  413 + lvx vr1,r0,r4
  414 + addi r4,r4,16
  415 + stvx vr1,r0,r3
  416 + addi r3,r3,16
  417 +
  418 + /* Up to 15B to go */
  419 +11: clrldi r5,r5,(64-4)
  420 + mtocrf 0x01,r5
  421 + bf cr7*4+0,12f
  422 + ld r0,0(r4)
  423 + addi r4,r4,8
  424 + std r0,0(r3)
  425 + addi r3,r3,8
  426 +
  427 +12: bf cr7*4+1,13f
  428 + lwz r0,0(r4)
  429 + addi r4,r4,4
  430 + stw r0,0(r3)
  431 + addi r3,r3,4
  432 +
  433 +13: bf cr7*4+2,14f
  434 + lhz r0,0(r4)
  435 + addi r4,r4,2
  436 + sth r0,0(r3)
  437 + addi r3,r3,2
  438 +
  439 +14: bf cr7*4+3,15f
  440 + lbz r0,0(r4)
  441 + stb r0,0(r3)
  442 +
  443 +15: addi r1,r1,STACKFRAMESIZE
  444 + ld r3,48(r1)
  445 + b .exit_vmx_copy /* tail call optimise */
  446 +
  447 +.Lvmx_unaligned_copy:
  448 + /* Get the destination 16B aligned */
  449 + neg r6,r3
  450 + mtocrf 0x01,r6
  451 + clrldi r6,r6,(64-4)
  452 +
  453 + bf cr7*4+3,1f
  454 + lbz r0,0(r4)
  455 + addi r4,r4,1
  456 + stb r0,0(r3)
  457 + addi r3,r3,1
  458 +
  459 +1: bf cr7*4+2,2f
  460 + lhz r0,0(r4)
  461 + addi r4,r4,2
  462 + sth r0,0(r3)
  463 + addi r3,r3,2
  464 +
  465 +2: bf cr7*4+1,3f
  466 + lwz r0,0(r4)
  467 + addi r4,r4,4
  468 + stw r0,0(r3)
  469 + addi r3,r3,4
  470 +
  471 +3: bf cr7*4+0,4f
  472 + lwz r0,0(r4) /* Less chance of a reject with word ops */
  473 + lwz r7,4(r4)
  474 + addi r4,r4,8
  475 + stw r0,0(r3)
  476 + stw r7,4(r3)
  477 + addi r3,r3,8
  478 +
  479 +4: sub r5,r5,r6
  480 +
  481 + /* Get the destination 128B aligned */
  482 + neg r6,r3
  483 + srdi r7,r6,4
  484 + mtocrf 0x01,r7
  485 + clrldi r6,r6,(64-7)
  486 +
  487 + li r9,16
  488 + li r10,32
  489 + li r11,48
  490 +
  491 + lvsl vr16,0,r4 /* Setup permute control vector */
  492 + lvx vr0,0,r4
  493 + addi r4,r4,16
  494 +
  495 + bf cr7*4+3,5f
  496 + lvx vr1,r0,r4
  497 + vperm vr8,vr0,vr1,vr16
  498 + addi r4,r4,16
  499 + stvx vr8,r0,r3
  500 + addi r3,r3,16
  501 + vor vr0,vr1,vr1
  502 +
  503 +5: bf cr7*4+2,6f
  504 + lvx vr1,r0,r4
  505 + vperm vr8,vr0,vr1,vr16
  506 + lvx vr0,r4,r9
  507 + vperm vr9,vr1,vr0,vr16
  508 + addi r4,r4,32
  509 + stvx vr8,r0,r3
  510 + stvx vr9,r3,r9
  511 + addi r3,r3,32
  512 +
  513 +6: bf cr7*4+1,7f
  514 + lvx vr3,r0,r4
  515 + vperm vr8,vr0,vr3,vr16
  516 + lvx vr2,r4,r9
  517 + vperm vr9,vr3,vr2,vr16
  518 + lvx vr1,r4,r10
  519 + vperm vr10,vr2,vr1,vr16
  520 + lvx vr0,r4,r11
  521 + vperm vr11,vr1,vr0,vr16
  522 + addi r4,r4,64
  523 + stvx vr8,r0,r3
  524 + stvx vr9,r3,r9
  525 + stvx vr10,r3,r10
  526 + stvx vr11,r3,r11
  527 + addi r3,r3,64
  528 +
  529 +7: sub r5,r5,r6
  530 + srdi r6,r5,7
  531 +
  532 + std r14,STK_REG(r14)(r1)
  533 + std r15,STK_REG(r15)(r1)
  534 + std r16,STK_REG(r16)(r1)
  535 +
  536 + li r12,64
  537 + li r14,80
  538 + li r15,96
  539 + li r16,112
  540 +
  541 + mtctr r6
  542 +
  543 + /*
  544 + * Now do cacheline sized loads and stores. By this stage the
  545 + * cacheline stores are also cacheline aligned.
  546 + */
  547 + .align 5
  548 +8:
  549 + lvx vr7,r0,r4
  550 + vperm vr8,vr0,vr7,vr16
  551 + lvx vr6,r4,r9
  552 + vperm vr9,vr7,vr6,vr16
  553 + lvx vr5,r4,r10
  554 + vperm vr10,vr6,vr5,vr16
  555 + lvx vr4,r4,r11
  556 + vperm vr11,vr5,vr4,vr16
  557 + lvx vr3,r4,r12
  558 + vperm vr12,vr4,vr3,vr16
  559 + lvx vr2,r4,r14
  560 + vperm vr13,vr3,vr2,vr16
  561 + lvx vr1,r4,r15
  562 + vperm vr14,vr2,vr1,vr16
  563 + lvx vr0,r4,r16
  564 + vperm vr15,vr1,vr0,vr16
  565 + addi r4,r4,128
  566 + stvx vr8,r0,r3
  567 + stvx vr9,r3,r9
  568 + stvx vr10,r3,r10
  569 + stvx vr11,r3,r11
  570 + stvx vr12,r3,r12
  571 + stvx vr13,r3,r14
  572 + stvx vr14,r3,r15
  573 + stvx vr15,r3,r16
  574 + addi r3,r3,128
  575 + bdnz 8b
  576 +
  577 + ld r14,STK_REG(r14)(r1)
  578 + ld r15,STK_REG(r15)(r1)
  579 + ld r16,STK_REG(r16)(r1)
  580 +
  581 + /* Up to 127B to go */
  582 + clrldi r5,r5,(64-7)
  583 + srdi r6,r5,4
  584 + mtocrf 0x01,r6
  585 +
  586 + bf cr7*4+1,9f
  587 + lvx vr3,r0,r4
  588 + vperm vr8,vr0,vr3,vr16
  589 + lvx vr2,r4,r9
  590 + vperm vr9,vr3,vr2,vr16
  591 + lvx vr1,r4,r10
  592 + vperm vr10,vr2,vr1,vr16
  593 + lvx vr0,r4,r11
  594 + vperm vr11,vr1,vr0,vr16
  595 + addi r4,r4,64
  596 + stvx vr8,r0,r3
  597 + stvx vr9,r3,r9
  598 + stvx vr10,r3,r10
  599 + stvx vr11,r3,r11
  600 + addi r3,r3,64
  601 +
  602 +9: bf cr7*4+2,10f
  603 + lvx vr1,r0,r4
  604 + vperm vr8,vr0,vr1,vr16
  605 + lvx vr0,r4,r9
  606 + vperm vr9,vr1,vr0,vr16
  607 + addi r4,r4,32
  608 + stvx vr8,r0,r3
  609 + stvx vr9,r3,r9
  610 + addi r3,r3,32
  611 +
  612 +10: bf cr7*4+3,11f
  613 + lvx vr1,r0,r4
  614 + vperm vr8,vr0,vr1,vr16
  615 + addi r4,r4,16
  616 + stvx vr8,r0,r3
  617 + addi r3,r3,16
  618 +
  619 + /* Up to 15B to go */
  620 +11: clrldi r5,r5,(64-4)
  621 + addi r4,r4,-16 /* Unwind the +16 load offset */
  622 + mtocrf 0x01,r5
  623 + bf cr7*4+0,12f
  624 + lwz r0,0(r4) /* Less chance of a reject with word ops */
  625 + lwz r6,4(r4)
  626 + addi r4,r4,8
  627 + stw r0,0(r3)
  628 + stw r6,4(r3)
  629 + addi r3,r3,8
  630 +
  631 +12: bf cr7*4+1,13f
  632 + lwz r0,0(r4)
  633 + addi r4,r4,4
  634 + stw r0,0(r3)
  635 + addi r3,r3,4
  636 +
  637 +13: bf cr7*4+2,14f
  638 + lhz r0,0(r4)
  639 + addi r4,r4,2
  640 + sth r0,0(r3)
  641 + addi r3,r3,2
  642 +
  643 +14: bf cr7*4+3,15f
  644 + lbz r0,0(r4)
  645 + stb r0,0(r3)
  646 +
  647 +15: addi r1,r1,STACKFRAMESIZE
  648 + ld r3,48(r1)
  649 + b .exit_vmx_copy /* tail call optimise */
  650 +#endif /* CONFIG_ALTIVEC */