Commit f860c90bd6ce22c6a0a352cc16acc74fba3d628e

Authored by Atsushi Nemoto
Committed by Ralf Baechle
1 parent 61e84f9987

[MIPS] csum_partial and copy in parallel

Implement optimized asm versions of csum_partial_copy_nocheck,
csum_partial_copy_from_user and csum_and_copy_to_user which can
calculate the checksum and copy in parallel, based on memcpy.S.

Signed-off-by: Atsushi Nemoto <anemo@mba.ocn.ne.jp>
Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
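
For illustration only, here is a minimal C sketch of the idea behind the
patch: accumulate the 16-bit ones' complement partial sum while copying, in
a single pass over the source. The helper name and shape are made up for
this note; the routines actually added below are hand-scheduled MIPS
assembly that work a word or doubleword at a time and handle unaligned
buffers and faulting user accesses.

    #include <stddef.h>
    #include <stdint.h>

    /* Hypothetical sketch: copy len bytes and fold them into a 32-bit
     * partial checksum in the same pass (big-endian byte pairing).
     * The real code does this NBYTES at a time with ADDC(). */
    static uint32_t copy_and_csum_sketch(void *dst, const void *src,
                                         size_t len, uint32_t sum)
    {
        const uint8_t *s = src;
        uint8_t *d = dst;
        uint64_t acc = sum;
        int odd = 0;

        while (len--) {
            uint8_t byte = *s++;

            *d++ = byte;
            /* even offsets form the high byte of each 16-bit unit */
            acc += odd ? byte : (uint32_t)byte << 8;
            odd ^= 1;
        }
        /* fold any carries back into 32 bits */
        while (acc >> 32)
            acc = (acc & 0xffffffffu) + (acc >> 32);
        return (uint32_t)acc;
    }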

Showing 5 changed files with 464 additions and 65 deletions

arch/mips/kernel/mips_ksyms.c
... ... @@ -46,6 +46,8 @@
46 46 EXPORT_SYMBOL(__strnlen_user_asm);
47 47  
48 48 EXPORT_SYMBOL(csum_partial);
  49 +EXPORT_SYMBOL(csum_partial_copy_nocheck);
  50 +EXPORT_SYMBOL(__csum_partial_copy_user);
49 51  
50 52 EXPORT_SYMBOL(invalid_pte_table);
arch/mips/lib/Makefile
... ... @@ -2,7 +2,7 @@
2 2 # Makefile for MIPS-specific library files..
3 3 #
4 4  
5   -lib-y += csum_partial.o csum_partial_copy.o memcpy.o promlib.o \
  5 +lib-y += csum_partial.o memcpy.o promlib.o \
6 6 strlen_user.o strncpy_user.o strnlen_user.o uncached.o
7 7  
8 8 obj-y += iomap.o
arch/mips/lib/csum_partial.S
... ... @@ -8,7 +8,9 @@
8 8 * Copyright (C) 1998, 1999 Ralf Baechle
9 9 * Copyright (C) 1999 Silicon Graphics, Inc.
10 10 */
  11 +#include <linux/errno.h>
11 12 #include <asm/asm.h>
  13 +#include <asm/asm-offsets.h>
12 14 #include <asm/regdef.h>
13 15  
14 16 #ifdef CONFIG_64BIT
... ... @@ -271,4 +273,444 @@
271 273 jr ra
272 274 .set noreorder
273 275 END(csum_partial)
  276 +
  277 +
  278 +/*
  279 + * checksum and copy routines based on memcpy.S
  280 + *
  281 + * csum_partial_copy_nocheck(src, dst, len, sum)
  282 + * __csum_partial_copy_user(src, dst, len, sum, errp)
  283 + *
  284 + * See "Spec" in memcpy.S for details. Unlike __copy_user, all
  285 + * functions in this file use the standard calling convention.
  286 + */
  287 +
  288 +#define src a0
  289 +#define dst a1
  290 +#define len a2
  291 +#define psum a3
  292 +#define sum v0
  293 +#define odd t8
  294 +#define errptr t9
  295 +
  296 +/*
  297 + * The exception handler for loads requires that:
  298 + * 1- AT contain the address of the byte just past the end of the source
  299 + * of the copy,
  300 + * 2- src_entry <= src < AT, and
  301 + * 3- (dst - src) == (dst_entry - src_entry),
  302 + * The _entry suffix denotes values when __copy_user was called.
  303 + *
  304 + * (1) is set up by __csum_partial_copy_from_user and maintained by
  305 + * not writing AT in __csum_partial_copy
  306 + * (2) is met by incrementing src by the number of bytes copied
  307 + * (3) is met by not doing loads between a pair of increments of dst and src
  308 + *
  309 + * The exception handlers for stores store -EFAULT to errptr and return.
  310 + * These handlers do not need to overwrite any data.
  311 + */
  312 +
  313 +#define EXC(inst_reg,addr,handler) \
  314 +9: inst_reg, addr; \
  315 + .section __ex_table,"a"; \
  316 + PTR 9b, handler; \
  317 + .previous
  318 +
  319 +#ifdef USE_DOUBLE
  320 +
  321 +#define LOAD ld
  322 +#define LOADL ldl
  323 +#define LOADR ldr
  324 +#define STOREL sdl
  325 +#define STORER sdr
  326 +#define STORE sd
  327 +#define ADD daddu
  328 +#define SUB dsubu
  329 +#define SRL dsrl
  330 +#define SLL dsll
  331 +#define SLLV dsllv
  332 +#define SRLV dsrlv
  333 +#define NBYTES 8
  334 +#define LOG_NBYTES 3
  335 +
  336 +#else
  337 +
  338 +#define LOAD lw
  339 +#define LOADL lwl
  340 +#define LOADR lwr
  341 +#define STOREL swl
  342 +#define STORER swr
  343 +#define STORE sw
  344 +#define ADD addu
  345 +#define SUB subu
  346 +#define SRL srl
  347 +#define SLL sll
  348 +#define SLLV sllv
  349 +#define SRLV srlv
  350 +#define NBYTES 4
  351 +#define LOG_NBYTES 2
  352 +
  353 +#endif /* USE_DOUBLE */
  354 +
  355 +#ifdef CONFIG_CPU_LITTLE_ENDIAN
  356 +#define LDFIRST LOADR
  357 +#define LDREST LOADL
  358 +#define STFIRST STORER
  359 +#define STREST STOREL
  360 +#define SHIFT_DISCARD SLLV
  361 +#define SHIFT_DISCARD_REVERT SRLV
  362 +#else
  363 +#define LDFIRST LOADL
  364 +#define LDREST LOADR
  365 +#define STFIRST STOREL
  366 +#define STREST STORER
  367 +#define SHIFT_DISCARD SRLV
  368 +#define SHIFT_DISCARD_REVERT SLLV
  369 +#endif
  370 +
  371 +#define FIRST(unit) ((unit)*NBYTES)
  372 +#define REST(unit) (FIRST(unit)+NBYTES-1)
  373 +
  374 +#define ADDRMASK (NBYTES-1)
  375 +
  376 + .set noat
  377 +
  378 +LEAF(__csum_partial_copy_user)
  379 + PTR_ADDU AT, src, len /* See (1) above. */
  380 +#ifdef CONFIG_64BIT
  381 + move errptr, a4
  382 +#else
  383 + lw errptr, 16(sp)
  384 +#endif
  385 +FEXPORT(csum_partial_copy_nocheck)
  386 + move sum, zero
  387 + move odd, zero
  388 + /*
  389 + * Note: dst & src may be unaligned, len may be 0
  390 + * Temps
  391 + */
  392 + /*
  393 + * The "issue break"s below are very approximate.
  394 + * Issue delays for dcache fills will perturb the schedule, as will
  395 + * load queue full replay traps, etc.
  396 + *
  397 + * If len < NBYTES use byte operations.
  398 + */
  399 + sltu t2, len, NBYTES
  400 + and t1, dst, ADDRMASK
  401 + bnez t2, copy_bytes_checklen
  402 + and t0, src, ADDRMASK
  403 + andi odd, dst, 0x1 /* odd buffer? */
  404 + bnez t1, dst_unaligned
  405 + nop
  406 + bnez t0, src_unaligned_dst_aligned
  407 + /*
  408 + * use delay slot for fall-through
  409 + * src and dst are aligned; need to compute rem
  410 + */
  411 +both_aligned:
  412 + SRL t0, len, LOG_NBYTES+3 # +3 for 8 units/iter
  413 + beqz t0, cleanup_both_aligned # len < 8*NBYTES
  414 + nop
  415 + SUB len, 8*NBYTES # subtract here for bgez loop
  416 + .align 4
  417 +1:
  418 +EXC( LOAD t0, UNIT(0)(src), l_exc)
  419 +EXC( LOAD t1, UNIT(1)(src), l_exc_copy)
  420 +EXC( LOAD t2, UNIT(2)(src), l_exc_copy)
  421 +EXC( LOAD t3, UNIT(3)(src), l_exc_copy)
  422 +EXC( LOAD t4, UNIT(4)(src), l_exc_copy)
  423 +EXC( LOAD t5, UNIT(5)(src), l_exc_copy)
  424 +EXC( LOAD t6, UNIT(6)(src), l_exc_copy)
  425 +EXC( LOAD t7, UNIT(7)(src), l_exc_copy)
  426 + SUB len, len, 8*NBYTES
  427 + ADD src, src, 8*NBYTES
  428 +EXC( STORE t0, UNIT(0)(dst), s_exc)
  429 + ADDC(sum, t0)
  430 +EXC( STORE t1, UNIT(1)(dst), s_exc)
  431 + ADDC(sum, t1)
  432 +EXC( STORE t2, UNIT(2)(dst), s_exc)
  433 + ADDC(sum, t2)
  434 +EXC( STORE t3, UNIT(3)(dst), s_exc)
  435 + ADDC(sum, t3)
  436 +EXC( STORE t4, UNIT(4)(dst), s_exc)
  437 + ADDC(sum, t4)
  438 +EXC( STORE t5, UNIT(5)(dst), s_exc)
  439 + ADDC(sum, t5)
  440 +EXC( STORE t6, UNIT(6)(dst), s_exc)
  441 + ADDC(sum, t6)
  442 +EXC( STORE t7, UNIT(7)(dst), s_exc)
  443 + ADDC(sum, t7)
  444 + bgez len, 1b
  445 + ADD dst, dst, 8*NBYTES
  446 + ADD len, 8*NBYTES # revert len (see above)
  447 +
  448 + /*
  449 + * len == the number of bytes left to copy < 8*NBYTES
  450 + */
  451 +cleanup_both_aligned:
  452 +#define rem t7
  453 + beqz len, done
  454 + sltu t0, len, 4*NBYTES
  455 + bnez t0, less_than_4units
  456 + and rem, len, (NBYTES-1) # rem = len % NBYTES
  457 + /*
  458 + * len >= 4*NBYTES
  459 + */
  460 +EXC( LOAD t0, UNIT(0)(src), l_exc)
  461 +EXC( LOAD t1, UNIT(1)(src), l_exc_copy)
  462 +EXC( LOAD t2, UNIT(2)(src), l_exc_copy)
  463 +EXC( LOAD t3, UNIT(3)(src), l_exc_copy)
  464 + SUB len, len, 4*NBYTES
  465 + ADD src, src, 4*NBYTES
  466 +EXC( STORE t0, UNIT(0)(dst), s_exc)
  467 + ADDC(sum, t0)
  468 +EXC( STORE t1, UNIT(1)(dst), s_exc)
  469 + ADDC(sum, t1)
  470 +EXC( STORE t2, UNIT(2)(dst), s_exc)
  471 + ADDC(sum, t2)
  472 +EXC( STORE t3, UNIT(3)(dst), s_exc)
  473 + ADDC(sum, t3)
  474 + beqz len, done
  475 + ADD dst, dst, 4*NBYTES
  476 +less_than_4units:
  477 + /*
  478 + * rem = len % NBYTES
  479 + */
  480 + beq rem, len, copy_bytes
  481 + nop
  482 +1:
  483 +EXC( LOAD t0, 0(src), l_exc)
  484 + ADD src, src, NBYTES
  485 + SUB len, len, NBYTES
  486 +EXC( STORE t0, 0(dst), s_exc)
  487 + ADDC(sum, t0)
  488 + bne rem, len, 1b
  489 + ADD dst, dst, NBYTES
  490 +
  491 + /*
  492 + * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
  493 + * A loop would do only a byte at a time with possible branch
  494 + * mispredicts. Can't do an explicit LOAD dst,mask,or,STORE
  495 + * because can't assume read-access to dst. Instead, use
  496 + * STREST dst, which doesn't require read access to dst.
  497 + *
  498 + * This code should perform better than a simple loop on modern,
  499 + * wide-issue mips processors because the code has fewer branches and
  500 + * more instruction-level parallelism.
  501 + */
  502 +#define bits t2
  503 + beqz len, done
  504 + ADD t1, dst, len # t1 is just past last byte of dst
  505 + li bits, 8*NBYTES
  506 + SLL rem, len, 3 # rem = number of bits to keep
  507 +EXC( LOAD t0, 0(src), l_exc)
  508 + SUB bits, bits, rem # bits = number of bits to discard
  509 + SHIFT_DISCARD t0, t0, bits
  510 +EXC( STREST t0, -1(t1), s_exc)
  511 + SHIFT_DISCARD_REVERT t0, t0, bits
  512 + .set reorder
  513 + ADDC(sum, t0)
  514 + b done
  515 + .set noreorder
  516 +dst_unaligned:
  517 + /*
  518 + * dst is unaligned
  519 + * t0 = src & ADDRMASK
  520 + * t1 = dst & ADDRMASK; t1 > 0
  521 + * len >= NBYTES
  522 + *
  523 + * Copy enough bytes to align dst
  524 + * Set match = (src and dst have same alignment)
  525 + */
  526 +#define match rem
  527 +EXC( LDFIRST t3, FIRST(0)(src), l_exc)
  528 + ADD t2, zero, NBYTES
  529 +EXC( LDREST t3, REST(0)(src), l_exc_copy)
  530 + SUB t2, t2, t1 # t2 = number of bytes copied
  531 + xor match, t0, t1
  532 +EXC( STFIRST t3, FIRST(0)(dst), s_exc)
  533 + SLL t4, t1, 3 # t4 = number of bits to discard
  534 + SHIFT_DISCARD t3, t3, t4
  535 + /* no SHIFT_DISCARD_REVERT to handle odd buffer properly */
  536 + ADDC(sum, t3)
  537 + beq len, t2, done
  538 + SUB len, len, t2
  539 + ADD dst, dst, t2
  540 + beqz match, both_aligned
  541 + ADD src, src, t2
  542 +
  543 +src_unaligned_dst_aligned:
  544 + SRL t0, len, LOG_NBYTES+2 # +2 for 4 units/iter
  545 + beqz t0, cleanup_src_unaligned
  546 + and rem, len, (4*NBYTES-1) # rem = len % 4*NBYTES
  547 +1:
  548 +/*
  549 + * Avoid consecutive LD*'s to the same register since some mips
  550 + * implementations can't issue them in the same cycle.
  551 + * It's OK to load FIRST(N+1) before REST(N) because the two addresses
  552 + * are to the same unit (unless src is aligned, but it's not).
  553 + */
  554 +EXC( LDFIRST t0, FIRST(0)(src), l_exc)
  555 +EXC( LDFIRST t1, FIRST(1)(src), l_exc_copy)
  556 + SUB len, len, 4*NBYTES
  557 +EXC( LDREST t0, REST(0)(src), l_exc_copy)
  558 +EXC( LDREST t1, REST(1)(src), l_exc_copy)
  559 +EXC( LDFIRST t2, FIRST(2)(src), l_exc_copy)
  560 +EXC( LDFIRST t3, FIRST(3)(src), l_exc_copy)
  561 +EXC( LDREST t2, REST(2)(src), l_exc_copy)
  562 +EXC( LDREST t3, REST(3)(src), l_exc_copy)
  563 + ADD src, src, 4*NBYTES
  564 +#ifdef CONFIG_CPU_SB1
  565 + nop # improves slotting
  566 +#endif
  567 +EXC( STORE t0, UNIT(0)(dst), s_exc)
  568 + ADDC(sum, t0)
  569 +EXC( STORE t1, UNIT(1)(dst), s_exc)
  570 + ADDC(sum, t1)
  571 +EXC( STORE t2, UNIT(2)(dst), s_exc)
  572 + ADDC(sum, t2)
  573 +EXC( STORE t3, UNIT(3)(dst), s_exc)
  574 + ADDC(sum, t3)
  575 + bne len, rem, 1b
  576 + ADD dst, dst, 4*NBYTES
  577 +
  578 +cleanup_src_unaligned:
  579 + beqz len, done
  580 + and rem, len, NBYTES-1 # rem = len % NBYTES
  581 + beq rem, len, copy_bytes
  582 + nop
  583 +1:
  584 +EXC( LDFIRST t0, FIRST(0)(src), l_exc)
  585 +EXC( LDREST t0, REST(0)(src), l_exc_copy)
  586 + ADD src, src, NBYTES
  587 + SUB len, len, NBYTES
  588 +EXC( STORE t0, 0(dst), s_exc)
  589 + ADDC(sum, t0)
  590 + bne len, rem, 1b
  591 + ADD dst, dst, NBYTES
  592 +
  593 +copy_bytes_checklen:
  594 + beqz len, done
  595 + nop
  596 +copy_bytes:
  597 + /* 0 < len < NBYTES */
  598 +#ifdef CONFIG_CPU_LITTLE_ENDIAN
  599 +#define SHIFT_START 0
  600 +#define SHIFT_INC 8
  601 +#else
  602 +#define SHIFT_START 8*(NBYTES-1)
  603 +#define SHIFT_INC -8
  604 +#endif
  605 + move t2, zero # partial word
  606 + li t3, SHIFT_START # shift
  607 +/* use l_exc_copy here to return correct sum on fault */
  608 +#define COPY_BYTE(N) \
  609 +EXC( lbu t0, N(src), l_exc_copy); \
  610 + SUB len, len, 1; \
  611 +EXC( sb t0, N(dst), s_exc); \
  612 + SLLV t0, t0, t3; \
  613 + addu t3, SHIFT_INC; \
  614 + beqz len, copy_bytes_done; \
  615 + or t2, t0
  616 +
  617 + COPY_BYTE(0)
  618 + COPY_BYTE(1)
  619 +#ifdef USE_DOUBLE
  620 + COPY_BYTE(2)
  621 + COPY_BYTE(3)
  622 + COPY_BYTE(4)
  623 + COPY_BYTE(5)
  624 +#endif
  625 +EXC( lbu t0, NBYTES-2(src), l_exc_copy)
  626 + SUB len, len, 1
  627 +EXC( sb t0, NBYTES-2(dst), s_exc)
  628 + SLLV t0, t0, t3
  629 + or t2, t0
  630 +copy_bytes_done:
  631 + ADDC(sum, t2)
  632 +done:
  633 + /* fold checksum */
  634 +#ifdef USE_DOUBLE
  635 + dsll32 v1, sum, 0
  636 + daddu sum, v1
  637 + sltu v1, sum, v1
  638 + dsra32 sum, sum, 0
  639 + addu sum, v1
  640 +#endif
  641 + sll v1, sum, 16
  642 + addu sum, v1
  643 + sltu v1, sum, v1
  644 + srl sum, sum, 16
  645 + addu sum, v1
  646 +
  647 + /* odd buffer alignment? */
  648 + beqz odd, 1f
  649 + nop
  650 + sll v1, sum, 8
  651 + srl sum, sum, 8
  652 + or sum, v1
  653 + andi sum, 0xffff
  654 +1:
  655 + .set reorder
  656 + ADDC(sum, psum)
  657 + jr ra
  658 + .set noreorder
  659 +
  660 +l_exc_copy:
  661 + /*
  662 + * Copy bytes from src until faulting load address (or until a
  663 + * lb faults)
  664 + *
  665 + * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
  666 + * may be more than a byte beyond the last address.
  667 + * Hence, the lb below may get an exception.
  668 + *
  669 + * Assumes src < THREAD_BUADDR($28)
  670 + */
  671 + LOAD t0, TI_TASK($28)
  672 + li t2, SHIFT_START
  673 + LOAD t0, THREAD_BUADDR(t0)
  674 +1:
  675 +EXC( lbu t1, 0(src), l_exc)
  676 + ADD src, src, 1
  677 + sb t1, 0(dst) # can't fault -- we're copy_from_user
  678 + SLLV t1, t1, t2
  679 + addu t2, SHIFT_INC
  680 + ADDC(sum, t1)
  681 + bne src, t0, 1b
  682 + ADD dst, dst, 1
  683 +l_exc:
  684 + LOAD t0, TI_TASK($28)
  685 + nop
  686 + LOAD t0, THREAD_BUADDR(t0) # t0 is just past last good address
  687 + nop
  688 + SUB len, AT, t0 # len = number of uncopied bytes
  689 + /*
  690 + * Here's where we rely on src and dst being incremented in tandem,
  691 + * See (3) above.
  692 + * dst += (fault addr - src) to put dst at first byte to clear
  693 + */
  694 + ADD dst, t0 # compute start address in a1
  695 + SUB dst, src
  696 + /*
  697 + * Clear len bytes starting at dst. Can't call __bzero because it
  698 + * might modify len. An inefficient loop for these rare times...
  699 + */
  700 + beqz len, done
  701 + SUB src, len, 1
  702 +1: sb zero, 0(dst)
  703 + ADD dst, dst, 1
  704 + bnez src, 1b
  705 + SUB src, src, 1
  706 + li v1, -EFAULT
  707 + b done
  708 + sw v1, (errptr)
  709 +
  710 +s_exc:
  711 + li v0, -1 /* invalid checksum */
  712 + li v1, -EFAULT
  713 + jr ra
  714 + sw v1, (errptr)
  715 + END(__csum_partial_copy_user)
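
As a reading aid for the done: epilogue above, here is a rough C model of
the final fold and the odd-destination fixup, assuming the usual partial
checksum conventions. It is an illustration only, with a made-up name, not
kernel code.

    #include <stdint.h>

    /* Rough model of the 'done:' epilogue: fold the accumulated sum to
     * 16 bits with end-around carry, byte-swap it if the destination
     * buffer started on an odd address, then add the caller's partial
     * sum (psum) with end-around carry, as ADDC() does. */
    static uint32_t fold_and_fixup_sketch(uint64_t sum, int odd, uint32_t psum)
    {
        uint32_t s, v;

        /* 64 -> 32 fold (the USE_DOUBLE dsll32/daddu/sltu/dsra32 part) */
        s = (uint32_t)sum;
        v = (uint32_t)(sum >> 32);
        s += v;
        if (s < v)                      /* carried out of 32 bits */
            s++;

        /* 32 -> 16 fold (sll/addu/sltu/srl/addu) */
        v = s << 16;
        s += v;
        v = (s < v);
        s = (s >> 16) + v;

        /* odd destination alignment: swap the two result bytes */
        if (odd)
            s = ((s << 8) | (s >> 8)) & 0xffff;

        /* ADDC(sum, psum): add the incoming partial sum with carry */
        s += psum;
        if (s < psum)
            s++;
        return s;
    }

Because the ones' complement sum is commutative and the byte swap depends
only on the destination's starting parity (andi odd, dst, 0x1), the copy
loops above are free to checksum the data in whatever order the copy finds
convenient.
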
arch/mips/lib/csum_partial_copy.c
1   -/*
2   - * This file is subject to the terms and conditions of the GNU General Public
3   - * License. See the file "COPYING" in the main directory of this archive
4   - * for more details.
5   - *
6   - * Copyright (C) 1994, 1995 Waldorf Electronics GmbH
7   - * Copyright (C) 1998, 1999 Ralf Baechle
8   - */
9   -#include <linux/kernel.h>
10   -#include <linux/module.h>
11   -#include <linux/types.h>
12   -#include <asm/byteorder.h>
13   -#include <asm/string.h>
14   -#include <asm/uaccess.h>
15   -#include <net/checksum.h>
16   -
17   -/*
18   - * copy while checksumming, otherwise like csum_partial
19   - */
20   -__wsum csum_partial_copy_nocheck(const void *src,
21   - void *dst, int len, __wsum sum)
22   -{
23   - /*
24   - * It's 2:30 am and I don't feel like doing it real ...
25   - * This is lots slower than the real thing (tm)
26   - */
27   - sum = csum_partial(src, len, sum);
28   - memcpy(dst, src, len);
29   -
30   - return sum;
31   -}
32   -
33   -EXPORT_SYMBOL(csum_partial_copy_nocheck);
34   -
35   -/*
36   - * Copy from userspace and compute checksum. If we catch an exception
37   - * then zero the rest of the buffer.
38   - */
39   -__wsum csum_partial_copy_from_user (const void __user *src,
40   - void *dst, int len, __wsum sum, int *err_ptr)
41   -{
42   - int missing;
43   -
44   - might_sleep();
45   - missing = copy_from_user(dst, src, len);
46   - if (missing) {
47   - memset(dst + len - missing, 0, missing);
48   - *err_ptr = -EFAULT;
49   - }
50   -
51   - return csum_partial(dst, len, sum);
52   -}
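
The l_exc path in the new assembly keeps the user-visible behaviour of the
removed routine above: when a load from user space faults, the uncopied
tail of the destination is zeroed and -EFAULT is reported through the
error pointer. A hedged C model of that recovery step (illustration only,
made-up name; the assembly uses a byte loop because it cannot call
__bzero):

    #include <errno.h>
    #include <stddef.h>
    #include <string.h>

    /* Illustration of the l_exc recovery: 'src' and 'dst' are the current
     * pointers at fault time (incremented in tandem), 'fault_addr' is the
     * first unreadable source byte (THREAD_BUADDR) and 'src_end' is the
     * value saved in AT on entry, i.e. original src + len. */
    static void csum_copy_fault_sketch(char *dst, const char *src,
                                       const char *fault_addr,
                                       const char *src_end, int *err_ptr)
    {
        size_t uncopied = src_end - fault_addr;      /* SUB len, AT, t0 */
        char *clear_from = dst + (fault_addr - src); /* ADD dst, t0; SUB dst, src */

        if (uncopied) {                              /* beqz len, done */
            memset(clear_from, 0, uncopied);
            *err_ptr = -EFAULT;
        }
    }
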
include/asm-mips/checksum.h
... ... @@ -29,31 +29,38 @@
29 29 */
30 30 __wsum csum_partial(const void *buff, int len, __wsum sum);
31 31  
  32 +__wsum __csum_partial_copy_user(const void *src, void *dst,
  33 + int len, __wsum sum, int *err_ptr);
  34 +
32 35 /*
33 36 * this is a new version of the above that records errors it finds in *errp,
34 37 * but continues and zeros the rest of the buffer.
35 38 */
36   -__wsum csum_partial_copy_from_user(const void __user *src,
37   - void *dst, int len,
38   - __wsum sum, int *errp);
  39 +static inline
  40 +__wsum csum_partial_copy_from_user(const void __user *src, void *dst, int len,
  41 + __wsum sum, int *err_ptr)
  42 +{
  43 + might_sleep();
  44 + return __csum_partial_copy_user((__force void *)src, dst,
  45 + len, sum, err_ptr);
  46 +}
39 47  
40 48 /*
41 49 * Copy and checksum to user
42 50 */
43 51 #define HAVE_CSUM_COPY_USER
44   -static inline __wsum csum_and_copy_to_user (const void *src, void __user *dst,
45   - int len, __wsum sum,
46   - int *err_ptr)
  52 +static inline
  53 +__wsum csum_and_copy_to_user(const void *src, void __user *dst, int len,
  54 + __wsum sum, int *err_ptr)
47 55 {
48 56 might_sleep();
49   - sum = csum_partial(src, len, sum);
50   -
51   - if (copy_to_user(dst, src, len)) {
  57 + if (access_ok(VERIFY_WRITE, dst, len))
  58 + return __csum_partial_copy_user(src, (__force void *)dst,
  59 + len, sum, err_ptr);
  60 + if (len)
52 61 *err_ptr = -EFAULT;
53   - return (__force __wsum)-1;
54   - }
55 62  
56   - return sum;
  63 + return (__force __wsum)-1; /* invalid checksum */
57 64 }
58 65  
59 66 /*
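
Finally, a short hypothetical caller (not part of this commit) showing how
the wrappers above are used: the checksum comes back as the return value
and any fault is reported through err_ptr.

    #include <net/checksum.h>
    #include <asm/uaccess.h>

    /* Hypothetical usage sketch, not from this commit. */
    static __wsum copy_from_user_and_csum(void *kbuf, const void __user *ubuf,
                                          int len)
    {
        int err = 0;
        __wsum csum;

        csum = csum_partial_copy_from_user(ubuf, kbuf, len, 0, &err);
        if (err)        /* -EFAULT: the uncopied tail of kbuf was zeroed */
            return 0;   /* caller-defined error handling */
        return csum;
    }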