Blame view

arch/sh/lib/memset-sh4.S 1.59 KB
dfc349402   Stuart Menefy   sh: Optimised mem...
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
  /*
   * "memset" implementation for SH4
   *
   * Copyright (C) 1999  Niibe Yutaka
   * Copyright (c) 2009  STMicroelectronics Limited
   * Author: Stuart Menefy <stuart.menefy:st.com>
   */
  
  /*
   *            void *memset(void *s, int c, size_t n);
   */
  
  #include <linux/linkage.h>
  
  /*
   * Fill n bytes starting at s with the low byte of c and return s.
   *
   * Registers as used below:
   *   r4    - s on entry; immediately advanced to s + n and then used as a
   *           pre-decrementing store pointer, so the buffer is filled from
   *           its end back toward its start
   *   r5    - c on entry; widened to the fill byte repeated four times
   *           (VVVV) before any longword store
   *   r6    - n on entry; afterwards the number of bytes still to be set
   *   r0-r3 - scratch; r0 also carries the return value
   *   T bit - modified
   *
   * Strategy:
   *   n < 12          : byte stores only
   *   otherwise       : byte stores until the pointer is 4-byte aligned, then
   *     remaining < 64  : 8 bytes per pass (two longword stores), then bytes
   *     remaining >= 64 : longword stores down to a 32-byte boundary, then
   *                       32 bytes per pass (first store is movca.l), then
   *                       the 8-byte loop and the byte loop for the rest
   *
   * bt/s, bf/s and rts have a delay slot: the instruction that follows them
   * (indented by one extra space) executes whether or not the branch is
   * taken.
   *
   * NOTE(review): the length tests use signed compares, so a length with
   * bit 31 set takes the byte-only path.
   */
  ENTRY(memset)
  	mov	#12,r0
  	add	r6,r4		! r4 = s + n, one past the last byte to set
  	cmp/gt	r6,r0		! T = (12 > n), signed
  	bt/s	40f		! if it's too small, set a byte at once
  	 mov	r4,r0		! (delay slot) r0 = end pointer for the alignment test
  	and	#3,r0		! r0 = bytes above the nearest lower 4-byte boundary
  	cmp/eq	#0,r0
  	bt/s	2f		! It's aligned
  	 sub	r0,r6		! (delay slot) loop 1 sets these r0 bytes; no-op when r0 is 0
  1:
  	dt	r0		! T = (--r0 == 0)
  	bf/s	1b
  	 mov.b	r5,@-r4		! (delay slot) one byte per pass, final pass included
  2:				! make VVVV
  	extu.b	r5,r5		! keep only the low byte of c
  	swap.b	r5,r0		!   V0
  	or	r0,r5		!   VV
  	swap.w	r5,r0		! VV00
  	or	r0,r5		! VVVV
  
  	! Check if enough bytes need to be set to be worth the big loop
  	mov	#0x40, r0	! (MT)
  	cmp/gt	r6,r0		! (MT)  64 > len => slow loop
  
  	bt/s	22f
  	 mov	r6,r0		! (delay slot) r0 = remaining length, consumed at 22
  
  	! align the dst to the cache block size if necessary
  	mov	r4, r3
  	mov	#~(0x1f), r1
  
  	and	r3, r1		! r1 = r4 rounded down to a 32-byte boundary
  	cmp/eq	r3, r1
  
  	bt/s	11f		! dst is already aligned
  	 sub	r1, r3		! r3-r1 -> r3
  	shlr2	r3		! number of loops
  
  10:	mov.l	r5,@-r4		! one longword per pass until r4 reaches the boundary
  	dt	r3
  	bf/s	10b
  	 add	#-4, r6		! (delay slot) account for the 4 bytes just stored
  
  11:	! dst is 32byte aligned
  	mov	r6,r2
  	mov	#-5,r0
  	shld	r0,r2		! number of loops (r6 >> 5; a negative count shifts right)
  
  	add	#-32, r4	! r4 = base of the topmost 32-byte block
  	mov	r5, r0		! movca.l takes its source operand from r0
  12:
  	movca.l	r0,@r4		! first longword of the block, stored with movca.l
  	mov.l	r5,@(4, r4)
  	mov.l	r5,@(8, r4)
  	mov.l	r5,@(12,r4)
  	mov.l	r5,@(16,r4)
  	mov.l	r5,@(20,r4)
  	add	#-0x20, r6	! 32 bytes accounted for (placed between the stores)
  	mov.l	r5,@(24,r4)
  	dt	r2		! T = (--r2 == 0); the mov.l below leaves T alone
  	mov.l	r5,@(28,r4)
  	bf/s	12b
  	 add	#-32, r4	! (delay slot) step down one block, final pass included
  
  	add	#32, r4		! undo the extra step made by the final delay slot
  	mov	#8, r0
  	cmp/ge	r0, r6		! T = (r6 >= 8)
  	bf	40f		! fewer than 8 bytes left: byte loop only
  
  	mov	r6,r0
  22:
  	shlr2	r0
  	shlr	r0		! r0 = r6 >> 3
  3:
  	dt	r0
  	mov.l	r5,@-r4		! set 8-byte at once
  	bf/s	3b
  	 mov.l	r5,@-r4		! (delay slot) second longword of the pair
  	!
  	mov	#7,r0
  	and	r0,r6		! r6 = bytes left over after the 8-byte loop
  
  	! fill bytes (length may be zero)
  40:	tst	r6,r6		! T = (r6 == 0)
  	bt	5f
  4:
  	dt	r6
  	bf/s	4b
  	 mov.b	r5,@-r4		! (delay slot) one byte per pass, final pass included
  5:
  	rts
  	 mov	r4,r0		! (delay slot) r4 has walked back down to s; return it