Blame view

arch/sh/lib64/memcpy.S 4.27 KB
4466b20cf   Paul Mundt   sh: Add SH-5 opti...
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
  /* Cloned and hacked for uClibc by Paul Mundt, December 2003 */
  /* Modified by SuperH, Inc. September 2003 */
  !
  ! Fast SH memcpy
  !
  ! by Toshiyasu Morita (tm@netcom.com)
  ! hacked by J"orn Rennecke (joern.rennecke@superh.com) ("o for o-umlaut)
  ! SH5 code Copyright 2002 SuperH Ltd.
  !
  ! Entry: ARG0: destination pointer
  !        ARG1: source pointer
  !        ARG2: byte count
  !
  ! Exit:  RESULT: destination pointer
  !        any other registers in the range r0-r7: trashed
  !
  ! Notes: Usually one wants to do small reads and write a longword, but
  !        unfortunately it is difficult in some cases to concatenate bytes
  !        into a longword on the SH, so this does a longword read and small
  !        writes.
  !
  ! This implementation makes two assumptions about how it is called:
  !
  ! 1.: If the byte count is nonzero, the address of the last byte to be
  !     copied is unsigned greater than the address of the first byte to
  !     be copied.  This could be easily swapped for a signed comparison,
  !     but the algorithm used needs some comparison.
  !
  ! 2.: When there are two or three bytes in the last word of an 11-or-more
  !     bytes memory chunk to be copied, the rest of the word can be read
  !     without side effects.
25985edce   Lucas De Marchi   Fix common misspe...
32
  !     This could be easily changed by increasing the minimum size of
4466b20cf   Paul Mundt   sh: Add SH-5 opti...
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
  !     a fast memcpy and the amount subtracted from r7 before L_2l_loop be 2,
  !     however, this would cost a few extra cycles on average.
  !     For SHmedia, the assumption is that any quadword can be read in its
  !     entirety if at least one byte is included in the copy.
  !
  
  /*
   * memcpy (SHmedia implementation)
   *
   * Register roles as used by the code below:
   *   r2  = destination pointer (never written here, so it is still the
   *         destination pointer on return)
   *   r3  = source pointer
   *   r4  = byte count
   *   r18 = return address (moved into tr1 with ptabs before returning)
   *
   * Counts below 25 are dispatched through a table of fixed-size code
   * slots that starts at L1; counts of 25 or more go to Large.
   *
   * NOTE(review): the file header says only r0-r7 are trashed, but this
   * code also writes r8, r9, r19-r25, r27 and r36 (plus tr0/tr1).
   * Confirm these are caller-saved under the SH-5 ABI in use.
   */
  	.section .text..SHmedia32,"ax"
  	.globl	memcpy
  	.type	memcpy, @function
  	.align	5
  
  memcpy:
  
  /*
   * Unaligned access helpers.  Each expands to a lo/hi instruction pair
   * on the first (O) and last (O+7 or O+3) byte of the quadword/longword.
   * For loads the caller must OR D0 and D1 together to get the value;
   * for stores both instructions are normally given the same register.
   */
  #define LDUAQ(P,O,D0,D1) ldlo.q P,O,D0; ldhi.q P,O+7,D1
  #define STUAQ(P,O,D0,D1) stlo.q P,O,D0; sthi.q P,O+7,D1
  #define LDUAL(P,O,D0,D1) ldlo.l P,O,D0; ldhi.l P,O+3,D1
  #define STUAL(P,O,D0,D1) stlo.l P,O,D0; sthi.l P,O+3,D1
  
  	/* Load the first source byte into r63; the value is not used
  	   (presumably r63 is the discard register and this is a touch). */
  	ld.b r3,0,r63
  	/* count >= 25 (unsigned compare): take the Large path. */
  	pta/l Large,tr0
  	movi 25,r0
  	bgeu/u r4,r0,tr0
  	/*
  	 * Small copy dispatch.  r0 = nsb(count) * 32, then
  	 *   r0 = (L1 - L0) + (63 - nsb(count)) * 32 + 1
  	 * and ptrel at L0 turns that into a branch target of
  	 *   L1 + (63 - nsb(count)) * 32.
  	 * Going by the slot comments below, the slots starting at L1 are, in
  	 * order: 0, 1, 2..3, 4..7, 8..15 and 16..24 bytes.
  	 * Presumably the "+ 1" marks the target as SHmedia code - confirm
  	 * against the SH-5 manual.
  	 */
  	nsb r4,r0
  	shlli r0,5,r0
  	movi (L1-L0+63*32 + 1) & 0xffff,r1
  	sub r1, r0, r0
  L0:	ptrel r0,tr0
  	/* r5 = dst + count (one past the last destination byte) */
  	add r2,r4,r5
  	/* tr1 = return address, used by the "blink tr1,r63" returns below */
  	ptabs r18,tr1
  	/* r6 = src + count (one past the last source byte) */
  	add r3,r4,r6
  	/* jump to the selected slot; the link value goes to r63 */
  	blink tr0,r63
  	
  /* Rearranged to make cut2 safe */
  	.balign 8
  	/*
  	 * Second half of the 4..7 byte copy.  On entry r0 holds the first
  	 * source longword, r7/r6 the lo/hi parts of the last source longword.
  	 * The first and last longwords may overlap in the destination.
  	 */
  L4_7:	/* 4..7 byte memcpy cntd. */
  	stlo.l r2, 0, r0
  	or r6, r7, r6
  	sthi.l r5, -1, r6
  	stlo.l r5, -4, r6
  	blink tr1,r63
  
  	/*
  	 * Start of the dispatch table.  Every slot from here to Large must
  	 * stay exactly 32 bytes long (eight instructions, assuming 4-byte
  	 * SHmedia encodings), because the target is computed as a multiple of
  	 * 32 from L1.  The unused tail of a short slot holds either padding
  	 * nops or the continuation code of another size class.
  	 */
  	.balign 8
  L1:	/* 0 byte memcpy */
  	nop
  	blink tr1,r63
  	nop
  	nop
  	nop
  	nop
  
  	/* Fills the rest of the 0-byte slot: stores the last byte (loaded
  	   into r6 by the 2..3 byte slot) at dst + count - 1 and returns. */
  L2_3:	/* 2 or 3 byte memcpy cntd. */
  	st.b r5,-1,r6
  	blink tr1,r63
  
  	/* 1 byte memcpy */
  	ld.b r3,0,r0
  	st.b r2,0,r0
  	blink tr1,r63
  
  	/* Fills the rest of the 1-byte slot.  On entry r0 holds the first
  	   source quadword, r7/r6 the lo/hi parts of the last one. */
  L8_15:	/* 8..15 byte memcpy cntd. */
  	stlo.q r2, 0, r0
  	or r6, r7, r6
  	sthi.q r5, -1, r6
  	stlo.q r5, -8, r6
  	blink tr1,r63
  	
  	/* 2 or 3 byte memcpy */
  	/* Copies bytes 0 and 1, loads the last source byte into r6, then
  	   finishes at L2_3.  For count == 2 the last byte is byte 1 again. */
  	ld.b r3,0,r0
  	/* load of the first destination byte into r63; value unused */
  	ld.b r2,0,r63
  	ld.b r3,1,r1
  	st.b r2,0,r0
  	pta/l L2_3,tr0
  	ld.b r6,-1,r6
  	st.b r2,1,r1
  	blink tr0, r63
  
  	/* 4 .. 7 byte memcpy */
  	/* Unaligned load of the first longword into r0 and of the last
  	   longword into r7 (lo) / r6 (hi); stores are finished at L4_7. */
  	LDUAL (r3, 0, r0, r1)
  	pta L4_7, tr0
  	ldlo.l r6, -4, r7
  	or r0, r1, r0
  	sthi.l r2, 3, r0
  	ldhi.l r6, -1, r6
  	blink tr0, r63
  
  	/* 8 .. 15 byte memcpy */
  	/* Same scheme as the 4..7 byte slot, using quadwords; stores are
  	   finished at L8_15. */
  	LDUAQ (r3, 0, r0, r1)
  	pta L8_15, tr0
  	ldlo.q r6, -8, r7
  	or r0, r1, r0
  	sthi.q r2, 7, r0
  	ldhi.q r6, -1, r6
  	blink tr0, r63
  
  	/* 16 .. 24 byte memcpy */
  	/* Copies source bytes 0..15 as two unaligned quadwords (r0, r8) and
  	   then the last 8 source bytes (r6) to dst + count - 8.  This is the
  	   last slot, so it is not limited to eight instructions. */
  	LDUAQ (r3, 0, r0, r1)
  	LDUAQ (r3, 8, r8, r9)
  	or r0, r1, r0
  	sthi.q r2, 7, r0
  	or r8, r9, r8
  	sthi.q r2, 15, r8
  	ldlo.q r6, -8, r7
  	ldhi.q r6, -1, r6
  	stlo.q r2, 8, r8
  	stlo.q r2, 0, r0
  	or r6, r7, r6
  	sthi.q r5, -1, r6
  	stlo.q r5, -8, r6
  	blink tr1,r63
  
  	/*
  	 * count >= 25.  Register set-up:
  	 *   r7  = src | -8           (= (src & 7) - 8)
  	 *   r22 = dst - r7           (= dst + 8 - (src & 7))
  	 *   r6  = src - dst          so r22 + r6 is the source address that
  	 *                            matches r22, and it is 8-byte aligned
  	 *   r5  = dst + count - 16
  	 *   r27 = 72                 threshold for using Loop_line
  	 * tr1 is reused as the Loop_ua branch target; the return address is
  	 * reloaded from r18 before the final blink.
  	 */
  Large:
  	/* load of the first destination byte into r63; value unused */
  	ld.b r2, 0, r63
  	pta/l  Loop_ua, tr1
  	ori r3, -8, r7
  	sub r2, r7, r22
  	sub r3, r2, r6
  	add r2, r4, r5
  	/* r0 = ldlo.q part of the first source quadword only (no ldhi.q) */
  	ldlo.q r3, 0, r0
  	addi r5, -16, r5
  	movi 64+8, r27 // could subtract r7 from that.
  	/* Store r0 unaligned at dst.  NOTE(review): presumably any bytes not
  	   supplied by the ldlo.q above are rewritten by the stores that
  	   start at r22 - confirm against the ldlo/sthi definitions. */
  	stlo.q r2, 0, r0
  	sthi.q r2, 7, r0
  	/* r0 = aligned source quadword that belongs at destination r22 */
  	ldx.q r22, r6, r0
  	/* 72 > count: skip the 32-byte loop and go straight to Loop_ua */
  	bgtu/l r27, r4, tr1
  
  	/*
  	 * Loop_line set-up:
  	 *   r27 = r5 - 48 (= dst + count - 64), the loop bound
  	 *   r36 = r6 + 64  source offset for the look-ahead load
  	 *   r19, r20, r21 = r6 - 24, r6 - 16, r6 - 8: source offsets for the
  	 *   three quadwords loaded after r22 has been advanced by 32
  	 */
  	addi r5, -48, r27
  	pta/l Loop_line, tr0
  	addi r6, 64, r36
  	addi r6, -24, r19
  	addi r6, -16, r20
  	addi r6, -8, r21
  
  	/*
  	 * Copies 32 bytes per iteration.  r0 always holds the source
  	 * quadword for the current r22; r23-r25 hold the next three.  Loads
  	 * are aligned (ldx.q); stores are unaligned sthi.q/stlo.q pairs.
  	 */
  Loop_line:
  	/* load from source + 64 into r63, value unused (presumably a
  	   prefetch of a later source line) */
  	ldx.q r22, r36, r63
  	/* presumably allocates the destination cache line at r22 + 32
  	   without fetching it - confirm against the SH-5 manual */
  	alloco r22, 32
  	addi r22, 32, r22
  	ldx.q r22, r19, r23
  	sthi.q r22, -25, r0
  	ldx.q r22, r20, r24
  	ldx.q r22, r21, r25
  	stlo.q r22, -32, r0
  	/* r0 = source quadword for the new r22 (next iteration or Loop_ua) */
  	ldx.q r22, r6,  r0
  	sthi.q r22, -17, r23
  	sthi.q r22,  -9, r24
  	sthi.q r22,  -1, r25
  	stlo.q r22, -24, r23
  	stlo.q r22, -16, r24
  	stlo.q r22,  -8, r25
  	/* repeat while r27 >= r22 (unsigned) */
  	bgeu r27, r22, tr0
  
  	/*
  	 * Copies 8 bytes per iteration: store the quadword already in r0 at
  	 * the old r22, then load the next aligned source quadword.
  	 */
  Loop_ua:
  	addi r22, 8, r22
  	sthi.q r22, -1, r0
  	stlo.q r22, -8, r0
  	ldx.q r22, r6, r0
  	/* repeat while r5 > r22 (unsigned); r5 = dst + count - 16 */
  	bgtu/l r5, r22, tr1
  
  	/*
  	 * Tail: store the pending quadword in r0 at r22, then copy the last
  	 * 8 source bytes (unaligned load via r7 = src + count) to
  	 * r5 + 8 .. r5 + 15, i.e. the last 8 destination bytes.
  	 */
  	add r3, r4, r7
  	ldlo.q r7, -8, r1
  	sthi.q r22, 7, r0
  	ldhi.q r7, -1, r7
  	/* tr1 was used for Loop_ua; reload the return address from r18 */
  	ptabs r18,tr1
  	stlo.q r22, 0, r0
  	or r1, r7, r1
  	sthi.q r5, 15, r1
  	stlo.q r5, 8, r1
  	blink tr1, r63
  
  	.size memcpy,.-memcpy