Commit cf62a8b8134dd37ab365bfe8b972a7ebb1100110

Authored by Markos Chandras
Committed by Ralf Baechle
1 parent bda4d986a6

MIPS: lib: memcpy: Use macro to build the copy_user code

The code can be shared between EVA and non-EVA configurations, so use
a macro to build it and avoid code duplication.

Signed-off-by: Markos Chandras <markos.chandras@imgtec.com>

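The patch leans on two GNU assembler features: the \@ pseudo-variable, which
appends a per-expansion counter so local labels such as .Ldone\@ stay unique
when the macro is instantiated more than once, and conditional assembly
(.if \mode == LEGACY_MODE) so that exception-table entries and prefetches are
only emitted for the legacy expansion. A minimal standalone sketch of how the
two combine (illustration only, with a hypothetical macro name; like the kernel
file, it assumes the .S source is run through the C preprocessor so #define
works; not code from this commit):

#define LEGACY_MODE 1
#define EVA_MODE    2

	.macro	build_copy mode		/* hypothetical name, for illustration */
	.if	\mode == LEGACY_MODE
	/* assembled only when the macro is expanded with LEGACY_MODE,
	 * mirroring how EXC() and _PREF() are gated in the patch */
	.endif
.Lskip\@:	/* \@ expands to a per-invocation counter, keeping the label unique */
	nop
	.endm

	build_copy LEGACY_MODE
	build_copy EVA_MODE

The .ifnotdef __memcpy guard in the patch plays a complementary role: it lets
only the first expansion emit the memcpy-specific END(memcpy) epilogue.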
Showing 1 changed file with 143 additions and 110 deletions

arch/mips/lib/memcpy.S
... ... @@ -92,6 +92,10 @@
92 92 /* Prefetch type */
93 93 #define SRC_PREFETCH 1
94 94 #define DST_PREFETCH 2
  95 +#define LEGACY_MODE 1
  96 +#define EVA_MODE 2
  97 +#define USEROP 1
  98 +#define KERNELOP 2
95 99  
96 100 /*
97 101 * Wrapper to add an entry in the exception table
98 102  
... ... @@ -103,12 +107,14 @@
103 107 * addr : Address
104 108 * handler : Exception handler
105 109 */
106   -#define EXC(insn, type, reg, addr, handler) \
107   -9: insn reg, addr; \
108   - .section __ex_table,"a"; \
109   - PTR 9b, handler; \
110   - .previous
111 110  
  111 +#define EXC(insn, type, reg, addr, handler) \
  112 + .if \mode == LEGACY_MODE; \
  113 +9: insn reg, addr; \
  114 + .section __ex_table,"a"; \
  115 + PTR 9b, handler; \
  116 + .previous; \
  117 + .endif
112 118 /*
113 119 * Only on the 64-bit kernel can we make use of 64-bit registers.
114 120 */
... ... @@ -177,7 +183,10 @@
177 183 #define LOADB(reg, addr, handler) EXC(lb, LD_INSN, reg, addr, handler)
178 184 #define STOREB(reg, addr, handler) EXC(sb, ST_INSN, reg, addr, handler)
179 185  
180   -#define _PREF(hint, addr, type) PREF(hint, addr)
  186 +#define _PREF(hint, addr, type) \
  187 + .if \mode == LEGACY_MODE; \
  188 + PREF(hint, addr); \
  189 + .endif
181 190  
182 191 #define PREFS(hint, addr) _PREF(hint, addr, SRC_PREFETCH)
183 192 #define PREFD(hint, addr) _PREF(hint, addr, DST_PREFETCH)
... ... @@ -210,28 +219,24 @@
210 219 .set at=v1
211 220 #endif
212 221  
213   -/*
214   - * t6 is used as a flag to note inatomic mode.
215   - */
216   -LEAF(__copy_user_inatomic)
217   - b __copy_user_common
218   - li t6, 1
219   - END(__copy_user_inatomic)
220   -
221   -/*
222   - * A combined memcpy/__copy_user
223   - * __copy_user sets len to 0 for success; else to an upper bound of
224   - * the number of uncopied bytes.
225   - * memcpy sets v0 to dst.
226   - */
227 222 .align 5
228   -LEAF(memcpy) /* a0=dst a1=src a2=len */
229   - move v0, dst /* return value */
230   -.L__memcpy:
231   -FEXPORT(__copy_user)
232   - li t6, 0 /* not inatomic */
233   -__copy_user_common:
  223 +
234 224 /*
  225 + * Macro to build the __copy_user common code
  226 + * Arguments:
  227 + * mode : LEGACY_MODE or EVA_MODE
  228 + * from : Source operand. USEROP or KERNELOP
  229 + * to : Destination operand. USEROP or KERNELOP
  230 + */
  231 + .macro __BUILD_COPY_USER mode, from, to
  232 +
  233 + /* initialize __memcpy if this is the first time we execute this macro */
  234 + .ifnotdef __memcpy
  235 + .set __memcpy, 1
  236 + .hidden __memcpy /* make sure it does not leak */
  237 + .endif
  238 +
  239 + /*
235 240 * Note: dst & src may be unaligned, len may be 0
236 241 * Temps
237 242 */
... ... @@ -251,45 +256,45 @@
251 256 and t1, dst, ADDRMASK
252 257 PREFS( 0, 1*32(src) )
253 258 PREFD( 1, 1*32(dst) )
254   - bnez t2, .Lcopy_bytes_checklen
  259 + bnez t2, .Lcopy_bytes_checklen\@
255 260 and t0, src, ADDRMASK
256 261 PREFS( 0, 2*32(src) )
257 262 PREFD( 1, 2*32(dst) )
258   - bnez t1, .Ldst_unaligned
  263 + bnez t1, .Ldst_unaligned\@
259 264 nop
260   - bnez t0, .Lsrc_unaligned_dst_aligned
  265 + bnez t0, .Lsrc_unaligned_dst_aligned\@
261 266 /*
262 267 * use delay slot for fall-through
263 268 * src and dst are aligned; need to compute rem
264 269 */
265   -.Lboth_aligned:
  270 +.Lboth_aligned\@:
266 271 SRL t0, len, LOG_NBYTES+3 # +3 for 8 units/iter
267   - beqz t0, .Lcleanup_both_aligned # len < 8*NBYTES
  272 + beqz t0, .Lcleanup_both_aligned\@ # len < 8*NBYTES
268 273 and rem, len, (8*NBYTES-1) # rem = len % (8*NBYTES)
269 274 PREFS( 0, 3*32(src) )
270 275 PREFD( 1, 3*32(dst) )
271 276 .align 4
272 277 1:
273 278 R10KCBARRIER(0(ra))
274   - LOAD(t0, UNIT(0)(src), .Ll_exc)
275   - LOAD(t1, UNIT(1)(src), .Ll_exc_copy)
276   - LOAD(t2, UNIT(2)(src), .Ll_exc_copy)
277   - LOAD(t3, UNIT(3)(src), .Ll_exc_copy)
  279 + LOAD(t0, UNIT(0)(src), .Ll_exc\@)
  280 + LOAD(t1, UNIT(1)(src), .Ll_exc_copy\@)
  281 + LOAD(t2, UNIT(2)(src), .Ll_exc_copy\@)
  282 + LOAD(t3, UNIT(3)(src), .Ll_exc_copy\@)
278 283 SUB len, len, 8*NBYTES
279   - LOAD(t4, UNIT(4)(src), .Ll_exc_copy)
280   - LOAD(t7, UNIT(5)(src), .Ll_exc_copy)
281   - STORE(t0, UNIT(0)(dst), .Ls_exc_p8u)
282   - STORE(t1, UNIT(1)(dst), .Ls_exc_p7u)
283   - LOAD(t0, UNIT(6)(src), .Ll_exc_copy)
284   - LOAD(t1, UNIT(7)(src), .Ll_exc_copy)
  284 + LOAD(t4, UNIT(4)(src), .Ll_exc_copy\@)
  285 + LOAD(t7, UNIT(5)(src), .Ll_exc_copy\@)
  286 + STORE(t0, UNIT(0)(dst), .Ls_exc_p8u\@)
  287 + STORE(t1, UNIT(1)(dst), .Ls_exc_p7u\@)
  288 + LOAD(t0, UNIT(6)(src), .Ll_exc_copy\@)
  289 + LOAD(t1, UNIT(7)(src), .Ll_exc_copy\@)
285 290 ADD src, src, 8*NBYTES
286 291 ADD dst, dst, 8*NBYTES
287   - STORE(t2, UNIT(-6)(dst), .Ls_exc_p6u)
288   - STORE(t3, UNIT(-5)(dst), .Ls_exc_p5u)
289   - STORE(t4, UNIT(-4)(dst), .Ls_exc_p4u)
290   - STORE(t7, UNIT(-3)(dst), .Ls_exc_p3u)
291   - STORE(t0, UNIT(-2)(dst), .Ls_exc_p2u)
292   - STORE(t1, UNIT(-1)(dst), .Ls_exc_p1u)
  292 + STORE(t2, UNIT(-6)(dst), .Ls_exc_p6u\@)
  293 + STORE(t3, UNIT(-5)(dst), .Ls_exc_p5u\@)
  294 + STORE(t4, UNIT(-4)(dst), .Ls_exc_p4u\@)
  295 + STORE(t7, UNIT(-3)(dst), .Ls_exc_p3u\@)
  296 + STORE(t0, UNIT(-2)(dst), .Ls_exc_p2u\@)
  297 + STORE(t1, UNIT(-1)(dst), .Ls_exc_p1u\@)
293 298 PREFS( 0, 8*32(src) )
294 299 PREFD( 1, 8*32(dst) )
295 300 bne len, rem, 1b
... ... @@ -298,41 +303,41 @@
298 303 /*
299 304 * len == rem == the number of bytes left to copy < 8*NBYTES
300 305 */
301   -.Lcleanup_both_aligned:
302   - beqz len, .Ldone
  306 +.Lcleanup_both_aligned\@:
  307 + beqz len, .Ldone\@
303 308 sltu t0, len, 4*NBYTES
304   - bnez t0, .Lless_than_4units
  309 + bnez t0, .Lless_than_4units\@
305 310 and rem, len, (NBYTES-1) # rem = len % NBYTES
306 311 /*
307 312 * len >= 4*NBYTES
308 313 */
309   - LOAD( t0, UNIT(0)(src), .Ll_exc)
310   - LOAD( t1, UNIT(1)(src), .Ll_exc_copy)
311   - LOAD( t2, UNIT(2)(src), .Ll_exc_copy)
312   - LOAD( t3, UNIT(3)(src), .Ll_exc_copy)
  314 + LOAD( t0, UNIT(0)(src), .Ll_exc\@)
  315 + LOAD( t1, UNIT(1)(src), .Ll_exc_copy\@)
  316 + LOAD( t2, UNIT(2)(src), .Ll_exc_copy\@)
  317 + LOAD( t3, UNIT(3)(src), .Ll_exc_copy\@)
313 318 SUB len, len, 4*NBYTES
314 319 ADD src, src, 4*NBYTES
315 320 R10KCBARRIER(0(ra))
316   - STORE(t0, UNIT(0)(dst), .Ls_exc_p4u)
317   - STORE(t1, UNIT(1)(dst), .Ls_exc_p3u)
318   - STORE(t2, UNIT(2)(dst), .Ls_exc_p2u)
319   - STORE(t3, UNIT(3)(dst), .Ls_exc_p1u)
  321 + STORE(t0, UNIT(0)(dst), .Ls_exc_p4u\@)
  322 + STORE(t1, UNIT(1)(dst), .Ls_exc_p3u\@)
  323 + STORE(t2, UNIT(2)(dst), .Ls_exc_p2u\@)
  324 + STORE(t3, UNIT(3)(dst), .Ls_exc_p1u\@)
320 325 .set reorder /* DADDI_WAR */
321 326 ADD dst, dst, 4*NBYTES
322   - beqz len, .Ldone
  327 + beqz len, .Ldone\@
323 328 .set noreorder
324   -.Lless_than_4units:
  329 +.Lless_than_4units\@:
325 330 /*
326 331 * rem = len % NBYTES
327 332 */
328   - beq rem, len, .Lcopy_bytes
  333 + beq rem, len, .Lcopy_bytes\@
329 334 nop
330 335 1:
331 336 R10KCBARRIER(0(ra))
332   - LOAD(t0, 0(src), .Ll_exc)
  337 + LOAD(t0, 0(src), .Ll_exc\@)
333 338 ADD src, src, NBYTES
334 339 SUB len, len, NBYTES
335   - STORE(t0, 0(dst), .Ls_exc_p1u)
  340 + STORE(t0, 0(dst), .Ls_exc_p1u\@)
336 341 .set reorder /* DADDI_WAR */
337 342 ADD dst, dst, NBYTES
338 343 bne rem, len, 1b
... ... @@ -350,17 +355,17 @@
350 355 * more instruction-level parallelism.
351 356 */
352 357 #define bits t2
353   - beqz len, .Ldone
  358 + beqz len, .Ldone\@
354 359 ADD t1, dst, len # t1 is just past last byte of dst
355 360 li bits, 8*NBYTES
356 361 SLL rem, len, 3 # rem = number of bits to keep
357   - LOAD(t0, 0(src), .Ll_exc)
  362 + LOAD(t0, 0(src), .Ll_exc\@)
358 363 SUB bits, bits, rem # bits = number of bits to discard
359 364 SHIFT_DISCARD t0, t0, bits
360   - STREST(t0, -1(t1), .Ls_exc)
  365 + STREST(t0, -1(t1), .Ls_exc\@)
361 366 jr ra
362 367 move len, zero
363   -.Ldst_unaligned:
  368 +.Ldst_unaligned\@:
364 369 /*
365 370 * dst is unaligned
366 371 * t0 = src & ADDRMASK
... ... @@ -371,23 +376,23 @@
371 376 * Set match = (src and dst have same alignment)
372 377 */
373 378 #define match rem
374   - LDFIRST(t3, FIRST(0)(src), .Ll_exc)
  379 + LDFIRST(t3, FIRST(0)(src), .Ll_exc\@)
375 380 ADD t2, zero, NBYTES
376   - LDREST(t3, REST(0)(src), .Ll_exc_copy)
  381 + LDREST(t3, REST(0)(src), .Ll_exc_copy\@)
377 382 SUB t2, t2, t1 # t2 = number of bytes copied
378 383 xor match, t0, t1
379 384 R10KCBARRIER(0(ra))
380   - STFIRST(t3, FIRST(0)(dst), .Ls_exc)
381   - beq len, t2, .Ldone
  385 + STFIRST(t3, FIRST(0)(dst), .Ls_exc\@)
  386 + beq len, t2, .Ldone\@
382 387 SUB len, len, t2
383 388 ADD dst, dst, t2
384   - beqz match, .Lboth_aligned
  389 + beqz match, .Lboth_aligned\@
385 390 ADD src, src, t2
386 391  
387   -.Lsrc_unaligned_dst_aligned:
  392 +.Lsrc_unaligned_dst_aligned\@:
388 393 SRL t0, len, LOG_NBYTES+2 # +2 for 4 units/iter
389 394 PREFS( 0, 3*32(src) )
390   - beqz t0, .Lcleanup_src_unaligned
  395 + beqz t0, .Lcleanup_src_unaligned\@
391 396 and rem, len, (4*NBYTES-1) # rem = len % 4*NBYTES
392 397 PREFD( 1, 3*32(dst) )
393 398 1:
... ... @@ -398,58 +403,58 @@
398 403 * are to the same unit (unless src is aligned, but it's not).
399 404 */
400 405 R10KCBARRIER(0(ra))
401   - LDFIRST(t0, FIRST(0)(src), .Ll_exc)
402   - LDFIRST(t1, FIRST(1)(src), .Ll_exc_copy)
  406 + LDFIRST(t0, FIRST(0)(src), .Ll_exc\@)
  407 + LDFIRST(t1, FIRST(1)(src), .Ll_exc_copy\@)
403 408 SUB len, len, 4*NBYTES
404   - LDREST(t0, REST(0)(src), .Ll_exc_copy)
405   - LDREST(t1, REST(1)(src), .Ll_exc_copy)
406   - LDFIRST(t2, FIRST(2)(src), .Ll_exc_copy)
407   - LDFIRST(t3, FIRST(3)(src), .Ll_exc_copy)
408   - LDREST(t2, REST(2)(src), .Ll_exc_copy)
409   - LDREST(t3, REST(3)(src), .Ll_exc_copy)
  409 + LDREST(t0, REST(0)(src), .Ll_exc_copy\@)
  410 + LDREST(t1, REST(1)(src), .Ll_exc_copy\@)
  411 + LDFIRST(t2, FIRST(2)(src), .Ll_exc_copy\@)
  412 + LDFIRST(t3, FIRST(3)(src), .Ll_exc_copy\@)
  413 + LDREST(t2, REST(2)(src), .Ll_exc_copy\@)
  414 + LDREST(t3, REST(3)(src), .Ll_exc_copy\@)
410 415 PREFS( 0, 9*32(src) ) # 0 is PREF_LOAD (not streamed)
411 416 ADD src, src, 4*NBYTES
412 417 #ifdef CONFIG_CPU_SB1
413 418 nop # improves slotting
414 419 #endif
415   - STORE(t0, UNIT(0)(dst), .Ls_exc_p4u)
416   - STORE(t1, UNIT(1)(dst), .Ls_exc_p3u)
417   - STORE(t2, UNIT(2)(dst), .Ls_exc_p2u)
418   - STORE(t3, UNIT(3)(dst), .Ls_exc_p1u)
  420 + STORE(t0, UNIT(0)(dst), .Ls_exc_p4u\@)
  421 + STORE(t1, UNIT(1)(dst), .Ls_exc_p3u\@)
  422 + STORE(t2, UNIT(2)(dst), .Ls_exc_p2u\@)
  423 + STORE(t3, UNIT(3)(dst), .Ls_exc_p1u\@)
419 424 PREFD( 1, 9*32(dst) ) # 1 is PREF_STORE (not streamed)
420 425 .set reorder /* DADDI_WAR */
421 426 ADD dst, dst, 4*NBYTES
422 427 bne len, rem, 1b
423 428 .set noreorder
424 429  
425   -.Lcleanup_src_unaligned:
426   - beqz len, .Ldone
  430 +.Lcleanup_src_unaligned\@:
  431 + beqz len, .Ldone\@
427 432 and rem, len, NBYTES-1 # rem = len % NBYTES
428   - beq rem, len, .Lcopy_bytes
  433 + beq rem, len, .Lcopy_bytes\@
429 434 nop
430 435 1:
431 436 R10KCBARRIER(0(ra))
432   - LDFIRST(t0, FIRST(0)(src), .Ll_exc)
433   - LDREST(t0, REST(0)(src), .Ll_exc_copy)
  437 + LDFIRST(t0, FIRST(0)(src), .Ll_exc\@)
  438 + LDREST(t0, REST(0)(src), .Ll_exc_copy\@)
434 439 ADD src, src, NBYTES
435 440 SUB len, len, NBYTES
436   - STORE(t0, 0(dst), .Ls_exc_p1u)
  441 + STORE(t0, 0(dst), .Ls_exc_p1u\@)
437 442 .set reorder /* DADDI_WAR */
438 443 ADD dst, dst, NBYTES
439 444 bne len, rem, 1b
440 445 .set noreorder
441 446  
442   -.Lcopy_bytes_checklen:
443   - beqz len, .Ldone
  447 +.Lcopy_bytes_checklen\@:
  448 + beqz len, .Ldone\@
444 449 nop
445   -.Lcopy_bytes:
  450 +.Lcopy_bytes\@:
446 451 /* 0 < len < NBYTES */
447 452 R10KCBARRIER(0(ra))
448 453 #define COPY_BYTE(N) \
449   - LOADB(t0, N(src), .Ll_exc); \
  454 + LOADB(t0, N(src), .Ll_exc\@); \
450 455 SUB len, len, 1; \
451   - beqz len, .Ldone; \
452   - STOREB(t0, N(dst), .Ls_exc_p1)
  456 + beqz len, .Ldone\@; \
  457 + STOREB(t0, N(dst), .Ls_exc_p1\@)
453 458  
454 459 COPY_BYTE(0)
455 460 COPY_BYTE(1)
... ... @@ -459,16 +464,19 @@
459 464 COPY_BYTE(4)
460 465 COPY_BYTE(5)
461 466 #endif
462   - LOADB(t0, NBYTES-2(src), .Ll_exc)
  467 + LOADB(t0, NBYTES-2(src), .Ll_exc\@)
463 468 SUB len, len, 1
464 469 jr ra
465   - STOREB(t0, NBYTES-2(dst), .Ls_exc_p1)
466   -.Ldone:
  470 + STOREB(t0, NBYTES-2(dst), .Ls_exc_p1\@)
  471 +.Ldone\@:
467 472 jr ra
468   - nop
  473 + .if __memcpy == 1
469 474 END(memcpy)
  475 + .set __memcpy, 0
  476 + .hidden __memcpy
  477 + .endif
470 478  
471   -.Ll_exc_copy:
  479 +.Ll_exc_copy\@:
472 480 /*
473 481 * Copy bytes from src until faulting load address (or until a
474 482 * lb faults)
... ... @@ -483,20 +491,20 @@
483 491 nop
484 492 LOADK t0, THREAD_BUADDR(t0)
485 493 1:
486   - LOADB(t1, 0(src), .Ll_exc)
  494 + LOADB(t1, 0(src), .Ll_exc\@)
487 495 ADD src, src, 1
488 496 sb t1, 0(dst) # can't fault -- we're copy_from_user
489 497 .set reorder /* DADDI_WAR */
490 498 ADD dst, dst, 1
491 499 bne src, t0, 1b
492 500 .set noreorder
493   -.Ll_exc:
  501 +.Ll_exc\@:
494 502 LOADK t0, TI_TASK($28)
495 503 nop
496 504 LOADK t0, THREAD_BUADDR(t0) # t0 is just past last good address
497 505 nop
498 506 SUB len, AT, t0 # len number of uncopied bytes
499   - bnez t6, .Ldone /* Skip the zeroing part if inatomic */
  507 + bnez t6, .Ldone\@ /* Skip the zeroing part if inatomic */
500 508 /*
501 509 * Here's where we rely on src and dst being incremented in tandem,
502 510 * See (3) above.
... ... @@ -510,7 +518,7 @@
510 518 */
511 519 .set reorder /* DADDI_WAR */
512 520 SUB src, len, 1
513   - beqz len, .Ldone
  521 + beqz len, .Ldone\@
514 522 .set noreorder
515 523 1: sb zero, 0(dst)
516 524 ADD dst, dst, 1
... ... @@ -531,7 +539,7 @@
531 539  
532 540 #define SEXC(n) \
533 541 .set reorder; /* DADDI_WAR */ \
534   -.Ls_exc_p ## n ## u: \
  542 +.Ls_exc_p ## n ## u\@: \
535 543 ADD len, len, n*NBYTES; \
536 544 jr ra; \
537 545 .set noreorder
... ... @@ -545,14 +553,15 @@
545 553 SEXC(2)
546 554 SEXC(1)
547 555  
548   -.Ls_exc_p1:
  556 +.Ls_exc_p1\@:
549 557 .set reorder /* DADDI_WAR */
550 558 ADD len, len, 1
551 559 jr ra
552 560 .set noreorder
553   -.Ls_exc:
  561 +.Ls_exc\@:
554 562 jr ra
555 563 nop
  564 + .endm
556 565  
557 566 .align 5
558 567 LEAF(memmove)
... ... @@ -603,4 +612,28 @@
603 612 jr ra
604 613 move a2, zero
605 614 END(__rmemcpy)
  615 +
  616 +/*
  617 + * t6 is used as a flag to note inatomic mode.
  618 + */
  619 +LEAF(__copy_user_inatomic)
  620 + b __copy_user_common
  621 + li t6, 1
  622 + END(__copy_user_inatomic)
  623 +
  624 +/*
  625 + * A combined memcpy/__copy_user
  626 + * __copy_user sets len to 0 for success; else to an upper bound of
  627 + * the number of uncopied bytes.
  628 + * memcpy sets v0 to dst.
  629 + */
  630 + .align 5
  631 +LEAF(memcpy) /* a0=dst a1=src a2=len */
  632 + move v0, dst /* return value */
  633 +.L__memcpy:
  634 +FEXPORT(__copy_user)
  635 + li t6, 0 /* not inatomic */
  636 +__copy_user_common:
  637 + /* Legacy Mode, user <-> user */
  638 + __BUILD_COPY_USER LEGACY_MODE USEROP USEROP
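
This commit only instantiates the legacy variant above. An EVA configuration
would presumably expand the same macro again with mixed operand types, along
the lines of the following hypothetical invocation (not part of this diff):

	/* e.g. a copy from user space into a kernel buffer on an EVA kernel */
	__BUILD_COPY_USER EVA_MODE USEROP KERNELOP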