powerpc: Update 64bit __copy_tofrom_user() using CPU_FTR_UNALIGNED_LD_STD
[linux-2.6.git] / arch / powerpc / lib / copyuser_64.S
1 /*
2  * Copyright (C) 2002 Paul Mackerras, IBM Corp.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public License
6  * as published by the Free Software Foundation; either version
7  * 2 of the License, or (at your option) any later version.
8  */
9 #include <asm/processor.h>
10 #include <asm/ppc_asm.h>
11
12         .align  7
/*
 * __copy_tofrom_user: r3 = destination, r4 = source, r5 = byte count.
 * Returns 0 in r3 when every byte was copied; the exception handlers
 * further down return the number of bytes not copied instead.
 * The three arguments are saved at -24/-16/-8(r1) so that those handlers
 * can work out how far the copy got.  Numeric labels mark the loads and
 * stores that are listed in __ex_table.
 */
13 _GLOBAL(__copy_tofrom_user)
14         /* first check for a whole page copy on a page boundary */
15         cmpldi  cr1,r5,16       /* cr1.lt: fewer than 16 bytes -> .Lshort_copy */
16         cmpdi   cr6,r5,4096     /* cr6.eq: length is exactly 4096 */
17         or      r0,r3,r4
18         neg     r6,r3           /* LS 3 bits = # bytes to 8-byte dest bdry */
19         andi.   r0,r0,4095      /* cr0.eq: low 12 bits of both pointers are zero */
20         std     r3,-24(r1)      /* save original destination for the fixups */
21         crand   cr0*4+2,cr0*4+2,cr6*4+2 /* cr0.eq &= cr6.eq */
22         std     r4,-16(r1)      /* save original source */
23         std     r5,-8(r1)       /* save original length */
24         dcbt    0,r4
25         beq     .Lcopy_page_4K
26         andi.   r6,r6,7         /* r6 = bytes needed to 8-byte align the destination */
27         PPC_MTOCRF      0x01,r5 /* low 4 bits of the length -> cr7 (8,4,2,1) */
28         blt     cr1,.Lshort_copy
29 /* Below we want to nop out the bne if we're on a CPU that has the
30  * CPU_FTR_UNALIGNED_LD_STD bit set and the CPU_FTR_CP_USE_DCBTZ bit
31  * cleared.
32  * At the time of writing the only CPU that has this combination of bits
33  * set is Power6.
34  */
35 BEGIN_FTR_SECTION
36         nop
37 FTR_SECTION_ELSE
38         bne     .Ldst_unaligned /* cr0 comes from the andi. on r6 above */
39 ALT_FTR_SECTION_END(CPU_FTR_UNALIGNED_LD_STD | CPU_FTR_CP_USE_DCBTZ, \
40                     CPU_FTR_UNALIGNED_LD_STD)
41 .Ldst_aligned:
42         addi    r3,r3,-16       /* bias dest: the stores below use 8(r3)/16(r3) */
43 BEGIN_FTR_SECTION
44         andi.   r0,r4,7         /* r0 = source offset within its doubleword */
45         bne     .Lsrc_unaligned
46 END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
47         srdi    r7,r5,4         /* r7 = number of 16-byte chunks (loop count) */
48 20:     ld      r9,0(r4)
49         addi    r4,r4,-8
50         mtctr   r7
51         andi.   r5,r5,7         /* r5 = trailing 0-7 bytes; cr0.eq when there are none */
52         bf      cr7*4+0,22f     /* length bit 8 clear: join the loop at 22 */
53         addi    r3,r3,8
54         addi    r4,r4,8
55         mr      r8,r9
56         blt     cr1,72f         /* under 16 bytes: just the one doubleword store */
57 21:     ld      r9,8(r4)
58 70:     std     r8,8(r3)
59 22:     ldu     r8,16(r4)
60 71:     stdu    r9,16(r3)
61         bdnz    21b
62 72:     std     r8,8(r3)
63         beq+    3f              /* cr0 from the andi. above: no trailing bytes */
64         addi    r3,r3,16
65 23:     ld      r9,8(r4)
66 .Ldo_tail:                      /* store 4/2/1 bytes as selected by cr7 bits 1..3 */
67         bf      cr7*4+1,1f
68         rotldi  r9,r9,32        /* bring the next 4 high-order bytes into the low word */
69 73:     stw     r9,0(r3)
70         addi    r3,r3,4
71 1:      bf      cr7*4+2,2f
72         rotldi  r9,r9,16
73 74:     sth     r9,0(r3)
74         addi    r3,r3,2
75 2:      bf      cr7*4+3,3f
76         rotldi  r9,r9,8
77 75:     stb     r9,0(r3)
78 3:      li      r3,0            /* success: nothing left uncopied */
79         blr
80
/*
 * Source is not 8-byte aligned.  Entered from .Ldst_aligned with
 * r0 = source offset within its doubleword (andi. r0,r4,7) and r3 already
 * biased by -16.  The source pointer is rounded down to a doubleword
 * boundary and every output doubleword is built from two neighbouring
 * input doublewords: (earlier << r10) | (later >> r11).
 */
81 .Lsrc_unaligned:
82         srdi    r6,r5,3         /* r6 = length in doublewords */
83         addi    r5,r5,-16
84         subf    r4,r0,r4        /* round source down to a doubleword boundary */
85         srdi    r7,r5,4         /* loop count for ctr */
86         sldi    r10,r0,3        /* r10 = left shift in bits (8 * offset) */
87         cmpldi  cr6,r6,3        /* cr6 selects the short exits (blt/beq cr6) below */
88         andi.   r5,r5,7         /* cr0.eq when no trailing bytes; used by 'bne 6f' */
89         mtctr   r7
90         subfic  r11,r10,64      /* r11 = complementary right shift (64 - r10) */
91         add     r5,r5,r0        /* trailing bytes + source offset, compared with 8 at 6: */
92         bt      cr7*4+0,28f     /* length bit 8 set: take the second entry sequence */
93
94 24:     ld      r9,0(r4)        /* 3+2n loads, 2+2n stores */
95 25:     ld      r0,8(r4)
96         sld     r6,r9,r10
97 26:     ldu     r9,16(r4)
98         srd     r7,r0,r11
99         sld     r8,r0,r10
100         or      r7,r7,r6
101         blt     cr6,79f
102 27:     ld      r0,8(r4)
103         b       2f
104
105 28:     ld      r0,0(r4)        /* 4+2n loads, 3+2n stores */
106 29:     ldu     r9,8(r4)
107         sld     r8,r0,r10
108         addi    r3,r3,-8        /* this entry runs with r3 a further 8 below the other one */
109         blt     cr6,5f
110 30:     ld      r0,8(r4)
111         srd     r12,r9,r11
112         sld     r6,r9,r10
113 31:     ldu     r9,16(r4)
114         or      r12,r8,r12
115         srd     r7,r0,r11
116         sld     r8,r0,r10
117         addi    r3,r3,16
118         beq     cr6,78f
119
120 1:      or      r7,r7,r6
121 32:     ld      r0,8(r4)
122 76:     std     r12,8(r3)
123 2:      srd     r12,r9,r11
124         sld     r6,r9,r10
125 33:     ldu     r9,16(r4)
126         or      r12,r8,r12
127 77:     stdu    r7,16(r3)
128         srd     r7,r0,r11
129         sld     r8,r0,r10
130         bdnz    1b
131
132 78:     std     r12,8(r3)
133         or      r7,r7,r6
134 79:     std     r7,16(r3)
135 5:      srd     r12,r9,r11
136         or      r12,r8,r12
137 80:     std     r12,24(r3)
138         bne     6f              /* cr0 from 'andi. r5,r5,7': trailing bytes remain */
139         li      r3,0            /* success: nothing left uncopied */
140         blr
141 6:      cmpwi   cr1,r5,8
142         addi    r3,r3,32        /* r3 -> first byte after the doublewords stored so far */
143         sld     r9,r9,r10       /* shift the unconsumed source bytes to the top of r9 */
144         ble     cr1,.Ldo_tail   /* tail + offset <= 8: no further load is issued */
145 34:     ld      r0,8(r4)
146         srd     r7,r0,r11
147         or      r9,r7,r9
148         b       .Ldo_tail
149
/*
 * Destination is not 8-byte aligned: copy the 1, 2 and/or 4 bytes selected
 * by the low three bits of r6 so that the destination reaches a doubleword
 * boundary, then continue at .Ldst_aligned with the reduced length.
 */
150 .Ldst_unaligned:
151         PPC_MTOCRF      0x01,r6         /* put #bytes to 8B bdry into cr7 */
152         subf    r5,r6,r5        /* r5 = length left after the alignment bytes */
153         li      r7,0            /* r7 = running offset used by the indexed accesses */
154         cmpldi  cr1,r5,16       /* refresh cr1 for 'blt cr1,72f' in .Ldst_aligned */
155         bf      cr7*4+3,1f
156 35:     lbz     r0,0(r4)
157 81:     stb     r0,0(r3)
158         addi    r7,r7,1
159 1:      bf      cr7*4+2,2f
160 36:     lhzx    r0,r7,r4
161 82:     sthx    r0,r7,r3
162         addi    r7,r7,2
163 2:      bf      cr7*4+1,3f
164 37:     lwzx    r0,r7,r4
165 83:     stwx    r0,r7,r3
166 3:      PPC_MTOCRF      0x01,r5 /* cr7 = low 4 bits of the remaining length */
167         add     r4,r6,r4        /* advance both pointers past the alignment bytes */
168         add     r3,r6,r3
169         b       .Ldst_aligned
170
/*
 * Fewer than 16 bytes (cr1.lt at entry): copy 8, 4, 2 and 1 bytes according
 * to the length bits that the entry code placed in cr7, using word,
 * halfword and byte accesses only.
 */
171 .Lshort_copy:
172         bf      cr7*4+0,1f      /* length bit 8: two word copies */
173 38:     lwz     r0,0(r4)
174 39:     lwz     r9,4(r4)
175         addi    r4,r4,8
176 84:     stw     r0,0(r3)
177 85:     stw     r9,4(r3)
178         addi    r3,r3,8
179 1:      bf      cr7*4+1,2f      /* length bit 4 */
180 40:     lwz     r0,0(r4)
181         addi    r4,r4,4
182 86:     stw     r0,0(r3)
183         addi    r3,r3,4
184 2:      bf      cr7*4+2,3f      /* length bit 2 */
185 41:     lhz     r0,0(r4)
186         addi    r4,r4,2
187 87:     sth     r0,0(r3)
188         addi    r3,r3,2
189 3:      bf      cr7*4+3,4f      /* length bit 1 */
190 42:     lbz     r0,0(r4)
191 88:     stb     r0,0(r3)
192 4:      li      r3,0            /* success: nothing left uncopied */
193         blr
194
195 /*
196  * exception handlers follow
197  * we have to return the number of bytes not copied
198  * for an exception on a load, we set the rest of the destination to 0
199  */
200
/*
 * Load fixups.  Label 1NN is the __ex_table target for the load at label NN.
 * The labels fall through a chain of addi instructions so that each entry
 * point adds the right amount to r3 before reaching 1: below.
 */
201 136:
202 137:
203         add     r3,r3,r7        /* loads 36/37 are indexed by r7 */
204         b       1f
205 130:
206 131:
207         addi    r3,r3,8         /* 130/131: +24 in total after falling through */
208 120:
209 122:
210 124:
211 125:
212 126:
213 127:
214 128:
215 129:
216 133:
217         addi    r3,r3,8         /* this group: +16 in total */
218 121:
219 132:
220         addi    r3,r3,8         /* 121/132: +8 */
221 123:
222 134:
223 135:
224 138:
225 139:
226 140:
227 141:
228 142:
229
230 /*
231  * here we have had a fault on a load and r3 points to the first
232  * unmodified byte of the destination
233  */
234 1:      ld      r6,-24(r1)      /* original destination */
235         ld      r4,-16(r1)      /* original source */
236         ld      r5,-8(r1)       /* original length */
237         subf    r6,r6,r3        /* r6 = bytes already copied */
238         add     r4,r4,r6        /* r4 = matching source position */
239         subf    r5,r6,r5        /* #bytes left to go */
240
241 /*
242  * first see if we can copy any more bytes before hitting another exception
243  */
244         mtctr   r5
245 43:     lbz     r0,0(r4)
246         addi    r4,r4,1
247 89:     stb     r0,0(r3)
248         addi    r3,r3,1
249         bdnz    43b
250         li      r3,0            /* huh? all copied successfully this time? */
251         blr
252
253 /*
254  * here we have trapped again, need to clear ctr bytes starting at r3
255  */
256 143:    mfctr   r5              /* r5 = bytes the retry loop had not yet copied */
257         li      r0,0
258         mr      r4,r3           /* r4 = address to start zeroing from */
259         mr      r3,r5           /* return the number of bytes not copied */
260 1:      andi.   r9,r4,7         /* zero single bytes until r4 is 8-byte aligned */
261         beq     3f
262 90:     stb     r0,0(r4)
263         addic.  r5,r5,-1
264         addi    r4,r4,1
265         bne     1b
266         blr                     /* count ran out before reaching alignment */
267 3:      cmpldi  cr1,r5,8
268         srdi    r9,r5,3         /* r9 = whole doublewords to zero */
269         andi.   r5,r5,7         /* r5 = leftover bytes; cr0.eq when none */
270         blt     cr1,93f
271         mtctr   r9
272 91:     std     r0,0(r4)
273         addi    r4,r4,8
274         bdnz    91b
275 93:     beqlr                   /* cr0 from the andi. above: nothing left to zero */
276         mtctr   r5      
277 92:     stb     r0,0(r4)
278         addi    r4,r4,1
279         bdnz    92b
280         blr
281
282 /*
283  * exception handlers for stores: we just need to work
284  * out how many bytes weren't copied
285  */
286 182:                            /* label 1NN is the fixup for the store at label NN */
287 183:
288         add     r3,r3,r7        /* stores 82/83 are indexed by r7 */
289         b       1f
290 180:                            /* 80: std 24(r3) -> +24 after falling through */
291         addi    r3,r3,8
292 171:                            /* 71/77: stdu 16(r3); 79: std 16(r3) -> +16 */
293 177:
294 179:                            /* must sit here: in the +0 group the result was 16 too large */
295         addi    r3,r3,8
296 170:                            /* 70/72/76/78: std 8(r3) -> +8 */
297 172:
298 176:
299 178:
300         addi    r3,r3,4
301 185:                            /* 85: stw 4(r3) -> +4 */
302         addi    r3,r3,4
303 173:                            /* everything below stores at 0(r3) -> +0 */
304 174:
305 175:
306 181:
307 184:
308 186:
309 187:
310 188:
311 189:    
312 1:                              /* r3 = address of the first byte not stored */
313         ld      r6,-24(r1)      /* original destination */
314         ld      r5,-8(r1)       /* original length */
315         add     r6,r6,r5        /* r6 = one past the end of the destination */
316         subf    r3,r3,r6        /* #bytes not copied */
317 190:                            /* faults while zero-filling: r3 already holds the count */
318 191:
319 192:
320         blr                     /* #bytes not copied in r3 */
321
/*
 * Exception table for the code above: each .llong pair is
 * (label of an instruction that may fault, label of its fixup).
 * Loads NN map to 1NN in the load fixups, stores NN to 1NN in the
 * store fixups.
 */
322         .section __ex_table,"a"
323         .align  3
324         .llong  20b,120b
325         .llong  21b,121b
326         .llong  70b,170b
327         .llong  22b,122b
328         .llong  71b,171b
329         .llong  72b,172b
330         .llong  23b,123b
331         .llong  73b,173b
332         .llong  74b,174b
333         .llong  75b,175b
334         .llong  24b,124b
335         .llong  25b,125b
336         .llong  26b,126b
337         .llong  27b,127b
338         .llong  28b,128b
339         .llong  29b,129b
340         .llong  30b,130b
341         .llong  31b,131b
342         .llong  32b,132b
343         .llong  76b,176b
344         .llong  33b,133b
345         .llong  77b,177b
346         .llong  78b,178b
347         .llong  79b,179b
348         .llong  80b,180b
349         .llong  34b,134b
350         .llong  35b,135b
351         .llong  81b,181b
352         .llong  36b,136b
353         .llong  82b,182b
354         .llong  37b,137b
355         .llong  83b,183b
356         .llong  38b,138b
357         .llong  39b,139b
358         .llong  84b,184b
359         .llong  85b,185b
360         .llong  40b,140b
361         .llong  86b,186b
362         .llong  41b,141b
363         .llong  87b,187b
364         .llong  42b,142b
365         .llong  88b,188b
366         .llong  43b,143b
367         .llong  89b,189b
368         .llong  90b,190b
369         .llong  91b,191b
370         .llong  92b,192b
371         
372         .text
373
374 /*
375  * Routine to copy a whole page of data, optimized for POWER4.
376  * On POWER4 it is more than 50% faster than the simple loop
377  * above (following the .Ldst_aligned label) but it runs slightly
378  * slower on POWER3.
379  */
/*
 * Reached from the entry code when the length is 4096 and the low 12 bits
 * of both pointers are zero.  The main loop works on six streams spaced
 * 128 bytes apart (offsets 0,128,...,640 from r4/r3); the final 256 bytes
 * go through the 32-byte loop at 3:.  Every labelled load and store has
 * the restart code at 100: as its fixup.
 */
380 .Lcopy_page_4K:
381         std     r31,-32(1)      /* save r20-r31 below the three saved arguments */
382         std     r30,-40(1)
383         std     r29,-48(1)
384         std     r28,-56(1)
385         std     r27,-64(1)
386         std     r26,-72(1)
387         std     r25,-80(1)
388         std     r24,-88(1)
389         std     r23,-96(1)
390         std     r22,-104(1)
391         std     r21,-112(1)
392         std     r20,-120(1)
393         li      r5,4096/32 - 1  /* 127; drops by 24 per outer pass, remainder feeds 3: */
394         addi    r3,r3,-8        /* bias dest: stores start at 8(3) */
395         li      r0,5            /* inner loop count */
396 0:      addi    r5,r5,-24
397         mtctr   r0
398 20:     ld      r22,640(4)
399 21:     ld      r21,512(4)
400 22:     ld      r20,384(4)
401 23:     ld      r11,256(4)
402 24:     ld      r9,128(4)
403 25:     ld      r7,0(4)
404 26:     ld      r25,648(4)
405 27:     ld      r24,520(4)
406 28:     ld      r23,392(4)
407 29:     ld      r10,264(4)
408 30:     ld      r8,136(4)
409 31:     ldu     r6,8(4)
410         cmpwi   r5,24           /* cr0 is consumed by 'bge 0b' after the inner loop */
411 1:
412 32:     std     r22,648(3)
413 33:     std     r21,520(3)
414 34:     std     r20,392(3)
415 35:     std     r11,264(3)
416 36:     std     r9,136(3)
417 37:     std     r7,8(3)
418 38:     ld      r28,648(4)
419 39:     ld      r27,520(4)
420 40:     ld      r26,392(4)
421 41:     ld      r31,264(4)
422 42:     ld      r30,136(4)
423 43:     ld      r29,8(4)
424 44:     std     r25,656(3)
425 45:     std     r24,528(3)
426 46:     std     r23,400(3)
427 47:     std     r10,272(3)
428 48:     std     r8,144(3)
429 49:     std     r6,16(3)
430 50:     ld      r22,656(4)
431 51:     ld      r21,528(4)
432 52:     ld      r20,400(4)
433 53:     ld      r11,272(4)
434 54:     ld      r9,144(4)
435 55:     ld      r7,16(4)
436 56:     std     r28,664(3)
437 57:     std     r27,536(3)
438 58:     std     r26,408(3)
439 59:     std     r31,280(3)
440 60:     std     r30,152(3)
441 61:     stdu    r29,24(3)       /* each inner iteration moves 24 bytes per stream */
442 62:     ld      r25,664(4)
443 63:     ld      r24,536(4)
444 64:     ld      r23,408(4)
445 65:     ld      r10,280(4)
446 66:     ld      r8,152(4)
447 67:     ldu     r6,24(4)
448         bdnz    1b
449 68:     std     r22,648(3)
450 69:     std     r21,520(3)
451 70:     std     r20,392(3)
452 71:     std     r11,264(3)
453 72:     std     r9,136(3)
454 73:     std     r7,8(3)
455 74:     addi    r4,r4,640       /* step over the five other 128-byte streams */
456 75:     addi    r3,r3,648
457         bge     0b
458         mtctr   r5              /* remaining count for the 32-byte loop */
459 76:     ld      r7,0(4)
460 77:     ld      r8,8(4)
461 78:     ldu     r9,16(4)
462 3:
463 79:     ld      r10,8(4)
464 80:     std     r7,8(3)
465 81:     ld      r7,16(4)
466 82:     std     r8,16(3)
467 83:     ld      r8,24(4)
468 84:     std     r9,24(3)
469 85:     ldu     r9,32(4)
470 86:     stdu    r10,32(3)
471         bdnz    3b
472 4:
473 87:     ld      r10,8(4)
474 88:     std     r7,8(3)
475 89:     std     r8,16(3)
476 90:     std     r9,24(3)
477 91:     std     r10,32(3)
478 9:      ld      r20,-120(1)     /* restore r20-r31 */
479         ld      r21,-112(1)
480         ld      r22,-104(1)
481         ld      r23,-96(1)
482         ld      r24,-88(1)
483         ld      r25,-80(1)
484         ld      r26,-72(1)
485         ld      r27,-64(1)
486         ld      r28,-56(1)
487         ld      r29,-48(1)
488         ld      r30,-40(1)
489         ld      r31,-32(1)
490         li      r3,0            /* success: nothing left uncopied */
491         blr
492
493 /*
494  * on an exception, reset to the beginning and jump back into the
495  * standard __copy_tofrom_user
496  */
497 100:    ld      r20,-120(1)     /* restore the non-volatile registers first */
498         ld      r21,-112(1)
499         ld      r22,-104(1)
500         ld      r23,-96(1)
501         ld      r24,-88(1)
502         ld      r25,-80(1)
503         ld      r26,-72(1)
504         ld      r27,-64(1)
505         ld      r28,-56(1)
506         ld      r29,-48(1)
507         ld      r30,-40(1)
508         ld      r31,-32(1)
509         ld      r3,-24(r1)      /* original destination */
510         ld      r4,-16(r1)      /* original source */
511         li      r5,4096
/*
 * The page-copy path leaves the entry code before cr7 is loaded from the
 * length, but .Ldst_aligned tests cr7*4+0 to choose its loop entry.  With a
 * stale bit set there, a retry that does not fault would load and store one
 * doubleword past the 4096-byte buffers.  Load cr7 from r5 here (low four
 * bits of 4096 are zero).  cr1 still holds the entry comparison
 * (length >= 16); the page-copy code only writes cr0.
 */
        PPC_MTOCRF      0x01,r5
512         b       .Ldst_aligned
513
/*
 * Exception table for .Lcopy_page_4K: every labelled instruction there
 * (20-91) uses the restart code at 100: as its fixup.  Labels 74 and 75
 * sit on plain addi instructions.
 */
514         .section __ex_table,"a"
515         .align  3
516         .llong  20b,100b
517         .llong  21b,100b
518         .llong  22b,100b
519         .llong  23b,100b
520         .llong  24b,100b
521         .llong  25b,100b
522         .llong  26b,100b
523         .llong  27b,100b
524         .llong  28b,100b
525         .llong  29b,100b
526         .llong  30b,100b
527         .llong  31b,100b
528         .llong  32b,100b
529         .llong  33b,100b
530         .llong  34b,100b
531         .llong  35b,100b
532         .llong  36b,100b
533         .llong  37b,100b
534         .llong  38b,100b
535         .llong  39b,100b
536         .llong  40b,100b
537         .llong  41b,100b
538         .llong  42b,100b
539         .llong  43b,100b
540         .llong  44b,100b
541         .llong  45b,100b
542         .llong  46b,100b
543         .llong  47b,100b
544         .llong  48b,100b
545         .llong  49b,100b
546         .llong  50b,100b
547         .llong  51b,100b
548         .llong  52b,100b
549         .llong  53b,100b
550         .llong  54b,100b
551         .llong  55b,100b
552         .llong  56b,100b
553         .llong  57b,100b
554         .llong  58b,100b
555         .llong  59b,100b
556         .llong  60b,100b
557         .llong  61b,100b
558         .llong  62b,100b
559         .llong  63b,100b
560         .llong  64b,100b
561         .llong  65b,100b
562         .llong  66b,100b
563         .llong  67b,100b
564         .llong  68b,100b
565         .llong  69b,100b
566         .llong  70b,100b
567         .llong  71b,100b
568         .llong  72b,100b
569         .llong  73b,100b
570         .llong  74b,100b
571         .llong  75b,100b
572         .llong  76b,100b
573         .llong  77b,100b
574         .llong  78b,100b
575         .llong  79b,100b
576         .llong  80b,100b
577         .llong  81b,100b
578         .llong  82b,100b
579         .llong  83b,100b
580         .llong  84b,100b
581         .llong  85b,100b
582         .llong  86b,100b
583         .llong  87b,100b
584         .llong  88b,100b
585         .llong  89b,100b
586         .llong  90b,100b
587         .llong  91b,100b
586         .llong  90b,100b
587         .llong  91b,100b