[PATCH] ppc64: support 64k pages
[linux-2.6.git] / arch / powerpc / lib / copyuser_64.S
1 /*
2  * arch/powerpc/lib/copyuser_64.S
3  *
4  * Copyright (C) 2002 Paul Mackerras, IBM Corp.
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License
8  * as published by the Free Software Foundation; either version
9  * 2 of the License, or (at your option) any later version.
10  */
11 #include <asm/processor.h>
12 #include <asm/ppc_asm.h>
13
/*
 * __copy_tofrom_user: copy r5 bytes from the source at r4 to the
 * destination at r3.
 *
 * Returns 0 in r3 when everything was copied, otherwise the number of
 * bytes not copied (worked out by the exception handlers further down).
 * The incoming r3/r4/r5 are stored at -24/-16/-8(r1) so that those
 * handlers can reload them.
 *
 * This entry sequence only classifies the request and dispatches:
 *   - length == 4096 and both pointers 4096-byte aligned -> .Lcopy_page_4K
 *   - length < 16                                        -> .Lshort_copy
 *   - destination not 8-byte aligned                     -> .Ldst_unaligned
 *   - otherwise fall through into .Ldst_aligned
 */
14         .align  7
15 _GLOBAL(__copy_tofrom_user)
16         /* first check for a whole page copy on a page boundary */
17         cmpldi  cr1,r5,16       /* cr1 = length compared with 16 (unsigned) */
18         cmpdi   cr6,r5,4096     /* cr6.eq set when length == 4096 */
19         or      r0,r3,r4
20         neg     r6,r3           /* LS 3 bits = # bytes to 8-byte dest bdry */
21         andi.   r0,r0,4095      /* cr0.eq set when dest and source are both 4096-byte aligned */
22         std     r3,-24(r1)      /* save original dest for the exception handlers */
23         crand   cr0*4+2,cr0*4+2,cr6*4+2 /* cr0.eq = cr0.eq AND cr6.eq */
24         std     r4,-16(r1)      /* save original source */
25         std     r5,-8(r1)       /* save original length */
26         dcbt    0,r4            /* cache touch hint for the start of the source */
27         beq     .Lcopy_page_4K  /* aligned 4096-byte request: use the page routine */
28         andi.   r6,r6,7         /* r6 = bytes needed to 8-byte align dest; cr0.eq if none */
29         mtcrf   0x01,r5         /* cr7 = low 4 bits of the length */
30         blt     cr1,.Lshort_copy        /* fewer than 16 bytes */
31         bne     .Ldst_unaligned /* dest is not 8-byte aligned */
/*
 * Destination is 8-byte aligned.  This code expects cr7 to hold the low
 * 4 bits of the remaining length in r5 and cr1 to hold the comparison of
 * that length with 16.
 *
 * If the source is 8-byte aligned as well, doublewords are copied in a
 * loop that moves 16 bytes per iteration (ctr = length / 16); cr7 bit 0
 * (the 8s bit of the length) says whether one extra doubleword has to be
 * copied.  Any final 1-7 bytes are stored by .Ldo_tail.
 *
 * r3 is biased by -16 so that the stores can use 8(r3) and a
 * store-with-update at 16(r3).
 */
32 .Ldst_aligned:
33         andi.   r0,r4,7         /* r0 = source offset within its doubleword */
34         addi    r3,r3,-16
35         bne     .Lsrc_unaligned /* source not 8-byte aligned */
36         srdi    r7,r5,4         /* r7 = number of 16-byte chunks */
37 20:     ld      r9,0(r4)
38         addi    r4,r4,-8
39         mtctr   r7
40         andi.   r5,r5,7         /* r5 = trailing bytes (< 8); cr0.eq if none */
41         bf      cr7*4+0,22f     /* 8s bit clear: enter the loop at its second half */
42         addi    r3,r3,8
43         addi    r4,r4,8
44         mr      r8,r9
45         blt     cr1,72f         /* fewer than 16 bytes: only this doubleword to store */
46 21:     ld      r9,8(r4)
47 70:     std     r8,8(r3)
48 22:     ldu     r8,16(r4)
49 71:     stdu    r9,16(r3)
50         bdnz    21b
51 72:     std     r8,8(r3)
52         beq+    3f              /* no trailing bytes (cr0 from the andi. above) */
53         addi    r3,r3,16        /* r3 = address of the first trailing byte */
54 23:     ld      r9,8(r4)        /* doubleword containing the trailing bytes */
/*
 * Store the last 4, 2 and/or 1 bytes as selected by cr7 bits 1-3.
 * r9 holds the bytes starting at its most-significant end; each rotldi
 * brings the next group into the low-order position for the store.
 * r3 points at the destination of the next byte.
 */
55 .Ldo_tail:
56         bf      cr7*4+1,1f
57         rotldi  r9,r9,32
58 73:     stw     r9,0(r3)
59         addi    r3,r3,4
60 1:      bf      cr7*4+2,2f
61         rotldi  r9,r9,16
62 74:     sth     r9,0(r3)
63         addi    r3,r3,2
64 2:      bf      cr7*4+3,3f
65         rotldi  r9,r9,8
66 75:     stb     r9,0(r3)
67 3:      li      r3,0            /* everything copied */
68         blr
69
/*
 * Destination is 8-byte aligned but the source is not.  On entry r0 is
 * the (non-zero) source offset within its doubleword and r3 has already
 * been biased by -16 in .Ldst_aligned.
 *
 * The source pointer is rounded down to a doubleword boundary and every
 * destination doubleword is assembled from two neighbouring source
 * doublewords: the earlier one shifted left by r10 bits OR'ed with the
 * later one shifted right by r11 = 64 - r10 bits.
 *
 * cr7 bit 0 (the 8s bit of the length) selects one of two entry
 * sequences (24: or 28:) into the shared loop at 1:/2:, which produces
 * two doublewords per iteration.  cr6 (doubleword count compared with 3)
 * is used to skip the loop for short lengths.
 */
70 .Lsrc_unaligned:
71         srdi    r6,r5,3         /* r6 = length in whole doublewords */
72         addi    r5,r5,-16
73         subf    r4,r0,r4        /* round source down to a doubleword boundary */
74         srdi    r7,r5,4         /* r7 = (length - 16) / 16 = loop count */
75         sldi    r10,r0,3        /* r10 = left-shift amount in bits */
76         cmpldi  cr6,r6,3
77         andi.   r5,r5,7         /* r5 = trailing bytes; cr0.eq if none (tested at "bne 6f") */
78         mtctr   r7
79         subfic  r11,r10,64      /* r11 = right-shift amount in bits */
80         add     r5,r5,r0        /* trailing bytes + source offset, compared with 8 at 6: */
81         bt      cr7*4+0,28f
82
83 24:     ld      r9,0(r4)        /* 3+2n loads, 2+2n stores */
84 25:     ld      r0,8(r4)
85         sld     r6,r9,r10
86 26:     ldu     r9,16(r4)
87         srd     r7,r0,r11
88         sld     r8,r0,r10
89         or      r7,r7,r6
90         blt     cr6,79f
91 27:     ld      r0,8(r4)
92         b       2f
93
94 28:     ld      r0,0(r4)        /* 4+2n loads, 3+2n stores */
95 29:     ldu     r9,8(r4)
96         sld     r8,r0,r10
97         addi    r3,r3,-8
98         blt     cr6,5f
99 30:     ld      r0,8(r4)
100         srd     r12,r9,r11
101         sld     r6,r9,r10
102 31:     ldu     r9,16(r4)
103         or      r12,r8,r12
104         srd     r7,r0,r11
105         sld     r8,r0,r10
106         addi    r3,r3,16
107         beq     cr6,78f
108
        /* main loop: two loads and two stores per iteration */
109 1:      or      r7,r7,r6
110 32:     ld      r0,8(r4)
111 76:     std     r12,8(r3)
112 2:      srd     r12,r9,r11
113         sld     r6,r9,r10
114 33:     ldu     r9,16(r4)
115         or      r12,r8,r12
116 77:     stdu    r7,16(r3)
117         srd     r7,r0,r11
118         sld     r8,r0,r10
119         bdnz    1b
120
        /* drain the doublewords still held in registers */
121 78:     std     r12,8(r3)
122         or      r7,r7,r6
123 79:     std     r7,16(r3)
124 5:      srd     r12,r9,r11
125         or      r12,r8,r12
126 80:     std     r12,24(r3)
127         bne     6f              /* trailing bytes remain */
128         li      r3,0
129         blr
130 6:      cmpwi   cr1,r5,8
131         addi    r3,r3,32        /* r3 = address of the first trailing byte */
132         sld     r9,r9,r10       /* left-justify the source bytes already loaded */
133         ble     cr1,.Ldo_tail   /* r9 already contains every trailing byte */
134 34:     ld      r0,8(r4)        /* otherwise one more source doubleword is needed */
135         srd     r7,r0,r11
136         or      r9,r7,r9
137         b       .Ldo_tail
138
/*
 * Destination is not 8-byte aligned.  On entry r6 holds the number of
 * bytes (1-7) needed to reach the next 8-byte boundary.  Copy 1, 2 and/or
 * 4 bytes as selected by the bits of r6 (via cr7), using r7 as a running
 * offset from r3/r4.  Then advance both pointers by r6, reload cr7 from
 * the reduced length and continue at .Ldst_aligned.
 *
 * cr1 is recomputed for the reduced length because .Ldst_aligned uses it
 * to detect "fewer than 16 bytes left".  The operand is written as cr1
 * (a condition-register field), matching every other compare in this
 * file, rather than as the GPR name r1.
 */
139 .Ldst_unaligned:
140         mtcrf   0x01,r6         /* put #bytes to 8B bdry into cr7 */
141         subf    r5,r6,r5        /* r5 = bytes left once dest is aligned */
142         li      r7,0            /* r7 = offset of the next byte to copy */
143         cmpldi  cr1,r5,16       /* cr1 = remaining length compared with 16 */
144         bf      cr7*4+3,1f
145 35:     lbz     r0,0(r4)
146 81:     stb     r0,0(r3)
147         addi    r7,r7,1
148 1:      bf      cr7*4+2,2f
149 36:     lhzx    r0,r7,r4
150 82:     sthx    r0,r7,r3
151         addi    r7,r7,2
152 2:      bf      cr7*4+1,3f
153 37:     lwzx    r0,r7,r4
154 83:     stwx    r0,r7,r3
155 3:      mtcrf   0x01,r5         /* cr7 = low 4 bits of the remaining length */
156         add     r4,r6,r4
157         add     r3,r6,r3
158         b       .Ldst_aligned
159
/*
 * Copy of fewer than 16 bytes.  cr7 holds the low 4 bits of the length
 * (loaded by the entry code), so bits 0-3 select an 8-, 4-, 2- and
 * 1-byte piece respectively.  The 8-byte piece is moved as two words.
 * Returns 0 in r3.
 */
160 .Lshort_copy:
161         bf      cr7*4+0,1f      /* no 8-byte piece */
162 38:     lwz     r0,0(r4)
163 39:     lwz     r9,4(r4)
164         addi    r4,r4,8
165 84:     stw     r0,0(r3)
166 85:     stw     r9,4(r3)
167         addi    r3,r3,8
168 1:      bf      cr7*4+1,2f      /* no 4-byte piece */
169 40:     lwz     r0,0(r4)
170         addi    r4,r4,4
171 86:     stw     r0,0(r3)
172         addi    r3,r3,4
173 2:      bf      cr7*4+2,3f      /* no 2-byte piece */
174 41:     lhz     r0,0(r4)
175         addi    r4,r4,2
176 87:     sth     r0,0(r3)
177         addi    r3,r3,2
178 3:      bf      cr7*4+3,4f      /* no final byte */
179 42:     lbz     r0,0(r4)
180 88:     stb     r0,0(r3)
181 4:      li      r3,0
182         blr
183
184 /*
185  * exception handlers follow
186  * we have to return the number of bytes not copied
187  * for an exception on a load, we set the rest of the destination to 0
188  */
/*
 * Load fixups.  Label 1NN is the fixup that the exception table below
 * registers for the load at label NN.  Each entry point falls through a
 * chain of additions so that r3 ends up pointing at the first
 * destination byte that has not been written yet:
 *
 *   136, 137                 r3 += r7 (offset used in .Ldst_unaligned)
 *   130, 131                 r3 += 24
 *   120, 122, 124-129, 133   r3 += 16
 *   121, 132                 r3 += 8
 *   123, 134, 135, 138-142   r3 unchanged
 */
189
190 136:
191 137:
192         add     r3,r3,r7
193         b       1f
194 130:
195 131:
196         addi    r3,r3,8
197 120:
198 122:
199 124:
200 125:
201 126:
202 127:
203 128:
204 129:
205 133:
206         addi    r3,r3,8
207 121:
208 132:
209         addi    r3,r3,8
210 123:
211 134:
212 135:
213 138:
214 139:
215 140:
216 141:
217 142:
218
219 /*
220  * here we have had a fault on a load and r3 points to the first
221  * unmodified byte of the destination
222  */
223 1:      ld      r6,-24(r1)      /* original dest */
224         ld      r4,-16(r1)      /* original source */
225         ld      r5,-8(r1)       /* original length */
226         subf    r6,r6,r3        /* r6 = bytes already copied */
227         add     r4,r4,r6        /* r4 = source address of the next byte */
228         subf    r5,r6,r5        /* #bytes left to go */
229
230 /*
231  * first see if we can copy any more bytes before hitting another exception
232  */
        /*
         * Byte-at-a-time retry.  Per the exception table, a fault on the
         * load at 43: continues at 143 and a fault on the store at 89:
         * continues at 189 in the store fixups.
         */
233         mtctr   r5
234 43:     lbz     r0,0(r4)
235         addi    r4,r4,1
236 89:     stb     r0,0(r3)
237         addi    r3,r3,1
238         bdnz    43b
239         li      r3,0            /* huh? all copied successfully this time? */
240         blr
241
242 /*
243  * here we have trapped again, need to clear ctr bytes starting at r3
244  */
        /*
         * r3 becomes the return value (bytes not copied), r4 the address
         * being cleared and r5 the number of bytes still to clear.
         * Bytes are cleared one at a time until r4 is 8-byte aligned, then
         * by doublewords, then the last 0-7 bytes one at a time.  A fault
         * on any of the clearing stores (90, 91, 92) returns immediately
         * through 190-192 with r3 unchanged.
         */
245 143:    mfctr   r5
246         li      r0,0
247         mr      r4,r3
248         mr      r3,r5           /* return the number of bytes not copied */
249 1:      andi.   r9,r4,7
250         beq     3f              /* r4 is 8-byte aligned */
251 90:     stb     r0,0(r4)
252         addic.  r5,r5,-1
253         addi    r4,r4,1
254         bne     1b
255         blr                     /* ran out of bytes before reaching alignment */
256 3:      cmpldi  cr1,r5,8
257         srdi    r9,r5,3         /* r9 = whole doublewords to clear */
258         andi.   r5,r5,7         /* r5 = bytes left after them; cr0.eq if none */
259         blt     cr1,93f         /* fewer than 8 bytes: skip the doubleword loop */
260         mtctr   r9
261 91:     std     r0,0(r4)
262         addi    r4,r4,8
263         bdnz    91b
264 93:     beqlr                   /* nothing left to clear */
265         mtctr   r5      
266 92:     stb     r0,0(r4)
267         addi    r4,r4,1
268         bdnz    92b
269         blr
270
271 /*
272  * exception handlers for stores: we just need to work
273  * out how many bytes weren't copied
274  */
/*
 * Store fixups.  Label 1NN is the fixup that the exception table
 * registers for the store at label NN.  Each entry point falls through a
 * chain of additions so that r3 ends up pointing at the first
 * destination byte that was not written, i.e. the address the faulting
 * store was targeting:
 *
 *   182, 183             r3 += r7   (sthx/stwx at offset r7)
 *   180                  r3 += 24   (std ...,24(r3))
 *   171, 177, 179        r3 += 16   (stdu/std ...,16(r3))
 *   170, 172, 176, 178   r3 += 8    (std ...,8(r3))
 *   185                  r3 += 4    (stw ...,4(r3))
 *   173-175, 181, 184, 186-189      r3 unchanged (stores at 0(r3))
 *
 * 179 belongs in the +16 group: the store at 79: is "std r7,16(r3)" and
 * everything below r3+16 has already been written when it executes.
 * Leaving it in the "unchanged" group made the result 16 too large, which
 * can exceed the length that was requested.
 *
 * The result is (original dest + original length) - r3.
 * 190-192 are the fixups for the clearing stores in the load-fault path;
 * r3 already holds the return value there.
 */
275 182:
276 183:
277         add     r3,r3,r7
278         b       1f
279 180:
280         addi    r3,r3,8
281 171:
282 177:
179:
283         addi    r3,r3,8
284 170:
285 172:
286 176:
287 178:
288         addi    r3,r3,4
289 185:
290         addi    r3,r3,4
291 173:
292 174:
293 175:
295 181:
296 184:
297 186:
298 187:
299 188:
300 189:    
301 1:
302         ld      r6,-24(r1)      /* original dest */
303         ld      r5,-8(r1)       /* original length */
304         add     r6,r6,r5        /* r6 = one past the end of the destination */
305         subf    r3,r3,r6        /* #bytes not copied */
306 190:
307 191:
308 192:
309         blr                     /* #bytes not copied in r3 */
310
/*
 * Exception table for the code above.  Each .llong pair is
 * (address of an instruction that may fault, address of its fixup):
 * loads 20-43 map to fixups 120-143 and stores 70-92 map to 170-192.
 *
 * The entries use backward numeric references (NNb), so this table has
 * to stay after the fixups above and before .Lcopy_page_4K, which
 * defines labels 20-91 again for its own table.
 */
311         .section __ex_table,"a"
312         .align  3
313         .llong  20b,120b
314         .llong  21b,121b
315         .llong  70b,170b
316         .llong  22b,122b
317         .llong  71b,171b
318         .llong  72b,172b
319         .llong  23b,123b
320         .llong  73b,173b
321         .llong  74b,174b
322         .llong  75b,175b
323         .llong  24b,124b
324         .llong  25b,125b
325         .llong  26b,126b
326         .llong  27b,127b
327         .llong  28b,128b
328         .llong  29b,129b
329         .llong  30b,130b
330         .llong  31b,131b
331         .llong  32b,132b
332         .llong  76b,176b
333         .llong  33b,133b
334         .llong  77b,177b
335         .llong  78b,178b
336         .llong  79b,179b
337         .llong  80b,180b
338         .llong  34b,134b
339         .llong  35b,135b
340         .llong  81b,181b
341         .llong  36b,136b
342         .llong  82b,182b
343         .llong  37b,137b
344         .llong  83b,183b
345         .llong  38b,138b
346         .llong  39b,139b
347         .llong  84b,184b
348         .llong  85b,185b
349         .llong  40b,140b
350         .llong  86b,186b
351         .llong  41b,141b
352         .llong  87b,187b
353         .llong  42b,142b
354         .llong  88b,188b
355         .llong  43b,143b
356         .llong  89b,189b
357         .llong  90b,190b
358         .llong  91b,191b
359         .llong  92b,192b
360         
361         .text                   /* back to the code section */
362
363 /*
364  * Routine to copy a whole page of data, optimized for POWER4.
365  * On POWER4 it is more than 50% faster than the simple loop
366  * above (following the .Ldst_aligned label) but it runs slightly
367  * slower on POWER3.
368  */
/*
 * Reached from __copy_tofrom_user when the length is 4096 and both
 * pointers are 4096-byte aligned (r3 = dest, r4 = source).
 *
 * r20-r31 are saved below the stack pointer at -32(r1) .. -120(r1)
 * without moving r1, and restored before returning 0 in r3.
 *
 * The main loop copies from six streams spaced 128 bytes apart
 * (offsets 0, 128, 256, 384, 512 and 640).  The inner loop at 1: runs
 * 5 times moving 3 doublewords per stream, and the stores at 68-73 add
 * one more, so each outer pass moves 6 * 128 = 768 bytes.  r5 starts at
 * 4096/32 - 1 = 127 and drops by 24 per pass; the pass is repeated while
 * r5 >= 24, giving five passes (3840 bytes) and leaving r5 = 7.
 *
 * The remaining 256 bytes are copied 32 at a time: seven iterations of
 * the loop at 3: plus the final stores after 4:.
 *
 * Every numbered instruction is listed in the exception table that
 * follows the fixup at 100:, so a fault anywhere in here goes to 100.
 */
369 .Lcopy_page_4K:
370         std     r31,-32(1)
371         std     r30,-40(1)
372         std     r29,-48(1)
373         std     r28,-56(1)
374         std     r27,-64(1)
375         std     r26,-72(1)
376         std     r25,-80(1)
377         std     r24,-88(1)
378         std     r23,-96(1)
379         std     r22,-104(1)
380         std     r21,-112(1)
381         std     r20,-120(1)
382         li      r5,4096/32 - 1
383         addi    r3,r3,-8        /* bias dest so the stores start at 8(r3) */
384         li      r0,5            /* inner loop count */
385 0:      addi    r5,r5,-24
386         mtctr   r0
387 20:     ld      r22,640(4)
388 21:     ld      r21,512(4)
389 22:     ld      r20,384(4)
390 23:     ld      r11,256(4)
391 24:     ld      r9,128(4)
392 25:     ld      r7,0(4)
393 26:     ld      r25,648(4)
394 27:     ld      r24,520(4)
395 28:     ld      r23,392(4)
396 29:     ld      r10,264(4)
397 30:     ld      r8,136(4)
398 31:     ldu     r6,8(4)
399         cmpwi   r5,24           /* cr0 for the "bge 0b" after the inner loop */
400 1:
401 32:     std     r22,648(3)
402 33:     std     r21,520(3)
403 34:     std     r20,392(3)
404 35:     std     r11,264(3)
405 36:     std     r9,136(3)
406 37:     std     r7,8(3)
407 38:     ld      r28,648(4)
408 39:     ld      r27,520(4)
409 40:     ld      r26,392(4)
410 41:     ld      r31,264(4)
411 42:     ld      r30,136(4)
412 43:     ld      r29,8(4)
413 44:     std     r25,656(3)
414 45:     std     r24,528(3)
415 46:     std     r23,400(3)
416 47:     std     r10,272(3)
417 48:     std     r8,144(3)
418 49:     std     r6,16(3)
419 50:     ld      r22,656(4)
420 51:     ld      r21,528(4)
421 52:     ld      r20,400(4)
422 53:     ld      r11,272(4)
423 54:     ld      r9,144(4)
424 55:     ld      r7,16(4)
425 56:     std     r28,664(3)
426 57:     std     r27,536(3)
427 58:     std     r26,408(3)
428 59:     std     r31,280(3)
429 60:     std     r30,152(3)
430 61:     stdu    r29,24(3)
431 62:     ld      r25,664(4)
432 63:     ld      r24,536(4)
433 64:     ld      r23,408(4)
434 65:     ld      r10,280(4)
435 66:     ld      r8,152(4)
436 67:     ldu     r6,24(4)
437         bdnz    1b
438 68:     std     r22,648(3)
439 69:     std     r21,520(3)
440 70:     std     r20,392(3)
441 71:     std     r11,264(3)
442 72:     std     r9,136(3)
443 73:     std     r7,8(3)
444 74:     addi    r4,r4,640       /* step source to the next 768-byte group */
445 75:     addi    r3,r3,648       /* step dest likewise (keeping the -8 bias) */
446         bge     0b              /* another pass while r5 >= 24 */
447         mtctr   r5
448 76:     ld      r7,0(4)
449 77:     ld      r8,8(4)
450 78:     ldu     r9,16(4)
451 3:
452 79:     ld      r10,8(4)
453 80:     std     r7,8(3)
454 81:     ld      r7,16(4)
455 82:     std     r8,16(3)
456 83:     ld      r8,24(4)
457 84:     std     r9,24(3)
458 85:     ldu     r9,32(4)
459 86:     stdu    r10,32(3)
460         bdnz    3b
461 4:
462 87:     ld      r10,8(4)
463 88:     std     r7,8(3)
464 89:     std     r8,16(3)
465 90:     std     r9,24(3)
466 91:     std     r10,32(3)
467 9:      ld      r20,-120(1)
468         ld      r21,-112(1)
469         ld      r22,-104(1)
470         ld      r23,-96(1)
471         ld      r24,-88(1)
472         ld      r25,-80(1)
473         ld      r26,-72(1)
474         ld      r27,-64(1)
475         ld      r28,-56(1)
476         ld      r29,-48(1)
477         ld      r30,-40(1)
478         ld      r31,-32(1)
479         li      r3,0            /* whole page copied */
480         blr
481
482 /*
483  * on an exception, reset to the beginning and jump back into the
484  * standard __copy_tofrom_user
485  */
/*
 * Fixup for any fault inside .Lcopy_page_4K: restore r20-r31, reload the
 * original dest and source saved at entry, set the length back to 4096
 * and redo the copy with the ordinary loop, whose own fixups then work
 * out the exact number of bytes not copied.
 *
 * The page path branches away from the entry code before its
 * "mtcrf 0x01,r5", so cr7 still holds whatever the caller left in it.
 * .Ldst_aligned tests cr7 bit 0 to decide whether an extra doubleword
 * has to be copied; with a stale bit set the retry would move 8 bytes
 * more than the 4096 requested.  cr7 is therefore loaded from r5 here.
 * cr1 (length compared with 16, set at entry) is not touched by the
 * page loop, which only writes cr0.
 */
486 100:    ld      r20,-120(1)
487         ld      r21,-112(1)
488         ld      r22,-104(1)
489         ld      r23,-96(1)
490         ld      r24,-88(1)
491         ld      r25,-80(1)
492         ld      r26,-72(1)
493         ld      r27,-64(1)
494         ld      r28,-56(1)
495         ld      r29,-48(1)
496         ld      r30,-40(1)
497         ld      r31,-32(1)
498         ld      r3,-24(r1)      /* original dest */
499         ld      r4,-16(r1)      /* original source */
500         li      r5,4096
        mtcrf   0x01,r5         /* cr7 = low 4 bits of the length (all clear for 4096) */
501         b       .Ldst_aligned
502
/*
 * Exception table for .Lcopy_page_4K.  Each .llong pair is
 * (address of an instruction that may fault, address of its fixup);
 * every entry here uses the single fixup at 100:.
 *
 * The backward references (NNb) resolve to the labels 20-91 defined in
 * .Lcopy_page_4K because those are the nearest preceding definitions.
 */
503         .section __ex_table,"a"
504         .align  3
505         .llong  20b,100b
506         .llong  21b,100b
507         .llong  22b,100b
508         .llong  23b,100b
509         .llong  24b,100b
510         .llong  25b,100b
511         .llong  26b,100b
512         .llong  27b,100b
513         .llong  28b,100b
514         .llong  29b,100b
515         .llong  30b,100b
516         .llong  31b,100b
517         .llong  32b,100b
518         .llong  33b,100b
519         .llong  34b,100b
520         .llong  35b,100b
521         .llong  36b,100b
522         .llong  37b,100b
523         .llong  38b,100b
524         .llong  39b,100b
525         .llong  40b,100b
526         .llong  41b,100b
527         .llong  42b,100b
528         .llong  43b,100b
529         .llong  44b,100b
530         .llong  45b,100b
531         .llong  46b,100b
532         .llong  47b,100b
533         .llong  48b,100b
534         .llong  49b,100b
535         .llong  50b,100b
536         .llong  51b,100b
537         .llong  52b,100b
538         .llong  53b,100b
539         .llong  54b,100b
540         .llong  55b,100b
541         .llong  56b,100b
542         .llong  57b,100b
543         .llong  58b,100b
544         .llong  59b,100b
545         .llong  60b,100b
546         .llong  61b,100b
547         .llong  62b,100b
548         .llong  63b,100b
549         .llong  64b,100b
550         .llong  65b,100b
551         .llong  66b,100b
552         .llong  67b,100b
553         .llong  68b,100b
554         .llong  69b,100b
555         .llong  70b,100b
556         .llong  71b,100b
557         .llong  72b,100b
558         .llong  73b,100b
559         .llong  74b,100b        /* 74: is an addi, not a memory access */
560         .llong  75b,100b        /* 75: is an addi, not a memory access */
561         .llong  76b,100b
562         .llong  77b,100b
563         .llong  78b,100b
564         .llong  79b,100b
565         .llong  80b,100b
566         .llong  81b,100b
567         .llong  82b,100b
568         .llong  83b,100b
569         .llong  84b,100b
570         .llong  85b,100b
571         .llong  86b,100b
572         .llong  87b,100b
573         .llong  88b,100b
574         .llong  89b,100b
575         .llong  90b,100b
576         .llong  91b,100b