GitHub Repository: torvalds/linux
Path: blob/master/arch/sparc/lib/U1memcpy.S
/* SPDX-License-Identifier: GPL-2.0 */
/* U1memcpy.S: UltraSPARC-I/II/IIi/IIe optimized memcpy.
 *
 * Copyright (C) 1997, 2004 David S. Miller ([email protected])
 * Copyright (C) 1996, 1997, 1998, 1999 Jakub Jelinek ([email protected])
 */

#ifdef __KERNEL__
#include <linux/export.h>
#include <linux/linkage.h>
#include <asm/visasm.h>
#include <asm/asi.h>
#define GLOBAL_SPARE	g7
#else
#define GLOBAL_SPARE	g5
#define ASI_BLK_P 0xf0
#define FPRS_FEF  0x04
#ifdef MEMCPY_DEBUG
#define VISEntry rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs; \
		 clr %g1; clr %g2; clr %g3; subcc %g0, %g0, %g0;
#define VISExit and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#else
#define VISEntry rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs
#define VISExit and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#endif
#endif

#ifndef EX_LD
#define EX_LD(x,y)	x
#endif
#ifndef EX_LD_FP
#define EX_LD_FP(x,y)	x
#endif

#ifndef EX_ST
#define EX_ST(x,y)	x
#endif
#ifndef EX_ST_FP
#define EX_ST_FP(x,y)	x
#endif

#ifndef LOAD
#define LOAD(type,addr,dest)	type [addr], dest
#endif

#ifndef LOAD_BLK
#define LOAD_BLK(addr,dest)	ldda [addr] ASI_BLK_P, dest
#endif

#ifndef STORE
#define STORE(type,src,addr)	type src, [addr]
#endif

#ifndef STORE_BLK
#define STORE_BLK(src,addr)	stda src, [addr] ASI_BLK_P
#endif

#ifndef FUNC_NAME
#define FUNC_NAME	memcpy
#endif

#ifndef PREAMBLE
#define PREAMBLE
#endif

#ifndef XCC
#define XCC xcc
#endif

#define FREG_FROB(f1, f2, f3, f4, f5, f6, f7, f8, f9)	\
	faligndata		%f1, %f2, %f48;		\
	faligndata		%f2, %f3, %f50;		\
	faligndata		%f3, %f4, %f52;		\
	faligndata		%f4, %f5, %f54;		\
	faligndata		%f5, %f6, %f56;		\
	faligndata		%f6, %f7, %f58;		\
	faligndata		%f7, %f8, %f60;		\
	faligndata		%f8, %f9, %f62;

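/* FREG_FROB above realigns one 64-byte block: its eight faligndata ops
 * merge nine consecutive source double registers into the aligned
 * output block %f48-%f62, using the byte offset programmed into the
 * GSR by the alignaddr instruction in the setup code below.
 */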
#define MAIN_LOOP_CHUNK(src, dest, fdest, fsrc, jmptgt)	\
	EX_LD_FP(LOAD_BLK(%src, %fdest), U1_gs_80_fp);	\
	EX_ST_FP(STORE_BLK(%fsrc, %dest), U1_gs_80_fp);	\
	add			%src, 0x40, %src;	\
	subcc			%GLOBAL_SPARE, 0x40, %GLOBAL_SPARE; \
	be,pn			%xcc, jmptgt;		\
	 add			%dest, 0x40, %dest;	\

#define LOOP_CHUNK1(src, dest, branch_dest)		\
	MAIN_LOOP_CHUNK(src, dest, f0,  f48, branch_dest)
#define LOOP_CHUNK2(src, dest, branch_dest)		\
	MAIN_LOOP_CHUNK(src, dest, f16, f48, branch_dest)
#define LOOP_CHUNK3(src, dest, branch_dest)		\
	MAIN_LOOP_CHUNK(src, dest, f32, f48, branch_dest)

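/* Each LOOP_CHUNK block-loads the next 64 source bytes into one of the
 * three register groups (%f0, %f16, %f32), block-stores the previously
 * realigned block from %f48, and branches to its exit label once
 * %GLOBAL_SPARE (bytes left for the main loop) reaches zero; the
 * destination increment rides in the branch delay slot.
 */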
#define DO_SYNC			membar	#Sync;
#define STORE_SYNC(dest, fsrc)				\
	EX_ST_FP(STORE_BLK(%fsrc, %dest), U1_gs_80_fp);	\
	add			%dest, 0x40, %dest;	\
	DO_SYNC

#define STORE_JUMP(dest, fsrc, target)			\
	EX_ST_FP(STORE_BLK(%fsrc, %dest), U1_gs_40_fp);	\
	add			%dest, 0x40, %dest;	\
	ba,pt			%xcc, target;		\
	 nop;

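/* These end one pass of an unrolled loop body: STORE_SYNC emits the
 * pending realigned block followed by a membar #Sync, since an
 * UltraSPARC block store does not interlock with later reuse of its
 * source registers; STORE_JUMP emits the final block and branches to
 * the matching 40f-63f finish chunk.
 */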
#define FINISH_VISCHUNK(dest, f0, f1)			\
	subcc			%g3, 8, %g3;		\
	bl,pn			%xcc, 95f;		\
	 faligndata		%f0, %f1, %f48;		\
	EX_ST_FP(STORE(std, %f48, %dest), U1_g3_8_fp);	\
	add			%dest, 8, %dest;

#define UNEVEN_VISCHUNK_LAST(dest, f0, f1)		\
	subcc			%g3, 8, %g3;		\
	bl,pn			%xcc, 95f;		\
	 fsrc2			%f0, %f1;

#define UNEVEN_VISCHUNK(dest, f0, f1)			\
	UNEVEN_VISCHUNK_LAST(dest, f0, f1)		\
	ba,a,pt			%xcc, 93f;

	.register	%g2,#scratch
	.register	%g3,#scratch

	.text
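/* The U1_* stubs below are the exception fixup targets named in the
 * EX_LD/EX_ST annotations used throughout this file.  When an
 * annotated load or store faults, its stub reconstructs the number of
 * bytes not yet copied from whatever counters are live at that point
 * (%g1, %g2, %g3, %GLOBAL_SPARE, and the residual %o2) and returns it
 * in %o0; stubs reached while the FPU is active drop VIS state first
 * via VISExitHalf.
 */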
#ifndef EX_RETVAL
#define EX_RETVAL(x)	x
ENTRY(U1_g1_1_fp)
	VISExitHalf
	add		%g1, 1, %g1
	add		%g1, %g2, %g1
	retl
	 add		%g1, %o2, %o0
ENDPROC(U1_g1_1_fp)
ENTRY(U1_g2_0_fp)
	VISExitHalf
	retl
	 add		%g2, %o2, %o0
ENDPROC(U1_g2_0_fp)
ENTRY(U1_g2_8_fp)
	VISExitHalf
	add		%g2, 8, %g2
	retl
	 add		%g2, %o2, %o0
ENDPROC(U1_g2_8_fp)
ENTRY(U1_gs_0_fp)
	VISExitHalf
	add		%GLOBAL_SPARE, %g3, %o0
	retl
	 add		%o0, %o2, %o0
ENDPROC(U1_gs_0_fp)
ENTRY(U1_gs_80_fp)
	VISExitHalf
	add		%GLOBAL_SPARE, 0x80, %GLOBAL_SPARE
	add		%GLOBAL_SPARE, %g3, %o0
	retl
	 add		%o0, %o2, %o0
ENDPROC(U1_gs_80_fp)
ENTRY(U1_gs_40_fp)
	VISExitHalf
	add		%GLOBAL_SPARE, 0x40, %GLOBAL_SPARE
	add		%GLOBAL_SPARE, %g3, %o0
	retl
	 add		%o0, %o2, %o0
ENDPROC(U1_gs_40_fp)
ENTRY(U1_g3_8_fp)
	VISExitHalf
	add		%g3, 8, %g3
	retl
	 add		%g3, %o2, %o0
ENDPROC(U1_g3_8_fp)
ENTRY(U1_g3_16_fp)
	VISExitHalf
	add		%g3, 16, %g3
	retl
	 add		%g3, %o2, %o0
ENDPROC(U1_g3_16_fp)
ENTRY(U1_o2_0_fp)
	VISExitHalf
	retl
	 mov		%o2, %o0
ENDPROC(U1_o2_0_fp)
ENTRY(U1_o2_1_fp)
	VISExitHalf
	retl
	 add		%o2, 1, %o0
ENDPROC(U1_o2_1_fp)
ENTRY(U1_gs_0)
	VISExitHalf
	retl
	 add		%GLOBAL_SPARE, %o2, %o0
ENDPROC(U1_gs_0)
ENTRY(U1_gs_8)
	VISExitHalf
	add		%GLOBAL_SPARE, %o2, %GLOBAL_SPARE
	retl
	 add		%GLOBAL_SPARE, 0x8, %o0
ENDPROC(U1_gs_8)
ENTRY(U1_gs_10)
	VISExitHalf
	add		%GLOBAL_SPARE, %o2, %GLOBAL_SPARE
	retl
	 add		%GLOBAL_SPARE, 0x10, %o0
ENDPROC(U1_gs_10)
ENTRY(U1_o2_0)
	retl
	 mov		%o2, %o0
ENDPROC(U1_o2_0)
ENTRY(U1_o2_8)
	retl
	 add		%o2, 8, %o0
ENDPROC(U1_o2_8)
ENTRY(U1_o2_4)
	retl
	 add		%o2, 4, %o0
ENDPROC(U1_o2_4)
ENTRY(U1_o2_1)
	retl
	 add		%o2, 1, %o0
ENDPROC(U1_o2_1)
ENTRY(U1_g1_0)
	retl
	 add		%g1, %o2, %o0
ENDPROC(U1_g1_0)
ENTRY(U1_g1_1)
	add		%g1, 1, %g1
	retl
	 add		%g1, %o2, %o0
ENDPROC(U1_g1_1)
ENTRY(U1_gs_0_o2_adj)
	and		%o2, 7, %o2
	retl
	 add		%GLOBAL_SPARE, %o2, %o0
ENDPROC(U1_gs_0_o2_adj)
ENTRY(U1_gs_8_o2_adj)
	and		%o2, 7, %o2
	add		%GLOBAL_SPARE, 8, %GLOBAL_SPARE
	retl
	 add		%GLOBAL_SPARE, %o2, %o0
ENDPROC(U1_gs_8_o2_adj)
#endif

	.align		64

	.globl		FUNC_NAME
	.type		FUNC_NAME,#function
FUNC_NAME:	/* %o0=dst, %o1=src, %o2=len */
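	/* A length with any of bits 63:31 set cannot be a real copy size
	 * (most likely a negative 32-bit value was passed in), so the
	 * next three instructions force a trap instead of silently
	 * running off with it.
	 */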
	srlx		%o2, 31, %g2
	cmp		%g2, 0
	tne		%xcc, 5
	PREAMBLE
	mov		%o0, %o4
	cmp		%o2, 0
	be,pn		%XCC, 85f
	 or		%o0, %o1, %o3
	cmp		%o2, 16
	blu,a,pn	%XCC, 80f
	 or		%o3, %o2, %o3

	cmp		%o2, (5 * 64)
	blu,pt		%XCC, 70f
	 andcc		%o3, 0x7, %g0

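	/* Size dispatch: len == 0 returns via 85f; len < 16 goes to the
	 * short-copy code at 80f; len < (5 * 64) takes the integer path
	 * at 70f.  Only larger copies pay for the VIS setup below.  The
	 * andcc in the delay slot tests dst|src for 8-byte alignment on
	 * behalf of the 70f path.
	 */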
	/* Clobbers o5/g1/g2/g3/g7/icc/xcc. */
	VISEntry

	/* Is 'dst' already aligned on a 64-byte boundary? */
	andcc		%o0, 0x3f, %g2
	be,pt		%XCC, 2f

	/* Compute abs((dst & 0x3f) - 0x40) into %g2.  This is the number
	 * of bytes to copy to make 'dst' 64-byte aligned.  We pre-
	 * subtract this from 'len'.
	 */
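	/* For example, dst & 0x3f == 0x08 gives g2 = 0x08 - 0x40 = -0x38,
	 * negated to 0x38: 56 bytes bring dst up to the next 64-byte
	 * boundary.
	 */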
	sub		%o0, %o1, %GLOBAL_SPARE
	sub		%g2, 0x40, %g2
	sub		%g0, %g2, %g2
	sub		%o2, %g2, %o2
	andcc		%g2, 0x7, %g1
	be,pt		%icc, 2f
	 and		%g2, 0x38, %g2

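	/* %GLOBAL_SPARE now holds (dst - src), so each store in the
	 * alignment loop below can address the destination as
	 * src + %GLOBAL_SPARE while only the source pointer is advanced.
	 */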
1:	subcc		%g1, 0x1, %g1
	EX_LD_FP(LOAD(ldub, %o1 + 0x00, %o3), U1_g1_1_fp)
	EX_ST_FP(STORE(stb, %o3, %o1 + %GLOBAL_SPARE), U1_g1_1_fp)
	bgu,pt		%XCC, 1b
	 add		%o1, 0x1, %o1

	add		%o1, %GLOBAL_SPARE, %o0

2:	cmp		%g2, 0x0
	and		%o1, 0x7, %g1
	be,pt		%icc, 3f
	 alignaddr	%o1, %g0, %o1

	EX_LD_FP(LOAD(ldd, %o1, %f4), U1_g2_0_fp)
1:	EX_LD_FP(LOAD(ldd, %o1 + 0x8, %f6), U1_g2_0_fp)
	add		%o1, 0x8, %o1
	subcc		%g2, 0x8, %g2
	faligndata	%f4, %f6, %f0
	EX_ST_FP(STORE(std, %f0, %o0), U1_g2_8_fp)
	be,pn		%icc, 3f
	 add		%o0, 0x8, %o0

	EX_LD_FP(LOAD(ldd, %o1 + 0x8, %f4), U1_g2_0_fp)
	add		%o1, 0x8, %o1
	subcc		%g2, 0x8, %g2
	faligndata	%f6, %f4, %f0
	EX_ST_FP(STORE(std, %f0, %o0), U1_g2_8_fp)
	bne,pt		%icc, 1b
	 add		%o0, 0x8, %o0

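	/* The loop above ping-pongs between %f4 and %f6 so that each
	 * faligndata always sees both the previous and the current
	 * doubleword of the misaligned source.
	 */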
	/* Destination is 64-byte aligned. */
3:
	membar		  #LoadStore | #StoreStore | #StoreLoad

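	/* Carve up the remaining work: %GLOBAL_SPARE = bytes handled by
	 * the 64-byte block loop, %g3 = realigned doubleword tail drained
	 * at 93f/95f, %o2 = leftover bytes, and %g2 = source alignment in
	 * doublewords (0-7), which selects one of the eight unrolled loop
	 * bodies below.
	 */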
	subcc		%o2, 0x40, %GLOBAL_SPARE
	add		%o1, %g1, %g1
	andncc		%GLOBAL_SPARE, (0x40 - 1), %GLOBAL_SPARE
	srl		%g1, 3, %g2
	sub		%o2, %GLOBAL_SPARE, %g3
	andn		%o1, (0x40 - 1), %o1
	and		%g2, 7, %g2
	andncc		%g3, 0x7, %g3
	fsrc2		%f0, %f2
	sub		%g3, 0x8, %g3
	sub		%o2, %GLOBAL_SPARE, %o2

	add		%g1, %GLOBAL_SPARE, %g1
	subcc		%o2, %g3, %o2

	EX_LD_FP(LOAD_BLK(%o1, %f0), U1_gs_0_fp)
	add		%o1, 0x40, %o1
	add		%g1, %g3, %g1
	EX_LD_FP(LOAD_BLK(%o1, %f16), U1_gs_0_fp)
	add		%o1, 0x40, %o1
	sub		%GLOBAL_SPARE, 0x80, %GLOBAL_SPARE
	EX_LD_FP(LOAD_BLK(%o1, %f32), U1_gs_80_fp)
	add		%o1, 0x40, %o1

	/* There are 8 instances of the unrolled loop,
	 * one for each possible alignment of the
	 * source buffer.  Each loop instance is 452
	 * bytes.
	 */
	sll		%g2, 3, %o3
	sub		%o3, %g2, %o3
	sllx		%o3, 4, %o3
	add		%o3, %g2, %o3
	sllx		%o3, 2, %g2
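	/* The shift/add sequence above computes %g2 * 452 without a
	 * multiply: ((g2*8 - g2)*16 + g2)*4 = (113*g2)*4 = 452*g2, the
	 * byte offset of the loop instance matching this source
	 * alignment.
	 */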
1:	rd		%pc, %o3
	add		%o3, %lo(1f - 1b), %o3
	jmpl		%o3 + %g2, %g0
	 nop

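	/* rd %pc reads the address of the "1:" just above; adding
	 * %lo(1f - 1b) rebases that to the .align 64 table below, and
	 * jmpl then lands %g2 bytes into the table, entering the loop
	 * instance selected above.
	 */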
	.align		64
1:	FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16)
	LOOP_CHUNK1(o1, o0, 1f)
	FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32)
	LOOP_CHUNK2(o1, o0, 2f)
	FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0)
	LOOP_CHUNK3(o1, o0, 3f)
	ba,pt		%xcc, 1b+4
	 faligndata	%f0, %f2, %f48
1:	FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32)
	STORE_SYNC(o0, f48)
	FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0)
	STORE_JUMP(o0, f48, 40f)
2:	FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0)
	STORE_SYNC(o0, f48)
	FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16)
	STORE_JUMP(o0, f48, 48f)
3:	FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16)
	STORE_SYNC(o0, f48)
	FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32)
	STORE_JUMP(o0, f48, 56f)

1:	FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18)
	LOOP_CHUNK1(o1, o0, 1f)
	FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34)
	LOOP_CHUNK2(o1, o0, 2f)
	FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2)
	LOOP_CHUNK3(o1, o0, 3f)
	ba,pt		%xcc, 1b+4
	 faligndata	%f2, %f4, %f48
1:	FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34)
	STORE_SYNC(o0, f48)
	FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2)
	STORE_JUMP(o0, f48, 41f)
2:	FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2)
	STORE_SYNC(o0, f48)
	FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18)
	STORE_JUMP(o0, f48, 49f)
3:	FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18)
	STORE_SYNC(o0, f48)
	FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34)
	STORE_JUMP(o0, f48, 57f)

1:	FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20)
	LOOP_CHUNK1(o1, o0, 1f)
	FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36)
	LOOP_CHUNK2(o1, o0, 2f)
	FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4)
	LOOP_CHUNK3(o1, o0, 3f)
	ba,pt		%xcc, 1b+4
	 faligndata	%f4, %f6, %f48
1:	FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36)
	STORE_SYNC(o0, f48)
	FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4)
	STORE_JUMP(o0, f48, 42f)
2:	FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4)
	STORE_SYNC(o0, f48)
	FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20)
	STORE_JUMP(o0, f48, 50f)
3:	FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20)
	STORE_SYNC(o0, f48)
	FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36)
	STORE_JUMP(o0, f48, 58f)

1:	FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22)
	LOOP_CHUNK1(o1, o0, 1f)
	FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38)
	LOOP_CHUNK2(o1, o0, 2f)
	FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6)
	LOOP_CHUNK3(o1, o0, 3f)
	ba,pt		%xcc, 1b+4
	 faligndata	%f6, %f8, %f48
1:	FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38)
	STORE_SYNC(o0, f48)
	FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6)
	STORE_JUMP(o0, f48, 43f)
2:	FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6)
	STORE_SYNC(o0, f48)
	FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22)
	STORE_JUMP(o0, f48, 51f)
3:	FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22)
	STORE_SYNC(o0, f48)
	FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38)
	STORE_JUMP(o0, f48, 59f)

1:	FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24)
	LOOP_CHUNK1(o1, o0, 1f)
	FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40)
	LOOP_CHUNK2(o1, o0, 2f)
	FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8)
	LOOP_CHUNK3(o1, o0, 3f)
	ba,pt		%xcc, 1b+4
	 faligndata	%f8, %f10, %f48
1:	FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40)
	STORE_SYNC(o0, f48)
	FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8)
	STORE_JUMP(o0, f48, 44f)
2:	FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8)
	STORE_SYNC(o0, f48)
	FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24)
	STORE_JUMP(o0, f48, 52f)
3:	FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24)
	STORE_SYNC(o0, f48)
	FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40)
	STORE_JUMP(o0, f48, 60f)

1:	FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26)
	LOOP_CHUNK1(o1, o0, 1f)
	FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42)
	LOOP_CHUNK2(o1, o0, 2f)
	FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10)
	LOOP_CHUNK3(o1, o0, 3f)
	ba,pt		%xcc, 1b+4
	 faligndata	%f10, %f12, %f48
1:	FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42)
	STORE_SYNC(o0, f48)
	FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10)
	STORE_JUMP(o0, f48, 45f)
2:	FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10)
	STORE_SYNC(o0, f48)
	FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26)
	STORE_JUMP(o0, f48, 53f)
3:	FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26)
	STORE_SYNC(o0, f48)
	FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42)
	STORE_JUMP(o0, f48, 61f)

1:	FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28)
	LOOP_CHUNK1(o1, o0, 1f)
	FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44)
	LOOP_CHUNK2(o1, o0, 2f)
	FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12)
	LOOP_CHUNK3(o1, o0, 3f)
	ba,pt		%xcc, 1b+4
	 faligndata	%f12, %f14, %f48
1:	FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44)
	STORE_SYNC(o0, f48)
	FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12)
	STORE_JUMP(o0, f48, 46f)
2:	FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12)
	STORE_SYNC(o0, f48)
	FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28)
	STORE_JUMP(o0, f48, 54f)
3:	FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28)
	STORE_SYNC(o0, f48)
	FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44)
	STORE_JUMP(o0, f48, 62f)

1:	FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30)
	LOOP_CHUNK1(o1, o0, 1f)
	FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46)
	LOOP_CHUNK2(o1, o0, 2f)
	FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14)
	LOOP_CHUNK3(o1, o0, 3f)
	ba,pt		%xcc, 1b+4
	 faligndata	%f14, %f16, %f48
1:	FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46)
	STORE_SYNC(o0, f48)
	FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14)
	STORE_JUMP(o0, f48, 47f)
2:	FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14)
	STORE_SYNC(o0, f48)
	FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30)
	STORE_JUMP(o0, f48, 55f)
3:	FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30)
	STORE_SYNC(o0, f48)
	FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46)
	STORE_JUMP(o0, f48, 63f)

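	/* Finish chunks.  The main loop exits to one of these with up to
	 * seven realigned doublewords still to be produced; each
	 * FINISH_VISCHUNK realigns and stores one, branching to the tail
	 * code at 95f once %g3 is exhausted, while the UNEVEN_VISCHUNK
	 * variants wrap back to %f0 and fall into the generic loop at
	 * 93f.
	 */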
40:	FINISH_VISCHUNK(o0, f0,  f2)
41:	FINISH_VISCHUNK(o0, f2,  f4)
42:	FINISH_VISCHUNK(o0, f4,  f6)
43:	FINISH_VISCHUNK(o0, f6,  f8)
44:	FINISH_VISCHUNK(o0, f8,  f10)
45:	FINISH_VISCHUNK(o0, f10, f12)
46:	FINISH_VISCHUNK(o0, f12, f14)
47:	UNEVEN_VISCHUNK(o0, f14, f0)
48:	FINISH_VISCHUNK(o0, f16, f18)
49:	FINISH_VISCHUNK(o0, f18, f20)
50:	FINISH_VISCHUNK(o0, f20, f22)
51:	FINISH_VISCHUNK(o0, f22, f24)
52:	FINISH_VISCHUNK(o0, f24, f26)
53:	FINISH_VISCHUNK(o0, f26, f28)
54:	FINISH_VISCHUNK(o0, f28, f30)
55:	UNEVEN_VISCHUNK(o0, f30, f0)
56:	FINISH_VISCHUNK(o0, f32, f34)
57:	FINISH_VISCHUNK(o0, f34, f36)
58:	FINISH_VISCHUNK(o0, f36, f38)
59:	FINISH_VISCHUNK(o0, f38, f40)
60:	FINISH_VISCHUNK(o0, f40, f42)
61:	FINISH_VISCHUNK(o0, f42, f44)
62:	FINISH_VISCHUNK(o0, f44, f46)
63:	UNEVEN_VISCHUNK_LAST(o0, f46, f0)

93:	EX_LD_FP(LOAD(ldd, %o1, %f2), U1_g3_8_fp)
	add		%o1, 8, %o1
	subcc		%g3, 8, %g3
	faligndata	%f0, %f2, %f8
	EX_ST_FP(STORE(std, %f8, %o0), U1_g3_16_fp)
	bl,pn		%xcc, 95f
	 add		%o0, 8, %o0
	EX_LD_FP(LOAD(ldd, %o1, %f0), U1_g3_8_fp)
	add		%o1, 8, %o1
	subcc		%g3, 8, %g3
	faligndata	%f2, %f0, %f8
	EX_ST_FP(STORE(std, %f8, %o0), U1_g3_16_fp)
	bge,pt		%xcc, 93b
	 add		%o0, 8, %o0

95:	brz,pt		%o2, 2f
	 mov		%g1, %o1

1:	EX_LD_FP(LOAD(ldub, %o1, %o3), U1_o2_0_fp)
	add		%o1, 1, %o1
	subcc		%o2, 1, %o2
	EX_ST_FP(STORE(stb, %o3, %o0), U1_o2_1_fp)
	bne,pt		%xcc, 1b
	 add		%o0, 1, %o0

2:	membar		  #StoreLoad | #StoreStore
	VISExit
	retl
	 mov		EX_RETVAL(%o4), %o0

	.align		64
70:	/* 16 < len <= (5 * 64) */
	bne,pn		%XCC, 75f
	 sub		%o0, %o1, %o3

72:	andn		%o2, 0xf, %GLOBAL_SPARE
	and		%o2, 0xf, %o2
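	/* Both pointers are 8-byte aligned here: copy 16 bytes per
	 * iteration with integer loads, again addressing the destination
	 * as src + %o3 where %o3 = dst - src.
	 */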
1:	EX_LD(LOAD(ldx, %o1 + 0x00, %o5), U1_gs_0)
	EX_LD(LOAD(ldx, %o1 + 0x08, %g1), U1_gs_0)
	subcc		%GLOBAL_SPARE, 0x10, %GLOBAL_SPARE
	EX_ST(STORE(stx, %o5, %o1 + %o3), U1_gs_10)
	add		%o1, 0x8, %o1
	EX_ST(STORE(stx, %g1, %o1 + %o3), U1_gs_8)
	bgu,pt		%XCC, 1b
	 add		%o1, 0x8, %o1
73:	andcc		%o2, 0x8, %g0
	be,pt		%XCC, 1f
	 nop
	EX_LD(LOAD(ldx, %o1, %o5), U1_o2_0)
	sub		%o2, 0x8, %o2
	EX_ST(STORE(stx, %o5, %o1 + %o3), U1_o2_8)
	add		%o1, 0x8, %o1
1:	andcc		%o2, 0x4, %g0
	be,pt		%XCC, 1f
	 nop
	EX_LD(LOAD(lduw, %o1, %o5), U1_o2_0)
	sub		%o2, 0x4, %o2
	EX_ST(STORE(stw, %o5, %o1 + %o3), U1_o2_4)
	add		%o1, 0x4, %o1
1:	cmp		%o2, 0
	be,pt		%XCC, 85f
	 nop
	ba,pt		%xcc, 90f
	 nop

75:	andcc		%o0, 0x7, %g1
	sub		%g1, 0x8, %g1
	be,pn		%icc, 2f
	 sub		%g0, %g1, %g1
	sub		%o2, %g1, %o2

1:	EX_LD(LOAD(ldub, %o1, %o5), U1_g1_0)
	subcc		%g1, 1, %g1
	EX_ST(STORE(stb, %o5, %o1 + %o3), U1_g1_1)
	bgu,pt		%icc, 1b
	 add		%o1, 1, %o1

2:	add		%o1, %o3, %o0
	andcc		%o1, 0x7, %g1
	bne,pt		%icc, 8f
	 sll		%g1, 3, %g1

	cmp		%o2, 16
	bgeu,pt		%icc, 72b
	 nop
	ba,a,pt		%xcc, 73b

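	/* Source is misaligned while the destination is now 8-byte
	 * aligned: each destination doubleword is built by shifting the
	 * previous aligned load left by the misalignment (%g1, in bits)
	 * and OR-ing in the next load shifted right by 64 - %g1 bits
	 * (held in %o3).
	 */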
8:	mov		64, %o3
	andn		%o1, 0x7, %o1
	EX_LD(LOAD(ldx, %o1, %g2), U1_o2_0)
	sub		%o3, %g1, %o3
	andn		%o2, 0x7, %GLOBAL_SPARE
	sllx		%g2, %g1, %g2
1:	EX_LD(LOAD(ldx, %o1 + 0x8, %g3), U1_gs_0_o2_adj)
	subcc		%GLOBAL_SPARE, 0x8, %GLOBAL_SPARE
	add		%o1, 0x8, %o1
	srlx		%g3, %o3, %o5
	or		%o5, %g2, %o5
	EX_ST(STORE(stx, %o5, %o0), U1_gs_8_o2_adj)
	add		%o0, 0x8, %o0
	bgu,pt		%icc, 1b
	 sllx		%g3, %g1, %g2

	srl		%g1, 3, %g1
	andcc		%o2, 0x7, %o2
	be,pn		%icc, 85f
	 add		%o1, %g1, %o1
	ba,pt		%xcc, 90f
	 sub		%o0, %o1, %o3

	.align		64
80:	/* 0 < len <= 16 */
	andcc		%o3, 0x3, %g0
	bne,pn		%XCC, 90f
	 sub		%o0, %o1, %o3

1:	EX_LD(LOAD(lduw, %o1, %g1), U1_o2_0)
	subcc		%o2, 4, %o2
	EX_ST(STORE(stw, %g1, %o1 + %o3), U1_o2_4)
	bgu,pt		%XCC, 1b
	 add		%o1, 4, %o1

85:	retl
	 mov		EX_RETVAL(%o4), %o0

	.align		32
90:	EX_LD(LOAD(ldub, %o1, %g1), U1_o2_0)
	subcc		%o2, 1, %o2
	EX_ST(STORE(stb, %g1, %o1 + %o3), U1_o2_1)
	bgu,pt		%XCC, 90b
	 add		%o1, 1, %o1
	retl
	 mov		EX_RETVAL(%o4), %o0

	.size		FUNC_NAME, .-FUNC_NAME
EXPORT_SYMBOL(FUNC_NAME)