GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/bionic-x86_64-string/sse2-memmove-slm.S
/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include "cache.h"

#ifndef MEMMOVE
# define MEMMOVE memmove
#endif

#ifndef L
# define L(label) .L##label
#endif

#ifndef cfi_startproc
# define cfi_startproc .cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc .cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg) .cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
#endif

#ifndef ENTRY
# define ENTRY(name) \
	.type name, @function; \
	.globl name; \
	.p2align 4; \
name: \
	cfi_startproc
#endif

#ifndef ALIAS_SYMBOL
# define ALIAS_SYMBOL(alias, original) \
	.globl alias; \
	.equ alias, original
#endif

#ifndef END
# define END(name) \
	cfi_endproc; \
	.size name, .-name
#endif

#define CFI_PUSH(REG) \
	cfi_adjust_cfa_offset (4); \
	cfi_rel_offset (REG, 0)

#define CFI_POP(REG) \
	cfi_adjust_cfa_offset (-4); \
	cfi_restore (REG)

#define PUSH(REG) push REG;
#define POP(REG) pop REG;

#define ENTRANCE PUSH (%rbx);
#define RETURN_END POP (%rbx); ret
#define RETURN RETURN_END;

.section .text.sse2,"ax",@progbits
ENTRY (MEMMOVE)
	ENTRANCE
	mov %rdi, %rax

/* Check whether we should copy backward or forward.  */
	cmp %rsi, %rdi
	je L(mm_return)
	jg L(mm_len_0_or_more_backward)
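
/* The three instructions above pick the copy direction so that overlapping
   regions are handled correctly.  As an illustrative C-style sketch only
   (dst = %rdi, src = %rsi, len = %rdx; not part of the build):

       if (dst == src) return dst;        // nothing to move
       else if (dst < src) copy forward;  // never clobbers unread source bytes
       else copy backward;                // dst > src: walk from the end
 */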

/* Now check the length.  The ranges [0..16], (16..32], (32..64], (64..128]
   and everything above 128 bytes are handled separately.  */
	cmp $16, %rdx
	jbe L(mm_len_0_16_bytes_forward)

	cmp $32, %rdx
	ja L(mm_len_32_or_more_forward)

/* Copy (16..32] bytes and return.  */
	movdqu (%rsi), %xmm0
	movdqu -16(%rsi, %rdx), %xmm1
	movdqu %xmm0, (%rdi)
	movdqu %xmm1, -16(%rdi, %rdx)
	jmp L(mm_return)

L(mm_len_32_or_more_forward):
	cmp $64, %rdx
	ja L(mm_len_64_or_more_forward)

/* Copy (32..64] bytes and return.  */
	movdqu (%rsi), %xmm0
	movdqu 16(%rsi), %xmm1
	movdqu -16(%rsi, %rdx), %xmm2
	movdqu -32(%rsi, %rdx), %xmm3
	movdqu %xmm0, (%rdi)
	movdqu %xmm1, 16(%rdi)
	movdqu %xmm2, -16(%rdi, %rdx)
	movdqu %xmm3, -32(%rdi, %rdx)
	jmp L(mm_return)

L(mm_len_64_or_more_forward):
	cmp $128, %rdx
	ja L(mm_len_128_or_more_forward)

/* Copy (64..128] bytes and return.  */
	movdqu (%rsi), %xmm0
	movdqu 16(%rsi), %xmm1
	movdqu 32(%rsi), %xmm2
	movdqu 48(%rsi), %xmm3
	movdqu -64(%rsi, %rdx), %xmm4
	movdqu -48(%rsi, %rdx), %xmm5
	movdqu -32(%rsi, %rdx), %xmm6
	movdqu -16(%rsi, %rdx), %xmm7
	movdqu %xmm0, (%rdi)
	movdqu %xmm1, 16(%rdi)
	movdqu %xmm2, 32(%rdi)
	movdqu %xmm3, 48(%rdi)
	movdqu %xmm4, -64(%rdi, %rdx)
	movdqu %xmm5, -48(%rdi, %rdx)
	movdqu %xmm6, -32(%rdi, %rdx)
	movdqu %xmm7, -16(%rdi, %rdx)
	jmp L(mm_return)

L(mm_len_128_or_more_forward):
/* Align the destination address to a 64-byte boundary.  */
/* Save the first (unaligned) 64 bytes of the source before switching to
   aligned stores.  */
	movdqu (%rsi), %xmm0
	movdqu 16(%rsi), %xmm1
	movdqu 32(%rsi), %xmm2
	movdqu 48(%rsi), %xmm3

	lea 64(%rdi), %r8
	and $-64, %r8 /* r8 now aligned to the next 64-byte boundary */
	sub %rdi, %rsi /* rsi = src - dst = diff */

	movdqu (%r8, %rsi), %xmm4
	movdqu 16(%r8, %rsi), %xmm5
	movdqu 32(%r8, %rsi), %xmm6
	movdqu 48(%r8, %rsi), %xmm7

	movdqu %xmm0, (%rdi)
	movdqu %xmm1, 16(%rdi)
	movdqu %xmm2, 32(%rdi)
	movdqu %xmm3, 48(%rdi)
	movdqa %xmm4, (%r8)
	movaps %xmm5, 16(%r8)
	movaps %xmm6, 32(%r8)
	movaps %xmm7, 48(%r8)
	add $64, %r8
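
/* At this point everything below %r8 in the destination has been copied:
   the first 64 bytes with unaligned stores and one 64-byte block with
   aligned stores.  %rsi now holds the constant (src - dst), so (%r8,%rsi)
   addresses the source byte that corresponds to the destination byte at
   (%r8); the loops below rely on this to keep their stores 64-byte
   aligned.  */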

	lea (%rdi, %rdx), %rbx
	and $-64, %rbx
	cmp %r8, %rbx
	jbe L(mm_copy_remaining_forward)

	cmp $SHARED_CACHE_SIZE_HALF, %rdx
	jae L(mm_large_page_loop_forward)
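
/* %rbx is the 64-byte-aligned end of the destination, so the block loops
   below run while %r8 < %rbx.  Copies of at least SHARED_CACHE_SIZE_HALF
   bytes (the macro is expected to come from cache.h) take the non-temporal
   path to avoid displacing a large part of the shared cache; smaller copies
   use the prefetching loop with ordinary aligned stores.  */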

	.p2align 4
L(mm_main_loop_forward):

	prefetcht0 128(%r8, %rsi)

	movdqu (%r8, %rsi), %xmm0
	movdqu 16(%r8, %rsi), %xmm1
	movdqu 32(%r8, %rsi), %xmm2
	movdqu 48(%r8, %rsi), %xmm3
	movdqa %xmm0, (%r8)
	movaps %xmm1, 16(%r8)
	movaps %xmm2, 32(%r8)
	movaps %xmm3, 48(%r8)
	lea 64(%r8), %r8
	cmp %r8, %rbx
	ja L(mm_main_loop_forward)

L(mm_copy_remaining_forward):
	add %rdi, %rdx
	sub %r8, %rdx
/* Everything up to %r8 in the destination has been copied; %rdx now holds
   the number of bytes left to copy.  Compute the matching source address
   in %r9.  */
	lea (%r8, %rsi), %r9
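
/* The tail copies below use overlapping head/tail chunks: for a remainder of
   n bytes (1..64) one chunk is loaded from the start and one from the end of
   the range, both loads happen before the stores, and the two stores together
   cover every byte even though they may overlap.  Roughly, in C-style
   pseudocode (illustrative only):

       load  a = s[0 .. chunk)      and  b = s[n-chunk .. n)
       store d[0 .. chunk) = a      and  d[n-chunk .. n) = b

   with chunk = 32, 16, 8, 4, 2 or 1 chosen from n.  */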

L(mm_remaining_0_64_bytes_forward):
	cmp $32, %rdx
	ja L(mm_remaining_33_64_bytes_forward)
	cmp $16, %rdx
	ja L(mm_remaining_17_32_bytes_forward)
	test %rdx, %rdx
	.p2align 4,,2
	je L(mm_return)

	cmpb $8, %dl
	ja L(mm_remaining_9_16_bytes_forward)
	cmpb $4, %dl
	.p2align 4,,5
	ja L(mm_remaining_5_8_bytes_forward)
	cmpb $2, %dl
	.p2align 4,,1
	ja L(mm_remaining_3_4_bytes_forward)
	movzbl -1(%r9,%rdx), %esi
	movzbl (%r9), %ebx
	movb %sil, -1(%r8,%rdx)
	movb %bl, (%r8)
	jmp L(mm_return)

L(mm_remaining_33_64_bytes_forward):
	movdqu (%r9), %xmm0
	movdqu 16(%r9), %xmm1
	movdqu -32(%r9, %rdx), %xmm2
	movdqu -16(%r9, %rdx), %xmm3
	movdqu %xmm0, (%r8)
	movdqu %xmm1, 16(%r8)
	movdqu %xmm2, -32(%r8, %rdx)
	movdqu %xmm3, -16(%r8, %rdx)
	jmp L(mm_return)

L(mm_remaining_17_32_bytes_forward):
	movdqu (%r9), %xmm0
	movdqu -16(%r9, %rdx), %xmm1
	movdqu %xmm0, (%r8)
	movdqu %xmm1, -16(%r8, %rdx)
	jmp L(mm_return)

L(mm_remaining_5_8_bytes_forward):
	movl (%r9), %esi
	movl -4(%r9,%rdx), %ebx
	movl %esi, (%r8)
	movl %ebx, -4(%r8,%rdx)
	jmp L(mm_return)

L(mm_remaining_9_16_bytes_forward):
	mov (%r9), %rsi
	mov -8(%r9, %rdx), %rbx
	mov %rsi, (%r8)
	mov %rbx, -8(%r8, %rdx)
	jmp L(mm_return)

L(mm_remaining_3_4_bytes_forward):
	movzwl -2(%r9,%rdx), %esi
	movzwl (%r9), %ebx
	movw %si, -2(%r8,%rdx)
	movw %bx, (%r8)
	jmp L(mm_return)
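
/* Lengths of at most 16 bytes are dispatched on individual bits of %dl:
   testb $24 is non-zero for any length in [8..16] (bit 3 or bit 4 set),
   testb $4 then catches [4..7], testb $2 catches [2..3], and the remaining
   cases are 0 and 1 byte.  Each branch again stores an overlapping head and
   tail chunk.  */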

L(mm_len_0_16_bytes_forward):
	testb $24, %dl
	jne L(mm_len_9_16_bytes_forward)
	testb $4, %dl
	.p2align 4,,5
	jne L(mm_len_5_8_bytes_forward)
	test %rdx, %rdx
	.p2align 4,,2
	je L(mm_return)
	testb $2, %dl
	.p2align 4,,1
	jne L(mm_len_2_4_bytes_forward)
	movzbl -1(%rsi,%rdx), %ebx
	movzbl (%rsi), %esi
	movb %bl, -1(%rdi,%rdx)
	movb %sil, (%rdi)
	jmp L(mm_return)

L(mm_len_2_4_bytes_forward):
	movzwl -2(%rsi,%rdx), %ebx
	movzwl (%rsi), %esi
	movw %bx, -2(%rdi,%rdx)
	movw %si, (%rdi)
	jmp L(mm_return)

L(mm_len_5_8_bytes_forward):
	movl (%rsi), %ebx
	movl -4(%rsi,%rdx), %esi
	movl %ebx, (%rdi)
	movl %esi, -4(%rdi,%rdx)
	jmp L(mm_return)

L(mm_len_9_16_bytes_forward):
	mov (%rsi), %rbx
	mov -8(%rsi, %rdx), %rsi
	mov %rbx, (%rdi)
	mov %rsi, -8(%rdi, %rdx)
	jmp L(mm_return)

L(mm_recalc_len):
/* Compute in %rdx how many bytes are left to copy after
   the main loop stops.  */
	mov %rbx, %rdx
	sub %rdi, %rdx
/* The code for copying backwards.  */
L(mm_len_0_or_more_backward):

/* Now check the length.  The ranges [0..16], (16..32], (32..64], (64..128]
   and everything above 128 bytes are handled separately.  */
	cmp $16, %rdx
	jbe L(mm_len_0_16_bytes_backward)

	cmp $32, %rdx
	ja L(mm_len_32_or_more_backward)

/* Copy (16..32] bytes and return.  */
	movdqu (%rsi), %xmm0
	movdqu -16(%rsi, %rdx), %xmm1
	movdqu %xmm0, (%rdi)
	movdqu %xmm1, -16(%rdi, %rdx)
	jmp L(mm_return)

L(mm_len_32_or_more_backward):
	cmp $64, %rdx
	ja L(mm_len_64_or_more_backward)

/* Copy (32..64] bytes and return.  */
	movdqu (%rsi), %xmm0
	movdqu 16(%rsi), %xmm1
	movdqu -16(%rsi, %rdx), %xmm2
	movdqu -32(%rsi, %rdx), %xmm3
	movdqu %xmm0, (%rdi)
	movdqu %xmm1, 16(%rdi)
	movdqu %xmm2, -16(%rdi, %rdx)
	movdqu %xmm3, -32(%rdi, %rdx)
	jmp L(mm_return)

L(mm_len_64_or_more_backward):
	cmp $128, %rdx
	ja L(mm_len_128_or_more_backward)

/* Copy (64..128] bytes and return.  */
	movdqu (%rsi), %xmm0
	movdqu 16(%rsi), %xmm1
	movdqu 32(%rsi), %xmm2
	movdqu 48(%rsi), %xmm3
	movdqu -64(%rsi, %rdx), %xmm4
	movdqu -48(%rsi, %rdx), %xmm5
	movdqu -32(%rsi, %rdx), %xmm6
	movdqu -16(%rsi, %rdx), %xmm7
	movdqu %xmm0, (%rdi)
	movdqu %xmm1, 16(%rdi)
	movdqu %xmm2, 32(%rdi)
	movdqu %xmm3, 48(%rdi)
	movdqu %xmm4, -64(%rdi, %rdx)
	movdqu %xmm5, -48(%rdi, %rdx)
	movdqu %xmm6, -32(%rdi, %rdx)
	movdqu %xmm7, -16(%rdi, %rdx)
	jmp L(mm_return)

L(mm_len_128_or_more_backward):
/* Align the destination address.  The last 64 bytes of the source are
   loaded before any stores so that overlapping destination writes cannot
   clobber them.  */
	movdqu -16(%rsi, %rdx), %xmm0
	movdqu -32(%rsi, %rdx), %xmm1
	movdqu -48(%rsi, %rdx), %xmm2
	movdqu -64(%rsi, %rdx), %xmm3

	lea (%rdi, %rdx), %r9
	and $-64, %r9 /* r9 = aligned dst */

	mov %rsi, %r8
	sub %rdi, %r8 /* r8 = src - dst, diff */

	movdqu -16(%r9, %r8), %xmm4
	movdqu -32(%r9, %r8), %xmm5
	movdqu -48(%r9, %r8), %xmm6
	movdqu -64(%r9, %r8), %xmm7

	movdqu %xmm0, -16(%rdi, %rdx)
	movdqu %xmm1, -32(%rdi, %rdx)
	movdqu %xmm2, -48(%rdi, %rdx)
	movdqu %xmm3, -64(%rdi, %rdx)
	movdqa %xmm4, -16(%r9)
	movaps %xmm5, -32(%r9)
	movaps %xmm6, -48(%r9)
	movaps %xmm7, -64(%r9)
	lea -64(%r9), %r9

	lea 64(%rdi), %rbx
	and $-64, %rbx

	cmp %r9, %rbx
	jae L(mm_recalc_len)
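
/* Here %r9 is the lowest 64-byte-aligned destination address that has
   already been filled, %r8 holds the constant (src - dst), and %rbx is a
   64-byte-aligned bound derived from dst.  The loops below copy 64-byte
   blocks downwards while %rbx < %r9; whatever remains is finished through
   L(mm_recalc_len) and the short backward path.  */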

	cmp $SHARED_CACHE_SIZE_HALF, %rdx
	jae L(mm_large_page_loop_backward)

	.p2align 4
L(mm_main_loop_backward):

	prefetcht0 -128(%r9, %r8)

	movdqu -64(%r9, %r8), %xmm0
	movdqu -48(%r9, %r8), %xmm1
	movdqu -32(%r9, %r8), %xmm2
	movdqu -16(%r9, %r8), %xmm3
	movdqa %xmm0, -64(%r9)
	movaps %xmm1, -48(%r9)
	movaps %xmm2, -32(%r9)
	movaps %xmm3, -16(%r9)
	lea -64(%r9), %r9
	cmp %r9, %rbx
	jb L(mm_main_loop_backward)
	jmp L(mm_recalc_len)

/* Copy [0..16] and return.  */
L(mm_len_0_16_bytes_backward):
	testb $24, %dl
	jnz L(mm_len_9_16_bytes_backward)
	testb $4, %dl
	.p2align 4,,5
	jnz L(mm_len_5_8_bytes_backward)
	test %rdx, %rdx
	.p2align 4,,2
	je L(mm_return)
	testb $2, %dl
	.p2align 4,,1
	jne L(mm_len_3_4_bytes_backward)
	movzbl -1(%rsi,%rdx), %ebx
	movzbl (%rsi), %ecx
	movb %bl, -1(%rdi,%rdx)
	movb %cl, (%rdi)
	jmp L(mm_return)

L(mm_len_3_4_bytes_backward):
	movzwl -2(%rsi,%rdx), %ebx
	movzwl (%rsi), %ecx
	movw %bx, -2(%rdi,%rdx)
	movw %cx, (%rdi)
	jmp L(mm_return)
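
/* The next branch handles [8..16] bytes backward: it copies the last eight
   bytes, shrinks %rdx by eight and re-enters the dispatcher above to finish
   the remaining head of at most eight bytes.  */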

L(mm_len_9_16_bytes_backward):
	movl -4(%rsi,%rdx), %ebx
	movl -8(%rsi,%rdx), %ecx
	movl %ebx, -4(%rdi,%rdx)
	movl %ecx, -8(%rdi,%rdx)
	sub $8, %rdx
	jmp L(mm_len_0_16_bytes_backward)

L(mm_len_5_8_bytes_backward):
	movl (%rsi), %ebx
	movl -4(%rsi,%rdx), %ecx
	movl %ebx, (%rdi)
	movl %ecx, -4(%rdi,%rdx)

L(mm_return):
	RETURN

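/* Both large-copy loops below use movntdq non-temporal stores, which bypass
   the caches so that very large copies do not evict useful data.  These
   stores are weakly ordered, so each loop ends with sfence to make them
   globally visible before the code falls back to ordinary stores and the
   final return.  */
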
/* Forward copy path for large lengths.  */

	.p2align 4
L(mm_large_page_loop_forward):
	movdqu (%r8, %rsi), %xmm0
	movdqu 16(%r8, %rsi), %xmm1
	movdqu 32(%r8, %rsi), %xmm2
	movdqu 48(%r8, %rsi), %xmm3
	movntdq %xmm0, (%r8)
	movntdq %xmm1, 16(%r8)
	movntdq %xmm2, 32(%r8)
	movntdq %xmm3, 48(%r8)
	lea 64(%r8), %r8
	cmp %r8, %rbx
	ja L(mm_large_page_loop_forward)
	sfence
	jmp L(mm_copy_remaining_forward)

/* Backward copy path for large lengths.  */
	.p2align 4
L(mm_large_page_loop_backward):
	movdqu -64(%r9, %r8), %xmm0
	movdqu -48(%r9, %r8), %xmm1
	movdqu -32(%r9, %r8), %xmm2
	movdqu -16(%r9, %r8), %xmm3
	movntdq %xmm0, -64(%r9)
	movntdq %xmm1, -48(%r9)
	movntdq %xmm2, -32(%r9)
	movntdq %xmm3, -16(%r9)
	lea -64(%r9), %r9
	cmp %r9, %rbx
	jb L(mm_large_page_loop_backward)
	sfence
	jmp L(mm_recalc_len)

END (MEMMOVE)

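/* memcpy is exported as an alias of the same code; because the routine
   handles overlapping buffers, the alias also satisfies memcpy's stricter
   non-overlapping contract.  */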
ALIAS_SYMBOL(memcpy, MEMMOVE)