GitHub Repository: freebsd/freebsd-src
Path: blob/main/lib/libc/amd64/string/memccpy.S

/*
 * Copyright (c) 2023, 2024 The FreeBSD Foundation
 *
 * This software was developed by Robert Clausecker <[email protected]>
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE
 */

#include <machine/asm.h>

#include "amd64_archlevel.h"

#define ALIGN_TEXT .p2align 4, 0x90

.weak memccpy
.set memccpy, __memccpy
ARCHFUNCS(__memccpy)
	ARCHFUNC(__memccpy, scalar)
	ARCHFUNC(__memccpy, baseline)
ENDARCHFUNCS(__memccpy)

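/*
 * memccpy(dst, src, c, len) copies bytes from src to dst, stopping once
 * the first occurrence of c has been copied or once len bytes have been
 * copied, whichever comes first.  It returns a pointer to the byte in dst
 * just past the copied c, or NULL if c was not found.  A minimal C sketch
 * of these semantics (for orientation only, not the actual libc source):
 *
 *	void *
 *	memccpy(void *dst, const void *src, int c, size_t len)
 *	{
 *		unsigned char *d = dst;
 *		const unsigned char *s = src;
 *
 *		while (len-- > 0)
 *			if ((*d++ = *s++) == (unsigned char)c)
 *				return (d);
 *		return (NULL);
 *	}
 *
 * The scalar variant below locates c with __memchr() and then bulk-copies
 * with memcpy(); the baseline variant further down performs the search and
 * the copy itself using SSE instructions.
 */
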
ARCHENTRY(__memccpy, scalar)
	push %rbp # establish stack frame
	mov %rsp, %rbp
	push %rax # dummy push for alignment
	push %rbx
	push %rdi
	push %rsi

	mov %rsi, %rdi
	mov %edx, %esi
	mov %rcx, %rdx
	mov %rcx, %rbx
	call CNAME(__memchr) # ptr = memchr(src, c, len)

	pop %rsi
	pop %rdi
	lea 1(%rax), %rdx
	sub %rsi, %rdx # size = ptr - src + 1
	mov %rbx, %rcx
	lea (%rdi, %rdx, 1), %rbx # res = dest + size
	test %rax, %rax # if (ptr == NULL)
	cmovz %rcx, %rdx # size = len
	cmovz %rax, %rbx # res = NULL
	call CNAME(memcpy)

	mov %rbx, %rax # return (res)
	pop %rbx
	leave
	ret
ARCHEND(__memccpy, scalar)

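/*
 * The baseline variant processes the source in aligned 16-byte chunks:
 * c is broadcast to every lane of an XMM register, pcmpeqb compares a
 * whole chunk against it, pmovmskb condenses the comparison result into
 * a bit mask and tzcnt locates the first match.  A hedged C sketch of
 * that per-chunk search step, written with SSE2 intrinsics purely for
 * illustration (the function name first_match is not part of this file):
 *
 *	#include <immintrin.h>
 *
 *	// index of the first occurrence of c in the 16 bytes at p,
 *	// or 16 if c does not occur in the chunk
 *	static int
 *	first_match(const void *p, unsigned char c)
 *	{
 *		__m128i chunk = _mm_loadu_si128((const __m128i *)p);
 *		__m128i cc = _mm_set1_epi8((char)c);
 *		unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi8(chunk, cc));
 *
 *		return (mask == 0 ? 16 : __builtin_ctz(mask));
 *	}
 *
 * The unaligned head and the tail of the buffer are handled separately
 * with possibly overlapping loads and stores.
 */
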
ARCHENTRY(__memccpy, baseline)
	sub $1, %rcx # RCX refers to last character in buffer
	jb .L0 # go to special code path if len was 0

	movd %edx, %xmm4
	mov %rcx, %rdx
	punpcklbw %xmm4, %xmm4 # c -> cc
	mov %esi, %ecx
	punpcklwd %xmm4, %xmm4 # cc -> cccc
	mov %rsi, %r9 # stash a copy of the source pointer for later
	pshufd $0, %xmm4, %xmm4 # cccc -> cccccccccccccccc
	and $~0xf, %rsi
	movdqa %xmm4, %xmm1
	pcmpeqb (%rsi), %xmm1 # c found in head?
	and $0xf, %ecx
	mov $-1, %eax
	pmovmskb %xmm1, %r8d
	lea -32(%rcx), %r11
	shl %cl, %eax # mask of bytes in the string
	add %rdx, %r11 # distance from alignment boundary - 32
	jnc .Lrunt # jump if buffer length is 32 or less

	and %r8d, %eax
	jz 0f # match (or induced match) found?

	/* match in first chunk */
	tzcnt %eax, %edx # where is c?
	sub %ecx, %edx # ... from the beginning of the string?
	lea 1(%rdi, %rdx, 1), %rax # return value
	jmp .L0116

0:	movdqa 16(%rsi), %xmm3 # load second string chunk
	movdqu (%r9), %xmm2 # load unaligned string head
	movdqa %xmm4, %xmm1
	pcmpeqb %xmm3, %xmm1 # c found in second chunk?

	/* process second chunk */
	pmovmskb %xmm1, %eax
	test %eax, %eax
	jz 0f

	/* match in second chunk */
	tzcnt %eax, %edx # where is c?
	sub $16, %ecx
	sub %ecx, %edx # adjust for alignment offset
	lea 1(%rdi, %rdx, 1), %rax # return value
	jmp .L0132

	/* c not found in second chunk: prepare for main loop */
0:	movdqa 32(%rsi), %xmm0 # load next string chunk
	movdqa %xmm4, %xmm1
	movdqu %xmm2, (%rdi) # deposit head into buffer
	sub %rcx, %rdi # adjust RDI to correspond to RSI
	mov %r11, %rdx
	movdqu %xmm3, 16(%rdi) # deposit second chunk
	sub %rsi, %rdi # express RDI as distance from RSI
	add $32, %rsi # advance RSI past first two chunks
	sub $16, %rdx # enough left for another round?
	jb 1f

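	/*
	 * From here on RDI no longer holds the destination pointer but the
	 * distance from source to destination, so (%rsi, %rdi) addresses
	 * the output while RSI alone walks the aligned source chunks; RDX
	 * counts down the remaining buffer space and drives the loop exit
	 * tests.
	 */
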
	/* main loop unrolled twice */
	ALIGN_TEXT
0:	pcmpeqb %xmm0, %xmm1 # c encountered?
	pmovmskb %xmm1, %eax
	test %eax, %eax
	jnz 3f

	movdqu %xmm0, (%rsi, %rdi)
	movdqa 16(%rsi), %xmm0 # load next string chunk
	movdqa %xmm4, %xmm1
	cmp $16, %rdx # more than a full chunk left?
	jb 2f

	add $32, %rsi # advance pointers to next chunk
	pcmpeqb %xmm0, %xmm1 # c encountered?
	pmovmskb %xmm1, %eax
	test %eax, %eax
	jnz 4f

	movdqu %xmm0, -16(%rsi, %rdi)
	movdqa (%rsi), %xmm0 # load next string chunk
	movdqa %xmm4, %xmm1
	sub $32, %rdx
	jae 0b

1:	sub $16, %rsi # undo second advancement
	add $16, %edx

	/* 1--16 bytes left in the buffer but string has not ended yet */
2:	pcmpeqb %xmm1, %xmm0 # c encountered?
	pmovmskb %xmm0, %r8d
	mov %r8d, %ecx
	bts %edx, %r8d # treat end of buffer as end of string
	tzcnt %r8d, %r8d # find tail length
	add %rsi, %rdi # restore RDI
	movdqu 1(%rsi, %r8, 1), %xmm0 # load string tail
	movdqu %xmm0, 1(%rdi, %r8, 1) # store string tail
	lea 17(%rdi, %r8, 1), %rsi # return value if terminator encountered
	xor %eax, %eax # return value if no terminator encountered
	bt %r8d, %ecx # terminator encountered inside buffer?
	cmovc %rsi, %rax # if yes, return pointer, else NULL
	ret

4:	sub $16, %rsi # undo second advancement

	/* terminator found and buffer has not ended yet */
3:	tzcnt %eax, %eax # find length of string tail
	movdqu -15(%rsi, %rax, 1), %xmm0 # load string tail (incl. c)
	add %rsi, %rdi # restore destination pointer
	movdqu %xmm0, -15(%rdi, %rax, 1) # store string tail (incl. c)
	lea 1(%rdi, %rax, 1), %rax # compute return value
	ret

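	/*
	 * The short-buffer path below reuses the trick already used at
	 * label 2 above: bts sets the mask bit that corresponds to the end
	 * of the buffer, so a single tzcnt yields the position of the
	 * first c or the buffer length, whichever is smaller, while a
	 * saved copy of the unmodified mask tells the two cases apart.
	 */
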
	/* buffer is 1--32 bytes in size */
	ALIGN_TEXT
.Lrunt:	add $32, %r11d # undo earlier decrement
	mov %r8d, %r10d # keep a copy of the original match mask
	bts %r11d, %r8d # induce match at buffer end
	and %ax, %r8w # is there a match in the first 16 bytes?
	jnz 0f # if yes, skip looking at second chunk

	pcmpeqb 16(%rsi), %xmm4 # check for match in second chunk
	pmovmskb %xmm4, %r8d
	shl $16, %r8d # place second chunk matches in bits 16--31
	mov %r8d, %r10d # keep a copy of the original match mask
	bts %r11d, %r8d # induce a match at buffer end

0:	xor %eax, %eax # return value if terminator not found
	tzcnt %r8d, %edx # find string/buffer length from alignment boundary
	lea 1(%rdi, %rdx, 1), %r8 # return value if terminator found + rcx
	sub %rcx, %r8
	bt %edx, %r10d # was the terminator present?
	cmovc %r8, %rax # if yes, return pointer, else NULL
	sub %ecx, %edx # find actual string/buffer length

	ALIGN_TEXT
.L0132:	cmp $16, %rdx # at least 17 bytes to copy?
	jb .L0116

	/* copy 17--32 bytes */
	movdqu (%r9), %xmm0 # load first 16 bytes
	movdqu -15(%r9, %rdx, 1), %xmm1 # load last 16 bytes
	movdqu %xmm0, (%rdi)
	movdqu %xmm1, -15(%rdi, %rdx, 1)
	ret

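	/*
	 * The short copy routines below avoid byte loops: the first and
	 * the last word of the block are each loaded and stored with the
	 * widest register that fits, and for in-between lengths the two
	 * moves simply overlap.  RDX is used as the offset of the last
	 * byte to copy.  A hedged C sketch of the idea for the 9--16 byte
	 * case (illustration only, not part of this file):
	 *
	 *	#include <string.h>
	 *
	 *	// copy n bytes, 8 <= n <= 16, using two 8-byte moves
	 *	static void
	 *	copy8to16(unsigned char *dst, const unsigned char *src,
	 *	    size_t n)
	 *	{
	 *		unsigned long long head, tail;
	 *
	 *		memcpy(&head, src, 8);
	 *		memcpy(&tail, src + n - 8, 8);
	 *		memcpy(dst, &head, 8);
	 *		memcpy(dst + n - 8, &tail, 8);
	 *	}
	 */
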
	/* process strings of 1--16 bytes (rdx: min(buflen, srclen), rax: srclen) */
	ALIGN_TEXT
.L0116:	cmp $8, %rdx # at least 9 bytes to copy?
	jae .L0916

	cmp $4, %rdx # at least 5 bytes to copy?
	jae .L0508

	cmp $2, %rdx # at least 3 bytes to copy?
	jae .L0304

	/* copy one or two bytes */
	movzbl (%r9), %ecx # load first byte from src
	movzbl (%r9, %rdx, 1), %esi # load last byte from src
	mov %cl, (%rdi) # deposit into destination
	mov %sil, (%rdi, %rdx, 1)
	ret

.L0304:	movzwl (%r9), %ecx
	movzwl -1(%r9, %rdx, 1), %esi
	mov %cx, (%rdi)
	mov %si, -1(%rdi, %rdx, 1)
	ret

.L0508:	mov (%r9), %ecx
	mov -3(%r9, %rdx, 1), %esi
	mov %ecx, (%rdi)
	mov %esi, -3(%rdi, %rdx, 1)
	ret

.L0916:	mov (%r9), %rcx
	mov -7(%r9, %rdx, 1), %rsi
	mov %rcx, (%rdi)
	mov %rsi, -7(%rdi, %rdx, 1)
	ret

	/* length zero destination: return null pointer */
.L0:	xor %eax, %eax
	ret
ARCHEND(__memccpy, baseline)

	.section .note.GNU-stack,"",%progbits