/*
 * Origin: GitHub repository freebsd/freebsd-src (branch main),
 * path lib/libc/amd64/string/stpncpy.S
 */
/*
2
* Copyright (c) 2023 The FreeBSD Foundation
3
*
4
* This software was developed by Robert Clausecker <fuz@FreeBSD.org>
5
* under sponsorship from the FreeBSD Foundation.
6
*
7
* Redistribution and use in source and binary forms, with or without
8
* modification, are permitted provided that the following conditions
9
* are met:
10
* 1. Redistributions of source code must retain the above copyright
11
* notice, this list of conditions and the following disclaimer.
12
* 2. Redistributions in binary form must reproduce the above copyright
13
* notice, this list of conditions and the following disclaimer in the
14
* documentation and/or other materials provided with the distribution.
15
*
16
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
17
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26
* SUCH DAMAGE
27
*/
28
29
#include <machine/asm.h>
30
31
#include "amd64_archlevel.h"
32
33
/* Align the next instruction to 16 bytes (2^4), padding with 0x90 bytes. */
#define ALIGN_TEXT .p2align 4, 0x90
34
35
/*
 * Public symbol: stpncpy is a weak alias for __stpncpy.
 */
	.set	stpncpy, __stpncpy
	.weak	stpncpy

/*
 * Implementations of __stpncpy provided by this file.  The ARCHFUNCS
 * family of macros comes from amd64_archlevel.h; presumably it builds
 * the table from which one variant is picked at run time (confirm
 * against that header).  The order of the entries is kept as is.
 */
ARCHFUNCS(__stpncpy)
	ARCHFUNC(__stpncpy, scalar)
	ARCHFUNC(__stpncpy, baseline)
ENDARCHFUNCS(__stpncpy)
41
42
/*
 * char *stpncpy(char *restrict dst, const char *src, size_t len)
 *
 * Scalar variant, built from __memchr, memcpy and memset.
 *
 * In:	rdi = dst, rsi = src, rdx = len
 * Out:	rax = address of the first NUL written to dst, or dst + len
 *	if no NUL was found within the first len bytes of src
 *
 * Frame layout while the frame is live:
 *	 -8(%rbp)	len  (later: len minus string length)
 *	-16(%rbp)	dst  (later: dst plus string length)
 *	-24(%rbp)	src
 *	-32(%rbp)	padding so that %rsp is 16-byte aligned at the
 *			call to __memchr
 */
ARCHENTRY(__stpncpy, scalar)
	push	%rbp			# set up frame pointer
	mov	%rsp, %rbp

	push	%rdx			# save len
	push	%rdi			# save dst
	push	%rsi			# save src
	push	%rax			# padding only, value is irrelevant

	mov	%rsi, %rdi		# arg 1: src
	xor	%esi, %esi		# arg 2: NUL; arg 3 (rdx) is still len
	call	CNAME(__memchr)		# rax = memchr(src, '\0', len)
	pop	%rcx			# discard padding slot
	mov	-16(%rbp), %rdi		# rdi = dst
	pop	%rsi			# rsi = src

	test	%rax, %rax		# no NUL within len bytes?
	jz	.Lnonul

	/* NUL found: copy the string proper, then zero the rest of dst */
	mov	%rax, %rdx
	sub	%rsi, %rdx		# rdx = distance from src to the NUL
	sub	%rdx, -8(%rbp)		# saved len -= string length
	add	%rdx, -16(%rbp)		# saved dst += string length
	call	CNAME(memcpy)		# memcpy(dst, src, string length)

	xor	%esi, %esi		# fill byte 0
	pop	%rdi			# rdi = dst + string length
	pop	%rdx			# rdx = len - string length
	pop	%rbp			# frame is gone, only return address left
	jmp	CNAME(memset)		# tail call; memset returns to our caller

	/* no NUL found: copy exactly len bytes, nothing to pad */
.Lnonul:
	mov	-8(%rbp), %rdx		# rdx = len
	call	CNAME(memcpy)		# rax = memcpy(dst, src, len)
	add	-8(%rbp), %rax		# rax = dst + len
	leave
	ret
ARCHEND(__stpncpy, scalar)
80
81
/*
 * Mask table: 16 bytes of 0xff followed by 16 bytes of 0x00.
 * A 16-byte load from .Lmask+n (0 <= n <= 16) yields 16-n bytes of
 * 0xff followed by n bytes of 0x00.
 */
	.section .rodata
.Lmask:	.fill	16, 1, 0xff
	.fill	16, 1, 0x00
90
91
/*
 * char *stpncpy(char *restrict dst, const char *src, size_t len)
 *
 * SSE2 variant working in 16-byte chunks.
 *
 * In:	rdi = dst, rsi = src, rdx = len
 * Out:	rax = address of the first NUL written to dst, or dst + len
 *	if no NUL was found within the first len bytes of src;
 *	dst itself if len is 0
 *
 * Register roles after the common setup:
 *	rsi	src rounded down to a 16-byte boundary
 *	rcx	src & 15, the offset of src[0] within its chunk
 *	rdi	dst - rcx, so that rdi and rsi use the same offsets
 *	r10	len + rcx, the buffer length counted from aligned rsi
 *	r9d	-1 << rcx, selecting the chunk bytes at or after src[0]
 *	r8d	byte mask of NULs in the first chunk
 *	xmm1	all zeroes
 *
 * This function makes no calls and never moves %rsp; its scratch area
 * ("bounce") lives below %rsp, i.e. it relies on the ABI red zone.
 * With %rsp being 8 modulo 16 on entry, bounce(%rsp) is 16-byte
 * aligned as the movdqa stores to it require.
 *
 * Every tzcnt below is given a non-zero operand.
 */
ARCHENTRY(__stpncpy, baseline)
#define bounce		(-3*16-8)		/* location of on-stack bounce buffer */
	test		%rdx, %rdx		# len == 0?
	jz		.Lempty

	mov		%esi, %ecx
	and		$~0xf, %rsi		# RSI = src rounded down to 16 bytes
	movdqa		(%rsi), %xmm0		# aligned chunk holding src[0]
	and		$0xf, %ecx		# RCX = misalignment of src
	mov		$-1, %r9d
	lea		-33(%rcx), %rax		# adding RDX carries iff RDX+RCX > 32
	shl		%cl, %r9d		# R9D = bits for bytes at/after src[0]
	sub		%rcx, %rdi		# bias RDI to match aligned RSI
	pxor		%xmm1, %xmm1		# XMM1 = 0
	movdqa		%xmm0, bounce(%rsp)	# keep a copy of the first chunk
	pcmpeqb		%xmm1, %xmm0
	pmovmskb	%xmm0, %r8d		# R8D = NUL positions in first chunk

	lea		(%rdx, %rcx, 1), %r10	# R10 = len counted from aligned RSI
	add		%rdx, %rax		# RDX + RCX <= 32?
	jnc		.Lrunt			# then go through the bounce buffer

	/* from here on: more than 32 bytes counted from the aligned source */
	movdqu		%xmm1, -16(%rdi, %r10, 1) # pre-clear last 16 bytes of dst
	and		%r9d, %r8d		# NUL within the string part of chunk?
	jnz		.Lheadnul

	movdqu		(%rsi, %rcx, 1), %xmm2	# unaligned load of src[0..15]
	movdqu		%xmm2, (%rdi, %rcx, 1)	# and deposit at dst[0..15]

	add		$16, %rsi		# move on to the second chunk
	add		$16, %rdi
	sub		$32, %r10		# R10 = buffer bytes past that chunk

	/*
	 * Main loop, two chunks per iteration.  On entry to .Lloop,
	 * R10 is the (positive) number of buffer bytes that follow
	 * the chunk at RSI.
	 */
	ALIGN_TEXT
.Lloop:	movdqa		(%rsi), %xmm0
	pxor		%xmm1, %xmm1
	pcmpeqb		%xmm0, %xmm1		# look for NUL in this chunk
	pmovmskb	%xmm1, %r8d
	test		%r8d, %r8d
	jnz		.Lstrend

	movdqu		%xmm0, (%rdi)		# no NUL: copy chunk as is
	cmp		$16, %r10		# at most one chunk of buffer left?
	jbe		.Ltail

	movdqa		16(%rsi), %xmm0		# second chunk of this iteration
	add		$32, %rdi		# step both pointers by two chunks
	add		$32, %rsi
	pxor		%xmm1, %xmm1
	pcmpeqb		%xmm0, %xmm1		# look for NUL in this chunk
	pmovmskb	%xmm1, %r8d
	test		%r8d, %r8d
	jnz		.Lstrend2

	movdqu		%xmm0, -16(%rdi)	# no NUL: copy chunk as is
	sub		$32, %r10		# buffer extends past the next chunk?
	ja		.Lloop

	sub		$16, %rdi		# point back at the chunk just copied
	sub		$16, %rsi
	add		$16, %r10d		# R10 = bytes left after it (1--16)

	/*
	 * 1--16 buffer bytes remain after the chunk at RSI and the
	 * string has not ended yet.  Copy a 16-byte window that ends
	 * right before the NUL or the end of the buffer, whichever
	 * comes first.
	 */
.Ltail:	pxor		%xmm1, %xmm1
	pcmpeqb		16(%rsi), %xmm1		# NUL positions in the next chunk
	pmovmskb	%xmm1, %r8d
	bts		%r10d, %r8d		# end of buffer counts as a NUL
	tzcnt		%r8d, %r8d		# R8 = offset of the first stop
	movdqu		(%rsi, %r8, 1), %xmm0	# 16 source bytes ending before it
	lea		16(%rdi, %r8, 1), %rax	# return NUL position or dst + len
	movdqu		%xmm0, (%rdi, %r8, 1)	# store them at the same offset
	ret

	/* NUL found in the second chunk of an iteration */
.Lstrend2:
	sub		$16, %rdi		# point back at that chunk
	sub		$16, %rsi
	sub		$16, %r10		# R10 = buffer bytes past that chunk

	/*
	 * The string ends in the chunk held in XMM0 (destination RDI),
	 * the buffer does not.
	 */
.Lstrend:
	tzcnt		%r8d, %r8d		# R8 = offset of NUL within chunk
	lea		.Lmask+16(%rip), %rcx
	lea		(%rdi, %r8, 1), %rax	# return address of NUL in dst
	neg		%r8
	movdqu		(%rcx, %r8, 1), %xmm1	# 0xff for string bytes, 0x00 after
	pand		%xmm1, %xmm0		# zero everything behind the string
	movdqu		%xmm0, (%rdi)		# store the masked chunk
	pxor		%xmm1, %xmm1		# XMM1 = 0 again for the fill loop
	sub		$16, %r10		# more than one chunk left to clear?
	jbe		.Lfilldone

	/* zero the rest of dst; its last 16 bytes were cleared up front */
	ALIGN_TEXT
.Lfill:	movdqu		%xmm1, 16(%rdi)
	cmp		$16, %r10
	jbe		.Lfilldone

	movdqu		%xmm1, 32(%rdi)
	add		$32, %rdi
	sub		$32, %r10
	ja		.Lfill

.Lfilldone:
	ret

	/* more than 32 bytes of buffer, but NUL already in the first chunk */
.Lheadnul:
	movdqu		bounce(%rsp, %rcx, 1), %xmm0 # string start from stack copy
	tzcnt		%r8d, %r8d		# R8 = offset of NUL within chunk
	movdqu		%xmm0, (%rdi, %rcx, 1)	# write it to dst[0..15]
	movdqu		%xmm1, (%rdi, %r8, 1)	# zero 16 bytes from the NUL on
	movdqu		%xmm1, 16(%rdi)		# zero the second chunk
	lea		(%rdi, %r8, 1), %rax	# return address of NUL in dst

	add		$32, %rdi		# first two chunks are done
	sub		$32+16, %r10		# R10 = buffer bytes past chunk three
	jbe		.Lheadfilldone		# none: pre-cleared tail covers rest

	/* zero the rest of dst; its last 16 bytes were cleared up front */
	ALIGN_TEXT
.Lheadfill:
	movdqu		%xmm1, (%rdi)
	cmp		$16, %r10
	jbe		.Lheadfilldone

	movdqu		%xmm1, 16(%rdi)
	add		$32, %rdi
	sub		$32, %r10
	ja		.Lheadfill

.Lheadfilldone:
	ret

	/*
	 * At most 32 bytes counted from the aligned source (so len is
	 * 1--32).  Build the zero-padded result in the bounce buffer,
	 * then copy len bytes from there to dst.
	 */
.Lrunt:	movdqa		%xmm1, bounce+16(%rsp)	# second half of bounce buffer = 0
	and		%r9d, %r8d		# ignore bytes in front of src[0]
	bts		%r10, %r8		# end of buffer counts as a NUL
	test		$0x1ffff, %r8d		# stop within or right after chunk 1?
	jnz		.Lruntend		# then leave the second chunk alone

	movdqa		16(%rsi), %xmm0		# second source chunk
	movdqa		%xmm0, bounce+16(%rsp)	# goes into second half of buffer
	pcmpeqb		%xmm1, %xmm0
	pmovmskb	%xmm0, %r9d		# NUL positions in second chunk
	shl		$16, %r9d
	or		%r9, %r8		# merge as bits 16--31 of R8

.Lruntend:
	tzcnt		%r8, %r8		# R8 = offset of NUL or buffer end
	movdqu		%xmm1, bounce(%rsp, %r8, 1) # zero 16 bytes from there on
	lea		bounce(%rsp, %rcx, 1), %rsi # RSI = string start in buffer
	lea		(%rdi, %r8, 1), %rax	# return NUL position or dst + len

	/* copy len bytes using two possibly overlapping accesses */
	cmp		$16, %edx		# len >= 16?
	jae		.L1631

	mov		(%rsi), %r8		# first 8 bytes of the staged string
	cmp		$8, %edx		# len >= 8?
	jae		.L0815

	cmp		$4, %edx		# len >= 4?
	jae		.L0407

	movzwl		-2(%rsi, %rdx, 1), %esi	# last two staged bytes
	mov		%r8b, (%rdi, %rcx, 1)	# dst[0]

	cmp		$2, %edx		# len == 1?
	jb		.Lruntret

	mov		%si, -2(%rdi, %r10, 1)	# dst[len-2], dst[len-1]
.Lruntret:
	ret

	/* len is 16--32 */
.L1631:	movdqu		(%rsi), %xmm0		# first 16 staged bytes
	movdqu		-16(%rsi, %rdx, 1), %xmm1 # last 16 staged bytes
	movdqu		%xmm0, (%rdi, %rcx, 1)
	movdqu		%xmm1, -16(%rdi, %r10, 1)
	ret

	/* len is 8--15 */
.L0815:	mov		-8(%rsi, %rdx, 1), %rdx	# last 8 staged bytes
	mov		%r8, (%rdi, %rcx, 1)
	mov		%rdx, -8(%rdi, %r10, 1)
	ret

	/* len is 4--7 */
.L0407:	mov		-4(%rsi, %rdx, 1), %edx	# last 4 staged bytes
	mov		%r8d, (%rdi, %rcx, 1)
	mov		%edx, -4(%rdi, %r10, 1)
	ret

	/* len is 0: nothing to write, return dst */
.Lempty:
	mov		%rdi, %rax
	ret
ARCHEND(__stpncpy, baseline)
282
283
/* Empty .note.GNU-stack section: this object does not need an executable stack. */
.section .note.GNU-stack,"",%progbits
284
285