Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/lib/libc/amd64/string/stpncpy.S
39486 views
1
/*
2
* Copyright (c) 2023 The FreeBSD Foundation
3
*
4
* This software was developed by Robert Clausecker <[email protected]>
5
* under sponsorship from the FreeBSD Foundation.
6
*
7
* Redistribution and use in source and binary forms, with or without
8
* modification, are permitted provided that the following conditions
9
* are met:
10
* 1. Redistributions of source code must retain the above copyright
11
* notice, this list of conditions and the following disclaimer.
12
* 2. Redistributions in binary form must reproduce the above copyright
13
* notice, this list of conditions and the following disclaimer in the
14
* documentation and/or other materials provided with the distribution.
15
*
16
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
17
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26
* SUCH DAMAGE
27
*/
28
29
#include <machine/asm.h>
30
31
#include "amd64_archlevel.h"
32
33
/* Align to a 16 byte (2^4) boundary, padding with 0x90 (NOP) bytes; placed in front of the loop labels below. */
#define ALIGN_TEXT .p2align 4, 0x90
34
35
/* Export stpncpy as a weak symbol with the same value as __stpncpy. */
.weak stpncpy
36
.set stpncpy, __stpncpy
37
/*
 * Variant table for __stpncpy: a scalar and a baseline implementation
 * are registered, both defined further down in this file.
 * NOTE(review): ARCHFUNCS/ARCHFUNC/ENDARCHFUNCS are defined in
 * amd64_archlevel.h, which is not visible here; presumably they emit
 * the dispatch that selects one variant -- confirm against that header.
 */
ARCHFUNCS(__stpncpy)
38
ARCHFUNC(__stpncpy, scalar)
39
ARCHFUNC(__stpncpy, baseline)
40
ENDARCHFUNCS(__stpncpy)
41
42
/*
 * Scalar __stpncpy, composed from memchr, memcpy and memset.
 *
 * In:    RDI = dst, RSI = src, RDX = len
 * Out:   RAX = dst + len if no NUL byte was found in src[0..len),
 *        otherwise whatever the tail-called memset leaves in RAX
 *        (memset is called with dst + string length as its first argument).
 * Frame: -8(%rbp) = len, -16(%rbp) = dst, -24(%rbp) = src,
 *        -32(%rbp) = padding so that RSP is 16 byte aligned at the
 *        __memchr call (assuming the usual RSP = 8 mod 16 on entry).
 */
ARCHENTRY(__stpncpy, scalar)
43
push %rbp # establish stack frame
44
mov %rsp, %rbp
45
46
push %rdx # -8(%rbp): len
47
push %rdi # -16(%rbp): dst
48
push %rsi # -24(%rbp): src
49
push %rax # dummy push for alignment
50
51
mov %rsi, %rdi # first argument: src
52
xor %esi, %esi # second argument: 0; RDX still holds len
53
call CNAME(__memchr) # memchr(src, '\0', len)
54
pop %rcx # dummy pop
55
pop %rsi # RSI = src again
56
mov -16(%rbp), %rdi # RDI = dst; dst and len stay on the stack
57
58
test %rax, %rax # NUL found?
59
jz .Lfullcopy # no: all len bytes belong to the string
60
61
mov %rax, %rdx
62
sub %rsi, %rdx # copy until the NUL byte
63
add %rdx, -16(%rbp) # advance destination by string length
64
sub %rdx, -8(%rbp) # and shorten buffer size by string length
65
call CNAME(memcpy) # memcpy(dst, src, string length)
66
67
pop %rdi # RDI = dst + string length
68
pop %rdx # RDX = len - string length
69
xor %esi, %esi # fill byte 0
70
pop %rbp # frame is gone before the tail call
71
jmp CNAME(memset) # clear remaining buffer
72
73
.Lfullcopy:
74
mov -8(%rbp), %rdx # RDX = len
75
call CNAME(memcpy) # copy whole string
76
add -8(%rbp), %rax # point to dest[n]
77
leave # discards the saved dst/len slots and restores RBP
78
ret
79
ARCHEND(__stpncpy, scalar)
80
81
/*
82
* this mask allows us to generate masks of 16-n 0xff bytes
83
* followed by n 0x00 bytes by loading from .Lmask+n.
84
*/
85
/*
 * 32 byte table: 16 bytes of 0xff followed by 16 bytes of 0x00.
 * The baseline routine reads it with an unaligned 16 byte load at
 * .Lmask+16-k to obtain k 0xff bytes followed by 16-k 0x00 bytes.
 */
.section .rodata
86
.Lmask: .quad 0xffffffffffffffff # bytes 0--7: 0xff
87
.quad 0xffffffffffffffff # bytes 8--15: 0xff
88
.quad 0x0000000000000000 # bytes 16--23: 0x00
89
.quad 0x0000000000000000 # bytes 24--31: 0x00
90
91
/*
 * stpncpy(char *restrict rdi, const char *rsi, size_t rdx)
 *
 * SSE2 implementation working on 16 byte chunks of the source.
 *
 * In:       RDI = dst, RSI = src, RDX = len
 * Out:      RAX = address of the first NUL byte stored to dst, or
 *           dst + len if src[0..len) holds no NUL byte
 * Clobbers: RCX, RDX, RSI, RDI, R8--R10, XMM0--XMM2, flags
 * Stack:    no calls are made and RSP is never moved; a bounce buffer
 *           is kept below RSP (see "bounce").
 *
 * Source chunks are loaded only if they hold at least one byte of
 * src[0..len) that precedes any NUL byte seen so far.  A chunk lying
 * entirely behind the end of the buffer is never touched, as it may be
 * located on an unmapped page.
 */
ARCHENTRY(__stpncpy, baseline)
	/*
	 * The bounce buffer proper is bounce(%rsp)..bounce+31(%rsp).
	 * With RSP = 8 (mod 16) on entry it is 16 byte aligned, as the
	 * movdqa accesses to it require.  16 byte stores may begin at any
	 * offset 0--31 and so reach up to bounce+46, still below RSP.
	 */
#define bounce		(-3*16-8)		/* location of on-stack bounce buffer */

	test		%rdx, %rdx		# no bytes to copy?
	jz		.L0

	mov		%esi, %ecx
	and		$~0xf, %rsi		# align source to 16 bytes
	movdqa		(%rsi), %xmm0		# load head
	and		$0xf, %ecx		# offset from alignment
	mov		$-1, %r9d
	lea		-32(%rcx), %rax		# set up overflow-proof comparison rdx+rcx<32
	shl		%cl, %r9d		# mask of bytes belonging to the string
	sub		%rcx, %rdi		# adjust RDI to correspond to RSI
	pxor		%xmm1, %xmm1
	movdqa		%xmm0, bounce(%rsp)	# stash copy of head on the stack
	pcmpeqb		%xmm1, %xmm0
	pmovmskb	%xmm0, %r8d		# NUL bytes in head chunk, including
						# those in front of the string

	lea		(%rdx, %rcx, 1), %r10	# buffer length from alignment boundary
	add		%rdx, %rax		# carry iff offset + len >= 32
	jnc		.Lrunt			# less than 2 chunks: runt processing

	/* here R10 >= 32 and hence len >= 17 */
	movdqu		%xmm1, -16(%rdi, %r10, 1) # clear final bytes of destination
	and		%r9d, %r8d		# end of string within head?
	jnz		.Lheadnul

	movdqu		(%rsi, %rcx, 1), %xmm2	# load head from source buffer
	movdqu		%xmm2, (%rdi, %rcx, 1)	# and deposit

	add		$16, %rsi
	add		$16, %rdi
	sub		$32, %r10

	/*
	 * Main loop unrolled twice.
	 * At label 0, RSI/RDI address the current chunk, which lies
	 * entirely inside the buffer, and R10 is the number of buffer
	 * bytes following that chunk (RDI + 16 + R10 = dst + len).
	 * R10 is 0 only on the first pass, if offset + len == 32.
	 */
	ALIGN_TEXT
0:	movdqa		(%rsi), %xmm0
	pxor		%xmm1, %xmm1
	pcmpeqb		%xmm0, %xmm1		# NUL byte encountered?
	pmovmskb	%xmm1, %r8d
	test		%r8d, %r8d
	jnz		3f

	movdqu		%xmm0, (%rdi)
	cmp		$16, %r10		# more than a full chunk left?
	jbe		1f

	movdqa		16(%rsi), %xmm0
	add		$32, %rdi		# advance pointers to next chunk
	add		$32, %rsi
	pxor		%xmm1, %xmm1
	pcmpeqb		%xmm0, %xmm1		# NUL byte encountered?
	pmovmskb	%xmm1, %r8d
	test		%r8d, %r8d
	jnz		2f

	movdqu		%xmm0, -16(%rdi)
	sub		$32, %r10		# more than another full chunk left?
	ja		0b

	sub		$16, %rdi		# undo second advancement
	sub		$16, %rsi
	add		$16, %r10d		# restore number of remaining bytes (1--16)

	/*
	 * 0--16 bytes left but string has not ended yet.
	 * If nothing is left, the buffer ends exactly at the end of the
	 * chunk just stored; the following source chunk is then outside
	 * of the buffer and must not be read.
	 */
1:	test		%r10, %r10		# any bytes left past this chunk?
	jz		.Lbufend		# if not, we are done
	pxor		%xmm1, %xmm1
	pcmpeqb		16(%rsi), %xmm1		# NUL byte in source tail?
	pmovmskb	%xmm1, %r8d
	bts		%r10d, %r8d		# treat end of buffer as NUL
	tzcnt		%r8d, %r8d		# where is the NUL byte?
	movdqu		(%rsi, %r8, 1), %xmm0	# load source tail before NUL
	lea		16(%rdi, %r8, 1), %rax	# point return value to NUL byte
						# or end of buffer
	movdqu		%xmm0, (%rdi, %r8, 1)	# store tail into the buffer
	ret

	/* buffer ended on a chunk boundary and no NUL byte was found */
.Lbufend:
	lea		16(%rdi), %rax		# return dst + len
	ret

2:	sub		$16, %rdi		# undo second advancement
	sub		$16, %rsi
	sub		$16, %r10

	/* string has ended and buffer has not */
3:	tzcnt		%r8d, %r8d		# where did the string end?
	lea		.Lmask+16(%rip), %rcx
	lea		(%rdi, %r8, 1), %rax	# where the NUL byte will be
	neg		%r8
	movdqu		(%rcx, %r8, 1), %xmm1	# mask with FF where the string is,
						# 00 where it is not
	pand		%xmm1, %xmm0		# mask out bytes after the string
	movdqu		%xmm0, (%rdi)		# store masked current chunk
	pxor		%xmm1, %xmm1
	sub		$16, %r10		# another full chunk left?
	jbe		1f

	/* clear remaining destination buffer (tail has been cleared earlier) */
	ALIGN_TEXT
0:	movdqu		%xmm1, 16(%rdi)
	cmp		$16, %r10
	jbe		1f

	movdqu		%xmm1, 32(%rdi)
	add		$32, %rdi
	sub		$32, %r10
	ja		0b

1:	ret

	/* at least two chunks to play with and NUL while processing head */
.Lheadnul:
	/*
	 * The 16 bytes taken from the stack extend past the stashed head
	 * chunk; whatever lies behind it ends up behind the NUL byte in
	 * the destination and is wiped by the two clearing stores below.
	 */
	movdqu		bounce(%rsp, %rcx, 1), %xmm0 # load start of source from stack
	tzcnt		%r8d, %r8d		# find location of NUL byte
	movdqu		%xmm0, (%rdi, %rcx, 1)	# deposit head in the destination
	movdqu		%xmm1, (%rdi, %r8, 1)	# clear out following bytes
	movdqu		%xmm1, 16(%rdi)		# clear out second chunk
	lea		(%rdi, %r8, 1), %rax	# make RAX point to the NUL byte

	add		$32, %rdi		# advance past first two chunks
	sub		$32+16, %r10		# advance past first three chunks
	jbe		1f			# did we pass the end of the buffer?

	/* clear remaining destination buffer (tail has been cleared earlier) */
	ALIGN_TEXT
0:	movdqu		%xmm1, (%rdi)		# clear out buffer chunk
	cmp		$16, %r10
	jbe		1f

	movdqu		%xmm1, 16(%rdi)
	add		$32, %rdi
	sub		$32, %r10
	ja		0b

1:	ret

	/*
	 * offset + len < 32, i.e. 1--31 bytes to copy: bounce through the stack.
	 * R8D collects a bit mask over both chunks in which the lowest
	 * set bit marks the end of the string or of the buffer, whichever
	 * comes first.
	 */
.Lrunt:	movdqa		%xmm1, bounce+16(%rsp)	# clear out rest of on-stack copy
	bts		%r10d, %r8d		# treat end of buffer as end of string
	and		%r9d, %r8d		# discard NUL bytes in front of the string
	/*
	 * Bit 16 is part of the test: if the buffer ends exactly at the
	 * end of the first chunk, the second chunk lies outside of the
	 * buffer and must not be read.
	 */
	test		$0x1ffff, %r8d		# end of string within first buffer?
	jnz		0f			# if yes, do not inspect second buffer

	movdqa		16(%rsi), %xmm0		# load second chunk of input
	movdqa		%xmm0, bounce+16(%rsp)	# stash copy on stack
	pcmpeqb		%xmm1, %xmm0		# NUL in second chunk?
	pmovmskb	%xmm0, %r9d
	shl		$16, %r9d
	or		%r9d, %r8d		# merge found NUL bytes into NUL mask

	/* end of string after one buffer */
0:	tzcnt		%r8d, %r8d		# location of last char in string
	movdqu		%xmm1, bounce(%rsp, %r8, 1) # clear bytes behind string
	lea		bounce(%rsp, %rcx, 1), %rsi # start of string copy on stack
	lea		(%rdi, %r8, 1), %rax	# return pointer to NUL byte

	/*
	 * Copy exactly len bytes from the padded on-stack copy to dst
	 * using two possibly overlapping accesses, one anchored at the
	 * start and one at the end.  (%rdi, %rcx) is dst and
	 * (%rdi, %r10) is dst + len.
	 */
	cmp		$16, %edx		# at least 16 bytes to transfer?
	jae		.L1631

	mov		(%rsi), %r8		# load string head
	cmp		$8, %edx		# at least 8 bytes to transfer?
	jae		.L0815

	cmp		$4, %edx		# at least 4 bytes to transfer?
	jae		.L0407

	movzwl		-2(%rsi, %rdx, 1), %esi	# load last two bytes of string
	mov		%r8b, (%rdi, %rcx, 1)	# store first byte

	cmp		$2, %edx		# at least 2 bytes to transfer?
	jb		.L1

	mov		%si, -2(%rdi, %r10, 1)	# store last two bytes of string
.L1:	ret

.L1631:	movdqu		(%rsi), %xmm0		# load first 16 bytes of string
	movdqu		-16(%rsi, %rdx, 1), %xmm1 # load last 16 bytes of string
	movdqu		%xmm0, (%rdi, %rcx, 1)
	movdqu		%xmm1, -16(%rdi, %r10, 1)
	ret

.L0815:	mov		-8(%rsi, %rdx, 1), %rdx	# load last 8 bytes of string
	mov		%r8, (%rdi, %rcx, 1)
	mov		%rdx, -8(%rdi, %r10, 1)
	ret

.L0407:	mov		-4(%rsi, %rdx, 1), %edx	# load last four bytes of string
	mov		%r8d, (%rdi, %rcx, 1)
	mov		%edx, -4(%rdi, %r10, 1)
	ret

	/* length 0 buffer: just return dest */
.L0:	mov		%rdi, %rax
	ret
ARCHEND(__stpncpy, baseline)
282
283
/* Empty .note.GNU-stack section; by GNU toolchain convention this marks the object as not requiring an executable stack. */
.section .note.GNU-stack,"",%progbits
284
285