Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/lib/libc/amd64/string/stpcpy.S
39500 views
1
/*-
2
* Copyright (c) 2023, The FreeBSD Foundation
3
*
4
* SPDX-License-Expression: BSD-2-Clause
5
*
6
* Portions of this software were developed by Robert Clausecker
7
* <[email protected]> under sponsorship from the FreeBSD Foundation.
8
*
9
* Adapted from NetBSD's common/lib/libc/arch/x86_64/string/strcpy.S
10
* written by J.T. Conklin <[email protected]> and
11
* adapted by Guillaume Morin <[email protected]> to implement stpcpy
12
* that was originally dedicated to the public domain
13
*/
14
15
#include <machine/asm.h>
16
17
#include "amd64_archlevel.h"
18
19
#define ALIGN_TEXT .p2align 4, 0x90
20
21
.weak stpcpy
22
.set stpcpy, __stpcpy
23
ARCHFUNCS(__stpcpy)
24
ARCHFUNC(__stpcpy, scalar)
25
ARCHFUNC(__stpcpy, baseline)
26
ENDARCHFUNCS(__stpcpy)
27
28
/*
29
* This stpcpy implementation copies a byte at a time until the
30
* source pointer is aligned to a word boundary, it then copies by
31
* words until it finds a word containing a zero byte, and finally
32
* copies by bytes until the end of the string is reached.
33
*
34
* While this may result in unaligned stores if the source and
35
* destination pointers are unaligned with respect to each other,
36
* it is still faster than either byte copies or the overhead of
37
* an implementation suitable for machines with strict alignment
38
* requirements.
39
*/
40
41
ARCHENTRY(__stpcpy, scalar)
42
movabsq $0x0101010101010101,%r8
43
movabsq $0x8080808080808080,%r9
44
45
/*
46
* Align source to a word boundary.
47
* Consider unrolling loop?
48
*/
49
.Lalign:
50
testb $7,%sil
51
je .Lword_aligned
52
movb (%rsi),%dl
53
incq %rsi
54
movb %dl,(%rdi)
55
incq %rdi
56
testb %dl,%dl
57
jne .Lalign
58
movq %rdi,%rax
59
dec %rax
60
ret
61
62
ALIGN_TEXT
63
.Lloop:
64
movq %rdx,(%rdi)
65
addq $8,%rdi
66
.Lword_aligned:
67
movq (%rsi),%rdx
68
movq %rdx,%rcx
69
addq $8,%rsi
70
subq %r8,%rcx
71
testq %r9,%rcx
72
je .Lloop
73
74
/*
75
* In rare cases, the above loop may exit prematurely. We must
76
* return to the loop if none of the bytes in the word equal 0.
77
*/
78
79
movb %dl,(%rdi)
80
testb %dl,%dl /* 1st byte == 0? */
81
je .Ldone
82
incq %rdi
83
84
shrq $8,%rdx
85
movb %dl,(%rdi)
86
testb %dl,%dl /* 2nd byte == 0? */
87
je .Ldone
88
incq %rdi
89
90
shrq $8,%rdx
91
movb %dl,(%rdi)
92
testb %dl,%dl /* 3rd byte == 0? */
93
je .Ldone
94
incq %rdi
95
96
shrq $8,%rdx
97
movb %dl,(%rdi)
98
testb %dl,%dl /* 4th byte == 0? */
99
je .Ldone
100
incq %rdi
101
102
shrq $8,%rdx
103
movb %dl,(%rdi)
104
testb %dl,%dl /* 5th byte == 0? */
105
je .Ldone
106
incq %rdi
107
108
shrq $8,%rdx
109
movb %dl,(%rdi)
110
testb %dl,%dl /* 6th byte == 0? */
111
je .Ldone
112
incq %rdi
113
114
shrq $8,%rdx
115
movb %dl,(%rdi)
116
testb %dl,%dl /* 7th byte == 0? */
117
je .Ldone
118
incq %rdi
119
120
shrq $8,%rdx
121
movb %dl,(%rdi)
122
incq %rdi
123
testb %dl,%dl /* 8th byte == 0? */
124
jne .Lword_aligned
125
decq %rdi
126
127
.Ldone:
128
movq %rdi,%rax
129
ret
130
ARCHEND(__stpcpy, scalar)
131
132
ARCHENTRY(__stpcpy, baseline)
133
mov %esi, %ecx
134
mov %rdi, %rdx
135
sub %rsi, %rdi # express destination as distance to surce
136
and $~0xf, %rsi # align source to 16 byte
137
movdqa (%rsi), %xmm0 # head of string with junk before
138
pxor %xmm1, %xmm1
139
and $0xf, %ecx # misalignment in bytes
140
pcmpeqb %xmm1, %xmm0 # NUL byte present?
141
pmovmskb %xmm0, %eax
142
shr %cl, %eax # clear out matches in junk bytes
143
bsf %eax, %eax # find match if any
144
jnz .Lrunt
145
146
/* first normal iteration: write head back if it succeeds */
147
movdqa 16(%rsi), %xmm0 # 16 bytes of current iteration
148
movdqu (%rsi, %rcx, 1), %xmm2 # first 16 bytes of the string
149
pcmpeqb %xmm0, %xmm1 # NUL byte present?
150
pmovmskb %xmm1, %eax
151
test %eax, %eax # find match if any
152
jnz .Lshorty
153
154
movdqu %xmm2, (%rdx) # store beginning of string
155
156
/* main loop, unrolled twice */
157
ALIGN_TEXT
158
0: movdqa 32(%rsi), %xmm2 # load current iteraion
159
movdqu %xmm0, 16(%rsi, %rdi, 1) # write back previous iteraion
160
pxor %xmm1, %xmm1
161
add $32, %rsi
162
pcmpeqb %xmm2, %xmm1 # NUL byte present?
163
pmovmskb %xmm1, %eax
164
test %eax, %eax
165
jnz 1f
166
167
movdqa 16(%rsi), %xmm0 # load current iteraion
168
movdqu %xmm2, (%rsi, %rdi, 1) # write back previous iteraion
169
pxor %xmm1, %xmm1
170
pcmpeqb %xmm0, %xmm1 # NUL byte present?
171
pmovmskb %xmm1, %eax
172
test %eax, %eax
173
jz 0b
174
175
/* end of string after main loop has iterated */
176
add $16, %rsi # advance rsi to second unrolled half
177
1: tzcnt %eax, %eax # find location of match
178
# (behaves as bsf on pre-x86-64-v3 CPUs)
179
add %rsi, %rax # point to NUL byte
180
movdqu -15(%rax), %xmm0 # last 16 bytes of string
181
movdqu %xmm0, -15(%rax, %rdi, 1) # copied to destination
182
add %rdi, %rax # point to destination's NUL byte
183
ret
184
185
/* NUL encountered in second iteration */
186
.Lshorty:
187
tzcnt %eax, %eax
188
add $16, %eax # account for length of first iteration
189
sub %ecx, %eax # but not the parts before the string
190
191
/* NUL encountered in first iteration */
192
.Lrunt: lea 1(%rax), %edi # string length including NUL byte
193
add %rcx, %rsi # point to beginning of string
194
add %rdx, %rax # point to NUL byte
195
196
/* transfer 16--32 bytes */
197
.L1632: cmp $16, %edi
198
jb .L0815
199
200
movdqu -16(%rsi, %rdi, 1), %xmm0 # load last 16 bytes
201
movdqu %xmm2, (%rdx) # store first 16 bytes
202
movdqu %xmm0, -15(%rax) # store last 16 bytes
203
ret
204
205
/* transfer 8--15 bytes */
206
.L0815: cmp $8, %edi
207
jb .L0407
208
209
mov (%rsi), %rcx # load first 8 bytes
210
mov -8(%rsi, %rdi, 1), %rdi # load last 8 bytes
211
mov %rcx, (%rdx) # store to dst
212
mov %rdi, -7(%rax) # dito
213
ret
214
215
/* transfer 4--7 bytes */
216
.L0407: cmp $4, %edi
217
jb .L0203
218
219
mov (%rsi), %ecx
220
mov -4(%rsi, %rdi, 1), %edi
221
mov %ecx, (%rdx)
222
mov %edi, -3(%rax)
223
ret
224
225
/* transfer 2--3 bytes */
226
.L0203: cmp $2, %edi
227
jb .L0101
228
229
movzwl (%rsi), %ecx
230
mov %cx, (%rdx) # store first two bytes
231
232
/* transfer 0 bytes (last byte is always NUL) */
233
.L0101: movb $0, (%rax) # store terminating NUL byte
234
ret
235
ARCHEND(__stpcpy, baseline)
236
237
.section .note.GNU-stack,"",%progbits
238
239