/* Optimized version of the standard memset() function.

   Copyright (c) 2002 Hewlett-Packard Co/CERN
	Sverre Jarp <[email protected]>

   Return: dest

   Inputs:
	in0:	dest
	in1:	value
	in2:	count

   The algorithm is fairly straightforward: set byte by byte until we
   get to a 16B-aligned address, then loop on 128B chunks using an
   early store as prefetching, then loop on 32B chunks, then clear remaining
   words, finally clear remaining bytes.
   Since a stf.spill f0 can store 16B in one go, we use this instruction
   to get peak speed when value = 0.  */
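
/*
 * For orientation, a minimal C sketch of the staging described above
 * (illustrative only, not part of the build; memset_sketch is a
 * hypothetical name):
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	void *memset_sketch(void *dest, int value, size_t cnt)
 *	{
 *		unsigned char *p = dest;
 *		uint64_t v = (uint64_t)(unsigned char)value * 0x0101010101010101ULL;
 *
 *		while (cnt && ((uintptr_t)p & 15)) {	// bytes up to 16B alignment
 *			*p++ = (unsigned char)value; cnt--;
 *		}
 *		while (cnt >= 128) {			// 128B lines (the asm also
 *			for (int i = 0; i < 16; i++)	// stores ahead as a prefetch)
 *				((uint64_t *)p)[i] = v;
 *			p += 128; cnt -= 128;
 *		}
 *		while (cnt >= 32) {			// 32B chunks
 *			for (int i = 0; i < 4; i++)
 *				((uint64_t *)p)[i] = v;
 *			p += 32; cnt -= 32;
 *		}
 *		while (cnt >= 8) {			// remaining words
 *			*(uint64_t *)p = v; p += 8; cnt -= 8;
 *		}
 *		while (cnt--)				// remaining bytes
 *			*p++ = (unsigned char)value;
 *		return dest;
 *	}
 */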

#include <asm/asmmacro.h>
#undef ret

#define dest		in0
#define value		in1
#define	cnt		in2

#define tmp		r31
#define save_lc		r30
#define ptr0		r29
#define ptr1		r28
#define ptr2		r27
#define ptr3		r26
#define ptr9		r24
#define	loopcnt		r23
#define linecnt		r22
#define bytecnt		r21

#define fvalue		f6

// This routine uses only scratch predicate registers (p6 - p15)
#define p_scr		p6	// default register for same-cycle branches
#define p_nz		p7
#define p_zr		p8
#define p_unalgn	p9
#define p_y		p11
#define p_n		p12
#define p_yy		p13
#define p_nn		p14

#define MIN1		15
#define MIN1P1HALF	8
#define LINE_SIZE	128
#define LSIZE_SH	7	// shift amount
#define PREF_AHEAD	8

GLOBAL_ENTRY(memset)
{ .mmi
	.prologue
	alloc	tmp = ar.pfs, 3, 0, 0, 0
	lfetch.nt1 [dest]		// prefetch the first destination line
	.save	ar.lc, save_lc
	mov.i	save_lc = ar.lc
	.body
} { .mmi
	mov	ret0 = dest		// return value
	cmp.ne	p_nz, p_zr = value, r0	// use stf.spill if value is zero
	cmp.eq	p_scr, p0 = cnt, r0
;; }
{ .mmi
	and	ptr2 = -(MIN1+1), dest	// aligned address
	and	tmp = MIN1, dest	// prepare to check for correct alignment
	tbit.nz p_y, p_n = dest, 0	// Do we have an odd address? (M_B_U)
} { .mib
	mov	ptr1 = dest
	mux1	value = value, @brcst	// create 8 identical bytes in word
(p_scr)	br.ret.dpnt.many rp		// return immediately if count = 0
;; }
{ .mib
	cmp.ne	p_unalgn, p0 = tmp, r0	// is dest not 16B aligned?
} { .mib
	sub	bytecnt = (MIN1+1), tmp	// NB: # of bytes to move is 1 higher than loopcnt
	cmp.gt	p_scr, p0 = 16, cnt	// is it a minimalistic task?
(p_scr)	br.cond.dptk.many .move_bytes_unaligned	// go move just a few (M_B_U)
;; }
{ .mmi
(p_unalgn) add	ptr1 = (MIN1+1), ptr2	// after alignment
(p_unalgn) add	ptr2 = MIN1P1HALF, ptr2	// after alignment
(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 3	// should we do a st8 ?
;; }
{ .mib
(p_y)	add	cnt = -8, cnt		// the st8 below stores 8 bytes
(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 2	// should we do a st4 ?
} { .mib
(p_y)	st8	[ptr2] = value, -4	// store, then step ptr2 down for the st4
(p_n)	add	ptr2 = 4, ptr2		// no st8: step ptr2 up instead
;; }
{ .mib
(p_yy)	add	cnt = -4, cnt		// the st4 below stores 4 bytes
(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 1	// should we do a st2 ?
} { .mib
(p_yy)	st4	[ptr2] = value, -2	// store, then step ptr2 down for the st2
(p_nn)	add	ptr2 = 2, ptr2		// no st4: step ptr2 up instead
;; }
{ .mmi
	mov	tmp = LINE_SIZE+1	// for compare
(p_y)	add	cnt = -2, cnt		// the st2 below stores 2 bytes
(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 0	// should we do a st1 ?
} { .mmi
	setf.sig fvalue=value		// transfer value to FLP side
(p_y)	st2	[ptr2] = value, -1	// store, then step ptr2 down for the st1
(p_n)	add	ptr2 = 1, ptr2		// no st2: step ptr2 up instead
;; }

{ .mmi
(p_yy)	st1	[ptr2] = value		// last byte of the unaligned prefix
	cmp.gt	p_scr, p0 = tmp, cnt	// is it a minimalistic task?
} { .mbb
(p_yy)	add	cnt = -1, cnt		// account for the st1 above
(p_scr)	br.cond.dpnt.many .fraction_of_line	// go move just a few
;; }
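
// Net effect of the prefix above: with d = dest & 15 (d != 0) and
// bytecnt = 16 - d, each tbit peels one naturally aligned store
// (8, 4, 2 or 1 bytes per set bit of bytecnt) off the top of the
// 16B line, so the stores exactly tile [dest, next 16B boundary).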

{ .mib
	nop.m	0
	shr.u	linecnt = cnt, LSIZE_SH
(p_zr)	br.cond.dptk.many .l1b		// Jump to use stf.spill
;; }

TEXT_ALIGN(32) // --------------------- //  L1A: store ahead into cache lines; fill later
{ .mmi
	and	tmp = -(LINE_SIZE), cnt	// compute end of range
	mov	ptr9 = ptr1		// used for prefetching
	and	cnt = (LINE_SIZE-1), cnt	// remainder
} { .mmi
	mov	loopcnt = PREF_AHEAD-1	// default prefetch loop
	cmp.gt	p_scr, p0 = PREF_AHEAD, linecnt	// check against actual value
;; }
{ .mmi
(p_scr)	add	loopcnt = -1, linecnt	// fewer lines than PREF_AHEAD: prefetch them all
	add	ptr2 = 8, ptr1		// start of stores (beyond prefetch stores)
	add	ptr1 = tmp, ptr1	// first address beyond total range
;; }
{ .mmi
	add	tmp = -1, linecnt	// next loop count
	mov.i	ar.lc = loopcnt		// set the prefetch loop counter
;; }
.pref_l1a:
{ .mib
	stf8	[ptr9] = fvalue, 128	// Do stores one cache line apart
	nop.i	0
	br.cloop.dptk.few .pref_l1a
;; }
{ .mmi
	add	ptr0 = 16, ptr2		// Two stores in parallel
	mov.i	ar.lc = tmp		// line count for the fill loop
;; }
.l1ax:
{ .mmi
	stf8	[ptr2] = fvalue, 8
	stf8	[ptr0] = fvalue, 8
;; }
{ .mmi
	stf8	[ptr2] = fvalue, 24
	stf8	[ptr0] = fvalue, 24
;; }
{ .mmi
	stf8	[ptr2] = fvalue, 8
	stf8	[ptr0] = fvalue, 8
;; }
{ .mmi
	stf8	[ptr2] = fvalue, 24
	stf8	[ptr0] = fvalue, 24
;; }
{ .mmi
	stf8	[ptr2] = fvalue, 8
	stf8	[ptr0] = fvalue, 8
;; }
{ .mmi
	stf8	[ptr2] = fvalue, 24
	stf8	[ptr0] = fvalue, 24
;; }
{ .mmi
	stf8	[ptr2] = fvalue, 8
	stf8	[ptr0] = fvalue, 32
	cmp.lt	p_scr, p0 = ptr9, ptr1	// do we need more prefetching?
;; }
{ .mmb
	stf8	[ptr2] = fvalue, 24
(p_scr)	stf8	[ptr9] = fvalue, 128
	br.cloop.dptk.few .l1ax
;; }
{ .mbb
	cmp.le	p_scr, p0 = 8, cnt	// just a few bytes left ?
(p_scr)	br.cond.dpnt.many .fraction_of_line	// Branch no. 2
	br.cond.dpnt.many .move_bytes_from_alignment	// Branch no. 3
;; }
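
// Per .l1ax iteration: the early stf8 in .pref_l1a (and the predicated
// stf8 via ptr9) writes the first 8 bytes of a line well ahead of use,
// pulling the line into cache; the 15 stf8 through ptr2/ptr0 then fill
// the remaining 120 bytes of the current line.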

TEXT_ALIGN(32)
.l1b:	// ------------------------------------ //  L1B: store ahead into cache lines; fill later
{ .mmi
	and	tmp = -(LINE_SIZE), cnt	// compute end of range
	mov	ptr9 = ptr1		// used for prefetching
	and	cnt = (LINE_SIZE-1), cnt	// remainder
} { .mmi
	mov	loopcnt = PREF_AHEAD-1	// default prefetch loop
	cmp.gt	p_scr, p0 = PREF_AHEAD, linecnt	// check against actual value
;; }
{ .mmi
(p_scr)	add	loopcnt = -1, linecnt
	add	ptr2 = 16, ptr1		// start of stores (beyond prefetch stores)
	add	ptr1 = tmp, ptr1	// first address beyond total range
;; }
{ .mmi
	add	tmp = -1, linecnt	// next loop count
	mov.i	ar.lc = loopcnt
;; }
.pref_l1b:
{ .mib
	stf.spill [ptr9] = f0, 128	// Do stores one cache line apart
	nop.i	0
	br.cloop.dptk.few .pref_l1b
;; }
{ .mmi
	add	ptr0 = 16, ptr2		// Two stores in parallel
	mov.i	ar.lc = tmp
;; }
.l1bx:
{ .mmi
	stf.spill [ptr2] = f0, 32
	stf.spill [ptr0] = f0, 32
;; }
{ .mmi
	stf.spill [ptr2] = f0, 32
	stf.spill [ptr0] = f0, 32
;; }
{ .mmi
	stf.spill [ptr2] = f0, 32
	stf.spill [ptr0] = f0, 64
	cmp.lt	p_scr, p0 = ptr9, ptr1	// do we need more prefetching?
;; }
{ .mmb
	stf.spill [ptr2] = f0, 32
(p_scr)	stf.spill [ptr9] = f0, 128
	br.cloop.dptk.few .l1bx
;; }
{ .mib
	cmp.gt	p_scr, p0 = 8, cnt	// just a few bytes left ?
(p_scr)	br.cond.dpnt.many .move_bytes_from_alignment	// less than a word left: move bytes
;; }
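
// f0 is architecturally +0.0 and stf.spill stores its full 16B spill
// image in one go, so the zero path (L1B) fills 16 bytes per instruction:
// the .pref_l1b spill covers the first 16B of each line, and ptr2/ptr0
// (16B apart, stepping by 32B) fill the remaining 112B, 7 spills per line.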

.fraction_of_line:
{ .mib
	add	ptr2 = 16, ptr1
	shr.u	loopcnt = cnt, 5	// loopcnt = cnt / 32
;; }
{ .mib
	cmp.eq	p_scr, p0 = loopcnt, r0
	add	loopcnt = -1, loopcnt
(p_scr)	br.cond.dpnt.many .store_words
;; }
{ .mib
	and	cnt = 0x1f, cnt		// compute the remaining cnt
	mov.i	ar.lc = loopcnt
;; }
TEXT_ALIGN(32)
.l2:	// ------------------------------------ //  L2A:  store 32B in 2 cycles
{ .mmb
	stf8	[ptr1] = fvalue, 8
	stf8	[ptr2] = fvalue, 8
;; } { .mmb
	stf8	[ptr1] = fvalue, 24
	stf8	[ptr2] = fvalue, 24
	br.cloop.dptk.many .l2
;; }
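
// Each .l2 iteration issues two stf8 per bundle through ptr1 and ptr2
// (16B apart, with 8/24 strides), so a 32B chunk completes in two cycles.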
.store_words:
{ .mib
	cmp.gt	p_scr, p0 = 8, cnt	// just a few bytes left ?
(p_scr)	br.cond.dpnt.many .move_bytes_from_alignment	// Branch
;; }

{ .mmi
	stf8	[ptr1] = fvalue, 8	// store
	cmp.le	p_y, p_n = 16, cnt
	add	cnt = -8, cnt		// subtract
;; }
{ .mmi
(p_y)	stf8	[ptr1] = fvalue, 8	// store
(p_y)	cmp.le.unc p_yy, p_nn = 16, cnt
(p_y)	add	cnt = -8, cnt		// subtract
;; }
{ .mmi					// store
(p_yy)	stf8	[ptr1] = fvalue, 8
(p_yy)	add	cnt = -8, cnt		// subtract
;; }
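
// .store_words drains up to three remaining 8-byte words (cnt is below
// 32 here); anything under 8 bytes falls through to the word/halfword/
// byte tail below.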

.move_bytes_from_alignment:
{ .mib
	cmp.eq	p_scr, p0 = cnt, r0
	tbit.nz.unc p_y, p0 = cnt, 2	// should we terminate with a st4 ?
(p_scr)	br.cond.dpnt.few .restore_and_exit
;; }
{ .mib
(p_y)	st4	[ptr1] = value, 4
	tbit.nz.unc p_yy, p0 = cnt, 1	// should we terminate with a st2 ?
;; }
{ .mib
(p_yy)	st2	[ptr1] = value, 2
	tbit.nz.unc p_y, p0 = cnt, 0	// should we terminate with a st1 ?
;; }

{ .mib
(p_y)	st1	[ptr1] = value
;; }
.restore_and_exit:
{ .mib
	nop.m	0
	mov.i	ar.lc = save_lc
	br.ret.sptk.many rp
;; }
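
// .move_bytes_unaligned handles cnt < 16 without aligning first: an
// optional st1 fixes an odd start, then pairs of st2 through ptr1/ptr2
// fill 4 bytes per bundle, and a final predicated st2/st1 (the st1 via
// ptr3, which points at the last byte) finish the tail.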
.move_bytes_unaligned:
{ .mmi
	.pred.rel "mutex",p_y, p_n
	.pred.rel "mutex",p_yy, p_nn
(p_n)	cmp.le	p_yy, p_nn = 4, cnt
(p_y)	cmp.le	p_yy, p_nn = 5, cnt
(p_n)	add	ptr2 = 2, ptr1
} { .mmi
(p_y)	add	ptr2 = 3, ptr1
(p_y)	st1	[ptr1] = value, 1	// fill 1 (odd-aligned) byte [15, 14 (or less) left]
(p_y)	add	cnt = -1, cnt
;; }
{ .mmi
(p_yy)	cmp.le.unc p_y, p0 = 8, cnt
	add	ptr3 = ptr1, cnt	// prepare last store
	mov.i	ar.lc = save_lc
} { .mmi
(p_yy)	st2	[ptr1] = value, 4	// fill 2 (aligned) bytes
(p_yy)	st2	[ptr2] = value, 4	// fill 2 (aligned) bytes [11, 10 (or less) left]
(p_yy)	add	cnt = -4, cnt
;; }
{ .mmi
(p_y)	cmp.le.unc p_yy, p0 = 8, cnt
	add	ptr3 = -1, ptr3		// last store
	tbit.nz p_scr, p0 = cnt, 1	// will there be a st2 at the end ?
} { .mmi
(p_y)	st2	[ptr1] = value, 4	// fill 2 (aligned) bytes
(p_y)	st2	[ptr2] = value, 4	// fill 2 (aligned) bytes [7, 6 (or less) left]
(p_y)	add	cnt = -4, cnt
;; }
{ .mmi
(p_yy)	st2	[ptr1] = value, 4	// fill 2 (aligned) bytes
(p_yy)	st2	[ptr2] = value, 4	// fill 2 (aligned) bytes [3, 2 (or less) left]
	tbit.nz p_y, p0 = cnt, 0	// will there be a st1 at the end ?
} { .mmi
(p_yy)	add	cnt = -4, cnt
;; }
{ .mmb
(p_scr)	st2	[ptr1] = value		// fill 2 (aligned) bytes
(p_y)	st1	[ptr3] = value		// fill last byte (using ptr3)
	br.ret.sptk.many rp
}
END(memset)