CoCalc -- ev6-memset.S

GitHub Repository: torvalds/linux
Path: blob/master/arch/alpha/lib/ev6-memset.S
²⁶⁴²⁵ views
1
/* SPDX-License-Identifier: GPL-2.0 */
2
/*
3
 * arch/alpha/lib/ev6-memset.S
4
 *
5
 * This is an efficient (and relatively small) implementation of the C library
6
 * "memset()" function for the 21264 implementation of Alpha.
7
 *
8
 * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
9
 *
10
 * Much of the information about 21264 scheduling/coding comes from:
11
 *	Compiler Writer's Guide for the Alpha 21264
12
 *	abbreviated as 'CWG' in other comments here
13
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
14
 * Scheduling notation:
15
 *	E	- either cluster
16
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
17
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
18
 * The algorithm for the leading and trailing quadwords remains the same,
19
 * however the loop has been unrolled to enable better memory throughput,
20
 * and the code has been replicated for each of the entry points: __memset
21
 * and __memset16 to permit better scheduling to eliminate the stalling
22
 * encountered during the mask replication.
23
 * A future enhancement might be to put in a byte store loop for really
24
 * small (say < 32 bytes) memset()s.  Whether or not that change would be
25
 * a win in the kernel would depend upon the contextual usage.
26
 * WARNING: Maintaining this is going to be more work than the unreplicated version,
27
 * as fixes will need to be made in multiple places.  The performance gain
28
 * is worth it.
29
 */
30
#include <linux/export.h>
31
	.set noat
32
	.set noreorder
33
.text
34
	.globl memset
35
	.globl __memset
36
	.globl ___memset
37
	.globl __memset16
38
	.globl __constant_c_memset
39

40
	.ent ___memset
41
.align 5
42
___memset:
43
	.frame $30,0,$26,0
44
	.prologue 0
45

46
	/*
	 * ___memset: store the low byte of $17 into $18 consecutive bytes
	 * starting at address $16, and return the original $16 in $0.
	 * A count <= 0 branches straight to end_b without storing anything.
	 *
	 * Register roles once the entry code below has finished:
	 *	$17	fill byte replicated into all eight bytes of a quadword
	 *	$6	$16 + $18 as passed in, i.e. one past the last byte
	 *	$3	misalignment of the destination within its quadword
	 * $1, $2, $4, $5 and $7 are used as scratch registers.
	 *
47
	 * Serious stalling happens.  The only way to mitigate this is to
48
	 * undertake a major re-write to interleave the constant materialization
49
	 * with other parts of the fall-through code.  This is important, even
50
	 * though it makes maintenance tougher.
51
	 * Do this later.
52
	 */
53
	and $17,255,$1		# E : 00000000000000ch
54
	insbl $17,1,$2		# U : 000000000000ch00
55
	bis $16,$16,$0		# E : return value
56
	ble $18,end_b		# U : zero or negative length requested?
57

58
	addq $18,$16,$6		# E : max address to write to
59
	bis	$1,$2,$17	# E : 000000000000chch
60
	insbl	$1,2,$3		# U : 0000000000ch0000
61
	insbl	$1,3,$4		# U : 00000000ch000000
62

63
	or	$3,$4,$3	# E : 00000000chch0000
64
	inswl	$17,4,$5	# U : 0000chch00000000
65
	xor	$16,$6,$1	# E : will complete write be within one quadword?
66
	inswl	$17,6,$2	# U : chch000000000000
67

68
	or	$17,$3,$17	# E : 00000000chchchch
69
	or	$2,$5,$2	# E : chchchch00000000
70
	bic	$1,7,$1		# E : fit within a single quadword?
71
	and	$16,7,$3	# E : Target addr misalignment
72

73
	or	$17,$2,$17	# E : chchchchchchchch
74
	beq	$1,within_quad_b # U : start and end share one quadword
75
	nop			# E :
76
	beq	$3,aligned_b	# U : target is 0mod8
77

78
	/*
79
	 * Target address is misaligned, and won't fit within a quadword
	 * Merge the fill bytes into the first (partial) quadword, then move
	 * $16 up to the next quadword boundary and shrink $18 by the number
	 * of bytes just written.
80
	 */
81
	ldq_u $4,0($16)		# L : Fetch first partial
82
	bis $16,$16,$5		# E : Save the address
83
	insql $17,$16,$2	# U : Insert new bytes
84
	subq $3,8,$3		# E : Invert (for addressing uses)
85

86
	addq $18,$3,$18		# E : $18 is new count ($3 is negative)
87
	mskql $4,$16,$4		# U : clear relevant parts of the quad
88
	subq $16,$3,$16		# E : $16 is new aligned destination
89
	bis $2,$4,$1		# E : Final bytes
90

91
	nop
92
	stq_u $1,0($5)		# L : Store result
93
	nop
94
	nop
95

96
.align 4
97
aligned_b:
98
	/*
99
	 * We are now guaranteed to be quad aligned, with at least
100
	 * one partial quad to write.
101
	 */
102

103
	sra $18,3,$3		# U : Number of remaining quads to write
104
	and $18,7,$18		# E : Number of trailing bytes to write
105
	bis $16,$16,$5		# E : Save dest address
106
	beq $3,no_quad_b	# U : tail stuff only
107

108
	/*
109
	 * it's worth the effort to unroll this and use wh64 if possible
110
	 * Lifted a bunch of code from clear_user.S
111
	 * At this point, entry values are:
112
	 * $16	Current destination address
113
	 * $5	A copy of $16
114
	 * $6	End address (destination + count as passed in on entry)
115
	 * $18	Number trailer bytes
116
	 * $3	Number quads to write
117
	 */
118

119
	and	$16, 0x3f, $2	# E : Forward work (only useful for unrolled loop)
120
	subq	$3, 16, $4	# E : Only try to unroll if >= 16 quads (128 bytes)
121
	subq	$2, 0x40, $1	# E : bias counter (aligning stuff 0mod64)
122
	blt	$4, loop_b	# U : fewer than 16 quads: use the simple loop
123

124
	/*
125
	 * We know we've got at least 16 quads, minimum of one trip
126
	 * through unrolled loop.  Do a quad at a time to get us 0mod64
127
	 * aligned.
	 * NOTE(review): $1 = ($16 & 0x3f) - 0x40 is negative for every
	 * quad-aligned $16, so the beq below looks like it is never taken
	 * and the alignment loop always runs at least once; that loop is
	 * also what loads $4 with the address used by the first wh64.
128
	 */
129

130
	nop			# E :
131
	nop			# E :
132
	nop			# E :
133
	beq	$1, $bigalign_b	# U :
134

135
$alignmod64_b:
136
	stq	$17, 0($5)	# L :
137
	subq	$3, 1, $3	# E : For consistency later
138
	addq	$1, 8, $1	# E : Increment towards zero for alignment
139
	addq	$5, 8, $4	# E : Initial wh64 address (filler instruction)
140

141
	nop
142
	nop
143
	addq	$5, 8, $5	# E : Inc address
144
	blt	$1, $alignmod64_b # U :
145

146
$bigalign_b:
147
	/*
148
	 * $3 - number quads left to go
149
	 * $5 - target address (aligned 0mod64)
150
	 * $17 - mask of stuff to store
151
	 * Scratch registers available: $7, $2, $4, $1
152
	 * We know that we'll be taking a minimum of one trip through the loop.
153
	 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
154
	 * Assumes the wh64 needs to be issued two trips through the loop ahead.
155
	 * The wh64 is issued for the starting destination address of trip +2
156
	 * through the loop, and if there are less than two trips left, the target
157
	 * address will be for the current trip.
	 * Each trip stores eight quadwords (64 bytes) and lowers $3 by 8.
158
	 */
159

160
$do_wh64_b:
161
	wh64	($4)		# L1 : memory subsystem write hint
162
	subq	$3, 24, $2	# E : For determining future wh64 addresses
163
	stq	$17, 0($5)	# L :
164
	nop			# E :
165

166
	addq	$5, 128, $4	# E : speculative target of next wh64
167
	stq	$17, 8($5)	# L :
168
	stq	$17, 16($5)	# L :
169
	addq	$5, 64, $7	# E : Fallback address for wh64 (== next trip addr)
170

171
	stq	$17, 24($5)	# L :
172
	stq	$17, 32($5)	# L :
173
	cmovlt	$2, $7, $4	# E : Latency 2, extra mapping cycle
174
	nop
175

176
	stq	$17, 40($5)	# L :
177
	stq	$17, 48($5)	# L :
178
	subq	$3, 16, $2	# E : Repeat the loop at least once more?
179
	nop
180

181
	stq	$17, 56($5)	# L :
182
	addq	$5, 64, $5	# E :
183
	subq	$3, 8, $3	# E :
184
	bge	$2, $do_wh64_b	# U :
185

186
	nop
187
	nop
188
	nop
189
	beq	$3, no_quad_b	# U : Might have finished already
190

191
.align 4
192
	/*
193
	 * Simple loop for trailing quadwords, or for small amounts
194
	 * of data (where we can't use an unrolled loop and wh64)
195
	 */
196
loop_b:
197
	stq $17,0($5)		# L :
198
	subq $3,1,$3		# E : Decrement number quads left
199
	addq $5,8,$5		# E : Inc address
200
	bne $3,loop_b		# U : more?
201

202
no_quad_b:
203
	/*
204
	 * Write 0..7 trailing bytes.
	 * $5 addresses the final quadword and $18 holds the trailing byte
	 * count; the old quadword is loaded, merged with the fill pattern
	 * using the low bits of the end address in $6, and stored back.
205
	 */
206
	nop			# E :
207
	beq $18,end_b		# U : All done?
208
	ldq $7,0($5)		# L :
209
	mskqh $7,$6,$2		# U : Mask final quad
210

211
	insqh $17,$6,$4		# U : New bits
212
	bis $2,$4,$1		# E : Put it all together
213
	stq $1,0($5)		# L : And back to memory
214
	ret $31,($26),1		# L0 :
215

216
within_quad_b:
	/*
	 * Reached when the start address ($16) and end address ($6) agree
	 * in every bit above the low three, i.e. the whole write lies in
	 * one quadword.  Done with a single load, merge and store.
	 */
217
	ldq_u $1,0($16)		# L :
218
	insql $17,$16,$2	# U : New bits
219
	mskql $1,$16,$4		# U : Clear old
220
	bis $2,$4,$2		# E : New result
221

222
	mskql $2,$6,$4		# U :
223
	mskqh $1,$6,$2		# U :
224
	bis $2,$4,$1		# E :
225
	stq_u $1,0($16)		# L :
226

227
end_b:
228
	nop
229
	nop
230
	nop
231
	ret $31,($26),1		# L0 :
232
	.end ___memset
233
	EXPORT_SYMBOL(___memset)
234

235
	/*
236
	 * This is the original body of code, prior to replication and
237
	 * rescheduling.  Leave it here, as there may be calls to this
238
	 * entry point.
	 *
	 * __constant_c_memset: $16 = destination (returned in $0),
	 * $18 = byte count (a count <= 0 branches straight to end).
	 * Unlike ___memset, nothing here replicates the fill value: $17 is
	 * stored unmodified as whole quadwords and merged into the partial
	 * ones, so callers presumably pass an already-replicated 64-bit
	 * pattern -- TODO confirm against the callers.
	 * $1-$5 and $7 are used as scratch; $6 holds $16 + $18 from entry.
239
	 */
240
.align 4
241
	.ent __constant_c_memset
242
__constant_c_memset:
243
	.frame $30,0,$26,0
244
	.prologue 0
245

246
	addq $18,$16,$6		# E : max address to write to
247
	bis $16,$16,$0		# E : return value
248
	xor $16,$6,$1		# E : will complete write be within one quadword?
249
	ble $18,end		# U : zero or negative length requested?
250

251
	bic $1,7,$1		# E : fit within a single quadword
252
	beq $1,within_one_quad	# U : start and end share one quadword
253
	and $16,7,$3		# E : Target addr misalignment
254
	beq $3,aligned		# U : target is 0mod8
255

256
	/*
257
	 * Target address is misaligned, and won't fit within a quadword
	 * Merge the fill bytes into the first (partial) quadword, then move
	 * $16 up to the next quadword boundary and shrink $18 by the number
	 * of bytes just written.
258
	 */
259
	ldq_u $4,0($16)		# L : Fetch first partial
260
	bis $16,$16,$5		# E : Save the address
261
	insql $17,$16,$2	# U : Insert new bytes
262
	subq $3,8,$3		# E : Invert (for addressing uses)
263

264
	addq $18,$3,$18		# E : $18 is new count ($3 is negative)
265
	mskql $4,$16,$4		# U : clear relevant parts of the quad
266
	subq $16,$3,$16		# E : $16 is new aligned destination
267
	bis $2,$4,$1		# E : Final bytes
268

269
	nop
270
	stq_u $1,0($5)		# L : Store result
271
	nop
272
	nop
273

274
.align 4
275
aligned:
276
	/*
277
	 * We are now guaranteed to be quad aligned, with at least
278
	 * one partial quad to write.
279
	 */
280

281
	sra $18,3,$3		# U : Number of remaining quads to write
282
	and $18,7,$18		# E : Number of trailing bytes to write
283
	bis $16,$16,$5		# E : Save dest address
284
	beq $3,no_quad		# U : tail stuff only
285

286
	/*
287
	 * it's worth the effort to unroll this and use wh64 if possible
288
	 * Lifted a bunch of code from clear_user.S
289
	 * At this point, entry values are:
290
	 * $16	Current destination address
291
	 * $5	A copy of $16
292
	 * $6	End address (destination + count as passed in on entry)
293
	 * $18	Number trailer bytes
294
	 * $3	Number quads to write
295
	 */
296

297
	and	$16, 0x3f, $2	# E : Forward work (only useful for unrolled loop)
298
	subq	$3, 16, $4	# E : Only try to unroll if >= 16 quads (128 bytes)
299
	subq	$2, 0x40, $1	# E : bias counter (aligning stuff 0mod64)
300
	blt	$4, loop	# U : fewer than 16 quads: use the simple loop
301

302
	/*
303
	 * We know we've got at least 16 quads, minimum of one trip
304
	 * through unrolled loop.  Do a quad at a time to get us 0mod64
305
	 * aligned.
	 * NOTE(review): $1 = ($16 & 0x3f) - 0x40 is negative for every
	 * quad-aligned $16, so the beq below looks like it is never taken
	 * and the alignment loop always runs at least once; that loop is
	 * also what loads $4 with the address used by the first wh64.
306
	 */
307

308
	nop			# E :
309
	nop			# E :
310
	nop			# E :
311
	beq	$1, $bigalign	# U :
312

313
$alignmod64:
314
	stq	$17, 0($5)	# L :
315
	subq	$3, 1, $3	# E : For consistency later
316
	addq	$1, 8, $1	# E : Increment towards zero for alignment
317
	addq	$5, 8, $4	# E : Initial wh64 address (filler instruction)
318

319
	nop
320
	nop
321
	addq	$5, 8, $5	# E : Inc address
322
	blt	$1, $alignmod64	# U :
323

324
$bigalign:
325
	/*
326
	 * $3 - number quads left to go
327
	 * $5 - target address (aligned 0mod64)
328
	 * $17 - mask of stuff to store
329
	 * Scratch registers available: $7, $2, $4, $1
330
	 * We know that we'll be taking a minimum of one trip through the loop.
331
	 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
332
	 * Assumes the wh64 needs to be issued two trips through the loop ahead.
333
	 * The wh64 is issued for the starting destination address of trip +2
334
	 * through the loop, and if there are less than two trips left, the target
335
	 * address will be for the current trip.
	 * Each trip stores eight quadwords (64 bytes) and lowers $3 by 8.
336
	 */
337

338
$do_wh64:
339
	wh64	($4)		# L1 : memory subsystem write hint
340
	subq	$3, 24, $2	# E : For determining future wh64 addresses
341
	stq	$17, 0($5)	# L :
342
	nop			# E :
343

344
	addq	$5, 128, $4	# E : speculative target of next wh64
345
	stq	$17, 8($5)	# L :
346
	stq	$17, 16($5)	# L :
347
	addq	$5, 64, $7	# E : Fallback address for wh64 (== next trip addr)
348

349
	stq	$17, 24($5)	# L :
350
	stq	$17, 32($5)	# L :
351
	cmovlt	$2, $7, $4	# E : Latency 2, extra mapping cycle
352
	nop
353

354
	stq	$17, 40($5)	# L :
355
	stq	$17, 48($5)	# L :
356
	subq	$3, 16, $2	# E : Repeat the loop at least once more?
357
	nop
358

359
	stq	$17, 56($5)	# L :
360
	addq	$5, 64, $5	# E :
361
	subq	$3, 8, $3	# E :
362
	bge	$2, $do_wh64	# U :
363

364
	nop
365
	nop
366
	nop
367
	beq	$3, no_quad	# U : Might have finished already
368

369
.align 4
370
	/*
371
	 * Simple loop for trailing quadwords, or for small amounts
372
	 * of data (where we can't use an unrolled loop and wh64)
373
	 */
374
loop:
375
	stq $17,0($5)		# L :
376
	subq $3,1,$3		# E : Decrement number quads left
377
	addq $5,8,$5		# E : Inc address
378
	bne $3,loop		# U : more?
379

380
no_quad:
381
	/*
382
	 * Write 0..7 trailing bytes.
	 * $5 addresses the final quadword and $18 holds the trailing byte
	 * count; the old quadword is loaded, merged with the fill pattern
	 * using the low bits of the end address in $6, and stored back.
383
	 */
384
	nop			# E :
385
	beq $18,end		# U : All done?
386
	ldq $7,0($5)		# L :
387
	mskqh $7,$6,$2		# U : Mask final quad
388

389
	insqh $17,$6,$4		# U : New bits
390
	bis $2,$4,$1		# E : Put it all together
391
	stq $1,0($5)		# L : And back to memory
392
	ret $31,($26),1		# L0 :
393

394
within_one_quad:
	/*
	 * Reached when the start address ($16) and end address ($6) agree
	 * in every bit above the low three, i.e. the whole write lies in
	 * one quadword.  Done with a single load, merge and store.
	 */
395
	ldq_u $1,0($16)		# L :
396
	insql $17,$16,$2	# U : New bits
397
	mskql $1,$16,$4		# U : Clear old
398
	bis $2,$4,$2		# E : New result
399

400
	mskql $2,$6,$4		# U :
401
	mskqh $1,$6,$2		# U :
402
	bis $2,$4,$1		# E :
403
	stq_u $1,0($16)		# L :
404

405
end:
406
	nop
407
	nop
408
	nop
409
	ret $31,($26),1		# L0 :
410
	.end __constant_c_memset
411
	EXPORT_SYMBOL(__constant_c_memset)
412

413
	/*
414
	 * This is a replica of the __constant_c_memset code, rescheduled
415
	 * to mask stalls.  Note that entry point names also had to change
	 *
	 * __memset16: $16 = destination (returned in $0), $17 = fill value
	 * whose low 16 bits are replicated into all four words of a
	 * quadword, $18 = length in bytes (it is added to $16 to form the
	 * end address and shifted right by 3 to count quadwords).
	 * A count <= 0 branches straight to end_w without storing anything.
	 * $1-$5 and $7 are used as scratch; $6 holds $16 + $18 from entry.
416
	 */
417
	.align 5
418
	.ent __memset16
419

420
__memset16:
421
	.frame $30,0,$26,0
422
	.prologue 0
423

424
	inswl $17,0,$5		# U : 000000000000c1c2
425
	inswl $17,2,$2		# U : 00000000c1c20000
426
	bis $16,$16,$0		# E : return value
427
	addq	$18,$16,$6	# E : max address to write to
428

429
	ble $18, end_w		# U : zero or negative length requested?
430
	inswl	$17,4,$3	# U : 0000c1c200000000
431
	inswl	$17,6,$4	# U : c1c2000000000000
432
	xor	$16,$6,$1	# E : will complete write be within one quadword?
433

434
	or	$2,$5,$2	# E : 00000000c1c2c1c2
435
	or	$3,$4,$17	# E : c1c2c1c200000000
436
	bic	$1,7,$1		# E : fit within a single quadword
437
	and	$16,7,$3	# E : Target addr misalignment
438

439
	or	$17,$2,$17	# E : c1c2c1c2c1c2c1c2
440
	beq $1,within_quad_w	# U : start and end share one quadword
441
	nop
442
	beq $3,aligned_w	# U : target is 0mod8
443

444
	/*
445
	 * Target address is misaligned, and won't fit within a quadword
	 * Merge the fill pattern into the first (partial) quadword, then
	 * move $16 up to the next quadword boundary and shrink $18 by the
	 * number of bytes just written.
446
	 */
447
	ldq_u $4,0($16)		# L : Fetch first partial
448
	bis $16,$16,$5		# E : Save the address
449
	insql $17,$16,$2	# U : Insert new bytes
450
	subq $3,8,$3		# E : Invert (for addressing uses)
451

452
	addq $18,$3,$18		# E : $18 is new count ($3 is negative)
453
	mskql $4,$16,$4		# U : clear relevant parts of the quad
454
	subq $16,$3,$16		# E : $16 is new aligned destination
455
	bis $2,$4,$1		# E : Final bytes
456

457
	nop
458
	stq_u $1,0($5)		# L : Store result
459
	nop
460
	nop
461

462
.align 4
463
aligned_w:
464
	/*
465
	 * We are now guaranteed to be quad aligned, with at least
466
	 * one partial quad to write.
467
	 */
468

469
	sra $18,3,$3		# U : Number of remaining quads to write
470
	and $18,7,$18		# E : Number of trailing bytes to write
471
	bis $16,$16,$5		# E : Save dest address
472
	beq $3,no_quad_w	# U : tail stuff only
473

474
	/*
475
	 * it's worth the effort to unroll this and use wh64 if possible
476
	 * Lifted a bunch of code from clear_user.S
477
	 * At this point, entry values are:
478
	 * $16	Current destination address
479
	 * $5	A copy of $16
480
	 * $6	End address (destination + count as passed in on entry)
481
	 * $18	Number trailer bytes
482
	 * $3	Number quads to write
483
	 */
484

485
	and	$16, 0x3f, $2	# E : Forward work (only useful for unrolled loop)
486
	subq	$3, 16, $4	# E : Only try to unroll if >= 16 quads (128 bytes)
487
	subq	$2, 0x40, $1	# E : bias counter (aligning stuff 0mod64)
488
	blt	$4, loop_w	# U : fewer than 16 quads: use the simple loop
489

490
	/*
491
	 * We know we've got at least 16 quads, minimum of one trip
492
	 * through unrolled loop.  Do a quad at a time to get us 0mod64
493
	 * aligned.
	 * NOTE(review): $1 = ($16 & 0x3f) - 0x40 is negative for every
	 * quad-aligned $16, so the beq below looks like it is never taken
	 * and the alignment loop always runs at least once; that loop is
	 * also what loads $4 with the address used by the first wh64.
494
	 */
495

496
	nop			# E :
497
	nop			# E :
498
	nop			# E :
499
	beq	$1, $bigalign_w	# U :
500

501
$alignmod64_w:
502
	stq	$17, 0($5)	# L :
503
	subq	$3, 1, $3	# E : For consistency later
504
	addq	$1, 8, $1	# E : Increment towards zero for alignment
505
	addq	$5, 8, $4	# E : Initial wh64 address (filler instruction)
506

507
	nop
508
	nop
509
	addq	$5, 8, $5	# E : Inc address
510
	blt	$1, $alignmod64_w	# U :
511

512
$bigalign_w:
513
	/*
514
	 * $3 - number quads left to go
515
	 * $5 - target address (aligned 0mod64)
516
	 * $17 - mask of stuff to store
517
	 * Scratch registers available: $7, $2, $4, $1
518
	 * We know that we'll be taking a minimum of one trip through the loop.
519
	 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
520
	 * Assumes the wh64 needs to be issued two trips through the loop ahead.
521
	 * The wh64 is issued for the starting destination address of trip +2
522
	 * through the loop, and if there are less than two trips left, the target
523
	 * address will be for the current trip.
	 * Each trip stores eight quadwords (64 bytes) and lowers $3 by 8.
524
	 */
525

526
$do_wh64_w:
527
	wh64	($4)		# L1 : memory subsystem write hint
528
	subq	$3, 24, $2	# E : For determining future wh64 addresses
529
	stq	$17, 0($5)	# L :
530
	nop			# E :
531

532
	addq	$5, 128, $4	# E : speculative target of next wh64
533
	stq	$17, 8($5)	# L :
534
	stq	$17, 16($5)	# L :
535
	addq	$5, 64, $7	# E : Fallback address for wh64 (== next trip addr)
536

537
	stq	$17, 24($5)	# L :
538
	stq	$17, 32($5)	# L :
539
	cmovlt	$2, $7, $4	# E : Latency 2, extra mapping cycle
540
	nop
541

542
	stq	$17, 40($5)	# L :
543
	stq	$17, 48($5)	# L :
544
	subq	$3, 16, $2	# E : Repeat the loop at least once more?
545
	nop
546

547
	stq	$17, 56($5)	# L :
548
	addq	$5, 64, $5	# E :
549
	subq	$3, 8, $3	# E :
550
	bge	$2, $do_wh64_w	# U :
551

552
	nop
553
	nop
554
	nop
555
	beq	$3, no_quad_w	# U : Might have finished already
556

557
.align 4
558
	/*
559
	 * Simple loop for trailing quadwords, or for small amounts
560
	 * of data (where we can't use an unrolled loop and wh64)
561
	 */
562
loop_w:
563
	stq $17,0($5)		# L :
564
	subq $3,1,$3		# E : Decrement number quads left
565
	addq $5,8,$5		# E : Inc address
566
	bne $3,loop_w		# U : more?
567

568
no_quad_w:
569
	/*
570
	 * Write 0..7 trailing bytes.
	 * $5 addresses the final quadword and $18 holds the trailing byte
	 * count; the old quadword is loaded, merged with the fill pattern
	 * using the low bits of the end address in $6, and stored back.
571
	 */
572
	nop			# E :
573
	beq $18,end_w		# U : All done?
574
	ldq $7,0($5)		# L :
575
	mskqh $7,$6,$2		# U : Mask final quad
576

577
	insqh $17,$6,$4		# U : New bits
578
	bis $2,$4,$1		# E : Put it all together
579
	stq $1,0($5)		# L : And back to memory
580
	ret $31,($26),1		# L0 :
581

582
within_quad_w:
	/*
	 * Reached when the start address ($16) and end address ($6) agree
	 * in every bit above the low three, i.e. the whole write lies in
	 * one quadword.  Done with a single load, merge and store.
	 */
583
	ldq_u $1,0($16)		# L :
584
	insql $17,$16,$2	# U : New bits
585
	mskql $1,$16,$4		# U : Clear old
586
	bis $2,$4,$2		# E : New result
587

588
	mskql $2,$6,$4		# U :
589
	mskqh $1,$6,$2		# U :
590
	bis $2,$4,$1		# E :
591
	stq_u $1,0($16)		# L :
592

593
end_w:
594
	nop
595
	nop
596
	nop
597
	ret $31,($26),1		# L0 :
598

599
	.end __memset16
600
	EXPORT_SYMBOL(__memset16)
601

602
/*
 * memset and __memset are defined as assembler aliases for ___memset,
 * so all three names refer to the same entry point; both aliases are
 * exported in addition to ___memset itself.
 */
memset = ___memset
603
__memset = ___memset
604
	EXPORT_SYMBOL(memset)
605
	EXPORT_SYMBOL(__memset)
606

607
Product

Resources

Company