CoCalc -- memcpy

GitHub Repository: awilliam/linux-vfio
Path: blob/master/arch/tile/lib/memcpy_32.S
¹⁷⁴⁶⁰ views
1
/*
2
 * Copyright 2010 Tilera Corporation. All Rights Reserved.
3
 *
4
 *   This program is free software; you can redistribute it and/or
5
 *   modify it under the terms of the GNU General Public License
6
 *   as published by the Free Software Foundation, version 2.
7
 *
8
 *   This program is distributed in the hope that it will be useful, but
9
 *   WITHOUT ANY WARRANTY; without even the implied warranty of
10
 *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
11
 *   NON INFRINGEMENT.  See the GNU General Public License for
12
 *   more details.
13
 */
14

15
#include <arch/chip.h>
16

17

18
/*
19
 * This file shares the implementation of the userspace memcpy and
20
 * the kernel's memcpy, copy_to_user and copy_from_user.
21
 */
22

23
#include <linux/linkage.h>
24

25
/* On TILE64, we wrap these functions via arch/tile/lib/memcpy_tile64.c */
26
#if !CHIP_HAS_COHERENT_LOCAL_CACHE()
27
#define memcpy __memcpy_asm
28
#define __copy_to_user_inatomic __copy_to_user_inatomic_asm
29
#define __copy_from_user_inatomic __copy_from_user_inatomic_asm
30
#define __copy_from_user_zeroing __copy_from_user_zeroing_asm
31
#endif
32

33
#define IS_MEMCPY	  0
34
#define IS_COPY_FROM_USER  1
35
#define IS_COPY_FROM_USER_ZEROING  2
36
#define IS_COPY_TO_USER   -1
37

38
	.section .text.memcpy_common, "ax"
39
	.align 64
40

41
/* Use this to preface each bundle that can cause an exception so
42
 * the kernel can clean up properly. The special cleanup code should
43
 * not use these, since it knows what it is doing.
44
 */
45
/*
 * Expansion: emits one __ex_table record made of two words -- the address
 * of the next local label "9" (9f) and the address of memcpy_common_fixup --
 * then returns to the previous section and leaves the bare token 9.
 * Writing "EX:" in front of a bundle therefore turns that token into the
 * label "9:" sitting on the bundle itself.
 */
#define EX \
46
	.pushsection __ex_table, "a"; \
47
	.word 9f, memcpy_common_fixup; \
48
	.popsection; \
49
	9
50

51

52
/* __copy_from_user_inatomic takes the kernel target address in r0,
53
 * the user source in r1, and the bytes to copy in r2.
54
 * It returns the number of uncopiable bytes (hopefully zero) in r0.
55
 */
56
ENTRY(__copy_from_user_inatomic)
57
.type __copy_from_user_inatomic, @function
58
	/* FEEDBACK_ENTER_EXPLICIT is not defined in this file.  It is handed
	 * the section name and the byte span from this entry point to
	 * .Lend_memcpy_common; presumably it registers that whole range for
	 * feedback-directed tooling -- TODO confirm against its definition.
	 */
	FEEDBACK_ENTER_EXPLICIT(__copy_from_user_inatomic, \
59
	  .text.memcpy_common, \
60
	  .Lend_memcpy_common - __copy_from_user_inatomic)
61
	/* Tag the request in r29 and jump to the shared implementation. */
	{ movei r29, IS_COPY_FROM_USER; j memcpy_common }
62
	.size __copy_from_user_inatomic, . - __copy_from_user_inatomic
63

64
/* __copy_from_user_zeroing is like __copy_from_user_inatomic, but
65
 * any uncopiable bytes are zeroed in the target.
66
 */
67
ENTRY(__copy_from_user_zeroing)
68
.type __copy_from_user_zeroing, @function
69
	FEEDBACK_REENTER(__copy_from_user_inatomic)
70
	/* r29 = IS_COPY_FROM_USER_ZEROING (2).  Its low bit is clear, which is
	 * what the fixup code tests (bbs r29) to decide that the remainder of
	 * the destination must be zero-filled after a fault.
	 */
	{ movei r29, IS_COPY_FROM_USER_ZEROING; j memcpy_common }
71
	.size __copy_from_user_zeroing, . - __copy_from_user_zeroing
72

73
/* __copy_to_user_inatomic takes the user target address in r0,
74
 * the kernel source in r1, and the bytes to copy in r2.
75
 * It returns the number of uncopiable bytes (hopefully zero) in r0.
76
 */
77
ENTRY(__copy_to_user_inatomic)
78
.type __copy_to_user_inatomic, @function
79
	FEEDBACK_REENTER(__copy_from_user_inatomic)
80
	/* r29 = IS_COPY_TO_USER (-1): the only negative IS_* value, which the
	 * fixup code detects with blzt to pick the store-fault recovery loop.
	 */
	{ movei r29, IS_COPY_TO_USER; j memcpy_common }
81
	.size __copy_to_user_inatomic, . - __copy_to_user_inatomic
82

83
/* memcpy takes the dest in r0, the source in r1 and the size in r2 (see
 * memcpy_common) and returns the original dest in r0.  When
 * !CHIP_HAS_COHERENT_LOCAL_CACHE() the #define near the top of the file
 * renames this symbol to __memcpy_asm.
 */
ENTRY(memcpy)
84
.type memcpy, @function
85
	FEEDBACK_REENTER(__copy_from_user_inatomic)
86
	/* r29 = IS_MEMCPY (0); no jump is needed because memcpy_common
	 * starts immediately after this bundle.
	 */
	{ movei r29, IS_MEMCPY }
87
	.size memcpy, . - memcpy
88
	/* Fall through */
89

90
	/* memcpy_common: shared body for memcpy and the three user-copy entry
	 * points above.
	 *
	 * Register roles established in this prologue:
	 *   r0  = running dest pointer      r23 = original dest
	 *   r1  = running source pointer    r24 = original source
	 *   r2  = bytes still to copy       r25 = original size
	 *   r27 = copy of lr                r29 = IS_* mode selector
	 * r23/r24/r25/r27 are what memcpy_common_fixup reads after a fault.
	 */
	.type memcpy_common, @function
91
memcpy_common:
92
	/* On entry, r29 holds one of the IS_* macro values from above. */
93

94

95
	/* r0 is the dest, r1 is the source, r2 is the size. */
96

97
	/* Save aside original dest so we can return it at the end. */
98
	/* The same bundle stores lr to the word at sp (see the "info 8" note
	 * after .Lcopy_line) and starts the alignment test: r4 = r0 | r1.
	 */
	{ sw sp, lr; move r23, r0; or r4, r0, r1 }
99

100
	/* Check for an empty size. */
101
	/* r4 becomes nonzero iff either pointer is not 4-byte aligned. */
	{ bz r2, .Ldone; andi r4, r4, 3 }
102

103
	/* Save aside original values in case of a fault. */
104
	{ move r24, r1; move r25, r2 }
105
	move r27, lr
106

107
	/* Check for an unaligned source or dest. */
108
	/* r4 is reused in the same bundle: r4 = r2 - 256 for the size test. */
	{ bnz r4, .Lcopy_unaligned_maybe_many; addli r4, r2, -256 }
109

110
.Lcheck_aligned_copy_size:
111
	/* If we are copying < 256 bytes, branch to simple case. */
112
	/* r8 = (r2 < 8) is the loop condition consumed at .Lcopy_8_check. */
	{ blzt r4, .Lcopy_8_check; slti_u r8, r2, 8 }
113

114
	/* Copying >= 256 bytes, so jump to complex prefetching loop. */
115
	/* r6 = r1 & 63 is the cache-line misalignment tested at .Lcopy_many. */
	{ andi r6, r1, 63; j .Lcopy_many }
116

117
/*
118
 *
119
 * Aligned 4 byte at a time copy loop
120
 *
121
 */
122

123
.Lcopy_8_loop:
124
	/* Copy two words at a time to hide load latency. */
125
	/* r8 = (r2 < 16) is evaluated before the 8 bytes below are subtracted,
	 * i.e. it is true when fewer than 8 bytes will remain afterwards.
	 */
EX:	{ lw r3, r1; addi r1, r1, 4; slti_u r8, r2, 16 }
126
EX:	{ lw r4, r1; addi r1, r1, 4 }
127
EX:	{ sw r0, r3; addi r0, r0, 4; addi r2, r2, -4 }
128
EX:	{ sw r0, r4; addi r0, r0, 4; addi r2, r2, -4 }
129
.Lcopy_8_check:
130
	/* Loop while at least 8 bytes remain; r4 = (r2 < 4) for the tail. */
	{ bzt r8, .Lcopy_8_loop; slti_u r4, r2, 4 }
131

132
	/* Copy odd leftover word, if any. */
133
	{ bnzt r4, .Lcheck_odd_stragglers }
134
EX:	{ lw r3, r1; addi r1, r1, 4 }
135
EX:	{ sw r0, r3; addi r0, r0, 4; addi r2, r2, -4 }
136

137
.Lcheck_odd_stragglers:
138
	/* Any bytes still left (fewer than 4) go through the byte loop. */
	{ bnz r2, .Lcopy_unaligned_few }
139

140
.Ldone:
141
	/* For memcpy return original dest address, else zero. */
142
	{ mz r0, r29, r23; jrp lr }
143

144

145
/*
146
 *
147
 * Prefetching multiple cache line copy handler (for large transfers).
148
 *
149
 */
150

151
	/* Copy words until r1 is cache-line-aligned. */
152
.Lalign_loop:
153
EX:	{ lw r3, r1; addi r1, r1, 4 }
154
	/* Recompute the misalignment of the advanced source pointer. */
	{ andi r6, r1, 63 }
155
EX:	{ sw r0, r3; addi r0, r0, 4; addi r2, r2, -4 }
156
.Lcopy_many:
157
	/* Entered with r6 = r1 & 63.  r9 = r0 + 63 is the first half of
	 * rounding the dest pointer up to a 64-byte boundary.
	 */
	{ bnzt r6, .Lalign_loop; addi r9, r0, 63 }
158

159
	/* r3 = address of the last word of source line 0;
	 * r9 = (r0 + 63) & -64.
	 */
	{ addi r3, r1, 60; andi r9, r9, -64 }
160

161
#if CHIP_HAS_WH64()
162
	/* No need to prefetch dst, we'll just do the wh64
163
	 * right before we copy a line.
164
	 */
165
#endif
166

167
	/* Prime r5, r6, r7 with the last word of source lines 0, 1 and 2
	 * (r3 steps by 64 after each load).  r4 = 1 makes the first
	 * "bz r4" test at .Lbig_loop2 fall through.
	 */
EX:	{ lw r5, r3; addi r3, r3, 64; movei r4, 1 }
168
	/* Intentionally stall for a few cycles to leave L2 cache alone. */
169
	/* The branch tests the zero register for nonzero, so it is never
	 * taken; r27 = lr is written again here (it was already set in the
	 * prologue).
	 */
	{ bnzt zero, .; move r27, lr }
170
EX:	{ lw r6, r3; addi r3, r3, 64 }
171
	/* Intentionally stall for a few cycles to leave L2 cache alone. */
172
	{ bnzt zero, . }
173
EX:	{ lw r7, r3; addi r3, r3, 64 }
174
#if !CHIP_HAS_WH64()
175
	/* Prefetch the dest */
176
	/* Intentionally stall for a few cycles to leave L2 cache alone. */
177
	{ bnzt zero, . }
178
	/* Use a real load to cause a TLB miss if necessary.  We aren't using
179
	 * r28, so this should be fine.
180
	 */
181
EX:	{ lw r28, r9; addi r9, r9, 64 }
182
	/* Intentionally stall for a few cycles to leave L2 cache alone. */
183
	{ bnzt zero, . }
184
	{ prefetch r9; addi r9, r9, 64 }
185
	/* Intentionally stall for a few cycles to leave L2 cache alone. */
186
	{ bnzt zero, . }
187
	{ prefetch r9; addi r9, r9, 64 }
188
#endif
189
	/* Intentionally stall for a few cycles to leave L2 cache alone. */
190
	/* Branch-if-zero on the zero register: always taken.  The first pass
	 * enters the loop at .Lbig_loop2, skipping the jal at .Lbig_loop.
	 */
	{ bz zero, .Lbig_loop2 }
191

192
	/* On entry to this loop:
193
	 * - r0 points to the start of dst line 0
194
	 * - r1 points to start of src line 0
195
	 * - r2 >= (256 - 60), only the first time the loop trips.
196
	 * - r3 contains r1 + 128 + 60    [pointer to end of source line 2]
197
	 *   This is our prefetch address. When we get near the end
198
	 *   rather than prefetching off the end this is changed to point
199
	 *   to some "safe" recently loaded address.
200
	 * - r5 contains *(r1 + 60)       [i.e. last word of source line 0]
201
	 * - r6 contains *(r1 + 64 + 60)  [i.e. last word of source line 1]
202
	 * - r9 contains ((r0 + 63) & -64)
203
	 *     [start of next dst cache line.]
204
	 */
205

206
	/* Structure: the body is unrolled three times, rotating r5, r6 and r7
	 * as the register that holds the pre-loaded last word (WORD_15) of the
	 * line about to be copied.  Each step loads WORD_0 into r16, leaves
	 * for .Lcopy_8_check when r4 == 0 (fewer than 64 bytes remain, per the
	 * .Lcopy_line exit contract), refills the rotated register from the
	 * prefetch address r3, and calls .Lcopy_line.  The third call is the
	 * jal at .Lbig_loop itself.
	 */
.Lbig_loop:
207
	/* Enters the subroutine at its second bundle; the r15 = r1 + r2 that
	 * .Lcopy_line's first bundle would compute is done here instead.
	 */
	{ jal .Lcopy_line2; add r15, r1, r2 }
208

209
.Lbig_loop2:
210
	/* Copy line 0, first stalling until r5 is ready. */
211
EX:	{ move r12, r5; lw r16, r1 }
212
	/* r8 = (r2 < 8) is the condition .Lcopy_8_check expects on arrival. */
	{ bz r4, .Lcopy_8_check; slti_u r8, r2, 8 }
213
	/* Prefetch several lines ahead. */
214
EX:	{ lw r5, r3; addi r3, r3, 64 }
215
	{ jal .Lcopy_line }
216

217
	/* Copy line 1, first stalling until r6 is ready. */
218
EX:	{ move r12, r6; lw r16, r1 }
219
	{ bz r4, .Lcopy_8_check; slti_u r8, r2, 8 }
220
	/* Prefetch several lines ahead. */
221
EX:	{ lw r6, r3; addi r3, r3, 64 }
222
	{ jal .Lcopy_line }
223

224
	/* Copy line 2, first stalling until r7 is ready. */
225
EX:	{ move r12, r7; lw r16, r1 }
226
	{ bz r4, .Lcopy_8_check; slti_u r8, r2, 8 }
227
	/* Prefetch several lines ahead. */
228
EX:	{ lw r7, r3; addi r3, r3, 64 }
229
	/* Use up a caches-busy cycle by jumping back to the top of the
230
	 * loop. Might as well get it out of the way now.
231
	 */
232
	{ j .Lbig_loop }
233

234

235
	/* On entry:
236
	 * - r0 points to the destination line.
237
	 * - r1 points to the source line.
238
	 * - r3 is the next prefetch address.
239
	 * - r9 holds the last address used for wh64.
240
	 * - r12 = WORD_15
241
	 * - r16 = WORD_0.
242
	 * - r17 == r1 + 16.
243
	 * - r27 holds saved lr to restore.
244
	 *
245
	 * On exit:
246
	 * - r0 is incremented by 64.
247
	 * - r1 is incremented by 64, unless that would point to a word
248
	 *   beyond the end of the source array, in which case it is redirected
249
	 *   to point to an arbitrary word already in the cache.
250
	 * - r2 is decremented by 64.
251
	 * - r3 is unchanged, unless it points to a word beyond the
252
	 *   end of the source array, in which case it is redirected
253
	 *   to point to an arbitrary word already in the cache.
254
	 *   Redirecting is OK since if we are that close to the end
255
	 *   of the array we will not come back to this subroutine
256
	 *   and use the contents of the prefetched address.
257
	 * - r4 is nonzero iff r2 >= 64.
258
	 * - r9 is incremented by 64, unless it points beyond the
259
	 *   end of the last full destination cache line, in which
260
	 *   case it is redirected to a "safe address" that can be
261
	 *   clobbered (sp - 64)
262
	 * - lr contains the value in r27.
263
	 */
264

265
/* r26 unused */
266

267
	/* NOTE(review): the "r17 == r1 + 16" entry condition listed above is
	 * not set up by the callers in .Lbig_loop; r17 is computed in the
	 * .Lcopy_line2 bundle below.
	 *
	 * Scratch usage visible in this routine: r4, r8, r10-r20.  r15 holds
	 * r1 + r2 (one past the end of the source) until it is reused for
	 * WORD_3; r20 becomes the source pointer for the next line and is
	 * copied into r1 after WORD_11 has been loaded.
	 */
.Lcopy_line:
268
	/* TODO: when r3 goes past the end, we would like to redirect it
269
	 * to prefetch the last partial cache line (if any) just once, for the
270
	 * benefit of the final cleanup loop. But we don't want to
271
	 * prefetch that line more than once, or subsequent prefetches
272
	 * will go into the RTF. But then .Lbig_loop should unconditionally
273
	 * branch to top of loop to execute final prefetch, and its
274
	 * nop should become a conditional branch.
275
	 */
276

277
	/* We need two non-memory cycles here to cover the resources
278
	 * used by the loads initiated by the caller.
279
	 */
280
	{ add r15, r1, r2 }
281
.Lcopy_line2:
282
	/* r13 = (prefetch address r3 is still inside the source);
	 * r17 = address of WORD_4.
	 */
	{ slt_u r13, r3, r15; addi r17, r1, 16 }
283

284
	/* NOTE: this will stall for one cycle as L1 is busy. */
285

286
	/* Fill second L1D line. */
287
	/* mvz: when r13 is zero (r3 ran past the end) r3 is redirected to r1,
	 * as described in the exit contract above.
	 */
EX:	{ lw r17, r17; addi r1, r1, 48; mvz r3, r13, r1 } /* r17 = WORD_4 */
288

289
#if CHIP_HAS_WH64()
290
	/* Prepare destination line for writing. */
291
EX:	{ wh64 r9; addi r9, r9, 64 }
292
#else
293
	/* Prefetch dest line */
294
	{ prefetch r9; addi r9, r9, 64 }
295
#endif
296
	/* Load seven words that are L1D hits to cover wh64 L2 usage. */
297

298
	/* Load the three remaining words from the last L1D line, which
299
	 * we know has already filled the L1D.
300
	 */
301
	/* r20 = r1 + 16 = start of the next source line; the following two
	 * bundles redirect it to r1 when it is not below r15 (end of source).
	 */
EX:	{ lw r4, r1;  addi r1, r1, 4;   addi r20, r1, 16 }   /* r4 = WORD_12 */
302
EX:	{ lw r8, r1;  addi r1, r1, 4;   slt_u r13, r20, r15 }/* r8 = WORD_13 */
303
EX:	{ lw r11, r1; addi r1, r1, -52; mvz r20, r13, r1 }  /* r11 = WORD_14 */
304

305
	/* Load the three remaining words from the first L1D line, first
306
	 * stalling until it has filled by "looking at" r16.
307
	 */
308
EX:	{ lw r13, r1; addi r1, r1, 4; move zero, r16 }   /* r13 = WORD_1 */
309
EX:	{ lw r14, r1; addi r1, r1, 4 }                   /* r14 = WORD_2 */
310
	/* r1 += 8 skips WORD_4 (already in r17); r10 = address of the last
	 * word of the destination line.
	 */
EX:	{ lw r15, r1; addi r1, r1, 8; addi r10, r0, 60 } /* r15 = WORD_3 */
311

312
	/* Load second word from the second L1D line, first
313
	 * stalling until it has filled by "looking at" r17.
314
	 */
315
EX:	{ lw r19, r1; addi r1, r1, 4; move zero, r17 }  /* r19 = WORD_5 */
316

317
	/* Store last word to the destination line, potentially dirtying it
318
	 * for the first time, which keeps the L2 busy for two cycles.
319
	 */
320
EX:	{ sw r10, r12 }                                 /* store(WORD_15) */
321

322
	/* Use two L1D hits to cover the sw L2 access above. */
323
EX:	{ lw r10, r1; addi r1, r1, 4 }                  /* r10 = WORD_6 */
324
EX:	{ lw r12, r1; addi r1, r1, 4 }                  /* r12 = WORD_7 */
325

326
	/* Fill third L1D line. */
327
EX:	{ lw r18, r1; addi r1, r1, 4 }                  /* r18 = WORD_8 */
328

329
	/* Store first L1D line. */
330
	/* Alongside the stores, r16 is recycled: (r0 + r2) & -64 is compared
	 * against r9 to decide whether r9 must be redirected to r13.
	 */
EX:	{ sw r0, r16; addi r0, r0, 4; add r16, r0, r2 } /* store(WORD_0) */
331
EX:	{ sw r0, r13; addi r0, r0, 4; andi r16, r16, -64 } /* store(WORD_1) */
332
EX:	{ sw r0, r14; addi r0, r0, 4; slt_u r16, r9, r16 } /* store(WORD_2) */
333
#if CHIP_HAS_WH64()
334
EX:	{ sw r0, r15; addi r0, r0, 4; addi r13, sp, -64 } /* store(WORD_3) */
335
#else
336
	/* Back up the r9 to a cache line we are already storing to
337
	 * if it gets past the end of the dest vector.  Strictly speaking,
338
	 * we don't need to back up to the start of a cache line, but it's free
339
	 * and tidy, so why not?
340
	 */
341
EX:	{ sw r0, r15; addi r0, r0, 4; andi r13, r0, -64 } /* store(WORD_3) */
342
#endif
343
	/* Store second L1D line. */
344
EX:	{ sw r0, r17; addi r0, r0, 4; mvz r9, r16, r13 }/* store(WORD_4) */
345
EX:	{ sw r0, r19; addi r0, r0, 4 }                  /* store(WORD_5) */
346
EX:	{ sw r0, r10; addi r0, r0, 4 }                  /* store(WORD_6) */
347
EX:	{ sw r0, r12; addi r0, r0, 4 }                  /* store(WORD_7) */
348

349
EX:	{ lw r13, r1; addi r1, r1, 4; move zero, r18 }  /* r13 = WORD_9 */
350
EX:	{ lw r14, r1; addi r1, r1, 4 }                  /* r14 = WORD_10 */
351
	/* Last load of this line; r1 now takes the next-line pointer r20. */
EX:	{ lw r15, r1; move r1, r20   }                  /* r15 = WORD_11 */
352

353
	/* Store third L1D line. */
354
EX:	{ sw r0, r18; addi r0, r0, 4 }                  /* store(WORD_8) */
355
EX:	{ sw r0, r13; addi r0, r0, 4 }                  /* store(WORD_9) */
356
EX:	{ sw r0, r14; addi r0, r0, 4 }                  /* store(WORD_10) */
357
EX:	{ sw r0, r15; addi r0, r0, 4 }                  /* store(WORD_11) */
358

359
	/* Store rest of fourth L1D line. */
360
EX:	{ sw r0, r4;  addi r0, r0, 4 }                  /* store(WORD_12) */
361
	{
362
EX:	sw r0, r8                                       /* store(WORD_13) */
363
	addi r0, r0, 4
364
	/* Will r2 be >= 64 after we subtract 64 below?
	 * (r2 >> 7 is nonzero exactly when r2 >= 128.)
	 */
365
	shri r4, r2, 7
366
	}
367
	/* r0 advances by 8 in the next bundle to step over the WORD_15 slot,
	 * which was already written through r10 above.
	 */
	{
368
EX:	sw r0, r11                                      /* store(WORD_14) */
369
	addi r0, r0, 8
370
	/* Record 64 bytes successfully copied. */
371
	addi r2, r2, -64
372
	}
373

374
	/* Return to the caller and reload lr from r27 in the same bundle. */
	{ jrp lr; move lr, r27 }
375

376
	/* Convey to the backtrace library that the stack frame is size
377
	 * zero, and the real return address is on the stack rather than
378
	 * in 'lr'.
379
	 */
380
	{ info 8 }
381

382
	/* Entry point for copies where r0 | r1 is not 4-byte aligned.
	 * Copies shorter than 20 bytes go straight to the byte loop; otherwise
	 * the destination is first brought to a 4-byte boundary one byte at a
	 * time.
	 */
	.align 64
383
.Lcopy_unaligned_maybe_many:
384
	/* Skip the setup overhead if we aren't copying many bytes. */
385
	/* r8 = (r2 < 20); r4 = -r0, masked in the next bundle. */
	{ slti_u r8, r2, 20; sub r4, zero, r0 }
386
	/* r4 = (-r0) & 3 = bytes needed to make the dest word-aligned. */
	{ bnzt r8, .Lcopy_unaligned_few; andi r4, r4, 3 }
387
	/* r18 = r1 + r2: one byte past the end of the source. */
	{ bz r4, .Ldest_is_word_aligned; add r18, r1, r2 }
388

389
/*
390
 *
391
 * unaligned 4 byte at a time copy handler.
392
 *
393
 */
394

395
	/* Copy single bytes until r0 == 0 mod 4, so we can store words. */
396
.Lalign_dest_loop:
397
EX:	{ lb_u r3, r1; addi r1, r1, 1; addi r4, r4, -1 }
398
EX:	{ sb r0, r3;   addi r0, r0, 1; addi r2, r2, -1 }
399
	/* r3 = r1 & 3: source misalignment once the dest is aligned. */
	{ bnzt r4, .Lalign_dest_loop; andi r3, r1, 3 }
400

401
	/* If source and dest are now *both* aligned, do an aligned copy. */
402
	/* r4 = r2 - 256 is the value .Lcheck_aligned_copy_size tests. */
	{ bz r3, .Lcheck_aligned_copy_size; addli r4, r2, -256 }
403

404
/* Word-aligned destination, misaligned source.  Two implementations are
 * selected at preprocessing time: one built on lwadd_na/dword_align (with
 * a cache-line-at-a-time inner loop) and a shift-and-OR fallback.
 */
.Ldest_is_word_aligned:
405

406
#if CHIP_HAS_DWORD_ALIGN()
407
	/* Load the first source word into r6 (r1 advances by 4); r8 = r0 & 63
	 * tells whether the dest is already cache-line aligned.
	 */
EX:	{ andi r8, r0, 63; lwadd_na r6, r1, 4}
408
	{ slti_u r9, r2, 64; bz r8, .Ldest_is_L2_line_aligned }
409

410
	/* This copies unaligned words until either there are fewer
411
	 * than 4 bytes left to copy, or until the destination pointer
412
	 * is cache-aligned, whichever comes first.
413
	 *
414
	 * On entry:
415
	 * - r0 is the next store address.
416
	 * - r1 points 4 bytes past the load address corresponding to r0.
417
	 * - r2 >= 4
418
	 * - r6 is the next aligned word loaded.
419
	 */
420
.Lcopy_unaligned_src_words:
421
	/* r8 = (r2 < 8): fewer than 4 bytes will remain after this word. */
EX:	{ lwadd_na r7, r1, 4; slti_u r8, r2, 4 + 4 }
422
	/* stall */
423
	/* r9 = (r2 < 68), i.e. (r2 < 64) once the 4 bytes below are counted. */
	{ dword_align r6, r7, r1; slti_u r9, r2, 64 + 4 }
424
EX:	{ swadd r0, r6, 4; addi r2, r2, -4 }
425
	{ bnz r8, .Lcleanup_unaligned_words; andi r8, r0, 63 }
426
	{ bnzt r8, .Lcopy_unaligned_src_words; move r6, r7 }
427

428
	/* On entry:
429
	 * - r0 is the next store address.
430
	 * - r1 points 4 bytes past the load address corresponding to r0.
431
	 * - r2 >= 4 (# of bytes left to store).
432
	 * - r6 is the next aligned src word value.
433
	 * - r9 = (r2 < 64U).
434
	 * - r18 points one byte past the end of source memory.
435
	 */
436
.Ldest_is_L2_line_aligned:
437

438
	{
439
	/* Not a full cache line remains. */
440
	bnz r9, .Lcleanup_unaligned_words
441
	move r7, r6
442
	}
443

444
	/* r2 >= 64 */
445

446
	/* Kick off two prefetches, but don't go past the end. */
447
	{ addi r3, r1, 63 - 4; addi r8, r1, 64 + 63 - 4 }
448
	{ prefetch r3; move r3, r8; slt_u r8, r8, r18 }
449
	{ mvz r3, r8, r1; addi r8, r3, 64 }
450
	{ prefetch r3; move r3, r8; slt_u r8, r8, r18 }
451
	/* r17 = 0 starts the two-pass counter of the half-line loop below. */
	{ mvz r3, r8, r1; movei r17, 0 }
452

453
.Lcopy_unaligned_line:
454
	/* Prefetch another line. */
455
	{ prefetch r3; addi r15, r1, 60; addi r3, r3, 64 }
456
	/* Fire off a load of the last word we are about to copy. */
457
EX:	{ lw_na r15, r15; slt_u r8, r3, r18 }
458

459
	/* Redirect the prefetch address to r1 when it is no longer below r18
	 * (end of source), and issue wh64 on the destination line.
	 */
EX:	{ mvz r3, r8, r1; wh64 r0 }
460

461
	/* This loop runs twice.
462
	 *
463
	 * On entry:
464
	 * - r17 is even before the first iteration, and odd before
465
	 *   the second.  It is incremented inside the loop.  Encountering
466
	 *   an even value at the end of the loop makes it stop.
467
	 */
468
.Lcopy_half_an_unaligned_line:
469
EX:	{
470
	/* Stall until the last byte is ready. In the steady state this
471
	 * guarantees all words to load below will be in the L2 cache, which
472
	 * avoids shunting the loads to the RTF.
473
	 */
474
	move zero, r15
475
	lwadd_na r7, r1, 16
476
	}
477
	/* The post-increments of the eight loads (16, 12, -24, 4, 4, 8, 4, 8)
	 * sum to 32, so r1 advances by half a cache line per pass while the
	 * loads are issued out of address order.
	 */
EX:	{ lwadd_na r11, r1, 12 }
478
EX:	{ lwadd_na r14, r1, -24 }
479
EX:	{ lwadd_na r8, r1, 4 }
480
EX:	{ lwadd_na r9, r1, 4 }
481
EX:	{
482
	lwadd_na r10, r1, 8
483
	/* r16 = (r2 < 64), after we subtract 32 from r2 below. */
484
	slti_u r16, r2, 64 + 32
485
	}
486
EX:	{ lwadd_na r12, r1, 4; addi r17, r17, 1 }
487
EX:	{ lwadd_na r13, r1, 8; dword_align r6, r7, r1 }
488
EX:	{ swadd r0, r6,  4; dword_align r7,  r8,  r1 }
489
EX:	{ swadd r0, r7,  4; dword_align r8,  r9,  r1 }
490
EX:	{ swadd r0, r8,  4; dword_align r9,  r10, r1 }
491
EX:	{ swadd r0, r9,  4; dword_align r10, r11, r1 }
492
EX:	{ swadd r0, r10, 4; dword_align r11, r12, r1 }
493
EX:	{ swadd r0, r11, 4; dword_align r12, r13, r1 }
494
EX:	{ swadd r0, r12, 4; dword_align r13, r14, r1 }
495
EX:	{ swadd r0, r13, 4; addi r2, r2, -32 }
496
	/* r14 carries over as the next pass's leading word (r6). */
	{ move r6, r14; bbst r17, .Lcopy_half_an_unaligned_line }
497

498
	/* Copy another full line unless fewer than 64 bytes remain. */
	{ bzt r16, .Lcopy_unaligned_line; move r7, r6 }
499

500
	/* On entry:
501
	 * - r0 is the next store address.
502
	 * - r1 points 4 bytes past the load address corresponding to r0.
503
	 * - r2 >= 0 (# of bytes left to store).
504
	 * - r7 is the next aligned src word value.
505
	 */
506
.Lcleanup_unaligned_words:
507
	/* Handle any trailing bytes. */
508
	{ bz r2, .Lcopy_unaligned_done; slti_u r8, r2, 4 }
509
	{ bzt r8, .Lcopy_unaligned_src_words; move r6, r7 }
510

511
	/* Move r1 back to the point where it corresponds to r0. */
512
	{ addi r1, r1, -4 }
513

514
#else /* !CHIP_HAS_DWORD_ALIGN() */
515

516
	/* Compute right/left shift counts and load initial source words. */
517
	/* r5 = r1 rounded down to a word boundary (aligned load pointer);
	 * r3 = (r1 & 3) * 8 after the shli below; r4 = -r3.
	 * NOTE(review): using -r3 as the left-shift count presumably relies on
	 * shl taking the count modulo 32 -- confirm against the ISA manual.
	 */
	{ andi r5, r1, -4; andi r3, r1, 3 }
518
EX:	{ lw r6, r5; addi r5, r5, 4; shli r3, r3, 3 }
519
EX:	{ lw r7, r5; addi r5, r5, 4; sub r4, zero, r3 }
520

521
	/* Load and store one word at a time, using shifts and ORs
522
	 * to correct for the misaligned src.
523
	 */
524
.Lcopy_unaligned_src_loop:
525
	{ shr r6, r6, r3; shl r8, r7, r4 }
526
EX:	{ lw r7, r5; or r8, r8, r6; move r6, r7 }
527
EX:	{ sw r0, r8; addi r0, r0, 4; addi r2, r2, -4 }
528
	{ addi r5, r5, 4; slti_u r8, r2, 8 }
529
	/* r1 is advanced in step with r0 so that it stays byte-accurate for
	 * the byte loop that follows.
	 */
	{ bzt r8, .Lcopy_unaligned_src_loop; addi r1, r1, 4 }
530

531
	{ bz r2, .Lcopy_unaligned_done }
532
#endif /* !CHIP_HAS_DWORD_ALIGN() */
533

534
	/* Fall through */
535

536
/*
537
 *
538
 * 1 byte at a time copy handler.
539
 *
540
 */
541

542
/* The loop tests r2 only after copying a byte, so it must be entered with
 * r2 != 0; every branch or fall-through into it in this file is preceded
 * by a check that r2 is nonzero.
 */
.Lcopy_unaligned_few:
543
EX:	{ lb_u r3, r1; addi r1, r1, 1 }
544
EX:	{ sb r0, r3;   addi r0, r0, 1; addi r2, r2, -1 }
545
	{ bnzt r2, .Lcopy_unaligned_few }
546

547
.Lcopy_unaligned_done:
548

549
	/* For memcpy return original dest address, else zero. */
550
	/* r29 == 0 (IS_MEMCPY) selects r23, the dest saved in the prologue. */
	{ mz r0, r29, r23; jrp lr }
551

552
/* End marker used for the memcpy_common size and for the range passed to
 * FEEDBACK_ENTER_EXPLICIT at __copy_from_user_inatomic.
 */
.Lend_memcpy_common:
553
	.size memcpy_common, .Lend_memcpy_common - memcpy_common
554

555
	/* memcpy_common_fixup: the handler address that every EX-prefixed
	 * bundle records in __ex_table.
	 *
	 * Inputs (saved by the memcpy_common prologue): r23 = original dest,
	 * r24 = original source, r25 = original size, r27 = saved lr,
	 * r29 = IS_* mode; r2 = bytes not yet copied.
	 * Result: the user-copy variants return the number of bytes left
	 * uncopied in r0; the memcpy variant returns the original dest.
	 */
	.section .fixup,"ax"
556
memcpy_common_fixup:
557
	.type memcpy_common_fixup, @function
558

559
	/* Skip any bytes we already successfully copied.
560
	 * r2 (num remaining) is correct, but r0 (dst) and r1 (src)
561
	 * may not be quite right because of unrolling and prefetching.
562
	 * So we need to recompute their values as the address just
563
	 * after the last byte we are sure was successfully loaded and
564
	 * then stored.
565
	 */
566

567
	/* Determine how many bytes we successfully copied. */
568
	{ sub r3, r25, r2 }
569

570
	/* Add this to the original r0 and r1 to get their new values. */
571
	{ add r0, r23, r3; add r1, r24, r3 }
572

573
	/* Dispatch on the mode: 0 = memcpy, negative = copy_to_user,
	 * otherwise (1 or 2) fall into the copy_from_user loop.
	 */
	{ bzt r29, memcpy_fixup_loop }
574
	{ blzt r29, copy_to_user_fixup_loop }
575

576
copy_from_user_fixup_loop:
577
	/* Try copying the rest one byte at a time, expecting a load fault. */
578
	/* .Lcfu labels the load; the __ex_table entry at the end of the file
	 * pairs it with .Lcopy_from_user_fixup_zero_remainder.
	 */
.Lcfu:	{ lb_u r3, r1; addi r1, r1, 1 }
579
	{ sb r0, r3; addi r0, r0, 1; addi r2, r2, -1 }
580
	{ bnzt r2, copy_from_user_fixup_loop }
581

582
.Lcopy_from_user_fixup_zero_remainder:
583
	{ bbs r29, 2f }  /* low bit set means IS_COPY_FROM_USER */
584
	/* byte-at-a-time loop faulted, so zero the rest. */
585
	/* r3 counts down the zero fill so that r2 keeps the return value. */
	{ move r3, r2; bz r2, 2f /* should be impossible, but handle it. */ }
586
1:      { sb r0, zero; addi r0, r0, 1; addi r3, r3, -1 }
587
	{ bnzt r3, 1b }
588
	/* Reload lr from r27 and return the count of uncopied bytes. */
2:	move lr, r27
589
	{ move r0, r2; jrp lr }
590

591
copy_to_user_fixup_loop:
592
	/* Try copying the rest one byte at a time, expecting a store fault. */
593
	{ lb_u r3, r1; addi r1, r1, 1 }
594
	/* .Lctu labels the store; the __ex_table entry at the end of the file
	 * pairs it with .Lcopy_to_user_fixup_done.
	 */
.Lctu:	{ sb r0, r3; addi r0, r0, 1; addi r2, r2, -1 }
595
	{ bnzt r2, copy_to_user_fixup_loop }
596
.Lcopy_to_user_fixup_done:
597
	move lr, r27
598
	{ move r0, r2; jrp lr }
599

600
memcpy_fixup_loop:
601
	/* Try copying the rest one byte at a time. We expect a disastrous
602
	 * fault to happen since we are in fixup code, but let it happen.
603
	 */
604
	{ lb_u r3, r1; addi r1, r1, 1 }
605
	{ sb r0, r3; addi r0, r0, 1; addi r2, r2, -1 }
606
	{ bnzt r2, memcpy_fixup_loop }
607
	/* This should be unreachable, we should have faulted again.
608
	 * But be paranoid and handle it in case some interrupt changed
609
	 * the TLB or something.
610
	 */
611
	move lr, r27
612
	{ move r0, r23; jrp lr }
613

614
	.size memcpy_common_fixup, . - memcpy_common_fixup
615

616
	/* Exception-table records for the two fixup byte loops above: each
	 * pairs a labelled instruction with the address to continue at.
	 */
	.section __ex_table,"a"
617
	.word .Lcfu, .Lcopy_from_user_fixup_zero_remainder
618
	.word .Lctu, .Lcopy_to_user_fixup_done
619

620
Product

Resources

Company