/* SPDX-License-Identifier: GPL-2.0 */
/*
 * arch/alpha/lib/ev6-copy_page.S
 *
 * Copy an entire page.
 */

/* The following comparison of this routine vs the normal copy_page.S
was written by an unnamed ev6 hardware designer and forwarded to me
via Steven Hobbs <[email protected]>.

First Problem: STQ overflows.
-----------------------------

It would be nice if EV6 handled every resource overflow efficiently,
but for some it doesn't.  Including store queue overflows.  It causes
a trap and a restart of the pipe.

To get around this we sometimes use (to borrow a term from a VSSAD
researcher) "aeration".  The idea is to slow the rate at which the
processor receives valid instructions by inserting nops in the fetch
path.  In doing so, you can prevent the overflow and actually make
the code run faster.  You can, of course, take advantage of the fact
that the processor can fetch at most 4 aligned instructions per cycle.

I inserted enough nops to force it to take 10 cycles to fetch the
loop code.  In theory, EV6 should be able to execute this loop in
9 cycles but I was not able to get it to run that fast -- the initial
conditions were such that I could not reach this optimum rate on
(chaotic) EV6.  I wrote the code such that everything would issue
in order.

Second Problem: Dcache index matches.
-------------------------------------

If you are going to use this routine on random aligned pages, there
is a 25% chance that the pages will be at the same dcache indices.
This results in many nasty memory traps without care.

The solution is to schedule the prefetches to avoid the memory
conflicts.  I schedule the wh64 prefetches farther ahead of the
read prefetches to avoid this problem.

Third Problem: Needs more prefetching.
--------------------------------------

In order to improve the code I added deeper prefetching to take the
most advantage of EV6's bandwidth.

I also prefetched the read stream.
Note that adding the read prefetch
forced me to add another cycle to the inner-most kernel - up to 11
from the original 8 cycles per iteration.  We could improve performance
further by unrolling the loop and doing multiple prefetches per cycle.

I think that the code below will be very robust and fast code for the
purposes of copying aligned pages.  It is slower when both source and
destination pages are in the dcache, but it is my guess that this is
less important than the dcache miss case.  */

#include <linux/export.h>

/*
 * copy_page
 *
 * Register usage, as established by the code below:
 *   $16      destination pointer (target of every stq and of the wh64
 *            write hints); advanced by 64 per cache line copied.
 *   $17      source pointer (base of every ldq and of the ldl-to-$31
 *            read prefetches); advanced by 64 per cache line copied.
 *   $18      loop counter: 118 for the main loop, then 10 for cleanup.
 *   $19      write-hint pointer, kept 10 cache lines ahead of $16.
 *   $0-$7    data temporaries, one 64-byte line per iteration.
 *            $1/$2 double as scratch addresses for the initial wh64s.
 *
 * Total copied: (118 + 10) lines * 64 bytes = 8192 bytes.
 *
 * The nop/unop padding and the grouping of instructions in fours are
 * part of the fetch schedule ("aeration"); keep the instruction order
 * and count intact when editing.
 */
	.text
	.align 4
	.global copy_page
	.ent copy_page
copy_page:
	.prologue 0

	/* Prefetch 5 read cachelines; write-hint 10 cache lines.  */
	wh64	($16)
	ldl	$31,0($17)
	ldl	$31,64($17)
	lda	$1,1*64($16)

	wh64	($1)
	ldl	$31,128($17)
	ldl	$31,192($17)
	lda	$1,2*64($16)

	wh64	($1)
	ldl	$31,256($17)
	lda	$18,118			/* main-loop iteration count */
	lda	$1,3*64($16)

	wh64	($1)
	nop
	lda	$1,4*64($16)
	lda	$2,5*64($16)

	wh64	($1)
	wh64	($2)
	lda	$1,6*64($16)
	lda	$2,7*64($16)

	wh64	($1)
	wh64	($2)
	lda	$1,8*64($16)
	lda	$2,9*64($16)

	wh64	($1)
	wh64	($2)
	lda	$19,10*64($16)		/* next line to write-hint */
	nop

	/* Main prefetching/write-hinting loop.  Each pass copies one
	   64-byte line, prefetches the source line 5 lines ahead and
	   write-hints the destination line 10 lines ahead.  */
1:	ldq	$0,0($17)
	ldq	$1,8($17)
	unop
	unop

	unop
	unop
	ldq	$2,16($17)
	ldq	$3,24($17)

	ldq	$4,32($17)
	ldq	$5,40($17)
	unop
	unop

	unop
	unop
	ldq	$6,48($17)
	ldq	$7,56($17)

	ldl	$31,320($17)		/* read prefetch, 5 lines ahead */
	unop
	unop
	unop

	/* This gives the extra cycle of aeration above the minimum.  */
	unop
	unop
	unop
	unop

	wh64	($19)
	unop
	unop
	unop

	stq	$0,0($16)
	subq	$18,1,$18
	stq	$1,8($16)
	unop

	unop
	stq	$2,16($16)
	addq	$17,64,$17
	stq	$3,24($16)

	stq	$4,32($16)
	stq	$5,40($16)
	addq	$19,64,$19
	unop

	stq	$6,48($16)
	stq	$7,56($16)
	addq	$16,64,$16
	bne	$18, 1b

	/* Prefetch the final 5 cache lines of the read stream.  */
	lda	$18,10			/* cleanup-loop iteration count */
	ldl	$31,320($17)
	ldl	$31,384($17)
	ldl	$31,448($17)

	ldl	$31,512($17)
	ldl	$31,576($17)
	nop
	nop

	/* Non-prefetching, non-write-hinting cleanup loop for the
	   final 10 cache lines.  */
2:	ldq	$0,0($17)
	ldq	$1,8($17)
	ldq	$2,16($17)
	ldq	$3,24($17)

	ldq	$4,32($17)
	ldq	$5,40($17)
	ldq	$6,48($17)
	ldq	$7,56($17)

	stq	$0,0($16)
	subq	$18,1,$18
	stq	$1,8($16)
	addq	$17,64,$17

	stq	$2,16($16)
	stq	$3,24($16)
	stq	$4,32($16)
	stq	$5,40($16)

	stq	$6,48($16)
	stq	$7,56($16)
	addq	$16,64,$16
	bne	$18, 2b

	ret
	nop
	unop
	nop

	.end copy_page
	EXPORT_SYMBOL(copy_page)