/*
 * McKinley-optimized version of copy_page().
 *
 * Copyright (C) 2002 Hewlett-Packard Co
 *	David Mosberger <[email protected]>
 *
 * Inputs:
 *	in0:	address of target page
 *	in1:	address of source page
 * Output:
 *	no return value
 *
 * General idea:
 *	- use regular loads and stores to prefetch data to avoid consuming M-slot just for
 *	  lfetches => good for in-cache performance
 *	- avoid l2 bank-conflicts by not storing into the same 16-byte bank within a single
 *	  cycle
 *
 * Principle of operation:
 *	First, note that L1 has a line-size of 64 bytes and L2 a line-size of 128 bytes.
 *	To avoid secondary misses in L2, we prefetch both source and destination with a line-size
 *	of 128 bytes.  When both of these lines are in the L2 and the first half of the
 *	source line is in L1, we start copying the remaining words.  The second half of the
 *	source line is prefetched in an earlier iteration, so that by the time we start
 *	accessing it, it's also present in the L1.
 *
 *	We use a software-pipelined loop to control the overall operation.  The pipeline
 *	has 2*PREFETCH_DIST+K stages.  The first PREFETCH_DIST stages are used for prefetching
 *	source cache-lines.  The second PREFETCH_DIST stages are used for prefetching destination
 *	cache-lines, the last K stages are used to copy the cache-line words not copied by
 *	the prefetches.  The four relevant points in the pipeline are called A, B, C, D:
 *	p[A] is TRUE if a source-line should be prefetched, p[B] is TRUE if a destination-line
 *	should be prefetched, p[C] is TRUE if the second half of an L2 line should be brought
 *	into L1D and p[D] is TRUE if a cacheline needs to be copied.
 *
 *	This all sounds very complicated, but thanks to the modulo-scheduled loop support,
 *	the resulting code is very regular and quite easy to follow (once you get the idea).
 *
 *	As a secondary optimization, the first 2*PREFETCH_DIST iterations are implemented
 *	as the separate .prefetch_loop.  Logically, this loop performs exactly like the
 *	main-loop (.line_copy), but has all known-to-be-predicated-off instructions removed,
 *	so that each loop iteration is faster (again, good for cached case).
 *
 *	When reading the code, it helps to keep the following picture in mind:
 *
 *	       word 0 word 1
 *	      +------+------+---
 *	      | v[x] |  t1  | ^
 *	      | t2   |  t3  | |
 *	      | t4   |  t5  | |
 *	      | t6   |  t7  | | 128 bytes
 *	      | n[y] |  t9  | | (L2 cache line)
 *	      | t10  |  t11 | |
 *	      | t12  |  t13 | |
 *	      | t14  |  t15 | v
 *	      +------+------+---
 *
 *	Here, v[x] is copied by the (memory) prefetch.  n[y] is loaded at p[C]
 *	to fetch the second-half of the L2 cache line into L1, and the tX words are copied in
 *	an order that avoids bank conflicts.
 */
#include <asm/asmmacro.h>
#include <asm/page.h>

#define PREFETCH_DIST	8		// McKinley sustains 16 outstanding L2 misses (8 ld, 8 st)

// Register aliases for the copy engine.  src0/src1 and dst0/dst1 walk word 0
// and word 1 of each 16-byte bank (see picture above); *_pre_mem/*_pre_l2 are
// the prefetch pointers that run 128 bytes (one L2 line) per iteration.
#define src0		r2
#define src1		r3
#define dst0		r9
#define dst1		r10
#define src_pre_mem	r11
#define dst_pre_mem	r14
#define src_pre_l2	r15
#define dst_pre_l2	r16
#define t1		r17
#define t2		r18
#define t3		r19
#define t4		r20
// The tX aliases below deliberately reuse registers: each value is stored
// before its register is reloaded, so the live ranges never overlap.
#define t5		t1	// alias!
#define t6		t2	// alias!
#define t7		t3	// alias!
#define t9		t5	// alias!
#define t10		t4	// alias!
#define t11		t7	// alias!
#define t12		t6	// alias!
#define t14		t10	// alias!
#define t13		r21
#define t15		r22

#define saved_lc	r23
#define saved_pr	r24

// Pipeline-stage indices (see "Principle of operation" above):
//   A = src memory prefetch, B = dst memory prefetch,
//   C = src L2->L1 fetch,    D = copy stage.
#define	A	0
#define B	(PREFETCH_DIST)
#define C	(B + PREFETCH_DIST)
#define D	(C + 3)
#define N	(D + 1)
#define Nrot	((N + 7) & ~7)		// rotating-register count, rounded up to a multiple of 8

GLOBAL_ENTRY(copy_page)
	.prologue
	alloc r8 = ar.pfs, 2, Nrot-2, 0, Nrot

	// v[] carries src-prefetch data from stage A to stage B;
	// n[] carries the L2->L1 fetched word from stage C to stage D.
	.rotr v[2*PREFETCH_DIST], n[D-C+1]
	.rotp p[N]

	.save ar.lc, saved_lc
	mov saved_lc = ar.lc
	.save pr, saved_pr
	mov saved_pr = pr
	.body

	mov src_pre_mem = in1
	mov pr.rot = 0x10000
	mov ar.ec = 1				// special unrolled loop

	mov dst_pre_mem = in0
	mov ar.lc = 2*PREFETCH_DIST - 1

	add src_pre_l2 = 8*8, in1
	add dst_pre_l2 = 8*8, in0
	add src0 = 8, in1			// first t1 src
	add src1 = 3*8, in1			// first t3 src
	add dst0 = 8, in0			// first t1 dst
	add dst1 = 3*8, in0			// first t3 dst
	mov t1 = (PAGE_SIZE/128) - (2*PREFETCH_DIST) - 1
	nop.m 0
	nop.i 0
	;;
	// same as .line_copy loop, but with all predicated-off instructions removed:
.prefetch_loop:
(p[A])	ld8 v[A] = [src_pre_mem], 128		// M0
(p[B])	st8 [dst_pre_mem] = v[B], 128		// M2
	br.ctop.sptk .prefetch_loop
	;;
	cmp.eq p16, p0 = r0, r0			// reset p16 to 1 (br.ctop cleared it to zero)
	mov ar.lc = t1				// with 64KB pages, t1 is too big to fit in 8 bits!
	mov ar.ec = N				// # of stages in pipeline
	;;
.line_copy:
(p[D])	ld8 t2 = [src0], 3*8			// M0
(p[D])	ld8 t4 = [src1], 3*8			// M1
(p[B])	st8 [dst_pre_mem] = v[B], 128		// M2 prefetch dst from memory
(p[D])	st8 [dst_pre_l2] = n[D-C], 128		// M3 prefetch dst from L2
	;;
(p[A])	ld8 v[A] = [src_pre_mem], 128		// M0 prefetch src from memory
(p[C])	ld8 n[0] = [src_pre_l2], 128		// M1 prefetch src from L2
(p[D])	st8 [dst0] = t1, 8			// M2
(p[D])	st8 [dst1] = t3, 8			// M3
	;;
(p[D])	ld8 t5 = [src0], 8
(p[D])	ld8 t7 = [src1], 3*8
(p[D])	st8 [dst0] = t2, 3*8
(p[D])	st8 [dst1] = t4, 3*8
	;;
(p[D])	ld8 t6 = [src0], 3*8
(p[D])	ld8 t10 = [src1], 8
(p[D])	st8 [dst0] = t5, 8
(p[D])	st8 [dst1] = t7, 3*8
	;;
(p[D])	ld8 t9 = [src0], 3*8
(p[D])	ld8 t11 = [src1], 3*8
(p[D])	st8 [dst0] = t6, 3*8
(p[D])	st8 [dst1] = t10, 8
	;;
(p[D])	ld8 t12 = [src0], 8
(p[D])	ld8 t14 = [src1], 8
(p[D])	st8 [dst0] = t9, 3*8
(p[D])	st8 [dst1] = t11, 3*8
	;;
(p[D])	ld8 t13 = [src0], 4*8
(p[D])	ld8 t15 = [src1], 4*8
(p[D])	st8 [dst0] = t12, 8
(p[D])	st8 [dst1] = t14, 8
	;;
	// t1/t3 are loaded one stage early (p[D-1]) so they are ready for the
	// stores at the top of the next iteration's p[D] stage.
(p[D-1])ld8  t1 = [src0], 8
(p[D-1])ld8  t3 = [src1], 8
(p[D])	st8 [dst0] = t13, 4*8
(p[D])	st8 [dst1] = t15, 4*8
	br.ctop.sptk .line_copy
	;;
	mov ar.lc = saved_lc
	mov pr = saved_pr, -1
	br.ret.sptk.many rp
END(copy_page)