/* Source: GitHub repository awilliam/linux-vfio, path arch/alpha/lib/ev6-csum_ipv6_magic.S (master) */
/*
2
* arch/alpha/lib/ev6-csum_ipv6_magic.S
3
* 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
4
*
5
* unsigned short csum_ipv6_magic(struct in6_addr *saddr,
6
* struct in6_addr *daddr,
7
* __u32 len,
8
* unsigned short proto,
9
* unsigned int csum);
10
*
11
* Much of the information about 21264 scheduling/coding comes from:
12
* Compiler Writer's Guide for the Alpha 21264
13
* abbreviated as 'CWG' in other comments here
14
* ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
15
* Scheduling notation:
16
* E - either cluster
17
* U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
18
* L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
19
* Try not to change the actual algorithm if possible for consistency.
20
* Determining actual stalls (other than slotting) doesn't appear to be easy to do.
21
*
22
* unsigned short csum_ipv6_magic(struct in6_addr *saddr,
23
* struct in6_addr *daddr,
24
* __u32 len,
25
* unsigned short proto,
26
* unsigned int csum);
27
*
28
* Swap <proto> (takes form 0xaabb)
29
* Then shift it left by 48, so result is:
30
* 0xbbaa0000 00000000
31
* Then turn it back into a sign extended 32-bit item
32
* 0xbbaa0000
33
*
34
* Swap <len> (an unsigned int) using Mike Burrows' 7-instruction sequence
35
* (we can't hide the 3-cycle latency of the unpkbw in the 6-instruction sequence)
36
* Assume input takes form 0xAABBCCDD
37
*
38
* Finally, original 'folding' approach is to split the long into 4 unsigned shorts
39
* add 4 ushorts, resulting in ushort/carry
40
* add carry bits + ushort --> ushort
41
* add carry bits + ushort --> ushort (in case the carry results in an overflow)
42
* Truncate to a ushort. (took 13 instructions)
43
* From doing some testing, using the approach in checksum.c:from64to16()
44
* results in the same outcome:
45
* split into 2 uints, add those, generating a ulong
46
* add the 3 low ushorts together, generating a uint
47
* a final add of the 2 lower ushorts
48
* truncating the result.
49
*
50
* Misalignment handling added by Ivan Kokshaysky <ink@jurassic.park.msu.ru>
51
* The cost is 16 instructions (~8 cycles), including two extra loads which
52
* may cause additional delay in rare cases (load-load replay traps).
53
*/
/*
 * csum_ipv6_magic: sum two 16-byte addresses, a byte-swapped length, a
 * byte-swapped protocol and an incoming checksum, fold the 64-bit total
 * down to 16 bits with end-around carry, and return its complement.
 *
 * In:     $16 = saddr  (16 bytes, any alignment; read only)
 *         $17 = daddr  (16 bytes, any alignment; read only)
 *         $18 = len    (32 bits, 0xAABBCCDD -> 0xDDCCBBAA before summing)
 *         $19 = proto  (16 bits, 0xaabb -> sign-extended 0xbbaa0000)
 *         $20 = csum   (only the low 32 bits are used)
 * Out:    $0  = ~sum, low 16 bits, upper bits cleared
 * Writes: $0-$7, $18-$20, $22, $23.  No stack frame, no stores.
 *
 * Address loads: each 16-byte address is fetched with three ldq_u loads
 * (offsets 0, 8 and 15) and reassembled with extql/extqh pairs.  For an
 * aligned pointer the byte offset in $6 is 0, and extqh with offset 0
 * passes its source through unshifted:
 *   - word 1 takes its high part from the quadword at offset 8, which
 *     would be the wrong data, so cmoveq forces that part to zero;
 *   - word 2 takes its high part from the load at offset 15, which for
 *     an aligned pointer is the same quadword as offset 8, so the OR is
 *     harmless and no cmoveq is needed.
 *
 * proto: built as (proto << 24) + (proto << 8) with a 32-bit addl, then
 * the low two bytes are zapped, leaving <sign bits>bbaa0000.
 *
 * Summation: every addq into $20 is followed by a cmpult that captures the
 * carry out of 64 bits; the carries are summed separately and added back
 * in before folding.
 *
 * The instruction order follows 21264 issue slotting (four per group, see
 * the E/U/L annotations).  Do not reorder without re-checking the slotting.
 */
	.globl csum_ipv6_magic
	.align 4
	.ent csum_ipv6_magic
	.frame $30,0,$26,0
csum_ipv6_magic:
	.prologue 0

	ldq_u	$0,0($16)	# L : Latency: 3
	inslh	$18,7,$4	# U : 0000000000AABBCC
	ldq_u	$1,8($16)	# L : Latency: 3
	sll	$19,8,$7	# U : U L U L : 0x00000000 00aabb00

	and	$16,7,$6	# E : src misalignment
	ldq_u	$5,15($16)	# L : Latency: 3
	zapnot	$20,15,$20	# U : zero extend incoming csum
	ldq_u	$2,0($17)	# L : U L U L : Latency: 3

	extql	$0,$6,$0	# U : low part of 1st src word
	extqh	$1,$6,$22	# U : high part of 1st src word
	ldq_u	$3,8($17)	# L : Latency: 3
	sll	$19,24,$19	# U : U U L U : 0x000000aa bb000000

	cmoveq	$6,$31,$22	# E : src aligned? then no high part
	ldq_u	$23,15($17)	# L : Latency: 3
	inswl	$18,3,$18	# U : 000000CCDD000000
	addl	$19,$7,$19	# E : U L U L : <sign bits>bbaabb00

	or	$0,$22,$0	# E : 1st src word complete
	extql	$1,$6,$1	# U : low part of 2nd src word
	or	$18,$4,$18	# E : 000000CCDDAABBCC
	extqh	$5,$6,$5	# U : L U L U : high part of 2nd src word

	and	$17,7,$6	# E : dst misalignment ($6 reused from here on)
	extql	$2,$6,$2	# U : low part of 1st dst word
	or	$1,$5,$1	# E : 2nd src word complete
	extqh	$3,$6,$22	# U : L U L U : high part of 1st dst word

	cmoveq	$6,$31,$22	# E : dst aligned? then no high part
	extql	$3,$6,$3	# U : low part of 2nd dst word
	addq	$20,$0,$20	# E : begin summing the words
	extqh	$23,$6,$23	# U : L U L U : high part of 2nd dst word

	srl	$18,16,$4	# U : 0000000000CCDDAA
	or	$2,$22,$2	# E : 1st dst word complete
	zap	$19,0x3,$19	# U : <sign bits>bbaa0000
	or	$3,$23,$3	# E : U L U L : 2nd dst word complete

	cmpult	$20,$0,$0	# E : carry out of csum + src[0]
	addq	$20,$1,$20	# E :
	zapnot	$18,0xa,$18	# U : 00000000DD00BB00
	zap	$4,0xa,$4	# U : U U L L : 0000000000CC00AA

	or	$18,$4,$18	# E : 00000000DDCCBBAA (len byte-swapped)
	nop			# E :
	cmpult	$20,$1,$1	# E : carry out of + src[1]
	addq	$20,$2,$20	# E : U L U L

	cmpult	$20,$2,$2	# E : carry out of + dst[0]
	addq	$20,$3,$20	# E :
	cmpult	$20,$3,$3	# E : (1 cycle stall on $20)
	addq	$20,$18,$20	# E : U L U L (1 cycle stall on $20)

	cmpult	$20,$18,$18	# E : carry out of + len
	addq	$20,$19,$20	# E : (1 cycle stall on $20)
	addq	$0,$1,$0	# E : merge the carries back into the csum
	addq	$2,$3,$2	# E :

	cmpult	$20,$19,$19	# E : carry out of + proto
	addq	$18,$19,$18	# E : (1 cycle stall on $19)
	addq	$0,$2,$0	# E :
	addq	$20,$18,$20	# E : U L U L :
	/* (1 cycle stall on $18, 2 cycles on $20) */

	addq	$0,$20,$0	# E : sum + all remaining carries
	zapnot	$0,15,$1	# U : Start folding output (1 cycle stall on $0)
	nop			# E :
	srl	$0,32,$0	# U : U L U L : (1 cycle stall on $0)

	addq	$1,$0,$1	# E : Finished generating ulong
	extwl	$1,2,$2		# U : ushort[1] (1 cycle stall on $1)
	zapnot	$1,3,$0		# U : ushort[0] (1 cycle stall on $1)
	extwl	$1,4,$1		# U : ushort[2] (1 cycle stall on $1)

	addq	$0,$2,$0	# E
	addq	$0,$1,$3	# E : Finished generating uint
	/* (1 cycle stall on $0) */
	extwl	$3,2,$1		# U : ushort[1] (1 cycle stall on $3)
	nop			# E : L U L U

	addq	$1,$3,$0	# E : Final carry
	not	$0,$4		# E : complement (1 cycle stall on $0)
	zapnot	$4,3,$0		# U : clear upper garbage bits
	/* (1 cycle stall on $4) */
	ret			# L0 : L U L U

	.end csum_ipv6_magic