GitHub Repository: tpruvot/cpuminer-multi
Path: blob/linux/asm/sha2-x64.S
/*
 * Copyright 2012-2013 [email protected]
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version. See COPYING for more details.
 */

#include <cpuminer-config.h>

#if defined(__linux__) && defined(__ELF__)
	.section .note.GNU-stack,"",%progbits
#endif

#if defined(USE_ASM) && defined(__x86_64__)
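/*
 * Overview: the remainder of this file implements SIMD SHA-256 kernels —
 * 4-way SSE2/AVX/XOP code and, under USE_AVX2, 8-way AVX2 variants.  Each
 * 32-bit vector lane carries an independent hash, which is why the data
 * tables below replicate every constant across all lanes.
 */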
17
18
.data
19
.p2align 7
20
sha256_4h:
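/* SHA-256 initial state H0..H7 (FIPS 180-4), each word repeated in all four
   32-bit SSE2 lanes so that four message streams share one xmm register. */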
21
.long 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667
22
.long 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85
23
.long 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372
24
.long 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a
25
.long 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f
26
.long 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c
27
.long 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab
28
.long 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19
29
30
.data
31
.p2align 7
32
sha256_4k:
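/* The 64 SHA-256 round constants K[0..63], likewise broadcast across the
   four lanes; indexed as 16*i from %rcx in the main-round macros below. */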
33
.long 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98
34
.long 0x71374491, 0x71374491, 0x71374491, 0x71374491
35
.long 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf
36
.long 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5
37
.long 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b
38
.long 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1
39
.long 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4
40
.long 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5
41
.long 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98
42
.long 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01
43
.long 0x243185be, 0x243185be, 0x243185be, 0x243185be
44
.long 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3
45
.long 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74
46
.long 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe
47
.long 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7
48
.long 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174
49
.long 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1
50
.long 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786
51
.long 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6
52
.long 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc
53
.long 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f
54
.long 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa
55
.long 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc
56
.long 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da
57
.long 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152
58
.long 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d
59
.long 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8
60
.long 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7
61
.long 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3
62
.long 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147
63
.long 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351
64
.long 0x14292967, 0x14292967, 0x14292967, 0x14292967
65
.long 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85
66
.long 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138
67
.long 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc
68
.long 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13
69
.long 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354
70
.long 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb
71
.long 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e
72
.long 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85
73
.long 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1
74
.long 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b
75
.long 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70
76
.long 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3
77
.long 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819
78
.long 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624
79
.long 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585
80
.long 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070
81
.long 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116
82
.long 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08
83
.long 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c
84
.long 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5
85
.long 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3
86
.long 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a
87
.long 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f
88
.long 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3
89
.long 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee
90
.long 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f
91
.long 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814
92
.long 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208
93
.long 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa
94
.long 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb
95
.long 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7
96
.long 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2
97
98
.data
99
.p2align 6
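/* Precomputed terms for the fixed SHA-256d padding, used by sha256d_ms_4way
   below; the numeric suffix matches the message-schedule index (W[17], W[23],
   W[24], W[30]) each constant feeds.  (Interpretation inferred from how the
   constants are used in the sha256d_ms code further down.) */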
100
sha256d_4preext2_17:
101
.long 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000
102
sha256d_4preext2_23:
103
.long 0x11002000, 0x11002000, 0x11002000, 0x11002000
104
sha256d_4preext2_24:
105
.long 0x80000000, 0x80000000, 0x80000000, 0x80000000
106
sha256d_4preext2_30:
107
.long 0x00400022, 0x00400022, 0x00400022, 0x00400022
108
109
110
#ifdef USE_AVX2
111
112
.data
113
.p2align 7
114
sha256_8h:
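/* AVX2 counterparts of the tables above: the same initial state and, below,
   the same round constants, replicated across the eight 32-bit lanes of a
   ymm register. */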
115
.long 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667
116
.long 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85
117
.long 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372
118
.long 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a
119
.long 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f
120
.long 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c
121
.long 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab
122
.long 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19
123
124
.data
125
.p2align 7
126
sha256_8k:
127
.long 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98
128
.long 0x71374491, 0x71374491, 0x71374491, 0x71374491, 0x71374491, 0x71374491, 0x71374491, 0x71374491
129
.long 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf
130
.long 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5
131
.long 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b
132
.long 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1
133
.long 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4
134
.long 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5
135
.long 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98
136
.long 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01
137
.long 0x243185be, 0x243185be, 0x243185be, 0x243185be, 0x243185be, 0x243185be, 0x243185be, 0x243185be
138
.long 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3
139
.long 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74
140
.long 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe
141
.long 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7
142
.long 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174
143
.long 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1
144
.long 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786
145
.long 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6
146
.long 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc
147
.long 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f
148
.long 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa
149
.long 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc
150
.long 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da
151
.long 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152
152
.long 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d
153
.long 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8
154
.long 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7
155
.long 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3
156
.long 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147
157
.long 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351
158
.long 0x14292967, 0x14292967, 0x14292967, 0x14292967, 0x14292967, 0x14292967, 0x14292967, 0x14292967
159
.long 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85
160
.long 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138
161
.long 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc
162
.long 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13
163
.long 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354
164
.long 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb
165
.long 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e
166
.long 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85
167
.long 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1
168
.long 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b
169
.long 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70
170
.long 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3
171
.long 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819
172
.long 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624
173
.long 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585
174
.long 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070
175
.long 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116
176
.long 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08
177
.long 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c
178
.long 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5
179
.long 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3
180
.long 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a
181
.long 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f
182
.long 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3
183
.long 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee
184
.long 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f
185
.long 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814
186
.long 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208
187
.long 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa
188
.long 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb
189
.long 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7
190
.long 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2
191
192
.data
193
.p2align 6
194
sha256d_8preext2_17:
195
.long 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000
196
sha256d_8preext2_23:
197
.long 0x11002000, 0x11002000, 0x11002000, 0x11002000, 0x11002000, 0x11002000, 0x11002000, 0x11002000
198
sha256d_8preext2_24:
199
.long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000
200
sha256d_8preext2_30:
201
.long 0x00400022, 0x00400022, 0x00400022, 0x00400022, 0x00400022, 0x00400022, 0x00400022, 0x00400022
202
203
#endif /* USE_AVX2 */
204
205
206
.text
207
.p2align 6
208
.globl sha256_init_4way
209
.globl _sha256_init_4way
210
sha256_init_4way:
211
_sha256_init_4way:
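/* Copies the broadcast initial state into the caller's 4-way state buffer
   (a single pointer argument; the C prototype lives in the accompanying
   sha2 code).  On Win64/Cygwin the argument arrives in %rcx and is moved to
   %rdi so the SysV body can be shared. */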
212
#if defined(_WIN64) || defined(__CYGWIN__)
213
pushq %rdi
214
movq %rcx, %rdi
215
#endif
216
movdqa sha256_4h+0(%rip), %xmm0
217
movdqa sha256_4h+16(%rip), %xmm1
218
movdqa sha256_4h+32(%rip), %xmm2
219
movdqa sha256_4h+48(%rip), %xmm3
220
movdqu %xmm0, 0(%rdi)
221
movdqu %xmm1, 16(%rdi)
222
movdqu %xmm2, 32(%rdi)
223
movdqu %xmm3, 48(%rdi)
224
movdqa sha256_4h+64(%rip), %xmm0
225
movdqa sha256_4h+80(%rip), %xmm1
226
movdqa sha256_4h+96(%rip), %xmm2
227
movdqa sha256_4h+112(%rip), %xmm3
228
movdqu %xmm0, 64(%rdi)
229
movdqu %xmm1, 80(%rdi)
230
movdqu %xmm2, 96(%rdi)
231
movdqu %xmm3, 112(%rdi)
232
#if defined(_WIN64) || defined(__CYGWIN__)
233
popq %rdi
234
#endif
235
ret
236
237
238
#ifdef USE_AVX2
239
.text
240
.p2align 6
241
.globl sha256_init_8way
242
.globl _sha256_init_8way
243
sha256_init_8way:
244
_sha256_init_8way:
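/* 8-way version of the routine above; vpbroadcastd re-broadcasts single
   dwords out of the 4-way table, so no separate load of sha256_8h is needed. */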
245
#if defined(_WIN64) || defined(__CYGWIN__)
246
pushq %rdi
247
movq %rcx, %rdi
248
#endif
249
vpbroadcastd sha256_4h+0(%rip), %ymm0
250
vpbroadcastd sha256_4h+16(%rip), %ymm1
251
vpbroadcastd sha256_4h+32(%rip), %ymm2
252
vpbroadcastd sha256_4h+48(%rip), %ymm3
253
vmovdqu %ymm0, 0*32(%rdi)
254
vmovdqu %ymm1, 1*32(%rdi)
255
vmovdqu %ymm2, 2*32(%rdi)
256
vmovdqu %ymm3, 3*32(%rdi)
257
vpbroadcastd sha256_4h+64(%rip), %ymm0
258
vpbroadcastd sha256_4h+80(%rip), %ymm1
259
vpbroadcastd sha256_4h+96(%rip), %ymm2
260
vpbroadcastd sha256_4h+112(%rip), %ymm3
261
vmovdqu %ymm0, 4*32(%rdi)
262
vmovdqu %ymm1, 5*32(%rdi)
263
vmovdqu %ymm2, 6*32(%rdi)
264
vmovdqu %ymm3, 7*32(%rdi)
265
#if defined(_WIN64) || defined(__CYGWIN__)
266
popq %rdi
267
#endif
268
ret
269
#endif /* USE_AVX2 */
270
271
272
.macro sha256_sse2_extend_round i
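/* One message-schedule step: W[i] = s1(W[i-2]) + W[i-7] + s0(W[i-15]) + W[i-16],
   with s0(x) = ror(x,7) ^ ror(x,18) ^ (x >> 3) and
        s1(x) = ror(x,17) ^ ror(x,19) ^ (x >> 10).
   SSE2 has no vector rotate, so each rotate is assembled from paired shifts
   and xors.  %rax points at W[] (16 bytes per entry); %xmm3 holds W[i-2] on
   entry and W[i] on exit. */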
273
movdqa (\i-15)*16(%rax), %xmm0
274
movdqa %xmm0, %xmm2
275
psrld $3, %xmm0
276
movdqa %xmm0, %xmm1
277
pslld $14, %xmm2
278
psrld $4, %xmm1
279
pxor %xmm1, %xmm0
280
pxor %xmm2, %xmm0
281
psrld $11, %xmm1
282
pslld $11, %xmm2
283
pxor %xmm1, %xmm0
284
pxor %xmm2, %xmm0
285
paddd (\i-16)*16(%rax), %xmm0
286
paddd (\i-7)*16(%rax), %xmm0
287
288
movdqa %xmm3, %xmm2
289
psrld $10, %xmm3
290
pslld $13, %xmm2
291
movdqa %xmm3, %xmm1
292
psrld $7, %xmm1
293
pxor %xmm1, %xmm3
294
pxor %xmm2, %xmm3
295
psrld $2, %xmm1
296
pslld $2, %xmm2
297
pxor %xmm1, %xmm3
298
pxor %xmm2, %xmm3
299
paddd %xmm0, %xmm3
300
movdqa %xmm3, \i*16(%rax)
301
.endm
302
303
.macro sha256_sse2_extend_doubleround i
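/* Two schedule steps (W[i] and W[i+1]) interleaved to hide latency;
   %xmm3/%xmm7 carry W[i-2]/W[i-1] in and W[i]/W[i+1] out. */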
304
movdqa (\i-15)*16(%rax), %xmm0
305
movdqa (\i-14)*16(%rax), %xmm4
306
movdqa %xmm0, %xmm2
307
movdqa %xmm4, %xmm6
308
psrld $3, %xmm0
309
psrld $3, %xmm4
310
movdqa %xmm0, %xmm1
311
movdqa %xmm4, %xmm5
312
pslld $14, %xmm2
313
pslld $14, %xmm6
314
psrld $4, %xmm1
315
psrld $4, %xmm5
316
pxor %xmm1, %xmm0
317
pxor %xmm5, %xmm4
318
psrld $11, %xmm1
319
psrld $11, %xmm5
320
pxor %xmm2, %xmm0
321
pxor %xmm6, %xmm4
322
pslld $11, %xmm2
323
pslld $11, %xmm6
324
pxor %xmm1, %xmm0
325
pxor %xmm5, %xmm4
326
pxor %xmm2, %xmm0
327
pxor %xmm6, %xmm4
328
329
paddd (\i-16)*16(%rax), %xmm0
330
paddd (\i-15)*16(%rax), %xmm4
331
332
movdqa %xmm3, %xmm2
333
movdqa %xmm7, %xmm6
334
psrld $10, %xmm3
335
psrld $10, %xmm7
336
movdqa %xmm3, %xmm1
337
movdqa %xmm7, %xmm5
338
pslld $13, %xmm2
339
pslld $13, %xmm6
340
psrld $7, %xmm1
341
psrld $7, %xmm5
342
343
paddd (\i-7)*16(%rax), %xmm0
344
paddd (\i-6)*16(%rax), %xmm4
345
346
pxor %xmm1, %xmm3
347
pxor %xmm5, %xmm7
348
psrld $2, %xmm1
349
psrld $2, %xmm5
350
pxor %xmm2, %xmm3
351
pxor %xmm6, %xmm7
352
pslld $2, %xmm2
353
pslld $2, %xmm6
354
pxor %xmm1, %xmm3
355
pxor %xmm5, %xmm7
356
pxor %xmm2, %xmm3
357
pxor %xmm6, %xmm7
358
359
paddd %xmm0, %xmm3
360
paddd %xmm4, %xmm7
361
movdqa %xmm3, \i*16(%rax)
362
movdqa %xmm7, (\i+1)*16(%rax)
363
.endm
364
365
.macro sha256_sse2_main_round i
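/* One compression round: t1 = h + S1(e) + Ch(e,f,g) + K[i] + W[i]; then
   d += t1 and the new a = t1 + S0(a) + Maj(a,b,c), where
   S1(x) = ror(x,6) ^ ror(x,11) ^ ror(x,25) and
   S0(x) = ror(x,2) ^ ror(x,13) ^ ror(x,22).
   Register roles here: a..e in %xmm7, %xmm5, %xmm4, %xmm3, %xmm0, while
   f, g, h rotate through 0/16/32(%rsp); W[] is at %rax, K[] at %rcx. */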
366
movdqa 16*(\i)(%rax), %xmm6
367
368
movdqa %xmm0, %xmm1
369
movdqa 16(%rsp), %xmm2
370
pandn %xmm2, %xmm1
371
paddd 32(%rsp), %xmm6
372
373
movdqa %xmm2, 32(%rsp)
374
movdqa 0(%rsp), %xmm2
375
movdqa %xmm2, 16(%rsp)
376
377
pand %xmm0, %xmm2
378
pxor %xmm2, %xmm1
379
movdqa %xmm0, 0(%rsp)
380
381
paddd %xmm1, %xmm6
382
383
movdqa %xmm0, %xmm1
384
psrld $6, %xmm0
385
paddd 16*(\i)(%rcx), %xmm6
386
movdqa %xmm0, %xmm2
387
pslld $7, %xmm1
388
psrld $5, %xmm2
389
pxor %xmm1, %xmm0
390
pxor %xmm2, %xmm0
391
pslld $14, %xmm1
392
psrld $14, %xmm2
393
pxor %xmm1, %xmm0
394
pslld $5, %xmm1
395
pxor %xmm2, %xmm0
396
pxor %xmm1, %xmm0
397
movdqa %xmm5, %xmm1
398
paddd %xmm0, %xmm6
399
400
movdqa %xmm3, %xmm0
401
movdqa %xmm4, %xmm3
402
movdqa %xmm4, %xmm2
403
paddd %xmm6, %xmm0
404
pand %xmm5, %xmm2
405
pand %xmm7, %xmm1
406
pand %xmm7, %xmm4
407
pxor %xmm4, %xmm1
408
movdqa %xmm5, %xmm4
409
movdqa %xmm7, %xmm5
410
pxor %xmm2, %xmm1
411
paddd %xmm1, %xmm6
412
413
movdqa %xmm7, %xmm2
414
psrld $2, %xmm7
415
movdqa %xmm7, %xmm1
416
pslld $10, %xmm2
417
psrld $11, %xmm1
418
pxor %xmm2, %xmm7
419
pslld $9, %xmm2
420
pxor %xmm1, %xmm7
421
psrld $9, %xmm1
422
pxor %xmm2, %xmm7
423
pslld $11, %xmm2
424
pxor %xmm1, %xmm7
425
pxor %xmm2, %xmm7
426
paddd %xmm6, %xmm7
427
.endm
428
429
.macro sha256_sse2_main_quadround i
430
sha256_sse2_main_round \i+0
431
sha256_sse2_main_round \i+1
432
sha256_sse2_main_round \i+2
433
sha256_sse2_main_round \i+3
434
.endm
435
436
437
#if defined(USE_AVX)
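/* The AVX macros below compute the same schedule and round functions as the
   SSE2 ones, but the non-destructive three-operand forms remove most
   register-to-register copies; in the main rounds the working variables are
   renamed by rotating the macro arguments (see sha256_avx_main_quadround)
   rather than by moving data. */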
438
439
.macro sha256_avx_extend_round i
440
vmovdqa (\i-15)*16(%rax), %xmm0
441
vpslld $14, %xmm0, %xmm2
442
vpsrld $3, %xmm0, %xmm0
443
vpsrld $4, %xmm0, %xmm1
444
vpxor %xmm1, %xmm0, %xmm0
445
vpxor %xmm2, %xmm0, %xmm0
446
vpsrld $11, %xmm1, %xmm1
447
vpslld $11, %xmm2, %xmm2
448
vpxor %xmm1, %xmm0, %xmm0
449
vpxor %xmm2, %xmm0, %xmm0
450
vpaddd (\i-16)*16(%rax), %xmm0, %xmm0
451
vpaddd (\i-7)*16(%rax), %xmm0, %xmm0
452
453
vpslld $13, %xmm3, %xmm2
454
vpsrld $10, %xmm3, %xmm3
455
vpsrld $7, %xmm3, %xmm1
456
vpxor %xmm1, %xmm3, %xmm3
457
vpxor %xmm2, %xmm3, %xmm3
458
vpsrld $2, %xmm1, %xmm1
459
vpslld $2, %xmm2, %xmm2
460
vpxor %xmm1, %xmm3, %xmm3
461
vpxor %xmm2, %xmm3, %xmm3
462
vpaddd %xmm0, %xmm3, %xmm3
463
vmovdqa %xmm3, \i*16(%rax)
464
.endm
465
466
.macro sha256_avx_extend_doubleround i
467
vmovdqa (\i-15)*16(%rax), %xmm0
468
vmovdqa (\i-14)*16(%rax), %xmm4
469
vpslld $14, %xmm0, %xmm2
470
vpslld $14, %xmm4, %xmm6
471
vpsrld $3, %xmm0, %xmm8
472
vpsrld $3, %xmm4, %xmm4
473
vpsrld $7, %xmm0, %xmm1
474
vpsrld $4, %xmm4, %xmm5
475
vpxor %xmm1, %xmm8, %xmm8
476
vpxor %xmm5, %xmm4, %xmm4
477
vpsrld $11, %xmm1, %xmm1
478
vpsrld $11, %xmm5, %xmm5
479
vpxor %xmm2, %xmm8, %xmm8
480
vpxor %xmm6, %xmm4, %xmm4
481
vpslld $11, %xmm2, %xmm2
482
vpslld $11, %xmm6, %xmm6
483
vpxor %xmm1, %xmm8, %xmm8
484
vpxor %xmm5, %xmm4, %xmm4
485
vpxor %xmm2, %xmm8, %xmm8
486
vpxor %xmm6, %xmm4, %xmm4
487
488
vpaddd %xmm0, %xmm4, %xmm4
489
vpaddd (\i-16)*16(%rax), %xmm8, %xmm0
490
491
vpslld $13, %xmm3, %xmm2
492
vpslld $13, %xmm7, %xmm6
493
vpsrld $10, %xmm3, %xmm3
494
vpsrld $10, %xmm7, %xmm7
495
496
vpaddd (\i-7)*16(%rax), %xmm0, %xmm0
497
vpaddd (\i-6)*16(%rax), %xmm4, %xmm4
498
499
vpsrld $7, %xmm3, %xmm1
500
vpsrld $7, %xmm7, %xmm5
501
vpxor %xmm1, %xmm3, %xmm3
502
vpxor %xmm5, %xmm7, %xmm7
503
vpsrld $2, %xmm1, %xmm1
504
vpsrld $2, %xmm5, %xmm5
505
vpxor %xmm2, %xmm3, %xmm3
506
vpxor %xmm6, %xmm7, %xmm7
507
vpslld $2, %xmm2, %xmm2
508
vpslld $2, %xmm6, %xmm6
509
vpxor %xmm1, %xmm3, %xmm3
510
vpxor %xmm5, %xmm7, %xmm7
511
vpxor %xmm2, %xmm3, %xmm3
512
vpxor %xmm6, %xmm7, %xmm7
513
514
vpaddd %xmm0, %xmm3, %xmm3
515
vpaddd %xmm4, %xmm7, %xmm7
516
vmovdqa %xmm3, \i*16(%rax)
517
vmovdqa %xmm7, (\i+1)*16(%rax)
518
.endm
519
520
.macro sha256_avx_main_round i, r0, r1, r2, r3, r4, r5, r6, r7
521
vpaddd 16*(\i)(%rax), \r0, %xmm6
522
vpaddd 16*(\i)(%rcx), %xmm6, %xmm6
523
524
vpandn \r1, \r3, %xmm1
525
vpand \r3, \r2, %xmm2
526
vpxor %xmm2, %xmm1, %xmm1
527
vpaddd %xmm1, %xmm6, %xmm6
528
529
vpslld $7, \r3, %xmm1
530
vpsrld $6, \r3, \r0
531
vpsrld $5, \r0, %xmm2
532
vpxor %xmm1, \r0, \r0
533
vpxor %xmm2, \r0, \r0
534
vpslld $14, %xmm1, %xmm1
535
vpsrld $14, %xmm2, %xmm2
536
vpxor %xmm1, \r0, \r0
537
vpxor %xmm2, \r0, \r0
538
vpslld $5, %xmm1, %xmm1
539
vpxor %xmm1, \r0, \r0
540
vpaddd \r0, %xmm6, %xmm6
541
vpaddd %xmm6, \r4, \r0
542
543
vpand \r6, \r5, %xmm2
544
vpand \r7, \r5, \r4
545
vpand \r7, \r6, %xmm1
546
vpxor \r4, %xmm1, %xmm1
547
vpxor %xmm2, %xmm1, %xmm1
548
vpaddd %xmm1, %xmm6, %xmm6
549
550
vpslld $10, \r7, %xmm2
551
vpsrld $2, \r7, \r4
552
vpsrld $11, \r4, %xmm1
553
vpxor %xmm2, \r4, \r4
554
vpxor %xmm1, \r4, \r4
555
vpslld $9, %xmm2, %xmm2
556
vpsrld $9, %xmm1, %xmm1
557
vpxor %xmm2, \r4, \r4
558
vpxor %xmm1, \r4, \r4
559
vpslld $11, %xmm2, %xmm2
560
vpxor %xmm2, \r4, \r4
561
vpaddd %xmm6, \r4, \r4
562
.endm
563
564
.macro sha256_avx_main_quadround i
565
sha256_avx_main_round \i+0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7
566
sha256_avx_main_round \i+1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3
567
sha256_avx_main_round \i+2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4
568
sha256_avx_main_round \i+3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5
569
.endm
570
571
#endif /* USE_AVX */
572
573
574
#if defined(USE_AVX2)
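/* AVX2: identical structure to the AVX path, widened to 256-bit ymm
   registers, so eight independent hashes are processed per call and W[]
   entries are 32 bytes apart. */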
575
576
.macro sha256_avx2_extend_round i
577
vmovdqa (\i-15)*32(%rax), %ymm0
578
vpslld $14, %ymm0, %ymm2
579
vpsrld $3, %ymm0, %ymm0
580
vpsrld $4, %ymm0, %ymm1
581
vpxor %ymm1, %ymm0, %ymm0
582
vpxor %ymm2, %ymm0, %ymm0
583
vpsrld $11, %ymm1, %ymm1
584
vpslld $11, %ymm2, %ymm2
585
vpxor %ymm1, %ymm0, %ymm0
586
vpxor %ymm2, %ymm0, %ymm0
587
vpaddd (\i-16)*32(%rax), %ymm0, %ymm0
588
vpaddd (\i-7)*32(%rax), %ymm0, %ymm0
589
590
vpslld $13, %ymm3, %ymm2
591
vpsrld $10, %ymm3, %ymm3
592
vpsrld $7, %ymm3, %ymm1
593
vpxor %ymm1, %ymm3, %ymm3
594
vpxor %ymm2, %ymm3, %ymm3
595
vpsrld $2, %ymm1, %ymm1
596
vpslld $2, %ymm2, %ymm2
597
vpxor %ymm1, %ymm3, %ymm3
598
vpxor %ymm2, %ymm3, %ymm3
599
vpaddd %ymm0, %ymm3, %ymm3
600
vmovdqa %ymm3, \i*32(%rax)
601
.endm
602
603
.macro sha256_avx2_extend_doubleround i
604
vmovdqa (\i-15)*32(%rax), %ymm0
605
vmovdqa (\i-14)*32(%rax), %ymm4
606
vpslld $14, %ymm0, %ymm2
607
vpslld $14, %ymm4, %ymm6
608
vpsrld $3, %ymm0, %ymm8
609
vpsrld $3, %ymm4, %ymm4
610
vpsrld $7, %ymm0, %ymm1
611
vpsrld $4, %ymm4, %ymm5
612
vpxor %ymm1, %ymm8, %ymm8
613
vpxor %ymm5, %ymm4, %ymm4
614
vpsrld $11, %ymm1, %ymm1
615
vpsrld $11, %ymm5, %ymm5
616
vpxor %ymm2, %ymm8, %ymm8
617
vpxor %ymm6, %ymm4, %ymm4
618
vpslld $11, %ymm2, %ymm2
619
vpslld $11, %ymm6, %ymm6
620
vpxor %ymm1, %ymm8, %ymm8
621
vpxor %ymm5, %ymm4, %ymm4
622
vpxor %ymm2, %ymm8, %ymm8
623
vpxor %ymm6, %ymm4, %ymm4
624
625
vpaddd %ymm0, %ymm4, %ymm4
626
vpaddd (\i-16)*32(%rax), %ymm8, %ymm0
627
628
vpslld $13, %ymm3, %ymm2
629
vpslld $13, %ymm7, %ymm6
630
vpsrld $10, %ymm3, %ymm3
631
vpsrld $10, %ymm7, %ymm7
632
633
vpaddd (\i-7)*32(%rax), %ymm0, %ymm0
634
vpaddd (\i-6)*32(%rax), %ymm4, %ymm4
635
636
vpsrld $7, %ymm3, %ymm1
637
vpsrld $7, %ymm7, %ymm5
638
vpxor %ymm1, %ymm3, %ymm3
639
vpxor %ymm5, %ymm7, %ymm7
640
vpsrld $2, %ymm1, %ymm1
641
vpsrld $2, %ymm5, %ymm5
642
vpxor %ymm2, %ymm3, %ymm3
643
vpxor %ymm6, %ymm7, %ymm7
644
vpslld $2, %ymm2, %ymm2
645
vpslld $2, %ymm6, %ymm6
646
vpxor %ymm1, %ymm3, %ymm3
647
vpxor %ymm5, %ymm7, %ymm7
648
vpxor %ymm2, %ymm3, %ymm3
649
vpxor %ymm6, %ymm7, %ymm7
650
651
vpaddd %ymm0, %ymm3, %ymm3
652
vpaddd %ymm4, %ymm7, %ymm7
653
vmovdqa %ymm3, \i*32(%rax)
654
vmovdqa %ymm7, (\i+1)*32(%rax)
655
.endm
656
657
.macro sha256_avx2_main_round i, r0, r1, r2, r3, r4, r5, r6, r7
658
vpaddd 32*(\i)(%rax), \r0, %ymm6
659
vpaddd 32*(\i)(%rcx), %ymm6, %ymm6
660
661
vpandn \r1, \r3, %ymm1
662
vpand \r3, \r2, %ymm2
663
vpxor %ymm2, %ymm1, %ymm1
664
vpaddd %ymm1, %ymm6, %ymm6
665
666
vpslld $7, \r3, %ymm1
667
vpsrld $6, \r3, \r0
668
vpsrld $5, \r0, %ymm2
669
vpxor %ymm1, \r0, \r0
670
vpxor %ymm2, \r0, \r0
671
vpslld $14, %ymm1, %ymm1
672
vpsrld $14, %ymm2, %ymm2
673
vpxor %ymm1, \r0, \r0
674
vpxor %ymm2, \r0, \r0
675
vpslld $5, %ymm1, %ymm1
676
vpxor %ymm1, \r0, \r0
677
vpaddd \r0, %ymm6, %ymm6
678
vpaddd %ymm6, \r4, \r0
679
680
vpand \r6, \r5, %ymm2
681
vpand \r7, \r5, \r4
682
vpand \r7, \r6, %ymm1
683
vpxor \r4, %ymm1, %ymm1
684
vpxor %ymm2, %ymm1, %ymm1
685
vpaddd %ymm1, %ymm6, %ymm6
686
687
vpslld $10, \r7, %ymm2
688
vpsrld $2, \r7, \r4
689
vpsrld $11, \r4, %ymm1
690
vpxor %ymm2, \r4, \r4
691
vpxor %ymm1, \r4, \r4
692
vpslld $9, %ymm2, %ymm2
693
vpsrld $9, %ymm1, %ymm1
694
vpxor %ymm2, \r4, \r4
695
vpxor %ymm1, \r4, \r4
696
vpslld $11, %ymm2, %ymm2
697
vpxor %ymm2, \r4, \r4
698
vpaddd %ymm6, \r4, \r4
699
.endm
700
701
.macro sha256_avx2_main_quadround i
702
sha256_avx2_main_round \i+0, %ymm10, %ymm9, %ymm8, %ymm0, %ymm3, %ymm4, %ymm5, %ymm7
703
sha256_avx2_main_round \i+1, %ymm9, %ymm8, %ymm0, %ymm10, %ymm4, %ymm5, %ymm7, %ymm3
704
sha256_avx2_main_round \i+2, %ymm8, %ymm0, %ymm10, %ymm9, %ymm5, %ymm7, %ymm3, %ymm4
705
sha256_avx2_main_round \i+3, %ymm0, %ymm10, %ymm9, %ymm8, %ymm7, %ymm3, %ymm4, %ymm5
706
.endm
707
708
#endif /* USE_AVX2 */
709
710
711
#if defined(USE_XOP)
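/* AMD XOP provides a per-lane rotate (vprotd), so the sigma/Sigma functions
   collapse to a few rotate/xor instructions instead of the paired-shift
   emulation used above. */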
712
713
.macro sha256_xop_extend_round i
714
vmovdqa (\i-15)*16(%rax), %xmm0
715
vprotd $25, %xmm0, %xmm1
716
vprotd $14, %xmm0, %xmm2
717
vpsrld $3, %xmm0, %xmm0
718
vpxor %xmm1, %xmm2, %xmm2
719
vpxor %xmm2, %xmm0, %xmm0
720
721
vpaddd (\i-16)*16(%rax), %xmm0, %xmm0
722
vpaddd (\i-7)*16(%rax), %xmm0, %xmm0
723
724
vprotd $15, %xmm3, %xmm1
725
vprotd $13, %xmm3, %xmm2
726
vpsrld $10, %xmm3, %xmm3
727
vpxor %xmm1, %xmm2, %xmm2
728
vpxor %xmm2, %xmm3, %xmm3
729
vpaddd %xmm0, %xmm3, %xmm3
730
vmovdqa %xmm3, \i*16(%rax)
731
.endm
732
733
.macro sha256_xop_extend_doubleround i
734
vmovdqa (\i-15)*16(%rax), %xmm0
735
vmovdqa (\i-14)*16(%rax), %xmm4
736
vprotd $25, %xmm0, %xmm1
737
vprotd $25, %xmm4, %xmm5
738
vprotd $14, %xmm0, %xmm2
739
vprotd $14, %xmm4, %xmm6
740
vpxor %xmm1, %xmm2, %xmm2
741
vpxor %xmm5, %xmm6, %xmm6
742
vpsrld $3, %xmm0, %xmm0
743
vpsrld $3, %xmm4, %xmm4
744
vpxor %xmm2, %xmm0, %xmm0
745
vpxor %xmm6, %xmm4, %xmm4
746
747
vpaddd (\i-16)*16(%rax), %xmm0, %xmm0
748
vpaddd (\i-15)*16(%rax), %xmm4, %xmm4
749
750
vprotd $15, %xmm3, %xmm1
751
vprotd $15, %xmm7, %xmm5
752
vprotd $13, %xmm3, %xmm2
753
vprotd $13, %xmm7, %xmm6
754
vpxor %xmm1, %xmm2, %xmm2
755
vpxor %xmm5, %xmm6, %xmm6
756
757
vpaddd (\i-7)*16(%rax), %xmm0, %xmm0
758
vpaddd (\i-6)*16(%rax), %xmm4, %xmm4
759
760
vpsrld $10, %xmm3, %xmm3
761
vpsrld $10, %xmm7, %xmm7
762
vpxor %xmm2, %xmm3, %xmm3
763
vpxor %xmm6, %xmm7, %xmm7
764
765
vpaddd %xmm0, %xmm3, %xmm3
766
vpaddd %xmm4, %xmm7, %xmm7
767
vmovdqa %xmm3, \i*16(%rax)
768
vmovdqa %xmm7, (\i+1)*16(%rax)
769
.endm
770
771
.macro sha256_xop_main_round i, r0, r1, r2, r3, r4, r5, r6, r7
772
vpaddd 16*(\i)(%rax), \r0, %xmm6
773
vpaddd 16*(\i)(%rcx), %xmm6, %xmm6
774
775
vpandn \r1, \r3, %xmm1
776
vpand \r3, \r2, %xmm2
777
vpxor %xmm2, %xmm1, %xmm1
778
vpaddd %xmm1, %xmm6, %xmm6
779
780
vprotd $26, \r3, %xmm1
781
vprotd $21, \r3, %xmm2
782
vpxor %xmm1, %xmm2, %xmm2
783
vprotd $7, \r3, \r0
784
vpxor %xmm2, \r0, \r0
785
vpaddd \r0, %xmm6, %xmm6
786
vpaddd %xmm6, \r4, \r0
787
788
vpand \r6, \r5, %xmm2
789
vpand \r7, \r5, \r4
790
vpand \r7, \r6, %xmm1
791
vpxor \r4, %xmm1, %xmm1
792
vpxor %xmm2, %xmm1, %xmm1
793
vpaddd %xmm1, %xmm6, %xmm6
794
795
vprotd $30, \r7, %xmm1
796
vprotd $19, \r7, %xmm2
797
vpxor %xmm1, %xmm2, %xmm2
798
vprotd $10, \r7, \r4
799
vpxor %xmm2, \r4, \r4
800
vpaddd %xmm6, \r4, \r4
801
.endm
802
803
.macro sha256_xop_main_quadround i
804
sha256_xop_main_round \i+0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7
805
sha256_xop_main_round \i+1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3
806
sha256_xop_main_round \i+2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4
807
sha256_xop_main_round \i+3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5
808
.endm
809
810
#endif /* USE_XOP */
811
812
813
.text
814
.p2align 6
815
sha256_transform_4way_core_sse2:
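/* SSE2 core: first extend the 16 input words on the stack to the full
   64-entry schedule (the loop below produces two W entries per iteration),
   then run the 64 compression rounds against sha256_4k and jump to the
   shared feed-forward code at sha256_transform_4way_finish. */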
816
leaq 256(%rsp), %rcx
817
leaq 48*16(%rcx), %rax
818
movdqa -2*16(%rcx), %xmm3
819
movdqa -1*16(%rcx), %xmm7
820
sha256_transform_4way_sse2_extend_loop:
821
movdqa -15*16(%rcx), %xmm0
822
movdqa -14*16(%rcx), %xmm4
823
movdqa %xmm0, %xmm2
824
movdqa %xmm4, %xmm6
825
psrld $3, %xmm0
826
psrld $3, %xmm4
827
movdqa %xmm0, %xmm1
828
movdqa %xmm4, %xmm5
829
pslld $14, %xmm2
830
pslld $14, %xmm6
831
psrld $4, %xmm1
832
psrld $4, %xmm5
833
pxor %xmm1, %xmm0
834
pxor %xmm5, %xmm4
835
psrld $11, %xmm1
836
psrld $11, %xmm5
837
pxor %xmm2, %xmm0
838
pxor %xmm6, %xmm4
839
pslld $11, %xmm2
840
pslld $11, %xmm6
841
pxor %xmm1, %xmm0
842
pxor %xmm5, %xmm4
843
pxor %xmm2, %xmm0
844
pxor %xmm6, %xmm4
845
846
paddd -16*16(%rcx), %xmm0
847
paddd -15*16(%rcx), %xmm4
848
849
movdqa %xmm3, %xmm2
850
movdqa %xmm7, %xmm6
851
psrld $10, %xmm3
852
psrld $10, %xmm7
853
movdqa %xmm3, %xmm1
854
movdqa %xmm7, %xmm5
855
pslld $13, %xmm2
856
pslld $13, %xmm6
857
psrld $7, %xmm1
858
psrld $7, %xmm5
859
860
paddd -7*16(%rcx), %xmm0
861
paddd -6*16(%rcx), %xmm4
862
863
pxor %xmm1, %xmm3
864
pxor %xmm5, %xmm7
865
psrld $2, %xmm1
866
psrld $2, %xmm5
867
pxor %xmm2, %xmm3
868
pxor %xmm6, %xmm7
869
pslld $2, %xmm2
870
pslld $2, %xmm6
871
pxor %xmm1, %xmm3
872
pxor %xmm5, %xmm7
873
pxor %xmm2, %xmm3
874
pxor %xmm6, %xmm7
875
876
paddd %xmm0, %xmm3
877
paddd %xmm4, %xmm7
878
movdqa %xmm3, (%rcx)
879
movdqa %xmm7, 16(%rcx)
880
addq $2*16, %rcx
881
cmpq %rcx, %rax
882
jne sha256_transform_4way_sse2_extend_loop
883
884
movdqu 0(%rdi), %xmm7
885
movdqu 16(%rdi), %xmm5
886
movdqu 32(%rdi), %xmm4
887
movdqu 48(%rdi), %xmm3
888
movdqu 64(%rdi), %xmm0
889
movdqu 80(%rdi), %xmm8
890
movdqu 96(%rdi), %xmm9
891
movdqu 112(%rdi), %xmm10
892
893
leaq sha256_4k(%rip), %rcx
894
xorq %rax, %rax
895
sha256_transform_4way_sse2_main_loop:
896
movdqa (%rsp, %rax), %xmm6
897
paddd (%rcx, %rax), %xmm6
898
paddd %xmm10, %xmm6
899
900
movdqa %xmm0, %xmm1
901
movdqa %xmm9, %xmm2
902
pandn %xmm2, %xmm1
903
904
movdqa %xmm2, %xmm10
905
movdqa %xmm8, %xmm2
906
movdqa %xmm2, %xmm9
907
908
pand %xmm0, %xmm2
909
pxor %xmm2, %xmm1
910
movdqa %xmm0, %xmm8
911
912
paddd %xmm1, %xmm6
913
914
movdqa %xmm0, %xmm1
915
psrld $6, %xmm0
916
movdqa %xmm0, %xmm2
917
pslld $7, %xmm1
918
psrld $5, %xmm2
919
pxor %xmm1, %xmm0
920
pxor %xmm2, %xmm0
921
pslld $14, %xmm1
922
psrld $14, %xmm2
923
pxor %xmm1, %xmm0
924
pxor %xmm2, %xmm0
925
pslld $5, %xmm1
926
pxor %xmm1, %xmm0
927
paddd %xmm0, %xmm6
928
929
movdqa %xmm3, %xmm0
930
paddd %xmm6, %xmm0
931
932
movdqa %xmm5, %xmm1
933
movdqa %xmm4, %xmm3
934
movdqa %xmm4, %xmm2
935
pand %xmm5, %xmm2
936
pand %xmm7, %xmm4
937
pand %xmm7, %xmm1
938
pxor %xmm4, %xmm1
939
movdqa %xmm5, %xmm4
940
movdqa %xmm7, %xmm5
941
pxor %xmm2, %xmm1
942
paddd %xmm1, %xmm6
943
944
movdqa %xmm7, %xmm2
945
psrld $2, %xmm7
946
movdqa %xmm7, %xmm1
947
pslld $10, %xmm2
948
psrld $11, %xmm1
949
pxor %xmm2, %xmm7
950
pxor %xmm1, %xmm7
951
pslld $9, %xmm2
952
psrld $9, %xmm1
953
pxor %xmm2, %xmm7
954
pxor %xmm1, %xmm7
955
pslld $11, %xmm2
956
pxor %xmm2, %xmm7
957
paddd %xmm6, %xmm7
958
959
addq $16, %rax
960
cmpq $16*64, %rax
961
jne sha256_transform_4way_sse2_main_loop
962
jmp sha256_transform_4way_finish
963
964
965
#if defined(USE_AVX)
966
.text
967
.p2align 6
968
sha256_transform_4way_core_avx:
969
leaq 256(%rsp), %rax
970
movdqa -2*16(%rax), %xmm3
971
movdqa -1*16(%rax), %xmm7
972
sha256_avx_extend_doubleround 0
973
sha256_avx_extend_doubleround 2
974
sha256_avx_extend_doubleround 4
975
sha256_avx_extend_doubleround 6
976
sha256_avx_extend_doubleround 8
977
sha256_avx_extend_doubleround 10
978
sha256_avx_extend_doubleround 12
979
sha256_avx_extend_doubleround 14
980
sha256_avx_extend_doubleround 16
981
sha256_avx_extend_doubleround 18
982
sha256_avx_extend_doubleround 20
983
sha256_avx_extend_doubleround 22
984
sha256_avx_extend_doubleround 24
985
sha256_avx_extend_doubleround 26
986
sha256_avx_extend_doubleround 28
987
sha256_avx_extend_doubleround 30
988
sha256_avx_extend_doubleround 32
989
sha256_avx_extend_doubleround 34
990
sha256_avx_extend_doubleround 36
991
sha256_avx_extend_doubleround 38
992
sha256_avx_extend_doubleround 40
993
sha256_avx_extend_doubleround 42
994
sha256_avx_extend_doubleround 44
995
sha256_avx_extend_doubleround 46
996
movdqu 0(%rdi), %xmm7
997
movdqu 16(%rdi), %xmm5
998
movdqu 32(%rdi), %xmm4
999
movdqu 48(%rdi), %xmm3
1000
movdqu 64(%rdi), %xmm0
1001
movdqu 80(%rdi), %xmm8
1002
movdqu 96(%rdi), %xmm9
1003
movdqu 112(%rdi), %xmm10
1004
movq %rsp, %rax
1005
leaq sha256_4k(%rip), %rcx
1006
sha256_avx_main_quadround 0
1007
sha256_avx_main_quadround 4
1008
sha256_avx_main_quadround 8
1009
sha256_avx_main_quadround 12
1010
sha256_avx_main_quadround 16
1011
sha256_avx_main_quadround 20
1012
sha256_avx_main_quadround 24
1013
sha256_avx_main_quadround 28
1014
sha256_avx_main_quadround 32
1015
sha256_avx_main_quadround 36
1016
sha256_avx_main_quadround 40
1017
sha256_avx_main_quadround 44
1018
sha256_avx_main_quadround 48
1019
sha256_avx_main_quadround 52
1020
sha256_avx_main_quadround 56
1021
sha256_avx_main_quadround 60
1022
jmp sha256_transform_4way_finish
1023
#endif /* USE_AVX */
1024
1025
1026
#if defined(USE_XOP)
1027
.text
1028
.p2align 6
1029
sha256_transform_4way_core_xop:
1030
leaq 256(%rsp), %rax
1031
movdqa -2*16(%rax), %xmm3
1032
movdqa -1*16(%rax), %xmm7
1033
sha256_xop_extend_doubleround 0
1034
sha256_xop_extend_doubleround 2
1035
sha256_xop_extend_doubleround 4
1036
sha256_xop_extend_doubleround 6
1037
sha256_xop_extend_doubleround 8
1038
sha256_xop_extend_doubleround 10
1039
sha256_xop_extend_doubleround 12
1040
sha256_xop_extend_doubleround 14
1041
sha256_xop_extend_doubleround 16
1042
sha256_xop_extend_doubleround 18
1043
sha256_xop_extend_doubleround 20
1044
sha256_xop_extend_doubleround 22
1045
sha256_xop_extend_doubleround 24
1046
sha256_xop_extend_doubleround 26
1047
sha256_xop_extend_doubleround 28
1048
sha256_xop_extend_doubleround 30
1049
sha256_xop_extend_doubleround 32
1050
sha256_xop_extend_doubleround 34
1051
sha256_xop_extend_doubleround 36
1052
sha256_xop_extend_doubleround 38
1053
sha256_xop_extend_doubleround 40
1054
sha256_xop_extend_doubleround 42
1055
sha256_xop_extend_doubleround 44
1056
sha256_xop_extend_doubleround 46
1057
movdqu 0(%rdi), %xmm7
1058
movdqu 16(%rdi), %xmm5
1059
movdqu 32(%rdi), %xmm4
1060
movdqu 48(%rdi), %xmm3
1061
movdqu 64(%rdi), %xmm0
1062
movdqu 80(%rdi), %xmm8
1063
movdqu 96(%rdi), %xmm9
1064
movdqu 112(%rdi), %xmm10
1065
movq %rsp, %rax
1066
leaq sha256_4k(%rip), %rcx
1067
sha256_xop_main_quadround 0
1068
sha256_xop_main_quadround 4
1069
sha256_xop_main_quadround 8
1070
sha256_xop_main_quadround 12
1071
sha256_xop_main_quadround 16
1072
sha256_xop_main_quadround 20
1073
sha256_xop_main_quadround 24
1074
sha256_xop_main_quadround 28
1075
sha256_xop_main_quadround 32
1076
sha256_xop_main_quadround 36
1077
sha256_xop_main_quadround 40
1078
sha256_xop_main_quadround 44
1079
sha256_xop_main_quadround 48
1080
sha256_xop_main_quadround 52
1081
sha256_xop_main_quadround 56
1082
sha256_xop_main_quadround 60
1083
jmp sha256_transform_4way_finish
1084
#endif /* USE_XOP */
1085
1086
1087
.data
1088
.p2align 3
1089
sha256_transform_4way_core_addr:
1090
.quad 0x0
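/* Run-time dispatch slot: filled in elsewhere (outside this excerpt) with the
   address of the SSE2, AVX or XOP core that matches the host CPU; the entry
   point below jumps through it. */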
1091
1092
.macro p2bswap_rsi_rsp i
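/* Byte-swap two 16-byte rows from the input at %rsi into the W[] area at
   %rsp: pshuflw/pshufhw swap the 16-bit halves of every dword, then the
   psrlw/psllw/pxor sequence swaps the bytes inside each half — a per-dword
   bswap without needing SSSE3 pshufb. */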
1093
movdqu \i*16(%rsi), %xmm0
1094
movdqu (\i+1)*16(%rsi), %xmm2
1095
pshuflw $0xb1, %xmm0, %xmm0
1096
pshuflw $0xb1, %xmm2, %xmm2
1097
pshufhw $0xb1, %xmm0, %xmm0
1098
pshufhw $0xb1, %xmm2, %xmm2
1099
movdqa %xmm0, %xmm1
1100
movdqa %xmm2, %xmm3
1101
psrlw $8, %xmm1
1102
psrlw $8, %xmm3
1103
psllw $8, %xmm0
1104
psllw $8, %xmm2
1105
pxor %xmm1, %xmm0
1106
pxor %xmm3, %xmm2
1107
movdqa %xmm0, \i*16(%rsp)
1108
movdqa %xmm2, (\i+1)*16(%rsp)
1109
.endm
1110
1111
.text
1112
.p2align 6
1113
.globl sha256_transform_4way
1114
.globl _sha256_transform_4way
1115
sha256_transform_4way:
1116
_sha256_transform_4way:
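/* Public entry point.  Going by the C glue in this project, the arguments are
   roughly (uint32_t *state, const uint32_t *block, int swap): %rdi = 4-way
   interleaved state (8*16 bytes), %rsi = 16*16 bytes of input, and a nonzero
   %rdx requests byte-swapping of the input.  The Win64/Cygwin prologue saves
   the callee-saved xmm6-xmm11 and remaps the Microsoft argument registers to
   their SysV counterparts. */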
1117
#if defined(_WIN64) || defined(__CYGWIN__)
1118
pushq %rdi
1119
subq $96, %rsp
1120
movdqa %xmm6, 0(%rsp)
1121
movdqa %xmm7, 16(%rsp)
1122
movdqa %xmm8, 32(%rsp)
1123
movdqa %xmm9, 48(%rsp)
1124
movdqa %xmm10, 64(%rsp)
1125
movdqa %xmm11, 80(%rsp)
1126
pushq %rsi
1127
movq %rcx, %rdi
1128
movq %rdx, %rsi
1129
movq %r8, %rdx
1130
#endif
1131
movq %rsp, %r8
1132
subq $1032, %rsp
1133
andq $-128, %rsp
1134
1135
testq %rdx, %rdx
1136
jnz sha256_transform_4way_swap
1137
1138
movdqu 0*16(%rsi), %xmm0
1139
movdqu 1*16(%rsi), %xmm1
1140
movdqu 2*16(%rsi), %xmm2
1141
movdqu 3*16(%rsi), %xmm3
1142
movdqu 4*16(%rsi), %xmm4
1143
movdqu 5*16(%rsi), %xmm5
1144
movdqu 6*16(%rsi), %xmm6
1145
movdqu 7*16(%rsi), %xmm7
1146
movdqa %xmm0, 0*16(%rsp)
1147
movdqa %xmm1, 1*16(%rsp)
1148
movdqa %xmm2, 2*16(%rsp)
1149
movdqa %xmm3, 3*16(%rsp)
1150
movdqa %xmm4, 4*16(%rsp)
1151
movdqa %xmm5, 5*16(%rsp)
1152
movdqa %xmm6, 6*16(%rsp)
1153
movdqa %xmm7, 7*16(%rsp)
1154
movdqu 8*16(%rsi), %xmm0
1155
movdqu 9*16(%rsi), %xmm1
1156
movdqu 10*16(%rsi), %xmm2
1157
movdqu 11*16(%rsi), %xmm3
1158
movdqu 12*16(%rsi), %xmm4
1159
movdqu 13*16(%rsi), %xmm5
1160
movdqu 14*16(%rsi), %xmm6
1161
movdqu 15*16(%rsi), %xmm7
1162
movdqa %xmm0, 8*16(%rsp)
1163
movdqa %xmm1, 9*16(%rsp)
1164
movdqa %xmm2, 10*16(%rsp)
1165
movdqa %xmm3, 11*16(%rsp)
1166
movdqa %xmm4, 12*16(%rsp)
1167
movdqa %xmm5, 13*16(%rsp)
1168
movdqa %xmm6, 14*16(%rsp)
1169
movdqa %xmm7, 15*16(%rsp)
1170
jmp *sha256_transform_4way_core_addr(%rip)
1171
1172
.p2align 6
1173
sha256_transform_4way_swap:
1174
p2bswap_rsi_rsp 0
1175
p2bswap_rsi_rsp 2
1176
p2bswap_rsi_rsp 4
1177
p2bswap_rsi_rsp 6
1178
p2bswap_rsi_rsp 8
1179
p2bswap_rsi_rsp 10
1180
p2bswap_rsi_rsp 12
1181
p2bswap_rsi_rsp 14
1182
jmp *sha256_transform_4way_core_addr(%rip)
1183
1184
.p2align 6
1185
sha256_transform_4way_finish:
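/* Feed-forward: add the previous state to the working variables (a..h are in
   %xmm7, %xmm5, %xmm4, %xmm3, %xmm0, %xmm8, %xmm9, %xmm10 at this point) and
   store the updated state back to %rdi. */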
1186
movdqu 0(%rdi), %xmm2
1187
movdqu 16(%rdi), %xmm6
1188
movdqu 32(%rdi), %xmm11
1189
movdqu 48(%rdi), %xmm1
1190
paddd %xmm2, %xmm7
1191
paddd %xmm6, %xmm5
1192
paddd %xmm11, %xmm4
1193
paddd %xmm1, %xmm3
1194
movdqu 64(%rdi), %xmm2
1195
movdqu 80(%rdi), %xmm6
1196
movdqu 96(%rdi), %xmm11
1197
movdqu 112(%rdi), %xmm1
1198
paddd %xmm2, %xmm0
1199
paddd %xmm6, %xmm8
1200
paddd %xmm11, %xmm9
1201
paddd %xmm1, %xmm10
1202
1203
movdqu %xmm7, 0(%rdi)
1204
movdqu %xmm5, 16(%rdi)
1205
movdqu %xmm4, 32(%rdi)
1206
movdqu %xmm3, 48(%rdi)
1207
movdqu %xmm0, 64(%rdi)
1208
movdqu %xmm8, 80(%rdi)
1209
movdqu %xmm9, 96(%rdi)
1210
movdqu %xmm10, 112(%rdi)
1211
1212
movq %r8, %rsp
1213
#if defined(_WIN64) || defined(__CYGWIN__)
1214
popq %rsi
1215
movdqa 0(%rsp), %xmm6
1216
movdqa 16(%rsp), %xmm7
1217
movdqa 32(%rsp), %xmm8
1218
movdqa 48(%rsp), %xmm9
1219
movdqa 64(%rsp), %xmm10
1220
movdqa 80(%rsp), %xmm11
1221
addq $96, %rsp
1222
popq %rdi
1223
#endif
1224
ret
1225
1226
1227
#ifdef USE_AVX2
1228
1229
.text
1230
.p2align 6
1231
sha256_transform_8way_core_avx2:
1232
leaq 8*64(%rsp), %rax
1233
vmovdqa -2*32(%rax), %ymm3
1234
vmovdqa -1*32(%rax), %ymm7
1235
sha256_avx2_extend_doubleround 0
1236
sha256_avx2_extend_doubleround 2
1237
sha256_avx2_extend_doubleround 4
1238
sha256_avx2_extend_doubleround 6
1239
sha256_avx2_extend_doubleround 8
1240
sha256_avx2_extend_doubleround 10
1241
sha256_avx2_extend_doubleround 12
1242
sha256_avx2_extend_doubleround 14
1243
sha256_avx2_extend_doubleround 16
1244
sha256_avx2_extend_doubleround 18
1245
sha256_avx2_extend_doubleround 20
1246
sha256_avx2_extend_doubleround 22
1247
sha256_avx2_extend_doubleround 24
1248
sha256_avx2_extend_doubleround 26
1249
sha256_avx2_extend_doubleround 28
1250
sha256_avx2_extend_doubleround 30
1251
sha256_avx2_extend_doubleround 32
1252
sha256_avx2_extend_doubleround 34
1253
sha256_avx2_extend_doubleround 36
1254
sha256_avx2_extend_doubleround 38
1255
sha256_avx2_extend_doubleround 40
1256
sha256_avx2_extend_doubleround 42
1257
sha256_avx2_extend_doubleround 44
1258
sha256_avx2_extend_doubleround 46
1259
vmovdqu 0*32(%rdi), %ymm7
1260
vmovdqu 1*32(%rdi), %ymm5
1261
vmovdqu 2*32(%rdi), %ymm4
1262
vmovdqu 3*32(%rdi), %ymm3
1263
vmovdqu 4*32(%rdi), %ymm0
1264
vmovdqu 5*32(%rdi), %ymm8
1265
vmovdqu 6*32(%rdi), %ymm9
1266
vmovdqu 7*32(%rdi), %ymm10
1267
movq %rsp, %rax
1268
leaq sha256_8k(%rip), %rcx
1269
sha256_avx2_main_quadround 0
1270
sha256_avx2_main_quadround 4
1271
sha256_avx2_main_quadround 8
1272
sha256_avx2_main_quadround 12
1273
sha256_avx2_main_quadround 16
1274
sha256_avx2_main_quadround 20
1275
sha256_avx2_main_quadround 24
1276
sha256_avx2_main_quadround 28
1277
sha256_avx2_main_quadround 32
1278
sha256_avx2_main_quadround 36
1279
sha256_avx2_main_quadround 40
1280
sha256_avx2_main_quadround 44
1281
sha256_avx2_main_quadround 48
1282
sha256_avx2_main_quadround 52
1283
sha256_avx2_main_quadround 56
1284
sha256_avx2_main_quadround 60
1285
jmp sha256_transform_8way_finish
1286
1287
.macro p2bswap_avx2_rsi_rsp i
1288
vmovdqu \i*32(%rsi), %ymm0
1289
vmovdqu (\i+1)*32(%rsi), %ymm2
1290
vpshuflw $0xb1, %ymm0, %ymm0
1291
vpshuflw $0xb1, %ymm2, %ymm2
1292
vpshufhw $0xb1, %ymm0, %ymm0
1293
vpshufhw $0xb1, %ymm2, %ymm2
1294
vpsrlw $8, %ymm0, %ymm1
1295
vpsrlw $8, %ymm2, %ymm3
1296
vpsllw $8, %ymm0, %ymm0
1297
vpsllw $8, %ymm2, %ymm2
1298
vpxor %ymm1, %ymm0, %ymm0
1299
vpxor %ymm3, %ymm2, %ymm2
1300
vmovdqa %ymm0, \i*32(%rsp)
1301
vmovdqa %ymm2, (\i+1)*32(%rsp)
1302
.endm
1303
1304
.text
1305
.p2align 6
1306
.globl sha256_transform_8way
1307
.globl _sha256_transform_8way
1308
sha256_transform_8way:
1309
_sha256_transform_8way:
1310
#if defined(_WIN64) || defined(__CYGWIN__)
1311
pushq %rdi
1312
subq $96, %rsp
1313
vmovdqa %xmm6, 0(%rsp)
1314
vmovdqa %xmm7, 16(%rsp)
1315
vmovdqa %xmm8, 32(%rsp)
1316
vmovdqa %xmm9, 48(%rsp)
1317
vmovdqa %xmm10, 64(%rsp)
1318
vmovdqa %xmm11, 80(%rsp)
1319
pushq %rsi
1320
movq %rcx, %rdi
1321
movq %rdx, %rsi
1322
movq %r8, %rdx
1323
#endif
1324
movq %rsp, %r8
1325
subq $64*32, %rsp
1326
andq $-128, %rsp
1327
1328
testq %rdx, %rdx
1329
jnz sha256_transform_8way_swap
1330
1331
vmovdqu 0*32(%rsi), %ymm0
1332
vmovdqu 1*32(%rsi), %ymm1
1333
vmovdqu 2*32(%rsi), %ymm2
1334
vmovdqu 3*32(%rsi), %ymm3
1335
vmovdqu 4*32(%rsi), %ymm4
1336
vmovdqu 5*32(%rsi), %ymm5
1337
vmovdqu 6*32(%rsi), %ymm6
1338
vmovdqu 7*32(%rsi), %ymm7
1339
vmovdqa %ymm0, 0*32(%rsp)
1340
vmovdqa %ymm1, 1*32(%rsp)
1341
vmovdqa %ymm2, 2*32(%rsp)
1342
vmovdqa %ymm3, 3*32(%rsp)
1343
vmovdqa %ymm4, 4*32(%rsp)
1344
vmovdqa %ymm5, 5*32(%rsp)
1345
vmovdqa %ymm6, 6*32(%rsp)
1346
vmovdqa %ymm7, 7*32(%rsp)
1347
vmovdqu 8*32(%rsi), %ymm0
1348
vmovdqu 9*32(%rsi), %ymm1
1349
vmovdqu 10*32(%rsi), %ymm2
1350
vmovdqu 11*32(%rsi), %ymm3
1351
vmovdqu 12*32(%rsi), %ymm4
1352
vmovdqu 13*32(%rsi), %ymm5
1353
vmovdqu 14*32(%rsi), %ymm6
1354
vmovdqu 15*32(%rsi), %ymm7
1355
vmovdqa %ymm0, 8*32(%rsp)
1356
vmovdqa %ymm1, 9*32(%rsp)
1357
vmovdqa %ymm2, 10*32(%rsp)
1358
vmovdqa %ymm3, 11*32(%rsp)
1359
vmovdqa %ymm4, 12*32(%rsp)
1360
vmovdqa %ymm5, 13*32(%rsp)
1361
vmovdqa %ymm6, 14*32(%rsp)
1362
vmovdqa %ymm7, 15*32(%rsp)
1363
jmp sha256_transform_8way_core_avx2
1364
1365
.p2align 6
1366
sha256_transform_8way_swap:
1367
p2bswap_avx2_rsi_rsp 0
1368
p2bswap_avx2_rsi_rsp 2
1369
p2bswap_avx2_rsi_rsp 4
1370
p2bswap_avx2_rsi_rsp 6
1371
p2bswap_avx2_rsi_rsp 8
1372
p2bswap_avx2_rsi_rsp 10
1373
p2bswap_avx2_rsi_rsp 12
1374
p2bswap_avx2_rsi_rsp 14
1375
jmp sha256_transform_8way_core_avx2
1376
1377
.p2align 6
1378
sha256_transform_8way_finish:
1379
vmovdqu 0*32(%rdi), %ymm2
1380
vmovdqu 1*32(%rdi), %ymm6
1381
vmovdqu 2*32(%rdi), %ymm11
1382
vmovdqu 3*32(%rdi), %ymm1
1383
vpaddd %ymm2, %ymm7, %ymm7
1384
vpaddd %ymm6, %ymm5, %ymm5
1385
vpaddd %ymm11, %ymm4, %ymm4
1386
vpaddd %ymm1, %ymm3, %ymm3
1387
vmovdqu 4*32(%rdi), %ymm2
1388
vmovdqu 5*32(%rdi), %ymm6
1389
vmovdqu 6*32(%rdi), %ymm11
1390
vmovdqu 7*32(%rdi), %ymm1
1391
vpaddd %ymm2, %ymm0, %ymm0
1392
vpaddd %ymm6, %ymm8, %ymm8
1393
vpaddd %ymm11, %ymm9, %ymm9
1394
vpaddd %ymm1, %ymm10, %ymm10
1395
1396
vmovdqu %ymm7, 0*32(%rdi)
1397
vmovdqu %ymm5, 1*32(%rdi)
1398
vmovdqu %ymm4, 2*32(%rdi)
1399
vmovdqu %ymm3, 3*32(%rdi)
1400
vmovdqu %ymm0, 4*32(%rdi)
1401
vmovdqu %ymm8, 5*32(%rdi)
1402
vmovdqu %ymm9, 6*32(%rdi)
1403
vmovdqu %ymm10, 7*32(%rdi)
1404
1405
movq %r8, %rsp
1406
#if defined(_WIN64) || defined(__CYGWIN__)
1407
popq %rsi
1408
vmovdqa 0(%rsp), %xmm6
1409
vmovdqa 16(%rsp), %xmm7
1410
vmovdqa 32(%rsp), %xmm8
1411
vmovdqa 48(%rsp), %xmm9
1412
vmovdqa 64(%rsp), %xmm10
1413
vmovdqa 80(%rsp), %xmm11
1414
addq $96, %rsp
1415
popq %rdi
1416
#endif
1417
ret
1418
1419
#endif /* USE_AVX2 */
1420
1421
1422
.data
1423
.p2align 3
1424
sha256d_ms_4way_addr:
1425
.quad 0x0
1426
1427
.text
1428
.p2align 6
1429
.globl sha256d_ms_4way
1430
.globl _sha256d_ms_4way
1431
sha256d_ms_4way:
1432
_sha256d_ms_4way:
1433
jmp *sha256d_ms_4way_addr(%rip)
1434
1435
1436
.p2align 6
1437
sha256d_ms_4way_sse2:
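/* The first schedule pass starts deep into W[] because the early words do
   not depend on the nonce; the caller-precomputed schedule words this call
   is about to overwrite (they appear to be W[18..20], W[22..24], W[30..31])
   are stashed on the stack here and written back to data[] once the first
   hash's rounds are done. */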
1438
#if defined(_WIN64) || defined(__CYGWIN__)
1439
pushq %rdi
1440
subq $32, %rsp
1441
movdqa %xmm6, 0(%rsp)
1442
movdqa %xmm7, 16(%rsp)
1443
pushq %rsi
1444
movq %rcx, %rdi
1445
movq %rdx, %rsi
1446
movq %r8, %rdx
1447
movq %r9, %rcx
1448
#endif
1449
subq $8+67*16, %rsp
1450
1451
leaq 256(%rsi), %rax
1452
1453
sha256d_ms_4way_sse2_extend_loop1:
1454
movdqa 3*16(%rsi), %xmm0
1455
movdqa 2*16(%rax), %xmm3
1456
movdqa 3*16(%rax), %xmm7
1457
movdqa %xmm3, 5*16(%rsp)
1458
movdqa %xmm7, 6*16(%rsp)
1459
movdqa %xmm0, %xmm2
1460
paddd %xmm0, %xmm7
1461
psrld $3, %xmm0
1462
movdqa %xmm0, %xmm1
1463
pslld $14, %xmm2
1464
psrld $4, %xmm1
1465
pxor %xmm1, %xmm0
1466
pxor %xmm2, %xmm0
1467
psrld $11, %xmm1
1468
pslld $11, %xmm2
1469
pxor %xmm1, %xmm0
1470
pxor %xmm2, %xmm0
1471
paddd %xmm0, %xmm3
1472
movdqa %xmm3, 2*16(%rax)
1473
movdqa %xmm7, 3*16(%rax)
1474
1475
movdqa 4*16(%rax), %xmm0
1476
movdqa %xmm0, 7*16(%rsp)
1477
movdqa %xmm3, %xmm2
1478
movdqa %xmm7, %xmm6
1479
psrld $10, %xmm3
1480
psrld $10, %xmm7
1481
movdqa %xmm3, %xmm1
1482
movdqa %xmm7, %xmm5
1483
pslld $13, %xmm2
1484
pslld $13, %xmm6
1485
psrld $7, %xmm1
1486
psrld $7, %xmm5
1487
pxor %xmm1, %xmm3
1488
pxor %xmm5, %xmm7
1489
psrld $2, %xmm1
1490
psrld $2, %xmm5
1491
pxor %xmm2, %xmm3
1492
pxor %xmm6, %xmm7
1493
pslld $2, %xmm2
1494
pslld $2, %xmm6
1495
pxor %xmm1, %xmm3
1496
pxor %xmm5, %xmm7
1497
pxor %xmm2, %xmm3
1498
pxor %xmm6, %xmm7
1499
paddd %xmm0, %xmm3
1500
movdqa %xmm3, 4*16(%rax)
1501
movdqa %xmm7, 5*16(%rax)
1502
1503
movdqa 6*16(%rax), %xmm0
1504
movdqa 7*16(%rax), %xmm4
1505
movdqa %xmm0, 9*16(%rsp)
1506
movdqa %xmm4, 10*16(%rsp)
1507
movdqa %xmm3, %xmm2
1508
movdqa %xmm7, %xmm6
1509
psrld $10, %xmm3
1510
psrld $10, %xmm7
1511
movdqa %xmm3, %xmm1
1512
movdqa %xmm7, %xmm5
1513
pslld $13, %xmm2
1514
pslld $13, %xmm6
1515
psrld $7, %xmm1
1516
psrld $7, %xmm5
1517
pxor %xmm1, %xmm3
1518
pxor %xmm5, %xmm7
1519
psrld $2, %xmm1
1520
psrld $2, %xmm5
1521
pxor %xmm2, %xmm3
1522
pxor %xmm6, %xmm7
1523
pslld $2, %xmm2
1524
pslld $2, %xmm6
1525
pxor %xmm1, %xmm3
1526
pxor %xmm5, %xmm7
1527
pxor %xmm2, %xmm3
1528
pxor %xmm6, %xmm7
1529
paddd %xmm0, %xmm3
1530
paddd %xmm4, %xmm7
1531
movdqa %xmm3, 6*16(%rax)
1532
movdqa %xmm7, 7*16(%rax)
1533
1534
movdqa 8*16(%rax), %xmm0
1535
movdqa 2*16(%rax), %xmm4
1536
movdqa %xmm0, 11*16(%rsp)
1537
movdqa %xmm3, %xmm2
1538
movdqa %xmm7, %xmm6
1539
psrld $10, %xmm3
1540
psrld $10, %xmm7
1541
movdqa %xmm3, %xmm1
1542
movdqa %xmm7, %xmm5
1543
pslld $13, %xmm2
1544
pslld $13, %xmm6
1545
psrld $7, %xmm1
1546
psrld $7, %xmm5
1547
pxor %xmm1, %xmm3
1548
pxor %xmm5, %xmm7
1549
psrld $2, %xmm1
1550
psrld $2, %xmm5
1551
pxor %xmm2, %xmm3
1552
pxor %xmm6, %xmm7
1553
pslld $2, %xmm2
1554
pslld $2, %xmm6
1555
pxor %xmm1, %xmm3
1556
pxor %xmm5, %xmm7
1557
pxor %xmm2, %xmm3
1558
pxor %xmm6, %xmm7
1559
paddd %xmm0, %xmm3
1560
paddd %xmm4, %xmm7
1561
movdqa %xmm3, 8*16(%rax)
1562
movdqa %xmm7, 9*16(%rax)
1563
1564
movdqa %xmm3, %xmm2
1565
movdqa %xmm7, %xmm6
1566
psrld $10, %xmm3
1567
psrld $10, %xmm7
1568
movdqa %xmm3, %xmm1
1569
movdqa %xmm7, %xmm5
1570
pslld $13, %xmm2
1571
pslld $13, %xmm6
1572
psrld $7, %xmm1
1573
psrld $7, %xmm5
1574
pxor %xmm1, %xmm3
1575
pxor %xmm5, %xmm7
1576
psrld $2, %xmm1
1577
psrld $2, %xmm5
1578
pxor %xmm2, %xmm3
1579
pxor %xmm6, %xmm7
1580
pslld $2, %xmm2
1581
pslld $2, %xmm6
1582
pxor %xmm1, %xmm3
1583
pxor %xmm5, %xmm7
1584
pxor %xmm2, %xmm3
1585
pxor %xmm6, %xmm7
1586
paddd 3*16(%rax), %xmm3
1587
paddd 4*16(%rax), %xmm7
1588
movdqa %xmm3, 10*16(%rax)
1589
movdqa %xmm7, 11*16(%rax)
1590
1591
movdqa %xmm3, %xmm2
1592
movdqa %xmm7, %xmm6
1593
psrld $10, %xmm3
1594
psrld $10, %xmm7
1595
movdqa %xmm3, %xmm1
1596
movdqa %xmm7, %xmm5
1597
pslld $13, %xmm2
1598
pslld $13, %xmm6
1599
psrld $7, %xmm1
1600
psrld $7, %xmm5
1601
pxor %xmm1, %xmm3
1602
pxor %xmm5, %xmm7
1603
psrld $2, %xmm1
1604
psrld $2, %xmm5
1605
pxor %xmm2, %xmm3
1606
pxor %xmm6, %xmm7
1607
pslld $2, %xmm2
1608
pslld $2, %xmm6
1609
pxor %xmm1, %xmm3
1610
pxor %xmm5, %xmm7
1611
pxor %xmm2, %xmm3
1612
pxor %xmm6, %xmm7
1613
paddd 5*16(%rax), %xmm3
1614
paddd 6*16(%rax), %xmm7
1615
movdqa %xmm3, 12*16(%rax)
1616
movdqa %xmm7, 13*16(%rax)
1617
1618
movdqa 14*16(%rax), %xmm0
1619
movdqa 15*16(%rax), %xmm4
1620
movdqa %xmm0, 17*16(%rsp)
1621
movdqa %xmm4, 18*16(%rsp)
1622
movdqa %xmm3, %xmm2
1623
movdqa %xmm7, %xmm6
1624
psrld $10, %xmm3
1625
psrld $10, %xmm7
1626
movdqa %xmm3, %xmm1
1627
movdqa %xmm7, %xmm5
1628
paddd 7*16(%rax), %xmm0
1629
paddd 8*16(%rax), %xmm4
1630
pslld $13, %xmm2
1631
pslld $13, %xmm6
1632
psrld $7, %xmm1
1633
psrld $7, %xmm5
1634
pxor %xmm1, %xmm3
1635
pxor %xmm5, %xmm7
1636
psrld $2, %xmm1
1637
psrld $2, %xmm5
1638
pxor %xmm2, %xmm3
1639
pxor %xmm6, %xmm7
1640
pslld $2, %xmm2
1641
pslld $2, %xmm6
1642
pxor %xmm1, %xmm3
1643
pxor %xmm5, %xmm7
1644
pxor %xmm2, %xmm3
1645
pxor %xmm6, %xmm7
1646
paddd %xmm0, %xmm3
1647
paddd %xmm4, %xmm7
1648
movdqa %xmm3, 14*16(%rax)
1649
movdqa %xmm7, 15*16(%rax)
1650
1651
sha256d_ms_4way_sse2_extend_loop2:
1652
sha256_sse2_extend_doubleround 16
1653
sha256_sse2_extend_doubleround 18
1654
sha256_sse2_extend_doubleround 20
1655
sha256_sse2_extend_doubleround 22
1656
sha256_sse2_extend_doubleround 24
1657
sha256_sse2_extend_doubleround 26
1658
sha256_sse2_extend_doubleround 28
1659
sha256_sse2_extend_doubleround 30
1660
sha256_sse2_extend_doubleround 32
1661
sha256_sse2_extend_doubleround 34
1662
sha256_sse2_extend_doubleround 36
1663
sha256_sse2_extend_doubleround 38
1664
sha256_sse2_extend_doubleround 40
1665
sha256_sse2_extend_doubleround 42
1666
jz sha256d_ms_4way_sse2_extend_coda2
1667
sha256_sse2_extend_doubleround 44
1668
sha256_sse2_extend_doubleround 46
1669
1670
movdqa 0(%rcx), %xmm3
1671
movdqa 16(%rcx), %xmm0
1672
movdqa 32(%rcx), %xmm1
1673
movdqa 48(%rcx), %xmm2
1674
movdqa 64(%rcx), %xmm6
1675
movdqa 80(%rcx), %xmm7
1676
movdqa 96(%rcx), %xmm5
1677
movdqa 112(%rcx), %xmm4
1678
movdqa %xmm1, 0(%rsp)
1679
movdqa %xmm2, 16(%rsp)
1680
movdqa %xmm6, 32(%rsp)
1681
1682
movq %rsi, %rax
1683
leaq sha256_4k(%rip), %rcx
1684
jmp sha256d_ms_4way_sse2_main_loop1
1685
1686
sha256d_ms_4way_sse2_main_loop2:
1687
sha256_sse2_main_round 0
1688
sha256_sse2_main_round 1
1689
sha256_sse2_main_round 2
1690
sha256d_ms_4way_sse2_main_loop1:
1691
sha256_sse2_main_round 3
1692
sha256_sse2_main_quadround 4
1693
sha256_sse2_main_quadround 8
1694
sha256_sse2_main_quadround 12
1695
sha256_sse2_main_quadround 16
1696
sha256_sse2_main_quadround 20
1697
sha256_sse2_main_quadround 24
1698
sha256_sse2_main_quadround 28
1699
sha256_sse2_main_quadround 32
1700
sha256_sse2_main_quadround 36
1701
sha256_sse2_main_quadround 40
1702
sha256_sse2_main_quadround 44
1703
sha256_sse2_main_quadround 48
1704
sha256_sse2_main_quadround 52
1705
sha256_sse2_main_round 56
1706
jz sha256d_ms_4way_sse2_finish
1707
sha256_sse2_main_round 57
1708
sha256_sse2_main_round 58
1709
sha256_sse2_main_round 59
1710
sha256_sse2_main_quadround 60
1711
1712
movdqa 5*16(%rsp), %xmm1
1713
movdqa 6*16(%rsp), %xmm2
1714
movdqa 7*16(%rsp), %xmm6
1715
movdqa %xmm1, 18*16(%rsi)
1716
movdqa %xmm2, 19*16(%rsi)
1717
movdqa %xmm6, 20*16(%rsi)
1718
movdqa 9*16(%rsp), %xmm1
1719
movdqa 10*16(%rsp), %xmm2
1720
movdqa 11*16(%rsp), %xmm6
1721
movdqa %xmm1, 22*16(%rsi)
1722
movdqa %xmm2, 23*16(%rsi)
1723
movdqa %xmm6, 24*16(%rsi)
1724
movdqa 17*16(%rsp), %xmm1
1725
movdqa 18*16(%rsp), %xmm2
1726
movdqa %xmm1, 30*16(%rsi)
1727
movdqa %xmm2, 31*16(%rsi)
1728
1729
movdqa 0(%rsp), %xmm1
1730
movdqa 16(%rsp), %xmm2
1731
movdqa 32(%rsp), %xmm6
1732
paddd 0(%rdx), %xmm7
1733
paddd 16(%rdx), %xmm5
1734
paddd 32(%rdx), %xmm4
1735
paddd 48(%rdx), %xmm3
1736
paddd 64(%rdx), %xmm0
1737
paddd 80(%rdx), %xmm1
1738
paddd 96(%rdx), %xmm2
1739
paddd 112(%rdx), %xmm6
1740
1741
movdqa %xmm7, 48+0(%rsp)
1742
movdqa %xmm5, 48+16(%rsp)
1743
movdqa %xmm4, 48+32(%rsp)
1744
movdqa %xmm3, 48+48(%rsp)
1745
movdqa %xmm0, 48+64(%rsp)
1746
movdqa %xmm1, 48+80(%rsp)
1747
movdqa %xmm2, 48+96(%rsp)
1748
movdqa %xmm6, 48+112(%rsp)
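/* The feed-forward result just stored at 48(%rsp) becomes W[0..7] of the
   second SHA-256; the block below fills in its fixed padding: the 0x80000000
   terminator in W[8], zero words, and the 256-bit message length in W[15]. */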
1749
1750
pxor %xmm0, %xmm0
1751
movq $0x8000000000000100, %rax
1752
movd %rax, %xmm1
1753
pshufd $0x55, %xmm1, %xmm2
1754
pshufd $0x00, %xmm1, %xmm1
1755
movdqa %xmm2, 48+128(%rsp)
1756
movdqa %xmm0, 48+144(%rsp)
1757
movdqa %xmm0, 48+160(%rsp)
1758
movdqa %xmm0, 48+176(%rsp)
1759
movdqa %xmm0, 48+192(%rsp)
1760
movdqa %xmm0, 48+208(%rsp)
1761
movdqa %xmm0, 48+224(%rsp)
1762
movdqa %xmm1, 48+240(%rsp)
1763
1764
leaq 19*16(%rsp), %rax
1765
cmpq %rax, %rax
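/* The cmpq above forces ZF = 1 before re-entering the shared extend/main
   loops for the second hash; the vector instructions in between do not touch
   EFLAGS, so the later jz branches take their "coda"/"finish" exits on this
   pass (on the first pass ZF is still clear from the earlier subq). */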
1766
1767
movdqa -15*16(%rax), %xmm0
1768
movdqa -14*16(%rax), %xmm4
1769
movdqa %xmm0, %xmm2
1770
movdqa %xmm4, %xmm6
1771
psrld $3, %xmm0
1772
psrld $3, %xmm4
1773
movdqa %xmm0, %xmm1
1774
movdqa %xmm4, %xmm5
1775
pslld $14, %xmm2
1776
pslld $14, %xmm6
1777
psrld $4, %xmm1
1778
psrld $4, %xmm5
1779
pxor %xmm1, %xmm0
1780
pxor %xmm5, %xmm4
1781
psrld $11, %xmm1
1782
psrld $11, %xmm5
1783
pxor %xmm2, %xmm0
1784
pxor %xmm6, %xmm4
1785
pslld $11, %xmm2
1786
pslld $11, %xmm6
1787
pxor %xmm1, %xmm0
1788
pxor %xmm5, %xmm4
1789
pxor %xmm2, %xmm0
1790
pxor %xmm6, %xmm4
1791
paddd -16*16(%rax), %xmm0
1792
paddd -15*16(%rax), %xmm4
1793
paddd sha256d_4preext2_17(%rip), %xmm4
1794
movdqa %xmm0, %xmm3
1795
movdqa %xmm4, %xmm7
1796
movdqa %xmm3, 0*16(%rax)
1797
movdqa %xmm7, 1*16(%rax)
1798
1799
sha256_sse2_extend_doubleround 2
1800
sha256_sse2_extend_doubleround 4
1801
1802
movdqa -9*16(%rax), %xmm0
1803
movdqa sha256d_4preext2_23(%rip), %xmm4
1804
movdqa %xmm0, %xmm2
1805
psrld $3, %xmm0
1806
movdqa %xmm0, %xmm1
1807
pslld $14, %xmm2
1808
psrld $4, %xmm1
1809
pxor %xmm1, %xmm0
1810
pxor %xmm2, %xmm0
1811
psrld $11, %xmm1
1812
pslld $11, %xmm2
1813
pxor %xmm1, %xmm0
1814
pxor %xmm2, %xmm0
1815
paddd -10*16(%rax), %xmm0
1816
paddd -9*16(%rax), %xmm4
1817
movdqa %xmm3, %xmm2
1818
movdqa %xmm7, %xmm6
1819
psrld $10, %xmm3
1820
psrld $10, %xmm7
1821
movdqa %xmm3, %xmm1
1822
movdqa %xmm7, %xmm5
1823
paddd -1*16(%rax), %xmm0
1824
pslld $13, %xmm2
1825
pslld $13, %xmm6
1826
psrld $7, %xmm1
1827
psrld $7, %xmm5
1828
paddd 0*16(%rax), %xmm4
1829
pxor %xmm1, %xmm3
1830
pxor %xmm5, %xmm7
1831
psrld $2, %xmm1
1832
psrld $2, %xmm5
1833
pxor %xmm2, %xmm3
1834
pxor %xmm6, %xmm7
1835
pslld $2, %xmm2
1836
pslld $2, %xmm6
1837
pxor %xmm1, %xmm3
1838
pxor %xmm5, %xmm7
1839
pxor %xmm2, %xmm3
1840
pxor %xmm6, %xmm7
1841
paddd %xmm0, %xmm3
1842
paddd %xmm4, %xmm7
1843
movdqa %xmm3, 6*16(%rax)
1844
movdqa %xmm7, 7*16(%rax)
1845
1846
movdqa sha256d_4preext2_24(%rip), %xmm0
1847
movdqa %xmm3, %xmm2
1848
movdqa %xmm7, %xmm6
1849
psrld $10, %xmm3
1850
psrld $10, %xmm7
1851
movdqa %xmm3, %xmm1
1852
movdqa %xmm7, %xmm5
1853
paddd 1*16(%rax), %xmm0
1854
pslld $13, %xmm2
1855
pslld $13, %xmm6
1856
psrld $7, %xmm1
1857
psrld $7, %xmm5
1858
pxor %xmm1, %xmm3
1859
pxor %xmm5, %xmm7
1860
psrld $2, %xmm1
1861
psrld $2, %xmm5
1862
pxor %xmm2, %xmm3
1863
pxor %xmm6, %xmm7
1864
pslld $2, %xmm2
1865
pslld $2, %xmm6
1866
pxor %xmm1, %xmm3
1867
pxor %xmm5, %xmm7
1868
pxor %xmm2, %xmm3
1869
pxor %xmm6, %xmm7
1870
paddd %xmm0, %xmm3
1871
paddd 2*16(%rax), %xmm7
1872
movdqa %xmm3, 8*16(%rax)
1873
movdqa %xmm7, 9*16(%rax)
1874
1875
movdqa %xmm3, %xmm2
1876
movdqa %xmm7, %xmm6
1877
psrld $10, %xmm3
1878
psrld $10, %xmm7
1879
movdqa %xmm3, %xmm1
1880
movdqa %xmm7, %xmm5
1881
pslld $13, %xmm2
1882
pslld $13, %xmm6
1883
psrld $7, %xmm1
1884
psrld $7, %xmm5
1885
pxor %xmm1, %xmm3
1886
pxor %xmm5, %xmm7
1887
psrld $2, %xmm1
1888
psrld $2, %xmm5
1889
pxor %xmm2, %xmm3
1890
pxor %xmm6, %xmm7
1891
pslld $2, %xmm2
1892
pslld $2, %xmm6
1893
pxor %xmm1, %xmm3
1894
pxor %xmm5, %xmm7
1895
pxor %xmm2, %xmm3
1896
pxor %xmm6, %xmm7
1897
paddd 3*16(%rax), %xmm3
1898
paddd 4*16(%rax), %xmm7
1899
movdqa %xmm3, 10*16(%rax)
1900
movdqa %xmm7, 11*16(%rax)
1901
1902
movdqa %xmm3, %xmm2
1903
movdqa %xmm7, %xmm6
1904
psrld $10, %xmm3
1905
psrld $10, %xmm7
1906
movdqa %xmm3, %xmm1
1907
movdqa %xmm7, %xmm5
1908
pslld $13, %xmm2
1909
pslld $13, %xmm6
1910
psrld $7, %xmm1
1911
psrld $7, %xmm5
1912
pxor %xmm1, %xmm3
1913
pxor %xmm5, %xmm7
1914
psrld $2, %xmm1
1915
psrld $2, %xmm5
1916
pxor %xmm2, %xmm3
1917
pxor %xmm6, %xmm7
1918
pslld $2, %xmm2
1919
pslld $2, %xmm6
1920
pxor %xmm1, %xmm3
1921
pxor %xmm5, %xmm7
1922
pxor %xmm2, %xmm3
1923
pxor %xmm6, %xmm7
1924
paddd 5*16(%rax), %xmm3
1925
paddd 6*16(%rax), %xmm7
1926
movdqa %xmm3, 12*16(%rax)
1927
movdqa %xmm7, 13*16(%rax)
1928
1929
movdqa sha256d_4preext2_30(%rip), %xmm0
1930
movdqa 0*16(%rax), %xmm4
1931
movdqa %xmm4, %xmm6
1932
psrld $3, %xmm4
1933
movdqa %xmm4, %xmm5
1934
pslld $14, %xmm6
1935
psrld $4, %xmm5
1936
pxor %xmm5, %xmm4
1937
pxor %xmm6, %xmm4
1938
psrld $11, %xmm5
1939
pslld $11, %xmm6
1940
pxor %xmm5, %xmm4
1941
pxor %xmm6, %xmm4
1942
paddd -1*16(%rax), %xmm4
1943
movdqa %xmm3, %xmm2
1944
movdqa %xmm7, %xmm6
1945
psrld $10, %xmm3
1946
psrld $10, %xmm7
1947
movdqa %xmm3, %xmm1
1948
movdqa %xmm7, %xmm5
1949
paddd 7*16(%rax), %xmm0
1950
pslld $13, %xmm2
1951
pslld $13, %xmm6
1952
psrld $7, %xmm1
1953
psrld $7, %xmm5
1954
paddd 8*16(%rax), %xmm4
1955
pxor %xmm1, %xmm3
1956
pxor %xmm5, %xmm7
1957
psrld $2, %xmm1
1958
psrld $2, %xmm5
1959
pxor %xmm2, %xmm3
1960
pxor %xmm6, %xmm7
1961
pslld $2, %xmm2
1962
pslld $2, %xmm6
1963
pxor %xmm1, %xmm3
1964
pxor %xmm5, %xmm7
1965
pxor %xmm2, %xmm3
1966
pxor %xmm6, %xmm7
1967
paddd %xmm0, %xmm3
1968
paddd %xmm4, %xmm7
1969
movdqa %xmm3, 14*16(%rax)
1970
movdqa %xmm7, 15*16(%rax)
1971
1972
jmp sha256d_ms_4way_sse2_extend_loop2
1973
1974
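/*
 * Second-hash shortcut: only part of the message schedule is needed for
 * the reduced final rounds, so the extension is cut short (one extra
 * single round here) and the compression is rerun from the standard
 * initial state in sha256_4h.
 */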
sha256d_ms_4way_sse2_extend_coda2:
1975
sha256_sse2_extend_round 44
1976
1977
movdqa sha256_4h+0(%rip), %xmm7
1978
movdqa sha256_4h+16(%rip), %xmm5
1979
movdqa sha256_4h+32(%rip), %xmm4
1980
movdqa sha256_4h+48(%rip), %xmm3
1981
movdqa sha256_4h+64(%rip), %xmm0
1982
movdqa sha256_4h+80(%rip), %xmm1
1983
movdqa sha256_4h+96(%rip), %xmm2
1984
movdqa sha256_4h+112(%rip), %xmm6
1985
movdqa %xmm1, 0(%rsp)
1986
movdqa %xmm2, 16(%rsp)
1987
movdqa %xmm6, 32(%rsp)
1988
1989
leaq 48(%rsp), %rax
1990
leaq sha256_4k(%rip), %rcx
1991
jmp sha256d_ms_4way_sse2_main_loop2
1992
1993
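/*
 * Reduced main round: computes the new e = d + T1 with
 * T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + W[i], rotating the f/g/h queue
 * kept at 0/16/32(%rsp).  Apparently only the e/f/g/h half of the state
 * is carried, since a..d are no longer needed for the last output word.
 */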
.macro sha256_sse2_main_round_red i, r7
1994
movdqa 16*\i(%rax), %xmm6
1995
paddd 16*\i(%rcx), %xmm6
1996
paddd 32(%rsp), %xmm6
1997
movdqa %xmm0, %xmm1
1998
movdqa 16(%rsp), %xmm2
1999
paddd \r7, %xmm6
2000
pandn %xmm2, %xmm1
2001
movdqa %xmm2, 32(%rsp)
2002
movdqa 0(%rsp), %xmm2
2003
movdqa %xmm2, 16(%rsp)
2004
pand %xmm0, %xmm2
2005
pxor %xmm2, %xmm1
2006
movdqa %xmm0, 0(%rsp)
2007
paddd %xmm1, %xmm6
2008
movdqa %xmm0, %xmm1
2009
psrld $6, %xmm0
2010
movdqa %xmm0, %xmm2
2011
pslld $7, %xmm1
2012
psrld $5, %xmm2
2013
pxor %xmm1, %xmm0
2014
pxor %xmm2, %xmm0
2015
pslld $14, %xmm1
2016
psrld $14, %xmm2
2017
pxor %xmm1, %xmm0
2018
pxor %xmm2, %xmm0
2019
pslld $5, %xmm1
2020
pxor %xmm1, %xmm0
2021
paddd %xmm6, %xmm0
2022
.endm
2023
2024
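/*
 * Final reduced rounds 57-60 of the second hash: only the eighth state
 * word is carried through, added to sha256_4h[7] and written to
 * 112(%rdi); presumably the caller tests just this word of the sha256d
 * result against the target, so the other outputs are never stored.
 */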
sha256d_ms_4way_sse2_finish:
2025
sha256_sse2_main_round_red 57, %xmm3
2026
sha256_sse2_main_round_red 58, %xmm4
2027
sha256_sse2_main_round_red 59, %xmm5
2028
sha256_sse2_main_round_red 60, %xmm7
2029
2030
paddd sha256_4h+112(%rip), %xmm0
2031
movdqa %xmm0, 112(%rdi)
2032
2033
addq $8+67*16, %rsp
2034
#if defined(_WIN64) || defined(__CYGWIN__)
2035
popq %rsi
2036
movdqa 0(%rsp), %xmm6
2037
movdqa 16(%rsp), %xmm7
2038
addq $32, %rsp
2039
popq %rdi
2040
#endif
2041
ret
2042
2043
2044
#if defined(USE_AVX)
2045
2046
.p2align 6
2047
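/*
 * AVX variant of sha256d_ms_4way: same algorithm as the SSE2 version
 * above, but written with three-operand VEX instructions and with
 * %xmm8-%xmm10 holding state words that the SSE2 version keeps on the
 * stack.
 */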
sha256d_ms_4way_avx:
2048
#if defined(_WIN64) || defined(__CYGWIN__)
2049
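/*
 * Win64 ABI prologue: %rdi/%rsi and %xmm6-%xmm10 are callee-saved and
 * the first four arguments arrive in %rcx/%rdx/%r8/%r9, so save the
 * registers and move the arguments into the System V registers used by
 * the body of the function.
 */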
pushq %rdi
2050
subq $80, %rsp
2051
movdqa %xmm6, 0(%rsp)
2052
movdqa %xmm7, 16(%rsp)
2053
movdqa %xmm8, 32(%rsp)
2054
movdqa %xmm9, 48(%rsp)
2055
movdqa %xmm10, 64(%rsp)
2056
pushq %rsi
2057
movq %rcx, %rdi
2058
movq %rdx, %rsi
2059
movq %r8, %rdx
2060
movq %r9, %rcx
2061
#endif
2062
subq $1032, %rsp
2063
2064
leaq 256(%rsi), %rax
2065
2066
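/*
 * First part of the message-schedule extension:
 *   W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16]
 * with sigma0(x) = rotr(x,7) ^ rotr(x,18) ^ (x >> 3) and
 *      sigma1(x) = rotr(x,17) ^ rotr(x,19) ^ (x >> 10),
 * the rotations being assembled from shift/xor pairs.  Terms that do not
 * depend on the nonce appear to have been precomputed by the C code,
 * which is why several of them are missing from these first rounds.
 */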
sha256d_ms_4way_avx_extend_loop1:
2067
vmovdqa 3*16(%rsi), %xmm0
2068
vmovdqa 2*16(%rax), %xmm3
2069
vmovdqa 3*16(%rax), %xmm7
2070
vmovdqa %xmm3, 2*16(%rsp)
2071
vmovdqa %xmm7, 3*16(%rsp)
2072
vpaddd %xmm0, %xmm7, %xmm7
2073
vpslld $14, %xmm0, %xmm2
2074
vpsrld $3, %xmm0, %xmm0
2075
vpsrld $4, %xmm0, %xmm1
2076
vpxor %xmm1, %xmm0, %xmm0
2077
vpxor %xmm2, %xmm0, %xmm0
2078
vpsrld $11, %xmm1, %xmm1
2079
vpslld $11, %xmm2, %xmm2
2080
vpxor %xmm1, %xmm0, %xmm0
2081
vpxor %xmm2, %xmm0, %xmm0
2082
vpaddd %xmm0, %xmm3, %xmm3
2083
vmovdqa %xmm3, 2*16(%rax)
2084
vmovdqa %xmm7, 3*16(%rax)
2085
2086
vmovdqa 4*16(%rax), %xmm0
2087
vmovdqa %xmm0, 4*16(%rsp)
2088
vpslld $13, %xmm3, %xmm2
2089
vpslld $13, %xmm7, %xmm6
2090
vpsrld $10, %xmm3, %xmm3
2091
vpsrld $10, %xmm7, %xmm7
2092
vpsrld $7, %xmm3, %xmm1
2093
vpsrld $7, %xmm7, %xmm5
2094
vpxor %xmm1, %xmm3, %xmm3
2095
vpxor %xmm5, %xmm7, %xmm7
2096
vpsrld $2, %xmm1, %xmm1
2097
vpsrld $2, %xmm5, %xmm5
2098
vpxor %xmm2, %xmm3, %xmm3
2099
vpxor %xmm6, %xmm7, %xmm7
2100
vpslld $2, %xmm2, %xmm2
2101
vpslld $2, %xmm6, %xmm6
2102
vpxor %xmm1, %xmm3, %xmm3
2103
vpxor %xmm5, %xmm7, %xmm7
2104
vpxor %xmm2, %xmm3, %xmm3
2105
vpxor %xmm6, %xmm7, %xmm7
2106
vpaddd %xmm0, %xmm3, %xmm3
2107
vmovdqa %xmm3, 4*16(%rax)
2108
vmovdqa %xmm7, 5*16(%rax)
2109
2110
vmovdqa 6*16(%rax), %xmm0
2111
vmovdqa 7*16(%rax), %xmm4
2112
vmovdqa %xmm0, 6*16(%rsp)
2113
vmovdqa %xmm4, 7*16(%rsp)
2114
vpslld $13, %xmm3, %xmm2
2115
vpslld $13, %xmm7, %xmm6
2116
vpsrld $10, %xmm3, %xmm3
2117
vpsrld $10, %xmm7, %xmm7
2118
vpsrld $7, %xmm3, %xmm1
2119
vpsrld $7, %xmm7, %xmm5
2120
vpxor %xmm1, %xmm3, %xmm3
2121
vpxor %xmm5, %xmm7, %xmm7
2122
vpsrld $2, %xmm1, %xmm1
2123
vpsrld $2, %xmm5, %xmm5
2124
vpxor %xmm2, %xmm3, %xmm3
2125
vpxor %xmm6, %xmm7, %xmm7
2126
vpslld $2, %xmm2, %xmm2
2127
vpslld $2, %xmm6, %xmm6
2128
vpxor %xmm1, %xmm3, %xmm3
2129
vpxor %xmm5, %xmm7, %xmm7
2130
vpxor %xmm2, %xmm3, %xmm3
2131
vpxor %xmm6, %xmm7, %xmm7
2132
vpaddd %xmm0, %xmm3, %xmm3
2133
vpaddd %xmm4, %xmm7, %xmm7
2134
vmovdqa %xmm3, 6*16(%rax)
2135
vmovdqa %xmm7, 7*16(%rax)
2136
2137
vmovdqa 8*16(%rax), %xmm0
2138
vmovdqa 2*16(%rax), %xmm4
2139
vmovdqa %xmm0, 8*16(%rsp)
2140
vpslld $13, %xmm3, %xmm2
2141
vpslld $13, %xmm7, %xmm6
2142
vpsrld $10, %xmm3, %xmm3
2143
vpsrld $10, %xmm7, %xmm7
2144
vpsrld $7, %xmm3, %xmm1
2145
vpsrld $7, %xmm7, %xmm5
2146
vpxor %xmm1, %xmm3, %xmm3
2147
vpxor %xmm5, %xmm7, %xmm7
2148
vpsrld $2, %xmm1, %xmm1
2149
vpsrld $2, %xmm5, %xmm5
2150
vpxor %xmm2, %xmm3, %xmm3
2151
vpxor %xmm6, %xmm7, %xmm7
2152
vpslld $2, %xmm2, %xmm2
2153
vpslld $2, %xmm6, %xmm6
2154
vpxor %xmm1, %xmm3, %xmm3
2155
vpxor %xmm5, %xmm7, %xmm7
2156
vpxor %xmm2, %xmm3, %xmm3
2157
vpxor %xmm6, %xmm7, %xmm7
2158
vpaddd %xmm0, %xmm3, %xmm3
2159
vpaddd %xmm4, %xmm7, %xmm7
2160
vmovdqa %xmm3, 8*16(%rax)
2161
vmovdqa %xmm7, 9*16(%rax)
2162
2163
vpslld $13, %xmm3, %xmm2
2164
vpslld $13, %xmm7, %xmm6
2165
vpsrld $10, %xmm3, %xmm3
2166
vpsrld $10, %xmm7, %xmm7
2167
vpsrld $7, %xmm3, %xmm1
2168
vpsrld $7, %xmm7, %xmm5
2169
vpxor %xmm1, %xmm3, %xmm3
2170
vpxor %xmm5, %xmm7, %xmm7
2171
vpsrld $2, %xmm1, %xmm1
2172
vpsrld $2, %xmm5, %xmm5
2173
vpxor %xmm2, %xmm3, %xmm3
2174
vpxor %xmm6, %xmm7, %xmm7
2175
vpslld $2, %xmm2, %xmm2
2176
vpslld $2, %xmm6, %xmm6
2177
vpxor %xmm1, %xmm3, %xmm3
2178
vpxor %xmm5, %xmm7, %xmm7
2179
vpxor %xmm2, %xmm3, %xmm3
2180
vpxor %xmm6, %xmm7, %xmm7
2181
vpaddd 3*16(%rax), %xmm3, %xmm3
2182
vpaddd 4*16(%rax), %xmm7, %xmm7
2183
vmovdqa %xmm3, 10*16(%rax)
2184
vmovdqa %xmm7, 11*16(%rax)
2185
2186
vpslld $13, %xmm3, %xmm2
2187
vpslld $13, %xmm7, %xmm6
2188
vpsrld $10, %xmm3, %xmm3
2189
vpsrld $10, %xmm7, %xmm7
2190
vpsrld $7, %xmm3, %xmm1
2191
vpsrld $7, %xmm7, %xmm5
2192
vpxor %xmm1, %xmm3, %xmm3
2193
vpxor %xmm5, %xmm7, %xmm7
2194
vpsrld $2, %xmm1, %xmm1
2195
vpsrld $2, %xmm5, %xmm5
2196
vpxor %xmm2, %xmm3, %xmm3
2197
vpxor %xmm6, %xmm7, %xmm7
2198
vpslld $2, %xmm2, %xmm2
2199
vpslld $2, %xmm6, %xmm6
2200
vpxor %xmm1, %xmm3, %xmm3
2201
vpxor %xmm5, %xmm7, %xmm7
2202
vpxor %xmm2, %xmm3, %xmm3
2203
vpxor %xmm6, %xmm7, %xmm7
2204
vpaddd 5*16(%rax), %xmm3, %xmm3
2205
vpaddd 6*16(%rax), %xmm7, %xmm7
2206
vmovdqa %xmm3, 12*16(%rax)
2207
vmovdqa %xmm7, 13*16(%rax)
2208
2209
vmovdqa 14*16(%rax), %xmm0
2210
vmovdqa 15*16(%rax), %xmm4
2211
vmovdqa %xmm0, 14*16(%rsp)
2212
vmovdqa %xmm4, 15*16(%rsp)
2213
vpslld $13, %xmm3, %xmm2
2214
vpslld $13, %xmm7, %xmm6
2215
vpsrld $10, %xmm3, %xmm3
2216
vpsrld $10, %xmm7, %xmm7
2217
vpaddd 7*16(%rax), %xmm0, %xmm0
2218
vpaddd 8*16(%rax), %xmm4, %xmm4
2219
vpsrld $7, %xmm3, %xmm1
2220
vpsrld $7, %xmm7, %xmm5
2221
vpxor %xmm1, %xmm3, %xmm3
2222
vpxor %xmm5, %xmm7, %xmm7
2223
vpsrld $2, %xmm1, %xmm1
2224
vpsrld $2, %xmm5, %xmm5
2225
vpxor %xmm2, %xmm3, %xmm3
2226
vpxor %xmm6, %xmm7, %xmm7
2227
vpslld $2, %xmm2, %xmm2
2228
vpslld $2, %xmm6, %xmm6
2229
vpxor %xmm1, %xmm3, %xmm3
2230
vpxor %xmm5, %xmm7, %xmm7
2231
vpxor %xmm2, %xmm3, %xmm3
2232
vpxor %xmm6, %xmm7, %xmm7
2233
vpaddd %xmm0, %xmm3, %xmm3
2234
vpaddd %xmm4, %xmm7, %xmm7
2235
vmovdqa %xmm3, 14*16(%rax)
2236
vmovdqa %xmm7, 15*16(%rax)
2237
2238
sha256d_ms_4way_avx_extend_loop2:
2239
sha256_avx_extend_doubleround 16
2240
sha256_avx_extend_doubleround 18
2241
sha256_avx_extend_doubleround 20
2242
sha256_avx_extend_doubleround 22
2243
sha256_avx_extend_doubleround 24
2244
sha256_avx_extend_doubleround 26
2245
sha256_avx_extend_doubleround 28
2246
sha256_avx_extend_doubleround 30
2247
sha256_avx_extend_doubleround 32
2248
sha256_avx_extend_doubleround 34
2249
sha256_avx_extend_doubleround 36
2250
sha256_avx_extend_doubleround 38
2251
sha256_avx_extend_doubleround 40
2252
sha256_avx_extend_doubleround 42
2253
jz sha256d_ms_4way_avx_extend_coda2
2254
sha256_avx_extend_doubleround 44
2255
sha256_avx_extend_doubleround 46
2256
2257
movdqa 0(%rcx), %xmm7
2258
movdqa 16(%rcx), %xmm8
2259
movdqa 32(%rcx), %xmm9
2260
movdqa 48(%rcx), %xmm10
2261
movdqa 64(%rcx), %xmm0
2262
movdqa 80(%rcx), %xmm5
2263
movdqa 96(%rcx), %xmm4
2264
movdqa 112(%rcx), %xmm3
2265
2266
movq %rsi, %rax
2267
leaq sha256_4k(%rip), %rcx
2268
jmp sha256d_ms_4way_avx_main_loop1
2269
2270
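/*
 * Main compression loop (64 rounds, constants from sha256_4k).
 * main_loop1 enters at round 3; the first three rounds presumably depend
 * only on data that does not change with the nonce and are folded into
 * the precomputed input.  The jz after round 56 diverts the second hash
 * to the reduced finish below.
 */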
sha256d_ms_4way_avx_main_loop2:
2271
sha256_avx_main_round 0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7
2272
sha256_avx_main_round 1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3
2273
sha256_avx_main_round 2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4
2274
sha256d_ms_4way_avx_main_loop1:
2275
sha256_avx_main_round 3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5
2276
sha256_avx_main_quadround 4
2277
sha256_avx_main_quadround 8
2278
sha256_avx_main_quadround 12
2279
sha256_avx_main_quadround 16
2280
sha256_avx_main_quadround 20
2281
sha256_avx_main_quadround 24
2282
sha256_avx_main_quadround 28
2283
sha256_avx_main_quadround 32
2284
sha256_avx_main_quadround 36
2285
sha256_avx_main_quadround 40
2286
sha256_avx_main_quadround 44
2287
sha256_avx_main_quadround 48
2288
sha256_avx_main_quadround 52
2289
sha256_avx_main_round 56, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7
2290
jz sha256d_ms_4way_avx_finish
2291
sha256_avx_main_round 57, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3
2292
sha256_avx_main_round 58, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4
2293
sha256_avx_main_round 59, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5
2294
sha256_avx_main_quadround 60
2295
2296
movdqa 2*16(%rsp), %xmm1
2297
movdqa 3*16(%rsp), %xmm2
2298
movdqa 4*16(%rsp), %xmm6
2299
movdqa %xmm1, 18*16(%rsi)
2300
movdqa %xmm2, 19*16(%rsi)
2301
movdqa %xmm6, 20*16(%rsi)
2302
movdqa 6*16(%rsp), %xmm1
2303
movdqa 7*16(%rsp), %xmm2
2304
movdqa 8*16(%rsp), %xmm6
2305
movdqa %xmm1, 22*16(%rsi)
2306
movdqa %xmm2, 23*16(%rsi)
2307
movdqa %xmm6, 24*16(%rsi)
2308
movdqa 14*16(%rsp), %xmm1
2309
movdqa 15*16(%rsp), %xmm2
2310
movdqa %xmm1, 30*16(%rsi)
2311
movdqa %xmm2, 31*16(%rsi)
2312
2313
paddd 0(%rdx), %xmm7
2314
paddd 16(%rdx), %xmm5
2315
paddd 32(%rdx), %xmm4
2316
paddd 48(%rdx), %xmm3
2317
paddd 64(%rdx), %xmm0
2318
paddd 80(%rdx), %xmm8
2319
paddd 96(%rdx), %xmm9
2320
paddd 112(%rdx), %xmm10
2321
2322
movdqa %xmm7, 0(%rsp)
2323
movdqa %xmm5, 16(%rsp)
2324
movdqa %xmm4, 32(%rsp)
2325
movdqa %xmm3, 48(%rsp)
2326
movdqa %xmm0, 64(%rsp)
2327
movdqa %xmm8, 80(%rsp)
2328
movdqa %xmm9, 96(%rsp)
2329
movdqa %xmm10, 112(%rsp)
2330
2331
pxor %xmm0, %xmm0
2332
movq $0x8000000000000100, %rax
2333
movd %rax, %xmm1
2334
pshufd $0x55, %xmm1, %xmm2
2335
pshufd $0x00, %xmm1, %xmm1
2336
movdqa %xmm2, 128(%rsp)
2337
movdqa %xmm0, 144(%rsp)
2338
movdqa %xmm0, 160(%rsp)
2339
movdqa %xmm0, 176(%rsp)
2340
movdqa %xmm0, 192(%rsp)
2341
movdqa %xmm0, 208(%rsp)
2342
movdqa %xmm0, 224(%rsp)
2343
movdqa %xmm1, 240(%rsp)
2344
2345
leaq 256(%rsp), %rax
2346
cmpq %rax, %rax
2347
2348
vmovdqa -15*16(%rax), %xmm0
2349
vmovdqa -14*16(%rax), %xmm4
2350
vpslld $14, %xmm0, %xmm2
2351
vpslld $14, %xmm4, %xmm6
2352
vpsrld $3, %xmm0, %xmm8
2353
vpsrld $3, %xmm4, %xmm4
2354
vpsrld $7, %xmm0, %xmm1
2355
vpsrld $4, %xmm4, %xmm5
2356
vpxor %xmm1, %xmm8, %xmm8
2357
vpxor %xmm5, %xmm4, %xmm4
2358
vpsrld $11, %xmm1, %xmm1
2359
vpsrld $11, %xmm5, %xmm5
2360
vpxor %xmm2, %xmm8, %xmm8
2361
vpxor %xmm6, %xmm4, %xmm4
2362
vpslld $11, %xmm2, %xmm2
2363
vpslld $11, %xmm6, %xmm6
2364
vpxor %xmm1, %xmm8, %xmm8
2365
vpxor %xmm5, %xmm4, %xmm4
2366
vpxor %xmm2, %xmm8, %xmm8
2367
vpxor %xmm6, %xmm4, %xmm4
2368
vpaddd %xmm0, %xmm4, %xmm4
2369
vpaddd -16*16(%rax), %xmm8, %xmm3
2370
vpaddd sha256d_4preext2_17(%rip), %xmm4, %xmm7
2371
vmovdqa %xmm3, 0*16(%rax)
2372
vmovdqa %xmm7, 1*16(%rax)
2373
2374
sha256_avx_extend_doubleround 2
2375
sha256_avx_extend_doubleround 4
2376
2377
vmovdqa -9*16(%rax), %xmm0
2378
vpslld $14, %xmm0, %xmm2
2379
vpsrld $3, %xmm0, %xmm8
2380
vpsrld $7, %xmm0, %xmm1
2381
vpxor %xmm1, %xmm8, %xmm8
2382
vpxor %xmm2, %xmm8, %xmm8
2383
vpsrld $11, %xmm1, %xmm1
2384
vpslld $11, %xmm2, %xmm2
2385
vpxor %xmm1, %xmm8, %xmm8
2386
vpxor %xmm2, %xmm8, %xmm8
2387
vpaddd sha256d_4preext2_23(%rip), %xmm0, %xmm4
2388
vpaddd -10*16(%rax), %xmm8, %xmm0
2389
vpslld $13, %xmm3, %xmm2
2390
vpslld $13, %xmm7, %xmm6
2391
vpsrld $10, %xmm3, %xmm3
2392
vpsrld $10, %xmm7, %xmm7
2393
vpaddd -1*16(%rax), %xmm0, %xmm0
2394
vpaddd 0*16(%rax), %xmm4, %xmm4
2395
vpsrld $7, %xmm3, %xmm1
2396
vpsrld $7, %xmm7, %xmm5
2397
vpxor %xmm1, %xmm3, %xmm3
2398
vpxor %xmm5, %xmm7, %xmm7
2399
vpsrld $2, %xmm1, %xmm1
2400
vpsrld $2, %xmm5, %xmm5
2401
vpxor %xmm2, %xmm3, %xmm3
2402
vpxor %xmm6, %xmm7, %xmm7
2403
vpslld $2, %xmm2, %xmm2
2404
vpslld $2, %xmm6, %xmm6
2405
vpxor %xmm1, %xmm3, %xmm3
2406
vpxor %xmm5, %xmm7, %xmm7
2407
vpxor %xmm2, %xmm3, %xmm3
2408
vpxor %xmm6, %xmm7, %xmm7
2409
vpaddd %xmm0, %xmm3, %xmm3
2410
vpaddd %xmm4, %xmm7, %xmm7
2411
vmovdqa %xmm3, 6*16(%rax)
2412
vmovdqa %xmm7, 7*16(%rax)
2413
2414
vpslld $13, %xmm3, %xmm2
2415
vpslld $13, %xmm7, %xmm6
2416
vpsrld $10, %xmm3, %xmm3
2417
vpsrld $10, %xmm7, %xmm7
2418
vpsrld $7, %xmm3, %xmm1
2419
vpsrld $7, %xmm7, %xmm5
2420
vpxor %xmm1, %xmm3, %xmm3
2421
vpxor %xmm5, %xmm7, %xmm7
2422
vpsrld $2, %xmm1, %xmm1
2423
vpsrld $2, %xmm5, %xmm5
2424
vpxor %xmm2, %xmm3, %xmm3
2425
vpxor %xmm6, %xmm7, %xmm7
2426
vpslld $2, %xmm2, %xmm2
2427
vpslld $2, %xmm6, %xmm6
2428
vpxor %xmm1, %xmm3, %xmm3
2429
vpxor %xmm5, %xmm7, %xmm7
2430
vpxor %xmm2, %xmm3, %xmm3
2431
vpxor %xmm6, %xmm7, %xmm7
2432
vpaddd sha256d_4preext2_24(%rip), %xmm3, %xmm3
2433
vpaddd 1*16(%rax), %xmm3, %xmm3
2434
vpaddd 2*16(%rax), %xmm7, %xmm7
2435
vmovdqa %xmm3, 8*16(%rax)
2436
vmovdqa %xmm7, 9*16(%rax)
2437
2438
vpslld $13, %xmm3, %xmm2
2439
vpslld $13, %xmm7, %xmm6
2440
vpsrld $10, %xmm3, %xmm3
2441
vpsrld $10, %xmm7, %xmm7
2442
vpsrld $7, %xmm3, %xmm1
2443
vpsrld $7, %xmm7, %xmm5
2444
vpxor %xmm1, %xmm3, %xmm3
2445
vpxor %xmm5, %xmm7, %xmm7
2446
vpsrld $2, %xmm1, %xmm1
2447
vpsrld $2, %xmm5, %xmm5
2448
vpxor %xmm2, %xmm3, %xmm3
2449
vpxor %xmm6, %xmm7, %xmm7
2450
vpslld $2, %xmm2, %xmm2
2451
vpslld $2, %xmm6, %xmm6
2452
vpxor %xmm1, %xmm3, %xmm3
2453
vpxor %xmm5, %xmm7, %xmm7
2454
vpxor %xmm2, %xmm3, %xmm3
2455
vpxor %xmm6, %xmm7, %xmm7
2456
vpaddd 3*16(%rax), %xmm3, %xmm3
2457
vpaddd 4*16(%rax), %xmm7, %xmm7
2458
vmovdqa %xmm3, 10*16(%rax)
2459
vmovdqa %xmm7, 11*16(%rax)
2460
2461
vpslld $13, %xmm3, %xmm2
2462
vpslld $13, %xmm7, %xmm6
2463
vpsrld $10, %xmm3, %xmm3
2464
vpsrld $10, %xmm7, %xmm7
2465
vpsrld $7, %xmm3, %xmm1
2466
vpsrld $7, %xmm7, %xmm5
2467
vpxor %xmm1, %xmm3, %xmm3
2468
vpxor %xmm5, %xmm7, %xmm7
2469
vpsrld $2, %xmm1, %xmm1
2470
vpsrld $2, %xmm5, %xmm5
2471
vpxor %xmm2, %xmm3, %xmm3
2472
vpxor %xmm6, %xmm7, %xmm7
2473
vpslld $2, %xmm2, %xmm2
2474
vpslld $2, %xmm6, %xmm6
2475
vpxor %xmm1, %xmm3, %xmm3
2476
vpxor %xmm5, %xmm7, %xmm7
2477
vpxor %xmm2, %xmm3, %xmm3
2478
vpxor %xmm6, %xmm7, %xmm7
2479
vpaddd 5*16(%rax), %xmm3, %xmm3
2480
vpaddd 6*16(%rax), %xmm7, %xmm7
2481
vmovdqa %xmm3, 12*16(%rax)
2482
vmovdqa %xmm7, 13*16(%rax)
2483
2484
vmovdqa sha256d_4preext2_30(%rip), %xmm0
2485
vmovdqa 0*16(%rax), %xmm4
2486
vpslld $14, %xmm4, %xmm6
2487
vpsrld $3, %xmm4, %xmm4
2488
vpsrld $4, %xmm4, %xmm5
2489
vpxor %xmm5, %xmm4, %xmm4
2490
vpxor %xmm6, %xmm4, %xmm4
2491
vpsrld $11, %xmm5, %xmm5
2492
vpslld $11, %xmm6, %xmm6
2493
vpxor %xmm5, %xmm4, %xmm4
2494
vpxor %xmm6, %xmm4, %xmm4
2495
vpaddd -1*16(%rax), %xmm4, %xmm4
2496
vpslld $13, %xmm3, %xmm2
2497
vpslld $13, %xmm7, %xmm6
2498
vpsrld $10, %xmm3, %xmm3
2499
vpsrld $10, %xmm7, %xmm7
2500
vpaddd 7*16(%rax), %xmm0, %xmm0
2501
vpaddd 8*16(%rax), %xmm4, %xmm4
2502
vpsrld $7, %xmm3, %xmm1
2503
vpsrld $7, %xmm7, %xmm5
2504
vpxor %xmm1, %xmm3, %xmm3
2505
vpxor %xmm5, %xmm7, %xmm7
2506
vpsrld $2, %xmm1, %xmm1
2507
vpsrld $2, %xmm5, %xmm5
2508
vpxor %xmm2, %xmm3, %xmm3
2509
vpxor %xmm6, %xmm7, %xmm7
2510
vpslld $2, %xmm2, %xmm2
2511
vpslld $2, %xmm6, %xmm6
2512
vpxor %xmm1, %xmm3, %xmm3
2513
vpxor %xmm5, %xmm7, %xmm7
2514
vpxor %xmm2, %xmm3, %xmm3
2515
vpxor %xmm6, %xmm7, %xmm7
2516
vpaddd %xmm0, %xmm3, %xmm3
2517
vpaddd %xmm4, %xmm7, %xmm7
2518
vmovdqa %xmm3, 14*16(%rax)
2519
vmovdqa %xmm7, 15*16(%rax)
2520
2521
jmp sha256d_ms_4way_avx_extend_loop2
2522
2523
sha256d_ms_4way_avx_extend_coda2:
2524
sha256_avx_extend_round 44
2525
2526
movdqa sha256_4h+0(%rip), %xmm7
2527
movdqa sha256_4h+16(%rip), %xmm5
2528
movdqa sha256_4h+32(%rip), %xmm4
2529
movdqa sha256_4h+48(%rip), %xmm3
2530
movdqa sha256_4h+64(%rip), %xmm0
2531
movdqa sha256_4h+80(%rip), %xmm8
2532
movdqa sha256_4h+96(%rip), %xmm9
2533
movdqa sha256_4h+112(%rip), %xmm10
2534
2535
movq %rsp, %rax
2536
leaq sha256_4k(%rip), %rcx
2537
jmp sha256d_ms_4way_avx_main_loop2
2538
2539
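/*
 * AVX form of the reduced final round: the same e = d + T1 computation
 * as the SSE2 macro above, with the e/f/g/h words passed in registers.
 */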
.macro sha256_avx_main_round_red i, r0, r1, r2, r3, r4
2540
vpaddd 16*\i(%rax), \r0, %xmm6
2541
vpaddd 16*\i(%rcx), %xmm6, %xmm6
2542
vpandn \r1, \r3, %xmm1
2543
vpand \r3, \r2, %xmm2
2544
vpxor %xmm2, %xmm1, %xmm1
2545
vpaddd %xmm1, %xmm6, %xmm6
2546
vpslld $7, \r3, %xmm1
2547
vpsrld $6, \r3, \r0
2548
vpsrld $5, \r0, %xmm2
2549
vpxor %xmm1, \r0, \r0
2550
vpxor %xmm2, \r0, \r0
2551
vpslld $14, %xmm1, %xmm1
2552
vpsrld $14, %xmm2, %xmm2
2553
vpxor %xmm1, \r0, \r0
2554
vpxor %xmm2, \r0, \r0
2555
vpslld $5, %xmm1, %xmm1
2556
vpxor %xmm1, \r0, \r0
2557
vpaddd \r0, %xmm6, %xmm6
2558
vpaddd %xmm6, \r4, \r0
2559
.endm
2560
2561
sha256d_ms_4way_avx_finish:
2562
sha256_avx_main_round_red 57, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4
2563
sha256_avx_main_round_red 58, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5
2564
sha256_avx_main_round_red 59, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7
2565
sha256_avx_main_round_red 60, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3
2566
2567
paddd sha256_4h+112(%rip), %xmm10
2568
movdqa %xmm10, 112(%rdi)
2569
2570
addq $1032, %rsp
2571
#if defined(_WIN64) || defined(__CYGWIN__)
2572
popq %rsi
2573
movdqa 0(%rsp), %xmm6
2574
movdqa 16(%rsp), %xmm7
2575
movdqa 32(%rsp), %xmm8
2576
movdqa 48(%rsp), %xmm9
2577
movdqa 64(%rsp), %xmm10
2578
addq $80, %rsp
2579
popq %rdi
2580
#endif
2581
ret
2582
2583
#endif /* USE_AVX */
2584
2585
2586
#if defined(USE_XOP)
2587
2588
.p2align 6
2589
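/*
 * XOP variant: structurally identical to the AVX version, but the
 * sigma/Sigma rotations are done with single vprotd instructions
 * instead of shift/xor sequences.
 */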
sha256d_ms_4way_xop:
2590
#if defined(_WIN64) || defined(__CYGWIN__)
2591
pushq %rdi
2592
subq $80, %rsp
2593
movdqa %xmm6, 0(%rsp)
2594
movdqa %xmm7, 16(%rsp)
2595
movdqa %xmm8, 32(%rsp)
2596
movdqa %xmm9, 48(%rsp)
2597
movdqa %xmm10, 64(%rsp)
2598
pushq %rsi
2599
movq %rcx, %rdi
2600
movq %rdx, %rsi
2601
movq %r8, %rdx
2602
movq %r9, %rcx
2603
#endif
2604
subq $1032, %rsp
2605
2606
leaq 256(%rsi), %rax
2607
2608
sha256d_ms_4way_xop_extend_loop1:
2609
vmovdqa 3*16(%rsi), %xmm0
2610
vmovdqa 2*16(%rax), %xmm3
2611
vmovdqa 3*16(%rax), %xmm7
2612
vmovdqa %xmm3, 2*16(%rsp)
2613
vmovdqa %xmm7, 3*16(%rsp)
2614
vpaddd %xmm0, %xmm7, %xmm7
2615
vprotd $25, %xmm0, %xmm1
2616
vprotd $14, %xmm0, %xmm2
2617
vpsrld $3, %xmm0, %xmm0
2618
vpxor %xmm1, %xmm2, %xmm2
2619
vpxor %xmm2, %xmm0, %xmm0
2620
vpaddd %xmm0, %xmm3, %xmm3
2621
vmovdqa %xmm3, 2*16(%rax)
2622
vmovdqa %xmm7, 3*16(%rax)
2623
2624
vmovdqa 4*16(%rax), %xmm0
2625
vmovdqa %xmm0, 4*16(%rsp)
2626
vprotd $15, %xmm3, %xmm1
2627
vprotd $15, %xmm7, %xmm5
2628
vprotd $13, %xmm3, %xmm2
2629
vprotd $13, %xmm7, %xmm6
2630
vpxor %xmm1, %xmm2, %xmm2
2631
vpxor %xmm5, %xmm6, %xmm6
2632
vpsrld $10, %xmm3, %xmm3
2633
vpsrld $10, %xmm7, %xmm7
2634
vpxor %xmm2, %xmm3, %xmm3
2635
vpxor %xmm6, %xmm7, %xmm7
2636
vpaddd %xmm0, %xmm3, %xmm3
2637
vmovdqa %xmm3, 4*16(%rax)
2638
vmovdqa %xmm7, 5*16(%rax)
2639
2640
vmovdqa 6*16(%rax), %xmm0
2641
vmovdqa 7*16(%rax), %xmm4
2642
vmovdqa %xmm0, 6*16(%rsp)
2643
vmovdqa %xmm4, 7*16(%rsp)
2644
vprotd $15, %xmm3, %xmm1
2645
vprotd $15, %xmm7, %xmm5
2646
vprotd $13, %xmm3, %xmm2
2647
vprotd $13, %xmm7, %xmm6
2648
vpxor %xmm1, %xmm2, %xmm2
2649
vpxor %xmm5, %xmm6, %xmm6
2650
vpsrld $10, %xmm3, %xmm3
2651
vpsrld $10, %xmm7, %xmm7
2652
vpxor %xmm2, %xmm3, %xmm3
2653
vpxor %xmm6, %xmm7, %xmm7
2654
vpaddd %xmm0, %xmm3, %xmm3
2655
vpaddd %xmm4, %xmm7, %xmm7
2656
vmovdqa %xmm3, 6*16(%rax)
2657
vmovdqa %xmm7, 7*16(%rax)
2658
2659
vmovdqa 8*16(%rax), %xmm0
2660
vmovdqa 2*16(%rax), %xmm4
2661
vmovdqa %xmm0, 8*16(%rsp)
2662
vprotd $15, %xmm3, %xmm1
2663
vprotd $15, %xmm7, %xmm5
2664
vprotd $13, %xmm3, %xmm2
2665
vprotd $13, %xmm7, %xmm6
2666
vpxor %xmm1, %xmm2, %xmm2
2667
vpxor %xmm5, %xmm6, %xmm6
2668
vpsrld $10, %xmm3, %xmm3
2669
vpsrld $10, %xmm7, %xmm7
2670
vpxor %xmm2, %xmm3, %xmm3
2671
vpxor %xmm6, %xmm7, %xmm7
2672
vpaddd %xmm0, %xmm3, %xmm3
2673
vpaddd %xmm4, %xmm7, %xmm7
2674
vmovdqa %xmm3, 8*16(%rax)
2675
vmovdqa %xmm7, 9*16(%rax)
2676
2677
vprotd $15, %xmm3, %xmm1
2678
vprotd $15, %xmm7, %xmm5
2679
vprotd $13, %xmm3, %xmm2
2680
vprotd $13, %xmm7, %xmm6
2681
vpxor %xmm1, %xmm2, %xmm2
2682
vpxor %xmm5, %xmm6, %xmm6
2683
vpsrld $10, %xmm3, %xmm3
2684
vpsrld $10, %xmm7, %xmm7
2685
vpxor %xmm2, %xmm3, %xmm3
2686
vpxor %xmm6, %xmm7, %xmm7
2687
vpaddd 3*16(%rax), %xmm3, %xmm3
2688
vpaddd 4*16(%rax), %xmm7, %xmm7
2689
vmovdqa %xmm3, 10*16(%rax)
2690
vmovdqa %xmm7, 11*16(%rax)
2691
2692
vprotd $15, %xmm3, %xmm1
2693
vprotd $15, %xmm7, %xmm5
2694
vprotd $13, %xmm3, %xmm2
2695
vprotd $13, %xmm7, %xmm6
2696
vpxor %xmm1, %xmm2, %xmm2
2697
vpxor %xmm5, %xmm6, %xmm6
2698
vpsrld $10, %xmm3, %xmm3
2699
vpsrld $10, %xmm7, %xmm7
2700
vpxor %xmm2, %xmm3, %xmm3
2701
vpxor %xmm6, %xmm7, %xmm7
2702
vpaddd 5*16(%rax), %xmm3, %xmm3
2703
vpaddd 6*16(%rax), %xmm7, %xmm7
2704
vmovdqa %xmm3, 12*16(%rax)
2705
vmovdqa %xmm7, 13*16(%rax)
2706
2707
vmovdqa 14*16(%rax), %xmm0
2708
vmovdqa 15*16(%rax), %xmm4
2709
vmovdqa %xmm0, 14*16(%rsp)
2710
vmovdqa %xmm4, 15*16(%rsp)
2711
vprotd $15, %xmm3, %xmm1
2712
vprotd $15, %xmm7, %xmm5
2713
vprotd $13, %xmm3, %xmm2
2714
vprotd $13, %xmm7, %xmm6
2715
vpxor %xmm1, %xmm2, %xmm2
2716
vpxor %xmm5, %xmm6, %xmm6
2717
vpaddd 7*16(%rax), %xmm0, %xmm0
2718
vpaddd 8*16(%rax), %xmm4, %xmm4
2719
vpsrld $10, %xmm3, %xmm3
2720
vpsrld $10, %xmm7, %xmm7
2721
vpxor %xmm2, %xmm3, %xmm3
2722
vpxor %xmm6, %xmm7, %xmm7
2723
vpaddd %xmm0, %xmm3, %xmm3
2724
vpaddd %xmm4, %xmm7, %xmm7
2725
vmovdqa %xmm3, 14*16(%rax)
2726
vmovdqa %xmm7, 15*16(%rax)
2727
2728
sha256d_ms_4way_xop_extend_loop2:
2729
sha256_xop_extend_doubleround 16
2730
sha256_xop_extend_doubleround 18
2731
sha256_xop_extend_doubleround 20
2732
sha256_xop_extend_doubleround 22
2733
sha256_xop_extend_doubleround 24
2734
sha256_xop_extend_doubleround 26
2735
sha256_xop_extend_doubleround 28
2736
sha256_xop_extend_doubleround 30
2737
sha256_xop_extend_doubleround 32
2738
sha256_xop_extend_doubleround 34
2739
sha256_xop_extend_doubleround 36
2740
sha256_xop_extend_doubleround 38
2741
sha256_xop_extend_doubleround 40
2742
sha256_xop_extend_doubleround 42
2743
jz sha256d_ms_4way_xop_extend_coda2
2744
sha256_xop_extend_doubleround 44
2745
sha256_xop_extend_doubleround 46
2746
2747
movdqa 0(%rcx), %xmm7
2748
movdqa 16(%rcx), %xmm8
2749
movdqa 32(%rcx), %xmm9
2750
movdqa 48(%rcx), %xmm10
2751
movdqa 64(%rcx), %xmm0
2752
movdqa 80(%rcx), %xmm5
2753
movdqa 96(%rcx), %xmm4
2754
movdqa 112(%rcx), %xmm3
2755
2756
movq %rsi, %rax
2757
leaq sha256_4k(%rip), %rcx
2758
jmp sha256d_ms_4way_xop_main_loop1
2759
2760
sha256d_ms_4way_xop_main_loop2:
2761
sha256_xop_main_round 0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7
2762
sha256_xop_main_round 1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3
2763
sha256_xop_main_round 2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4
2764
sha256d_ms_4way_xop_main_loop1:
2765
sha256_xop_main_round 3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5
2766
sha256_xop_main_quadround 4
2767
sha256_xop_main_quadround 8
2768
sha256_xop_main_quadround 12
2769
sha256_xop_main_quadround 16
2770
sha256_xop_main_quadround 20
2771
sha256_xop_main_quadround 24
2772
sha256_xop_main_quadround 28
2773
sha256_xop_main_quadround 32
2774
sha256_xop_main_quadround 36
2775
sha256_xop_main_quadround 40
2776
sha256_xop_main_quadround 44
2777
sha256_xop_main_quadround 48
2778
sha256_xop_main_quadround 52
2779
sha256_xop_main_round 56, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7
2780
jz sha256d_ms_4way_xop_finish
2781
sha256_xop_main_round 57, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3
2782
sha256_xop_main_round 58, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4
2783
sha256_xop_main_round 59, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5
2784
sha256_xop_main_quadround 60
2785
2786
movdqa 2*16(%rsp), %xmm1
2787
movdqa 3*16(%rsp), %xmm2
2788
movdqa 4*16(%rsp), %xmm6
2789
movdqa %xmm1, 18*16(%rsi)
2790
movdqa %xmm2, 19*16(%rsi)
2791
movdqa %xmm6, 20*16(%rsi)
2792
movdqa 6*16(%rsp), %xmm1
2793
movdqa 7*16(%rsp), %xmm2
2794
movdqa 8*16(%rsp), %xmm6
2795
movdqa %xmm1, 22*16(%rsi)
2796
movdqa %xmm2, 23*16(%rsi)
2797
movdqa %xmm6, 24*16(%rsi)
2798
movdqa 14*16(%rsp), %xmm1
2799
movdqa 15*16(%rsp), %xmm2
2800
movdqa %xmm1, 30*16(%rsi)
2801
movdqa %xmm2, 31*16(%rsi)
2802
2803
paddd 0(%rdx), %xmm7
2804
paddd 16(%rdx), %xmm5
2805
paddd 32(%rdx), %xmm4
2806
paddd 48(%rdx), %xmm3
2807
paddd 64(%rdx), %xmm0
2808
paddd 80(%rdx), %xmm8
2809
paddd 96(%rdx), %xmm9
2810
paddd 112(%rdx), %xmm10
2811
2812
movdqa %xmm7, 0(%rsp)
2813
movdqa %xmm5, 16(%rsp)
2814
movdqa %xmm4, 32(%rsp)
2815
movdqa %xmm3, 48(%rsp)
2816
movdqa %xmm0, 64(%rsp)
2817
movdqa %xmm8, 80(%rsp)
2818
movdqa %xmm9, 96(%rsp)
2819
movdqa %xmm10, 112(%rsp)
2820
2821
pxor %xmm0, %xmm0
2822
movq $0x8000000000000100, %rax
2823
movd %rax, %xmm1
2824
pshufd $0x55, %xmm1, %xmm2
2825
pshufd $0x00, %xmm1, %xmm1
2826
movdqa %xmm2, 128(%rsp)
2827
movdqa %xmm0, 144(%rsp)
2828
movdqa %xmm0, 160(%rsp)
2829
movdqa %xmm0, 176(%rsp)
2830
movdqa %xmm0, 192(%rsp)
2831
movdqa %xmm0, 208(%rsp)
2832
movdqa %xmm0, 224(%rsp)
2833
movdqa %xmm1, 240(%rsp)
2834
2835
leaq 256(%rsp), %rax
2836
cmpq %rax, %rax
2837
2838
vmovdqa -15*16(%rax), %xmm0
2839
vmovdqa -14*16(%rax), %xmm4
2840
vprotd $25, %xmm0, %xmm1
2841
vprotd $25, %xmm4, %xmm5
2842
vprotd $14, %xmm0, %xmm2
2843
vprotd $14, %xmm4, %xmm6
2844
vpxor %xmm1, %xmm2, %xmm2
2845
vpxor %xmm5, %xmm6, %xmm6
2846
vpsrld $3, %xmm0, %xmm8
2847
vpsrld $3, %xmm4, %xmm4
2848
vpxor %xmm2, %xmm8, %xmm8
2849
vpxor %xmm6, %xmm4, %xmm4
2850
vpaddd %xmm0, %xmm4, %xmm4
2851
vpaddd -16*16(%rax), %xmm8, %xmm3
2852
vpaddd sha256d_4preext2_17(%rip), %xmm4, %xmm7
2853
vmovdqa %xmm3, 0*16(%rax)
2854
vmovdqa %xmm7, 1*16(%rax)
2855
2856
sha256_xop_extend_doubleround 2
2857
sha256_xop_extend_doubleround 4
2858
2859
vmovdqa -9*16(%rax), %xmm0
2860
vprotd $25, %xmm0, %xmm1
2861
vprotd $14, %xmm0, %xmm2
2862
vpsrld $3, %xmm0, %xmm8
2863
vpxor %xmm1, %xmm2, %xmm2
2864
vpxor %xmm2, %xmm8, %xmm8
2865
vpaddd sha256d_4preext2_23(%rip), %xmm0, %xmm4
2866
vpaddd -10*16(%rax), %xmm8, %xmm0
2867
vprotd $15, %xmm3, %xmm1
2868
vprotd $15, %xmm7, %xmm5
2869
vprotd $13, %xmm3, %xmm2
2870
vprotd $13, %xmm7, %xmm6
2871
vpxor %xmm1, %xmm2, %xmm2
2872
vpxor %xmm5, %xmm6, %xmm6
2873
vpaddd -1*16(%rax), %xmm0, %xmm0
2874
vpaddd 0*16(%rax), %xmm4, %xmm4
2875
vpsrld $10, %xmm3, %xmm3
2876
vpsrld $10, %xmm7, %xmm7
2877
vpxor %xmm2, %xmm3, %xmm3
2878
vpxor %xmm6, %xmm7, %xmm7
2879
vpaddd %xmm0, %xmm3, %xmm3
2880
vpaddd %xmm4, %xmm7, %xmm7
2881
vmovdqa %xmm3, 6*16(%rax)
2882
vmovdqa %xmm7, 7*16(%rax)
2883
2884
vprotd $15, %xmm3, %xmm1
2885
vprotd $15, %xmm7, %xmm5
2886
vprotd $13, %xmm3, %xmm2
2887
vprotd $13, %xmm7, %xmm6
2888
vpxor %xmm1, %xmm2, %xmm2
2889
vpxor %xmm5, %xmm6, %xmm6
2890
vpsrld $10, %xmm3, %xmm3
2891
vpsrld $10, %xmm7, %xmm7
2892
vpxor %xmm2, %xmm3, %xmm3
2893
vpxor %xmm6, %xmm7, %xmm7
2894
vpaddd sha256d_4preext2_24(%rip), %xmm3, %xmm3
2895
vpaddd 1*16(%rax), %xmm3, %xmm3
2896
vpaddd 2*16(%rax), %xmm7, %xmm7
2897
vmovdqa %xmm3, 8*16(%rax)
2898
vmovdqa %xmm7, 9*16(%rax)
2899
2900
vprotd $15, %xmm3, %xmm1
2901
vprotd $15, %xmm7, %xmm5
2902
vprotd $13, %xmm3, %xmm2
2903
vprotd $13, %xmm7, %xmm6
2904
vpxor %xmm1, %xmm2, %xmm2
2905
vpxor %xmm5, %xmm6, %xmm6
2906
vpsrld $10, %xmm3, %xmm3
2907
vpsrld $10, %xmm7, %xmm7
2908
vpxor %xmm2, %xmm3, %xmm3
2909
vpxor %xmm6, %xmm7, %xmm7
2910
vpaddd 3*16(%rax), %xmm3, %xmm3
2911
vpaddd 4*16(%rax), %xmm7, %xmm7
2912
vmovdqa %xmm3, 10*16(%rax)
2913
vmovdqa %xmm7, 11*16(%rax)
2914
2915
vprotd $15, %xmm3, %xmm1
2916
vprotd $15, %xmm7, %xmm5
2917
vprotd $13, %xmm3, %xmm2
2918
vprotd $13, %xmm7, %xmm6
2919
vpxor %xmm1, %xmm2, %xmm2
2920
vpxor %xmm5, %xmm6, %xmm6
2921
vpsrld $10, %xmm3, %xmm3
2922
vpsrld $10, %xmm7, %xmm7
2923
vpxor %xmm2, %xmm3, %xmm3
2924
vpxor %xmm6, %xmm7, %xmm7
2925
vpaddd 5*16(%rax), %xmm3, %xmm3
2926
vpaddd 6*16(%rax), %xmm7, %xmm7
2927
vmovdqa %xmm3, 12*16(%rax)
2928
vmovdqa %xmm7, 13*16(%rax)
2929
2930
vmovdqa sha256d_4preext2_30(%rip), %xmm0
2931
vmovdqa 0*16(%rax), %xmm4
2932
vprotd $25, %xmm4, %xmm5
2933
vprotd $14, %xmm4, %xmm6
2934
vpxor %xmm5, %xmm6, %xmm6
2935
vpsrld $3, %xmm4, %xmm4
2936
vpxor %xmm6, %xmm4, %xmm4
2937
vpaddd -1*16(%rax), %xmm4, %xmm4
2938
vprotd $15, %xmm3, %xmm1
2939
vprotd $15, %xmm7, %xmm5
2940
vprotd $13, %xmm3, %xmm2
2941
vprotd $13, %xmm7, %xmm6
2942
vpxor %xmm1, %xmm2, %xmm2
2943
vpxor %xmm5, %xmm6, %xmm6
2944
vpaddd 7*16(%rax), %xmm0, %xmm0
2945
vpaddd 8*16(%rax), %xmm4, %xmm4
2946
vpsrld $10, %xmm3, %xmm3
2947
vpsrld $10, %xmm7, %xmm7
2948
vpxor %xmm2, %xmm3, %xmm3
2949
vpxor %xmm6, %xmm7, %xmm7
2950
vpaddd %xmm0, %xmm3, %xmm3
2951
vpaddd %xmm4, %xmm7, %xmm7
2952
vmovdqa %xmm3, 14*16(%rax)
2953
vmovdqa %xmm7, 15*16(%rax)
2954
2955
jmp sha256d_ms_4way_xop_extend_loop2
2956
2957
sha256d_ms_4way_xop_extend_coda2:
2958
sha256_xop_extend_round 44
2959
2960
movdqa sha256_4h+0(%rip), %xmm7
2961
movdqa sha256_4h+16(%rip), %xmm5
2962
movdqa sha256_4h+32(%rip), %xmm4
2963
movdqa sha256_4h+48(%rip), %xmm3
2964
movdqa sha256_4h+64(%rip), %xmm0
2965
movdqa sha256_4h+80(%rip), %xmm8
2966
movdqa sha256_4h+96(%rip), %xmm9
2967
movdqa sha256_4h+112(%rip), %xmm10
2968
2969
movq %rsp, %rax
2970
leaq sha256_4k(%rip), %rcx
2971
jmp sha256d_ms_4way_xop_main_loop2
2972
2973
.macro sha256_xop_main_round_red i, r0, r1, r2, r3, r4
2974
vpaddd 16*\i(%rax), \r0, %xmm6
2975
vpaddd 16*\i(%rcx), %xmm6, %xmm6
2976
vpandn \r1, \r3, %xmm1
2977
vpand \r3, \r2, %xmm2
2978
vpxor %xmm2, %xmm1, %xmm1
2979
vpaddd %xmm1, %xmm6, %xmm6
2980
vprotd $26, \r3, %xmm1
2981
vprotd $21, \r3, %xmm2
2982
vpxor %xmm1, %xmm2, %xmm2
2983
vprotd $7, \r3, \r0
2984
vpxor %xmm2, \r0, \r0
2985
vpaddd \r0, %xmm6, %xmm6
2986
vpaddd %xmm6, \r4, \r0
2987
.endm
2988
2989
sha256d_ms_4way_xop_finish:
2990
sha256_xop_main_round_red 57, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4
2991
sha256_xop_main_round_red 58, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5
2992
sha256_xop_main_round_red 59, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7
2993
sha256_xop_main_round_red 60, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3
2994
2995
paddd sha256_4h+112(%rip), %xmm10
2996
movdqa %xmm10, 112(%rdi)
2997
2998
addq $1032, %rsp
2999
#if defined(_WIN64) || defined(__CYGWIN__)
3000
popq %rsi
3001
movdqa 0(%rsp), %xmm6
3002
movdqa 16(%rsp), %xmm7
3003
movdqa 32(%rsp), %xmm8
3004
movdqa 48(%rsp), %xmm9
3005
movdqa 64(%rsp), %xmm10
3006
addq $80, %rsp
3007
popq %rdi
3008
#endif
3009
ret
3010
3011
#endif /* USE_XOP */
3012
3013
3014
.text
3015
.p2align 6
3016
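/*
 * sha256_use_4way: probe the CPU at run time and point
 * sha256d_ms_4way_addr and sha256_transform_4way_core_addr at the best
 * available 4-way implementation (XOP, then AVX, then SSE2).  Always
 * returns 1, since the SSE2 fallback exists on every x86-64 CPU.
 */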
.globl sha256_use_4way
3017
.globl _sha256_use_4way
3018
sha256_use_4way:
3019
_sha256_use_4way:
3020
pushq %rbx
3021
pushq %rcx
3022
pushq %rdx
3023
3024
#if defined(USE_AVX)
3025
/* Check for AVX and OSXSAVE support */
3026
movl $1, %eax
3027
cpuid
3028
andl $0x18000000, %ecx
3029
cmpl $0x18000000, %ecx
3030
jne sha256_use_4way_base
3031
/* Check for XMM and YMM state support */
3032
xorl %ecx, %ecx
3033
xgetbv
3034
andl $0x00000006, %eax
3035
cmpl $0x00000006, %eax
3036
jne sha256_use_4way_base
3037
#if defined(USE_XOP)
3038
/* Check for XOP support */
3039
movl $0x80000001, %eax
3040
cpuid
3041
andl $0x00000800, %ecx
3042
jz sha256_use_4way_avx
3043
3044
sha256_use_4way_xop:
3045
leaq sha256d_ms_4way_xop(%rip), %rcx
3046
leaq sha256_transform_4way_core_xop(%rip), %rdx
3047
jmp sha256_use_4way_done
3048
#endif /* USE_XOP */
3049
3050
sha256_use_4way_avx:
3051
leaq sha256d_ms_4way_avx(%rip), %rcx
3052
leaq sha256_transform_4way_core_avx(%rip), %rdx
3053
jmp sha256_use_4way_done
3054
#endif /* USE_AVX */
3055
3056
sha256_use_4way_base:
3057
leaq sha256d_ms_4way_sse2(%rip), %rcx
3058
leaq sha256_transform_4way_core_sse2(%rip), %rdx
3059
3060
sha256_use_4way_done:
3061
movq %rcx, sha256d_ms_4way_addr(%rip)
3062
movq %rdx, sha256_transform_4way_core_addr(%rip)
3063
popq %rdx
3064
popq %rcx
3065
popq %rbx
3066
movl $1, %eax
3067
ret
3068
3069
3070
#if defined(USE_AVX2)
3071
3072
.text
3073
.p2align 6
3074
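/*
 * 8-way AVX2 variant: the same sha256d_ms computation over eight lanes
 * held in 256-bit %ymm registers, using the sha256_8h / sha256_8k /
 * sha256d_8preext2_* tables instead of their 4-way counterparts.
 */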
.globl sha256d_ms_8way
3075
.globl _sha256d_ms_8way
3076
sha256d_ms_8way:
3077
_sha256d_ms_8way:
3078
sha256d_ms_8way_avx2:
3079
#if defined(_WIN64) || defined(__CYGWIN__)
3080
pushq %rdi
3081
subq $80, %rsp
3082
vmovdqa %xmm6, 0(%rsp)
3083
vmovdqa %xmm7, 16(%rsp)
3084
vmovdqa %xmm8, 32(%rsp)
3085
vmovdqa %xmm9, 48(%rsp)
3086
vmovdqa %xmm10, 64(%rsp)
3087
pushq %rsi
3088
movq %rcx, %rdi
3089
movq %rdx, %rsi
3090
movq %r8, %rdx
3091
movq %r9, %rcx
3092
#endif
3093
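/*
 * Build a 128-byte-aligned scratch frame of 64 32-byte slots; the
 * aligned vmovdqa accesses below require at least 32-byte alignment.
 */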
pushq %rbp
3094
movq %rsp, %rbp
3095
subq $64*32, %rsp
3096
andq $-128, %rsp
3097
3098
leaq 16*32(%rsi), %rax
3099
3100
sha256d_ms_8way_avx2_extend_loop1:
3101
vmovdqa 3*32(%rsi), %ymm0
3102
vmovdqa 2*32(%rax), %ymm3
3103
vmovdqa 3*32(%rax), %ymm7
3104
vmovdqa %ymm3, 2*32(%rsp)
3105
vmovdqa %ymm7, 3*32(%rsp)
3106
vpaddd %ymm0, %ymm7, %ymm7
3107
vpslld $14, %ymm0, %ymm2
3108
vpsrld $3, %ymm0, %ymm0
3109
vpsrld $4, %ymm0, %ymm1
3110
vpxor %ymm1, %ymm0, %ymm0
3111
vpxor %ymm2, %ymm0, %ymm0
3112
vpsrld $11, %ymm1, %ymm1
3113
vpslld $11, %ymm2, %ymm2
3114
vpxor %ymm1, %ymm0, %ymm0
3115
vpxor %ymm2, %ymm0, %ymm0
3116
vpaddd %ymm0, %ymm3, %ymm3
3117
vmovdqa %ymm3, 2*32(%rax)
3118
vmovdqa %ymm7, 3*32(%rax)
3119
3120
vmovdqa 4*32(%rax), %ymm0
3121
vmovdqa %ymm0, 4*32(%rsp)
3122
vpslld $13, %ymm3, %ymm2
3123
vpslld $13, %ymm7, %ymm6
3124
vpsrld $10, %ymm3, %ymm3
3125
vpsrld $10, %ymm7, %ymm7
3126
vpsrld $7, %ymm3, %ymm1
3127
vpsrld $7, %ymm7, %ymm5
3128
vpxor %ymm1, %ymm3, %ymm3
3129
vpxor %ymm5, %ymm7, %ymm7
3130
vpsrld $2, %ymm1, %ymm1
3131
vpsrld $2, %ymm5, %ymm5
3132
vpxor %ymm2, %ymm3, %ymm3
3133
vpxor %ymm6, %ymm7, %ymm7
3134
vpslld $2, %ymm2, %ymm2
3135
vpslld $2, %ymm6, %ymm6
3136
vpxor %ymm1, %ymm3, %ymm3
3137
vpxor %ymm5, %ymm7, %ymm7
3138
vpxor %ymm2, %ymm3, %ymm3
3139
vpxor %ymm6, %ymm7, %ymm7
3140
vpaddd %ymm0, %ymm3, %ymm3
3141
vmovdqa %ymm3, 4*32(%rax)
3142
vmovdqa %ymm7, 5*32(%rax)
3143
3144
vmovdqa 6*32(%rax), %ymm0
3145
vmovdqa 7*32(%rax), %ymm4
3146
vmovdqa %ymm0, 6*32(%rsp)
3147
vmovdqa %ymm4, 7*32(%rsp)
3148
vpslld $13, %ymm3, %ymm2
3149
vpslld $13, %ymm7, %ymm6
3150
vpsrld $10, %ymm3, %ymm3
3151
vpsrld $10, %ymm7, %ymm7
3152
vpsrld $7, %ymm3, %ymm1
3153
vpsrld $7, %ymm7, %ymm5
3154
vpxor %ymm1, %ymm3, %ymm3
3155
vpxor %ymm5, %ymm7, %ymm7
3156
vpsrld $2, %ymm1, %ymm1
3157
vpsrld $2, %ymm5, %ymm5
3158
vpxor %ymm2, %ymm3, %ymm3
3159
vpxor %ymm6, %ymm7, %ymm7
3160
vpslld $2, %ymm2, %ymm2
3161
vpslld $2, %ymm6, %ymm6
3162
vpxor %ymm1, %ymm3, %ymm3
3163
vpxor %ymm5, %ymm7, %ymm7
3164
vpxor %ymm2, %ymm3, %ymm3
3165
vpxor %ymm6, %ymm7, %ymm7
3166
vpaddd %ymm0, %ymm3, %ymm3
3167
vpaddd %ymm4, %ymm7, %ymm7
3168
vmovdqa %ymm3, 6*32(%rax)
3169
vmovdqa %ymm7, 7*32(%rax)
3170
3171
vmovdqa 8*32(%rax), %ymm0
3172
vmovdqa 2*32(%rax), %ymm4
3173
vmovdqa %ymm0, 8*32(%rsp)
3174
vpslld $13, %ymm3, %ymm2
3175
vpslld $13, %ymm7, %ymm6
3176
vpsrld $10, %ymm3, %ymm3
3177
vpsrld $10, %ymm7, %ymm7
3178
vpsrld $7, %ymm3, %ymm1
3179
vpsrld $7, %ymm7, %ymm5
3180
vpxor %ymm1, %ymm3, %ymm3
3181
vpxor %ymm5, %ymm7, %ymm7
3182
vpsrld $2, %ymm1, %ymm1
3183
vpsrld $2, %ymm5, %ymm5
3184
vpxor %ymm2, %ymm3, %ymm3
3185
vpxor %ymm6, %ymm7, %ymm7
3186
vpslld $2, %ymm2, %ymm2
3187
vpslld $2, %ymm6, %ymm6
3188
vpxor %ymm1, %ymm3, %ymm3
3189
vpxor %ymm5, %ymm7, %ymm7
3190
vpxor %ymm2, %ymm3, %ymm3
3191
vpxor %ymm6, %ymm7, %ymm7
3192
vpaddd %ymm0, %ymm3, %ymm3
3193
vpaddd %ymm4, %ymm7, %ymm7
3194
vmovdqa %ymm3, 8*32(%rax)
3195
vmovdqa %ymm7, 9*32(%rax)
3196
3197
vpslld $13, %ymm3, %ymm2
3198
vpslld $13, %ymm7, %ymm6
3199
vpsrld $10, %ymm3, %ymm3
3200
vpsrld $10, %ymm7, %ymm7
3201
vpsrld $7, %ymm3, %ymm1
3202
vpsrld $7, %ymm7, %ymm5
3203
vpxor %ymm1, %ymm3, %ymm3
3204
vpxor %ymm5, %ymm7, %ymm7
3205
vpsrld $2, %ymm1, %ymm1
3206
vpsrld $2, %ymm5, %ymm5
3207
vpxor %ymm2, %ymm3, %ymm3
3208
vpxor %ymm6, %ymm7, %ymm7
3209
vpslld $2, %ymm2, %ymm2
3210
vpslld $2, %ymm6, %ymm6
3211
vpxor %ymm1, %ymm3, %ymm3
3212
vpxor %ymm5, %ymm7, %ymm7
3213
vpxor %ymm2, %ymm3, %ymm3
3214
vpxor %ymm6, %ymm7, %ymm7
3215
vpaddd 3*32(%rax), %ymm3, %ymm3
3216
vpaddd 4*32(%rax), %ymm7, %ymm7
3217
vmovdqa %ymm3, 10*32(%rax)
3218
vmovdqa %ymm7, 11*32(%rax)
3219
3220
vpslld $13, %ymm3, %ymm2
3221
vpslld $13, %ymm7, %ymm6
3222
vpsrld $10, %ymm3, %ymm3
3223
vpsrld $10, %ymm7, %ymm7
3224
vpsrld $7, %ymm3, %ymm1
3225
vpsrld $7, %ymm7, %ymm5
3226
vpxor %ymm1, %ymm3, %ymm3
3227
vpxor %ymm5, %ymm7, %ymm7
3228
vpsrld $2, %ymm1, %ymm1
3229
vpsrld $2, %ymm5, %ymm5
3230
vpxor %ymm2, %ymm3, %ymm3
3231
vpxor %ymm6, %ymm7, %ymm7
3232
vpslld $2, %ymm2, %ymm2
3233
vpslld $2, %ymm6, %ymm6
3234
vpxor %ymm1, %ymm3, %ymm3
3235
vpxor %ymm5, %ymm7, %ymm7
3236
vpxor %ymm2, %ymm3, %ymm3
3237
vpxor %ymm6, %ymm7, %ymm7
3238
vpaddd 5*32(%rax), %ymm3, %ymm3
3239
vpaddd 6*32(%rax), %ymm7, %ymm7
3240
vmovdqa %ymm3, 12*32(%rax)
3241
vmovdqa %ymm7, 13*32(%rax)
3242
3243
vmovdqa 14*32(%rax), %ymm0
3244
vmovdqa 15*32(%rax), %ymm4
3245
vmovdqa %ymm0, 14*32(%rsp)
3246
vmovdqa %ymm4, 15*32(%rsp)
3247
vpslld $13, %ymm3, %ymm2
3248
vpslld $13, %ymm7, %ymm6
3249
vpsrld $10, %ymm3, %ymm3
3250
vpsrld $10, %ymm7, %ymm7
3251
vpaddd 7*32(%rax), %ymm0, %ymm0
3252
vpaddd 8*32(%rax), %ymm4, %ymm4
3253
vpsrld $7, %ymm3, %ymm1
3254
vpsrld $7, %ymm7, %ymm5
3255
vpxor %ymm1, %ymm3, %ymm3
3256
vpxor %ymm5, %ymm7, %ymm7
3257
vpsrld $2, %ymm1, %ymm1
3258
vpsrld $2, %ymm5, %ymm5
3259
vpxor %ymm2, %ymm3, %ymm3
3260
vpxor %ymm6, %ymm7, %ymm7
3261
vpslld $2, %ymm2, %ymm2
3262
vpslld $2, %ymm6, %ymm6
3263
vpxor %ymm1, %ymm3, %ymm3
3264
vpxor %ymm5, %ymm7, %ymm7
3265
vpxor %ymm2, %ymm3, %ymm3
3266
vpxor %ymm6, %ymm7, %ymm7
3267
vpaddd %ymm0, %ymm3, %ymm3
3268
vpaddd %ymm4, %ymm7, %ymm7
3269
vmovdqa %ymm3, 14*32(%rax)
3270
vmovdqa %ymm7, 15*32(%rax)
3271
3272
sha256d_ms_8way_avx2_extend_loop2:
3273
sha256_avx2_extend_doubleround 16
3274
sha256_avx2_extend_doubleround 18
3275
sha256_avx2_extend_doubleround 20
3276
sha256_avx2_extend_doubleround 22
3277
sha256_avx2_extend_doubleround 24
3278
sha256_avx2_extend_doubleround 26
3279
sha256_avx2_extend_doubleround 28
3280
sha256_avx2_extend_doubleround 30
3281
sha256_avx2_extend_doubleround 32
3282
sha256_avx2_extend_doubleround 34
3283
sha256_avx2_extend_doubleround 36
3284
sha256_avx2_extend_doubleround 38
3285
sha256_avx2_extend_doubleround 40
3286
sha256_avx2_extend_doubleround 42
3287
jz sha256d_ms_8way_avx2_extend_coda2
3288
sha256_avx2_extend_doubleround 44
3289
sha256_avx2_extend_doubleround 46
3290
3291
vmovdqa 0(%rcx), %ymm7
3292
vmovdqa 32(%rcx), %ymm8
3293
vmovdqa 64(%rcx), %ymm9
3294
vmovdqa 96(%rcx), %ymm10
3295
vmovdqa 128(%rcx), %ymm0
3296
vmovdqa 160(%rcx), %ymm5
3297
vmovdqa 192(%rcx), %ymm4
3298
vmovdqa 224(%rcx), %ymm3
3299
3300
movq %rsi, %rax
3301
leaq sha256_8k(%rip), %rcx
3302
jmp sha256d_ms_8way_avx2_main_loop1
3303
3304
sha256d_ms_8way_avx2_main_loop2:
3305
sha256_avx2_main_round 0, %ymm10, %ymm9, %ymm8, %ymm0, %ymm3, %ymm4, %ymm5, %ymm7
3306
sha256_avx2_main_round 1, %ymm9, %ymm8, %ymm0, %ymm10, %ymm4, %ymm5, %ymm7, %ymm3
3307
sha256_avx2_main_round 2, %ymm8, %ymm0, %ymm10, %ymm9, %ymm5, %ymm7, %ymm3, %ymm4
3308
sha256d_ms_8way_avx2_main_loop1:
3309
sha256_avx2_main_round 3, %ymm0, %ymm10, %ymm9, %ymm8, %ymm7, %ymm3, %ymm4, %ymm5
3310
sha256_avx2_main_quadround 4
3311
sha256_avx2_main_quadround 8
3312
sha256_avx2_main_quadround 12
3313
sha256_avx2_main_quadround 16
3314
sha256_avx2_main_quadround 20
3315
sha256_avx2_main_quadround 24
3316
sha256_avx2_main_quadround 28
3317
sha256_avx2_main_quadround 32
3318
sha256_avx2_main_quadround 36
3319
sha256_avx2_main_quadround 40
3320
sha256_avx2_main_quadround 44
3321
sha256_avx2_main_quadround 48
3322
sha256_avx2_main_quadround 52
3323
sha256_avx2_main_round 56, %ymm10, %ymm9, %ymm8, %ymm0, %ymm3, %ymm4, %ymm5, %ymm7
3324
jz sha256d_ms_8way_avx2_finish
3325
sha256_avx2_main_round 57, %ymm9, %ymm8, %ymm0, %ymm10, %ymm4, %ymm5, %ymm7, %ymm3
3326
sha256_avx2_main_round 58, %ymm8, %ymm0, %ymm10, %ymm9, %ymm5, %ymm7, %ymm3, %ymm4
3327
sha256_avx2_main_round 59, %ymm0, %ymm10, %ymm9, %ymm8, %ymm7, %ymm3, %ymm4, %ymm5
3328
sha256_avx2_main_quadround 60
3329
3330
vmovdqa 2*32(%rsp), %ymm1
3331
vmovdqa 3*32(%rsp), %ymm2
3332
vmovdqa 4*32(%rsp), %ymm6
3333
vmovdqa %ymm1, 18*32(%rsi)
3334
vmovdqa %ymm2, 19*32(%rsi)
3335
vmovdqa %ymm6, 20*32(%rsi)
3336
vmovdqa 6*32(%rsp), %ymm1
3337
vmovdqa 7*32(%rsp), %ymm2
3338
vmovdqa 8*32(%rsp), %ymm6
3339
vmovdqa %ymm1, 22*32(%rsi)
3340
vmovdqa %ymm2, 23*32(%rsi)
3341
vmovdqa %ymm6, 24*32(%rsi)
3342
vmovdqa 14*32(%rsp), %ymm1
3343
vmovdqa 15*32(%rsp), %ymm2
3344
vmovdqa %ymm1, 30*32(%rsi)
3345
vmovdqa %ymm2, 31*32(%rsi)
3346
3347
vpaddd 0(%rdx), %ymm7, %ymm7
3348
vpaddd 32(%rdx), %ymm5, %ymm5
3349
vpaddd 64(%rdx), %ymm4, %ymm4
3350
vpaddd 96(%rdx), %ymm3, %ymm3
3351
vpaddd 128(%rdx), %ymm0, %ymm0
3352
vpaddd 160(%rdx), %ymm8, %ymm8
3353
vpaddd 192(%rdx), %ymm9, %ymm9
3354
vpaddd 224(%rdx), %ymm10, %ymm10
3355
3356
vmovdqa %ymm7, 0(%rsp)
3357
vmovdqa %ymm5, 32(%rsp)
3358
vmovdqa %ymm4, 64(%rsp)
3359
vmovdqa %ymm3, 96(%rsp)
3360
vmovdqa %ymm0, 128(%rsp)
3361
vmovdqa %ymm8, 160(%rsp)
3362
vmovdqa %ymm9, 192(%rsp)
3363
vmovdqa %ymm10, 224(%rsp)
3364
3365
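/*
 * Padding for the second hash, as in the 4-way paths, broadcast to all
 * eight lanes with vinserti128/vpshufd: W[8] = 0x80000000,
 * W[9..14] = 0, W[15] = 0x100.
 */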
vpxor %ymm0, %ymm0, %ymm0
3366
movq $0x8000000000000100, %rax
3367
vmovd %rax, %xmm1
3368
vinserti128 $1, %xmm1, %ymm1, %ymm1
3369
vpshufd $0x55, %ymm1, %ymm2
3370
vpshufd $0x00, %ymm1, %ymm1
3371
vmovdqa %ymm2, 8*32(%rsp)
3372
vmovdqa %ymm0, 9*32(%rsp)
3373
vmovdqa %ymm0, 10*32(%rsp)
3374
vmovdqa %ymm0, 11*32(%rsp)
3375
vmovdqa %ymm0, 12*32(%rsp)
3376
vmovdqa %ymm0, 13*32(%rsp)
3377
vmovdqa %ymm0, 14*32(%rsp)
3378
vmovdqa %ymm1, 15*32(%rsp)
3379
3380
leaq 16*32(%rsp), %rax
3381
cmpq %rax, %rax
3382
3383
vmovdqa -15*32(%rax), %ymm0
3384
vmovdqa -14*32(%rax), %ymm4
3385
vpslld $14, %ymm0, %ymm2
3386
vpslld $14, %ymm4, %ymm6
3387
vpsrld $3, %ymm0, %ymm8
3388
vpsrld $3, %ymm4, %ymm4
3389
vpsrld $7, %ymm0, %ymm1
3390
vpsrld $4, %ymm4, %ymm5
3391
vpxor %ymm1, %ymm8, %ymm8
3392
vpxor %ymm5, %ymm4, %ymm4
3393
vpsrld $11, %ymm1, %ymm1
3394
vpsrld $11, %ymm5, %ymm5
3395
vpxor %ymm2, %ymm8, %ymm8
3396
vpxor %ymm6, %ymm4, %ymm4
3397
vpslld $11, %ymm2, %ymm2
3398
vpslld $11, %ymm6, %ymm6
3399
vpxor %ymm1, %ymm8, %ymm8
3400
vpxor %ymm5, %ymm4, %ymm4
3401
vpxor %ymm2, %ymm8, %ymm8
3402
vpxor %ymm6, %ymm4, %ymm4
3403
vpaddd %ymm0, %ymm4, %ymm4
3404
vpaddd -16*32(%rax), %ymm8, %ymm3
3405
vpaddd sha256d_8preext2_17(%rip), %ymm4, %ymm7
3406
vmovdqa %ymm3, 0*32(%rax)
3407
vmovdqa %ymm7, 1*32(%rax)
3408
3409
sha256_avx2_extend_doubleround 2
3410
sha256_avx2_extend_doubleround 4
3411
3412
vmovdqa -9*32(%rax), %ymm0
3413
vpslld $14, %ymm0, %ymm2
3414
vpsrld $3, %ymm0, %ymm8
3415
vpsrld $7, %ymm0, %ymm1
3416
vpxor %ymm1, %ymm8, %ymm8
3417
vpxor %ymm2, %ymm8, %ymm8
3418
vpsrld $11, %ymm1, %ymm1
3419
vpslld $11, %ymm2, %ymm2
3420
vpxor %ymm1, %ymm8, %ymm8
3421
vpxor %ymm2, %ymm8, %ymm8
3422
vpaddd sha256d_8preext2_23(%rip), %ymm0, %ymm4
3423
vpaddd -10*32(%rax), %ymm8, %ymm0
3424
vpslld $13, %ymm3, %ymm2
3425
vpslld $13, %ymm7, %ymm6
3426
vpsrld $10, %ymm3, %ymm3
3427
vpsrld $10, %ymm7, %ymm7
3428
vpaddd -1*32(%rax), %ymm0, %ymm0
3429
vpaddd 0*32(%rax), %ymm4, %ymm4
3430
vpsrld $7, %ymm3, %ymm1
3431
vpsrld $7, %ymm7, %ymm5
3432
vpxor %ymm1, %ymm3, %ymm3
3433
vpxor %ymm5, %ymm7, %ymm7
3434
vpsrld $2, %ymm1, %ymm1
3435
vpsrld $2, %ymm5, %ymm5
3436
vpxor %ymm2, %ymm3, %ymm3
3437
vpxor %ymm6, %ymm7, %ymm7
3438
vpslld $2, %ymm2, %ymm2
3439
vpslld $2, %ymm6, %ymm6
3440
vpxor %ymm1, %ymm3, %ymm3
3441
vpxor %ymm5, %ymm7, %ymm7
3442
vpxor %ymm2, %ymm3, %ymm3
3443
vpxor %ymm6, %ymm7, %ymm7
3444
vpaddd %ymm0, %ymm3, %ymm3
3445
vpaddd %ymm4, %ymm7, %ymm7
3446
vmovdqa %ymm3, 6*32(%rax)
3447
vmovdqa %ymm7, 7*32(%rax)
3448
3449
vpslld $13, %ymm3, %ymm2
3450
vpslld $13, %ymm7, %ymm6
3451
vpsrld $10, %ymm3, %ymm3
3452
vpsrld $10, %ymm7, %ymm7
3453
vpsrld $7, %ymm3, %ymm1
3454
vpsrld $7, %ymm7, %ymm5
3455
vpxor %ymm1, %ymm3, %ymm3
3456
vpxor %ymm5, %ymm7, %ymm7
3457
vpsrld $2, %ymm1, %ymm1
3458
vpsrld $2, %ymm5, %ymm5
3459
vpxor %ymm2, %ymm3, %ymm3
3460
vpxor %ymm6, %ymm7, %ymm7
3461
vpslld $2, %ymm2, %ymm2
3462
vpslld $2, %ymm6, %ymm6
3463
vpxor %ymm1, %ymm3, %ymm3
3464
vpxor %ymm5, %ymm7, %ymm7
3465
vpxor %ymm2, %ymm3, %ymm3
3466
vpxor %ymm6, %ymm7, %ymm7
3467
vpaddd sha256d_8preext2_24(%rip), %ymm3, %ymm3
3468
vpaddd 1*32(%rax), %ymm3, %ymm3
3469
vpaddd 2*32(%rax), %ymm7, %ymm7
3470
vmovdqa %ymm3, 8*32(%rax)
3471
vmovdqa %ymm7, 9*32(%rax)
3472
3473
vpslld $13, %ymm3, %ymm2
3474
vpslld $13, %ymm7, %ymm6
3475
vpsrld $10, %ymm3, %ymm3
3476
vpsrld $10, %ymm7, %ymm7
3477
vpsrld $7, %ymm3, %ymm1
3478
vpsrld $7, %ymm7, %ymm5
3479
vpxor %ymm1, %ymm3, %ymm3
3480
vpxor %ymm5, %ymm7, %ymm7
3481
vpsrld $2, %ymm1, %ymm1
3482
vpsrld $2, %ymm5, %ymm5
3483
vpxor %ymm2, %ymm3, %ymm3
3484
vpxor %ymm6, %ymm7, %ymm7
3485
vpslld $2, %ymm2, %ymm2
3486
vpslld $2, %ymm6, %ymm6
3487
vpxor %ymm1, %ymm3, %ymm3
3488
vpxor %ymm5, %ymm7, %ymm7
3489
vpxor %ymm2, %ymm3, %ymm3
3490
vpxor %ymm6, %ymm7, %ymm7
3491
vpaddd 3*32(%rax), %ymm3, %ymm3
3492
vpaddd 4*32(%rax), %ymm7, %ymm7
3493
vmovdqa %ymm3, 10*32(%rax)
3494
vmovdqa %ymm7, 11*32(%rax)
3495
3496
vpslld $13, %ymm3, %ymm2
3497
vpslld $13, %ymm7, %ymm6
3498
vpsrld $10, %ymm3, %ymm3
3499
vpsrld $10, %ymm7, %ymm7
3500
vpsrld $7, %ymm3, %ymm1
3501
vpsrld $7, %ymm7, %ymm5
3502
vpxor %ymm1, %ymm3, %ymm3
3503
vpxor %ymm5, %ymm7, %ymm7
3504
vpsrld $2, %ymm1, %ymm1
3505
vpsrld $2, %ymm5, %ymm5
3506
vpxor %ymm2, %ymm3, %ymm3
3507
vpxor %ymm6, %ymm7, %ymm7
3508
vpslld $2, %ymm2, %ymm2
3509
vpslld $2, %ymm6, %ymm6
3510
vpxor %ymm1, %ymm3, %ymm3
3511
vpxor %ymm5, %ymm7, %ymm7
3512
vpxor %ymm2, %ymm3, %ymm3
3513
vpxor %ymm6, %ymm7, %ymm7
3514
vpaddd 5*32(%rax), %ymm3, %ymm3
3515
vpaddd 6*32(%rax), %ymm7, %ymm7
3516
vmovdqa %ymm3, 12*32(%rax)
3517
vmovdqa %ymm7, 13*32(%rax)
3518
3519
vmovdqa sha256d_8preext2_30(%rip), %ymm0
3520
vmovdqa 0*32(%rax), %ymm4
3521
vpslld $14, %ymm4, %ymm6
3522
vpsrld $3, %ymm4, %ymm4
3523
vpsrld $4, %ymm4, %ymm5
3524
vpxor %ymm5, %ymm4, %ymm4
3525
vpxor %ymm6, %ymm4, %ymm4
3526
vpsrld $11, %ymm5, %ymm5
3527
vpslld $11, %ymm6, %ymm6
3528
vpxor %ymm5, %ymm4, %ymm4
3529
vpxor %ymm6, %ymm4, %ymm4
3530
vpaddd -1*32(%rax), %ymm4, %ymm4
3531
vpslld $13, %ymm3, %ymm2
3532
vpslld $13, %ymm7, %ymm6
3533
vpsrld $10, %ymm3, %ymm3
3534
vpsrld $10, %ymm7, %ymm7
3535
vpaddd 7*32(%rax), %ymm0, %ymm0
3536
vpaddd 8*32(%rax), %ymm4, %ymm4
3537
vpsrld $7, %ymm3, %ymm1
3538
vpsrld $7, %ymm7, %ymm5
3539
vpxor %ymm1, %ymm3, %ymm3
3540
vpxor %ymm5, %ymm7, %ymm7
3541
vpsrld $2, %ymm1, %ymm1
3542
vpsrld $2, %ymm5, %ymm5
3543
vpxor %ymm2, %ymm3, %ymm3
3544
vpxor %ymm6, %ymm7, %ymm7
3545
vpslld $2, %ymm2, %ymm2
3546
vpslld $2, %ymm6, %ymm6
3547
vpxor %ymm1, %ymm3, %ymm3
3548
vpxor %ymm5, %ymm7, %ymm7
3549
vpxor %ymm2, %ymm3, %ymm3
3550
vpxor %ymm6, %ymm7, %ymm7
3551
vpaddd %ymm0, %ymm3, %ymm3
3552
vpaddd %ymm4, %ymm7, %ymm7
3553
vmovdqa %ymm3, 14*32(%rax)
3554
vmovdqa %ymm7, 15*32(%rax)
3555
3556
jmp sha256d_ms_8way_avx2_extend_loop2
3557
3558
sha256d_ms_8way_avx2_extend_coda2:
3559
sha256_avx2_extend_round 44
3560
3561
vmovdqa sha256_8h+0(%rip), %ymm7
3562
vmovdqa sha256_8h+32(%rip), %ymm5
3563
vmovdqa sha256_8h+64(%rip), %ymm4
3564
vmovdqa sha256_8h+96(%rip), %ymm3
3565
vmovdqa sha256_8h+128(%rip), %ymm0
3566
vmovdqa sha256_8h+160(%rip), %ymm8
3567
vmovdqa sha256_8h+192(%rip), %ymm9
3568
vmovdqa sha256_8h+224(%rip), %ymm10
3569
3570
movq %rsp, %rax
3571
leaq sha256_8k(%rip), %rcx
3572
jmp sha256d_ms_8way_avx2_main_loop2
3573
3574
.macro sha256_avx2_main_round_red i, r0, r1, r2, r3, r4
3575
vpaddd 32*\i(%rax), \r0, %ymm6
3576
vpaddd 32*\i(%rcx), %ymm6, %ymm6
3577
vpandn \r1, \r3, %ymm1
3578
vpand \r3, \r2, %ymm2
3579
vpxor %ymm2, %ymm1, %ymm1
3580
vpaddd %ymm1, %ymm6, %ymm6
3581
vpslld $7, \r3, %ymm1
3582
vpsrld $6, \r3, \r0
3583
vpsrld $5, \r0, %ymm2
3584
vpxor %ymm1, \r0, \r0
3585
vpxor %ymm2, \r0, \r0
3586
vpslld $14, %ymm1, %ymm1
3587
vpsrld $14, %ymm2, %ymm2
3588
vpxor %ymm1, \r0, \r0
3589
vpxor %ymm2, \r0, \r0
3590
vpslld $5, %ymm1, %ymm1
3591
vpxor %ymm1, \r0, \r0
3592
vpaddd \r0, %ymm6, %ymm6
3593
vpaddd %ymm6, \r4, \r0
3594
.endm
3595
3596
sha256d_ms_8way_avx2_finish:
3597
sha256_avx2_main_round_red 57, %ymm9, %ymm8, %ymm0, %ymm10, %ymm4
3598
sha256_avx2_main_round_red 58, %ymm8, %ymm0, %ymm10, %ymm9, %ymm5
3599
sha256_avx2_main_round_red 59, %ymm0, %ymm10, %ymm9, %ymm8, %ymm7
3600
sha256_avx2_main_round_red 60, %ymm10, %ymm9, %ymm8, %ymm0, %ymm3
3601
3602
vpaddd sha256_8h+224(%rip), %ymm10, %ymm10
3603
vmovdqa %ymm10, 224(%rdi)
3604
3605
movq %rbp, %rsp
3606
popq %rbp
3607
#if defined(_WIN64) || defined(__CYGWIN__)
3608
popq %rsi
3609
vmovdqa 0(%rsp), %xmm6
3610
vmovdqa 16(%rsp), %xmm7
3611
vmovdqa 32(%rsp), %xmm8
3612
vmovdqa 48(%rsp), %xmm9
3613
vmovdqa 64(%rsp), %xmm10
3614
addq $80, %rsp
3615
popq %rdi
3616
#endif
3617
ret
3618
3619
3620
.text
3621
.p2align 6
3622
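/*
 * sha256_use_8way: returns 1 only if OSXSAVE+AVX (CPUID.1:ECX bits
 * 27-28), AVX2 (CPUID.7:EBX bit 5) and OS-enabled XMM/YMM state
 * (XCR0 bits 1-2) are all present; otherwise returns 0.
 */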
.globl sha256_use_8way
3623
.globl _sha256_use_8way
3624
sha256_use_8way:
3625
_sha256_use_8way:
3626
pushq %rbx
3627
3628
/* Check for AVX and OSXSAVE support */
3629
movl $1, %eax
3630
cpuid
3631
andl $0x18000000, %ecx
3632
cmpl $0x18000000, %ecx
3633
jne sha256_use_8way_no
3634
/* Check for AVX2 support */
3635
movl $7, %eax
3636
xorl %ecx, %ecx
3637
cpuid
3638
andl $0x00000020, %ebx
3639
cmpl $0x00000020, %ebx
3640
jne sha256_use_8way_no
3641
/* Check for XMM and YMM state support */
3642
xorl %ecx, %ecx
3643
xgetbv
3644
andl $0x00000006, %eax
3645
cmpl $0x00000006, %eax
3646
jne sha256_use_8way_no
3647
3648
sha256_use_8way_yes:
3649
movl $1, %eax
3650
jmp sha256_use_8way_done
3651
3652
sha256_use_8way_no:
3653
xorl %eax, %eax
3654
3655
sha256_use_8way_done:
3656
popq %rbx
3657
ret
3658
3659
#endif /* USE_AVX2 */
3660
3661
#endif
3662
3663