Crypto++ 7.0
Free C++ class library of cryptographic schemes
blake2-simd.cpp
1 // blake2-simd.cpp - written and placed in the public domain by
2 // Jeffrey Walton, Uri Blumenthal and Marcel Raad.
3 //
4 // This source file uses intrinsics to gain access to ARMv7a/ARMv8a
5 // NEON and SSE4.2 instructions. A separate source file is needed
6 // because additional CXXFLAGS are required to enable the appropriate
7 // instruction sets in some build configurations.
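//
// As an illustration only (not the project's actual build recipe), the idea
// is that this one translation unit is compiled with ISA-specific flags
// while the rest of the library is not, for example:
//
//   g++ -DNDEBUG -O2 -msse4.1 -c blake2-simd.cpp                   # x86/x64 (GCC/Clang)
//   g++ -DNDEBUG -O2 -march=armv7-a -mfpu=neon -c blake2-simd.cpp  # ARMv7 (GCC/Clang)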
8 
9 #include "pch.h"
10 #include "config.h"
11 #include "misc.h"
12 #include "blake2.h"
13 
14 // Uncomment for benchmarking C++ against SSE4.1 or NEON.
15 // Do so in both blake2.cpp and blake2-simd.cpp.
16 // #undef CRYPTOPP_SSE41_AVAILABLE
17 // #undef CRYPTOPP_ARM_NEON_AVAILABLE
18 
19 // Disable NEON/ASIMD for Cortex-A53 and A57. The shifts are too slow and C/C++ is about
20 // 3 cpb faster than NEON/ASIMD. Also see http://github.com/weidai11/cryptopp/issues/367.
21 #if (defined(__aarch32__) || defined(__aarch64__)) && defined(CRYPTOPP_SLOW_ARMV8_SHIFT)
22 # undef CRYPTOPP_ARM_NEON_AVAILABLE
23 #endif
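// As a rough sketch of the operations in question (a hypothetical helper for
// illustration, not used in this file), a NEON 64-bit rotate-right has to be
// composed from two immediate shifts plus an OR:
//
//   template <unsigned int C>
//   inline uint64x2_t RotateRight64(const uint64x2_t v)
//   {
//       return vorrq_u64(vshrq_n_u64(v, C), vshlq_n_u64(v, 64-C));
//   }
//
// It is these paired vector shifts that are slow on the cores above.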
24 
25 #if (CRYPTOPP_SSE41_AVAILABLE)
26 # include <emmintrin.h>
27 # include <tmmintrin.h>
28 # include <smmintrin.h>
29 #endif
30 
31 #if (CRYPTOPP_ARM_NEON_AVAILABLE)
32 # include <arm_neon.h>
33 #endif
34 
35 // Can't use CRYPTOPP_ARM_XXX_AVAILABLE because too many
36 // compilers don't follow ACLE conventions for the include.
37 #if defined(CRYPTOPP_ARM_ACLE_AVAILABLE)
38 # include <stdint.h>
39 # include <arm_acle.h>
40 #endif
41 
42 // Clang __m128i casts, http://bugs.llvm.org/show_bug.cgi?id=20670
43 #define M128_CAST(x) ((__m128i *)(void *)(x))
44 #define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
45 
46 NAMESPACE_BEGIN(CryptoPP)
47 
48 // Sun Studio 12.3 and earlier lack SSE2's _mm_set_epi64x. Win32 lacks _mm_set_epi64x;
49 // Win64 supplies it except for VS2008. See http://stackoverflow.com/a/38547909/608639
50 #if CRYPTOPP_SSE2_INTRIN_AVAILABLE && ((__SUNPRO_CC >= 0x5100 && __SUNPRO_CC < 0x5130) || \
51  (defined(_MSC_VER) && _MSC_VER < 1600) || (defined(_M_IX86) && _MSC_VER >= 1600))
52 inline __m128i MM_SET_EPI64X(const word64 a, const word64 b)
53 {
54  const word64 t[2] = {b,a}; __m128i r;
55  std::memcpy(&r, t, sizeof(t));
56  return r;
57 }
58 #else
59 # define MM_SET_EPI64X(a, b) _mm_set_epi64x(a, b)
60 #endif
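// A minimal usage sketch (the values are arbitrary examples): as with
// _mm_set_epi64x, the first argument lands in the high 64-bit lane and the
// second in the low lane.
//
//   const __m128i v = MM_SET_EPI64X(W64LIT(1), W64LIT(2));
//   // lane 0 (low)  holds 2
//   // lane 1 (high) holds 1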
61 
62 ANONYMOUS_NAMESPACE_BEGIN
63 
64 CRYPTOPP_ALIGN_DATA(16)
65 const word32 BLAKE2S_IV[8] = {
66  0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL,
67  0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL
68 };
69 
70 CRYPTOPP_ALIGN_DATA(16)
71 const word64 BLAKE2B_IV[8] = {
72  W64LIT(0x6a09e667f3bcc908), W64LIT(0xbb67ae8584caa73b),
73  W64LIT(0x3c6ef372fe94f82b), W64LIT(0xa54ff53a5f1d36f1),
74  W64LIT(0x510e527fade682d1), W64LIT(0x9b05688c2b3e6c1f),
75  W64LIT(0x1f83d9abfb41bd6b), W64LIT(0x5be0cd19137e2179)
76 };
77 
78 ANONYMOUS_NAMESPACE_END
79 
80 #if CRYPTOPP_SSE41_AVAILABLE
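// For reference, each add/xor/rotate block below is a vectorized form of the
// BLAKE2s mixing function G. A scalar sketch (ROTR32 stands for a
// hypothetical 32-bit rotate-right helper; it is not part of this file):
//
//   // G with message words m0, m1; rotations 16, 12, 8, 7
//   a += b + m0;  d = ROTR32(d ^ a, 16);
//   c += d;       b = ROTR32(b ^ c, 12);
//   a += b + m1;  d = ROTR32(d ^ a, 8);
//   c += d;       b = ROTR32(b ^ c, 7);
//
// The four columns (then diagonals) are processed in parallel: the r16 and r8
// byte shuffles perform the 16- and 8-bit rotations, the paired
// _mm_srli_epi32/_mm_slli_epi32 calls perform the 12- and 7-bit rotations,
// the _mm_shuffle_epi32 lines between half-rounds rotate rows 2-4 to switch
// between column and diagonal steps, and the buf1..buf4 shuffles and blends
// gather the message words according to the round's sigma permutation.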
81 void BLAKE2_Compress32_SSE4(const byte* input, BLAKE2_State<word32, false>& state)
82 {
83  __m128i row1, row2, row3, row4;
84  __m128i buf1, buf2, buf3, buf4;
85 
86  __m128i t0, t1, t2;
87  __m128i ff0, ff1;
88 
89  const __m128i r8 = _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1);
90  const __m128i r16 = _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
91 
92  const __m128i m0 = _mm_loadu_si128(CONST_M128_CAST(input + 00));
93  const __m128i m1 = _mm_loadu_si128(CONST_M128_CAST(input + 16));
94  const __m128i m2 = _mm_loadu_si128(CONST_M128_CAST(input + 32));
95  const __m128i m3 = _mm_loadu_si128(CONST_M128_CAST(input + 48));
96 
97  row1 = ff0 = _mm_loadu_si128(CONST_M128_CAST(&state.h[0]));
98  row2 = ff1 = _mm_loadu_si128(CONST_M128_CAST(&state.h[4]));
99  row3 = _mm_setr_epi32(BLAKE2S_IV[0], BLAKE2S_IV[1], BLAKE2S_IV[2], BLAKE2S_IV[3]);
100  row4 = _mm_xor_si128(_mm_setr_epi32(BLAKE2S_IV[4], BLAKE2S_IV[5], BLAKE2S_IV[6], BLAKE2S_IV[7]), _mm_loadu_si128(CONST_M128_CAST(&state.t[0])));
101  buf1 = _mm_castps_si128((_mm_shuffle_ps(_mm_castsi128_ps((m0)), _mm_castsi128_ps((m1)), _MM_SHUFFLE(2,0,2,0))));
102 
103  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
104  row4 = _mm_xor_si128(row4, row1);
105  row4 = _mm_shuffle_epi8(row4,r16);
106  row3 = _mm_add_epi32(row3, row4);
107  row2 = _mm_xor_si128(row2, row3);
108  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
109 
110  buf2 = _mm_castps_si128((_mm_shuffle_ps(_mm_castsi128_ps((m0)), _mm_castsi128_ps((m1)), _MM_SHUFFLE(3,1,3,1))));
111 
112  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
113  row4 = _mm_xor_si128(row4, row1);
114  row4 = _mm_shuffle_epi8(row4,r8);
115  row3 = _mm_add_epi32(row3, row4);
116  row2 = _mm_xor_si128(row2, row3);
117  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
118 
119  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
120  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
121  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
122 
123  buf3 = _mm_castps_si128((_mm_shuffle_ps(_mm_castsi128_ps((m2)), _mm_castsi128_ps((m3)), _MM_SHUFFLE(2,0,2,0))));
124 
125  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
126  row4 = _mm_xor_si128(row4, row1);
127  row4 = _mm_shuffle_epi8(row4,r16);
128  row3 = _mm_add_epi32(row3, row4);
129  row2 = _mm_xor_si128(row2, row3);
130  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
131 
132  buf4 = _mm_castps_si128((_mm_shuffle_ps(_mm_castsi128_ps((m2)), _mm_castsi128_ps((m3)), _MM_SHUFFLE(3,1,3,1))));
133 
134  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
135  row4 = _mm_xor_si128(row4, row1);
136  row4 = _mm_shuffle_epi8(row4,r8);
137  row3 = _mm_add_epi32(row3, row4);
138  row2 = _mm_xor_si128(row2, row3);
139  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
140 
141  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
142  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
143  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
144 
145  t0 = _mm_blend_epi16(m1, m2, 0x0C);
146  t1 = _mm_slli_si128(m3, 4);
147  t2 = _mm_blend_epi16(t0, t1, 0xF0);
148  buf1 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,1,0,3));
149 
150  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
151  row4 = _mm_xor_si128(row4, row1);
152  row4 = _mm_shuffle_epi8(row4,r16);
153  row3 = _mm_add_epi32(row3, row4);
154  row2 = _mm_xor_si128(row2, row3);
155  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
156 
157  t0 = _mm_shuffle_epi32(m2,_MM_SHUFFLE(0,0,2,0));
158  t1 = _mm_blend_epi16(m1,m3,0xC0);
159  t2 = _mm_blend_epi16(t0, t1, 0xF0);
160  buf2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1));
161 
162  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
163  row4 = _mm_xor_si128(row4, row1);
164  row4 = _mm_shuffle_epi8(row4,r8);
165  row3 = _mm_add_epi32(row3, row4);
166  row2 = _mm_xor_si128(row2, row3);
167  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
168 
169  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
170  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
171  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
172 
173  t0 = _mm_slli_si128(m1, 4);
174  t1 = _mm_blend_epi16(m2, t0, 0x30);
175  t2 = _mm_blend_epi16(m0, t1, 0xF0);
176  buf3 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1));
177 
178  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
179  row4 = _mm_xor_si128(row4, row1);
180  row4 = _mm_shuffle_epi8(row4,r16);
181  row3 = _mm_add_epi32(row3, row4);
182  row2 = _mm_xor_si128(row2, row3);
183  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
184 
185  t0 = _mm_unpackhi_epi32(m0,m1);
186  t1 = _mm_slli_si128(m3, 4);
187  t2 = _mm_blend_epi16(t0, t1, 0x0C);
188  buf4 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1));
189 
190  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
191  row4 = _mm_xor_si128(row4, row1);
192  row4 = _mm_shuffle_epi8(row4,r8);
193  row3 = _mm_add_epi32(row3, row4);
194  row2 = _mm_xor_si128(row2, row3);
195  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
196 
197  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
198  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
199  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
200 
201  t0 = _mm_unpackhi_epi32(m2,m3);
202  t1 = _mm_blend_epi16(m3,m1,0x0C);
203  t2 = _mm_blend_epi16(t0, t1, 0x0F);
204  buf1 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,1,0,2));
205 
206  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
207  row4 = _mm_xor_si128(row4, row1);
208  row4 = _mm_shuffle_epi8(row4,r16);
209  row3 = _mm_add_epi32(row3, row4);
210  row2 = _mm_xor_si128(row2, row3);
211  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
212 
213  t0 = _mm_unpacklo_epi32(m2,m0);
214  t1 = _mm_blend_epi16(t0, m0, 0xF0);
215  t2 = _mm_slli_si128(m3, 8);
216  buf2 = _mm_blend_epi16(t1, t2, 0xC0);
217 
218  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
219  row4 = _mm_xor_si128(row4, row1);
220  row4 = _mm_shuffle_epi8(row4,r8);
221  row3 = _mm_add_epi32(row3, row4);
222  row2 = _mm_xor_si128(row2, row3);
223  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
224 
225  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
226  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
227  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
228 
229  t0 = _mm_blend_epi16(m0, m2, 0x3C);
230  t1 = _mm_srli_si128(m1, 12);
231  t2 = _mm_blend_epi16(t0,t1,0x03);
232  buf3 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,0,3,2));
233 
234  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
235  row4 = _mm_xor_si128(row4, row1);
236  row4 = _mm_shuffle_epi8(row4,r16);
237  row3 = _mm_add_epi32(row3, row4);
238  row2 = _mm_xor_si128(row2, row3);
239  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
240 
241  t0 = _mm_slli_si128(m3, 4);
242  t1 = _mm_blend_epi16(m0, m1, 0x33);
243  t2 = _mm_blend_epi16(t1, t0, 0xC0);
244  buf4 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(0,1,2,3));
245 
246  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
247  row4 = _mm_xor_si128(row4, row1);
248  row4 = _mm_shuffle_epi8(row4,r8);
249  row3 = _mm_add_epi32(row3, row4);
250  row2 = _mm_xor_si128(row2, row3);
251  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
252 
253  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
254  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
255  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
256 
257  t0 = _mm_unpackhi_epi32(m0,m1);
258  t1 = _mm_unpackhi_epi32(t0, m2);
259  t2 = _mm_blend_epi16(t1, m3, 0x0C);
260  buf1 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,1,0,2));
261 
262  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
263  row4 = _mm_xor_si128(row4, row1);
264  row4 = _mm_shuffle_epi8(row4,r16);
265  row3 = _mm_add_epi32(row3, row4);
266  row2 = _mm_xor_si128(row2, row3);
267  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
268 
269  t0 = _mm_slli_si128(m2, 8);
270  t1 = _mm_blend_epi16(m3,m0,0x0C);
271  t2 = _mm_blend_epi16(t1, t0, 0xC0);
272  buf2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,0,1,3));
273 
274  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
275  row4 = _mm_xor_si128(row4, row1);
276  row4 = _mm_shuffle_epi8(row4,r8);
277  row3 = _mm_add_epi32(row3, row4);
278  row2 = _mm_xor_si128(row2, row3);
279  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
280 
281  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
282  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
283  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
284 
285  t0 = _mm_blend_epi16(m0,m1,0x0F);
286  t1 = _mm_blend_epi16(t0, m3, 0xC0);
287  buf3 = _mm_shuffle_epi32(t1, _MM_SHUFFLE(3,0,1,2));
288 
289  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
290  row4 = _mm_xor_si128(row4, row1);
291  row4 = _mm_shuffle_epi8(row4,r16);
292  row3 = _mm_add_epi32(row3, row4);
293  row2 = _mm_xor_si128(row2, row3);
294  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
295 
296  t0 = _mm_unpacklo_epi32(m0,m2);
297  t1 = _mm_unpackhi_epi32(m1,m2);
298  buf4 = _mm_unpacklo_epi64(t1,t0);
299 
300  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
301  row4 = _mm_xor_si128(row4, row1);
302  row4 = _mm_shuffle_epi8(row4,r8);
303  row3 = _mm_add_epi32(row3, row4);
304  row2 = _mm_xor_si128(row2, row3);
305  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
306 
307  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
308  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
309  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
310 
311  t0 = _mm_unpacklo_epi64(m1,m2);
312  t1 = _mm_unpackhi_epi64(m0,m2);
313  t2 = _mm_blend_epi16(t0,t1,0x33);
314  buf1 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,0,1,3));
315 
316  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
317  row4 = _mm_xor_si128(row4, row1);
318  row4 = _mm_shuffle_epi8(row4,r16);
319  row3 = _mm_add_epi32(row3, row4);
320  row2 = _mm_xor_si128(row2, row3);
321  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
322 
323  t0 = _mm_unpackhi_epi64(m1,m3);
324  t1 = _mm_unpacklo_epi64(m0,m1);
325  buf2 = _mm_blend_epi16(t0,t1,0x33);
326 
327  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
328  row4 = _mm_xor_si128(row4, row1);
329  row4 = _mm_shuffle_epi8(row4,r8);
330  row3 = _mm_add_epi32(row3, row4);
331  row2 = _mm_xor_si128(row2, row3);
332  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
333 
334  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
335  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
336  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
337 
338  t0 = _mm_unpackhi_epi64(m3,m1);
339  t1 = _mm_unpackhi_epi64(m2,m0);
340  buf3 = _mm_blend_epi16(t1,t0,0x33);
341 
342  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
343  row4 = _mm_xor_si128(row4, row1);
344  row4 = _mm_shuffle_epi8(row4,r16);
345  row3 = _mm_add_epi32(row3, row4);
346  row2 = _mm_xor_si128(row2, row3);
347  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
348 
349  t0 = _mm_blend_epi16(m0,m2,0x03);
350  t1 = _mm_slli_si128(t0, 8);
351  t2 = _mm_blend_epi16(t1,m3,0x0F);
352  buf4 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,2,0,3));
353 
354  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
355  row4 = _mm_xor_si128(row4, row1);
356  row4 = _mm_shuffle_epi8(row4,r8);
357  row3 = _mm_add_epi32(row3, row4);
358  row2 = _mm_xor_si128(row2, row3);
359  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
360 
361  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
362  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
363  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
364 
365  t0 = _mm_unpackhi_epi32(m0,m1);
366  t1 = _mm_unpacklo_epi32(m0,m2);
367  buf1 = _mm_unpacklo_epi64(t0,t1);
368 
369  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
370  row4 = _mm_xor_si128(row4, row1);
371  row4 = _mm_shuffle_epi8(row4,r16);
372  row3 = _mm_add_epi32(row3, row4);
373  row2 = _mm_xor_si128(row2, row3);
374  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
375 
376  t0 = _mm_srli_si128(m2, 4);
377  t1 = _mm_blend_epi16(m0,m3,0x03);
378  buf2 = _mm_blend_epi16(t1,t0,0x3C);
379 
380  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
381  row4 = _mm_xor_si128(row4, row1);
382  row4 = _mm_shuffle_epi8(row4,r8);
383  row3 = _mm_add_epi32(row3, row4);
384  row2 = _mm_xor_si128(row2, row3);
385  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
386 
387  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
388  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
389  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
390 
391  t0 = _mm_blend_epi16(m1,m0,0x0C);
392  t1 = _mm_srli_si128(m3, 4);
393  t2 = _mm_blend_epi16(t0,t1,0x30);
394  buf3 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,2,3,0));
395 
396  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
397  row4 = _mm_xor_si128(row4, row1);
398  row4 = _mm_shuffle_epi8(row4,r16);
399  row3 = _mm_add_epi32(row3, row4);
400  row2 = _mm_xor_si128(row2, row3);
401  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
402 
403  t0 = _mm_unpacklo_epi64(m1,m2);
404  t1 = _mm_shuffle_epi32(m3, _MM_SHUFFLE(0,2,0,1));
405  buf4 = _mm_blend_epi16(t0,t1,0x33);
406 
407  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
408  row4 = _mm_xor_si128(row4, row1);
409  row4 = _mm_shuffle_epi8(row4,r8);
410  row3 = _mm_add_epi32(row3, row4);
411  row2 = _mm_xor_si128(row2, row3);
412  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
413 
414  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
415  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
416  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
417 
418  t0 = _mm_slli_si128(m1, 12);
419  t1 = _mm_blend_epi16(m0,m3,0x33);
420  buf1 = _mm_blend_epi16(t1,t0,0xC0);
421 
422  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
423  row4 = _mm_xor_si128(row4, row1);
424  row4 = _mm_shuffle_epi8(row4,r16);
425  row3 = _mm_add_epi32(row3, row4);
426  row2 = _mm_xor_si128(row2, row3);
427  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
428 
429  t0 = _mm_blend_epi16(m3,m2,0x30);
430  t1 = _mm_srli_si128(m1, 4);
431  t2 = _mm_blend_epi16(t0,t1,0x03);
432  buf2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,1,3,0));
433 
434  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
435  row4 = _mm_xor_si128(row4, row1);
436  row4 = _mm_shuffle_epi8(row4,r8);
437  row3 = _mm_add_epi32(row3, row4);
438  row2 = _mm_xor_si128(row2, row3);
439  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
440 
441  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
442  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
443  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
444 
445  t0 = _mm_unpacklo_epi64(m0,m2);
446  t1 = _mm_srli_si128(m1, 4);
447  buf3 = _mm_shuffle_epi32(_mm_blend_epi16(t0,t1,0x0C), _MM_SHUFFLE(2,3,1,0));
448 
449  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
450  row4 = _mm_xor_si128(row4, row1);
451  row4 = _mm_shuffle_epi8(row4,r16);
452  row3 = _mm_add_epi32(row3, row4);
453  row2 = _mm_xor_si128(row2, row3);
454  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
455 
456  t0 = _mm_unpackhi_epi32(m1,m2);
457  t1 = _mm_unpackhi_epi64(m0,t0);
458  buf4 = _mm_shuffle_epi32(t1, _MM_SHUFFLE(3,0,1,2));
459 
460  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
461  row4 = _mm_xor_si128(row4, row1);
462  row4 = _mm_shuffle_epi8(row4,r8);
463  row3 = _mm_add_epi32(row3, row4);
464  row2 = _mm_xor_si128(row2, row3);
465  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
466 
467  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
468  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
469  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
470 
471  t0 = _mm_unpackhi_epi32(m0,m1);
472  t1 = _mm_blend_epi16(t0,m3,0x0F);
473  buf1 = _mm_shuffle_epi32(t1,_MM_SHUFFLE(2,0,3,1));
474 
475  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
476  row4 = _mm_xor_si128(row4, row1);
477  row4 = _mm_shuffle_epi8(row4,r16);
478  row3 = _mm_add_epi32(row3, row4);
479  row2 = _mm_xor_si128(row2, row3);
480  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
481 
482  t0 = _mm_blend_epi16(m2,m3,0x30);
483  t1 = _mm_srli_si128(m0,4);
484  t2 = _mm_blend_epi16(t0,t1,0x03);
485  buf2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,0,2,3));
486 
487  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
488  row4 = _mm_xor_si128(row4, row1);
489  row4 = _mm_shuffle_epi8(row4,r8);
490  row3 = _mm_add_epi32(row3, row4);
491  row2 = _mm_xor_si128(row2, row3);
492  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
493 
494  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
495  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
496  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
497 
498  t0 = _mm_unpackhi_epi64(m0,m3);
499  t1 = _mm_unpacklo_epi64(m1,m2);
500  t2 = _mm_blend_epi16(t0,t1,0x3C);
501  buf3 = _mm_shuffle_epi32(t2,_MM_SHUFFLE(0,2,3,1));
502 
503  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
504  row4 = _mm_xor_si128(row4, row1);
505  row4 = _mm_shuffle_epi8(row4,r16);
506  row3 = _mm_add_epi32(row3, row4);
507  row2 = _mm_xor_si128(row2, row3);
508  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
509 
510  t0 = _mm_unpacklo_epi32(m0,m1);
511  t1 = _mm_unpackhi_epi32(m1,m2);
512  buf4 = _mm_unpacklo_epi64(t0,t1);
513 
514  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
515  row4 = _mm_xor_si128(row4, row1);
516  row4 = _mm_shuffle_epi8(row4,r8);
517  row3 = _mm_add_epi32(row3, row4);
518  row2 = _mm_xor_si128(row2, row3);
519  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
520 
521  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
522  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
523  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
524 
525  t0 = _mm_unpackhi_epi32(m1,m3);
526  t1 = _mm_unpacklo_epi64(t0,m0);
527  t2 = _mm_blend_epi16(t1,m2,0xC0);
528  buf1 = _mm_shufflehi_epi16(t2,_MM_SHUFFLE(1,0,3,2));
529 
530  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
531  row4 = _mm_xor_si128(row4, row1);
532  row4 = _mm_shuffle_epi8(row4,r16);
533  row3 = _mm_add_epi32(row3, row4);
534  row2 = _mm_xor_si128(row2, row3);
535  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
536 
537  t0 = _mm_unpackhi_epi32(m0,m3);
538  t1 = _mm_blend_epi16(m2,t0,0xF0);
539  buf2 = _mm_shuffle_epi32(t1,_MM_SHUFFLE(0,2,1,3));
540 
541  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
542  row4 = _mm_xor_si128(row4, row1);
543  row4 = _mm_shuffle_epi8(row4,r8);
544  row3 = _mm_add_epi32(row3, row4);
545  row2 = _mm_xor_si128(row2, row3);
546  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
547 
548  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
549  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
550  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
551 
552  t0 = _mm_blend_epi16(m2,m0,0x0C);
553  t1 = _mm_slli_si128(t0,4);
554  buf3 = _mm_blend_epi16(t1,m3,0x0F);
555 
556  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
557  row4 = _mm_xor_si128(row4, row1);
558  row4 = _mm_shuffle_epi8(row4,r16);
559  row3 = _mm_add_epi32(row3, row4);
560  row2 = _mm_xor_si128(row2, row3);
561  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
562 
563  t0 = _mm_blend_epi16(m1,m0,0x30);
564  buf4 = _mm_shuffle_epi32(t0,_MM_SHUFFLE(1,0,3,2));
565 
566  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
567  row4 = _mm_xor_si128(row4, row1);
568  row4 = _mm_shuffle_epi8(row4,r8);
569  row3 = _mm_add_epi32(row3, row4);
570  row2 = _mm_xor_si128(row2, row3);
571  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
572 
573  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
574  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
575  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
576 
577  t0 = _mm_blend_epi16(m0,m2,0x03);
578  t1 = _mm_blend_epi16(m1,m2,0x30);
579  t2 = _mm_blend_epi16(t1,t0,0x0F);
580  buf1 = _mm_shuffle_epi32(t2,_MM_SHUFFLE(1,3,0,2));
581 
582  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
583  row4 = _mm_xor_si128(row4, row1);
584  row4 = _mm_shuffle_epi8(row4,r16);
585  row3 = _mm_add_epi32(row3, row4);
586  row2 = _mm_xor_si128(row2, row3);
587  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
588 
589  t0 = _mm_slli_si128(m0,4);
590  t1 = _mm_blend_epi16(m1,t0,0xC0);
591  buf2 = _mm_shuffle_epi32(t1,_MM_SHUFFLE(1,2,0,3));
592 
593  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
594  row4 = _mm_xor_si128(row4, row1);
595  row4 = _mm_shuffle_epi8(row4,r8);
596  row3 = _mm_add_epi32(row3, row4);
597  row2 = _mm_xor_si128(row2, row3);
598  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
599 
600  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
601  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
602  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
603 
604  t0 = _mm_unpackhi_epi32(m0,m3);
605  t1 = _mm_unpacklo_epi32(m2,m3);
606  t2 = _mm_unpackhi_epi64(t0,t1);
607  buf3 = _mm_shuffle_epi32(t2,_MM_SHUFFLE(3,0,2,1));
608 
609  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
610  row4 = _mm_xor_si128(row4, row1);
611  row4 = _mm_shuffle_epi8(row4,r16);
612  row3 = _mm_add_epi32(row3, row4);
613  row2 = _mm_xor_si128(row2, row3);
614  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
615 
616  t0 = _mm_blend_epi16(m3,m2,0xC0);
617  t1 = _mm_unpacklo_epi32(m0,m3);
618  t2 = _mm_blend_epi16(t0,t1,0x0F);
619  buf4 = _mm_shuffle_epi32(t2,_MM_SHUFFLE(0,1,2,3));
620 
621  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
622  row4 = _mm_xor_si128(row4, row1);
623  row4 = _mm_shuffle_epi8(row4,r8);
624  row3 = _mm_add_epi32(row3, row4);
625  row2 = _mm_xor_si128(row2, row3);
626  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
627 
628  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
629  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
630  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
631 
632  _mm_storeu_si128(M128_CAST(&state.h[0]), _mm_xor_si128(ff0, _mm_xor_si128(row1, row3)));
633  _mm_storeu_si128(M128_CAST(&state.h[4]), _mm_xor_si128(ff1, _mm_xor_si128(row2, row4)));
634 }
635 
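// For reference, the 64-bit rounds below follow the same pattern with the
// BLAKE2b rotation counts 32, 24, 16 and 63. A scalar sketch (ROTR64 stands
// for a hypothetical 64-bit rotate-right helper; it is not part of this file):
//
//   a += b + m0;  d = ROTR64(d ^ a, 32);
//   c += d;       b = ROTR64(b ^ c, 24);
//   a += b + m1;  d = ROTR64(d ^ a, 16);
//   c += d;       b = ROTR64(b ^ c, 63);
//
// Each 4x64-bit row is split across a low/high register pair. The
// _mm_shuffle_epi32(..., _MM_SHUFFLE(2,3,0,1)) calls perform the 32-bit
// rotation, the r24 and r16 byte shuffles perform the 24- and 16-bit
// rotations, the _mm_srli_epi64/_mm_add_epi64 pairs perform the 63-bit
// rotation (the add is a left shift by one), and the _mm_alignr_epi8 blocks
// diagonalize and then undiagonalize the state between half-rounds, with the
// b0/b1 unpacks, blends and alignr's gathering the message words per the
// round's sigma permutation.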
636 void BLAKE2_Compress64_SSE4(const byte* input, BLAKE2_State<word64, true>& state)
637 {
638  __m128i row1l, row1h;
639  __m128i row2l, row2h;
640  __m128i row3l, row3h;
641  __m128i row4l, row4h;
642  __m128i b0, b1, t0, t1;
643 
644  const __m128i r16 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9);
645  const __m128i r24 = _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10);
646 
647  const __m128i m0 = _mm_loadu_si128(CONST_M128_CAST(input + 00));
648  const __m128i m1 = _mm_loadu_si128(CONST_M128_CAST(input + 16));
649  const __m128i m2 = _mm_loadu_si128(CONST_M128_CAST(input + 32));
650  const __m128i m3 = _mm_loadu_si128(CONST_M128_CAST(input + 48));
651  const __m128i m4 = _mm_loadu_si128(CONST_M128_CAST(input + 64));
652  const __m128i m5 = _mm_loadu_si128(CONST_M128_CAST(input + 80));
653  const __m128i m6 = _mm_loadu_si128(CONST_M128_CAST(input + 96));
654  const __m128i m7 = _mm_loadu_si128(CONST_M128_CAST(input + 112));
655 
656  row1l = _mm_loadu_si128(CONST_M128_CAST(&state.h[0]));
657  row1h = _mm_loadu_si128(CONST_M128_CAST(&state.h[2]));
658  row2l = _mm_loadu_si128(CONST_M128_CAST(&state.h[4]));
659  row2h = _mm_loadu_si128(CONST_M128_CAST(&state.h[6]));
660  row3l = _mm_loadu_si128(CONST_M128_CAST(&BLAKE2B_IV[0]));
661  row3h = _mm_loadu_si128(CONST_M128_CAST(&BLAKE2B_IV[2]));
662  row4l = _mm_xor_si128(_mm_loadu_si128(CONST_M128_CAST(&BLAKE2B_IV[4])), _mm_loadu_si128(CONST_M128_CAST(&state.t[0])));
663  row4h = _mm_xor_si128(_mm_loadu_si128(CONST_M128_CAST(&BLAKE2B_IV[6])), _mm_loadu_si128(CONST_M128_CAST(&state.f[0])));
664 
665  b0 = _mm_unpacklo_epi64(m0, m1);
666  b1 = _mm_unpacklo_epi64(m2, m3);
667  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
668  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
669  row4l = _mm_xor_si128(row4l, row1l);
670  row4h = _mm_xor_si128(row4h, row1h);
671  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
672  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
673  row3l = _mm_add_epi64(row3l, row4l);
674  row3h = _mm_add_epi64(row3h, row4h);
675  row2l = _mm_xor_si128(row2l, row3l);
676  row2h = _mm_xor_si128(row2h, row3h);
677  row2l = _mm_shuffle_epi8(row2l, r24);
678  row2h = _mm_shuffle_epi8(row2h, r24);
679 
680  b0 = _mm_unpackhi_epi64(m0, m1);
681  b1 = _mm_unpackhi_epi64(m2, m3);
682 
683  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
684  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
685  row4l = _mm_xor_si128(row4l, row1l);
686  row4h = _mm_xor_si128(row4h, row1h);
687  row4l = _mm_shuffle_epi8(row4l, r16);
688  row4h = _mm_shuffle_epi8(row4h, r16);
689  row3l = _mm_add_epi64(row3l, row4l);
690  row3h = _mm_add_epi64(row3h, row4h);
691  row2l = _mm_xor_si128(row2l, row3l);
692  row2h = _mm_xor_si128(row2h, row3h);
693  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
694  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
695 
696  t0 = _mm_alignr_epi8(row2h, row2l, 8);
697  t1 = _mm_alignr_epi8(row2l, row2h, 8);
698  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
699  t0 = _mm_alignr_epi8(row4h, row4l, 8);
700  t1 = _mm_alignr_epi8(row4l, row4h, 8);
701  row4l = t1, row4h = t0;
702 
703  b0 = _mm_unpacklo_epi64(m4, m5);
704  b1 = _mm_unpacklo_epi64(m6, m7);
705 
706  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
707  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
708  row4l = _mm_xor_si128(row4l, row1l);
709  row4h = _mm_xor_si128(row4h, row1h);
710  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
711  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
712  row3l = _mm_add_epi64(row3l, row4l);
713  row3h = _mm_add_epi64(row3h, row4h);
714  row2l = _mm_xor_si128(row2l, row3l);
715  row2h = _mm_xor_si128(row2h, row3h);
716  row2l = _mm_shuffle_epi8(row2l, r24);
717  row2h = _mm_shuffle_epi8(row2h, r24);
718 
719  b0 = _mm_unpackhi_epi64(m4, m5);
720  b1 = _mm_unpackhi_epi64(m6, m7);
721 
722  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
723  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
724  row4l = _mm_xor_si128(row4l, row1l);
725  row4h = _mm_xor_si128(row4h, row1h);
726  row4l = _mm_shuffle_epi8(row4l, r16);
727  row4h = _mm_shuffle_epi8(row4h, r16);
728  row3l = _mm_add_epi64(row3l, row4l);
729  row3h = _mm_add_epi64(row3h, row4h);
730  row2l = _mm_xor_si128(row2l, row3l);
731  row2h = _mm_xor_si128(row2h, row3h);
732  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
733  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
734 
735  t0 = _mm_alignr_epi8(row2l, row2h, 8);
736  t1 = _mm_alignr_epi8(row2h, row2l, 8);
737  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
738  t0 = _mm_alignr_epi8(row4l, row4h, 8);
739  t1 = _mm_alignr_epi8(row4h, row4l, 8);
740  row4l = t1, row4h = t0;
741 
742  b0 = _mm_unpacklo_epi64(m7, m2);
743  b1 = _mm_unpackhi_epi64(m4, m6);
744 
745  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
746  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
747  row4l = _mm_xor_si128(row4l, row1l);
748  row4h = _mm_xor_si128(row4h, row1h);
749  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
750  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
751  row3l = _mm_add_epi64(row3l, row4l);
752  row3h = _mm_add_epi64(row3h, row4h);
753  row2l = _mm_xor_si128(row2l, row3l);
754  row2h = _mm_xor_si128(row2h, row3h);
755  row2l = _mm_shuffle_epi8(row2l, r24);
756  row2h = _mm_shuffle_epi8(row2h, r24);
757 
758  b0 = _mm_unpacklo_epi64(m5, m4);
759  b1 = _mm_alignr_epi8(m3, m7, 8);
760 
761  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
762  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
763  row4l = _mm_xor_si128(row4l, row1l);
764  row4h = _mm_xor_si128(row4h, row1h);
765  row4l = _mm_shuffle_epi8(row4l, r16);
766  row4h = _mm_shuffle_epi8(row4h, r16);
767  row3l = _mm_add_epi64(row3l, row4l);
768  row3h = _mm_add_epi64(row3h, row4h);
769  row2l = _mm_xor_si128(row2l, row3l);
770  row2h = _mm_xor_si128(row2h, row3h);
771  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
772  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
773 
774  t0 = _mm_alignr_epi8(row2h, row2l, 8);
775  t1 = _mm_alignr_epi8(row2l, row2h, 8);
776  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
777  t0 = _mm_alignr_epi8(row4h, row4l, 8);
778  t1 = _mm_alignr_epi8(row4l, row4h, 8);
779  row4l = t1, row4h = t0;
780 
781  b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2));
782  b1 = _mm_unpackhi_epi64(m5, m2);
783 
784  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
785  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
786  row4l = _mm_xor_si128(row4l, row1l);
787  row4h = _mm_xor_si128(row4h, row1h);
788  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
789  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
790  row3l = _mm_add_epi64(row3l, row4l);
791  row3h = _mm_add_epi64(row3h, row4h);
792  row2l = _mm_xor_si128(row2l, row3l);
793  row2h = _mm_xor_si128(row2h, row3h);
794  row2l = _mm_shuffle_epi8(row2l, r24);
795  row2h = _mm_shuffle_epi8(row2h, r24);
796 
797  b0 = _mm_unpacklo_epi64(m6, m1);
798  b1 = _mm_unpackhi_epi64(m3, m1);
799 
800  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
801  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
802  row4l = _mm_xor_si128(row4l, row1l);
803  row4h = _mm_xor_si128(row4h, row1h);
804  row4l = _mm_shuffle_epi8(row4l, r16);
805  row4h = _mm_shuffle_epi8(row4h, r16);
806  row3l = _mm_add_epi64(row3l, row4l);
807  row3h = _mm_add_epi64(row3h, row4h);
808  row2l = _mm_xor_si128(row2l, row3l);
809  row2h = _mm_xor_si128(row2h, row3h);
810  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
811  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
812 
813  t0 = _mm_alignr_epi8(row2l, row2h, 8);
814  t1 = _mm_alignr_epi8(row2h, row2l, 8);
815  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
816  t0 = _mm_alignr_epi8(row4l, row4h, 8);
817  t1 = _mm_alignr_epi8(row4h, row4l, 8);
818  row4l = t1, row4h = t0;
819 
820  b0 = _mm_alignr_epi8(m6, m5, 8);
821  b1 = _mm_unpackhi_epi64(m2, m7);
822 
823  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
824  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
825  row4l = _mm_xor_si128(row4l, row1l);
826  row4h = _mm_xor_si128(row4h, row1h);
827  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
828  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
829  row3l = _mm_add_epi64(row3l, row4l);
830  row3h = _mm_add_epi64(row3h, row4h);
831  row2l = _mm_xor_si128(row2l, row3l);
832  row2h = _mm_xor_si128(row2h, row3h);
833  row2l = _mm_shuffle_epi8(row2l, r24);
834  row2h = _mm_shuffle_epi8(row2h, r24);
835 
836  b0 = _mm_unpacklo_epi64(m4, m0);
837  b1 = _mm_blend_epi16(m1, m6, 0xF0);
838 
839  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
840  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
841  row4l = _mm_xor_si128(row4l, row1l);
842  row4h = _mm_xor_si128(row4h, row1h);
843  row4l = _mm_shuffle_epi8(row4l, r16);
844  row4h = _mm_shuffle_epi8(row4h, r16);
845  row3l = _mm_add_epi64(row3l, row4l);
846  row3h = _mm_add_epi64(row3h, row4h);
847  row2l = _mm_xor_si128(row2l, row3l);
848  row2h = _mm_xor_si128(row2h, row3h);
849  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
850  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
851 
852  t0 = _mm_alignr_epi8(row2h, row2l, 8);
853  t1 = _mm_alignr_epi8(row2l, row2h, 8);
854  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
855  t0 = _mm_alignr_epi8(row4h, row4l, 8);
856  t1 = _mm_alignr_epi8(row4l, row4h, 8);
857  row4l = t1, row4h = t0;
858 
859  b0 = _mm_blend_epi16(m5, m1, 0xF0);
860  b1 = _mm_unpackhi_epi64(m3, m4);
861 
862  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
863  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
864  row4l = _mm_xor_si128(row4l, row1l);
865  row4h = _mm_xor_si128(row4h, row1h);
866  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
867  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
868  row3l = _mm_add_epi64(row3l, row4l);
869  row3h = _mm_add_epi64(row3h, row4h);
870  row2l = _mm_xor_si128(row2l, row3l);
871  row2h = _mm_xor_si128(row2h, row3h);
872  row2l = _mm_shuffle_epi8(row2l, r24);
873  row2h = _mm_shuffle_epi8(row2h, r24);
874 
875  b0 = _mm_unpacklo_epi64(m7, m3);
876  b1 = _mm_alignr_epi8(m2, m0, 8);
877 
878  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
879  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
880  row4l = _mm_xor_si128(row4l, row1l);
881  row4h = _mm_xor_si128(row4h, row1h);
882  row4l = _mm_shuffle_epi8(row4l, r16);
883  row4h = _mm_shuffle_epi8(row4h, r16);
884  row3l = _mm_add_epi64(row3l, row4l);
885  row3h = _mm_add_epi64(row3h, row4h);
886  row2l = _mm_xor_si128(row2l, row3l);
887  row2h = _mm_xor_si128(row2h, row3h);
888  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
889  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
890 
891  t0 = _mm_alignr_epi8(row2l, row2h, 8);
892  t1 = _mm_alignr_epi8(row2h, row2l, 8);
893  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
894  t0 = _mm_alignr_epi8(row4l, row4h, 8);
895  t1 = _mm_alignr_epi8(row4h, row4l, 8);
896  row4l = t1, row4h = t0;
897 
898  b0 = _mm_unpackhi_epi64(m3, m1);
899  b1 = _mm_unpackhi_epi64(m6, m5);
900 
901  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
902  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
903  row4l = _mm_xor_si128(row4l, row1l);
904  row4h = _mm_xor_si128(row4h, row1h);
905  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
906  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
907  row3l = _mm_add_epi64(row3l, row4l);
908  row3h = _mm_add_epi64(row3h, row4h);
909  row2l = _mm_xor_si128(row2l, row3l);
910  row2h = _mm_xor_si128(row2h, row3h);
911  row2l = _mm_shuffle_epi8(row2l, r24);
912  row2h = _mm_shuffle_epi8(row2h, r24);
913 
914  b0 = _mm_unpackhi_epi64(m4, m0);
915  b1 = _mm_unpacklo_epi64(m6, m7);
916 
917  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
918  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
919  row4l = _mm_xor_si128(row4l, row1l);
920  row4h = _mm_xor_si128(row4h, row1h);
921  row4l = _mm_shuffle_epi8(row4l, r16);
922  row4h = _mm_shuffle_epi8(row4h, r16);
923  row3l = _mm_add_epi64(row3l, row4l);
924  row3h = _mm_add_epi64(row3h, row4h);
925  row2l = _mm_xor_si128(row2l, row3l);
926  row2h = _mm_xor_si128(row2h, row3h);
927  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
928  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
929 
930  t0 = _mm_alignr_epi8(row2h, row2l, 8);
931  t1 = _mm_alignr_epi8(row2l, row2h, 8);
932  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
933  t0 = _mm_alignr_epi8(row4h, row4l, 8);
934  t1 = _mm_alignr_epi8(row4l, row4h, 8);
935  row4l = t1, row4h = t0;
936 
937  b0 = _mm_blend_epi16(m1, m2, 0xF0);
938  b1 = _mm_blend_epi16(m2, m7, 0xF0);
939 
940  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
941  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
942  row4l = _mm_xor_si128(row4l, row1l);
943  row4h = _mm_xor_si128(row4h, row1h);
944  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
945  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
946  row3l = _mm_add_epi64(row3l, row4l);
947  row3h = _mm_add_epi64(row3h, row4h);
948  row2l = _mm_xor_si128(row2l, row3l);
949  row2h = _mm_xor_si128(row2h, row3h);
950  row2l = _mm_shuffle_epi8(row2l, r24);
951  row2h = _mm_shuffle_epi8(row2h, r24);
952 
953  b0 = _mm_unpacklo_epi64(m3, m5);
954  b1 = _mm_unpacklo_epi64(m0, m4);
955 
956  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
957  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
958  row4l = _mm_xor_si128(row4l, row1l);
959  row4h = _mm_xor_si128(row4h, row1h);
960  row4l = _mm_shuffle_epi8(row4l, r16);
961  row4h = _mm_shuffle_epi8(row4h, r16);
962  row3l = _mm_add_epi64(row3l, row4l);
963  row3h = _mm_add_epi64(row3h, row4h);
964  row2l = _mm_xor_si128(row2l, row3l);
965  row2h = _mm_xor_si128(row2h, row3h);
966  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
967  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
968 
969  t0 = _mm_alignr_epi8(row2l, row2h, 8);
970  t1 = _mm_alignr_epi8(row2h, row2l, 8);
971  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
972  t0 = _mm_alignr_epi8(row4l, row4h, 8);
973  t1 = _mm_alignr_epi8(row4h, row4l, 8);
974  row4l = t1, row4h = t0;
975 
976  b0 = _mm_unpackhi_epi64(m4, m2);
977  b1 = _mm_unpacklo_epi64(m1, m5);
978 
979  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
980  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
981  row4l = _mm_xor_si128(row4l, row1l);
982  row4h = _mm_xor_si128(row4h, row1h);
983  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
984  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
985  row3l = _mm_add_epi64(row3l, row4l);
986  row3h = _mm_add_epi64(row3h, row4h);
987  row2l = _mm_xor_si128(row2l, row3l);
988  row2h = _mm_xor_si128(row2h, row3h);
989  row2l = _mm_shuffle_epi8(row2l, r24);
990  row2h = _mm_shuffle_epi8(row2h, r24);
991 
992  b0 = _mm_blend_epi16(m0, m3, 0xF0);
993  b1 = _mm_blend_epi16(m2, m7, 0xF0);
994 
995  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
996  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
997  row4l = _mm_xor_si128(row4l, row1l);
998  row4h = _mm_xor_si128(row4h, row1h);
999  row4l = _mm_shuffle_epi8(row4l, r16);
1000  row4h = _mm_shuffle_epi8(row4h, r16);
1001  row3l = _mm_add_epi64(row3l, row4l);
1002  row3h = _mm_add_epi64(row3h, row4h);
1003  row2l = _mm_xor_si128(row2l, row3l);
1004  row2h = _mm_xor_si128(row2h, row3h);
1005  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
1006  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
1007 
1008  t0 = _mm_alignr_epi8(row2h, row2l, 8);
1009  t1 = _mm_alignr_epi8(row2l, row2h, 8);
1010  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
1011  t0 = _mm_alignr_epi8(row4h, row4l, 8);
1012  t1 = _mm_alignr_epi8(row4l, row4h, 8);
1013  row4l = t1, row4h = t0;
1014 
1015  b0 = _mm_blend_epi16(m7, m5, 0xF0);
1016  b1 = _mm_blend_epi16(m3, m1, 0xF0);
1017 
1018  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1019  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1020  row4l = _mm_xor_si128(row4l, row1l);
1021  row4h = _mm_xor_si128(row4h, row1h);
1022  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
1023  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
1024  row3l = _mm_add_epi64(row3l, row4l);
1025  row3h = _mm_add_epi64(row3h, row4h);
1026  row2l = _mm_xor_si128(row2l, row3l);
1027  row2h = _mm_xor_si128(row2h, row3h);
1028  row2l = _mm_shuffle_epi8(row2l, r24);
1029  row2h = _mm_shuffle_epi8(row2h, r24);
1030 
1031  b0 = _mm_alignr_epi8(m6, m0, 8);
1032  b1 = _mm_blend_epi16(m4, m6, 0xF0);
1033 
1034  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1035  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1036  row4l = _mm_xor_si128(row4l, row1l);
1037  row4h = _mm_xor_si128(row4h, row1h);
1038  row4l = _mm_shuffle_epi8(row4l, r16);
1039  row4h = _mm_shuffle_epi8(row4h, r16);
1040  row3l = _mm_add_epi64(row3l, row4l);
1041  row3h = _mm_add_epi64(row3h, row4h);
1042  row2l = _mm_xor_si128(row2l, row3l);
1043  row2h = _mm_xor_si128(row2h, row3h);
1044  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
1045  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
1046 
1047  t0 = _mm_alignr_epi8(row2l, row2h, 8);
1048  t1 = _mm_alignr_epi8(row2h, row2l, 8);
1049  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
1050  t0 = _mm_alignr_epi8(row4l, row4h, 8);
1051  t1 = _mm_alignr_epi8(row4h, row4l, 8);
1052  row4l = t1, row4h = t0;
1053 
1054  b0 = _mm_unpacklo_epi64(m1, m3);
1055  b1 = _mm_unpacklo_epi64(m0, m4);
1056 
1057  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1058  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1059  row4l = _mm_xor_si128(row4l, row1l);
1060  row4h = _mm_xor_si128(row4h, row1h);
1061  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
1062  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
1063  row3l = _mm_add_epi64(row3l, row4l);
1064  row3h = _mm_add_epi64(row3h, row4h);
1065  row2l = _mm_xor_si128(row2l, row3l);
1066  row2h = _mm_xor_si128(row2h, row3h);
1067  row2l = _mm_shuffle_epi8(row2l, r24);
1068  row2h = _mm_shuffle_epi8(row2h, r24);
1069 
1070  b0 = _mm_unpacklo_epi64(m6, m5);
1071  b1 = _mm_unpackhi_epi64(m5, m1);
1072 
1073  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1074  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1075  row4l = _mm_xor_si128(row4l, row1l);
1076  row4h = _mm_xor_si128(row4h, row1h);
1077  row4l = _mm_shuffle_epi8(row4l, r16);
1078  row4h = _mm_shuffle_epi8(row4h, r16);
1079  row3l = _mm_add_epi64(row3l, row4l);
1080  row3h = _mm_add_epi64(row3h, row4h);
1081  row2l = _mm_xor_si128(row2l, row3l);
1082  row2h = _mm_xor_si128(row2h, row3h);
1083  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
1084  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
1085 
1086  t0 = _mm_alignr_epi8(row2h, row2l, 8);
1087  t1 = _mm_alignr_epi8(row2l, row2h, 8);
1088  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
1089  t0 = _mm_alignr_epi8(row4h, row4l, 8);
1090  t1 = _mm_alignr_epi8(row4l, row4h, 8);
1091  row4l = t1, row4h = t0;
1092 
1093  b0 = _mm_blend_epi16(m2, m3, 0xF0);
1094  b1 = _mm_unpackhi_epi64(m7, m0);
1095 
1096  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1097  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1098  row4l = _mm_xor_si128(row4l, row1l);
1099  row4h = _mm_xor_si128(row4h, row1h);
1100  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
1101  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
1102  row3l = _mm_add_epi64(row3l, row4l);
1103  row3h = _mm_add_epi64(row3h, row4h);
1104  row2l = _mm_xor_si128(row2l, row3l);
1105  row2h = _mm_xor_si128(row2h, row3h);
1106  row2l = _mm_shuffle_epi8(row2l, r24);
1107  row2h = _mm_shuffle_epi8(row2h, r24);
1108 
1109  b0 = _mm_unpackhi_epi64(m6, m2);
1110  b1 = _mm_blend_epi16(m7, m4, 0xF0);
1111 
1112  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1113  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1114  row4l = _mm_xor_si128(row4l, row1l);
1115  row4h = _mm_xor_si128(row4h, row1h);
1116  row4l = _mm_shuffle_epi8(row4l, r16);
1117  row4h = _mm_shuffle_epi8(row4h, r16);
1118  row3l = _mm_add_epi64(row3l, row4l);
1119  row3h = _mm_add_epi64(row3h, row4h);
1120  row2l = _mm_xor_si128(row2l, row3l);
1121  row2h = _mm_xor_si128(row2h, row3h);
1122  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
1123  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
1124 
1125  t0 = _mm_alignr_epi8(row2l, row2h, 8);
1126  t1 = _mm_alignr_epi8(row2h, row2l, 8);
1127  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
1128  t0 = _mm_alignr_epi8(row4l, row4h, 8);
1129  t1 = _mm_alignr_epi8(row4h, row4l, 8);
1130  row4l = t1, row4h = t0;
1131 
1132  b0 = _mm_blend_epi16(m6, m0, 0xF0);
1133  b1 = _mm_unpacklo_epi64(m7, m2);
1134 
1135  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1136  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1137  row4l = _mm_xor_si128(row4l, row1l);
1138  row4h = _mm_xor_si128(row4h, row1h);
1139  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
1140  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
1141  row3l = _mm_add_epi64(row3l, row4l);
1142  row3h = _mm_add_epi64(row3h, row4h);
1143  row2l = _mm_xor_si128(row2l, row3l);
1144  row2h = _mm_xor_si128(row2h, row3h);
1145  row2l = _mm_shuffle_epi8(row2l, r24);
1146  row2h = _mm_shuffle_epi8(row2h, r24);
1147 
1148  b0 = _mm_unpackhi_epi64(m2, m7);
1149  b1 = _mm_alignr_epi8(m5, m6, 8);
1150 
1151  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1152  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1153  row4l = _mm_xor_si128(row4l, row1l);
1154  row4h = _mm_xor_si128(row4h, row1h);
1155  row4l = _mm_shuffle_epi8(row4l, r16);
1156  row4h = _mm_shuffle_epi8(row4h, r16);
1157  row3l = _mm_add_epi64(row3l, row4l);
1158  row3h = _mm_add_epi64(row3h, row4h);
1159  row2l = _mm_xor_si128(row2l, row3l);
1160  row2h = _mm_xor_si128(row2h, row3h);
1161  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
1162  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
1163 
1164  t0 = _mm_alignr_epi8(row2h, row2l, 8);
1165  t1 = _mm_alignr_epi8(row2l, row2h, 8);
1166  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
1167  t0 = _mm_alignr_epi8(row4h, row4l, 8);
1168  t1 = _mm_alignr_epi8(row4l, row4h, 8);
1169  row4l = t1, row4h = t0;
1170 
1171  b0 = _mm_unpacklo_epi64(m0, m3);
1172  b1 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1,0,3,2));
1173 
1174  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1175  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1176  row4l = _mm_xor_si128(row4l, row1l);
1177  row4h = _mm_xor_si128(row4h, row1h);
1178  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
1179  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
1180  row3l = _mm_add_epi64(row3l, row4l);
1181  row3h = _mm_add_epi64(row3h, row4h);
1182  row2l = _mm_xor_si128(row2l, row3l);
1183  row2h = _mm_xor_si128(row2h, row3h);
1184  row2l = _mm_shuffle_epi8(row2l, r24);
1185  row2h = _mm_shuffle_epi8(row2h, r24);
1186 
1187  b0 = _mm_unpackhi_epi64(m3, m1);
1188  b1 = _mm_blend_epi16(m1, m5, 0xF0);
1189 
1190  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1191  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1192  row4l = _mm_xor_si128(row4l, row1l);
1193  row4h = _mm_xor_si128(row4h, row1h);
1194  row4l = _mm_shuffle_epi8(row4l, r16);
1195  row4h = _mm_shuffle_epi8(row4h, r16);
1196  row3l = _mm_add_epi64(row3l, row4l);
1197  row3h = _mm_add_epi64(row3h, row4h);
1198  row2l = _mm_xor_si128(row2l, row3l);
1199  row2h = _mm_xor_si128(row2h, row3h);
1200  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
1201  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
1202 
1203  t0 = _mm_alignr_epi8(row2l, row2h, 8);
1204  t1 = _mm_alignr_epi8(row2h, row2l, 8);
1205  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
1206  t0 = _mm_alignr_epi8(row4l, row4h, 8);
1207  t1 = _mm_alignr_epi8(row4h, row4l, 8);
1208  row4l = t1, row4h = t0;
1209 
1210  b0 = _mm_unpackhi_epi64(m6, m3);
1211  b1 = _mm_blend_epi16(m6, m1, 0xF0);
1212 
1213  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1214  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1215  row4l = _mm_xor_si128(row4l, row1l);
1216  row4h = _mm_xor_si128(row4h, row1h);
1217  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
1218  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
1219  row3l = _mm_add_epi64(row3l, row4l);
1220  row3h = _mm_add_epi64(row3h, row4h);
1221  row2l = _mm_xor_si128(row2l, row3l);
1222  row2h = _mm_xor_si128(row2h, row3h);
1223  row2l = _mm_shuffle_epi8(row2l, r24);
1224  row2h = _mm_shuffle_epi8(row2h, r24);
1225 
1226  b0 = _mm_alignr_epi8(m7, m5, 8);
1227  b1 = _mm_unpackhi_epi64(m0, m4);
1228 
1229  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1230  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1231  row4l = _mm_xor_si128(row4l, row1l);
1232  row4h = _mm_xor_si128(row4h, row1h);
1233  row4l = _mm_shuffle_epi8(row4l, r16);
1234  row4h = _mm_shuffle_epi8(row4h, r16);
1235  row3l = _mm_add_epi64(row3l, row4l);
1236  row3h = _mm_add_epi64(row3h, row4h);
1237  row2l = _mm_xor_si128(row2l, row3l);
1238  row2h = _mm_xor_si128(row2h, row3h);
1239  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
1240  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
1241 
1242  t0 = _mm_alignr_epi8(row2h, row2l, 8);
1243  t1 = _mm_alignr_epi8(row2l, row2h, 8);
1244  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
1245  t0 = _mm_alignr_epi8(row4h, row4l, 8);
1246  t1 = _mm_alignr_epi8(row4l, row4h, 8);
1247  row4l = t1, row4h = t0;
1248 
1249  b0 = _mm_unpackhi_epi64(m2, m7);
1250  b1 = _mm_unpacklo_epi64(m4, m1);
1251 
1252  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1253  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1254  row4l = _mm_xor_si128(row4l, row1l);
1255  row4h = _mm_xor_si128(row4h, row1h);
1256  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
1257  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
1258  row3l = _mm_add_epi64(row3l, row4l);
1259  row3h = _mm_add_epi64(row3h, row4h);
1260  row2l = _mm_xor_si128(row2l, row3l);
1261  row2h = _mm_xor_si128(row2h, row3h);
1262  row2l = _mm_shuffle_epi8(row2l, r24);
1263  row2h = _mm_shuffle_epi8(row2h, r24);
1264 
1265  b0 = _mm_unpacklo_epi64(m0, m2);
1266  b1 = _mm_unpacklo_epi64(m3, m5);
1267 
1268  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1269  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1270  row4l = _mm_xor_si128(row4l, row1l);
1271  row4h = _mm_xor_si128(row4h, row1h);
1272  row4l = _mm_shuffle_epi8(row4l, r16);
1273  row4h = _mm_shuffle_epi8(row4h, r16);
1274  row3l = _mm_add_epi64(row3l, row4l);
1275  row3h = _mm_add_epi64(row3h, row4h);
1276  row2l = _mm_xor_si128(row2l, row3l);
1277  row2h = _mm_xor_si128(row2h, row3h);
1278  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
1279  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
1280 
1281  t0 = _mm_alignr_epi8(row2l, row2h, 8);
1282  t1 = _mm_alignr_epi8(row2h, row2l, 8);
1283  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
1284  t0 = _mm_alignr_epi8(row4l, row4h, 8);
1285  t1 = _mm_alignr_epi8(row4h, row4l, 8);
1286  row4l = t1, row4h = t0;
1287 
1288  b0 = _mm_unpacklo_epi64(m3, m7);
1289  b1 = _mm_alignr_epi8(m0, m5, 8);
1290 
1291  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1292  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1293  row4l = _mm_xor_si128(row4l, row1l);
1294  row4h = _mm_xor_si128(row4h, row1h);
1295  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
1296  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
1297  row3l = _mm_add_epi64(row3l, row4l);
1298  row3h = _mm_add_epi64(row3h, row4h);
1299  row2l = _mm_xor_si128(row2l, row3l);
1300  row2h = _mm_xor_si128(row2h, row3h);
1301  row2l = _mm_shuffle_epi8(row2l, r24);
1302  row2h = _mm_shuffle_epi8(row2h, r24);
1303 
1304  b0 = _mm_unpackhi_epi64(m7, m4);
1305  b1 = _mm_alignr_epi8(m4, m1, 8);
1306 
1307  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1308  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1309  row4l = _mm_xor_si128(row4l, row1l);
1310  row4h = _mm_xor_si128(row4h, row1h);
1311  row4l = _mm_shuffle_epi8(row4l, r16);
1312  row4h = _mm_shuffle_epi8(row4h, r16);
1313  row3l = _mm_add_epi64(row3l, row4l);
1314  row3h = _mm_add_epi64(row3h, row4h);
1315  row2l = _mm_xor_si128(row2l, row3l);
1316  row2h = _mm_xor_si128(row2h, row3h);
1317  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
1318  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
1319 
1320  t0 = _mm_alignr_epi8(row2h, row2l, 8);
1321  t1 = _mm_alignr_epi8(row2l, row2h, 8);
1322  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
1323  t0 = _mm_alignr_epi8(row4h, row4l, 8);
1324  t1 = _mm_alignr_epi8(row4l, row4h, 8);
1325  row4l = t1, row4h = t0;
1326 
1327  b0 = m6;
1328  b1 = _mm_alignr_epi8(m5, m0, 8);
1329 
1330  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1331  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1332  row4l = _mm_xor_si128(row4l, row1l);
1333  row4h = _mm_xor_si128(row4h, row1h);
1334  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
1335  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
1336  row3l = _mm_add_epi64(row3l, row4l);
1337  row3h = _mm_add_epi64(row3h, row4h);
1338  row2l = _mm_xor_si128(row2l, row3l);
1339  row2h = _mm_xor_si128(row2h, row3h);
1340  row2l = _mm_shuffle_epi8(row2l, r24);
1341  row2h = _mm_shuffle_epi8(row2h, r24);
1342 
1343  b0 = _mm_blend_epi16(m1, m3, 0xF0);
1344  b1 = m2;
1345 
1346  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1347  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1348  row4l = _mm_xor_si128(row4l, row1l);
1349  row4h = _mm_xor_si128(row4h, row1h);
1350  row4l = _mm_shuffle_epi8(row4l, r16);
1351  row4h = _mm_shuffle_epi8(row4h, r16);
1352  row3l = _mm_add_epi64(row3l, row4l);
1353  row3h = _mm_add_epi64(row3h, row4h);
1354  row2l = _mm_xor_si128(row2l, row3l);
1355  row2h = _mm_xor_si128(row2h, row3h);
1356  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
1357  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
1358 
1359  t0 = _mm_alignr_epi8(row2l, row2h, 8);
1360  t1 = _mm_alignr_epi8(row2h, row2l, 8);
1361  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
1362  t0 = _mm_alignr_epi8(row4l, row4h, 8);
1363  t1 = _mm_alignr_epi8(row4h, row4l, 8);
1364  row4l = t1, row4h = t0;
1365 
1366  b0 = _mm_unpacklo_epi64(m5, m4);
1367  b1 = _mm_unpackhi_epi64(m3, m0);
1368 
1369  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1370  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1371  row4l = _mm_xor_si128(row4l, row1l);
1372  row4h = _mm_xor_si128(row4h, row1h);
1373  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
1374  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
1375  row3l = _mm_add_epi64(row3l, row4l);
1376  row3h = _mm_add_epi64(row3h, row4h);
1377  row2l = _mm_xor_si128(row2l, row3l);
1378  row2h = _mm_xor_si128(row2h, row3h);
1379  row2l = _mm_shuffle_epi8(row2l, r24);
1380  row2h = _mm_shuffle_epi8(row2h, r24);
1381 
1382  b0 = _mm_unpacklo_epi64(m1, m2);
1383  b1 = _mm_blend_epi16(m3, m2, 0xF0);
1384 
1385  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1386  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1387  row4l = _mm_xor_si128(row4l, row1l);
1388  row4h = _mm_xor_si128(row4h, row1h);
1389  row4l = _mm_shuffle_epi8(row4l, r16);
1390  row4h = _mm_shuffle_epi8(row4h, r16);
1391  row3l = _mm_add_epi64(row3l, row4l);
1392  row3h = _mm_add_epi64(row3h, row4h);
1393  row2l = _mm_xor_si128(row2l, row3l);
1394  row2h = _mm_xor_si128(row2h, row3h);
1395  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
1396  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
1397 
1398  t0 = _mm_alignr_epi8(row2h, row2l, 8);
1399  t1 = _mm_alignr_epi8(row2l, row2h, 8);
1400  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
1401  t0 = _mm_alignr_epi8(row4h, row4l, 8);
1402  t1 = _mm_alignr_epi8(row4l, row4h, 8);
1403  row4l = t1, row4h = t0;
1404 
1405  b0 = _mm_unpackhi_epi64(m7, m4);
1406  b1 = _mm_unpackhi_epi64(m1, m6);
1407 
1408  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1409  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1410  row4l = _mm_xor_si128(row4l, row1l);
1411  row4h = _mm_xor_si128(row4h, row1h);
1412  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
1413  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
1414  row3l = _mm_add_epi64(row3l, row4l);
1415  row3h = _mm_add_epi64(row3h, row4h);
1416  row2l = _mm_xor_si128(row2l, row3l);
1417  row2h = _mm_xor_si128(row2h, row3h);
1418  row2l = _mm_shuffle_epi8(row2l, r24);
1419  row2h = _mm_shuffle_epi8(row2h, r24);
1420 
1421  b0 = _mm_alignr_epi8(m7, m5, 8);
1422  b1 = _mm_unpacklo_epi64(m6, m0);
1423 
1424  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1425  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1426  row4l = _mm_xor_si128(row4l, row1l);
1427  row4h = _mm_xor_si128(row4h, row1h);
1428  row4l = _mm_shuffle_epi8(row4l, r16);
1429  row4h = _mm_shuffle_epi8(row4h, r16);
1430  row3l = _mm_add_epi64(row3l, row4l);
1431  row3h = _mm_add_epi64(row3h, row4h);
1432  row2l = _mm_xor_si128(row2l, row3l);
1433  row2h = _mm_xor_si128(row2h, row3h);
1434  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
1435  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
1436 
1437  t0 = _mm_alignr_epi8(row2l, row2h, 8);
1438  t1 = _mm_alignr_epi8(row2h, row2l, 8);
1439  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
1440  t0 = _mm_alignr_epi8(row4l, row4h, 8);
1441  t1 = _mm_alignr_epi8(row4h, row4l, 8);
1442  row4l = t1, row4h = t0;
1443 
1444  b0 = _mm_unpacklo_epi64(m0, m1);
1445  b1 = _mm_unpacklo_epi64(m2, m3);
1446 
1447  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1448  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1449  row4l = _mm_xor_si128(row4l, row1l);
1450  row4h = _mm_xor_si128(row4h, row1h);
1451  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
1452  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
1453  row3l = _mm_add_epi64(row3l, row4l);
1454  row3h = _mm_add_epi64(row3h, row4h);
1455  row2l = _mm_xor_si128(row2l, row3l);
1456  row2h = _mm_xor_si128(row2h, row3h);
1457  row2l = _mm_shuffle_epi8(row2l, r24);
1458  row2h = _mm_shuffle_epi8(row2h, r24);
1459 
1460  b0 = _mm_unpackhi_epi64(m0, m1);
1461  b1 = _mm_unpackhi_epi64(m2, m3);
1462 
1463  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1464  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1465  row4l = _mm_xor_si128(row4l, row1l);
1466  row4h = _mm_xor_si128(row4h, row1h);
1467  row4l = _mm_shuffle_epi8(row4l, r16);
1468  row4h = _mm_shuffle_epi8(row4h, r16);
1469  row3l = _mm_add_epi64(row3l, row4l);
1470  row3h = _mm_add_epi64(row3h, row4h);
1471  row2l = _mm_xor_si128(row2l, row3l);
1472  row2h = _mm_xor_si128(row2h, row3h);
1473  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
1474  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
1475 
1476  t0 = _mm_alignr_epi8(row2h, row2l, 8);
1477  t1 = _mm_alignr_epi8(row2l, row2h, 8);
1478  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
1479  t0 = _mm_alignr_epi8(row4h, row4l, 8);
1480  t1 = _mm_alignr_epi8(row4l, row4h, 8);
1481  row4l = t1, row4h = t0;
1482 
1483  b0 = _mm_unpacklo_epi64(m4, m5);
1484  b1 = _mm_unpacklo_epi64(m6, m7);
1485 
1486  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1487  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1488  row4l = _mm_xor_si128(row4l, row1l);
1489  row4h = _mm_xor_si128(row4h, row1h);
1490  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
1491  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
1492  row3l = _mm_add_epi64(row3l, row4l);
1493  row3h = _mm_add_epi64(row3h, row4h);
1494  row2l = _mm_xor_si128(row2l, row3l);
1495  row2h = _mm_xor_si128(row2h, row3h);
1496  row2l = _mm_shuffle_epi8(row2l, r24);
1497  row2h = _mm_shuffle_epi8(row2h, r24);
1498 
1499  b0 = _mm_unpackhi_epi64(m4, m5);
1500  b1 = _mm_unpackhi_epi64(m6, m7);
1501 
1502  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1503  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1504  row4l = _mm_xor_si128(row4l, row1l);
1505  row4h = _mm_xor_si128(row4h, row1h);
1506  row4l = _mm_shuffle_epi8(row4l, r16);
1507  row4h = _mm_shuffle_epi8(row4h, r16);
1508  row3l = _mm_add_epi64(row3l, row4l);
1509  row3h = _mm_add_epi64(row3h, row4h);
1510  row2l = _mm_xor_si128(row2l, row3l);
1511  row2h = _mm_xor_si128(row2h, row3h);
1512  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
1513  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
1514 
1515  t0 = _mm_alignr_epi8(row2l, row2h, 8);
1516  t1 = _mm_alignr_epi8(row2h, row2l, 8);
1517  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
1518  t0 = _mm_alignr_epi8(row4l, row4h, 8);
1519  t1 = _mm_alignr_epi8(row4h, row4l, 8);
1520  row4l = t1, row4h = t0;
1521 
1522  b0 = _mm_unpacklo_epi64(m7, m2);
1523  b1 = _mm_unpackhi_epi64(m4, m6);
1524 
1525  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1526  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1527  row4l = _mm_xor_si128(row4l, row1l);
1528  row4h = _mm_xor_si128(row4h, row1h);
1529  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
1530  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
1531  row3l = _mm_add_epi64(row3l, row4l);
1532  row3h = _mm_add_epi64(row3h, row4h);
1533  row2l = _mm_xor_si128(row2l, row3l);
1534  row2h = _mm_xor_si128(row2h, row3h);
1535  row2l = _mm_shuffle_epi8(row2l, r24);
1536  row2h = _mm_shuffle_epi8(row2h, r24);
1537 
1538  b0 = _mm_unpacklo_epi64(m5, m4);
1539  b1 = _mm_alignr_epi8(m3, m7, 8);
1540 
1541  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1542  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1543  row4l = _mm_xor_si128(row4l, row1l);
1544  row4h = _mm_xor_si128(row4h, row1h);
1545  row4l = _mm_shuffle_epi8(row4l, r16);
1546  row4h = _mm_shuffle_epi8(row4h, r16);
1547  row3l = _mm_add_epi64(row3l, row4l);
1548  row3h = _mm_add_epi64(row3h, row4h);
1549  row2l = _mm_xor_si128(row2l, row3l);
1550  row2h = _mm_xor_si128(row2h, row3h);
1551  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
1552  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
1553 
1554  t0 = _mm_alignr_epi8(row2h, row2l, 8);
1555  t1 = _mm_alignr_epi8(row2l, row2h, 8);
1556  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
1557  t0 = _mm_alignr_epi8(row4h, row4l, 8);
1558  t1 = _mm_alignr_epi8(row4l, row4h, 8);
1559  row4l = t1, row4h = t0;
1560 
1561  b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2));
1562  b1 = _mm_unpackhi_epi64(m5, m2);
1563 
1564  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1565  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1566  row4l = _mm_xor_si128(row4l, row1l);
1567  row4h = _mm_xor_si128(row4h, row1h);
1568  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
1569  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
1570  row3l = _mm_add_epi64(row3l, row4l);
1571  row3h = _mm_add_epi64(row3h, row4h);
1572  row2l = _mm_xor_si128(row2l, row3l);
1573  row2h = _mm_xor_si128(row2h, row3h);
1574  row2l = _mm_shuffle_epi8(row2l, r24);
1575  row2h = _mm_shuffle_epi8(row2h, r24);
1576 
1577  b0 = _mm_unpacklo_epi64(m6, m1);
1578  b1 = _mm_unpackhi_epi64(m3, m1);
1579 
1580  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1581  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1582  row4l = _mm_xor_si128(row4l, row1l);
1583  row4h = _mm_xor_si128(row4h, row1h);
1584  row4l = _mm_shuffle_epi8(row4l, r16);
1585  row4h = _mm_shuffle_epi8(row4h, r16);
1586  row3l = _mm_add_epi64(row3l, row4l);
1587  row3h = _mm_add_epi64(row3h, row4h);
1588  row2l = _mm_xor_si128(row2l, row3l);
1589  row2h = _mm_xor_si128(row2h, row3h);
1590  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
1591  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
1592 
1593  t0 = _mm_alignr_epi8(row2l, row2h, 8);
1594  t1 = _mm_alignr_epi8(row2h, row2l, 8);
1595  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
1596  t0 = _mm_alignr_epi8(row4l, row4h, 8);
1597  t1 = _mm_alignr_epi8(row4h, row4l, 8);
1598  row4l = t1, row4h = t0;
1599 
1600  row1l = _mm_xor_si128(row3l, row1l);
1601  row1h = _mm_xor_si128(row3h, row1h);
1602  _mm_storeu_si128(M128_CAST(&state.h[0]), _mm_xor_si128(_mm_loadu_si128(CONST_M128_CAST(&state.h[0])), row1l));
1603  _mm_storeu_si128(M128_CAST(&state.h[2]), _mm_xor_si128(_mm_loadu_si128(CONST_M128_CAST(&state.h[2])), row1h));
1604 
1605  row2l = _mm_xor_si128(row4l, row2l);
1606  row2h = _mm_xor_si128(row4h, row2h);
1607  _mm_storeu_si128(M128_CAST(&state.h[4]), _mm_xor_si128(_mm_loadu_si128(CONST_M128_CAST(&state.h[4])), row2l));
1608  _mm_storeu_si128(M128_CAST(&state.h[6]), _mm_xor_si128(_mm_loadu_si128(CONST_M128_CAST(&state.h[6])), row2h));
1609 }
1610 #endif // CRYPTOPP_SSE41_AVAILABLE
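// For reference, the SSE4.1 rounds above vectorize the standard BLAKE2b G
// function two columns (or diagonals) at a time, with each logical 4x64-bit
// state row split across an l/h register pair. The sketch below is
// illustrative only and is not used by the library; the helper names are
// ours. The 63-bit rotation is what the (x >> 63) ^ (x + x) idiom in the
// SIMD code implements (a rotate-left-by-1 equals a rotate-right-by-63).
#if 0
inline word64 ROTR64_SKETCH(word64 x, unsigned int c)
{
    return (x >> c) | (x << (64 - c));
}

inline void BLAKE2B_G_SKETCH(word64& a, word64& b, word64& c, word64& d,
                             word64 x, word64 y)
{
    a = a + b + x; d = ROTR64_SKETCH(d ^ a, 32);
    c = c + d;     b = ROTR64_SKETCH(b ^ c, 24);
    a = a + b + y; d = ROTR64_SKETCH(d ^ a, 16);
    c = c + d;     b = ROTR64_SKETCH(b ^ c, 63);
}
#endif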
1611 
1612 #if CRYPTOPP_ARM_NEON_AVAILABLE
1613 void BLAKE2_Compress32_NEON(const byte* input, BLAKE2_State<word32, false>& state)
1614 {
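 // The BLAKE2S_LOAD_MSG_<r>_<i> macros below assemble the four message-word
 // vectors consumed by round <r> of BLAKE2s; the word selection encodes the
 // SIGMA permutation for that round directly in NEON shuffles, so no table
 // lookup is needed at run time.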
1615  #define BLAKE2S_LOAD_MSG_0_1(buf) \
1616  do { uint32x2_t t0, t1; \
1617  t0 = vzip_u32(vget_low_u32(m0), vget_high_u32(m0)).val[0]; \
1618  t1 = vzip_u32(vget_low_u32(m1), vget_high_u32(m1)).val[0]; \
1619  buf = vcombine_u32(t0, t1); } while(0)
1620 
1621  #define BLAKE2S_LOAD_MSG_0_2(buf) \
1622  do { uint32x2_t t0, t1; \
1623  t0 = vzip_u32(vget_low_u32(m0), vget_high_u32(m0)).val[1]; \
1624  t1 = vzip_u32(vget_low_u32(m1), vget_high_u32(m1)).val[1]; \
1625  buf = vcombine_u32(t0, t1); } while(0)
1626 
1627  #define BLAKE2S_LOAD_MSG_0_3(buf) \
1628  do { uint32x2_t t0, t1; \
1629  t0 = vzip_u32(vget_low_u32(m2), vget_high_u32(m2)).val[0]; \
1630  t1 = vzip_u32(vget_low_u32(m3), vget_high_u32(m3)).val[0]; \
1631  buf = vcombine_u32(t0, t1); } while(0)
1632 
1633  #define BLAKE2S_LOAD_MSG_0_4(buf) \
1634  do { uint32x2_t t0, t1; \
1635  t0 = vzip_u32(vget_low_u32(m2), vget_high_u32(m2)).val[1]; \
1636  t1 = vzip_u32(vget_low_u32(m3), vget_high_u32(m3)).val[1]; \
1637  buf = vcombine_u32(t0, t1); } while(0)
1638 
1639  #define BLAKE2S_LOAD_MSG_1_1(buf) \
1640  do { uint32x2_t t0, t1; \
1641  t0 = vzip_u32(vget_high_u32(m3), vget_low_u32(m1)).val[0]; \
1642  t1 = vzip_u32(vget_low_u32(m2), vget_low_u32(m3)).val[1]; \
1643  buf = vcombine_u32(t0, t1); } while(0)
1644 
1645  #define BLAKE2S_LOAD_MSG_1_2(buf) \
1646  do { uint32x2_t t0, t1; \
1647  t0 = vzip_u32(vget_high_u32(m2), vget_low_u32(m2)).val[0]; \
1648  t1 = vext_u32(vget_high_u32(m3), vget_high_u32(m1), 1); \
1649  buf = vcombine_u32(t0, t1); } while(0)
1650 
1651  #define BLAKE2S_LOAD_MSG_1_3(buf) \
1652  do { uint32x2_t t0, t1; \
1653  t0 = vext_u32(vget_low_u32(m0), vget_low_u32(m0), 1); \
1654  t1 = vzip_u32(vget_high_u32(m2), vget_low_u32(m1)).val[1]; \
1655  buf = vcombine_u32(t0, t1); } while(0)
1656 
1657  #define BLAKE2S_LOAD_MSG_1_4(buf) \
1658  do { uint32x2_t t0, t1; \
1659  t0 = vzip_u32(vget_low_u32(m3), vget_high_u32(m0)).val[0]; \
1660  t1 = vzip_u32(vget_high_u32(m1), vget_high_u32(m0)).val[1]; \
1661  buf = vcombine_u32(t0, t1); } while(0)
1662 
1663  #define BLAKE2S_LOAD_MSG_2_1(buf) \
1664  do { uint32x2_t t0, t1; \
1665  t0 = vext_u32(vget_high_u32(m2), vget_low_u32(m3), 1); \
1666  t1 = vzip_u32(vget_low_u32(m1), vget_high_u32(m3)).val[1]; \
1667  buf = vcombine_u32(t0, t1); } while(0)
1668 
1669  #define BLAKE2S_LOAD_MSG_2_2(buf) \
1670  do { uint32x2_t t0, t1; \
1671  t0 = vzip_u32(vget_low_u32(m2), vget_low_u32(m0)).val[0]; \
1672  t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m0), vget_low_u32(m3)); \
1673  buf = vcombine_u32(t0, t1); } while(0)
1674 
1675  #define BLAKE2S_LOAD_MSG_2_3(buf) \
1676  do { uint32x2_t t0, t1; \
1677  t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m2), vget_high_u32(m0)); \
1678  t1 = vzip_u32(vget_high_u32(m1), vget_low_u32(m2)).val[1]; \
1679  buf = vcombine_u32(t0, t1); } while(0)
1680 
1681  #define BLAKE2S_LOAD_MSG_2_4(buf) \
1682  do { uint32x2_t t0, t1; \
1683  t0 = vzip_u32(vget_high_u32(m3), vget_high_u32(m1)).val[0]; \
1684  t1 = vext_u32(vget_low_u32(m0), vget_low_u32(m1), 1); \
1685  buf = vcombine_u32(t0, t1); } while(0)
1686 
1687  #define BLAKE2S_LOAD_MSG_3_1(buf) \
1688  do { uint32x2_t t0, t1; \
1689  t0 = vzip_u32(vget_high_u32(m1), vget_high_u32(m0)).val[1]; \
1690  t1 = vzip_u32(vget_low_u32(m3), vget_high_u32(m2)).val[1]; \
1691  buf = vcombine_u32(t0, t1); } while(0)
1692 
1693  #define BLAKE2S_LOAD_MSG_3_2(buf) \
1694  do { uint32x2_t t0, t1; \
1695  t0 = vzip_u32(vget_low_u32(m2), vget_low_u32(m0)).val[1]; \
1696  t1 = vzip_u32(vget_low_u32(m3), vget_high_u32(m3)).val[0]; \
1697  buf = vcombine_u32(t0, t1); } while(0)
1698 
1699  #define BLAKE2S_LOAD_MSG_3_3(buf) \
1700  do { uint32x2_t t0, t1; \
1701  t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m0), vget_low_u32(m1)); \
1702  t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m1), vget_high_u32(m3)); \
1703  buf = vcombine_u32(t0, t1); } while(0)
1704 
1705  #define BLAKE2S_LOAD_MSG_3_4(buf) \
1706  do { uint32x2_t t0, t1; \
1707  t0 = vzip_u32(vget_high_u32(m1), vget_high_u32(m2)).val[0]; \
1708  t1 = vzip_u32(vget_low_u32(m0), vget_low_u32(m2)).val[0]; \
1709  buf = vcombine_u32(t0, t1); } while(0)
1710 
1711  #define BLAKE2S_LOAD_MSG_4_1(buf) \
1712  do { uint32x2_t t0, t1; \
1713  t0 = vzip_u32(vget_low_u32(m2), vget_low_u32(m1)).val[1]; \
1714  t1 = vzip_u32((vget_high_u32(m0)), vget_high_u32(m2)).val[0]; \
1715  buf = vcombine_u32(t0, t1); } while(0)
1716 
1717  #define BLAKE2S_LOAD_MSG_4_2(buf) \
1718  do { uint32x2_t t0, t1; \
1719  t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m0), vget_high_u32(m1)); \
1720  t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m1), vget_high_u32(m3)); \
1721  buf = vcombine_u32(t0, t1); } while(0)
1722 
1723  #define BLAKE2S_LOAD_MSG_4_3(buf) \
1724  do { uint32x2_t t0, t1; \
1725  t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m3), vget_high_u32(m2)); \
1726  t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m1), vget_high_u32(m0)); \
1727  buf = vcombine_u32(t0, t1); } while(0)
1728 
1729  #define BLAKE2S_LOAD_MSG_4_4(buf) \
1730  do { uint32x2_t t0, t1; \
1731  t0 = vext_u32(vget_low_u32(m0), vget_low_u32(m3), 1); \
1732  t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m2), vget_low_u32(m3)); \
1733  buf = vcombine_u32(t0, t1); } while(0)
1734 
1735  #define BLAKE2S_LOAD_MSG_5_1(buf) \
1736  do { uint32x2_t t0, t1; \
1737  t0 = vzip_u32((vget_high_u32(m0)), vget_high_u32(m1)).val[0]; \
1738  t1 = vzip_u32(vget_low_u32(m0), vget_low_u32(m2)).val[0]; \
1739  buf = vcombine_u32(t0, t1); } while(0)
1740 
1741  #define BLAKE2S_LOAD_MSG_5_2(buf) \
1742  do { uint32x2_t t0, t1; \
1743  t0 = vzip_u32(vget_low_u32(m3), vget_high_u32(m2)).val[0]; \
1744  t1 = vzip_u32(vget_high_u32(m2), vget_high_u32(m0)).val[1]; \
1745  buf = vcombine_u32(t0, t1); } while(0)
1746 
1747  #define BLAKE2S_LOAD_MSG_5_3(buf) \
1748  do { uint32x2_t t0, t1; \
1749  t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m1), vget_high_u32(m1)); \
1750  t1 = vzip_u32(vget_high_u32(m3), vget_low_u32(m0)).val[1]; \
1751  buf = vcombine_u32(t0, t1); } while(0)
1752 
1753  #define BLAKE2S_LOAD_MSG_5_4(buf) \
1754  do { uint32x2_t t0, t1; \
1755  t0 = vzip_u32(vget_low_u32(m3), vget_low_u32(m1)).val[1]; \
1756  t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m3), vget_low_u32(m2)); \
1757  buf = vcombine_u32(t0, t1); } while(0)
1758 
1759  #define BLAKE2S_LOAD_MSG_6_1(buf) \
1760  do { uint32x2_t t0, t1; \
1761  t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m3), vget_low_u32(m0)); \
1762  t1 = vzip_u32(vget_high_u32(m3), vget_low_u32(m1)).val[0]; \
1763  buf = vcombine_u32(t0, t1); } while(0)
1764 
1765  #define BLAKE2S_LOAD_MSG_6_2(buf) \
1766  do { uint32x2_t t0, t1; \
1767  t0 = vzip_u32(vget_low_u32(m1), vget_high_u32(m3)).val[1]; \
1768  t1 = vext_u32(vget_low_u32(m3), vget_high_u32(m2), 1); \
1769  buf = vcombine_u32(t0, t1); } while(0)
1770 
1771  #define BLAKE2S_LOAD_MSG_6_3(buf) \
1772  do { uint32x2_t t0, t1; \
1773  t0 = vzip_u32(vget_low_u32(m0), vget_high_u32(m1)).val[0]; \
1774  t1 = vext_u32(vget_low_u32(m2), vget_low_u32(m2), 1); \
1775  buf = vcombine_u32(t0, t1); } while(0)
1776 
1777  #define BLAKE2S_LOAD_MSG_6_4(buf) \
1778  do { uint32x2_t t0, t1; \
1779  t0 = vzip_u32(vget_high_u32(m1), vget_high_u32(m0)).val[1]; \
1780  t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m0), vget_high_u32(m2)); \
1781  buf = vcombine_u32(t0, t1); } while(0)
1782 
1783  #define BLAKE2S_LOAD_MSG_7_1(buf) \
1784  do { uint32x2_t t0, t1; \
1785  t0 = vzip_u32(vget_low_u32(m3), vget_high_u32(m1)).val[1]; \
1786  t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m3), vget_high_u32(m0)); \
1787  buf = vcombine_u32(t0, t1); } while(0)
1788 
1789  #define BLAKE2S_LOAD_MSG_7_2(buf) \
1790  do { uint32x2_t t0, t1; \
1791  t0 = vext_u32(vget_high_u32(m2), vget_high_u32(m3), 1); \
1792  t1 = vzip_u32(vget_low_u32(m0), vget_low_u32(m2)).val[1]; \
1793  buf = vcombine_u32(t0, t1); } while(0)
1794 
1795  #define BLAKE2S_LOAD_MSG_7_3(buf) \
1796  do { uint32x2_t t0, t1; \
1797  t0 = vzip_u32(vget_low_u32(m1), vget_high_u32(m3)).val[1]; \
1798  t1 = vzip_u32(vget_low_u32(m2), vget_high_u32(m0)).val[0]; \
1799  buf = vcombine_u32(t0, t1); } while(0)
1800 
1801  #define BLAKE2S_LOAD_MSG_7_4(buf) \
1802  do { uint32x2_t t0, t1; \
1803  t0 = vzip_u32(vget_low_u32(m0), vget_low_u32(m1)).val[0]; \
1804  t1 = vzip_u32(vget_high_u32(m1), vget_high_u32(m2)).val[0]; \
1805  buf = vcombine_u32(t0, t1); } while(0)
1806 
1807  #define BLAKE2S_LOAD_MSG_8_1(buf) \
1808  do { uint32x2_t t0, t1; \
1809  t0 = vzip_u32(vget_high_u32(m1), vget_high_u32(m3)).val[0]; \
1810  t1 = vext_u32(vget_high_u32(m2), vget_low_u32(m0), 1); \
1811  buf = vcombine_u32(t0, t1); } while(0)
1812 
1813  #define BLAKE2S_LOAD_MSG_8_2(buf) \
1814  do { uint32x2_t t0, t1; \
1815  t0 = vzip_u32(vget_high_u32(m3), vget_low_u32(m2)).val[1]; \
1816  t1 = vext_u32(vget_high_u32(m0), vget_low_u32(m2), 1); \
1817  buf = vcombine_u32(t0, t1); } while(0)
1818 
1819  #define BLAKE2S_LOAD_MSG_8_3(buf) \
1820  do { uint32x2_t t0, t1; \
1821  t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m3), vget_low_u32(m3)); \
1822  t1 = vext_u32(vget_low_u32(m0), vget_high_u32(m2), 1); \
1823  buf = vcombine_u32(t0, t1); } while(0)
1824 
1825  #define BLAKE2S_LOAD_MSG_8_4(buf) \
1826  do { uint32x2_t t0, t1; \
1827  t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m0), vget_high_u32(m1)); \
1828  t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m1), vget_low_u32(m1)); \
1829  buf = vcombine_u32(t0, t1); } while(0)
1830 
1831  #define BLAKE2S_LOAD_MSG_9_1(buf) \
1832  do { uint32x2_t t0, t1; \
1833  t0 = vzip_u32(vget_high_u32(m2), vget_low_u32(m2)).val[0]; \
1834  t1 = vzip_u32(vget_high_u32(m1), vget_low_u32(m0)).val[1]; \
1835  buf = vcombine_u32(t0, t1); } while(0)
1836 
1837  #define BLAKE2S_LOAD_MSG_9_2(buf) \
1838  do { uint32x2_t t0, t1; \
1839  t0 = vzip_u32((vget_high_u32(m0)), vget_low_u32(m1)).val[0]; \
1840  t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m1), vget_low_u32(m1)); \
1841  buf = vcombine_u32(t0, t1); } while(0)
1842 
1843  #define BLAKE2S_LOAD_MSG_9_3(buf) \
1844  do { uint32x2_t t0, t1; \
1845  t0 = vzip_u32(vget_high_u32(m3), vget_low_u32(m2)).val[1]; \
1846  t1 = vzip_u32((vget_high_u32(m0)), vget_low_u32(m3)).val[1]; \
1847  buf = vcombine_u32(t0, t1); } while(0)
1848 
1849  #define BLAKE2S_LOAD_MSG_9_4(buf) \
1850  do { uint32x2_t t0, t1; \
1851  t0 = vext_u32(vget_high_u32(m2), vget_high_u32(m3), 1); \
1852  t1 = vzip_u32(vget_low_u32(m3), vget_low_u32(m0)).val[0]; \
1853  buf = vcombine_u32(t0, t1); } while(0)
1854 
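 // Rotate-right helpers for the BLAKE2s G function. Rotation by 16 is a
 // 16-bit element reversal within each 32-bit lane, rotation by 8 is a
 // shift-left-by-24 followed by a shift-right-and-insert, and the generic
 // helper is the usual (x << (32-c)) | (x >> c) built from VSHL/VSRI.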
1855  #define vrorq_n_u32_16(x) vreinterpretq_u32_u16(vrev32q_u16(vreinterpretq_u16_u32(x)))
1856 
1857  #define vrorq_n_u32_8(x) vsriq_n_u32(vshlq_n_u32((x), 24), (x), 8)
1858 
1859  #define vrorq_n_u32(x, c) vsriq_n_u32(vshlq_n_u32((x), 32-(c)), (x), (c))
1860 
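 // BLAKE2S_G1/BLAKE2S_G2 are the two halves of the BLAKE2s G function
 // (rotations by 16 and 12, then by 8 and 7), applied to all four columns at
 // once. BLAKE2S_DIAGONALIZE rotates rows 2-4 so the same code then operates
 // on the diagonals; BLAKE2S_UNDIAGONALIZE restores the column order.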
1861  #define BLAKE2S_G1(row1,row2,row3,row4,buf) \
1862  do { \
1863  row1 = vaddq_u32(vaddq_u32(row1, buf), row2); row4 = veorq_u32(row4, row1); \
1864  row4 = vrorq_n_u32_16(row4); row3 = vaddq_u32(row3, row4); \
1865  row2 = veorq_u32(row2, row3); row2 = vrorq_n_u32(row2, 12); \
1866  } while(0)
1867 
1868  #define BLAKE2S_G2(row1,row2,row3,row4,buf) \
1869  do { \
1870  row1 = vaddq_u32(vaddq_u32(row1, buf), row2); row4 = veorq_u32(row4, row1); \
1871  row4 = vrorq_n_u32_8(row4); row3 = vaddq_u32(row3, row4); \
1872  row2 = veorq_u32(row2, row3); row2 = vrorq_n_u32(row2, 7); \
1873  } while(0)
1874 
1875  #define BLAKE2S_DIAGONALIZE(row1,row2,row3,row4) \
1876  do { \
1877  row4 = vextq_u32(row4, row4, 3); row3 = vextq_u32(row3, row3, 2); row2 = vextq_u32(row2, row2, 1); \
1878  } while(0)
1879 
1880  #define BLAKE2S_UNDIAGONALIZE(row1,row2,row3,row4) \
1881  do { \
1882  row4 = vextq_u32(row4, row4, 1); \
1883  row3 = vextq_u32(row3, row3, 2); \
1884  row2 = vextq_u32(row2, row2, 3); \
1885  } while(0)
1886 
1887  #define BLAKE2S_ROUND(r) \
1888  do { \
1889  uint32x4_t buf1, buf2, buf3, buf4; \
1890  BLAKE2S_LOAD_MSG_ ##r ##_1(buf1); \
1891  BLAKE2S_G1(row1,row2,row3,row4,buf1); \
1892  BLAKE2S_LOAD_MSG_ ##r ##_2(buf2); \
1893  BLAKE2S_G2(row1,row2,row3,row4,buf2); \
1894  BLAKE2S_DIAGONALIZE(row1,row2,row3,row4); \
1895  BLAKE2S_LOAD_MSG_ ##r ##_3(buf3); \
1896  BLAKE2S_G1(row1,row2,row3,row4,buf3); \
1897  BLAKE2S_LOAD_MSG_ ##r ##_4(buf4); \
1898  BLAKE2S_G2(row1,row2,row3,row4,buf4); \
1899  BLAKE2S_UNDIAGONALIZE(row1,row2,row3,row4); \
1900  } while(0)
1901 
1902  CRYPTOPP_ASSERT(IsAlignedOn(&state.h[0],GetAlignmentOf<uint32x4_t>()));
1903  CRYPTOPP_ASSERT(IsAlignedOn(&state.t[0],GetAlignmentOf<uint32x4_t>()));
1904  CRYPTOPP_ASSERT(IsAlignedOn(&state.f[0],GetAlignmentOf<uint32x4_t>()));
1905 
1906  const uint32x4_t m0 = vreinterpretq_u32_u8(vld1q_u8((input + 00)));
1907  const uint32x4_t m1 = vreinterpretq_u32_u8(vld1q_u8((input + 16)));
1908  const uint32x4_t m2 = vreinterpretq_u32_u8(vld1q_u8((input + 32)));
1909  const uint32x4_t m3 = vreinterpretq_u32_u8(vld1q_u8((input + 48)));
1910 
1911  uint32x4_t row1, row2, row3, row4;
1912 
1913  const uint32x4_t f0 = row1 = vld1q_u32(&state.h[0]);
1914  const uint32x4_t f1 = row2 = vld1q_u32(&state.h[4]);
1915  row3 = vld1q_u32(&BLAKE2S_IV[0]);
1916  row4 = veorq_u32(vld1q_u32(&BLAKE2S_IV[4]), vld1q_u32(&state.t[0]));
1917 
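 // Ten rounds of BLAKE2s. Each round applies a different message word
 // permutation (SIGMA), which is baked into the BLAKE2S_LOAD_MSG_<r>_<i>
 // macros above.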
1918  BLAKE2S_ROUND(0);
1919  BLAKE2S_ROUND(1);
1920  BLAKE2S_ROUND(2);
1921  BLAKE2S_ROUND(3);
1922  BLAKE2S_ROUND(4);
1923  BLAKE2S_ROUND(5);
1924  BLAKE2S_ROUND(6);
1925  BLAKE2S_ROUND(7);
1926  BLAKE2S_ROUND(8);
1927  BLAKE2S_ROUND(9);
1928 
1929  vst1q_u32(&state.h[0], veorq_u32(f0, veorq_u32(row1, row3)));
1930  vst1q_u32(&state.h[4], veorq_u32(f1, veorq_u32(row2, row4)));
1931 }
1932 
1933 void BLAKE2_Compress64_NEON(const byte* input, BLAKE2_State<word64, true>& state)
1934 {
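 // The BLAKE2B_LOAD_MSG_<r>_<i> macros below assemble the message-word pairs
 // (b0, b1) consumed by quarter <i> of round <r> of BLAKE2b; the word
 // selection encodes the SIGMA permutation for that round in NEON shuffles.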
1935  #define BLAKE2B_LOAD_MSG_0_1(b0, b1) \
1936  do { b0 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m1)); b1 = vcombine_u64(vget_low_u64(m2), vget_low_u64(m3)); } while(0)
1937 
1938  #define BLAKE2B_LOAD_MSG_0_2(b0, b1) \
1939  do { b0 = vcombine_u64(vget_high_u64(m0), vget_high_u64(m1)); b1 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m3)); } while(0)
1940 
1941  #define BLAKE2B_LOAD_MSG_0_3(b0, b1) \
1942  do { b0 = vcombine_u64(vget_low_u64(m4), vget_low_u64(m5)); b1 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m7)); } while(0)
1943 
1944  #define BLAKE2B_LOAD_MSG_0_4(b0, b1) \
1945  do { b0 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m5)); b1 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m7)); } while(0)
1946 
1947  #define BLAKE2B_LOAD_MSG_1_1(b0, b1) \
1948  do { b0 = vcombine_u64(vget_low_u64(m7), vget_low_u64(m2)); b1 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m6)); } while(0)
1949 
1950  #define BLAKE2B_LOAD_MSG_1_2(b0, b1) \
1951  do { b0 = vcombine_u64(vget_low_u64(m5), vget_low_u64(m4)); b1 = vextq_u64(m7, m3, 1); } while(0)
1952 
1953  #define BLAKE2B_LOAD_MSG_1_3(b0, b1) \
1954  do { b0 = vextq_u64(m0, m0, 1); b1 = vcombine_u64(vget_high_u64(m5), vget_high_u64(m2)); } while(0)
1955 
1956  #define BLAKE2B_LOAD_MSG_1_4(b0, b1) \
1957  do { b0 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m1)); b1 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m1)); } while(0)
1958 
1959  #define BLAKE2B_LOAD_MSG_2_1(b0, b1) \
1960  do { b0 = vextq_u64(m5, m6, 1); b1 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m7)); } while(0)
1961 
1962  #define BLAKE2B_LOAD_MSG_2_2(b0, b1) \
1963  do { b0 = vcombine_u64(vget_low_u64(m4), vget_low_u64(m0)); b1 = vcombine_u64(vget_low_u64(m1), vget_high_u64(m6)); } while(0)
1964 
1965  #define BLAKE2B_LOAD_MSG_2_3(b0, b1) \
1966  do { b0 = vcombine_u64(vget_low_u64(m5), vget_high_u64(m1)); b1 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m4)); } while(0)
1967 
1968  #define BLAKE2B_LOAD_MSG_2_4(b0, b1) \
1969  do { b0 = vcombine_u64(vget_low_u64(m7), vget_low_u64(m3)); b1 = vextq_u64(m0, m2, 1); } while(0)
1970 
1971  #define BLAKE2B_LOAD_MSG_3_1(b0, b1) \
1972  do { b0 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m1)); b1 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m5)); } while(0)
1973 
1974  #define BLAKE2B_LOAD_MSG_3_2(b0, b1) \
1975  do { b0 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m0)); b1 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m7)); } while(0)
1976 
1977  #define BLAKE2B_LOAD_MSG_3_3(b0, b1) \
1978  do { b0 = vcombine_u64(vget_low_u64(m1), vget_high_u64(m2)); b1 = vcombine_u64(vget_low_u64(m2), vget_high_u64(m7)); } while(0)
1979 
1980  #define BLAKE2B_LOAD_MSG_3_4(b0, b1) \
1981  do { b0 = vcombine_u64(vget_low_u64(m3), vget_low_u64(m5)); b1 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m4)); } while(0)
1982 
1983  #define BLAKE2B_LOAD_MSG_4_1(b0, b1) \
1984  do { b0 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m2)); b1 = vcombine_u64(vget_low_u64(m1), vget_low_u64(m5)); } while(0)
1985 
1986  #define BLAKE2B_LOAD_MSG_4_2(b0, b1) \
1987  do { b0 = vcombine_u64(vget_low_u64(m0), vget_high_u64(m3)); b1 = vcombine_u64(vget_low_u64(m2), vget_high_u64(m7)); } while(0)
1988 
1989  #define BLAKE2B_LOAD_MSG_4_3(b0, b1) \
1990  do { b0 = vcombine_u64(vget_low_u64(m7), vget_high_u64(m5)); b1 = vcombine_u64(vget_low_u64(m3), vget_high_u64(m1)); } while(0)
1991 
1992  #define BLAKE2B_LOAD_MSG_4_4(b0, b1) \
1993  do { b0 = vextq_u64(m0, m6, 1); b1 = vcombine_u64(vget_low_u64(m4), vget_high_u64(m6)); } while(0)
1994 
1995  #define BLAKE2B_LOAD_MSG_5_1(b0, b1) \
1996  do { b0 = vcombine_u64(vget_low_u64(m1), vget_low_u64(m3)); b1 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m4)); } while(0)
1997 
1998  #define BLAKE2B_LOAD_MSG_5_2(b0, b1) \
1999  do { b0 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m5)); b1 = vcombine_u64(vget_high_u64(m5), vget_high_u64(m1)); } while(0)
2000 
2001  #define BLAKE2B_LOAD_MSG_5_3(b0, b1) \
2002  do { b0 = vcombine_u64(vget_low_u64(m2), vget_high_u64(m3)); b1 = vcombine_u64(vget_high_u64(m7), vget_high_u64(m0)); } while(0)
2003 
2004  #define BLAKE2B_LOAD_MSG_5_4(b0, b1) \
2005  do { b0 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m2)); b1 = vcombine_u64(vget_low_u64(m7), vget_high_u64(m4)); } while(0)
2006 
2007  #define BLAKE2B_LOAD_MSG_6_1(b0, b1) \
2008  do { b0 = vcombine_u64(vget_low_u64(m6), vget_high_u64(m0)); b1 = vcombine_u64(vget_low_u64(m7), vget_low_u64(m2)); } while(0)
2009 
2010  #define BLAKE2B_LOAD_MSG_6_2(b0, b1) \
2011  do { b0 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m7)); b1 = vextq_u64(m6, m5, 1); } while(0)
2012 
2013  #define BLAKE2B_LOAD_MSG_6_3(b0, b1) \
2014  do { b0 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m3)); b1 = vextq_u64(m4, m4, 1); } while(0)
2015 
2016  #define BLAKE2B_LOAD_MSG_6_4(b0, b1) \
2017  do { b0 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m1)); b1 = vcombine_u64(vget_low_u64(m1), vget_high_u64(m5)); } while(0)
2018 
2019  #define BLAKE2B_LOAD_MSG_7_1(b0, b1) \
2020  do { b0 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m3)); b1 = vcombine_u64(vget_low_u64(m6), vget_high_u64(m1)); } while(0)
2021 
2022  #define BLAKE2B_LOAD_MSG_7_2(b0, b1) \
2023  do { b0 = vextq_u64(m5, m7, 1); b1 = vcombine_u64(vget_high_u64(m0), vget_high_u64(m4)); } while(0)
2024 
2025  #define BLAKE2B_LOAD_MSG_7_3(b0, b1) \
2026  do { b0 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m7)); b1 = vcombine_u64(vget_low_u64(m4), vget_low_u64(m1)); } while(0)
2027 
2028  #define BLAKE2B_LOAD_MSG_7_4(b0, b1) \
2029  do { b0 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m2)); b1 = vcombine_u64(vget_low_u64(m3), vget_low_u64(m5)); } while(0)
2030 
2031  #define BLAKE2B_LOAD_MSG_8_1(b0, b1) \
2032  do { b0 = vcombine_u64(vget_low_u64(m3), vget_low_u64(m7)); b1 = vextq_u64(m5, m0, 1); } while(0)
2033 
2034  #define BLAKE2B_LOAD_MSG_8_2(b0, b1) \
2035  do { b0 = vcombine_u64(vget_high_u64(m7), vget_high_u64(m4)); b1 = vextq_u64(m1, m4, 1); } while(0)
2036 
2037  #define BLAKE2B_LOAD_MSG_8_3(b0, b1) \
2038  do { b0 = m6; b1 = vextq_u64(m0, m5, 1); } while(0)
2039 
2040  #define BLAKE2B_LOAD_MSG_8_4(b0, b1) \
2041  do { b0 = vcombine_u64(vget_low_u64(m1), vget_high_u64(m3)); b1 = m2; } while(0)
2042 
2043  #define BLAKE2B_LOAD_MSG_9_1(b0, b1) \
2044  do { b0 = vcombine_u64(vget_low_u64(m5), vget_low_u64(m4)); b1 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m0)); } while(0)
2045 
2046  #define BLAKE2B_LOAD_MSG_9_2(b0, b1) \
2047  do { b0 = vcombine_u64(vget_low_u64(m1), vget_low_u64(m2)); b1 = vcombine_u64(vget_low_u64(m3), vget_high_u64(m2)); } while(0)
2048 
2049  #define BLAKE2B_LOAD_MSG_9_3(b0, b1) \
2050  do { b0 = vcombine_u64(vget_high_u64(m7), vget_high_u64(m4)); b1 = vcombine_u64(vget_high_u64(m1), vget_high_u64(m6)); } while(0)
2051 
2052  #define BLAKE2B_LOAD_MSG_9_4(b0, b1) \
2053  do { b0 = vextq_u64(m5, m7, 1); b1 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m0)); } while(0)
2054 
2055  #define BLAKE2B_LOAD_MSG_10_1(b0, b1) \
2056  do { b0 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m1)); b1 = vcombine_u64(vget_low_u64(m2), vget_low_u64(m3)); } while(0)
2057 
2058  #define BLAKE2B_LOAD_MSG_10_2(b0, b1) \
2059  do { b0 = vcombine_u64(vget_high_u64(m0), vget_high_u64(m1)); b1 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m3)); } while(0)
2060 
2061  #define BLAKE2B_LOAD_MSG_10_3(b0, b1) \
2062  do { b0 = vcombine_u64(vget_low_u64(m4), vget_low_u64(m5)); b1 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m7)); } while(0)
2063 
2064  #define BLAKE2B_LOAD_MSG_10_4(b0, b1) \
2065  do { b0 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m5)); b1 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m7)); } while(0)
2066 
2067  #define BLAKE2B_LOAD_MSG_11_1(b0, b1) \
2068  do { b0 = vcombine_u64(vget_low_u64(m7), vget_low_u64(m2)); b1 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m6)); } while(0)
2069 
2070  #define BLAKE2B_LOAD_MSG_11_2(b0, b1) \
2071  do { b0 = vcombine_u64(vget_low_u64(m5), vget_low_u64(m4)); b1 = vextq_u64(m7, m3, 1); } while(0)
2072 
2073  #define BLAKE2B_LOAD_MSG_11_3(b0, b1) \
2074  do { b0 = vextq_u64(m0, m0, 1); b1 = vcombine_u64(vget_high_u64(m5), vget_high_u64(m2)); } while(0)
2075 
2076  #define BLAKE2B_LOAD_MSG_11_4(b0, b1) \
2077  do { b0 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m1)); b1 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m1)); } while(0)
2078 
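 // Rotate-right helpers for the BLAKE2b G function. Rotation by 32 is a
 // 32-bit element reversal within each 64-bit lane, rotations by 24 and 16
 // are byte rotations of each 64-bit half built with VEXT, and rotation by
 // 63 is (x << 1) ^ (x >> 63), i.e. a rotate-left-by-1.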
2079  #define vrorq_n_u64_32(x) vreinterpretq_u64_u32(vrev64q_u32(vreinterpretq_u32_u64((x))))
2080 
2081  #define vrorq_n_u64_24(x) vcombine_u64(\
2082  vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_low_u64(x)), vreinterpret_u8_u64(vget_low_u64(x)), 3)), \
2083  vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_high_u64(x)), vreinterpret_u8_u64(vget_high_u64(x)), 3)))
2084 
2085  #define vrorq_n_u64_16(x) vcombine_u64(\
2086  vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_low_u64(x)), vreinterpret_u8_u64(vget_low_u64(x)), 2)), \
2087  vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_high_u64(x)), vreinterpret_u8_u64(vget_high_u64(x)), 2)))
2088 
2089  #define vrorq_n_u64_63(x) veorq_u64(vaddq_u64(x, x), vshrq_n_u64(x, 63))
2090 
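 // BLAKE2B_G1/BLAKE2B_G2 are the two halves of the BLAKE2b G function
 // (rotations by 32 and 24, then by 16 and 63), applied to all four columns
 // at once, two per 128-bit register; each logical 4x64-bit row is split
 // across an l/h register pair. BLAKE2B_DIAGONALIZE shifts rows 2-4 across
 // the pairs so the same code then operates on the diagonals;
 // BLAKE2B_UNDIAGONALIZE reverses the shuffle.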
2091  #define BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
2092  do { \
2093  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l); \
2094  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h); \
2095  row4l = veorq_u64(row4l, row1l); row4h = veorq_u64(row4h, row1h); \
2096  row4l = vrorq_n_u64_32(row4l); row4h = vrorq_n_u64_32(row4h); \
2097  row3l = vaddq_u64(row3l, row4l); row3h = vaddq_u64(row3h, row4h); \
2098  row2l = veorq_u64(row2l, row3l); row2h = veorq_u64(row2h, row3h); \
2099  row2l = vrorq_n_u64_24(row2l); row2h = vrorq_n_u64_24(row2h); \
2100  } while(0)
2101 
2102  #define BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
2103  do { \
2104  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l); \
2105  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h); \
2106  row4l = veorq_u64(row4l, row1l); row4h = veorq_u64(row4h, row1h); \
2107  row4l = vrorq_n_u64_16(row4l); row4h = vrorq_n_u64_16(row4h); \
2108  row3l = vaddq_u64(row3l, row4l); row3h = vaddq_u64(row3h, row4h); \
2109  row2l = veorq_u64(row2l, row3l); row2h = veorq_u64(row2h, row3h); \
2110  row2l = vrorq_n_u64_63(row2l); row2h = vrorq_n_u64_63(row2h); \
2111  } while(0)
2112 
2113  #define BLAKE2B_DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
2114  do { \
2115  uint64x2_t t0 = vextq_u64(row2l, row2h, 1); \
2116  uint64x2_t t1 = vextq_u64(row2h, row2l, 1); \
2117  row2l = t0; row2h = t1; t0 = row3l; row3l = row3h; row3h = t0; \
2118  t0 = vextq_u64(row4h, row4l, 1); t1 = vextq_u64(row4l, row4h, 1); \
2119  row4l = t0; row4h = t1; \
2120  } while(0)
2121 
2122  #define BLAKE2B_UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
2123  do { \
2124  uint64x2_t t0 = vextq_u64(row2h, row2l, 1); \
2125  uint64x2_t t1 = vextq_u64(row2l, row2h, 1); \
2126  row2l = t0; row2h = t1; t0 = row3l; row3l = row3h; row3h = t0; \
2127  t0 = vextq_u64(row4l, row4h, 1); t1 = vextq_u64(row4h, row4l, 1); \
2128  row4l = t0; row4h = t1; \
2129  } while(0)
2130 
2131  #define BLAKE2B_ROUND(r) \
2132  do { \
2133  uint64x2_t b0, b1; \
2134  BLAKE2B_LOAD_MSG_ ##r ##_1(b0, b1); \
2135  BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
2136  BLAKE2B_LOAD_MSG_ ##r ##_2(b0, b1); \
2137  BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
2138  BLAKE2B_DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
2139  BLAKE2B_LOAD_MSG_ ##r ##_3(b0, b1); \
2140  BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
2141  BLAKE2B_LOAD_MSG_ ##r ##_4(b0, b1); \
2142  BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
2143  BLAKE2B_UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
2144  } while(0)
2145 
2146  CRYPTOPP_ASSERT(IsAlignedOn(&state.h[0],GetAlignmentOf<uint64x2_t>()));
2147  CRYPTOPP_ASSERT(IsAlignedOn(&state.t[0],GetAlignmentOf<uint64x2_t>()));
2148  CRYPTOPP_ASSERT(IsAlignedOn(&state.f[0],GetAlignmentOf<uint64x2_t>()));
2149 
2150  const uint64x2_t m0 = vreinterpretq_u64_u8(vld1q_u8(input + 00));
2151  const uint64x2_t m1 = vreinterpretq_u64_u8(vld1q_u8(input + 16));
2152  const uint64x2_t m2 = vreinterpretq_u64_u8(vld1q_u8(input + 32));
2153  const uint64x2_t m3 = vreinterpretq_u64_u8(vld1q_u8(input + 48));
2154  const uint64x2_t m4 = vreinterpretq_u64_u8(vld1q_u8(input + 64));
2155  const uint64x2_t m5 = vreinterpretq_u64_u8(vld1q_u8(input + 80));
2156  const uint64x2_t m6 = vreinterpretq_u64_u8(vld1q_u8(input + 96));
2157  const uint64x2_t m7 = vreinterpretq_u64_u8(vld1q_u8(input + 112));
2158 
2159  uint64x2_t row1l, row1h, row2l, row2h;
2160  uint64x2_t row3l, row3h, row4l, row4h;
2161 
2162  const uint64x2_t h0 = row1l = vld1q_u64(&state.h[0]);
2163  const uint64x2_t h1 = row1h = vld1q_u64(&state.h[2]);
2164  const uint64x2_t h2 = row2l = vld1q_u64(&state.h[4]);
2165  const uint64x2_t h3 = row2h = vld1q_u64(&state.h[6]);
2166 
2167  row3l = vld1q_u64(&BLAKE2B_IV[0]);
2168  row3h = vld1q_u64(&BLAKE2B_IV[2]);
2169  row4l = veorq_u64(vld1q_u64(&BLAKE2B_IV[4]), vld1q_u64(&state.t[0]));
2170  row4h = veorq_u64(vld1q_u64(&BLAKE2B_IV[6]), vld1q_u64(&state.f[0]));
2171 
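 // Twelve rounds of BLAKE2b. Rounds 10 and 11 reuse the message permutations
 // of rounds 0 and 1, which is why BLAKE2B_LOAD_MSG_10_* and
 // BLAKE2B_LOAD_MSG_11_* above duplicate BLAKE2B_LOAD_MSG_0_* and
 // BLAKE2B_LOAD_MSG_1_*.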
2172  BLAKE2B_ROUND(0);
2173  BLAKE2B_ROUND(1);
2174  BLAKE2B_ROUND(2);
2175  BLAKE2B_ROUND(3);
2176  BLAKE2B_ROUND(4);
2177  BLAKE2B_ROUND(5);
2178  BLAKE2B_ROUND(6);
2179  BLAKE2B_ROUND(7);
2180  BLAKE2B_ROUND(8);
2181  BLAKE2B_ROUND(9);
2182  BLAKE2B_ROUND(10);
2183  BLAKE2B_ROUND(11);
2184 
2185  vst1q_u64(&state.h[0], veorq_u64(h0, veorq_u64(row1l, row3l)));
2186  vst1q_u64(&state.h[2], veorq_u64(h1, veorq_u64(row1h, row3h)));
2187  vst1q_u64(&state.h[4], veorq_u64(h2, veorq_u64(row2l, row4l)));
2188  vst1q_u64(&state.h[6], veorq_u64(h3, veorq_u64(row2h, row4h)));
2189 }
2190 #endif // CRYPTOPP_ARM_NEON_AVAILABLE
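// These compression routines are selected at run time; the selection itself
// lives in blake2.cpp, not in this file. The snippet below is only a
// simplified sketch of that kind of CPU-feature dispatch; the typedef and
// helper name are ours, and the fallback is the portable C++ implementation
// in blake2.cpp.
#if 0
typedef void (*BLAKE2S_CompressFn)(const byte*, BLAKE2_State<word32, false>&);

static BLAKE2S_CompressFn SelectCompress32Sketch()
{
#if CRYPTOPP_SSE41_AVAILABLE
    if (HasSSE41())
        return &BLAKE2_Compress32_SSE4;
#endif
#if CRYPTOPP_ARM_NEON_AVAILABLE
    if (HasNEON())
        return &BLAKE2_Compress32_NEON;
#endif
    return 0;  // caller falls back to the portable C++ compression
}
#endif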
2191 
2192 NAMESPACE_END