Crypto++  7.0
Free C++ class library of cryptographic schemes
salsa.cpp
1 // salsa.cpp - originally written and placed in the public domain by Wei Dai
2 
3 // use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM salsa.cpp" to generate MASM code
4 
5 #include "pch.h"
6 #include "config.h"
7 
8 #ifndef CRYPTOPP_GENERATE_X64_MASM
9 
10 #include "salsa.h"
11 #include "argnames.h"
12 #include "misc.h"
13 #include "cpu.h"
14 
15 #if CRYPTOPP_MSC_VERSION
16 # pragma warning(disable: 4702 4740)
17 #endif
18 
19 // Clang due to "Inline assembly operands don't work with .intel_syntax"
20 // https://llvm.org/bugs/show_bug.cgi?id=24232
21 #if defined(CRYPTOPP_DISABLE_SALSA_ASM)
22 # undef CRYPTOPP_X86_ASM_AVAILABLE
23 # undef CRYPTOPP_X32_ASM_AVAILABLE
24 # undef CRYPTOPP_X64_ASM_AVAILABLE
25 # undef CRYPTOPP_SSE2_ASM_AVAILABLE
26 # undef CRYPTOPP_SSSE3_ASM_AVAILABLE
27 #endif
28 
29 NAMESPACE_BEGIN(CryptoPP)
30 
31 #if defined(CRYPTOPP_DEBUG) && !defined(CRYPTOPP_DOXYGEN_PROCESSING)
// Helper compiled only when CRYPTOPP_DEBUG is defined and Doxygen is not
// processing the file (see the enclosing #if).
// NOTE(review): the body is empty in this listing, but the embedded listing
// numbers jump from 33 to 36, so statements appear to have been dropped
// during extraction - confirm against the upstream salsa.cpp.
32 void Salsa20_TestInstantiations()
33 {
36 }
37 #endif
38 
39 void Salsa20_Core(word32* data, unsigned int rounds)
40 {
41  CRYPTOPP_ASSERT(data != NULLPTR);
42  CRYPTOPP_ASSERT(rounds % 2 == 0);
43 
44  CRYPTOPP_ALIGN_DATA(16) word32 x[16];
45 
46  for (size_t i = 0; i < 16; ++i)
47  x[i] = data[i];
48 
49  // Rounds must be even
50  for (size_t i = 0; i < rounds; i += 2)
51  {
52  x[ 4] ^= rotlConstant< 7>(x[ 0]+x[12]);
53  x[ 8] ^= rotlConstant< 9>(x[ 4]+x[ 0]);
54  x[12] ^= rotlConstant<13>(x[ 8]+x[ 4]);
55  x[ 0] ^= rotlConstant<18>(x[12]+x[ 8]);
56 
57  x[ 9] ^= rotlConstant< 7>(x[ 5]+x[ 1]);
58  x[13] ^= rotlConstant< 9>(x[ 9]+x[ 5]);
59  x[ 1] ^= rotlConstant<13>(x[13]+x[ 9]);
60  x[ 5] ^= rotlConstant<18>(x[ 1]+x[13]);
61 
62  x[14] ^= rotlConstant< 7>(x[10]+x[ 6]);
63  x[ 2] ^= rotlConstant< 9>(x[14]+x[10]);
64  x[ 6] ^= rotlConstant<13>(x[ 2]+x[14]);
65  x[10] ^= rotlConstant<18>(x[ 6]+x[ 2]);
66 
67  x[ 3] ^= rotlConstant< 7>(x[15]+x[11]);
68  x[ 7] ^= rotlConstant< 9>(x[ 3]+x[15]);
69  x[11] ^= rotlConstant<13>(x[ 7]+x[ 3]);
70  x[15] ^= rotlConstant<18>(x[11]+x[ 7]);
71 
72  x[ 1] ^= rotlConstant< 7>(x[ 0]+x[ 3]);
73  x[ 2] ^= rotlConstant< 9>(x[ 1]+x[ 0]);
74  x[ 3] ^= rotlConstant<13>(x[ 2]+x[ 1]);
75  x[ 0] ^= rotlConstant<18>(x[ 3]+x[ 2]);
76 
77  x[ 6] ^= rotlConstant< 7>(x[ 5]+x[ 4]);
78  x[ 7] ^= rotlConstant< 9>(x[ 6]+x[ 5]);
79  x[ 4] ^= rotlConstant<13>(x[ 7]+x[ 6]);
80  x[ 5] ^= rotlConstant<18>(x[ 4]+x[ 7]);
81 
82  x[11] ^= rotlConstant< 7>(x[10]+x[ 9]);
83  x[ 8] ^= rotlConstant< 9>(x[11]+x[10]);
84  x[ 9] ^= rotlConstant<13>(x[ 8]+x[11]);
85  x[10] ^= rotlConstant<18>(x[ 9]+x[ 8]);
86 
87  x[12] ^= rotlConstant< 7>(x[15]+x[14]);
88  x[13] ^= rotlConstant< 9>(x[12]+x[15]);
89  x[14] ^= rotlConstant<13>(x[13]+x[12]);
90  x[15] ^= rotlConstant<18>(x[14]+x[13]);
91  }
92 
93  #pragma omp simd
94  for (size_t i = 0; i < 16; ++i)
95  data[i] += x[i];
96 }
97 
98 void Salsa20_Policy::CipherSetKey(const NameValuePairs &params, const byte *key, size_t length)
99 {
100  m_rounds = params.GetIntValueWithDefault(Name::Rounds(), 20);
101 
102  if (!(m_rounds == 8 || m_rounds == 12 || m_rounds == 20))
103  throw InvalidRounds(Salsa20::StaticAlgorithmName(), m_rounds);
104 
105  // m_state is reordered for SSE2
107  get1(m_state[13])(m_state[10])(m_state[7])(m_state[4]);
108  GetBlock<word32, LittleEndian> get2(key + length - 16);
109  get2(m_state[15])(m_state[12])(m_state[9])(m_state[6]);
110 
111  // "expand 16-byte k" or "expand 32-byte k"
112  m_state[0] = 0x61707865;
113  m_state[1] = (length == 16) ? 0x3120646e : 0x3320646e;
114  m_state[2] = (length == 16) ? 0x79622d36 : 0x79622d32;
115  m_state[3] = 0x6b206574;
116 }
117 
118 void Salsa20_Policy::CipherResynchronize(byte *keystreamBuffer, const byte *IV, size_t length)
119 {
120  CRYPTOPP_UNUSED(keystreamBuffer), CRYPTOPP_UNUSED(length);
121  CRYPTOPP_ASSERT(length==8);
122 
124  get(m_state[14])(m_state[11]);
125  m_state[8] = m_state[5] = 0;
126 }
127 
128 void Salsa20_Policy::SeekToIteration(lword iterationCount)
129 {
130  m_state[8] = (word32)iterationCount;
131  m_state[5] = (word32)SafeRightShift<32>(iterationCount);
132 }
133 
134 #if (CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X64)
135 unsigned int Salsa20_Policy::GetAlignment() const
136 {
137 #if CRYPTOPP_SSE2_ASM_AVAILABLE
138  if (HasSSE2())
139  return 16;
140  else
141 #endif
142  return GetAlignmentOf<word32>();
143 }
144 
145 unsigned int Salsa20_Policy::GetOptimalBlockSize() const
146 {
147 #if CRYPTOPP_SSE2_ASM_AVAILABLE
148  if (HasSSE2())
149  return 4*BYTES_PER_ITERATION;
150  else
151 #endif
152  return BYTES_PER_ITERATION;
153 }
154 #endif
155 
156 #ifdef CRYPTOPP_X64_MASM_AVAILABLE
// C-linkage prototype for the stand-alone assembly routine. The MASM text of
// that routine (Salsa20_OperateKeystream PROC) is produced from this same
// file when it is preprocessed with CRYPTOPP_GENERATE_X64_MASM defined.
// Salsa20_Policy::OperateKeystream calls it with m_rounds and m_state.data()
// as the last two arguments.
157 extern "C" {
158 void Salsa20_OperateKeystream(byte *output, const byte *input, size_t iterationCount, int rounds, void *state);
159 }
160 #endif
161 
162 #if CRYPTOPP_MSC_VERSION
163 # pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code
164 #endif
165 
// Processes iterationCount keystream blocks and advances the block counter
// kept in m_state[8] (low word) / m_state[5] (high word).
// Three implementations follow, selected at compile time and run time:
//   1. CRYPTOPP_X64_MASM_AVAILABLE: forward to the external assembly routine.
//   2. CRYPTOPP_SSE2_ASM_AVAILABLE and HasSSE2(): the inline SSE2 assembly.
//   3. Otherwise: the portable C++ loop at the end of the function.
// NOTE(review): the exact meaning of 'operation' and of a null 'input' is
// defined by the keystream helper macros in strciphr.h - confirm there.
166 void Salsa20_Policy::OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount)
167 {
168 #endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
169 
170 #ifdef CRYPTOPP_X64_MASM_AVAILABLE
171  Salsa20_OperateKeystream(output, input, iterationCount, m_rounds, m_state.data());
172  return;
173 #endif
174 
175 #if CRYPTOPP_SSE2_ASM_AVAILABLE
176 #ifdef CRYPTOPP_GENERATE_X64_MASM
177  ALIGN 8
178  Salsa20_OperateKeystream PROC FRAME
179  mov r10, [rsp + 5*8] ; state
180  alloc_stack(10*16 + 32*16 + 8)
181  save_xmm128 xmm6, 0200h
182  save_xmm128 xmm7, 0210h
183  save_xmm128 xmm8, 0220h
184  save_xmm128 xmm9, 0230h
185  save_xmm128 xmm10, 0240h
186  save_xmm128 xmm11, 0250h
187  save_xmm128 xmm12, 0260h
188  save_xmm128 xmm13, 0270h
189  save_xmm128 xmm14, 0280h
190  save_xmm128 xmm15, 0290h
191  .endprolog
192 
193  #define REG_output rcx
194  #define REG_input rdx
195  #define REG_iterationCount r8
196  #define REG_state r10
197  #define REG_rounds e9d
198  #define REG_roundsLeft eax
199  #define REG_temp32 r11d
200  #define REG_temp r11
201  #define SSE2_WORKSPACE rsp
202 #else
203  if (HasSSE2())
204  {
205  #if CRYPTOPP_BOOL_X64
206  #define REG_output %1
207  #define REG_input %0
208  #define REG_iterationCount %2
209  #define REG_state %4 /* constant */
210  #define REG_rounds %3 /* constant */
211  #define REG_roundsLeft eax
212  #define REG_temp32 edx
213  #define REG_temp rdx
214  #define SSE2_WORKSPACE %5 /* constant */
215 
216  CRYPTOPP_ALIGN_DATA(16) byte workspace[16*32];
217  #else
218  #define REG_output edi
219  #define REG_input eax
220  #define REG_iterationCount ecx
221  #define REG_state esi
222  #define REG_rounds edx
223  #define REG_roundsLeft ebx
224  #define REG_temp32 ebp
225  #define REG_temp ebp
226  #define SSE2_WORKSPACE esp + WORD_SZ
227  #endif
228 
229  #ifdef __GNUC__
230  __asm__ __volatile__
231  (
232  INTEL_NOPREFIX
233  AS_PUSH_IF86( bx)
234  #else
235  void *s = m_state.data();
236  word32 r = m_rounds;
237 
238  AS2( mov REG_iterationCount, iterationCount)
239  AS2( mov REG_input, input)
240  AS2( mov REG_output, output)
241  AS2( mov REG_state, s)
242  AS2( mov REG_rounds, r)
243  #endif
244 #endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
245 
246  AS_PUSH_IF86( bp)
247  AS2( cmp REG_iterationCount, 4)
248  ASJ( jl, 5, f)
249 
250 #if CRYPTOPP_BOOL_X86
251  AS2( mov ebx, esp)
252  AS2( and esp, -16)
253  AS2( sub esp, 32*16)
254  AS1( push ebx)
255 #endif
256 
// Shuffles xmm<i> with the same selector j in all four positions into xmm4
// and stores the result in workspace slot (i*4+j) of the area starting at
// offset 256. Clobbers xmm4.
// NOTE(review): presumably ASS packs the four selectors into the pshufd
// immediate, making this a broadcast of dword j - confirm in cpu.h.
257 #define SSE2_EXPAND_S(i, j) \
258  ASS( pshufd xmm4, xmm##i, j, j, j, j) \
259  AS2( movdqa [SSE2_WORKSPACE + (i*4+j)*16 + 256], xmm4)
260 
261  AS2( movdqa xmm0, [REG_state + 0*16])
262  AS2( movdqa xmm1, [REG_state + 1*16])
263  AS2( movdqa xmm2, [REG_state + 2*16])
264  AS2( movdqa xmm3, [REG_state + 3*16])
265  SSE2_EXPAND_S(0, 0)
266  SSE2_EXPAND_S(0, 1)
267  SSE2_EXPAND_S(0, 2)
268  SSE2_EXPAND_S(0, 3)
269  SSE2_EXPAND_S(1, 0)
270  SSE2_EXPAND_S(1, 2)
271  SSE2_EXPAND_S(1, 3)
272  SSE2_EXPAND_S(2, 1)
273  SSE2_EXPAND_S(2, 2)
274  SSE2_EXPAND_S(2, 3)
275  SSE2_EXPAND_S(3, 0)
276  SSE2_EXPAND_S(3, 1)
277  SSE2_EXPAND_S(3, 2)
278  SSE2_EXPAND_S(3, 3)
279 
// Writes the current counter pair into dword lane i of slots 8 and 5 (area
// at offset 256), then increments it: REG_roundsLeft is the low word and
// REG_temp32 receives the carry. Invoked for i = 0..3 after both registers
// are loaded from state words 8 and 5, so the four lanes get four
// consecutive counter values.
280 #define SSE2_EXPAND_S85(i) \
281  AS2( mov dword ptr [SSE2_WORKSPACE + 8*16 + i*4 + 256], REG_roundsLeft) \
282  AS2( mov dword ptr [SSE2_WORKSPACE + 5*16 + i*4 + 256], REG_temp32) \
283  AS2( add REG_roundsLeft, 1) \
284  AS2( adc REG_temp32, 0)
285 
286  ASL(1)
287  AS2( mov REG_roundsLeft, dword ptr [REG_state + 8*4])
288  AS2( mov REG_temp32, dword ptr [REG_state + 5*4])
289  SSE2_EXPAND_S85(0)
290  SSE2_EXPAND_S85(1)
291  SSE2_EXPAND_S85(2)
292  SSE2_EXPAND_S85(3)
293  AS2( mov dword ptr [REG_state + 8*4], REG_roundsLeft)
294  AS2( mov dword ptr [REG_state + 5*4], REG_temp32)
295 
// One quarter-round step on whole registers:
//   xmm<b> ^= rotl32(xmm<a> + xmm<d>, i)   (per 32-bit lane)
// The rotate is built from a left shift by i and a right shift by 32-i, both
// XORed into xmm<b>. Clobbers xmm4 and xmm5.
296 #define SSE2_QUARTER_ROUND(a, b, d, i) \
297  AS2( movdqa xmm4, xmm##d) \
298  AS2( paddd xmm4, xmm##a) \
299  AS2( movdqa xmm5, xmm4) \
300  AS2( pslld xmm4, i) \
301  AS2( psrld xmm5, 32-i) \
302  AS2( pxor xmm##b, xmm4) \
303  AS2( pxor xmm##b, xmm5)
304 
// L01..L32: a full quarter-round broken into 32 single-instruction steps so
// that the X8/X16 macros below can interleave several quarter-rounds.
//   A,B,C,D - xmm register numbers used as scratch for this quarter-round
//   a,b,c,d - workspace slot indices (16 bytes each) holding y0,y1,y2,y3
//   i       - read base selector: inputs are loaded from slot*16 + i*256,
//             results are always stored to slot*16
// Computed values, in the notation of the inline comments:
//   z1 = y1 ^ rotl(y0+y3, 7)     z2 = y2 ^ rotl(z1+y0, 9)
//   z3 = y3 ^ rotl(z2+z1, 13)    z0 = y0 ^ rotl(z3+z2, 18)
// stored back to slots b, c, d and a respectively.
305 #define L01(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##A, [SSE2_WORKSPACE + d*16 + i*256]) /* y3 */
306 #define L02(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##C, [SSE2_WORKSPACE + a*16 + i*256]) /* y0 */
307 #define L03(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##C) /* y0+y3 */
308 #define L04(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##B, xmm##A)
309 #define L05(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 7)
310 #define L06(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##B, 32-7)
311 #define L07(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + b*16 + i*256])
312 #define L08(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##B) /* z1 */
313 #define L09(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + b*16], xmm##A)
314 #define L10(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##B, xmm##A)
315 #define L11(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##C) /* z1+y0 */
316 #define L12(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##D, xmm##A)
317 #define L13(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 9)
318 #define L14(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##D, 32-9)
319 #define L15(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + c*16 + i*256])
320 #define L16(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##D) /* z2 */
321 #define L17(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + c*16], xmm##A)
322 #define L18(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##D, xmm##A)
323 #define L19(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##B) /* z2+z1 */
324 #define L20(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##B, xmm##A)
325 #define L21(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 13)
326 #define L22(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##B, 32-13)
327 #define L23(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + d*16 + i*256])
328 #define L24(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##B) /* z3 */
329 #define L25(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + d*16], xmm##A)
330 #define L26(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##D) /* z3+z2 */
331 #define L27(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##D, xmm##A)
332 #define L28(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 18)
333 #define L29(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##D, 32-18)
334 #define L30(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##C) /* xor y0 */
335 #define L31(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##D) /* z0 */
336 #define L32(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + a*16], xmm##A)
337 
// Two quarter-rounds interleaved step by step: slots (a,b,c,d) use
// xmm0-xmm3 and slots (e,f,g,h) use xmm4-xmm7. 'i' is passed through to the
// L01..L32 steps as the read base selector. Used on the non-x64 path, where
// it is invoked twice per round to cover all 16 slots.
338 #define SSE2_QUARTER_ROUND_X8(i, a, b, c, d, e, f, g, h) \
339  L01(0,1,2,3, a,b,c,d, i) L01(4,5,6,7, e,f,g,h, i) \
340  L02(0,1,2,3, a,b,c,d, i) L02(4,5,6,7, e,f,g,h, i) \
341  L03(0,1,2,3, a,b,c,d, i) L03(4,5,6,7, e,f,g,h, i) \
342  L04(0,1,2,3, a,b,c,d, i) L04(4,5,6,7, e,f,g,h, i) \
343  L05(0,1,2,3, a,b,c,d, i) L05(4,5,6,7, e,f,g,h, i) \
344  L06(0,1,2,3, a,b,c,d, i) L06(4,5,6,7, e,f,g,h, i) \
345  L07(0,1,2,3, a,b,c,d, i) L07(4,5,6,7, e,f,g,h, i) \
346  L08(0,1,2,3, a,b,c,d, i) L08(4,5,6,7, e,f,g,h, i) \
347  L09(0,1,2,3, a,b,c,d, i) L09(4,5,6,7, e,f,g,h, i) \
348  L10(0,1,2,3, a,b,c,d, i) L10(4,5,6,7, e,f,g,h, i) \
349  L11(0,1,2,3, a,b,c,d, i) L11(4,5,6,7, e,f,g,h, i) \
350  L12(0,1,2,3, a,b,c,d, i) L12(4,5,6,7, e,f,g,h, i) \
351  L13(0,1,2,3, a,b,c,d, i) L13(4,5,6,7, e,f,g,h, i) \
352  L14(0,1,2,3, a,b,c,d, i) L14(4,5,6,7, e,f,g,h, i) \
353  L15(0,1,2,3, a,b,c,d, i) L15(4,5,6,7, e,f,g,h, i) \
354  L16(0,1,2,3, a,b,c,d, i) L16(4,5,6,7, e,f,g,h, i) \
355  L17(0,1,2,3, a,b,c,d, i) L17(4,5,6,7, e,f,g,h, i) \
356  L18(0,1,2,3, a,b,c,d, i) L18(4,5,6,7, e,f,g,h, i) \
357  L19(0,1,2,3, a,b,c,d, i) L19(4,5,6,7, e,f,g,h, i) \
358  L20(0,1,2,3, a,b,c,d, i) L20(4,5,6,7, e,f,g,h, i) \
359  L21(0,1,2,3, a,b,c,d, i) L21(4,5,6,7, e,f,g,h, i) \
360  L22(0,1,2,3, a,b,c,d, i) L22(4,5,6,7, e,f,g,h, i) \
361  L23(0,1,2,3, a,b,c,d, i) L23(4,5,6,7, e,f,g,h, i) \
362  L24(0,1,2,3, a,b,c,d, i) L24(4,5,6,7, e,f,g,h, i) \
363  L25(0,1,2,3, a,b,c,d, i) L25(4,5,6,7, e,f,g,h, i) \
364  L26(0,1,2,3, a,b,c,d, i) L26(4,5,6,7, e,f,g,h, i) \
365  L27(0,1,2,3, a,b,c,d, i) L27(4,5,6,7, e,f,g,h, i) \
366  L28(0,1,2,3, a,b,c,d, i) L28(4,5,6,7, e,f,g,h, i) \
367  L29(0,1,2,3, a,b,c,d, i) L29(4,5,6,7, e,f,g,h, i) \
368  L30(0,1,2,3, a,b,c,d, i) L30(4,5,6,7, e,f,g,h, i) \
369  L31(0,1,2,3, a,b,c,d, i) L31(4,5,6,7, e,f,g,h, i) \
370  L32(0,1,2,3, a,b,c,d, i) L32(4,5,6,7, e,f,g,h, i)
371 
// Four quarter-rounds interleaved step by step, one per register group:
// (a..d) on xmm0-3, (e..h) on xmm4-7, (A..D) on xmm8-11, (E..H) on xmm12-15.
// A single invocation therefore covers all 16 slots. Only invoked under
// CRYPTOPP_BOOL_X64, since it needs xmm8-xmm15.
372 #define SSE2_QUARTER_ROUND_X16(i, a, b, c, d, e, f, g, h, A, B, C, D, E, F, G, H) \
373  L01(0,1,2,3, a,b,c,d, i) L01(4,5,6,7, e,f,g,h, i) L01(8,9,10,11, A,B,C,D, i) L01(12,13,14,15, E,F,G,H, i) \
374  L02(0,1,2,3, a,b,c,d, i) L02(4,5,6,7, e,f,g,h, i) L02(8,9,10,11, A,B,C,D, i) L02(12,13,14,15, E,F,G,H, i) \
375  L03(0,1,2,3, a,b,c,d, i) L03(4,5,6,7, e,f,g,h, i) L03(8,9,10,11, A,B,C,D, i) L03(12,13,14,15, E,F,G,H, i) \
376  L04(0,1,2,3, a,b,c,d, i) L04(4,5,6,7, e,f,g,h, i) L04(8,9,10,11, A,B,C,D, i) L04(12,13,14,15, E,F,G,H, i) \
377  L05(0,1,2,3, a,b,c,d, i) L05(4,5,6,7, e,f,g,h, i) L05(8,9,10,11, A,B,C,D, i) L05(12,13,14,15, E,F,G,H, i) \
378  L06(0,1,2,3, a,b,c,d, i) L06(4,5,6,7, e,f,g,h, i) L06(8,9,10,11, A,B,C,D, i) L06(12,13,14,15, E,F,G,H, i) \
379  L07(0,1,2,3, a,b,c,d, i) L07(4,5,6,7, e,f,g,h, i) L07(8,9,10,11, A,B,C,D, i) L07(12,13,14,15, E,F,G,H, i) \
380  L08(0,1,2,3, a,b,c,d, i) L08(4,5,6,7, e,f,g,h, i) L08(8,9,10,11, A,B,C,D, i) L08(12,13,14,15, E,F,G,H, i) \
381  L09(0,1,2,3, a,b,c,d, i) L09(4,5,6,7, e,f,g,h, i) L09(8,9,10,11, A,B,C,D, i) L09(12,13,14,15, E,F,G,H, i) \
382  L10(0,1,2,3, a,b,c,d, i) L10(4,5,6,7, e,f,g,h, i) L10(8,9,10,11, A,B,C,D, i) L10(12,13,14,15, E,F,G,H, i) \
383  L11(0,1,2,3, a,b,c,d, i) L11(4,5,6,7, e,f,g,h, i) L11(8,9,10,11, A,B,C,D, i) L11(12,13,14,15, E,F,G,H, i) \
384  L12(0,1,2,3, a,b,c,d, i) L12(4,5,6,7, e,f,g,h, i) L12(8,9,10,11, A,B,C,D, i) L12(12,13,14,15, E,F,G,H, i) \
385  L13(0,1,2,3, a,b,c,d, i) L13(4,5,6,7, e,f,g,h, i) L13(8,9,10,11, A,B,C,D, i) L13(12,13,14,15, E,F,G,H, i) \
386  L14(0,1,2,3, a,b,c,d, i) L14(4,5,6,7, e,f,g,h, i) L14(8,9,10,11, A,B,C,D, i) L14(12,13,14,15, E,F,G,H, i) \
387  L15(0,1,2,3, a,b,c,d, i) L15(4,5,6,7, e,f,g,h, i) L15(8,9,10,11, A,B,C,D, i) L15(12,13,14,15, E,F,G,H, i) \
388  L16(0,1,2,3, a,b,c,d, i) L16(4,5,6,7, e,f,g,h, i) L16(8,9,10,11, A,B,C,D, i) L16(12,13,14,15, E,F,G,H, i) \
389  L17(0,1,2,3, a,b,c,d, i) L17(4,5,6,7, e,f,g,h, i) L17(8,9,10,11, A,B,C,D, i) L17(12,13,14,15, E,F,G,H, i) \
390  L18(0,1,2,3, a,b,c,d, i) L18(4,5,6,7, e,f,g,h, i) L18(8,9,10,11, A,B,C,D, i) L18(12,13,14,15, E,F,G,H, i) \
391  L19(0,1,2,3, a,b,c,d, i) L19(4,5,6,7, e,f,g,h, i) L19(8,9,10,11, A,B,C,D, i) L19(12,13,14,15, E,F,G,H, i) \
392  L20(0,1,2,3, a,b,c,d, i) L20(4,5,6,7, e,f,g,h, i) L20(8,9,10,11, A,B,C,D, i) L20(12,13,14,15, E,F,G,H, i) \
393  L21(0,1,2,3, a,b,c,d, i) L21(4,5,6,7, e,f,g,h, i) L21(8,9,10,11, A,B,C,D, i) L21(12,13,14,15, E,F,G,H, i) \
394  L22(0,1,2,3, a,b,c,d, i) L22(4,5,6,7, e,f,g,h, i) L22(8,9,10,11, A,B,C,D, i) L22(12,13,14,15, E,F,G,H, i) \
395  L23(0,1,2,3, a,b,c,d, i) L23(4,5,6,7, e,f,g,h, i) L23(8,9,10,11, A,B,C,D, i) L23(12,13,14,15, E,F,G,H, i) \
396  L24(0,1,2,3, a,b,c,d, i) L24(4,5,6,7, e,f,g,h, i) L24(8,9,10,11, A,B,C,D, i) L24(12,13,14,15, E,F,G,H, i) \
397  L25(0,1,2,3, a,b,c,d, i) L25(4,5,6,7, e,f,g,h, i) L25(8,9,10,11, A,B,C,D, i) L25(12,13,14,15, E,F,G,H, i) \
398  L26(0,1,2,3, a,b,c,d, i) L26(4,5,6,7, e,f,g,h, i) L26(8,9,10,11, A,B,C,D, i) L26(12,13,14,15, E,F,G,H, i) \
399  L27(0,1,2,3, a,b,c,d, i) L27(4,5,6,7, e,f,g,h, i) L27(8,9,10,11, A,B,C,D, i) L27(12,13,14,15, E,F,G,H, i) \
400  L28(0,1,2,3, a,b,c,d, i) L28(4,5,6,7, e,f,g,h, i) L28(8,9,10,11, A,B,C,D, i) L28(12,13,14,15, E,F,G,H, i) \
401  L29(0,1,2,3, a,b,c,d, i) L29(4,5,6,7, e,f,g,h, i) L29(8,9,10,11, A,B,C,D, i) L29(12,13,14,15, E,F,G,H, i) \
402  L30(0,1,2,3, a,b,c,d, i) L30(4,5,6,7, e,f,g,h, i) L30(8,9,10,11, A,B,C,D, i) L30(12,13,14,15, E,F,G,H, i) \
403  L31(0,1,2,3, a,b,c,d, i) L31(4,5,6,7, e,f,g,h, i) L31(8,9,10,11, A,B,C,D, i) L31(12,13,14,15, E,F,G,H, i) \
404  L32(0,1,2,3, a,b,c,d, i) L32(4,5,6,7, e,f,g,h, i) L32(8,9,10,11, A,B,C,D, i) L32(12,13,14,15, E,F,G,H, i)
405 
406 #if CRYPTOPP_BOOL_X64
407  SSE2_QUARTER_ROUND_X16(1, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15)
408 #else
409  SSE2_QUARTER_ROUND_X8(1, 2, 6, 10, 14, 3, 7, 11, 15)
410  SSE2_QUARTER_ROUND_X8(1, 0, 4, 8, 12, 1, 5, 9, 13)
411 #endif
412  AS2( mov REG_roundsLeft, REG_rounds)
413  ASJ( jmp, 2, f)
414 
415  ASL(SSE2_Salsa_Output)
416  AS2( movdqa xmm0, xmm4)
417  AS2( punpckldq xmm4, xmm5)
418  AS2( movdqa xmm1, xmm6)
419  AS2( punpckldq xmm6, xmm7)
420  AS2( movdqa xmm2, xmm4)
421  AS2( punpcklqdq xmm4, xmm6) // e
422  AS2( punpckhqdq xmm2, xmm6) // f
423  AS2( punpckhdq xmm0, xmm5)
424  AS2( punpckhdq xmm1, xmm7)
425  AS2( movdqa xmm6, xmm0)
426  AS2( punpcklqdq xmm0, xmm1) // g
427  AS2( punpckhqdq xmm6, xmm1) // h
428  AS_XMM_OUTPUT4(SSE2_Salsa_Output_A, REG_input, REG_output, 4, 2, 0, 6, 1, 0, 4, 8, 12, 1)
429  AS1( ret)
430 
431  ASL(6)
432 #if CRYPTOPP_BOOL_X64
433  SSE2_QUARTER_ROUND_X16(0, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15)
434  ASL(2)
435  SSE2_QUARTER_ROUND_X16(0, 0, 13, 10, 7, 1, 14, 11, 4, 2, 15, 8, 5, 3, 12, 9, 6)
436 #else
437  SSE2_QUARTER_ROUND_X8(0, 2, 6, 10, 14, 3, 7, 11, 15)
438  SSE2_QUARTER_ROUND_X8(0, 0, 4, 8, 12, 1, 5, 9, 13)
439  ASL(2)
440  SSE2_QUARTER_ROUND_X8(0, 2, 15, 8, 5, 3, 12, 9, 6)
441  SSE2_QUARTER_ROUND_X8(0, 0, 13, 10, 7, 1, 14, 11, 4)
442 #endif
443  AS2( sub REG_roundsLeft, 2)
444  ASJ( jnz, 6, b)
445 
// Feed-forward and output for four slots: for each of a, b, c, d, loads the
// saved input copy (area at offset 256), adds the mixed value from the
// working area, leaving the sums in xmm4..xmm7, then calls the
// SSE2_Salsa_Output subroutine defined above.
446 #define SSE2_OUTPUT_4(a, b, c, d) \
447  AS2( movdqa xmm4, [SSE2_WORKSPACE + a*16 + 256])\
448  AS2( paddd xmm4, [SSE2_WORKSPACE + a*16])\
449  AS2( movdqa xmm5, [SSE2_WORKSPACE + b*16 + 256])\
450  AS2( paddd xmm5, [SSE2_WORKSPACE + b*16])\
451  AS2( movdqa xmm6, [SSE2_WORKSPACE + c*16 + 256])\
452  AS2( paddd xmm6, [SSE2_WORKSPACE + c*16])\
453  AS2( movdqa xmm7, [SSE2_WORKSPACE + d*16 + 256])\
454  AS2( paddd xmm7, [SSE2_WORKSPACE + d*16])\
455  ASC( call, SSE2_Salsa_Output)
456 
457  SSE2_OUTPUT_4(0, 13, 10, 7)
458  SSE2_OUTPUT_4(4, 1, 14, 11)
459  SSE2_OUTPUT_4(8, 5, 2, 15)
460  SSE2_OUTPUT_4(12, 9, 6, 3)
461  AS2( test REG_input, REG_input)
462  ASJ( jz, 9, f)
463  AS2( add REG_input, 12*16)
464  ASL(9)
465  AS2( add REG_output, 12*16)
466  AS2( sub REG_iterationCount, 4)
467  AS2( cmp REG_iterationCount, 4)
468  ASJ( jge, 1, b)
469  AS_POP_IF86( sp)
470 
471  ASL(5)
472  AS2( sub REG_iterationCount, 1)
473  ASJ( jl, 4, f)
474  AS2( movdqa xmm0, [REG_state + 0*16])
475  AS2( movdqa xmm1, [REG_state + 1*16])
476  AS2( movdqa xmm2, [REG_state + 2*16])
477  AS2( movdqa xmm3, [REG_state + 3*16])
478  AS2( mov REG_roundsLeft, REG_rounds)
479 
480  ASL(0)
481  SSE2_QUARTER_ROUND(0, 1, 3, 7)
482  SSE2_QUARTER_ROUND(1, 2, 0, 9)
483  SSE2_QUARTER_ROUND(2, 3, 1, 13)
484  SSE2_QUARTER_ROUND(3, 0, 2, 18)
485  ASS( pshufd xmm1, xmm1, 2, 1, 0, 3)
486  ASS( pshufd xmm2, xmm2, 1, 0, 3, 2)
487  ASS( pshufd xmm3, xmm3, 0, 3, 2, 1)
488  SSE2_QUARTER_ROUND(0, 3, 1, 7)
489  SSE2_QUARTER_ROUND(3, 2, 0, 9)
490  SSE2_QUARTER_ROUND(2, 1, 3, 13)
491  SSE2_QUARTER_ROUND(1, 0, 2, 18)
492  ASS( pshufd xmm1, xmm1, 0, 3, 2, 1)
493  ASS( pshufd xmm2, xmm2, 1, 0, 3, 2)
494  ASS( pshufd xmm3, xmm3, 2, 1, 0, 3)
495  AS2( sub REG_roundsLeft, 2)
496  ASJ( jnz, 0, b)
497 
498  AS2( paddd xmm0, [REG_state + 0*16])
499  AS2( paddd xmm1, [REG_state + 1*16])
500  AS2( paddd xmm2, [REG_state + 2*16])
501  AS2( paddd xmm3, [REG_state + 3*16])
502 
503  AS2( add dword ptr [REG_state + 8*4], 1)
504  AS2( adc dword ptr [REG_state + 5*4], 0)
505 
506  AS2( pcmpeqb xmm6, xmm6) // all ones
507  AS2( psrlq xmm6, 32) // lo32 mask
508  ASS( pshufd xmm7, xmm6, 0, 1, 2, 3) // hi32 mask
509  AS2( movdqa xmm4, xmm0)
510  AS2( movdqa xmm5, xmm3)
511  AS2( pand xmm0, xmm7)
512  AS2( pand xmm4, xmm6)
513  AS2( pand xmm3, xmm6)
514  AS2( pand xmm5, xmm7)
515  AS2( por xmm4, xmm5) // 0,13,2,15
516  AS2( movdqa xmm5, xmm1)
517  AS2( pand xmm1, xmm7)
518  AS2( pand xmm5, xmm6)
519  AS2( por xmm0, xmm5) // 4,1,6,3
520  AS2( pand xmm6, xmm2)
521  AS2( pand xmm2, xmm7)
522  AS2( por xmm1, xmm6) // 8,5,10,7
523  AS2( por xmm2, xmm3) // 12,9,14,11
524 
525  AS2( movdqa xmm5, xmm4)
526  AS2( movdqa xmm6, xmm0)
527  AS3( shufpd xmm4, xmm1, 2) // 0,13,10,7
528  AS3( shufpd xmm0, xmm2, 2) // 4,1,14,11
529  AS3( shufpd xmm1, xmm5, 2) // 8,5,2,15
530  AS3( shufpd xmm2, xmm6, 2) // 12,9,6,3
531 
532  // output keystream
533  AS_XMM_OUTPUT4(SSE2_Salsa_Output_B, REG_input, REG_output, 4, 0, 1, 2, 3, 0, 1, 2, 3, 4)
534  ASJ( jmp, 5, b)
535  ASL(4)
536 
537  AS_POP_IF86( bp)
538 #ifdef __GNUC__
539  AS_POP_IF86( bx)
540  ATT_PREFIX
541  #if CRYPTOPP_BOOL_X64
542  : "+r" (input), "+r" (output), "+r" (iterationCount)
543  : "r" (m_rounds), "r" (m_state.begin()), "r" (workspace)
544  : "%eax", "%rdx", "memory", "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15"
545  #else
546  : "+a" (input), "+D" (output), "+c" (iterationCount)
547  : "d" (m_rounds), "S" (m_state.begin())
548  : "memory", "cc"
549  #endif
550  );
551 #endif
552 #ifdef CRYPTOPP_GENERATE_X64_MASM
553  movdqa xmm6, [rsp + 0200h]
554  movdqa xmm7, [rsp + 0210h]
555  movdqa xmm8, [rsp + 0220h]
556  movdqa xmm9, [rsp + 0230h]
557  movdqa xmm10, [rsp + 0240h]
558  movdqa xmm11, [rsp + 0250h]
559  movdqa xmm12, [rsp + 0260h]
560  movdqa xmm13, [rsp + 0270h]
561  movdqa xmm14, [rsp + 0280h]
562  movdqa xmm15, [rsp + 0290h]
563  add rsp, 10*16 + 32*16 + 8
564  ret
565 Salsa20_OperateKeystream ENDP
566 #else
567  }
568  else
569 #endif
570 #endif
571 #ifndef CRYPTOPP_GENERATE_X64_MASM
// Portable C++ path: one keystream block per loop iteration.
572  {
573  word32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
574 
575  while (iterationCount--)
576  {
// Working copy of the state; xN mirrors m_state[N], i.e. the reordered
// storage layout set up by CipherSetKey, not the output word order.
577  x0 = m_state[0]; x1 = m_state[1]; x2 = m_state[2]; x3 = m_state[3];
578  x4 = m_state[4]; x5 = m_state[5]; x6 = m_state[6]; x7 = m_state[7];
579  x8 = m_state[8]; x9 = m_state[9]; x10 = m_state[10]; x11 = m_state[11];
580  x12 = m_state[12]; x13 = m_state[13]; x14 = m_state[14]; x15 = m_state[15];
581 
// Two rounds per pass, so m_rounds/2 passes.
582  for (int i=m_rounds; i>0; i-=2)
583  {
// Quarter-round on four working words. Being a preprocessor macro it stays
// defined after this loop; XSalsa20_Policy::CipherResynchronize reuses it.
584  #define QUARTER_ROUND(a, b, c, d) \
585  b = b ^ rotlConstant<7>(a + d); \
586  c = c ^ rotlConstant<9>(b + a); \
587  d = d ^ rotlConstant<13>(c + b); \
588  a = a ^ rotlConstant<18>(d + c);
589 
590  QUARTER_ROUND(x0, x4, x8, x12)
591  QUARTER_ROUND(x1, x5, x9, x13)
592  QUARTER_ROUND(x2, x6, x10, x14)
593  QUARTER_ROUND(x3, x7, x11, x15)
594 
595  QUARTER_ROUND(x0, x13, x10, x7)
596  QUARTER_ROUND(x1, x14, x11, x4)
597  QUARTER_ROUND(x2, x15, x8, x5)
598  QUARTER_ROUND(x3, x12, x9, x6)
599  }
600 
// Emits the 16 output words (working word + original state word). Output
// position k takes storage slot 0,13,10,7,4,1,14,11,8,5,2,15,12,9,6,3, which
// undoes the storage reordering.
601  #define SALSA_OUTPUT(x) {\
602  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 0, x0 + m_state[0]);\
603  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 1, x13 + m_state[13]);\
604  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 2, x10 + m_state[10]);\
605  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 3, x7 + m_state[7]);\
606  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 4, x4 + m_state[4]);\
607  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 5, x1 + m_state[1]);\
608  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 6, x14 + m_state[14]);\
609  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 7, x11 + m_state[11]);\
610  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 8, x8 + m_state[8]);\
611  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 9, x5 + m_state[5]);\
612  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 10, x2 + m_state[2]);\
613  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 11, x15 + m_state[15]);\
614  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 12, x12 + m_state[12]);\
615  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 13, x9 + m_state[9]);\
616  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 14, x6 + m_state[6]);\
617  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 15, x3 + m_state[3]);}
618 
619 #ifndef CRYPTOPP_DOXYGEN_PROCESSING
620  CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(SALSA_OUTPUT, BYTES_PER_ITERATION);
621 #endif
622 
// Advance the block counter: slot 8 is the low word; carry into slot 5 on wrap.
623  if (++m_state[8] == 0)
624  ++m_state[5];
625  }
626  }
627 } // see comment above if an internal compiler error occurs here
628 
629 void XSalsa20_Policy::CipherSetKey(const NameValuePairs &params, const byte *key, size_t length)
630 {
631  m_rounds = params.GetIntValueWithDefault(Name::Rounds(), 20);
632 
633  if (!(m_rounds == 8 || m_rounds == 12 || m_rounds == 20))
634  throw InvalidRounds(XSalsa20::StaticAlgorithmName(), m_rounds);
635 
636  GetUserKey(LITTLE_ENDIAN_ORDER, m_key.begin(), m_key.size(), key, length);
637  if (length == 16)
638  memcpy(m_key.begin()+4, m_key.begin(), 16);
639 
640  // "expand 32-byte k"
641  m_state[0] = 0x61707865;
642  m_state[1] = 0x3320646e;
643  m_state[2] = 0x79622d32;
644  m_state[3] = 0x6b206574;
645 }
646 
647 void XSalsa20_Policy::CipherResynchronize(byte *keystreamBuffer, const byte *IV, size_t length)
648 {
649  CRYPTOPP_UNUSED(keystreamBuffer), CRYPTOPP_UNUSED(length);
650  CRYPTOPP_ASSERT(length==24);
651 
652  word32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
653 
655  get(x14)(x11)(x8)(x5)(m_state[14])(m_state[11]);
656 
657  x13 = m_key[0]; x10 = m_key[1]; x7 = m_key[2]; x4 = m_key[3];
658  x15 = m_key[4]; x12 = m_key[5]; x9 = m_key[6]; x6 = m_key[7];
659  x0 = m_state[0]; x1 = m_state[1]; x2 = m_state[2]; x3 = m_state[3];
660 
661  for (int i=m_rounds; i>0; i-=2)
662  {
663  QUARTER_ROUND(x0, x4, x8, x12)
664  QUARTER_ROUND(x1, x5, x9, x13)
665  QUARTER_ROUND(x2, x6, x10, x14)
666  QUARTER_ROUND(x3, x7, x11, x15)
667 
668  QUARTER_ROUND(x0, x13, x10, x7)
669  QUARTER_ROUND(x1, x14, x11, x4)
670  QUARTER_ROUND(x2, x15, x8, x5)
671  QUARTER_ROUND(x3, x12, x9, x6)
672  }
673 
674  m_state[13] = x0; m_state[10] = x1; m_state[7] = x2; m_state[4] = x3;
675  m_state[15] = x14; m_state[12] = x11; m_state[9] = x8; m_state[6] = x5;
676  m_state[8] = m_state[5] = 0;
677 }
678 
679 NAMESPACE_END
680 
681 #endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
int GetIntValueWithDefault(const char *name, int defaultValue) const
Get a named value with type int, with default.
Definition: cryptlib.h:392
Standard names for retrieving values by name when working with NameValuePairs.
Utility functions for the Crypto++ library.
const char * Rounds()
int
Definition: argnames.h:24
Library configuration file.
virtual unsigned int GetOptimalBlockSize() const
Provides number of ideal bytes to process.
Definition: strciphr.h:122
unsigned int GetAlignment() const
Provides data alignment requirements.
Definition: strciphr.h:191
#define CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(x, y)
Helper macro to implement OperateKeystream.
Definition: strciphr.h:230
byte order is little-endian
Definition: cryptlib.h:142
void CipherSetKey(const NameValuePairs &params, const byte *key, size_t length)
Key the cipher.
Definition: salsa.cpp:629
Exception thrown when an invalid number of rounds is encountered.
Definition: simple.h:59
void Salsa20_Core(word32 *data, unsigned int rounds)
Salsa20 core transform.
Definition: salsa.cpp:39
A::pointer data()
Provides a pointer to the first element in the memory block.
Definition: secblock.h:553
Precompiled header file.
void CipherResynchronize(byte *keystreamBuffer, const byte *IV, size_t length)
Resynchronize the cipher.
Definition: salsa.cpp:647
#define CRYPTOPP_ASSERT(exp)
Debugging and diagnostic assertion.
Definition: trap.h:60
Functions for CPU features and intrinsics.
Classes for Salsa and Salsa20 stream ciphers.
iterator begin()
Provides an iterator pointing to the first element in the memory block.
Definition: secblock.h:536
const char * IV()
ConstByteArrayParameter, also accepts const byte * for backwards compatibility.
Definition: argnames.h:21
bool HasSSE2()
Determines SSE2 availability.
Definition: cpu.h:114
Access a block of memory.
Definition: misc.h:2324
KeystreamOperation
Keystream operation flags.
Definition: strciphr.h:88
Crypto++ library namespace.
SymmetricCipher implementation.
Definition: strciphr.h:571
Interface for retrieving values given their names.
Definition: cryptlib.h:290