/// \file arm_simd.h
/// \brief Support functions for ARM and vector operations

#ifndef CRYPTOPP_ARM_SIMD_H
#define CRYPTOPP_ARM_SIMD_H

#include "config.h"

#if (CRYPTOPP_ARM_NEON_HEADER)
# include <stdint.h>
# include <arm_neon.h>
#endif

#if (CRYPTOPP_ARM_ACLE_HEADER)
# include <stdint.h>
# include <arm_acle.h>
#endif

#if (CRYPTOPP_ARM_CRC32_AVAILABLE) || defined(CRYPTOPP_DOXYGEN_PROCESSING)

/// \brief CRC32 checksum
/// \param crc the starting crc value
/// \param val the value to checksum
/// \return CRC32 value
inline uint32_t CRC32B(uint32_t crc, uint8_t val)
{
#if defined(_MSC_VER)
    return __crc32b(crc, val);
#else
    __asm__ ("crc32b   %w0, %w0, %w1   \n\t"
            :"+r" (crc) : "r" (val) );
    return crc;
#endif
}
/// \brief CRC32 checksum
/// \param crc the starting crc value
/// \param val the value to checksum
/// \return CRC32 value
inline uint32_t CRC32W(uint32_t crc, uint32_t val)
{
#if defined(_MSC_VER)
    return __crc32w(crc, val);
#else
    __asm__ ("crc32w   %w0, %w0, %w1   \n\t"
            :"+r" (crc) : "r" (val) );
    return crc;
#endif
}
/// \brief CRC32 checksum
/// \param crc the starting crc value
/// \param vals the 4 values to checksum
/// \return CRC32 value
inline uint32_t CRC32Wx4(uint32_t crc, const uint32_t vals[4])
{
#if defined(_MSC_VER)
    return __crc32w(__crc32w(__crc32w(__crc32w(
             crc, vals[0]), vals[1]), vals[2]), vals[3]);
#else
    __asm__ ("crc32w   %w0, %w0, %w1   \n\t"
             "crc32w   %w0, %w0, %w2   \n\t"
             "crc32w   %w0, %w0, %w3   \n\t"
             "crc32w   %w0, %w0, %w4   \n\t"
            :"+r" (crc) : "r" (vals[0]), "r" (vals[1]),
                          "r" (vals[2]), "r" (vals[3]));
    return crc;
#endif
}
/// \brief CRC32-C checksum
/// \param crc the starting crc value
/// \param val the value to checksum
/// \return CRC32-C value
inline uint32_t CRC32CB(uint32_t crc, uint8_t val)
{
#if defined(_MSC_VER)
    return __crc32cb(crc, val);
#else
    __asm__ ("crc32cb   %w0, %w0, %w1   \n\t"
            :"+r" (crc) : "r" (val) );
    return crc;
#endif
}
/// \brief CRC32-C checksum
/// \param crc the starting crc value
/// \param val the value to checksum
/// \return CRC32-C value
inline uint32_t CRC32CW(uint32_t crc, uint32_t val)
{
#if defined(_MSC_VER)
    return __crc32cw(crc, val);
#else
    __asm__ ("crc32cw   %w0, %w0, %w1   \n\t"
            :"+r" (crc) : "r" (val) );
    return crc;
#endif
}
/// \brief CRC32-C checksum
/// \param crc the starting crc value
/// \param vals the 4 values to checksum
/// \return CRC32-C value
inline uint32_t CRC32CWx4(uint32_t crc, const uint32_t vals[4])
{
#if defined(_MSC_VER)
    return __crc32cw(__crc32cw(__crc32cw(__crc32cw(
             crc, vals[0]), vals[1]), vals[2]), vals[3]);
#else
    __asm__ ("crc32cw   %w0, %w0, %w1   \n\t"
             "crc32cw   %w0, %w0, %w2   \n\t"
             "crc32cw   %w0, %w0, %w3   \n\t"
             "crc32cw   %w0, %w0, %w4   \n\t"
            :"+r" (crc) : "r" (vals[0]), "r" (vals[1]),
                          "r" (vals[2]), "r" (vals[3]));
    return crc;
#endif
}
#endif  // CRYPTOPP_ARM_CRC32_AVAILABLE

#if (CRYPTOPP_ARM_PMULL_AVAILABLE) || defined(CRYPTOPP_DOXYGEN_PROCESSING)

/// \brief Polynomial multiplication
/// \param a the first value
/// \param b the second value
/// \return vector product
/// \details PMULL_00() multiplies the low 64 bits of a and the low 64 bits of b.
inline uint64x2_t PMULL_00(const uint64x2_t a, const uint64x2_t b)
{
#if defined(_MSC_VER)
    const __n64 x = { vgetq_lane_u64(a, 0) };
    const __n64 y = { vgetq_lane_u64(b, 0) };
    return vmull_p64(x, y);
#elif defined(__GNUC__)
    uint64x2_t r;
    __asm__ ("pmull    %0.1q, %1.1d, %2.1d \n\t"
            :"=w" (r) : "w" (a), "w" (b) );
    return r;
#else
    return (uint64x2_t)(vmull_p64(
        vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 0)));
#endif
}
/// \brief Polynomial multiplication
/// \param a the first value
/// \param b the second value
/// \return vector product
/// \details PMULL_01() multiplies the low 64 bits of a and the high 64 bits of b.
inline uint64x2_t PMULL_01(const uint64x2_t a, const uint64x2_t b)
{
#if defined(_MSC_VER)
    const __n64 x = { vgetq_lane_u64(a, 0) };
    const __n64 y = { vgetq_lane_u64(b, 1) };
    return vmull_p64(x, y);
#elif defined(__GNUC__)
    uint64x2_t r;
    __asm__ ("pmull    %0.1q, %1.1d, %2.1d \n\t"
            :"=w" (r) : "w" (a), "w" (vget_high_u64(b)) );
    return r;
#else
    return (uint64x2_t)(vmull_p64(
        vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 1)));
#endif
}
/// \brief Polynomial multiplication
/// \param a the first value
/// \param b the second value
/// \return vector product
/// \details PMULL_10() multiplies the high 64 bits of a and the low 64 bits of b.
inline uint64x2_t PMULL_10(const uint64x2_t a, const uint64x2_t b)
{
#if defined(_MSC_VER)
    const __n64 x = { vgetq_lane_u64(a, 1) };
    const __n64 y = { vgetq_lane_u64(b, 0) };
    return vmull_p64(x, y);
#elif defined(__GNUC__)
    uint64x2_t r;
    __asm__ ("pmull    %0.1q, %1.1d, %2.1d \n\t"
            :"=w" (r) : "w" (vget_high_u64(a)), "w" (b) );
    return r;
#else
    return (uint64x2_t)(vmull_p64(
        vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 0)));
#endif
}
/// \brief Polynomial multiplication
/// \param a the first value
/// \param b the second value
/// \return vector product
/// \details PMULL_11() multiplies the high 64 bits of a and the high 64 bits of b.
inline uint64x2_t PMULL_11(const uint64x2_t a, const uint64x2_t b)
{
#if defined(_MSC_VER)
    const __n64 x = { vgetq_lane_u64(a, 1) };
    const __n64 y = { vgetq_lane_u64(b, 1) };
    return vmull_p64(x, y);
#elif defined(__GNUC__)
    uint64x2_t r;
    __asm__ ("pmull2   %0.1q, %1.2d, %2.2d \n\t"
            :"=w" (r) : "w" (a), "w" (b) );
    return r;
#else
    return (uint64x2_t)(vmull_p64(
        vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 1)));
#endif
}
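// A minimal sketch (not part of the original header) of how the four PMULL
// variants combine into a full 128x128 -> 256-bit carry-less multiply, the
// schoolbook step behind GHASH-style hashing. The helper name CLMUL128 and
// the variable names are illustrative.
//
//   inline void CLMUL128(uint64x2_t a, uint64x2_t b,
//                        uint64x2_t& lo, uint64x2_t& hi)
//   {
//       const uint64x2_t r0  = PMULL_00(a, b);      // a[0] * b[0]
//       const uint64x2_t r1  = PMULL_01(a, b);      // a[0] * b[1]
//       const uint64x2_t r2  = PMULL_10(a, b);      // a[1] * b[0]
//       const uint64x2_t r3  = PMULL_11(a, b);      // a[1] * b[1]
//       const uint64x2_t mid = veorq_u64(r1, r2);   // middle terms
//       const uint64x2_t z   = vdupq_n_u64(0);
//       lo = veorq_u64(r0, vextq_u64(z, mid, 1));   // r0 ^ (mid << 64)
//       hi = veorq_u64(r3, vextq_u64(mid, z, 1));   // r3 ^ (mid >> 64)
//   }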
/// \brief Polynomial multiplication
/// \param a the first value
/// \param b the second value
/// \return vector product
/// \details PMULL() multiplies the low 64 bits of a and b, like PMULL_00().
inline uint64x2_t PMULL(const uint64x2_t a, const uint64x2_t b)
{
#if defined(_MSC_VER)
    const __n64 x = { vgetq_lane_u64(a, 0) };
    const __n64 y = { vgetq_lane_u64(b, 0) };
    return vmull_p64(x, y);
#elif defined(__GNUC__)
    uint64x2_t r;
    __asm__ ("pmull    %0.1q, %1.1d, %2.1d \n\t"
            :"=w" (r) : "w" (a), "w" (b) );
    return r;
#else
    return (uint64x2_t)(vmull_p64(
        vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 0)));
#endif
}
/// \brief Polynomial multiplication
/// \param a the first value
/// \param b the second value
/// \return vector product
/// \details PMULL_HIGH() multiplies the high 64 bits of a and b, like PMULL_11().
inline uint64x2_t PMULL_HIGH(const uint64x2_t a, const uint64x2_t b)
{
#if defined(_MSC_VER)
    const __n64 x = { vgetq_lane_u64(a, 1) };
    const __n64 y = { vgetq_lane_u64(b, 1) };
    return vmull_p64(x, y);
#elif defined(__GNUC__)
    uint64x2_t r;
    __asm__ ("pmull2   %0.1q, %1.2d, %2.2d \n\t"
            :"=w" (r) : "w" (a), "w" (b) );
    return r;
#else
    return (uint64x2_t)(vmull_p64(
        vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 1)));
#endif
}
/// \brief Vector extraction
/// \param a the first value
/// \param b the second value
/// \param c the byte count
/// \return vector of bytes drawn from the concatenation of a and b, starting at byte c
inline uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b, unsigned int c)
{
#if defined(_MSC_VER)
    return vreinterpretq_u64_u8(vextq_u8(
        vreinterpretq_u8_u64(a), vreinterpretq_u8_u64(b), c));
#else
    uint64x2_t r;
    __asm__ ("ext   %0.16b, %1.16b, %2.16b, %3   \n\t"
            :"=w" (r) : "w" (a), "w" (b), "I" (c) );
    return r;
#endif
}
/// \brief Vector extraction
/// \tparam C the byte count
/// \param a the first value
/// \param b the second value
/// \return vector of bytes drawn from the concatenation of a and b, starting at byte C
template <unsigned int C>
inline uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b)
{
#if defined(_MSC_VER)
    return vreinterpretq_u64_u8(vextq_u8(
        vreinterpretq_u8_u64(a), vreinterpretq_u8_u64(b), C));
#else
    uint64x2_t r;
    __asm__ ("ext   %0.16b, %1.16b, %2.16b, %3   \n\t"
            :"=w" (r) : "w" (a), "w" (b), "I" (C) );
    return r;
#endif
}
#endif  // CRYPTOPP_ARM_PMULL_AVAILABLE

#if CRYPTOPP_ARM_SHA3_AVAILABLE || defined(CRYPTOPP_DOXYGEN_PROCESSING)

/// \brief Three-way XOR
/// \param a the first value
/// \param b the second value
/// \param c the third value
/// \return a XOR b XOR c
inline uint64x2_t VEOR3(uint64x2_t a, uint64x2_t b, uint64x2_t c)
{
#if defined(_MSC_VER)
    return veor3q_u64(a, b, c);
#else
    uint64x2_t r;
    __asm__ ("eor3   %0.16b, %1.16b, %2.16b, %3.16b   \n\t"
            :"=w" (r) : "w" (a), "w" (b), "w" (c));
    return r;
#endif
}
/// \brief XOR and rotate
/// \param a the first value
/// \param b the second value
/// \param imm6 the rotate amount
/// \return (a XOR b) rotated right by imm6
inline uint64x2_t VXAR(uint64x2_t a, uint64x2_t b, const int imm6)
{
#if defined(_MSC_VER)
    return vxarq_u64(a, b, imm6);
#else
    uint64x2_t r;
    __asm__ ("xar   %0.2d, %1.2d, %2.2d, %3   \n\t"
            :"=w" (r) : "w" (a), "w" (b), "I" (imm6));
    return r;
#endif
}
/// \brief XOR and rotate
/// \tparam C the rotate amount
/// \param a the first value
/// \param b the second value
/// \return (a XOR b) rotated right by C
template <unsigned int C>
inline uint64x2_t VXAR(uint64x2_t a, uint64x2_t b)
{
#if defined(_MSC_VER)
    return vxarq_u64(a, b, C);
#else
    uint64x2_t r;
    __asm__ ("xar   %0.2d, %1.2d, %2.2d, %3   \n\t"
            :"=w" (r) : "w" (a), "w" (b), "I" (C));
    return r;
#endif
}
/// \brief XOR and rotate
/// \param a the first value
/// \param b the second value
/// \return a XOR (b rotated left by 1)
inline uint64x2_t VRAX1(uint64x2_t a, uint64x2_t b)
{
#if defined(_MSC_VER)
    return vrax1q_u64(a, b);
#else
    uint64x2_t r;
    __asm__ ("rax1   %0.2d, %1.2d, %2.2d   \n\t"
            :"=w" (r) : "w" (a), "w" (b));
    return r;
#endif
}
#endif  // CRYPTOPP_ARM_SHA3_AVAILABLE

#endif  // CRYPTOPP_ARM_SIMD_H