source: trunk/src-cryptopp/blake2.cpp

Last change on this file was e230cb0, checked in by David Stainton <dstainton415@…>, at 2016-10-12T13:27:29Z

Add cryptopp from tag CRYPTOPP_5_6_5

1// blake2.cpp - written and placed in the public domain by Jeffrey Walton and Zooko
2//              Wilcox-O'Hearn. Copyright assigned to the Crypto++ project.
3//              Based on Aumasson, Neves, Wilcox-O'Hearn and Winnerlein's reference BLAKE2
4//              implementation at http://github.com/BLAKE2/BLAKE2.
5
6#include "pch.h"
7#include "config.h"
8#include "cryptlib.h"
9#include "argnames.h"
10#include "algparam.h"
11#include "blake2.h"
12#include "cpu.h"
13
14NAMESPACE_BEGIN(CryptoPP)
15
16// Uncomment for benchmarking C++ against SSE2 or NEON
17// #undef CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE
18// #undef CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE
19
20// Apple Clang 6.0/Clang 3.5 does not have SSSE3 intrinsics
21//   http://llvm.org/bugs/show_bug.cgi?id=20213
22#if (defined(CRYPTOPP_APPLE_CLANG_VERSION) && (CRYPTOPP_APPLE_CLANG_VERSION <= 60000)) || (defined(CRYPTOPP_LLVM_CLANG_VERSION) && (CRYPTOPP_LLVM_CLANG_VERSION <= 30500))
23# undef CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE
24#endif
25
26// Sun Studio 12.3 and earlier lack SSE2's _mm_set_epi64x. Win32 lacks _mm_set_epi64x (Win64 supplies it except for VS2008).
27// Also see http://stackoverflow.com/a/38547909/608639
28#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE && ((__SUNPRO_CC >= 0x5100 && __SUNPRO_CC < 0x5130) || (_MSC_VER >= 1200 && _MSC_VER < 1600) || (defined(_M_IX86) && _MSC_VER >= 1600))
29inline __m128i _mm_set_epi64x(const word64 a, const word64 b)
30{
31        const word64 t[2] = {b,a}; __m128i r;
32        memcpy(&r, &t, sizeof(r));
33        return r;
34}
35#endif
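// Illustrative note (editorial, not part of the original file): the shim above
// matches the intrinsic's lane order, where the second argument lands in the
// low 64-bit lane. On little-endian x86, t[2] = {b,a} places b in bytes 0-7
// and a in bytes 8-15, which is exactly what _mm_set_epi64x(a, b) produces.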
36
37// C/C++ implementation
38static void BLAKE2_CXX_Compress32(const byte* input, BLAKE2_State<word32, false>& state);
39static void BLAKE2_CXX_Compress64(const byte* input, BLAKE2_State<word64, true>& state);
40
41// Also see http://github.com/weidai11/cryptopp/issues/247 for singling out SunCC 5.12
42#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
43static void BLAKE2_SSE2_Compress32(const byte* input, BLAKE2_State<word32, false>& state);
44# if (__SUNPRO_CC != 0x5120)
45static void BLAKE2_SSE2_Compress64(const byte* input, BLAKE2_State<word64, true>& state);
46# endif
47#endif
48
49#if CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE
50static void BLAKE2_SSE4_Compress32(const byte* input, BLAKE2_State<word32, false>& state);
51static void BLAKE2_SSE4_Compress64(const byte* input, BLAKE2_State<word64, true>& state);
52#endif
53
54#if CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE
55static void BLAKE2_NEON_Compress32(const byte* input, BLAKE2_State<word32, false>& state);
56static void BLAKE2_NEON_Compress64(const byte* input, BLAKE2_State<word64, true>& state);
57#endif
58
59#ifndef CRYPTOPP_DOXYGEN_PROCESSING
60
61// IV and Sigma are a better fit as part of BLAKE2_Base, but that places
62//   the constants out of reach for the NEON, SSE2 and SSE4 implementations.
63template<bool T_64bit>
64struct CRYPTOPP_NO_VTABLE BLAKE2_IV {};
65
66//! \brief BLAKE2s initialization vector specialization
67template<>
68struct CRYPTOPP_NO_VTABLE BLAKE2_IV<false>
69{
70        CRYPTOPP_CONSTANT(IVSIZE = 8)
71        // Always align for NEON and SSE
72        CRYPTOPP_ALIGN_DATA(16) static const word32 iv[8];
73};
74
75CRYPTOPP_ALIGN_DATA(16)
76const word32 BLAKE2_IV<false>::iv[8] = {
77        0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL,
78        0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL
79};
80
81#define BLAKE2S_IV(n) BLAKE2_IV<false>::iv[n]
82
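//! \brief BLAKE2b initialization vector specialization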
83template<>
84struct CRYPTOPP_NO_VTABLE BLAKE2_IV<true>
85{
86        CRYPTOPP_CONSTANT(IVSIZE = 8)
87        // Always align for NEON and SSE
88        CRYPTOPP_ALIGN_DATA(16) static const word64 iv[8];
89};
90
91CRYPTOPP_ALIGN_DATA(16)
92const word64 BLAKE2_IV<true>::iv[8] = {
93        W64LIT(0x6a09e667f3bcc908), W64LIT(0xbb67ae8584caa73b),
94        W64LIT(0x3c6ef372fe94f82b), W64LIT(0xa54ff53a5f1d36f1),
95        W64LIT(0x510e527fade682d1), W64LIT(0x9b05688c2b3e6c1f),
96        W64LIT(0x1f83d9abfb41bd6b), W64LIT(0x5be0cd19137e2179)
97};
98
99#define BLAKE2B_IV(n) BLAKE2_IV<true>::iv[n]
100
101// IV and Sigma are a better fit as part of BLAKE2_Base, but that places
102//   the constants out of reach for the NEON, SSE2 and SSE4 implementations.
103template<bool T_64bit>
104struct CRYPTOPP_NO_VTABLE BLAKE2_Sigma {};
105
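//! \brief BLAKE2s sigma table specialization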
106template<>
107struct CRYPTOPP_NO_VTABLE BLAKE2_Sigma<false>
108{
109        // Always align for NEON and SSE
110        CRYPTOPP_ALIGN_DATA(16) static const byte sigma[10][16];
111};
112
113CRYPTOPP_ALIGN_DATA(16)
114const byte BLAKE2_Sigma<false>::sigma[10][16] = {
115        {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
116        { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
117        { 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 },
118        {  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 },
119        {  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 },
120        {  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 },
121        { 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 },
122        { 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 },
123        {  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 },
124        { 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0 },
125};
126
127//! \brief BLAKE2b sigma table specialization
128template<>
129struct CRYPTOPP_NO_VTABLE BLAKE2_Sigma<true>
130{
131        // Always align for NEON and SSE
132        CRYPTOPP_ALIGN_DATA(16) static const byte sigma[12][16];
133};
134
135CRYPTOPP_ALIGN_DATA(16)
136const byte BLAKE2_Sigma<true>::sigma[12][16] = {
137        {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
138        { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
139        { 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 },
140        {  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 },
141        {  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 },
142        {  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 },
143        { 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 },
144        { 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 },
145        {  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 },
146        { 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0 },
147        {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
148        { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 }
149};
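// Rows 10 and 11 above repeat rows 0 and 1: BLAKE2b runs 12 rounds, but the
// sigma schedule defines only 10 distinct permutations, so round r uses
// sigma[r % 10].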
150
151typedef void (*pfnCompress32)(const byte*, BLAKE2_State<word32, false>&);
152typedef void (*pfnCompress64)(const byte*, BLAKE2_State<word64, true>&);
153
154pfnCompress64 InitializeCompress64Fn()
155{
156#if CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE
157        if (HasSSE4())
158                return &BLAKE2_SSE4_Compress64;
159        else
160#endif
161#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
162# if (__SUNPRO_CC != 0x5120)
163        if (HasSSE2())
164                return &BLAKE2_SSE2_Compress64;
165        else
166# endif
167#endif
168#if CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE
169        if (HasNEON())
170                return &BLAKE2_NEON_Compress64;
171        else
172#endif
173        return &BLAKE2_CXX_Compress64;
174}
175
176pfnCompress32 InitializeCompress32Fn()
177{
178#if CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE
179        if (HasSSE4())
180                return &BLAKE2_SSE4_Compress32;
181        else
182#endif
183#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
184        if (HasSSE2())
185                return &BLAKE2_SSE2_Compress32;
186        else
187#endif
188#if CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE
189        if (HasNEON())
190                return &BLAKE2_NEON_Compress32;
191        else
192#endif
193        return &BLAKE2_CXX_Compress32;
194}
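// Note on the structure of the two initializers above: every conditionally
// compiled branch ends in a dangling "else", so whichever blocks survive the
// preprocessor still form a single if/else chain that falls through to the
// portable C++ routine. The Compress() specializations further below cache the
// selected pointer in a function-local static, so feature detection runs once.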
195
196#endif // CRYPTOPP_DOXYGEN_PROCESSING
197
198BLAKE2_ParameterBlock<false>::BLAKE2_ParameterBlock(size_t digestLen, size_t keyLen,
199                const byte* saltStr, size_t saltLen,
200                const byte* personalizationStr, size_t personalizationLen)
201{
202        // Avoid Coverity finding SIZEOF_MISMATCH/suspicious_sizeof
203        digestLength = (byte)digestLen;
204        keyLength = (byte)keyLen;
205        fanout = depth = 1;
206        nodeDepth = innerLength = 0;
207
208        memset(leafLength, 0x00, COUNTOF(leafLength));
209        memset(nodeOffset, 0x00, COUNTOF(nodeOffset));
210
211        if (saltStr && saltLen)
212        {
213                memcpy_s(salt, COUNTOF(salt), saltStr, saltLen);
214                const size_t rem = COUNTOF(salt) - saltLen;
215                const size_t off = COUNTOF(salt) - rem;
216                if (rem)
217                        memset(salt+off, 0x00, rem);
218        }
219        else
220        {
221                memset(salt, 0x00, COUNTOF(salt));
222        }
223
224        if (personalizationStr && personalizationLen)
225        {
226                memcpy_s(personalization, COUNTOF(personalization), personalizationStr, personalizationLen);
227                const size_t rem = COUNTOF(personalization) - personalizationLen;
228                const size_t off = COUNTOF(personalization) - rem;
229                if (rem)
230                        memset(personalization+off, 0x00, rem);
231        }
232        else
233        {
234                memset(personalization, 0x00, COUNTOF(personalization));
235        }
236}
237
238BLAKE2_ParameterBlock<true>::BLAKE2_ParameterBlock(size_t digestLen, size_t keyLen,
239                const byte* saltStr, size_t saltLen,
240                const byte* personalizationStr, size_t personalizationLen)
241{
242        // Avoid Coverity finding SIZEOF_MISMATCH/suspicious_sizeof
243        digestLength = (byte)digestLen;
244        keyLength = (byte)keyLen;
245        fanout = depth = 1;
246        nodeDepth = innerLength = 0;
247
248        memset(rfu, 0x00, COUNTOF(rfu));
249        memset(leafLength, 0x00, COUNTOF(leafLength));
250        memset(nodeOffset, 0x00, COUNTOF(nodeOffset));
251
252        if (saltStr && saltLen)
253        {
254                memcpy_s(salt, COUNTOF(salt), saltStr, saltLen);
255                const size_t rem = COUNTOF(salt) - saltLen;
256                const size_t off = COUNTOF(salt) - rem;
257                if (rem)
258                        memset(salt+off, 0x00, rem);
259        }
260        else
261        {
262                memset(salt, 0x00, COUNTOF(salt));
263        }
264
265        if (personalizationStr && personalizationLen)
266        {
267                memcpy_s(personalization, COUNTOF(personalization), personalizationStr, personalizationLen);
268                const size_t rem = COUNTOF(personalization) - personalizationLen;
269                const size_t off = COUNTOF(personalization) - rem;
270                if (rem)
271                        memset(personalization+off, 0x00, rem);
272        }
273        else
274        {
275                memset(personalization, 0x00, COUNTOF(personalization));
276        }
277}
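// The salt and personalization handling above repeats one idiom: copy the
// caller's bytes into a fixed-size field, then zero-fill the remainder. A
// minimal sketch of that idiom as a standalone helper (hypothetical name,
// not part of this file):
//
//   static void CopyAndZeroPad(byte* dst, size_t dstLen, const byte* src, size_t srcLen)
//   {
//       memcpy_s(dst, dstLen, src, srcLen);   // callers ensure srcLen <= dstLen
//       if (srcLen < dstLen)
//           memset(dst + srcLen, 0x00, dstLen - srcLen);
//   }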
278
279template <class W, bool T_64bit>
280void BLAKE2_Base<W, T_64bit>::UncheckedSetKey(const byte *key, unsigned int length, const CryptoPP::NameValuePairs& params)
281{
282        if (key && length)
283        {
284                AlignedSecByteBlock temp(BLOCKSIZE);
285                memcpy_s(temp, BLOCKSIZE, key, length);
286
287                const size_t rem = BLOCKSIZE - length;
288                if (rem)
289                        memset(temp+length, 0x00, rem);
290
291                m_key.swap(temp);
292        }
293        else
294        {
295                m_key.resize(0);
296        }
297
298#if defined(__COVERITY__)
299        // Avoid Coverity finding SIZEOF_MISMATCH/suspicious_sizeof
300        ParameterBlock& block = *m_block.data();
301        memset(m_block.data(), 0x00, sizeof(ParameterBlock));
302#else
303        // Set Head bytes; Tail bytes are set below
304        ParameterBlock& block = *m_block.data();
305        memset(m_block.data(), 0x00, T_64bit ? 32 : 16);
306#endif
307
308        block.keyLength = (byte)length;
309        block.digestLength = (byte)params.GetIntValueWithDefault(Name::DigestSize(), DIGESTSIZE);
310        block.fanout = block.depth = 1;
311
312        ConstByteArrayParameter t;
313        if (params.GetValue(Name::Salt(), t) && t.begin() && t.size())
314        {
315                memcpy_s(block.salt, COUNTOF(block.salt), t.begin(), t.size());
316                const size_t rem = COUNTOF(block.salt) - t.size();
317                const size_t off = COUNTOF(block.salt) - rem;
318                if (rem)
319                        memset(block.salt+off, 0x00, rem);
320        }
321        else
322        {
323                memset(block.salt, 0x00, COUNTOF(block.salt));
324        }
325
326        if (params.GetValue(Name::Personalization(), t) && t.begin() && t.size())
327        {
328                memcpy_s(block.personalization, COUNTOF(block.personalization), t.begin(), t.size());
329                const size_t rem = COUNTOF(block.personalization) - t.size();
330                const size_t off = COUNTOF(block.personalization) - rem;
331                if (rem)
332                        memset(block.personalization+off, 0x00, rem);
333        }
334        else
335        {
336                memset(block.personalization, 0x00, COUNTOF(block.personalization));
337        }
338}
339
340template <class W, bool T_64bit>
341BLAKE2_Base<W, T_64bit>::BLAKE2_Base() : m_state(1), m_block(1), m_digestSize(DIGESTSIZE), m_treeMode(false)
342{
343        UncheckedSetKey(NULL, 0, g_nullNameValuePairs);
344        Restart();
345}
346
347template <class W, bool T_64bit>
348BLAKE2_Base<W, T_64bit>::BLAKE2_Base(bool treeMode, unsigned int digestSize) : m_state(1), m_block(1), m_digestSize(digestSize), m_treeMode(treeMode)
349{
350        CRYPTOPP_ASSERT(digestSize <= DIGESTSIZE);
351
352        UncheckedSetKey(NULL, 0, g_nullNameValuePairs);
353        Restart();
354}
355
356template <class W, bool T_64bit>
357BLAKE2_Base<W, T_64bit>::BLAKE2_Base(const byte *key, size_t keyLength, const byte* salt, size_t saltLength,
358        const byte* personalization, size_t personalizationLength, bool treeMode, unsigned int digestSize)
359        : m_state(1), m_block(1), m_digestSize(digestSize), m_treeMode(treeMode)
360{
361        CRYPTOPP_ASSERT(keyLength <= MAX_KEYLENGTH);
362        CRYPTOPP_ASSERT(digestSize <= DIGESTSIZE);
363        CRYPTOPP_ASSERT(saltLength <= SALTSIZE);
364        CRYPTOPP_ASSERT(personalizationLength <= PERSONALIZATIONSIZE);
365
366        UncheckedSetKey(key, static_cast<unsigned int>(keyLength), MakeParameters(Name::DigestSize(),(int)digestSize)(Name::TreeMode(),treeMode, false)
367                (Name::Salt(), ConstByteArrayParameter(salt, saltLength))(Name::Personalization(), ConstByteArrayParameter(personalization, personalizationLength)));
368        Restart();
369}
370
371template <class W, bool T_64bit>
372void BLAKE2_Base<W, T_64bit>::Restart()
373{
374        static const W zero[2] = {0,0};
375        Restart(*m_block.data(), zero);
376}
377
378template <class W, bool T_64bit>
379void BLAKE2_Base<W, T_64bit>::Restart(const BLAKE2_ParameterBlock<T_64bit>& block, const W counter[2])
380{
381        // We accept a parameter block argument so callers can customize the state.
382        // Skip the copy when the caller passes back our own block.
383        if (&block != m_block.data())
384        {
385                memcpy_s(m_block.data(), sizeof(ParameterBlock), &block, sizeof(ParameterBlock));
386                m_block.data()->digestLength = (byte)m_digestSize;
387                m_block.data()->keyLength = (byte)m_key.size();
388        }
389
390        State& state = *m_state.data();
391        state.t[0] = state.t[1] = 0, state.f[0] = state.f[1] = 0, state.length = 0;
392
393        if (counter != NULL)
394        {
395                state.t[0] = counter[0];
396                state.t[1] = counter[1];
397        }
398
399        PutBlock<W, LittleEndian, true> put(m_block.data(), &state.h[0]);
400        put(BLAKE2_IV<T_64bit>::iv[0])(BLAKE2_IV<T_64bit>::iv[1])(BLAKE2_IV<T_64bit>::iv[2])(BLAKE2_IV<T_64bit>::iv[3]);
401        put(BLAKE2_IV<T_64bit>::iv[4])(BLAKE2_IV<T_64bit>::iv[5])(BLAKE2_IV<T_64bit>::iv[6])(BLAKE2_IV<T_64bit>::iv[7]);
402
403        // When BLAKE2 is keyed, the input stream is simply {key||message}. Key it
404        // during Restart to avoid FirstPut and friends. Key size == 0 means no key.
405        if (m_key.size())
406                Update(m_key, m_key.size());
407}
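// Usage sketch (illustrative, assuming the public BLAKE2b interface declared
// in blake2.h): because the padded key block is hashed first, a keyed BLAKE2
// acts as a MAC with no extra construction.
//
//   BLAKE2b mac(key, keyLen);                  // keyed constructor
//   mac.Update(message, messageLen);
//   byte digest[32];
//   mac.TruncatedFinal(digest, sizeof(digest));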
408
409template <class W, bool T_64bit>
410void BLAKE2_Base<W, T_64bit>::Update(const byte *input, size_t length)
411{
412        State& state = *m_state.data();
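        // Note the strict comparisons below: a block that exactly fills the
        // buffer is not compressed here but kept for TruncatedFinal, which
        // must process the final block with the finalization flag set.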
413        if (state.length + length > BLOCKSIZE)
414        {
415                // Complete current block
416                const size_t fill = BLOCKSIZE - state.length;
417                memcpy_s(&state.buffer[state.length], fill, input, fill);
418
419                IncrementCounter();
420                Compress(state.buffer);
421                state.length = 0;
422
423                length -= fill, input += fill;
424
425                // Compress in-place to avoid copies
426                while (length > BLOCKSIZE)
427                {
428                        IncrementCounter();
429                        Compress(input);
430                        length -= BLOCKSIZE, input += BLOCKSIZE;
431                }
432        }
433
434        // Copy tail bytes
435        if (input && length)
436        {
437                CRYPTOPP_ASSERT(length <= BLOCKSIZE - state.length);
438                memcpy_s(&state.buffer[state.length], length, input, length);
439                state.length += static_cast<unsigned int>(length);
440        }
441}
442
443template <class W, bool T_64bit>
444void BLAKE2_Base<W, T_64bit>::TruncatedFinal(byte *hash, size_t size)
445{
446        this->ThrowIfInvalidTruncatedSize(size);
447
448        // Set last block unconditionally
449        State& state = *m_state.data();
450        state.f[0] = static_cast<W>(-1);
451
452        // Set last node if tree mode
453        if (m_treeMode)
454                state.f[1] = static_cast<W>(-1);
455
456        // Increment counter for tail bytes only
457        IncrementCounter(state.length);
458
459        memset(state.buffer + state.length, 0x00, BLOCKSIZE - state.length);
460        Compress(state.buffer);
461
462        // Copy to caller buffer
463        memcpy_s(hash, size, &state.h[0], size);
464
465        Restart();
466}
467
468template <class W, bool T_64bit>
469void BLAKE2_Base<W, T_64bit>::IncrementCounter(size_t count)
470{
471        State& state = *m_state.data();
472        state.t[0] += static_cast<W>(count);
473        state.t[1] += !!(state.t[0] < count);
474}
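// The last statement above propagates a carry into the high word of the
// counter: after the wrapping addition, t[0] < count holds exactly when t[0]
// overflowed. Worked example: with t[0] == 2^64 - 1 (BLAKE2b) and count == 1,
// the sum wraps to 0, 0 < 1 is true, and t[1] is incremented.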
475
476template <>
477void BLAKE2_Base<word64, true>::Compress(const byte *input)
478{
479        // Selects the most advanced implmentation at runtime
480        static const pfnCompress64 s_pfn = InitializeCompress64Fn();
481        s_pfn(input, *m_state.data());
482}
483
484template <>
485void BLAKE2_Base<word32, false>::Compress(const byte *input)
486{
487        // Selects the most advanced implmentation at runtime
488        static const pfnCompress32 s_pfn = InitializeCompress32Fn();
489        s_pfn(input, *m_state.data());
490}
491
492void BLAKE2_CXX_Compress64(const byte* input, BLAKE2_State<word64, true>& state)
493{
494        #undef BLAKE2_G
495        #undef BLAKE2_ROUND
496
497        #define BLAKE2_G(r,i,a,b,c,d) \
498          do { \
499            a = a + b + m[BLAKE2_Sigma<true>::sigma[r][2*i+0]]; \
500            d = rotrVariable<word64>(d ^ a, 32); \
501            c = c + d; \
502            b = rotrVariable<word64>(b ^ c, 24); \
503            a = a + b + m[BLAKE2_Sigma<true>::sigma[r][2*i+1]]; \
504            d = rotrVariable<word64>(d ^ a, 16); \
505            c = c + d; \
506            b = rotrVariable<word64>(b ^ c, 63); \
507          } while(0)
508
509        #define BLAKE2_ROUND(r)  \
510          do { \
511            BLAKE2_G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
512            BLAKE2_G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \
513            BLAKE2_G(r,2,v[ 2],v[ 6],v[10],v[14]); \
514            BLAKE2_G(r,3,v[ 3],v[ 7],v[11],v[15]); \
515            BLAKE2_G(r,4,v[ 0],v[ 5],v[10],v[15]); \
516            BLAKE2_G(r,5,v[ 1],v[ 6],v[11],v[12]); \
517            BLAKE2_G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
518            BLAKE2_G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \
519          } while(0)
520
521        word64 m[16], v[16];
522
523        GetBlock<word64, LittleEndian, true> get1(input);
524        get1(m[0])(m[1])(m[2])(m[3])(m[4])(m[5])(m[6])(m[7])(m[8])(m[9])(m[10])(m[11])(m[12])(m[13])(m[14])(m[15]);
525
526        GetBlock<word64, LittleEndian, true> get2(&state.h[0]);
527        get2(v[0])(v[1])(v[2])(v[3])(v[4])(v[5])(v[6])(v[7]);
528
529        v[ 8] = BLAKE2B_IV(0);
530        v[ 9] = BLAKE2B_IV(1);
531        v[10] = BLAKE2B_IV(2);
532        v[11] = BLAKE2B_IV(3);
533        v[12] = state.t[0] ^ BLAKE2B_IV(4);
534        v[13] = state.t[1] ^ BLAKE2B_IV(5);
535        v[14] = state.f[0] ^ BLAKE2B_IV(6);
536        v[15] = state.f[1] ^ BLAKE2B_IV(7);
537
538        BLAKE2_ROUND( 0 );
539        BLAKE2_ROUND( 1 );
540        BLAKE2_ROUND( 2 );
541        BLAKE2_ROUND( 3 );
542        BLAKE2_ROUND( 4 );
543        BLAKE2_ROUND( 5 );
544        BLAKE2_ROUND( 6 );
545        BLAKE2_ROUND( 7 );
546        BLAKE2_ROUND( 8 );
547        BLAKE2_ROUND( 9 );
548        BLAKE2_ROUND( 10 );
549        BLAKE2_ROUND( 11 );
550
551        for(unsigned int i = 0; i < 8; ++i)
552                state.h[i] = state.h[i] ^ ConditionalByteReverse(LittleEndian::ToEnum(), v[i] ^ v[i + 8]);
553}
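// For reference: BLAKE2_G above is the BLAKE2 G mixing function with the
// BLAKE2b rotation constants (32, 24, 16, 63) over 12 rounds; the 32-bit
// version below uses the BLAKE2s constants (16, 12, 8, 7) over 10 rounds.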
554
555void BLAKE2_CXX_Compress32(const byte* input, BLAKE2_State<word32, false>& state)
556{
557        #undef BLAKE2_G
558        #undef BLAKE2_ROUND
559
560        #define BLAKE2_G(r,i,a,b,c,d) \
561          do { \
562            a = a + b + m[BLAKE2_Sigma<false>::sigma[r][2*i+0]]; \
563            d = rotrVariable<word32>(d ^ a, 16); \
564            c = c + d; \
565            b = rotrVariable<word32>(b ^ c, 12); \
566            a = a + b + m[BLAKE2_Sigma<false>::sigma[r][2*i+1]]; \
567            d = rotrVariable<word32>(d ^ a, 8); \
568            c = c + d; \
569            b = rotrVariable<word32>(b ^ c, 7); \
570          } while(0)
571
572        #define BLAKE2_ROUND(r)  \
573          do { \
574            BLAKE2_G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
575            BLAKE2_G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \
576            BLAKE2_G(r,2,v[ 2],v[ 6],v[10],v[14]); \
577            BLAKE2_G(r,3,v[ 3],v[ 7],v[11],v[15]); \
578            BLAKE2_G(r,4,v[ 0],v[ 5],v[10],v[15]); \
579            BLAKE2_G(r,5,v[ 1],v[ 6],v[11],v[12]); \
580            BLAKE2_G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
581            BLAKE2_G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \
582          } while(0)
583
584        word32 m[16], v[16];
585
586        GetBlock<word32, LittleEndian, true> get1(input);
587        get1(m[0])(m[1])(m[2])(m[3])(m[4])(m[5])(m[6])(m[7])(m[8])(m[9])(m[10])(m[11])(m[12])(m[13])(m[14])(m[15]);
588
589        GetBlock<word32, LittleEndian, true> get2(&state.h[0]);
590        get2(v[0])(v[1])(v[2])(v[3])(v[4])(v[5])(v[6])(v[7]);
591
592        v[ 8] = BLAKE2S_IV(0);
593        v[ 9] = BLAKE2S_IV(1);
594        v[10] = BLAKE2S_IV(2);
595        v[11] = BLAKE2S_IV(3);
596        v[12] = state.t[0] ^ BLAKE2S_IV(4);
597        v[13] = state.t[1] ^ BLAKE2S_IV(5);
598        v[14] = state.f[0] ^ BLAKE2S_IV(6);
599        v[15] = state.f[1] ^ BLAKE2S_IV(7);
600
601        BLAKE2_ROUND( 0 );
602        BLAKE2_ROUND( 1 );
603        BLAKE2_ROUND( 2 );
604        BLAKE2_ROUND( 3 );
605        BLAKE2_ROUND( 4 );
606        BLAKE2_ROUND( 5 );
607        BLAKE2_ROUND( 6 );
608        BLAKE2_ROUND( 7 );
609        BLAKE2_ROUND( 8 );
610        BLAKE2_ROUND( 9 );
611
612        for(unsigned int i = 0; i < 8; ++i)
613                state.h[i] = state.h[i] ^ ConditionalByteReverse(LittleEndian::ToEnum(), v[i] ^ v[i + 8]);
614}
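// The closing loop in each compression function above is the feed-forward:
// both halves of the working vector fold back into the chaining value,
// h[i] ^= v[i] ^ v[i+8], and ConditionalByteReverse keeps h in little-endian
// byte order on big-endian machines.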
615
616#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
617static void BLAKE2_SSE2_Compress32(const byte* input, BLAKE2_State<word32, false>& state)
618{
619  word32 m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15;
620  GetBlock<word32, LittleEndian, true> get(input);
621  get(m0)(m1)(m2)(m3)(m4)(m5)(m6)(m7)(m8)(m9)(m10)(m11)(m12)(m13)(m14)(m15);
622
623  __m128i row1,row2,row3,row4;
624  __m128i buf1,buf2,buf3,buf4;
625  __m128i ff0,ff1;
626
627  row1 = ff0 = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[0]));
628  row2 = ff1 = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[4]));
629  row3 = _mm_setr_epi32(BLAKE2S_IV(0),BLAKE2S_IV(1),BLAKE2S_IV(2),BLAKE2S_IV(3));
630  row4 = _mm_xor_si128(_mm_setr_epi32(BLAKE2S_IV(4),BLAKE2S_IV(5),BLAKE2S_IV(6),BLAKE2S_IV(7)),_mm_loadu_si128((const __m128i*)(const void*)(&state.t[0])));
631  buf1 = _mm_set_epi32(m6,m4,m2,m0);
632  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2);
633  row4 = _mm_xor_si128(row4,row1);
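  // SSE2 has no vector rotate; each shift-and-XOR pair below emulates one,
  // e.g. (x >> 16) ^ (x << 16) == rotr32(x, 16), likewise for 12, 8 and 7.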
634  row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
635  row3 = _mm_add_epi32(row3,row4);
636  row2 = _mm_xor_si128(row2,row3);
637  row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
638
639  buf2 = _mm_set_epi32(m7,m5,m3,m1);
640  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2);
641  row4 = _mm_xor_si128(row4,row1);
642  row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
643  row3 = _mm_add_epi32(row3,row4);
644  row2 = _mm_xor_si128(row2,row3);
645  row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
646
647  row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3));
648  row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
649  row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1));
650
651  buf3 = _mm_set_epi32(m14,m12,m10,m8);
652  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2);
653  row4 = _mm_xor_si128(row4,row1);
654  row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
655  row3 = _mm_add_epi32(row3,row4);
656  row2 = _mm_xor_si128(row2,row3);
657  row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
658
659  buf4 = _mm_set_epi32(m15,m13,m11,m9);
660  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2);
661  row4 = _mm_xor_si128(row4,row1);
662  row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
663  row3 = _mm_add_epi32(row3,row4);
664  row2 = _mm_xor_si128(row2,row3);
665  row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
666
667  row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1));
668  row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
669  row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3));
670
671  buf1 = _mm_set_epi32(m13,m9,m4,m14);
672  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2);
673  row4 = _mm_xor_si128(row4,row1);
674  row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
675  row3 = _mm_add_epi32(row3,row4);
676  row2 = _mm_xor_si128(row2,row3);
677  row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
678
679  buf2 = _mm_set_epi32(m6,m15,m8,m10);
680  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2);
681  row4 = _mm_xor_si128(row4,row1);
682  row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
683  row3 = _mm_add_epi32(row3,row4);
684  row2 = _mm_xor_si128(row2,row3);
685  row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
686
687  row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3));
688  row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
689  row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1));
690
691  buf3 = _mm_set_epi32(m5,m11,m0,m1);
692  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2);
693  row4 = _mm_xor_si128(row4,row1);
694  row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
695  row3 = _mm_add_epi32(row3,row4);
696  row2 = _mm_xor_si128(row2,row3);
697  row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
698
699  buf4 = _mm_set_epi32(m3,m7,m2,m12);
700  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2);
701  row4 = _mm_xor_si128(row4,row1);
702  row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
703  row3 = _mm_add_epi32(row3,row4);
704  row2 = _mm_xor_si128(row2,row3);
705  row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
706
707  row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1));
708  row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
709  row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3));
710
711  buf1 = _mm_set_epi32(m15,m5,m12,m11);
712  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2);
713  row4 = _mm_xor_si128(row4,row1);
714  row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
715  row3 = _mm_add_epi32(row3,row4);
716  row2 = _mm_xor_si128(row2,row3);
717  row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
718
719  buf2 = _mm_set_epi32(m13,m2,m0,m8);
720  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2);
721  row4 = _mm_xor_si128(row4,row1);
722  row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
723  row3 = _mm_add_epi32(row3,row4);
724  row2 = _mm_xor_si128(row2,row3);
725  row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
726
727  row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3));
728  row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
729  row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1));
730
731  buf3 = _mm_set_epi32(m9,m7,m3,m10);
732  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2);
733  row4 = _mm_xor_si128(row4,row1);
734  row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
735  row3 = _mm_add_epi32(row3,row4);
736  row2 = _mm_xor_si128(row2,row3);
737  row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
738
739  buf4 = _mm_set_epi32(m4,m1,m6,m14);
740  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2);
741  row4 = _mm_xor_si128(row4,row1);
742  row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
743  row3 = _mm_add_epi32(row3,row4);
744  row2 = _mm_xor_si128(row2,row3);
745  row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
746
747  row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1));
748  row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
749  row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3));
750
751  buf1 = _mm_set_epi32(m11,m13,m3,m7);
752  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2);
753  row4 = _mm_xor_si128(row4,row1);
754  row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
755  row3 = _mm_add_epi32(row3,row4);
756  row2 = _mm_xor_si128(row2,row3);
757  row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
758
759  buf2 = _mm_set_epi32(m14,m12,m1,m9);
760  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2);
761  row4 = _mm_xor_si128(row4,row1);
762  row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
763  row3 = _mm_add_epi32(row3,row4);
764  row2 = _mm_xor_si128(row2,row3);
765  row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
766
767  row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3));
768  row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
769  row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1));
770
771  buf3 = _mm_set_epi32(m15,m4,m5,m2);
772  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2);
773  row4 = _mm_xor_si128(row4,row1);
774  row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
775  row3 = _mm_add_epi32(row3,row4);
776  row2 = _mm_xor_si128(row2,row3);
777  row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
778
779  buf4 = _mm_set_epi32(m8,m0,m10,m6);
780  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2);
781  row4 = _mm_xor_si128(row4,row1);
782  row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
783  row3 = _mm_add_epi32(row3,row4);
784  row2 = _mm_xor_si128(row2,row3);
785  row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
786
787  row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1));
788  row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
789  row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3));
790
791  buf1 = _mm_set_epi32(m10,m2,m5,m9);
792  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2);
793  row4 = _mm_xor_si128(row4,row1);
794  row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
795  row3 = _mm_add_epi32(row3,row4);
796  row2 = _mm_xor_si128(row2,row3);
797  row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
798
799  buf2 = _mm_set_epi32(m15,m4,m7,m0);
800  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2);
801  row4 = _mm_xor_si128(row4,row1);
802  row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
803  row3 = _mm_add_epi32(row3,row4);
804  row2 = _mm_xor_si128(row2,row3);
805  row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
806
807  row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3));
808  row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
809  row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1));
810
811  buf3 = _mm_set_epi32(m3,m6,m11,m14);
812  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2);
813  row4 = _mm_xor_si128(row4,row1);
814  row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
815  row3 = _mm_add_epi32(row3,row4);
816  row2 = _mm_xor_si128(row2,row3);
817  row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
818
819  buf4 = _mm_set_epi32(m13,m8,m12,m1);
820  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2);
821  row4 = _mm_xor_si128(row4,row1);
822  row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
823  row3 = _mm_add_epi32(row3,row4);
824  row2 = _mm_xor_si128(row2,row3);
825  row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
826
827  row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1));
828  row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
829  row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3));
830
831  buf1 = _mm_set_epi32(m8,m0,m6,m2);
832  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2);
833  row4 = _mm_xor_si128(row4,row1);
834  row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
835  row3 = _mm_add_epi32(row3,row4);
836  row2 = _mm_xor_si128(row2,row3);
837  row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
838
839  buf2 = _mm_set_epi32(m3,m11,m10,m12);
840  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2);
841  row4 = _mm_xor_si128(row4,row1);
842  row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
843  row3 = _mm_add_epi32(row3,row4);
844  row2 = _mm_xor_si128(row2,row3);
845  row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
846
847  row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3));
848  row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
849  row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1));
850
851  buf3 = _mm_set_epi32(m1,m15,m7,m4);
852  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2);
853  row4 = _mm_xor_si128(row4,row1);
854  row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
855  row3 = _mm_add_epi32(row3,row4);
856  row2 = _mm_xor_si128(row2,row3);
857  row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
858
859  buf4 = _mm_set_epi32(m9,m14,m5,m13);
860  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2);
861  row4 = _mm_xor_si128(row4,row1);
862  row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
863  row3 = _mm_add_epi32(row3,row4);
864  row2 = _mm_xor_si128(row2,row3);
865  row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
866
867  row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1));
868  row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
869  row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3));
870
871  buf1 = _mm_set_epi32(m4,m14,m1,m12);
872  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2);
873  row4 = _mm_xor_si128(row4,row1);
874  row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
875  row3 = _mm_add_epi32(row3,row4);
876  row2 = _mm_xor_si128(row2,row3);
877  row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
878
879  buf2 = _mm_set_epi32(m10,m13,m15,m5);
880  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2);
881  row4 = _mm_xor_si128(row4,row1);
882  row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
883  row3 = _mm_add_epi32(row3,row4);
884  row2 = _mm_xor_si128(row2,row3);
885  row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
886
887  row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3));
888  row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
889  row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1));
890
891  buf3 = _mm_set_epi32(m8,m9,m6,m0);
892  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2);
893  row4 = _mm_xor_si128(row4,row1);
894  row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
895  row3 = _mm_add_epi32(row3,row4);
896  row2 = _mm_xor_si128(row2,row3);
897  row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
898
899  buf4 = _mm_set_epi32(m11,m2,m3,m7);
900  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2);
901  row4 = _mm_xor_si128(row4,row1);
902  row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
903  row3 = _mm_add_epi32(row3,row4);
904  row2 = _mm_xor_si128(row2,row3);
905  row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
906
907  row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1));
908  row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
909  row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3));
910
911  buf1 = _mm_set_epi32(m3,m12,m7,m13);
912  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2);
913  row4 = _mm_xor_si128(row4,row1);
914  row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
915  row3 = _mm_add_epi32(row3,row4);
916  row2 = _mm_xor_si128(row2,row3);
917  row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
918
919  buf2 = _mm_set_epi32(m9,m1,m14,m11);
920  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2);
921  row4 = _mm_xor_si128(row4,row1);
922  row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
923  row3 = _mm_add_epi32(row3,row4);
924  row2 = _mm_xor_si128(row2,row3);
925  row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
926
927  row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3));
928  row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
929  row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1));
930
931  buf3 = _mm_set_epi32(m2,m8,m15,m5);
932  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2);
933  row4 = _mm_xor_si128(row4,row1);
934  row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
935  row3 = _mm_add_epi32(row3,row4);
936  row2 = _mm_xor_si128(row2,row3);
937  row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
938
939  buf4 = _mm_set_epi32(m10,m6,m4,m0);
940  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2);
941  row4 = _mm_xor_si128(row4,row1);
942  row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
943  row3 = _mm_add_epi32(row3,row4);
944  row2 = _mm_xor_si128(row2,row3);
945  row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
946
947  row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1));
948  row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
949  row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3));
950
951  buf1 = _mm_set_epi32(m0,m11,m14,m6);
952  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2);
953  row4 = _mm_xor_si128(row4,row1);
954  row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
955  row3 = _mm_add_epi32(row3,row4);
956  row2 = _mm_xor_si128(row2,row3);
957  row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
958
959  buf2 = _mm_set_epi32(m8,m3,m9,m15);
960  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2);
961  row4 = _mm_xor_si128(row4,row1);
962  row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
963  row3 = _mm_add_epi32(row3,row4);
964  row2 = _mm_xor_si128(row2,row3);
965  row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
966
967  row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3));
968  row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
969  row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1));
970
971  buf3 = _mm_set_epi32(m10,m1,m13,m12);
972  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2);
973  row4 = _mm_xor_si128(row4,row1);
974  row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
975  row3 = _mm_add_epi32(row3,row4);
976  row2 = _mm_xor_si128(row2,row3);
977  row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
978
979  buf4 = _mm_set_epi32(m5,m4,m7,m2);
980  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2);
981  row4 = _mm_xor_si128(row4,row1);
982  row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
983  row3 = _mm_add_epi32(row3,row4);
984  row2 = _mm_xor_si128(row2,row3);
985  row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
986
987  row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1));
988  row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
989  row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3));
990
991  buf1 = _mm_set_epi32(m1,m7,m8,m10);
992  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2);
993  row4 = _mm_xor_si128(row4,row1);
994  row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
995  row3 = _mm_add_epi32(row3,row4);
996  row2 = _mm_xor_si128(row2,row3);
997  row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
998
999  buf2 = _mm_set_epi32(m5,m6,m4,m2);
1000  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2);
1001  row4 = _mm_xor_si128(row4,row1);
1002  row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
1003  row3 = _mm_add_epi32(row3,row4);
1004  row2 = _mm_xor_si128(row2,row3);
1005  row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
1006
1007  row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3));
1008  row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
1009  row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1));
1010
1011  buf3 = _mm_set_epi32(m13,m3,m9,m15);
1012  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2);
1013  row4 = _mm_xor_si128(row4,row1);
1014  row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
1015  row3 = _mm_add_epi32(row3,row4);
1016  row2 = _mm_xor_si128(row2,row3);
1017  row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
1018
1019  buf4 = _mm_set_epi32(m0,m12,m14,m11);
1020  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2);
1021  row4 = _mm_xor_si128(row4,row1);
1022  row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
1023  row3 = _mm_add_epi32(row3,row4);
1024  row2 = _mm_xor_si128(row2,row3);
1025  row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
1026
1027  row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1));
1028  row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
1029  row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3));
1030
1031  _mm_storeu_si128((__m128i *)(void*)(&state.h[0]),_mm_xor_si128(ff0,_mm_xor_si128(row1,row3)));
1032  _mm_storeu_si128((__m128i *)(void*)(&state.h[4]),_mm_xor_si128(ff1,_mm_xor_si128(row2,row4)));
1033}
1034
1035# if (__SUNPRO_CC != 0x5120)
1036static void BLAKE2_SSE2_Compress64(const byte* input, BLAKE2_State<word64, true>& state)
1037{
1038  word64 m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15;
1039  GetBlock<word64, LittleEndian, true> get(input);
1040  get(m0)(m1)(m2)(m3)(m4)(m5)(m6)(m7)(m8)(m9)(m10)(m11)(m12)(m13)(m14)(m15);
1041
1042  __m128i row1l, row1h, row2l, row2h;
1043  __m128i row3l, row3h, row4l, row4h;
1044  __m128i b0, b1, t0, t1;
1045
1046  row1l = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[0]));
1047  row1h = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[2]));
1048  row2l = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[4]));
1049  row2h = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[6]));
1050  row3l = _mm_loadu_si128((const __m128i*)(const void*)(&BLAKE2B_IV(0)));
1051  row3h = _mm_loadu_si128((const __m128i*)(const void*)(&BLAKE2B_IV(2)));
1052  row4l = _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&BLAKE2B_IV(4))), _mm_loadu_si128((const __m128i*)(const void*)(&state.t[0])));
1053  row4h = _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&BLAKE2B_IV(6))), _mm_loadu_si128((const __m128i*)(const void*)(&state.f[0])));
1054
1055  b0 = _mm_set_epi64x(m2, m0);
1056  b1 = _mm_set_epi64x(m6, m4);
1057  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1058  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1059  row4l = _mm_xor_si128(row4l, row1l);
1060  row4h = _mm_xor_si128(row4h, row1h);
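  // As in the 32-bit path, shift-and-XOR pairs emulate the 64-bit rotates
  // (by 32, 24, 16 and 63) that SSE2 lacks.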
1061  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1062  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1063  row3l = _mm_add_epi64(row3l, row4l);
1064  row3h = _mm_add_epi64(row3h, row4h);
1065  row2l = _mm_xor_si128(row2l, row3l);
1066  row2h = _mm_xor_si128(row2h, row3h);
1067  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l, 40));
1068  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h, 40));
1069
1070  b0 = _mm_set_epi64x(m3, m1);
1071  b1 = _mm_set_epi64x(m7, m5);
1072  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1073  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1074  row4l = _mm_xor_si128(row4l, row1l);
1075  row4h = _mm_xor_si128(row4h, row1h);
1076  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1077  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1078  row3l = _mm_add_epi64(row3l, row4l);
1079  row3h = _mm_add_epi64(row3h, row4h);
1080  row2l = _mm_xor_si128(row2l, row3l);
1081  row2h = _mm_xor_si128(row2h, row3h);
1082  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1083  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1084
1085  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
1086  row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
1087  row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
1088  row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
1089  row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));
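  // The swap/unpack sequence above is the "diagonalize" step: rows 2-4 of the
  // conceptual 4x4 state are rotated by 1, 2 and 3 lanes. With each row split
  // across an l/h register pair, SSE2 can only express those lane rotations
  // with unpacklo/unpackhi shuffles on 64-bit lanes.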
1090
1091  b0 = _mm_set_epi64x(m10, m8);
1092  b1 = _mm_set_epi64x(m14, m12);
1093  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1094  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1095  row4l = _mm_xor_si128(row4l, row1l);
1096  row4h = _mm_xor_si128(row4h, row1h);
1097  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1098  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1099  row3l = _mm_add_epi64(row3l, row4l);
1100  row3h = _mm_add_epi64(row3h, row4h);
1101  row2l = _mm_xor_si128(row2l, row3l);
1102  row2h = _mm_xor_si128(row2h, row3h);
1103  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1104  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1105
1106  b0 = _mm_set_epi64x(m11, m9);
1107  b1 = _mm_set_epi64x(m15, m13);
1108  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1109  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1110  row4l = _mm_xor_si128(row4l, row1l);
1111  row4h = _mm_xor_si128(row4h, row1h);
1112  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1113  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1114  row3l = _mm_add_epi64(row3l, row4l);
1115  row3h = _mm_add_epi64(row3h, row4h);
1116  row2l = _mm_xor_si128(row2l, row3l);
1117  row2h = _mm_xor_si128(row2h, row3h);
1118  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1119  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1120
1121  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
1122  row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
1123  row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
1124  row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
1125  row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));
1126
1127  b0 = _mm_set_epi64x(m4, m14);
1128  b1 = _mm_set_epi64x(m13, m9);
1129  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1130  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1131  row4l = _mm_xor_si128(row4l, row1l);
1132  row4h = _mm_xor_si128(row4h, row1h);
1133  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1134  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1135  row3l = _mm_add_epi64(row3l, row4l);
1136  row3h = _mm_add_epi64(row3h, row4h);
1137  row2l = _mm_xor_si128(row2l, row3l);
1138  row2h = _mm_xor_si128(row2h, row3h);
1139  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1140  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1141
1142  b0 = _mm_set_epi64x(m8, m10);
1143  b1 = _mm_set_epi64x(m6, m15);
1144  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1145  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1146  row4l = _mm_xor_si128(row4l, row1l);
1147  row4h = _mm_xor_si128(row4h, row1h);
1148  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1149  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1150  row3l = _mm_add_epi64(row3l, row4l);
1151  row3h = _mm_add_epi64(row3h, row4h);
1152  row2l = _mm_xor_si128(row2l, row3l);
1153  row2h = _mm_xor_si128(row2h, row3h);
1154  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1155  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1156
1157  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
1158  row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
1159  row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
1160  row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
1161  row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));
1162  b0 = _mm_set_epi64x(m0, m1);
1163  b1 = _mm_set_epi64x(m5, m11);
1164  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1165  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1166  row4l = _mm_xor_si128(row4l, row1l);
1167  row4h = _mm_xor_si128(row4h, row1h);
1168  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1169  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1170  row3l = _mm_add_epi64(row3l, row4l);
1171  row3h = _mm_add_epi64(row3h, row4h);
1172  row2l = _mm_xor_si128(row2l, row3l);
1173  row2h = _mm_xor_si128(row2h, row3h);
1174  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1175  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1176
1177  b0 = _mm_set_epi64x(m2, m12);
1178  b1 = _mm_set_epi64x(m3, m7);
1179  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1180  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1181  row4l = _mm_xor_si128(row4l, row1l);
1182  row4h = _mm_xor_si128(row4h, row1h);
1183  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1184  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1185  row3l = _mm_add_epi64(row3l, row4l);
1186  row3h = _mm_add_epi64(row3h, row4h);
1187  row2l = _mm_xor_si128(row2l, row3l);
1188  row2h = _mm_xor_si128(row2h, row3h);
1189  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1190  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1191
1192  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
1193  row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
1194  row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
1195  row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
1196  row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));
1197
1198  b0 = _mm_set_epi64x(m12, m11);
1199  b1 = _mm_set_epi64x(m15, m5);
1200  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1201  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1202  row4l = _mm_xor_si128(row4l, row1l);
1203  row4h = _mm_xor_si128(row4h, row1h);
1204  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1205  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1206  row3l = _mm_add_epi64(row3l, row4l);
1207  row3h = _mm_add_epi64(row3h, row4h);
1208  row2l = _mm_xor_si128(row2l, row3l);
1209  row2h = _mm_xor_si128(row2h, row3h);
1210  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1211  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1212
1213  b0 = _mm_set_epi64x(m0, m8);
1214  b1 = _mm_set_epi64x(m13, m2);
1215  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1216  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1217  row4l = _mm_xor_si128(row4l, row1l);
1218  row4h = _mm_xor_si128(row4h, row1h);
1219  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1220  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1221  row3l = _mm_add_epi64(row3l, row4l);
1222  row3h = _mm_add_epi64(row3h, row4h);
1223  row2l = _mm_xor_si128(row2l, row3l);
1224  row2h = _mm_xor_si128(row2h, row3h);
1225  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1226  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1227
1228  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
1229  row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
1230  row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
1231  row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
1232  row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));
1233  b0 = _mm_set_epi64x(m3, m10);
1234  b1 = _mm_set_epi64x(m9, m7);
1235  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1236  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1237  row4l = _mm_xor_si128(row4l, row1l);
1238  row4h = _mm_xor_si128(row4h, row1h);
1239  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1240  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1241  row3l = _mm_add_epi64(row3l, row4l);
1242  row3h = _mm_add_epi64(row3h, row4h);
1243  row2l = _mm_xor_si128(row2l, row3l);
1244  row2h = _mm_xor_si128(row2h, row3h);
1245  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1246  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1247
1248  b0 = _mm_set_epi64x(m6, m14);
1249  b1 = _mm_set_epi64x(m4, m1);
1250  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1251  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1252  row4l = _mm_xor_si128(row4l, row1l);
1253  row4h = _mm_xor_si128(row4h, row1h);
1254  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1255  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1256  row3l = _mm_add_epi64(row3l, row4l);
1257  row3h = _mm_add_epi64(row3h, row4h);
1258  row2l = _mm_xor_si128(row2l, row3l);
1259  row2h = _mm_xor_si128(row2h, row3h);
1260  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1261  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1262
1263  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
1264  row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
1265  row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
1266  row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
1267  row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));
1268
1269  b0 = _mm_set_epi64x(m3, m7);
1270  b1 = _mm_set_epi64x(m11, m13);
1271  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1272  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1273  row4l = _mm_xor_si128(row4l, row1l);
1274  row4h = _mm_xor_si128(row4h, row1h);
1275  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1276  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1277  row3l = _mm_add_epi64(row3l, row4l);
1278  row3h = _mm_add_epi64(row3h, row4h);
1279  row2l = _mm_xor_si128(row2l, row3l);
1280  row2h = _mm_xor_si128(row2h, row3h);
1281  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1282  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1283
1284  b0 = _mm_set_epi64x(m1, m9);
1285  b1 = _mm_set_epi64x(m14, m12);
1286  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1287  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1288  row4l = _mm_xor_si128(row4l, row1l);
1289  row4h = _mm_xor_si128(row4h, row1h);
1290  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1291  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1292  row3l = _mm_add_epi64(row3l, row4l);
1293  row3h = _mm_add_epi64(row3h, row4h);
1294  row2l = _mm_xor_si128(row2l, row3l);
1295  row2h = _mm_xor_si128(row2h, row3h);
1296  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1297  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1298
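// Diagonalize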
1299  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
1300  row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
1301  row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
1302  row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
1303  row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));
1304  b0 = _mm_set_epi64x(m5, m2);
1305  b1 = _mm_set_epi64x(m15, m4);
1306  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1307  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1308  row4l = _mm_xor_si128(row4l, row1l);
1309  row4h = _mm_xor_si128(row4h, row1h);
1310  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1311  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1312  row3l = _mm_add_epi64(row3l, row4l);
1313  row3h = _mm_add_epi64(row3h, row4h);
1314  row2l = _mm_xor_si128(row2l, row3l);
1315  row2h = _mm_xor_si128(row2h, row3h);
1316  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1317  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1318
1319  b0 = _mm_set_epi64x(m10, m6);
1320  b1 = _mm_set_epi64x(m8, m0);
1321  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1322  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1323  row4l = _mm_xor_si128(row4l, row1l);
1324  row4h = _mm_xor_si128(row4h, row1h);
1325  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1326  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1327  row3l = _mm_add_epi64(row3l, row4l);
1328  row3h = _mm_add_epi64(row3h, row4h);
1329  row2l = _mm_xor_si128(row2l, row3l);
1330  row2h = _mm_xor_si128(row2h, row3h);
1331  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1332  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1333
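// Undiagonalize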
1334  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
1335  row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
1336  row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
1337  row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
1338  row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));
1339
1340  b0 = _mm_set_epi64x(m5, m9);
1341  b1 = _mm_set_epi64x(m10, m2);
1342  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1343  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1344  row4l = _mm_xor_si128(row4l, row1l);
1345  row4h = _mm_xor_si128(row4h, row1h);
1346  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1347  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1348  row3l = _mm_add_epi64(row3l, row4l);
1349  row3h = _mm_add_epi64(row3h, row4h);
1350  row2l = _mm_xor_si128(row2l, row3l);
1351  row2h = _mm_xor_si128(row2h, row3h);
1352  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1353  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1354
1355  b0 = _mm_set_epi64x(m7, m0);
1356  b1 = _mm_set_epi64x(m15, m4);
1357  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1358  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1359  row4l = _mm_xor_si128(row4l, row1l);
1360  row4h = _mm_xor_si128(row4h, row1h);
1361  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1362  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1363  row3l = _mm_add_epi64(row3l, row4l);
1364  row3h = _mm_add_epi64(row3h, row4h);
1365  row2l = _mm_xor_si128(row2l, row3l);
1366  row2h = _mm_xor_si128(row2h, row3h);
1367  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1368  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1369
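// Diagonalize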
1370  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
1371  row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
1372  row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
1373  row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
1374  row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));
1375  b0 = _mm_set_epi64x(m11, m14);
1376  b1 = _mm_set_epi64x(m3, m6);
1377  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1378  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1379  row4l = _mm_xor_si128(row4l, row1l);
1380  row4h = _mm_xor_si128(row4h, row1h);
1381  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1382  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1383  row3l = _mm_add_epi64(row3l, row4l);
1384  row3h = _mm_add_epi64(row3h, row4h);
1385  row2l = _mm_xor_si128(row2l, row3l);
1386  row2h = _mm_xor_si128(row2h, row3h);
1387  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1388  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1389
1391  b0 = _mm_set_epi64x(m12, m1);
1392  b1 = _mm_set_epi64x(m13, m8);
1393  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1394  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1395  row4l = _mm_xor_si128(row4l, row1l);
1396  row4h = _mm_xor_si128(row4h, row1h);
1397  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1398  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1399  row3l = _mm_add_epi64(row3l, row4l);
1400  row3h = _mm_add_epi64(row3h, row4h);
1401  row2l = _mm_xor_si128(row2l, row3l);
1402  row2h = _mm_xor_si128(row2h, row3h);
1403  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1404  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1405
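// Undiagonalize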
1406  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
1407  row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
1408  row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
1409  row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
1410  row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));
1411
1412  b0 = _mm_set_epi64x(m6, m2);
1413  b1 = _mm_set_epi64x(m8, m0);
1414  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1415  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1416  row4l = _mm_xor_si128(row4l, row1l);
1417  row4h = _mm_xor_si128(row4h, row1h);
1418  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1419  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1420  row3l = _mm_add_epi64(row3l, row4l);
1421  row3h = _mm_add_epi64(row3h, row4h);
1422  row2l = _mm_xor_si128(row2l, row3l);
1423  row2h = _mm_xor_si128(row2h, row3h);
1424  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1425  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1426
1427  b0 = _mm_set_epi64x(m10, m12);
1428  b1 = _mm_set_epi64x(m3, m11);
1429  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1430  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1431  row4l = _mm_xor_si128(row4l, row1l);
1432  row4h = _mm_xor_si128(row4h, row1h);
1433  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1434  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1435  row3l = _mm_add_epi64(row3l, row4l);
1436  row3h = _mm_add_epi64(row3h, row4h);
1437  row2l = _mm_xor_si128(row2l, row3l);
1438  row2h = _mm_xor_si128(row2h, row3h);
1439  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1440  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1441
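// Diagonalize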
1442  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
1443  row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
1444  row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
1445  row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
1446  row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));
1447  b0 = _mm_set_epi64x(m7, m4);
1448  b1 = _mm_set_epi64x(m1, m15);
1449  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1450  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1451  row4l = _mm_xor_si128(row4l, row1l);
1452  row4h = _mm_xor_si128(row4h, row1h);
1453  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1454  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1455  row3l = _mm_add_epi64(row3l, row4l);
1456  row3h = _mm_add_epi64(row3h, row4h);
1457  row2l = _mm_xor_si128(row2l, row3l);
1458  row2h = _mm_xor_si128(row2h, row3h);
1459  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1460  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1461
1462  b0 = _mm_set_epi64x(m5, m13);
1463  b1 = _mm_set_epi64x(m9, m14);
1464  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1465  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1466  row4l = _mm_xor_si128(row4l, row1l);
1467  row4h = _mm_xor_si128(row4h, row1h);
1468  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1469  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1470  row3l = _mm_add_epi64(row3l, row4l);
1471  row3h = _mm_add_epi64(row3h, row4h);
1472  row2l = _mm_xor_si128(row2l, row3l);
1473  row2h = _mm_xor_si128(row2h, row3h);
1474  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1475  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1476
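// Undiagonalize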
1477  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
1478  row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
1479  row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
1480  row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
1481  row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));
1482
1483  b0 = _mm_set_epi64x(m1, m12);
1484  b1 = _mm_set_epi64x(m4, m14);
1485  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1486  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1487  row4l = _mm_xor_si128(row4l, row1l);
1488  row4h = _mm_xor_si128(row4h, row1h);
1489  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1490  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1491  row3l = _mm_add_epi64(row3l, row4l);
1492  row3h = _mm_add_epi64(row3h, row4h);
1493  row2l = _mm_xor_si128(row2l, row3l);
1494  row2h = _mm_xor_si128(row2h, row3h);
1495  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1496  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1497
1498  b0 = _mm_set_epi64x(m15, m5);
1499  b1 = _mm_set_epi64x(m10, m13);
1500  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1501  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1502  row4l = _mm_xor_si128(row4l, row1l);
1503  row4h = _mm_xor_si128(row4h, row1h);
1504  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1505  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1506  row3l = _mm_add_epi64(row3l, row4l);
1507  row3h = _mm_add_epi64(row3h, row4h);
1508  row2l = _mm_xor_si128(row2l, row3l);
1509  row2h = _mm_xor_si128(row2h, row3h);
1510  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1511  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1512
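// Diagonalize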
1513  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
1514  row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
1515  row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
1516  row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
1517  row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));
1518  b0 = _mm_set_epi64x(m6, m0);
1519  b1 = _mm_set_epi64x(m8, m9);
1520  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1521  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1522  row4l = _mm_xor_si128(row4l, row1l);
1523  row4h = _mm_xor_si128(row4h, row1h);
1524  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1525  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1526  row3l = _mm_add_epi64(row3l, row4l);
1527  row3h = _mm_add_epi64(row3h, row4h);
1528  row2l = _mm_xor_si128(row2l, row3l);
1529  row2h = _mm_xor_si128(row2h, row3h);
1530  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1531  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1532
1533  b0 = _mm_set_epi64x(m3, m7);
1534  b1 = _mm_set_epi64x(m11, m2);
1535  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1536  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1537  row4l = _mm_xor_si128(row4l, row1l);
1538  row4h = _mm_xor_si128(row4h, row1h);
1539  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1540  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1541  row3l = _mm_add_epi64(row3l, row4l);
1542  row3h = _mm_add_epi64(row3h, row4h);
1543  row2l = _mm_xor_si128(row2l, row3l);
1544  row2h = _mm_xor_si128(row2h, row3h);
1545  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1546  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1547
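// Undiagonalize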
1548  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
1549  row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
1550  row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
1551  row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
1552  row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));
1553
1554  b0 = _mm_set_epi64x(m7, m13);
1555  b1 = _mm_set_epi64x(m3, m12);
1556  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1557  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1558  row4l = _mm_xor_si128(row4l, row1l);
1559  row4h = _mm_xor_si128(row4h, row1h);
1560  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1561  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1562  row3l = _mm_add_epi64(row3l, row4l);
1563  row3h = _mm_add_epi64(row3h, row4h);
1564  row2l = _mm_xor_si128(row2l, row3l);
1565  row2h = _mm_xor_si128(row2h, row3h);
1566  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1567  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1568
1569  b0 = _mm_set_epi64x(m14, m11);
1570  b1 = _mm_set_epi64x(m9, m1);
1571  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1572  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1573  row4l = _mm_xor_si128(row4l, row1l);
1574  row4h = _mm_xor_si128(row4h, row1h);
1575  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1576  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1577  row3l = _mm_add_epi64(row3l, row4l);
1578  row3h = _mm_add_epi64(row3h, row4h);
1579  row2l = _mm_xor_si128(row2l, row3l);
1580  row2h = _mm_xor_si128(row2h, row3h);
1581  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1582  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1583
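// Diagonalize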
1584  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
1585  row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
1586  row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
1587  row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
1588  row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));
1589  b0 = _mm_set_epi64x(m15, m5);
1590  b1 = _mm_set_epi64x(m2, m8);
1591  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1592  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1593  row4l = _mm_xor_si128(row4l, row1l);
1594  row4h = _mm_xor_si128(row4h, row1h);
1595  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1596  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1597  row3l = _mm_add_epi64(row3l, row4l);
1598  row3h = _mm_add_epi64(row3h, row4h);
1599  row2l = _mm_xor_si128(row2l, row3l);
1600  row2h = _mm_xor_si128(row2h, row3h);
1601  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1602  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1603
1604  b0 = _mm_set_epi64x(m4, m0);
1605  b1 = _mm_set_epi64x(m10, m6);
1606  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1607  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1608  row4l = _mm_xor_si128(row4l, row1l);
1609  row4h = _mm_xor_si128(row4h, row1h);
1610  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1611  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1612  row3l = _mm_add_epi64(row3l, row4l);
1613  row3h = _mm_add_epi64(row3h, row4h);
1614  row2l = _mm_xor_si128(row2l, row3l);
1615  row2h = _mm_xor_si128(row2h, row3h);
1616  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1617  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1618
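// Undiagonalize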
1619  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
1620  row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
1621  row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
1622  row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
1623  row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));
1624
1625  b0 = _mm_set_epi64x(m14, m6);
1626  b1 = _mm_set_epi64x(m0, m11);
1627  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1628  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1629  row4l = _mm_xor_si128(row4l, row1l);
1630  row4h = _mm_xor_si128(row4h, row1h);
1631  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1632  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1633  row3l = _mm_add_epi64(row3l, row4l);
1634  row3h = _mm_add_epi64(row3h, row4h);
1635  row2l = _mm_xor_si128(row2l, row3l);
1636  row2h = _mm_xor_si128(row2h, row3h);
1637  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1638  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1639
1640  b0 = _mm_set_epi64x(m9, m15);
1641  b1 = _mm_set_epi64x(m8, m3);
1642  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1643  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1644  row4l = _mm_xor_si128(row4l, row1l);
1645  row4h = _mm_xor_si128(row4h, row1h);
1646  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1647  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1648  row3l = _mm_add_epi64(row3l, row4l);
1649  row3h = _mm_add_epi64(row3h, row4h);
1650  row2l = _mm_xor_si128(row2l, row3l);
1651  row2h = _mm_xor_si128(row2h, row3h);
1652  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1653  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1654
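// Diagonalize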
1655  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
1656  row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
1657  row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
1658  row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
1659  row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));
1660  b0 = _mm_set_epi64x(m13, m12);
1661  b1 = _mm_set_epi64x(m10, m1);
1662  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1663  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1664  row4l = _mm_xor_si128(row4l, row1l);
1665  row4h = _mm_xor_si128(row4h, row1h);
1666  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1667  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1668  row3l = _mm_add_epi64(row3l, row4l);
1669  row3h = _mm_add_epi64(row3h, row4h);
1670  row2l = _mm_xor_si128(row2l, row3l);
1671  row2h = _mm_xor_si128(row2h, row3h);
1672  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1673  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1674
1675  b0 = _mm_set_epi64x(m7, m2);
1676  b1 = _mm_set_epi64x(m5, m4);
1677  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1678  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1679  row4l = _mm_xor_si128(row4l, row1l);
1680  row4h = _mm_xor_si128(row4h, row1h);
1681  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1682  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1683  row3l = _mm_add_epi64(row3l, row4l);
1684  row3h = _mm_add_epi64(row3h, row4h);
1685  row2l = _mm_xor_si128(row2l, row3l);
1686  row2h = _mm_xor_si128(row2h, row3h);
1687  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1688  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1689
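// Undiagonalize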
1690  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
1691  row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
1692  row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
1693  row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
1694  row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));
1695
1696  b0 = _mm_set_epi64x(m8, m10);
1697  b1 = _mm_set_epi64x(m1, m7);
1698  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1699  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1700  row4l = _mm_xor_si128(row4l, row1l);
1701  row4h = _mm_xor_si128(row4h, row1h);
1702  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1703  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1704  row3l = _mm_add_epi64(row3l, row4l);
1705  row3h = _mm_add_epi64(row3h, row4h);
1706  row2l = _mm_xor_si128(row2l, row3l);
1707  row2h = _mm_xor_si128(row2h, row3h);
1708  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1709  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1710
1711  b0 = _mm_set_epi64x(m4, m2);
1712  b1 = _mm_set_epi64x(m5, m6);
1713  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1714  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1715  row4l = _mm_xor_si128(row4l, row1l);
1716  row4h = _mm_xor_si128(row4h, row1h);
1717  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1718  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1719  row3l = _mm_add_epi64(row3l, row4l);
1720  row3h = _mm_add_epi64(row3h, row4h);
1721  row2l = _mm_xor_si128(row2l, row3l);
1722  row2h = _mm_xor_si128(row2h, row3h);
1723  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1724  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1725
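// Diagonalize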
1726  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
1727  row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
1728  row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
1729  row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
1730  row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));
1731  b0 = _mm_set_epi64x(m9, m15);
1732  b1 = _mm_set_epi64x(m13, m3);
1733  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1734  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1735  row4l = _mm_xor_si128(row4l, row1l);
1736  row4h = _mm_xor_si128(row4h, row1h);
1737  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1738  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1739  row3l = _mm_add_epi64(row3l, row4l);
1740  row3h = _mm_add_epi64(row3h, row4h);
1741  row2l = _mm_xor_si128(row2l, row3l);
1742  row2h = _mm_xor_si128(row2h, row3h);
1743  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1744  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1745
1746  b0 = _mm_set_epi64x(m14, m11);
1747  b1 = _mm_set_epi64x(m0, m12);
1748  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1749  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1750  row4l = _mm_xor_si128(row4l, row1l);
1751  row4h = _mm_xor_si128(row4h, row1h);
1752  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1753  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1754  row3l = _mm_add_epi64(row3l, row4l);
1755  row3h = _mm_add_epi64(row3h, row4h);
1756  row2l = _mm_xor_si128(row2l, row3l);
1757  row2h = _mm_xor_si128(row2h, row3h);
1758  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1759  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1760
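// Undiagonalize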
1761  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
1762  row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
1763  row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
1764  row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
1765  row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));
1766
1767  b0 = _mm_set_epi64x(m2, m0);
1768  b1 = _mm_set_epi64x(m6, m4);
1769  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1770  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1771  row4l = _mm_xor_si128(row4l, row1l);
1772  row4h = _mm_xor_si128(row4h, row1h);
1773  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1774  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1775  row3l = _mm_add_epi64(row3l, row4l);
1776  row3h = _mm_add_epi64(row3h, row4h);
1777  row2l = _mm_xor_si128(row2l, row3l);
1778  row2h = _mm_xor_si128(row2h, row3h);
1779  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1780  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1781
1782  b0 = _mm_set_epi64x(m3, m1);
1783  b1 = _mm_set_epi64x(m7, m5);
1784  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1785  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1786  row4l = _mm_xor_si128(row4l, row1l);
1787  row4h = _mm_xor_si128(row4h, row1h);
1788  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1789  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1790  row3l = _mm_add_epi64(row3l, row4l);
1791  row3h = _mm_add_epi64(row3h, row4h);
1792  row2l = _mm_xor_si128(row2l, row3l);
1793  row2h = _mm_xor_si128(row2h, row3h);
1794  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1795  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1796
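// Diagonalize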
1797  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
1798  row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
1799  row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
1800  row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
1801  row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));
1802
1803  b0 = _mm_set_epi64x(m10, m8);
1804  b1 = _mm_set_epi64x(m14, m12);
1805  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1806  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1807  row4l = _mm_xor_si128(row4l, row1l);
1808  row4h = _mm_xor_si128(row4h, row1h);
1809  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1810  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1811  row3l = _mm_add_epi64(row3l, row4l);
1812  row3h = _mm_add_epi64(row3h, row4h);
1813  row2l = _mm_xor_si128(row2l, row3l);
1814  row2h = _mm_xor_si128(row2h, row3h);
1815  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1816  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1817
1818  b0 = _mm_set_epi64x(m11, m9);
1819  b1 = _mm_set_epi64x(m15, m13);
1820  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1821  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1822  row4l = _mm_xor_si128(row4l, row1l);
1823  row4h = _mm_xor_si128(row4h, row1h);
1824  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1825  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1826  row3l = _mm_add_epi64(row3l, row4l);
1827  row3h = _mm_add_epi64(row3h, row4h);
1828  row2l = _mm_xor_si128(row2l, row3l);
1829  row2h = _mm_xor_si128(row2h, row3h);
1830  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1831  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1832
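// Undiagonalize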
1833  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
1834  row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
1835  row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
1836  row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
1837  row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));
1838
1839  b0 = _mm_set_epi64x(m4, m14);
1840  b1 = _mm_set_epi64x(m13, m9);
1841  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1842  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1843  row4l = _mm_xor_si128(row4l, row1l);
1844  row4h = _mm_xor_si128(row4h, row1h);
1845  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1846  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1847  row3l = _mm_add_epi64(row3l, row4l);
1848  row3h = _mm_add_epi64(row3h, row4h);
1849  row2l = _mm_xor_si128(row2l, row3l);
1850  row2h = _mm_xor_si128(row2h, row3h);
1851  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1852  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1853
1854  b0 = _mm_set_epi64x(m8, m10);
1855  b1 = _mm_set_epi64x(m6, m15);
1856  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1857  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1858  row4l = _mm_xor_si128(row4l, row1l);
1859  row4h = _mm_xor_si128(row4h, row1h);
1860  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1861  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1862  row3l = _mm_add_epi64(row3l, row4l);
1863  row3h = _mm_add_epi64(row3h, row4h);
1864  row2l = _mm_xor_si128(row2l, row3l);
1865  row2h = _mm_xor_si128(row2h, row3h);
1866  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1867  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1868
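// Diagonalize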
1869  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
1870  row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
1871  row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
1872  row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
1873  row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));
1874  b0 = _mm_set_epi64x(m0, m1);
1875  b1 = _mm_set_epi64x(m5, m11);
1876  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1877  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1878  row4l = _mm_xor_si128(row4l, row1l);
1879  row4h = _mm_xor_si128(row4h, row1h);
1880  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1881  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1882  row3l = _mm_add_epi64(row3l, row4l);
1883  row3h = _mm_add_epi64(row3h, row4h);
1884  row2l = _mm_xor_si128(row2l, row3l);
1885  row2h = _mm_xor_si128(row2h, row3h);
1886  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1887  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1888
1889  b0 = _mm_set_epi64x(m2, m12);
1890  b1 = _mm_set_epi64x(m3, m7);
1891  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1892  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1893  row4l = _mm_xor_si128(row4l, row1l);
1894  row4h = _mm_xor_si128(row4h, row1h);
1895  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1896  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1897  row3l = _mm_add_epi64(row3l, row4l);
1898  row3h = _mm_add_epi64(row3h, row4h);
1899  row2l = _mm_xor_si128(row2l, row3l);
1900  row2h = _mm_xor_si128(row2h, row3h);
1901  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1902  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1903
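// Undiagonalize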
1904  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
1905  row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
1906  row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
1907  row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
1908  row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));
1909
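// Feed-forward: XOR the working rows back into the chaining value state.h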
1910  row1l = _mm_xor_si128(row3l, row1l);
1911  row1h = _mm_xor_si128(row3h, row1h);
1912  _mm_storeu_si128((__m128i *)(void*)(&state.h[0]), _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&state.h[0])), row1l));
1913  _mm_storeu_si128((__m128i *)(void*)(&state.h[2]), _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&state.h[2])), row1h));
1914
1915  row2l = _mm_xor_si128(row4l, row2l);
1916  row2h = _mm_xor_si128(row4h, row2h);
1917  _mm_storeu_si128((__m128i *)(void*)(&state.h[4]), _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&state.h[4])), row2l));
1918  _mm_storeu_si128((__m128i *)(void*)(&state.h[6]), _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&state.h[6])), row2h));
1919}
1920# endif // (__SUNPRO_CC != 0x5120)
1921#endif  // CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
1922
1923#if CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE
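// BLAKE2s compression function. The sixteen 32-bit state words fit in four
//   XMM registers; rotations by 8 and 16 are byte shuffles (r8/r16), while
//   rotations by 7 and 12 use paired shifts. The _mm_shuffle_epi32 triples
//   after every two G halves alternate between diagonalizing and
//   undiagonalizing rows 2-4.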
1924static void BLAKE2_SSE4_Compress32(const byte* input, BLAKE2_State<word32, false>& state)
1925{
1926  __m128i row1, row2, row3, row4;
1927  __m128i buf1, buf2, buf3, buf4;
1928
1929  __m128i t0, t1, t2;
1930  __m128i ff0, ff1;
1931
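// Byte-shuffle masks rotating each 32-bit lane right by 8 (r8) and 16 (r16)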
1932  const __m128i r8 = _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1);
1933  const __m128i r16 = _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
1934
1935  const __m128i m0 = _mm_loadu_si128((const __m128i*)(const void*)(input + 00));
1936  const __m128i m1 = _mm_loadu_si128((const __m128i*)(const void*)(input + 16));
1937  const __m128i m2 = _mm_loadu_si128((const __m128i*)(const void*)(input + 32));
1938  const __m128i m3 = _mm_loadu_si128((const __m128i*)(const void*)(input + 48));
1939
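// Rows 1 and 2 load the chaining value h; rows 3 and 4 load the IV, with
//   row 4 XORed against the counter and finalization words at state.t[0]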
1940  row1 = ff0 = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[0]));
1941  row2 = ff1 = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[4]));
1942  row3 = _mm_setr_epi32(BLAKE2S_IV(0), BLAKE2S_IV(1), BLAKE2S_IV(2), BLAKE2S_IV(3));
1943  row4 = _mm_xor_si128(_mm_setr_epi32(BLAKE2S_IV(4), BLAKE2S_IV(5), BLAKE2S_IV(6), BLAKE2S_IV(7)), _mm_loadu_si128((const __m128i*)(const void*)(&state.t[0])));
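// Round 1: sigma is the identity permutation, so the message words are taken
//   in order (even words feed the first G half, odd words the second)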
1944  buf1 = _mm_castps_si128((_mm_shuffle_ps(_mm_castsi128_ps((m0)), _mm_castsi128_ps((m1)), _MM_SHUFFLE(2,0,2,0))));
1945
1946  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
1947  row4 = _mm_xor_si128(row4, row1);
1948  row4 = _mm_shuffle_epi8(row4,r16);
1949  row3 = _mm_add_epi32(row3, row4);
1950  row2 = _mm_xor_si128(row2, row3);
1951  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
1952
1953  buf2 = _mm_castps_si128((_mm_shuffle_ps(_mm_castsi128_ps((m0)), _mm_castsi128_ps((m1)), _MM_SHUFFLE(3,1,3,1))));
1954
1955  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
1956  row4 = _mm_xor_si128(row4, row1);
1957  row4 = _mm_shuffle_epi8(row4,r8);
1958  row3 = _mm_add_epi32(row3, row4);
1959  row2 = _mm_xor_si128(row2, row3);
1960  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
1961
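// Diagonalize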
1962  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
1963  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
1964  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
1965
1966  buf3 = _mm_castps_si128((_mm_shuffle_ps(_mm_castsi128_ps((m2)), _mm_castsi128_ps((m3)), _MM_SHUFFLE(2,0,2,0))));
1967
1968  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
1969  row4 = _mm_xor_si128(row4, row1);
1970  row4 = _mm_shuffle_epi8(row4,r16);
1971  row3 = _mm_add_epi32(row3, row4);
1972  row2 = _mm_xor_si128(row2, row3);
1973  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
1974
1975  buf4 = _mm_castps_si128((_mm_shuffle_ps(_mm_castsi128_ps((m2)), _mm_castsi128_ps((m3)), _MM_SHUFFLE(3,1,3,1))));
1976
1977  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
1978  row4 = _mm_xor_si128(row4, row1);
1979  row4 = _mm_shuffle_epi8(row4,r8);
1980  row3 = _mm_add_epi32(row3, row4);
1981  row2 = _mm_xor_si128(row2, row3);
1982  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
1983
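// Undiagonalize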
1984  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
1985  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
1986  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
1987
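// Rounds 2-10: each round's message schedule (buf1-buf4) is assembled with
//   SSE4.1 blends, shuffles and unpacks per the BLAKE2s sigma permutations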
1988  t0 = _mm_blend_epi16(m1, m2, 0x0C);
1989  t1 = _mm_slli_si128(m3, 4);
1990  t2 = _mm_blend_epi16(t0, t1, 0xF0);
1991  buf1 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,1,0,3));
1992
1993  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
1994  row4 = _mm_xor_si128(row4, row1);
1995  row4 = _mm_shuffle_epi8(row4,r16);
1996  row3 = _mm_add_epi32(row3, row4);
1997  row2 = _mm_xor_si128(row2, row3);
1998  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
1999
2000  t0 = _mm_shuffle_epi32(m2,_MM_SHUFFLE(0,0,2,0));
2001  t1 = _mm_blend_epi16(m1,m3,0xC0);
2002  t2 = _mm_blend_epi16(t0, t1, 0xF0);
2003  buf2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1));
2004
2005  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
2006  row4 = _mm_xor_si128(row4, row1);
2007  row4 = _mm_shuffle_epi8(row4,r8);
2008  row3 = _mm_add_epi32(row3, row4);
2009  row2 = _mm_xor_si128(row2, row3);
2010  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2011
2012  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
2013  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2014  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
2015
2016  t0 = _mm_slli_si128(m1, 4);
2017  t1 = _mm_blend_epi16(m2, t0, 0x30);
2018  t2 = _mm_blend_epi16(m0, t1, 0xF0);
2019  buf3 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1));
2020
2021  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
2022  row4 = _mm_xor_si128(row4, row1);
2023  row4 = _mm_shuffle_epi8(row4,r16);
2024  row3 = _mm_add_epi32(row3, row4);
2025  row2 = _mm_xor_si128(row2, row3);
2026  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2027
2028  t0 = _mm_unpackhi_epi32(m0,m1);
2029  t1 = _mm_slli_si128(m3, 4);
2030  t2 = _mm_blend_epi16(t0, t1, 0x0C);
2031  buf4 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1));
2032
2033  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
2034  row4 = _mm_xor_si128(row4, row1);
2035  row4 = _mm_shuffle_epi8(row4,r8);
2036  row3 = _mm_add_epi32(row3, row4);
2037  row2 = _mm_xor_si128(row2, row3);
2038  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2039
2040  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
2041  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2042  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
2043
2044  t0 = _mm_unpackhi_epi32(m2,m3);
2045  t1 = _mm_blend_epi16(m3,m1,0x0C);
2046  t2 = _mm_blend_epi16(t0, t1, 0x0F);
2047  buf1 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,1,0,2));
2048
2049  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
2050  row4 = _mm_xor_si128(row4, row1);
2051  row4 = _mm_shuffle_epi8(row4,r16);
2052  row3 = _mm_add_epi32(row3, row4);
2053  row2 = _mm_xor_si128(row2, row3);
2054  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2055
2056  t0 = _mm_unpacklo_epi32(m2,m0);
2057  t1 = _mm_blend_epi16(t0, m0, 0xF0);
2058  t2 = _mm_slli_si128(m3, 8);
2059  buf2 = _mm_blend_epi16(t1, t2, 0xC0);
2060
2061  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
2062  row4 = _mm_xor_si128(row4, row1);
2063  row4 = _mm_shuffle_epi8(row4,r8);
2064  row3 = _mm_add_epi32(row3, row4);
2065  row2 = _mm_xor_si128(row2, row3);
2066  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2067
2068  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
2069  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2070  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
2071
2072  t0 = _mm_blend_epi16(m0, m2, 0x3C);
2073  t1 = _mm_srli_si128(m1, 12);
2074  t2 = _mm_blend_epi16(t0,t1,0x03);
2075  buf3 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,0,3,2));
2076
2077  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
2078  row4 = _mm_xor_si128(row4, row1);
2079  row4 = _mm_shuffle_epi8(row4,r16);
2080  row3 = _mm_add_epi32(row3, row4);
2081  row2 = _mm_xor_si128(row2, row3);
2082  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2083
2084  t0 = _mm_slli_si128(m3, 4);
2085  t1 = _mm_blend_epi16(m0, m1, 0x33);
2086  t2 = _mm_blend_epi16(t1, t0, 0xC0);
2087  buf4 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(0,1,2,3));
2088
2089  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
2090  row4 = _mm_xor_si128(row4, row1);
2091  row4 = _mm_shuffle_epi8(row4,r8);
2092  row3 = _mm_add_epi32(row3, row4);
2093  row2 = _mm_xor_si128(row2, row3);
2094  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2095
2096  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
2097  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2098  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
2099
2100  t0 = _mm_unpackhi_epi32(m0,m1);
2101  t1 = _mm_unpackhi_epi32(t0, m2);
2102  t2 = _mm_blend_epi16(t1, m3, 0x0C);
2103  buf1 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,1,0,2));
2104
2105  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
2106  row4 = _mm_xor_si128(row4, row1);
2107  row4 = _mm_shuffle_epi8(row4,r16);
2108  row3 = _mm_add_epi32(row3, row4);
2109  row2 = _mm_xor_si128(row2, row3);
2110  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2111
2112  t0 = _mm_slli_si128(m2, 8);
2113  t1 = _mm_blend_epi16(m3,m0,0x0C);
2114  t2 = _mm_blend_epi16(t1, t0, 0xC0);
2115  buf2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,0,1,3));
2116
2117  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
2118  row4 = _mm_xor_si128(row4, row1);
2119  row4 = _mm_shuffle_epi8(row4,r8);
2120  row3 = _mm_add_epi32(row3, row4);
2121  row2 = _mm_xor_si128(row2, row3);
2122  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2123
2124  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
2125  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2126  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
2127
2128  t0 = _mm_blend_epi16(m0,m1,0x0F);
2129  t1 = _mm_blend_epi16(t0, m3, 0xC0);
2130  buf3 = _mm_shuffle_epi32(t1, _MM_SHUFFLE(3,0,1,2));
2131
2132  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
2133  row4 = _mm_xor_si128(row4, row1);
2134  row4 = _mm_shuffle_epi8(row4,r16);
2135  row3 = _mm_add_epi32(row3, row4);
2136  row2 = _mm_xor_si128(row2, row3);
2137  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2138
2139  t0 = _mm_unpacklo_epi32(m0,m2);
2140  t1 = _mm_unpackhi_epi32(m1,m2);
2141  buf4 = _mm_unpacklo_epi64(t1,t0);
2142
2143  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
2144  row4 = _mm_xor_si128(row4, row1);
2145  row4 = _mm_shuffle_epi8(row4,r8);
2146  row3 = _mm_add_epi32(row3, row4);
2147  row2 = _mm_xor_si128(row2, row3);
2148  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2149
2150  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
2151  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2152  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
2153
2154  t0 = _mm_unpacklo_epi64(m1,m2);
2155  t1 = _mm_unpackhi_epi64(m0,m2);
2156  t2 = _mm_blend_epi16(t0,t1,0x33);
2157  buf1 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,0,1,3));
2158
2159  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
2160  row4 = _mm_xor_si128(row4, row1);
2161  row4 = _mm_shuffle_epi8(row4,r16);
2162  row3 = _mm_add_epi32(row3, row4);
2163  row2 = _mm_xor_si128(row2, row3);
2164  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2165
2166  t0 = _mm_unpackhi_epi64(m1,m3);
2167  t1 = _mm_unpacklo_epi64(m0,m1);
2168  buf2 = _mm_blend_epi16(t0,t1,0x33);
2169
2170  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
2171  row4 = _mm_xor_si128(row4, row1);
2172  row4 = _mm_shuffle_epi8(row4,r8);
2173  row3 = _mm_add_epi32(row3, row4);
2174  row2 = _mm_xor_si128(row2, row3);
2175  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2176
2177  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
2178  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2179  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
2180
2181  t0 = _mm_unpackhi_epi64(m3,m1);
2182  t1 = _mm_unpackhi_epi64(m2,m0);
2183  buf3 = _mm_blend_epi16(t1,t0,0x33);
2184
2185  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
2186  row4 = _mm_xor_si128(row4, row1);
2187  row4 = _mm_shuffle_epi8(row4,r16);
2188  row3 = _mm_add_epi32(row3, row4);
2189  row2 = _mm_xor_si128(row2, row3);
2190  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2191
2192  t0 = _mm_blend_epi16(m0,m2,0x03);
2193  t1 = _mm_slli_si128(t0, 8);
2194  t2 = _mm_blend_epi16(t1,m3,0x0F);
2195  buf4 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,2,0,3));
2196
2197  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
2198  row4 = _mm_xor_si128(row4, row1);
2199  row4 = _mm_shuffle_epi8(row4,r8);
2200  row3 = _mm_add_epi32(row3, row4);
2201  row2 = _mm_xor_si128(row2, row3);
2202  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2203
2204  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
2205  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2206  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
2207
2208  t0 = _mm_unpackhi_epi32(m0,m1);
2209  t1 = _mm_unpacklo_epi32(m0,m2);
2210  buf1 = _mm_unpacklo_epi64(t0,t1);
2211
2212  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
2213  row4 = _mm_xor_si128(row4, row1);
2214  row4 = _mm_shuffle_epi8(row4,r16);
2215  row3 = _mm_add_epi32(row3, row4);
2216  row2 = _mm_xor_si128(row2, row3);
2217  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2218
2219  t0 = _mm_srli_si128(m2, 4);
2220  t1 = _mm_blend_epi16(m0,m3,0x03);
2221  buf2 = _mm_blend_epi16(t1,t0,0x3C);
2222
2223  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
2224  row4 = _mm_xor_si128(row4, row1);
2225  row4 = _mm_shuffle_epi8(row4,r8);
2226  row3 = _mm_add_epi32(row3, row4);
2227  row2 = _mm_xor_si128(row2, row3);
2228  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2229
2230  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
2231  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2232  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
2233
2234  t0 = _mm_blend_epi16(m1,m0,0x0C);
2235  t1 = _mm_srli_si128(m3, 4);
2236  t2 = _mm_blend_epi16(t0,t1,0x30);
2237  buf3 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,2,3,0));
2238
2239  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
2240  row4 = _mm_xor_si128(row4, row1);
2241  row4 = _mm_shuffle_epi8(row4,r16);
2242  row3 = _mm_add_epi32(row3, row4);
2243  row2 = _mm_xor_si128(row2, row3);
2244  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2245
2246  t0 = _mm_unpacklo_epi64(m1,m2);
2247  t1 = _mm_shuffle_epi32(m3, _MM_SHUFFLE(0,2,0,1));
2248  buf4 = _mm_blend_epi16(t0,t1,0x33);
2249
2250  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
2251  row4 = _mm_xor_si128(row4, row1);
2252  row4 = _mm_shuffle_epi8(row4,r8);
2253  row3 = _mm_add_epi32(row3, row4);
2254  row2 = _mm_xor_si128(row2, row3);
2255  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2256
2257  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
2258  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2259  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
2260
2261  t0 = _mm_slli_si128(m1, 12);
2262  t1 = _mm_blend_epi16(m0,m3,0x33);
2263  buf1 = _mm_blend_epi16(t1,t0,0xC0);
2264
2265  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
2266  row4 = _mm_xor_si128(row4, row1);
2267  row4 = _mm_shuffle_epi8(row4,r16);
2268  row3 = _mm_add_epi32(row3, row4);
2269  row2 = _mm_xor_si128(row2, row3);
2270  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2271
2272  t0 = _mm_blend_epi16(m3,m2,0x30);
2273  t1 = _mm_srli_si128(m1, 4);
2274  t2 = _mm_blend_epi16(t0,t1,0x03);
2275  buf2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,1,3,0));
2276
2277  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
2278  row4 = _mm_xor_si128(row4, row1);
2279  row4 = _mm_shuffle_epi8(row4,r8);
2280  row3 = _mm_add_epi32(row3, row4);
2281  row2 = _mm_xor_si128(row2, row3);
2282  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2283
2284  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
2285  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2286  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
2287
2288  t0 = _mm_unpacklo_epi64(m0,m2);
2289  t1 = _mm_srli_si128(m1, 4);
2290  buf3 = _mm_shuffle_epi32(_mm_blend_epi16(t0,t1,0x0C), _MM_SHUFFLE(2,3,1,0));
2291
2292  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
2293  row4 = _mm_xor_si128(row4, row1);
2294  row4 = _mm_shuffle_epi8(row4,r16);
2295  row3 = _mm_add_epi32(row3, row4);
2296  row2 = _mm_xor_si128(row2, row3);
2297  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2298
2299  t0 = _mm_unpackhi_epi32(m1,m2);
2300  t1 = _mm_unpackhi_epi64(m0,t0);
2301  buf4 = _mm_shuffle_epi32(t1, _MM_SHUFFLE(3,0,1,2));
2302
2303  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
2304  row4 = _mm_xor_si128(row4, row1);
2305  row4 = _mm_shuffle_epi8(row4,r8);
2306  row3 = _mm_add_epi32(row3, row4);
2307  row2 = _mm_xor_si128(row2, row3);
2308  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2309
2310  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
2311  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2312  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
2313
2314  t0 = _mm_unpackhi_epi32(m0,m1);
2315  t1 = _mm_blend_epi16(t0,m3,0x0F);
2316  buf1 = _mm_shuffle_epi32(t1,_MM_SHUFFLE(2,0,3,1));
2317
2318  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
2319  row4 = _mm_xor_si128(row4, row1);
2320  row4 = _mm_shuffle_epi8(row4,r16);
2321  row3 = _mm_add_epi32(row3, row4);
2322  row2 = _mm_xor_si128(row2, row3);
2323  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2324
2325  t0 = _mm_blend_epi16(m2,m3,0x30);
2326  t1 = _mm_srli_si128(m0,4);
2327  t2 = _mm_blend_epi16(t0,t1,0x03);
2328  buf2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,0,2,3));
2329
2330  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
2331  row4 = _mm_xor_si128(row4, row1);
2332  row4 = _mm_shuffle_epi8(row4,r8);
2333  row3 = _mm_add_epi32(row3, row4);
2334  row2 = _mm_xor_si128(row2, row3);
2335  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2336
2337  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
2338  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2339  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
2340
2341  t0 = _mm_unpackhi_epi64(m0,m3);
2342  t1 = _mm_unpacklo_epi64(m1,m2);
2343  t2 = _mm_blend_epi16(t0,t1,0x3C);
2344  buf3 = _mm_shuffle_epi32(t2,_MM_SHUFFLE(0,2,3,1));
2345
2346  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
2347  row4 = _mm_xor_si128(row4, row1);
2348  row4 = _mm_shuffle_epi8(row4,r16);
2349  row3 = _mm_add_epi32(row3, row4);
2350  row2 = _mm_xor_si128(row2, row3);
2351  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2352
2353  t0 = _mm_unpacklo_epi32(m0,m1);
2354  t1 = _mm_unpackhi_epi32(m1,m2);
2355  buf4 = _mm_unpacklo_epi64(t0,t1);
2356
2357  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
2358  row4 = _mm_xor_si128(row4, row1);
2359  row4 = _mm_shuffle_epi8(row4,r8);
2360  row3 = _mm_add_epi32(row3, row4);
2361  row2 = _mm_xor_si128(row2, row3);
2362  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2363
2364  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
2365  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2366  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
2367
2368  t0 = _mm_unpackhi_epi32(m1,m3);
2369  t1 = _mm_unpacklo_epi64(t0,m0);
2370  t2 = _mm_blend_epi16(t1,m2,0xC0);
2371  buf1 = _mm_shufflehi_epi16(t2,_MM_SHUFFLE(1,0,3,2));
2372
2373  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
2374  row4 = _mm_xor_si128(row4, row1);
2375  row4 = _mm_shuffle_epi8(row4,r16);
2376  row3 = _mm_add_epi32(row3, row4);
2377  row2 = _mm_xor_si128(row2, row3);
2378  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2379
2380  t0 = _mm_unpackhi_epi32(m0,m3);
2381  t1 = _mm_blend_epi16(m2,t0,0xF0);
2382  buf2 = _mm_shuffle_epi32(t1,_MM_SHUFFLE(0,2,1,3));
2383
2384  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
2385  row4 = _mm_xor_si128(row4, row1);
2386  row4 = _mm_shuffle_epi8(row4,r8);
2387  row3 = _mm_add_epi32(row3, row4);
2388  row2 = _mm_xor_si128(row2, row3);
2389  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2390
2391  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
2392  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2393  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
2394
2395  t0 = _mm_blend_epi16(m2,m0,0x0C);
2396  t1 = _mm_slli_si128(t0,4);
2397  buf3 = _mm_blend_epi16(t1,m3,0x0F);
2398
2399  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
2400  row4 = _mm_xor_si128(row4, row1);
2401  row4 = _mm_shuffle_epi8(row4,r16);
2402  row3 = _mm_add_epi32(row3, row4);
2403  row2 = _mm_xor_si128(row2, row3);
2404  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2405
2406  t0 = _mm_blend_epi16(m1,m0,0x30);
2407  buf4 = _mm_shuffle_epi32(t0,_MM_SHUFFLE(1,0,3,2));
2408
2409  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
2410  row4 = _mm_xor_si128(row4, row1);
2411  row4 = _mm_shuffle_epi8(row4,r8);
2412  row3 = _mm_add_epi32(row3, row4);
2413  row2 = _mm_xor_si128(row2, row3);
2414  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2415
2416  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
2417  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2418  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
2419
2420  t0 = _mm_blend_epi16(m0,m2,0x03);
2421  t1 = _mm_blend_epi16(m1,m2,0x30);
2422  t2 = _mm_blend_epi16(t1,t0,0x0F);
2423  buf1 = _mm_shuffle_epi32(t2,_MM_SHUFFLE(1,3,0,2));
2424
2425  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
2426  row4 = _mm_xor_si128(row4, row1);
2427  row4 = _mm_shuffle_epi8(row4,r16);
2428  row3 = _mm_add_epi32(row3, row4);
2429  row2 = _mm_xor_si128(row2, row3);
2430  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2431
2432  t0 = _mm_slli_si128(m0,4);
2433  t1 = _mm_blend_epi16(m1,t0,0xC0);
2434  buf2 = _mm_shuffle_epi32(t1,_MM_SHUFFLE(1,2,0,3));
2435
2436  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
2437  row4 = _mm_xor_si128(row4, row1);
2438  row4 = _mm_shuffle_epi8(row4,r8);
2439  row3 = _mm_add_epi32(row3, row4);
2440  row2 = _mm_xor_si128(row2, row3);
2441  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2442
2443  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
2444  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2445  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
2446
2447  t0 = _mm_unpackhi_epi32(m0,m3);
2448  t1 = _mm_unpacklo_epi32(m2,m3);
2449  t2 = _mm_unpackhi_epi64(t0,t1);
2450  buf3 = _mm_shuffle_epi32(t2,_MM_SHUFFLE(3,0,2,1));
2451
2452  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
2453  row4 = _mm_xor_si128(row4, row1);
2454  row4 = _mm_shuffle_epi8(row4,r16);
2455  row3 = _mm_add_epi32(row3, row4);
2456  row2 = _mm_xor_si128(row2, row3);
2457  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2458
2459  t0 = _mm_blend_epi16(m3,m2,0xC0);
2460  t1 = _mm_unpacklo_epi32(m0,m3);
2461  t2 = _mm_blend_epi16(t0,t1,0x0F);
2462  buf4 = _mm_shuffle_epi32(t2,_MM_SHUFFLE(0,1,2,3));
2463
2464  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
2465  row4 = _mm_xor_si128(row4, row1);
2466  row4 = _mm_shuffle_epi8(row4,r8);
2467  row3 = _mm_add_epi32(row3, row4);
2468  row2 = _mm_xor_si128(row2, row3);
2469  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2470
2471  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
2472  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2473  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
2474
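  // Feedforward for BLAKE2s: ff0/ff1 hold the incoming chaining value h, so the
  // stores below compute the new h as h ^ (row1 ^ row3) and h ^ (row2 ^ row4).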
2475  _mm_storeu_si128((__m128i *)(void*)(&state.h[0]), _mm_xor_si128(ff0, _mm_xor_si128(row1, row3)));
2476  _mm_storeu_si128((__m128i *)(void*)(&state.h[4]), _mm_xor_si128(ff1, _mm_xor_si128(row2, row4)));
2477}
2478
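// BLAKE2b compression with SSE4.1. Each G function needs 64-bit rotations by
// 32, 24, 16 and 63 bits; these map to _mm_shuffle_epi32 (rotate by 32),
// _mm_shuffle_epi8 with the r24/r16 byte masks below (byte-granular rotates),
// and _mm_srli_epi64 combined with an add-to-self (rotate right by 63, i.e.
// rotate left by 1).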
2479static void BLAKE2_SSE4_Compress64(const byte* input, BLAKE2_State<word64, true>& state)
2480{
2481  __m128i row1l, row1h;
2482  __m128i row2l, row2h;
2483  __m128i row3l, row3h;
2484  __m128i row4l, row4h;
2485  __m128i b0, b1, t0, t1;
2486
2487  const __m128i r16 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9);
2488  const __m128i r24 = _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10);
2489
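  // The 128-byte input block is fetched as eight unaligned 128-bit vectors;
  // every round below regathers the sigma-permuted message words from these
  // with unpacklo/unpackhi, alignr and blend rather than scalar loads.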
2490  const __m128i m0 = _mm_loadu_si128((const __m128i*)(const void*)(input + 00));
2491  const __m128i m1 = _mm_loadu_si128((const __m128i*)(const void*)(input + 16));
2492  const __m128i m2 = _mm_loadu_si128((const __m128i*)(const void*)(input + 32));
2493  const __m128i m3 = _mm_loadu_si128((const __m128i*)(const void*)(input + 48));
2494  const __m128i m4 = _mm_loadu_si128((const __m128i*)(const void*)(input + 64));
2495  const __m128i m5 = _mm_loadu_si128((const __m128i*)(const void*)(input + 80));
2496  const __m128i m6 = _mm_loadu_si128((const __m128i*)(const void*)(input + 96));
2497  const __m128i m7 = _mm_loadu_si128((const __m128i*)(const void*)(input + 112));
2498
2499  row1l = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[0]));
2500  row1h = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[2]));
2501  row2l = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[4]));
2502  row2h = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[6]));
2503  row3l = _mm_loadu_si128((const __m128i*)(const void*)(&BLAKE2B_IV(0)));
2504  row3h = _mm_loadu_si128((const __m128i*)(const void*)(&BLAKE2B_IV(2)));
2505  row4l = _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&BLAKE2B_IV(4))), _mm_loadu_si128((const __m128i*)(const void*)(&state.t[0])));
2506  row4h = _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&BLAKE2B_IV(6))), _mm_loadu_si128((const __m128i*)(const void*)(&state.f[0])));
2507
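  // The 4x4 state matrix of 64-bit words is held as eight halves: row1l/row1h
  // are v0..v3, row2l/row2h are v4..v7, row3l/row3h the first four IV words,
  // row4l the IV words 4-5 XORed with the counter t, and row4h the IV words
  // 6-7 XORed with the finalization flags f.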
2508  b0 = _mm_unpacklo_epi64(m0, m1);
2509  b1 = _mm_unpacklo_epi64(m2, m3);
2510  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2511  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2512  row4l = _mm_xor_si128(row4l, row1l);
2513  row4h = _mm_xor_si128(row4h, row1h);
2514  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2515  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2516  row3l = _mm_add_epi64(row3l, row4l);
2517  row3h = _mm_add_epi64(row3h, row4h);
2518  row2l = _mm_xor_si128(row2l, row3l);
2519  row2h = _mm_xor_si128(row2h, row3h);
2520  row2l = _mm_shuffle_epi8(row2l, r24);
2521  row2h = _mm_shuffle_epi8(row2h, r24);
2522
2523  b0 = _mm_unpackhi_epi64(m0, m1);
2524  b1 = _mm_unpackhi_epi64(m2, m3);
2525
2526  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2527  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2528  row4l = _mm_xor_si128(row4l, row1l);
2529  row4h = _mm_xor_si128(row4h, row1h);
2530  row4l = _mm_shuffle_epi8(row4l, r16);
2531  row4h = _mm_shuffle_epi8(row4h, r16);
2532  row3l = _mm_add_epi64(row3l, row4l);
2533  row3h = _mm_add_epi64(row3h, row4h);
2534  row2l = _mm_xor_si128(row2l, row3l);
2535  row2h = _mm_xor_si128(row2h, row3h);
2536  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
2537  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
2538
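  // Diagonalize: rotate row2 left and row4 right by one 64-bit word across the
  // l/h pair using _mm_alignr_epi8, and swap the halves of row3 (a two-word
  // rotation). The mirrored shuffle after the second half-round undoes this.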
2539  t0 = _mm_alignr_epi8(row2h, row2l, 8);
2540  t1 = _mm_alignr_epi8(row2l, row2h, 8);
2541  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
2542  t0 = _mm_alignr_epi8(row4h, row4l, 8);
2543  t1 = _mm_alignr_epi8(row4l, row4h, 8);
2544  row4l = t1, row4h = t0;
2545
2546  b0 = _mm_unpacklo_epi64(m4, m5);
2547  b1 = _mm_unpacklo_epi64(m6, m7);
2548
2549  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2550  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2551  row4l = _mm_xor_si128(row4l, row1l);
2552  row4h = _mm_xor_si128(row4h, row1h);
2553  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2554  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2555  row3l = _mm_add_epi64(row3l, row4l);
2556  row3h = _mm_add_epi64(row3h, row4h);
2557  row2l = _mm_xor_si128(row2l, row3l);
2558  row2h = _mm_xor_si128(row2h, row3h);
2559  row2l = _mm_shuffle_epi8(row2l, r24);
2560  row2h = _mm_shuffle_epi8(row2h, r24);
2561
2562  b0 = _mm_unpackhi_epi64(m4, m5);
2563  b1 = _mm_unpackhi_epi64(m6, m7);
2564
2565  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2566  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2567  row4l = _mm_xor_si128(row4l, row1l);
2568  row4h = _mm_xor_si128(row4h, row1h);
2569  row4l = _mm_shuffle_epi8(row4l, r16);
2570  row4h = _mm_shuffle_epi8(row4h, r16);
2571  row3l = _mm_add_epi64(row3l, row4l);
2572  row3h = _mm_add_epi64(row3h, row4h);
2573  row2l = _mm_xor_si128(row2l, row3l);
2574  row2h = _mm_xor_si128(row2h, row3h);
2575  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
2576  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
2577
2578  t0 = _mm_alignr_epi8(row2l, row2h, 8);
2579  t1 = _mm_alignr_epi8(row2h, row2l, 8);
2580  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
2581  t0 = _mm_alignr_epi8(row4l, row4h, 8);
2582  t1 = _mm_alignr_epi8(row4h, row4l, 8);
2583  row4l = t1, row4h = t0;
2584
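  // Round 2. From here on each round applies the next BLAKE2b sigma
  // permutation; only the message-gathering instructions change, while the G
  // and diagonalization steps repeat verbatim.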
2585  b0 = _mm_unpacklo_epi64(m7, m2);
2586  b1 = _mm_unpackhi_epi64(m4, m6);
2587
2588  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2589  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2590  row4l = _mm_xor_si128(row4l, row1l);
2591  row4h = _mm_xor_si128(row4h, row1h);
2592  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2593  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2594  row3l = _mm_add_epi64(row3l, row4l);
2595  row3h = _mm_add_epi64(row3h, row4h);
2596  row2l = _mm_xor_si128(row2l, row3l);
2597  row2h = _mm_xor_si128(row2h, row3h);
2598  row2l = _mm_shuffle_epi8(row2l, r24);
2599  row2h = _mm_shuffle_epi8(row2h, r24);
2600
2601  b0 = _mm_unpacklo_epi64(m5, m4);
2602  b1 = _mm_alignr_epi8(m3, m7, 8);
2603
2604  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2605  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2606  row4l = _mm_xor_si128(row4l, row1l);
2607  row4h = _mm_xor_si128(row4h, row1h);
2608  row4l = _mm_shuffle_epi8(row4l, r16);
2609  row4h = _mm_shuffle_epi8(row4h, r16);
2610  row3l = _mm_add_epi64(row3l, row4l);
2611  row3h = _mm_add_epi64(row3h, row4h);
2612  row2l = _mm_xor_si128(row2l, row3l);
2613  row2h = _mm_xor_si128(row2h, row3h);
2614  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
2615  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
2616
2617  t0 = _mm_alignr_epi8(row2h, row2l, 8);
2618  t1 = _mm_alignr_epi8(row2l, row2h, 8);
2619  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
2620  t0 = _mm_alignr_epi8(row4h, row4l, 8);
2621  t1 = _mm_alignr_epi8(row4l, row4h, 8);
2622  row4l = t1, row4h = t0;
2623
2624  b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2));
2625  b1 = _mm_unpackhi_epi64(m5, m2);
2626
2627  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2628  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2629  row4l = _mm_xor_si128(row4l, row1l);
2630  row4h = _mm_xor_si128(row4h, row1h);
2631  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2632  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2633  row3l = _mm_add_epi64(row3l, row4l);
2634  row3h = _mm_add_epi64(row3h, row4h);
2635  row2l = _mm_xor_si128(row2l, row3l);
2636  row2h = _mm_xor_si128(row2h, row3h);
2637  row2l = _mm_shuffle_epi8(row2l, r24);
2638  row2h = _mm_shuffle_epi8(row2h, r24);
2639
2640  b0 = _mm_unpacklo_epi64(m6, m1);
2641  b1 = _mm_unpackhi_epi64(m3, m1);
2642
2643  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2644  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2645  row4l = _mm_xor_si128(row4l, row1l);
2646  row4h = _mm_xor_si128(row4h, row1h);
2647  row4l = _mm_shuffle_epi8(row4l, r16);
2648  row4h = _mm_shuffle_epi8(row4h, r16);
2649  row3l = _mm_add_epi64(row3l, row4l);
2650  row3h = _mm_add_epi64(row3h, row4h);
2651  row2l = _mm_xor_si128(row2l, row3l);
2652  row2h = _mm_xor_si128(row2h, row3h);
2653  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
2654  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
2655
2656  t0 = _mm_alignr_epi8(row2l, row2h, 8);
2657  t1 = _mm_alignr_epi8(row2h, row2l, 8);
2658  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
2659  t0 = _mm_alignr_epi8(row4l, row4h, 8);
2660  t1 = _mm_alignr_epi8(row4h, row4l, 8);
2661  row4l = t1, row4h = t0;
2662
2663  b0 = _mm_alignr_epi8(m6, m5, 8);
2664  b1 = _mm_unpackhi_epi64(m2, m7);
2665
2666  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2667  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2668  row4l = _mm_xor_si128(row4l, row1l);
2669  row4h = _mm_xor_si128(row4h, row1h);
2670  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2671  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2672  row3l = _mm_add_epi64(row3l, row4l);
2673  row3h = _mm_add_epi64(row3h, row4h);
2674  row2l = _mm_xor_si128(row2l, row3l);
2675  row2h = _mm_xor_si128(row2h, row3h);
2676  row2l = _mm_shuffle_epi8(row2l, r24);
2677  row2h = _mm_shuffle_epi8(row2h, r24);
2678
2679  b0 = _mm_unpacklo_epi64(m4, m0);
2680  b1 = _mm_blend_epi16(m1, m6, 0xF0);
2681
2682  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2683  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2684  row4l = _mm_xor_si128(row4l, row1l);
2685  row4h = _mm_xor_si128(row4h, row1h);
2686  row4l = _mm_shuffle_epi8(row4l, r16);
2687  row4h = _mm_shuffle_epi8(row4h, r16);
2688  row3l = _mm_add_epi64(row3l, row4l);
2689  row3h = _mm_add_epi64(row3h, row4h);
2690  row2l = _mm_xor_si128(row2l, row3l);
2691  row2h = _mm_xor_si128(row2h, row3h);
2692  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
2693  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
2694
2695  t0 = _mm_alignr_epi8(row2h, row2l, 8);
2696  t1 = _mm_alignr_epi8(row2l, row2h, 8);
2697  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
2698  t0 = _mm_alignr_epi8(row4h, row4l, 8);
2699  t1 = _mm_alignr_epi8(row4l, row4h, 8);
2700  row4l = t1, row4h = t0;
2701
2702  b0 = _mm_blend_epi16(m5, m1, 0xF0);
2703  b1 = _mm_unpackhi_epi64(m3, m4);
2704
2705  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2706  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2707  row4l = _mm_xor_si128(row4l, row1l);
2708  row4h = _mm_xor_si128(row4h, row1h);
2709  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2710  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2711  row3l = _mm_add_epi64(row3l, row4l);
2712  row3h = _mm_add_epi64(row3h, row4h);
2713  row2l = _mm_xor_si128(row2l, row3l);
2714  row2h = _mm_xor_si128(row2h, row3h);
2715  row2l = _mm_shuffle_epi8(row2l, r24);
2716  row2h = _mm_shuffle_epi8(row2h, r24);
2717
2718  b0 = _mm_unpacklo_epi64(m7, m3);
2719  b1 = _mm_alignr_epi8(m2, m0, 8);
2720
2721  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2722  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2723  row4l = _mm_xor_si128(row4l, row1l);
2724  row4h = _mm_xor_si128(row4h, row1h);
2725  row4l = _mm_shuffle_epi8(row4l, r16);
2726  row4h = _mm_shuffle_epi8(row4h, r16);
2727  row3l = _mm_add_epi64(row3l, row4l);
2728  row3h = _mm_add_epi64(row3h, row4h);
2729  row2l = _mm_xor_si128(row2l, row3l);
2730  row2h = _mm_xor_si128(row2h, row3h);
2731  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
2732  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
2733
2734  t0 = _mm_alignr_epi8(row2l, row2h, 8);
2735  t1 = _mm_alignr_epi8(row2h, row2l, 8);
2736  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
2737  t0 = _mm_alignr_epi8(row4l, row4h, 8);
2738  t1 = _mm_alignr_epi8(row4h, row4l, 8);
2739  row4l = t1, row4h = t0;
2740
2741  b0 = _mm_unpackhi_epi64(m3, m1);
2742  b1 = _mm_unpackhi_epi64(m6, m5);
2743
2744  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2745  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2746  row4l = _mm_xor_si128(row4l, row1l);
2747  row4h = _mm_xor_si128(row4h, row1h);
2748  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2749  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2750  row3l = _mm_add_epi64(row3l, row4l);
2751  row3h = _mm_add_epi64(row3h, row4h);
2752  row2l = _mm_xor_si128(row2l, row3l);
2753  row2h = _mm_xor_si128(row2h, row3h);
2754  row2l = _mm_shuffle_epi8(row2l, r24);
2755  row2h = _mm_shuffle_epi8(row2h, r24);
2756
2757  b0 = _mm_unpackhi_epi64(m4, m0);
2758  b1 = _mm_unpacklo_epi64(m6, m7);
2759
2760  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2761  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2762  row4l = _mm_xor_si128(row4l, row1l);
2763  row4h = _mm_xor_si128(row4h, row1h);
2764  row4l = _mm_shuffle_epi8(row4l, r16);
2765  row4h = _mm_shuffle_epi8(row4h, r16);
2766  row3l = _mm_add_epi64(row3l, row4l);
2767  row3h = _mm_add_epi64(row3h, row4h);
2768  row2l = _mm_xor_si128(row2l, row3l);
2769  row2h = _mm_xor_si128(row2h, row3h);
2770  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
2771  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
2772
2773  t0 = _mm_alignr_epi8(row2h, row2l, 8);
2774  t1 = _mm_alignr_epi8(row2l, row2h, 8);
2775  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
2776  t0 = _mm_alignr_epi8(row4h, row4l, 8);
2777  t1 = _mm_alignr_epi8(row4l, row4h, 8);
2778  row4l = t1, row4h = t0;
2779
2780  b0 = _mm_blend_epi16(m1, m2, 0xF0);
2781  b1 = _mm_blend_epi16(m2, m7, 0xF0);
2782
2783  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2784  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2785  row4l = _mm_xor_si128(row4l, row1l);
2786  row4h = _mm_xor_si128(row4h, row1h);
2787  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2788  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2789  row3l = _mm_add_epi64(row3l, row4l);
2790  row3h = _mm_add_epi64(row3h, row4h);
2791  row2l = _mm_xor_si128(row2l, row3l);
2792  row2h = _mm_xor_si128(row2h, row3h);
2793  row2l = _mm_shuffle_epi8(row2l, r24);
2794  row2h = _mm_shuffle_epi8(row2h, r24);
2795
2796  b0 = _mm_unpacklo_epi64(m3, m5);
2797  b1 = _mm_unpacklo_epi64(m0, m4);
2798
2799  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2800  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2801  row4l = _mm_xor_si128(row4l, row1l);
2802  row4h = _mm_xor_si128(row4h, row1h);
2803  row4l = _mm_shuffle_epi8(row4l, r16);
2804  row4h = _mm_shuffle_epi8(row4h, r16);
2805  row3l = _mm_add_epi64(row3l, row4l);
2806  row3h = _mm_add_epi64(row3h, row4h);
2807  row2l = _mm_xor_si128(row2l, row3l);
2808  row2h = _mm_xor_si128(row2h, row3h);
2809  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
2810  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
2811
2812  t0 = _mm_alignr_epi8(row2l, row2h, 8);
2813  t1 = _mm_alignr_epi8(row2h, row2l, 8);
2814  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
2815  t0 = _mm_alignr_epi8(row4l, row4h, 8);
2816  t1 = _mm_alignr_epi8(row4h, row4l, 8);
2817  row4l = t1, row4h = t0;
2818
2819  b0 = _mm_unpackhi_epi64(m4, m2);
2820  b1 = _mm_unpacklo_epi64(m1, m5);
2821
2822  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2823  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2824  row4l = _mm_xor_si128(row4l, row1l);
2825  row4h = _mm_xor_si128(row4h, row1h);
2826  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2827  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2828  row3l = _mm_add_epi64(row3l, row4l);
2829  row3h = _mm_add_epi64(row3h, row4h);
2830  row2l = _mm_xor_si128(row2l, row3l);
2831  row2h = _mm_xor_si128(row2h, row3h);
2832  row2l = _mm_shuffle_epi8(row2l, r24);
2833  row2h = _mm_shuffle_epi8(row2h, r24);
2834
2835  b0 = _mm_blend_epi16(m0, m3, 0xF0);
2836  b1 = _mm_blend_epi16(m2, m7, 0xF0);
2837
2838  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2839  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2840  row4l = _mm_xor_si128(row4l, row1l);
2841  row4h = _mm_xor_si128(row4h, row1h);
2842  row4l = _mm_shuffle_epi8(row4l, r16);
2843  row4h = _mm_shuffle_epi8(row4h, r16);
2844  row3l = _mm_add_epi64(row3l, row4l);
2845  row3h = _mm_add_epi64(row3h, row4h);
2846  row2l = _mm_xor_si128(row2l, row3l);
2847  row2h = _mm_xor_si128(row2h, row3h);
2848  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
2849  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
2850
2851  t0 = _mm_alignr_epi8(row2h, row2l, 8);
2852  t1 = _mm_alignr_epi8(row2l, row2h, 8);
2853  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
2854  t0 = _mm_alignr_epi8(row4h, row4l, 8);
2855  t1 = _mm_alignr_epi8(row4l, row4h, 8);
2856  row4l = t1, row4h = t0;
2857
2858  b0 = _mm_blend_epi16(m7, m5, 0xF0);
2859  b1 = _mm_blend_epi16(m3, m1, 0xF0);
2860
2861  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2862  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2863  row4l = _mm_xor_si128(row4l, row1l);
2864  row4h = _mm_xor_si128(row4h, row1h);
2865  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2866  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2867  row3l = _mm_add_epi64(row3l, row4l);
2868  row3h = _mm_add_epi64(row3h, row4h);
2869  row2l = _mm_xor_si128(row2l, row3l);
2870  row2h = _mm_xor_si128(row2h, row3h);
2871  row2l = _mm_shuffle_epi8(row2l, r24);
2872  row2h = _mm_shuffle_epi8(row2h, r24);
2873
2874  b0 = _mm_alignr_epi8(m6, m0, 8);
2875  b1 = _mm_blend_epi16(m4, m6, 0xF0);
2876
2877  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2878  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2879  row4l = _mm_xor_si128(row4l, row1l);
2880  row4h = _mm_xor_si128(row4h, row1h);
2881  row4l = _mm_shuffle_epi8(row4l, r16);
2882  row4h = _mm_shuffle_epi8(row4h, r16);
2883  row3l = _mm_add_epi64(row3l, row4l);
2884  row3h = _mm_add_epi64(row3h, row4h);
2885  row2l = _mm_xor_si128(row2l, row3l);
2886  row2h = _mm_xor_si128(row2h, row3h);
2887  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
2888  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
2889
2890  t0 = _mm_alignr_epi8(row2l, row2h, 8);
2891  t1 = _mm_alignr_epi8(row2h, row2l, 8);
2892  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
2893  t0 = _mm_alignr_epi8(row4l, row4h, 8);
2894  t1 = _mm_alignr_epi8(row4h, row4l, 8);
2895  row4l = t1, row4h = t0;
2896
2897  b0 = _mm_unpacklo_epi64(m1, m3);
2898  b1 = _mm_unpacklo_epi64(m0, m4);
2899
2900  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2901  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2902  row4l = _mm_xor_si128(row4l, row1l);
2903  row4h = _mm_xor_si128(row4h, row1h);
2904  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2905  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2906  row3l = _mm_add_epi64(row3l, row4l);
2907  row3h = _mm_add_epi64(row3h, row4h);
2908  row2l = _mm_xor_si128(row2l, row3l);
2909  row2h = _mm_xor_si128(row2h, row3h);
2910  row2l = _mm_shuffle_epi8(row2l, r24);
2911  row2h = _mm_shuffle_epi8(row2h, r24);
2912
2913  b0 = _mm_unpacklo_epi64(m6, m5);
2914  b1 = _mm_unpackhi_epi64(m5, m1);
2915
2916  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2917  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2918  row4l = _mm_xor_si128(row4l, row1l);
2919  row4h = _mm_xor_si128(row4h, row1h);
2920  row4l = _mm_shuffle_epi8(row4l, r16);
2921  row4h = _mm_shuffle_epi8(row4h, r16);
2922  row3l = _mm_add_epi64(row3l, row4l);
2923  row3h = _mm_add_epi64(row3h, row4h);
2924  row2l = _mm_xor_si128(row2l, row3l);
2925  row2h = _mm_xor_si128(row2h, row3h);
2926  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
2927  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
2928
2929  t0 = _mm_alignr_epi8(row2h, row2l, 8);
2930  t1 = _mm_alignr_epi8(row2l, row2h, 8);
2931  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
2932  t0 = _mm_alignr_epi8(row4h, row4l, 8);
2933  t1 = _mm_alignr_epi8(row4l, row4h, 8);
2934  row4l = t1, row4h = t0;
2935
2936  b0 = _mm_blend_epi16(m2, m3, 0xF0);
2937  b1 = _mm_unpackhi_epi64(m7, m0);
2938
2939  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2940  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2941  row4l = _mm_xor_si128(row4l, row1l);
2942  row4h = _mm_xor_si128(row4h, row1h);
2943  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2944  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2945  row3l = _mm_add_epi64(row3l, row4l);
2946  row3h = _mm_add_epi64(row3h, row4h);
2947  row2l = _mm_xor_si128(row2l, row3l);
2948  row2h = _mm_xor_si128(row2h, row3h);
2949  row2l = _mm_shuffle_epi8(row2l, r24);
2950  row2h = _mm_shuffle_epi8(row2h, r24);
2951
2952  b0 = _mm_unpackhi_epi64(m6, m2);
2953  b1 = _mm_blend_epi16(m7, m4, 0xF0);
2954
2955  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2956  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2957  row4l = _mm_xor_si128(row4l, row1l);
2958  row4h = _mm_xor_si128(row4h, row1h);
2959  row4l = _mm_shuffle_epi8(row4l, r16);
2960  row4h = _mm_shuffle_epi8(row4h, r16);
2961  row3l = _mm_add_epi64(row3l, row4l);
2962  row3h = _mm_add_epi64(row3h, row4h);
2963  row2l = _mm_xor_si128(row2l, row3l);
2964  row2h = _mm_xor_si128(row2h, row3h);
2965  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
2966  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
2967
2968  t0 = _mm_alignr_epi8(row2l, row2h, 8);
2969  t1 = _mm_alignr_epi8(row2h, row2l, 8);
2970  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
2971  t0 = _mm_alignr_epi8(row4l, row4h, 8);
2972  t1 = _mm_alignr_epi8(row4h, row4l, 8);
2973  row4l = t1, row4h = t0;
2974
2975  b0 = _mm_blend_epi16(m6, m0, 0xF0);
2976  b1 = _mm_unpacklo_epi64(m7, m2);
2977
2978  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2979  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2980  row4l = _mm_xor_si128(row4l, row1l);
2981  row4h = _mm_xor_si128(row4h, row1h);
2982  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2983  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2984  row3l = _mm_add_epi64(row3l, row4l);
2985  row3h = _mm_add_epi64(row3h, row4h);
2986  row2l = _mm_xor_si128(row2l, row3l);
2987  row2h = _mm_xor_si128(row2h, row3h);
2988  row2l = _mm_shuffle_epi8(row2l, r24);
2989  row2h = _mm_shuffle_epi8(row2h, r24);
2990
2991  b0 = _mm_unpackhi_epi64(m2, m7);
2992  b1 = _mm_alignr_epi8(m5, m6, 8);
2993
2994  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2995  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2996  row4l = _mm_xor_si128(row4l, row1l);
2997  row4h = _mm_xor_si128(row4h, row1h);
2998  row4l = _mm_shuffle_epi8(row4l, r16);
2999  row4h = _mm_shuffle_epi8(row4h, r16);
3000  row3l = _mm_add_epi64(row3l, row4l);
3001  row3h = _mm_add_epi64(row3h, row4h);
3002  row2l = _mm_xor_si128(row2l, row3l);
3003  row2h = _mm_xor_si128(row2h, row3h);
3004  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
3005  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
3006
3007  t0 = _mm_alignr_epi8(row2h, row2l, 8);
3008  t1 = _mm_alignr_epi8(row2l, row2h, 8);
3009  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
3010  t0 = _mm_alignr_epi8(row4h, row4l, 8);
3011  t1 = _mm_alignr_epi8(row4l, row4h, 8);
3012  row4l = t1, row4h = t0;
3013
3014  b0 = _mm_unpacklo_epi64(m0, m3);
3015  b1 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1,0,3,2));
3016
3017  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3018  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3019  row4l = _mm_xor_si128(row4l, row1l);
3020  row4h = _mm_xor_si128(row4h, row1h);
3021  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
3022  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
3023  row3l = _mm_add_epi64(row3l, row4l);
3024  row3h = _mm_add_epi64(row3h, row4h);
3025  row2l = _mm_xor_si128(row2l, row3l);
3026  row2h = _mm_xor_si128(row2h, row3h);
3027  row2l = _mm_shuffle_epi8(row2l, r24);
3028  row2h = _mm_shuffle_epi8(row2h, r24);
3029
3030  b0 = _mm_unpackhi_epi64(m3, m1);
3031  b1 = _mm_blend_epi16(m1, m5, 0xF0);
3032
3033  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3034  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3035  row4l = _mm_xor_si128(row4l, row1l);
3036  row4h = _mm_xor_si128(row4h, row1h);
3037  row4l = _mm_shuffle_epi8(row4l, r16);
3038  row4h = _mm_shuffle_epi8(row4h, r16);
3039  row3l = _mm_add_epi64(row3l, row4l);
3040  row3h = _mm_add_epi64(row3h, row4h);
3041  row2l = _mm_xor_si128(row2l, row3l);
3042  row2h = _mm_xor_si128(row2h, row3h);
3043  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
3044  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
3045
3046  t0 = _mm_alignr_epi8(row2l, row2h, 8);
3047  t1 = _mm_alignr_epi8(row2h, row2l, 8);
3048  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
3049  t0 = _mm_alignr_epi8(row4l, row4h, 8);
3050  t1 = _mm_alignr_epi8(row4h, row4l, 8);
3051  row4l = t1, row4h = t0;
3052
3053  b0 = _mm_unpackhi_epi64(m6, m3);
3054  b1 = _mm_blend_epi16(m6, m1, 0xF0);
3055
3056  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3057  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3058  row4l = _mm_xor_si128(row4l, row1l);
3059  row4h = _mm_xor_si128(row4h, row1h);
3060  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
3061  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
3062  row3l = _mm_add_epi64(row3l, row4l);
3063  row3h = _mm_add_epi64(row3h, row4h);
3064  row2l = _mm_xor_si128(row2l, row3l);
3065  row2h = _mm_xor_si128(row2h, row3h);
3066  row2l = _mm_shuffle_epi8(row2l, r24);
3067  row2h = _mm_shuffle_epi8(row2h, r24);
3068
3069  b0 = _mm_alignr_epi8(m7, m5, 8);
3070  b1 = _mm_unpackhi_epi64(m0, m4);
3071
3072  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3073  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3074  row4l = _mm_xor_si128(row4l, row1l);
3075  row4h = _mm_xor_si128(row4h, row1h);
3076  row4l = _mm_shuffle_epi8(row4l, r16);
3077  row4h = _mm_shuffle_epi8(row4h, r16);
3078  row3l = _mm_add_epi64(row3l, row4l);
3079  row3h = _mm_add_epi64(row3h, row4h);
3080  row2l = _mm_xor_si128(row2l, row3l);
3081  row2h = _mm_xor_si128(row2h, row3h);
3082  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
3083  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
3084
3085  t0 = _mm_alignr_epi8(row2h, row2l, 8);
3086  t1 = _mm_alignr_epi8(row2l, row2h, 8);
3087  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
3088  t0 = _mm_alignr_epi8(row4h, row4l, 8);
3089  t1 = _mm_alignr_epi8(row4l, row4h, 8);
3090  row4l = t1, row4h = t0;
3091
3092  b0 = _mm_unpackhi_epi64(m2, m7);
3093  b1 = _mm_unpacklo_epi64(m4, m1);
3094
3095  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3096  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3097  row4l = _mm_xor_si128(row4l, row1l);
3098  row4h = _mm_xor_si128(row4h, row1h);
3099  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
3100  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
3101  row3l = _mm_add_epi64(row3l, row4l);
3102  row3h = _mm_add_epi64(row3h, row4h);
3103  row2l = _mm_xor_si128(row2l, row3l);
3104  row2h = _mm_xor_si128(row2h, row3h);
3105  row2l = _mm_shuffle_epi8(row2l, r24);
3106  row2h = _mm_shuffle_epi8(row2h, r24);
3107
3108  b0 = _mm_unpacklo_epi64(m0, m2);
3109  b1 = _mm_unpacklo_epi64(m3, m5);
3110
3111  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3112  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3113  row4l = _mm_xor_si128(row4l, row1l);
3114  row4h = _mm_xor_si128(row4h, row1h);
3115  row4l = _mm_shuffle_epi8(row4l, r16);
3116  row4h = _mm_shuffle_epi8(row4h, r16);
3117  row3l = _mm_add_epi64(row3l, row4l);
3118  row3h = _mm_add_epi64(row3h, row4h);
3119  row2l = _mm_xor_si128(row2l, row3l);
3120  row2h = _mm_xor_si128(row2h, row3h);
3121  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
3122  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
3123
3124  t0 = _mm_alignr_epi8(row2l, row2h, 8);
3125  t1 = _mm_alignr_epi8(row2h, row2l, 8);
3126  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
3127  t0 = _mm_alignr_epi8(row4l, row4h, 8);
3128  t1 = _mm_alignr_epi8(row4h, row4l, 8);
3129  row4l = t1, row4h = t0;
3130
3131  b0 = _mm_unpacklo_epi64(m3, m7);
3132  b1 = _mm_alignr_epi8(m0, m5, 8);
3133
3134  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3135  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3136  row4l = _mm_xor_si128(row4l, row1l);
3137  row4h = _mm_xor_si128(row4h, row1h);
3138  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
3139  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
3140  row3l = _mm_add_epi64(row3l, row4l);
3141  row3h = _mm_add_epi64(row3h, row4h);
3142  row2l = _mm_xor_si128(row2l, row3l);
3143  row2h = _mm_xor_si128(row2h, row3h);
3144  row2l = _mm_shuffle_epi8(row2l, r24);
3145  row2h = _mm_shuffle_epi8(row2h, r24);
3146
3147  b0 = _mm_unpackhi_epi64(m7, m4);
3148  b1 = _mm_alignr_epi8(m4, m1, 8);
3149
3150  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3151  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3152  row4l = _mm_xor_si128(row4l, row1l);
3153  row4h = _mm_xor_si128(row4h, row1h);
3154  row4l = _mm_shuffle_epi8(row4l, r16);
3155  row4h = _mm_shuffle_epi8(row4h, r16);
3156  row3l = _mm_add_epi64(row3l, row4l);
3157  row3h = _mm_add_epi64(row3h, row4h);
3158  row2l = _mm_xor_si128(row2l, row3l);
3159  row2h = _mm_xor_si128(row2h, row3h);
3160  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
3161  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
3162
3163  t0 = _mm_alignr_epi8(row2h, row2l, 8);
3164  t1 = _mm_alignr_epi8(row2l, row2h, 8);
3165  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
3166  t0 = _mm_alignr_epi8(row4h, row4l, 8);
3167  t1 = _mm_alignr_epi8(row4l, row4h, 8);
3168  row4l = t1, row4h = t0;
3169
3170  b0 = m6;
3171  b1 = _mm_alignr_epi8(m5, m0, 8);
3172
3173  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3174  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3175  row4l = _mm_xor_si128(row4l, row1l);
3176  row4h = _mm_xor_si128(row4h, row1h);
3177  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
3178  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
3179  row3l = _mm_add_epi64(row3l, row4l);
3180  row3h = _mm_add_epi64(row3h, row4h);
3181  row2l = _mm_xor_si128(row2l, row3l);
3182  row2h = _mm_xor_si128(row2h, row3h);
3183  row2l = _mm_shuffle_epi8(row2l, r24);
3184  row2h = _mm_shuffle_epi8(row2h, r24);
3185
3186  b0 = _mm_blend_epi16(m1, m3, 0xF0);
3187  b1 = m2;
3188
3189  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3190  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3191  row4l = _mm_xor_si128(row4l, row1l);
3192  row4h = _mm_xor_si128(row4h, row1h);
3193  row4l = _mm_shuffle_epi8(row4l, r16);
3194  row4h = _mm_shuffle_epi8(row4h, r16);
3195  row3l = _mm_add_epi64(row3l, row4l);
3196  row3h = _mm_add_epi64(row3h, row4h);
3197  row2l = _mm_xor_si128(row2l, row3l);
3198  row2h = _mm_xor_si128(row2h, row3h);
3199  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
3200  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
3201
3202  t0 = _mm_alignr_epi8(row2l, row2h, 8);
3203  t1 = _mm_alignr_epi8(row2h, row2l, 8);
3204  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
3205  t0 = _mm_alignr_epi8(row4l, row4h, 8);
3206  t1 = _mm_alignr_epi8(row4h, row4l, 8);
3207  row4l = t1, row4h = t0;
3208
3209  b0 = _mm_unpacklo_epi64(m5, m4);
3210  b1 = _mm_unpackhi_epi64(m3, m0);
3211
3212  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3213  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3214  row4l = _mm_xor_si128(row4l, row1l);
3215  row4h = _mm_xor_si128(row4h, row1h);
3216  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
3217  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
3218  row3l = _mm_add_epi64(row3l, row4l);
3219  row3h = _mm_add_epi64(row3h, row4h);
3220  row2l = _mm_xor_si128(row2l, row3l);
3221  row2h = _mm_xor_si128(row2h, row3h);
3222  row2l = _mm_shuffle_epi8(row2l, r24);
3223  row2h = _mm_shuffle_epi8(row2h, r24);
3224
3225  b0 = _mm_unpacklo_epi64(m1, m2);
3226  b1 = _mm_blend_epi16(m3, m2, 0xF0);
3227
3228  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3229  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3230  row4l = _mm_xor_si128(row4l, row1l);
3231  row4h = _mm_xor_si128(row4h, row1h);
3232  row4l = _mm_shuffle_epi8(row4l, r16);
3233  row4h = _mm_shuffle_epi8(row4h, r16);
3234  row3l = _mm_add_epi64(row3l, row4l);
3235  row3h = _mm_add_epi64(row3h, row4h);
3236  row2l = _mm_xor_si128(row2l, row3l);
3237  row2h = _mm_xor_si128(row2h, row3h);
3238  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
3239  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
3240
3241  t0 = _mm_alignr_epi8(row2h, row2l, 8);
3242  t1 = _mm_alignr_epi8(row2l, row2h, 8);
3243  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
3244  t0 = _mm_alignr_epi8(row4h, row4l, 8);
3245  t1 = _mm_alignr_epi8(row4l, row4h, 8);
3246  row4l = t1, row4h = t0;
3247
3248  b0 = _mm_unpackhi_epi64(m7, m4);
3249  b1 = _mm_unpackhi_epi64(m1, m6);
3250
3251  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3252  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3253  row4l = _mm_xor_si128(row4l, row1l);
3254  row4h = _mm_xor_si128(row4h, row1h);
3255  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
3256  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
3257  row3l = _mm_add_epi64(row3l, row4l);
3258  row3h = _mm_add_epi64(row3h, row4h);
3259  row2l = _mm_xor_si128(row2l, row3l);
3260  row2h = _mm_xor_si128(row2h, row3h);
3261  row2l = _mm_shuffle_epi8(row2l, r24);
3262  row2h = _mm_shuffle_epi8(row2h, r24);
3263
3264  b0 = _mm_alignr_epi8(m7, m5, 8);
3265  b1 = _mm_unpacklo_epi64(m6, m0);
3266
3267  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3268  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3269  row4l = _mm_xor_si128(row4l, row1l);
3270  row4h = _mm_xor_si128(row4h, row1h);
3271  row4l = _mm_shuffle_epi8(row4l, r16);
3272  row4h = _mm_shuffle_epi8(row4h, r16);
3273  row3l = _mm_add_epi64(row3l, row4l);
3274  row3h = _mm_add_epi64(row3h, row4h);
3275  row2l = _mm_xor_si128(row2l, row3l);
3276  row2h = _mm_xor_si128(row2h, row3h);
3277  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
3278  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
3279
3280  t0 = _mm_alignr_epi8(row2l, row2h, 8);
3281  t1 = _mm_alignr_epi8(row2h, row2l, 8);
3282  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
3283  t0 = _mm_alignr_epi8(row4l, row4h, 8);
3284  t1 = _mm_alignr_epi8(row4h, row4l, 8);
3285  row4l = t1, row4h = t0;
3286
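  // Rounds 11 and 12 reuse the round-1 and round-2 message schedules: the
  // BLAKE2b sigma table has ten entries and repeats for the last two rounds.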
3287  b0 = _mm_unpacklo_epi64(m0, m1);
3288  b1 = _mm_unpacklo_epi64(m2, m3);
3289
3290  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3291  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3292  row4l = _mm_xor_si128(row4l, row1l);
3293  row4h = _mm_xor_si128(row4h, row1h);
3294  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
3295  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
3296  row3l = _mm_add_epi64(row3l, row4l);
3297  row3h = _mm_add_epi64(row3h, row4h);
3298  row2l = _mm_xor_si128(row2l, row3l);
3299  row2h = _mm_xor_si128(row2h, row3h);
3300  row2l = _mm_shuffle_epi8(row2l, r24);
3301  row2h = _mm_shuffle_epi8(row2h, r24);
3302
3303  b0 = _mm_unpackhi_epi64(m0, m1);
3304  b1 = _mm_unpackhi_epi64(m2, m3);
3305
3306  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3307  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3308  row4l = _mm_xor_si128(row4l, row1l);
3309  row4h = _mm_xor_si128(row4h, row1h);
3310  row4l = _mm_shuffle_epi8(row4l, r16);
3311  row4h = _mm_shuffle_epi8(row4h, r16);
3312  row3l = _mm_add_epi64(row3l, row4l);
3313  row3h = _mm_add_epi64(row3h, row4h);
3314  row2l = _mm_xor_si128(row2l, row3l);
3315  row2h = _mm_xor_si128(row2h, row3h);
3316  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
3317  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
3318
3319  t0 = _mm_alignr_epi8(row2h, row2l, 8);
3320  t1 = _mm_alignr_epi8(row2l, row2h, 8);
3321  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
3322  t0 = _mm_alignr_epi8(row4h, row4l, 8);
3323  t1 = _mm_alignr_epi8(row4l, row4h, 8);
3324  row4l = t1, row4h = t0;
3325
3326  b0 = _mm_unpacklo_epi64(m4, m5);
3327  b1 = _mm_unpacklo_epi64(m6, m7);
3328
3329  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3330  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3331  row4l = _mm_xor_si128(row4l, row1l);
3332  row4h = _mm_xor_si128(row4h, row1h);
3333  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
3334  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
3335  row3l = _mm_add_epi64(row3l, row4l);
3336  row3h = _mm_add_epi64(row3h, row4h);
3337  row2l = _mm_xor_si128(row2l, row3l);
3338  row2h = _mm_xor_si128(row2h, row3h);
3339  row2l = _mm_shuffle_epi8(row2l, r24);
3340  row2h = _mm_shuffle_epi8(row2h, r24);
3341
3342  b0 = _mm_unpackhi_epi64(m4, m5);
3343  b1 = _mm_unpackhi_epi64(m6, m7);
3344
3345  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3346  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3347  row4l = _mm_xor_si128(row4l, row1l);
3348  row4h = _mm_xor_si128(row4h, row1h);
3349  row4l = _mm_shuffle_epi8(row4l, r16);
3350  row4h = _mm_shuffle_epi8(row4h, r16);
3351  row3l = _mm_add_epi64(row3l, row4l);
3352  row3h = _mm_add_epi64(row3h, row4h);
3353  row2l = _mm_xor_si128(row2l, row3l);
3354  row2h = _mm_xor_si128(row2h, row3h);
3355  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
3356  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
3357
3358  t0 = _mm_alignr_epi8(row2l, row2h, 8);
3359  t1 = _mm_alignr_epi8(row2h, row2l, 8);
3360  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
3361  t0 = _mm_alignr_epi8(row4l, row4h, 8);
3362  t1 = _mm_alignr_epi8(row4h, row4l, 8);
3363  row4l = t1, row4h = t0;
3364
3365  b0 = _mm_unpacklo_epi64(m7, m2);
3366  b1 = _mm_unpackhi_epi64(m4, m6);
3367
3368  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3369  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3370  row4l = _mm_xor_si128(row4l, row1l);
3371  row4h = _mm_xor_si128(row4h, row1h);
3372  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
3373  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
3374  row3l = _mm_add_epi64(row3l, row4l);
3375  row3h = _mm_add_epi64(row3h, row4h);
3376  row2l = _mm_xor_si128(row2l, row3l);
3377  row2h = _mm_xor_si128(row2h, row3h);
3378  row2l = _mm_shuffle_epi8(row2l, r24);
3379  row2h = _mm_shuffle_epi8(row2h, r24);
3380
3381  b0 = _mm_unpacklo_epi64(m5, m4);
3382  b1 = _mm_alignr_epi8(m3, m7, 8);
3383
3384  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3385  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3386  row4l = _mm_xor_si128(row4l, row1l);
3387  row4h = _mm_xor_si128(row4h, row1h);
3388  row4l = _mm_shuffle_epi8(row4l, r16);
3389  row4h = _mm_shuffle_epi8(row4h, r16);
3390  row3l = _mm_add_epi64(row3l, row4l);
3391  row3h = _mm_add_epi64(row3h, row4h);
3392  row2l = _mm_xor_si128(row2l, row3l);
3393  row2h = _mm_xor_si128(row2h, row3h);
3394  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
3395  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
3396
3397  t0 = _mm_alignr_epi8(row2h, row2l, 8);
3398  t1 = _mm_alignr_epi8(row2l, row2h, 8);
3399  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
3400  t0 = _mm_alignr_epi8(row4h, row4l, 8);
3401  t1 = _mm_alignr_epi8(row4l, row4h, 8);
3402  row4l = t1, row4h = t0;
3403
3404  b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2));
3405  b1 = _mm_unpackhi_epi64(m5, m2);
3406
3407  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3408  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3409  row4l = _mm_xor_si128(row4l, row1l);
3410  row4h = _mm_xor_si128(row4h, row1h);
3411  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
3412  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
3413  row3l = _mm_add_epi64(row3l, row4l);
3414  row3h = _mm_add_epi64(row3h, row4h);
3415  row2l = _mm_xor_si128(row2l, row3l);
3416  row2h = _mm_xor_si128(row2h, row3h);
3417  row2l = _mm_shuffle_epi8(row2l, r24);
3418  row2h = _mm_shuffle_epi8(row2h, r24);
3419
3420  b0 = _mm_unpacklo_epi64(m6, m1);
3421  b1 = _mm_unpackhi_epi64(m3, m1);
3422
3423  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3424  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3425  row4l = _mm_xor_si128(row4l, row1l);
3426  row4h = _mm_xor_si128(row4h, row1h);
3427  row4l = _mm_shuffle_epi8(row4l, r16);
3428  row4h = _mm_shuffle_epi8(row4h, r16);
3429  row3l = _mm_add_epi64(row3l, row4l);
3430  row3h = _mm_add_epi64(row3h, row4h);
3431  row2l = _mm_xor_si128(row2l, row3l);
3432  row2h = _mm_xor_si128(row2h, row3h);
3433  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
3434  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
3435
3436  t0 = _mm_alignr_epi8(row2l, row2h, 8);
3437  t1 = _mm_alignr_epi8(row2h, row2l, 8);
3438  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
3439  t0 = _mm_alignr_epi8(row4l, row4h, 8);
3440  t1 = _mm_alignr_epi8(row4h, row4l, 8);
3441  row4l = t1, row4h = t0;
3442
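  // Feedforward: the new chaining value is h ^= (v0..v7) ^ (v8..v15),
  // computed here per 128-bit half of each row.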
3443  row1l = _mm_xor_si128(row3l, row1l);
3444  row1h = _mm_xor_si128(row3h, row1h);
3445  _mm_storeu_si128((__m128i *)(void*)(&state.h[0]), _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&state.h[0])), row1l));
3446  _mm_storeu_si128((__m128i *)(void*)(&state.h[2]), _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&state.h[2])), row1h));
3447
3448  row2l = _mm_xor_si128(row4l, row2l);
3449  row2h = _mm_xor_si128(row4h, row2h);
3450  _mm_storeu_si128((__m128i *)(void*)(&state.h[4]), _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&state.h[4])), row2l));
3451  _mm_storeu_si128((__m128i *)(void*)(&state.h[6]), _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&state.h[6])), row2h));
3452}
3453#endif  // CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE
3454
3455#if CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE
3456
3457// Reverse the word arguments for ARM so callers can pass them high-word-first, in the same order _mm_set_epi32 takes them. Note the macro expands to two statements and writes into d[1..3] as scratch.
3458#define vld1q_u32_rev(x, a,b,c,d) d[1]=c[0],d[2]=b[0],d[3]=a[0]; x = vld1q_u32(d);
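// For example, vld1q_u32_rev(buf1, m6,m4,m2,m0) stores m2[0], m4[0] and m6[0]
// into m0[1..3] and then loads buf1 = { m0[0], m2[0], m4[0], m6[0] }, the same
// lane order _mm_set_epi32(m6[0],m4[0],m2[0],m0[0]) would give on x86.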
3459
3460// Lane indices to keep the half-swapping straight: for a 128-bit vector,
3461//   H64 names the high 64-bit half and L64 the low 64-bit half, matching
3462//   the halves returned by vget_high_u64 and vget_low_u64.
3463static const int LANE_H64 = 1;
3464static const int LANE_L64 = 0;
3465
3466static void BLAKE2_NEON_Compress32(const byte* input, BLAKE2_State<word32, false>& state)
3467{
3468  //CRYPTOPP_ASSERT(IsAlignedOn(input,GetAlignmentOf<uint8_t*>()));
3469  CRYPTOPP_ASSERT(IsAlignedOn(&state.h[0],GetAlignmentOf<uint32x4_t>()));
3470  CRYPTOPP_ASSERT(IsAlignedOn(&state.h[4],GetAlignmentOf<uint32x4_t>()));
3471  CRYPTOPP_ASSERT(IsAlignedOn(&state.t[0],GetAlignmentOf<uint32x4_t>()));
3472
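  // Sixteen 4-word scratch arrays hold the message block: only element [0] of
  // each receives a loaded word; the remaining elements are scratch space that
  // vld1q_u32_rev overwrites when assembling vectors.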
3473  CRYPTOPP_ALIGN_DATA(16) uint32_t m0[4], m1[4], m2[4], m3[4], m4[4], m5[4], m6[4], m7[4];
3474  CRYPTOPP_ALIGN_DATA(16) uint32_t m8[4], m9[4], m10[4], m11[4], m12[4], m13[4], m14[4], m15[4];
3475
3476  GetBlock<word32, LittleEndian, true> get(input);
3477  get(m0[0])(m1[0])(m2[0])(m3[0])(m4[0])(m5[0])(m6[0])(m7[0])(m8[0])(m9[0])(m10[0])(m11[0])(m12[0])(m13[0])(m14[0])(m15[0]);
3478
3479  uint32x4_t row1,row2,row3,row4;
3480  uint32x4_t buf1,buf2,buf3,buf4;
3481  uint32x4_t ff0,ff1;
3482
3483  row1 = ff0 = vld1q_u32((const uint32_t*)&state.h[0]);
3484  row2 = ff1 = vld1q_u32((const uint32_t*)&state.h[4]);
3485  row3 = vld1q_u32((const uint32_t*)&BLAKE2S_IV(0));
3486  row4 = veorq_u32(vld1q_u32((const uint32_t*)&BLAKE2S_IV(4)), vld1q_u32((const uint32_t*)&state.t[0]));
3487
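  // Without a byte-shuffle rotate, each rotate-right by r is synthesized as
  // vshrq_n_u32(x,r) ^ vshlq_n_u32(x,32-r). Diagonalization rotates row2 and
  // row4 across lanes with vextq_u32 and swaps the halves of row3 via
  // vget_high_u32/vget_low_u32 and vcombine_u32.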
3488  // buf1 = vld1q_u32(m6,m4,m2,m0);
3489  vld1q_u32_rev(buf1, m6,m4,m2,m0);
3490
3491  row1 = vaddq_u32(vaddq_u32(row1,buf1),row2);
3492  row4 = veorq_u32(row4,row1);
3493  row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3494  row3 = vaddq_u32(row3,row4);
3495  row2 = veorq_u32(row2,row3);
3496  row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3497
3498  // buf2 = vld1q_u32(m7,m5,m3,m1);
3499  vld1q_u32_rev(buf2, m7,m5,m3,m1);
3500
3501  row1 = vaddq_u32(vaddq_u32(row1,buf2),row2);
3502  row4 = veorq_u32(row4,row1);
3503  row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3504  row3 = vaddq_u32(row3,row4);
3505  row2 = veorq_u32(row2,row3);
3506  row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3507
3508  row4 = vextq_u32(row4,row4,3);
3509  row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3510  row2 = vextq_u32(row2,row2,1);
3511
3512  // buf3 = vld1q_u32(m14,m12,m10,m8);
3513  vld1q_u32_rev(buf3, m14,m12,m10,m8);
3514
3515  row1 = vaddq_u32(vaddq_u32(row1,buf3),row2);
3516  row4 = veorq_u32(row4,row1);
3517  row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3518  row3 = vaddq_u32(row3,row4);
3519  row2 = veorq_u32(row2,row3);
3520  row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3521
3522  // buf4 = vld1q_u32(m15,m13,m11,m9);
3523  vld1q_u32_rev(buf4, m15,m13,m11,m9);
3524
3525  row1 = vaddq_u32(vaddq_u32(row1,buf4),row2);
3526  row4 = veorq_u32(row4,row1);
3527  row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3528  row3 = vaddq_u32(row3,row4);
3529  row2 = veorq_u32(row2,row3);
3530  row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3531
3532  row4 = vextq_u32(row4,row4,1);
3533  row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3534  row2 = vextq_u32(row2,row2,3);
3535
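  // Round 2 onwards: the commented vld1q_u32(...) lines document the intended
  // lane order (high word first) of each sigma-permuted message vector.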
3536  // buf1 = vld1q_u32(m13,m9,m4,m14);
3537  vld1q_u32_rev(buf1, m13,m9,m4,m14);
3538
3539  row1 = vaddq_u32(vaddq_u32(row1,buf1),row2);
3540  row4 = veorq_u32(row4,row1);
3541  row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3542  row3 = vaddq_u32(row3,row4);
3543  row2 = veorq_u32(row2,row3);
3544  row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3545
3546  // buf2 = vld1q_u32(m6,m15,m8,m10);
3547  vld1q_u32_rev(buf2, m6,m15,m8,m10);
3548
3549  row1 = vaddq_u32(vaddq_u32(row1,buf2),row2);
3550  row4 = veorq_u32(row4,row1);
3551  row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3552  row3 = vaddq_u32(row3,row4);
3553  row2 = veorq_u32(row2,row3);
3554  row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3555
3556  row4 = vextq_u32(row4,row4,3);
3557  row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3558  row2 = vextq_u32(row2,row2,1);
3559
3560  // buf3 = vld1q_u32(m5,m11,m0,m1);
3561  vld1q_u32_rev(buf3, m5,m11,m0,m1);
3562
3563  row1 = vaddq_u32(vaddq_u32(row1,buf3),row2);
3564  row4 = veorq_u32(row4,row1);
3565  row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3566  row3 = vaddq_u32(row3,row4);
3567  row2 = veorq_u32(row2,row3);
3568  row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3569
3570  // buf4 = vld1q_u32(m3,m7,m2,m12);
3571  vld1q_u32_rev(buf4, m3,m7,m2,m12);
3572
3573  row1 = vaddq_u32(vaddq_u32(row1,buf4),row2);
3574  row4 = veorq_u32(row4,row1);
3575  row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3576  row3 = vaddq_u32(row3,row4);
3577  row2 = veorq_u32(row2,row3);
3578  row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3579
3580  row4 = vextq_u32(row4,row4,1);
3581  row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3582  row2 = vextq_u32(row2,row2,3);
3583
3584  // buf1 = vld1q_u32(m15,m5,m12,m11);
3585  vld1q_u32_rev(buf1, m15,m5,m12,m11);
3586
3587  row1 = vaddq_u32(vaddq_u32(row1,buf1),row2);
3588  row4 = veorq_u32(row4,row1);
3589  row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3590  row3 = vaddq_u32(row3,row4);
3591  row2 = veorq_u32(row2,row3);
3592  row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3593
3594  // buf2 = vld1q_u32(m13,m2,m0,m8);
3595  vld1q_u32_rev(buf2, m13,m2,m0,m8);
3596
3597  row1 = vaddq_u32(vaddq_u32(row1,buf2),row2);
3598  row4 = veorq_u32(row4,row1);
3599  row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3600  row3 = vaddq_u32(row3,row4);
3601  row2 = veorq_u32(row2,row3);
3602  row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3603
3604  row4 = vextq_u32(row4,row4,3);
3605  row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3606  row2 = vextq_u32(row2,row2,1);
3607
3608  // buf3 = vld1q_u32(m9,m7,m3,m10);
3609  vld1q_u32_rev(buf3, m9,m7,m3,m10);
3610
3611  row1 = vaddq_u32(vaddq_u32(row1,buf3),row2);
3612  row4 = veorq_u32(row4,row1);
3613  row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3614  row3 = vaddq_u32(row3,row4);
3615  row2 = veorq_u32(row2,row3);
3616  row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3617
3618  // buf4 = vld1q_u32(m4,m1,m6,m14);
3619  vld1q_u32_rev(buf4, m4,m1,m6,m14);
3620
3621  row1 = vaddq_u32(vaddq_u32(row1,buf4),row2);
3622  row4 = veorq_u32(row4,row1);
3623  row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3624  row3 = vaddq_u32(row3,row4);
3625  row2 = veorq_u32(row2,row3);
3626  row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3627
3628  row4 = vextq_u32(row4,row4,1);
3629  row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3630  row2 = vextq_u32(row2,row2,3);
3631
3632  // buf1 = vld1q_u32(m11,m13,m3,m7);
3633  vld1q_u32_rev(buf1, m11,m13,m3,m7);
3634
3635  row1 = vaddq_u32(vaddq_u32(row1,buf1),row2);
3636  row4 = veorq_u32(row4,row1);
3637  row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3638  row3 = vaddq_u32(row3,row4);
3639  row2 = veorq_u32(row2,row3);
3640  row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3641
3642  // buf2 = vld1q_u32(m14,m12,m1,m9);
3643  vld1q_u32_rev(buf2, m14,m12,m1,m9);
3644
3645  row1 = vaddq_u32(vaddq_u32(row1,buf2),row2);
3646  row4 = veorq_u32(row4,row1);
3647  row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3648  row3 = vaddq_u32(row3,row4);
3649  row2 = veorq_u32(row2,row3);
3650  row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3651
3652  row4 = vextq_u32(row4,row4,3);
3653  row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3654  row2 = vextq_u32(row2,row2,1);
3655
3656  // buf3 = vld1q_u32(m15,m4,m5,m2);
3657  vld1q_u32_rev(buf3, m15,m4,m5,m2);
3658
3659  row1 = vaddq_u32(vaddq_u32(row1,buf3),row2);
3660  row4 = veorq_u32(row4,row1);
3661  row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3662  row3 = vaddq_u32(row3,row4);
3663  row2 = veorq_u32(row2,row3);
3664  row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3665
3666  // buf4 = vld1q_u32(m8,m0,m10,m6);
3667  vld1q_u32_rev(buf4, m8,m0,m10,m6);
3668
3669  row1 = vaddq_u32(vaddq_u32(row1,buf4),row2);
3670  row4 = veorq_u32(row4,row1);
3671  row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3672  row3 = vaddq_u32(row3,row4);
3673  row2 = veorq_u32(row2,row3);
3674  row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3675
3676  row4 = vextq_u32(row4,row4,1);
3677  row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3678  row2 = vextq_u32(row2,row2,3);
3679
3680  // buf1 = vld1q_u32(m10,m2,m5,m9);
3681  vld1q_u32_rev(buf1, m10,m2,m5,m9);
3682
3683  row1 = vaddq_u32(vaddq_u32(row1,buf1),row2);
3684  row4 = veorq_u32(row4,row1);
3685  row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3686  row3 = vaddq_u32(row3,row4);
3687  row2 = veorq_u32(row2,row3);
3688  row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3689
3690  // buf2 = vld1q_u32(m15,m4,m7,m0);
3691  vld1q_u32_rev(buf2, m15,m4,m7,m0);
3692
3693  row1 = vaddq_u32(vaddq_u32(row1,buf2),row2);
3694  row4 = veorq_u32(row4,row1);
3695  row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3696  row3 = vaddq_u32(row3,row4);
3697  row2 = veorq_u32(row2,row3);
3698  row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3699
3700  row4 = vextq_u32(row4,row4,3);
3701  row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3702  row2 = vextq_u32(row2,row2,1);
3703
3704  // buf3 = vld1q_u32(m3,m6,m11,m14);
3705  vld1q_u32_rev(buf3, m3,m6,m11,m14);
3706
3707  row1 = vaddq_u32(vaddq_u32(row1,buf3),row2);
3708  row4 = veorq_u32(row4,row1);
3709  row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3710  row3 = vaddq_u32(row3,row4);
3711  row2 = veorq_u32(row2,row3);
3712  row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3713
3714  // buf4 = vld1q_u32(m13,m8,m12,m1);
3715  vld1q_u32_rev(buf4, m13,m8,m12,m1);
3716
3717  row1 = vaddq_u32(vaddq_u32(row1,buf4),row2);
3718  row4 = veorq_u32(row4,row1);
3719  row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3720  row3 = vaddq_u32(row3,row4);
3721  row2 = veorq_u32(row2,row3);
3722  row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3723
3724  row4 = vextq_u32(row4,row4,1);
3725  row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3726  row2 = vextq_u32(row2,row2,3);
3727
3728  // buf1 = vld1q_u32(m8,m0,m6,m2);
3729  vld1q_u32_rev(buf1, m8,m0,m6,m2);
3730
3731  row1 = vaddq_u32(vaddq_u32(row1,buf1),row2);
3732  row4 = veorq_u32(row4,row1);
3733  row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3734  row3 = vaddq_u32(row3,row4);
3735  row2 = veorq_u32(row2,row3);
3736  row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3737
3738  // buf2 = vld1q_u32(m3,m11,m10,m12);
3739  vld1q_u32_rev(buf2, m3,m11,m10,m12);
3740
3741  row1 = vaddq_u32(vaddq_u32(row1,buf2),row2);
3742  row4 = veorq_u32(row4,row1);
3743  row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3744  row3 = vaddq_u32(row3,row4);
3745  row2 = veorq_u32(row2,row3);
3746  row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3747
3748  row4 = vextq_u32(row4,row4,3);
3749  row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3750  row2 = vextq_u32(row2,row2,1);
3751
3752  // buf3 = vld1q_u32(m1,m15,m7,m4);
3753  vld1q_u32_rev(buf3, m1,m15,m7,m4);
3754
3755  row1 = vaddq_u32(vaddq_u32(row1,buf3),row2);
3756  row4 = veorq_u32(row4,row1);
3757  row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3758  row3 = vaddq_u32(row3,row4);
3759  row2 = veorq_u32(row2,row3);
3760  row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3761
3762  // buf4 = vld1q_u32(m9,m14,m5,m13);
3763  vld1q_u32_rev(buf4, m9,m14,m5,m13);
3764
3765  row1 = vaddq_u32(vaddq_u32(row1,buf4),row2);
3766  row4 = veorq_u32(row4,row1);
3767  row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3768  row3 = vaddq_u32(row3,row4);
3769  row2 = veorq_u32(row2,row3);
3770  row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3771
3772  row4 = vextq_u32(row4,row4,1);
3773  row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3774  row2 = vextq_u32(row2,row2,3);
3775
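  // Round 7 of 10 (sigma[6])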
3776  // buf1 = vld1q_u32(m4,m14,m1,m12);
3777  vld1q_u32_rev(buf1, m4,m14,m1,m12);
3778
3779  row1 = vaddq_u32(vaddq_u32(row1,buf1),row2);
3780  row4 = veorq_u32(row4,row1);
3781  row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3782  row3 = vaddq_u32(row3,row4);
3783  row2 = veorq_u32(row2,row3);
3784  row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3785
3786  // buf2 = vld1q_u32(m10,m13,m15,m5);
3787  vld1q_u32_rev(buf2, m10,m13,m15,m5);
3788
3789  row1 = vaddq_u32(vaddq_u32(row1,buf2),row2);
3790  row4 = veorq_u32(row4,row1);
3791  row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3792  row3 = vaddq_u32(row3,row4);
3793  row2 = veorq_u32(row2,row3);
3794  row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3795
3796  row4 = vextq_u32(row4,row4,3);
3797  row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3798  row2 = vextq_u32(row2,row2,1);
3799
3800  // buf3 = vld1q_u32(m8,m9,m6,m0);
3801  vld1q_u32_rev(buf3, m8,m9,m6,m0);
3802
3803  row1 = vaddq_u32(vaddq_u32(row1,buf3),row2);
3804  row4 = veorq_u32(row4,row1);
3805  row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3806  row3 = vaddq_u32(row3,row4);
3807  row2 = veorq_u32(row2,row3);
3808  row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3809
3810  // buf4 = vld1q_u32(m11,m2,m3,m7);
3811  vld1q_u32_rev(buf4, m11,m2,m3,m7);
3812
3813  row1 = vaddq_u32(vaddq_u32(row1,buf4),row2);
3814  row4 = veorq_u32(row4,row1);
3815  row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3816  row3 = vaddq_u32(row3,row4);
3817  row2 = veorq_u32(row2,row3);
3818  row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3819
3820  row4 = vextq_u32(row4,row4,1);
3821  row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3822  row2 = vextq_u32(row2,row2,3);
3823
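  // Round 8 of 10 (sigma[7])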
3824  // buf1 = vld1q_u32(m3,m12,m7,m13);
3825  vld1q_u32_rev(buf1, m3,m12,m7,m13);
3826
3827  row1 = vaddq_u32(vaddq_u32(row1,buf1),row2);
3828  row4 = veorq_u32(row4,row1);
3829  row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3830  row3 = vaddq_u32(row3,row4);
3831  row2 = veorq_u32(row2,row3);
3832  row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3833
3834  // buf2 = vld1q_u32(m9,m1,m14,m11);
3835  vld1q_u32_rev(buf2, m9,m1,m14,m11);
3836
3837  row1 = vaddq_u32(vaddq_u32(row1,buf2),row2);
3838  row4 = veorq_u32(row4,row1);
3839  row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3840  row3 = vaddq_u32(row3,row4);
3841  row2 = veorq_u32(row2,row3);
3842  row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3843
3844  row4 = vextq_u32(row4,row4,3);
3845  row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3846  row2 = vextq_u32(row2,row2,1);
3847
3848  // buf3 = vld1q_u32(m2,m8,m15,m5);
3849  vld1q_u32_rev(buf3, m2,m8,m15,m5);
3850
3851  row1 = vaddq_u32(vaddq_u32(row1,buf3),row2);
3852  row4 = veorq_u32(row4,row1);
3853  row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3854  row3 = vaddq_u32(row3,row4);
3855  row2 = veorq_u32(row2,row3);
3856  row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3857
3858  // buf4 = vld1q_u32(m10,m6,m4,m0);
3859  vld1q_u32_rev(buf4, m10,m6,m4,m0);
3860
3861  row1 = vaddq_u32(vaddq_u32(row1,buf4),row2);
3862  row4 = veorq_u32(row4,row1);
3863  row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3864  row3 = vaddq_u32(row3,row4);
3865  row2 = veorq_u32(row2,row3);
3866  row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3867
3868  row4 = vextq_u32(row4,row4,1);
3869  row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3870  row2 = vextq_u32(row2,row2,3);
3871
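  // Round 9 of 10 (sigma[8])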
3872  // buf1 = vld1q_u32(m0,m11,m14,m6);
3873  vld1q_u32_rev(buf1, m0,m11,m14,m6);
3874
3875  row1 = vaddq_u32(vaddq_u32(row1,buf1),row2);
3876  row4 = veorq_u32(row4,row1);
3877  row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3878  row3 = vaddq_u32(row3,row4);
3879  row2 = veorq_u32(row2,row3);
3880  row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3881
3882  // buf2 = vld1q_u32(m8,m3,m9,m15);
3883  vld1q_u32_rev(buf2, m8,m3,m9,m15);
3884
3885  row1 = vaddq_u32(vaddq_u32(row1,buf2),row2);
3886  row4 = veorq_u32(row4,row1);
3887  row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3888  row3 = vaddq_u32(row3,row4);
3889  row2 = veorq_u32(row2,row3);
3890  row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3891
3892  row4 = vextq_u32(row4,row4,3);
3893  row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3894  row2 = vextq_u32(row2,row2,1);
3895
3896  // buf3 = vld1q_u32(m10,m1,m13,m12);
3897  vld1q_u32_rev(buf3, m10,m1,m13,m12);
3898
3899  row1 = vaddq_u32(vaddq_u32(row1,buf3),row2);
3900  row4 = veorq_u32(row4,row1);
3901  row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3902  row3 = vaddq_u32(row3,row4);
3903  row2 = veorq_u32(row2,row3);
3904  row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3905
3906  // buf4 = vld1q_u32(m5,m4,m7,m2);
3907  vld1q_u32_rev(buf4, m5,m4,m7,m2);
3908
3909  row1 = vaddq_u32(vaddq_u32(row1,buf4),row2);
3910  row4 = veorq_u32(row4,row1);
3911  row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3912  row3 = vaddq_u32(row3,row4);
3913  row2 = veorq_u32(row2,row3);
3914  row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3915
3916  row4 = vextq_u32(row4,row4,1);
3917  row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3918  row2 = vextq_u32(row2,row2,3);
3919
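  // Round 10 of 10 (sigma[9], final round)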
3920  // buf1 = vld1q_u32(m1,m7,m8,m10);
3921  vld1q_u32_rev(buf1, m1,m7,m8,m10);
3922
3923  row1 = vaddq_u32(vaddq_u32(row1,buf1),row2);
3924  row4 = veorq_u32(row4,row1);
3925  row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3926  row3 = vaddq_u32(row3,row4);
3927  row2 = veorq_u32(row2,row3);
3928  row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3929
3930  // buf2 = vld1q_u32(m5,m6,m4,m2);
3931  vld1q_u32_rev(buf2, m5,m6,m4,m2);
3932
3933  row1 = vaddq_u32(vaddq_u32(row1,buf2),row2);
3934  row4 = veorq_u32(row4,row1);
3935  row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3936  row3 = vaddq_u32(row3,row4);
3937  row2 = veorq_u32(row2,row3);
3938  row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3939
3940  row4 = vextq_u32(row4,row4,3);
3941  row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3942  row2 = vextq_u32(row2,row2,1);
3943
3944  // buf3 = vld1q_u32(m13,m3,m9,m15);
3945  vld1q_u32_rev(buf3, m13,m3,m9,m15);
3946
3947  row1 = vaddq_u32(vaddq_u32(row1,buf3),row2);
3948  row4 = veorq_u32(row4,row1);
3949  row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3950  row3 = vaddq_u32(row3,row4);
3951  row2 = veorq_u32(row2,row3);
3952  row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3953
3954  // buf4 = vld1q_u32(m0,m12,m14,m11);
3955  vld1q_u32_rev(buf4, m0,m12,m14,m11);
3956
3957  row1 = vaddq_u32(vaddq_u32(row1,buf4),row2);
3958  row4 = veorq_u32(row4,row1);
3959  row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3960  row3 = vaddq_u32(row3,row4);
3961  row2 = veorq_u32(row2,row3);
3962  row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3963
3964  row4 = vextq_u32(row4,row4,1);
3965  row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3966  row2 = vextq_u32(row2,row2,3);
3967
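  // Finalization: h[0..3] = ff0 ^ row1 ^ row3 and h[4..7] = ff1 ^ row2 ^ row4,
  // where ff0/ff1 hold the chaining value captured at function entry.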
3968  vst1q_u32((uint32_t*)&state.h[0],veorq_u32(ff0,veorq_u32(row1,row3)));
3969  vst1q_u32((uint32_t*)&state.h[4],veorq_u32(ff1,veorq_u32(row2,row4)));
3970}
3971
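// BLAKE2b compression with NEON intrinsics. As in the BLAKE2s routine above,
// each rotation is expressed as (x >> n) ^ (x << (64-n)) on both vector
// halves; BLAKE2b's rotation counts are 32, 24, 16 and 63.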
3972static void BLAKE2_NEON_Compress64(const byte* input, BLAKE2_State<word64, true>& state)
3973{
3974  //CRYPTOPP_ASSERT(IsAlignedOn(input,GetAlignmentOf<uint8_t*>()));
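  // (The input pointer is deliberately not asserted: the vld1q_u8 loads
  //  below tolerate unaligned message data.)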
3975  CRYPTOPP_ASSERT(IsAlignedOn(&state.h[0],GetAlignmentOf<uint64x2_t>()));
3976  CRYPTOPP_ASSERT(IsAlignedOn(&state.h[4],GetAlignmentOf<uint64x2_t>()));
3977  CRYPTOPP_ASSERT(IsAlignedOn(&state.t[0],GetAlignmentOf<uint64x2_t>()));
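  CRYPTOPP_ASSERT(IsAlignedOn(&state.f[0],GetAlignmentOf<uint64x2_t>()));  // f[] also feeds vld1q_u64 below; mirrors the t[] check (assumes the same state layout)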
3978
3979  uint64x2_t m0m1,m2m3,m4m5,m6m7,m8m9,m10m11,m12m13,m14m15;
3980
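  // Load the 16 little-endian message words m0..m15 as eight uint64x2_t
  // pairs; the per-round lane extracts below implement the sigma schedule.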
3981    m0m1 = vreinterpretq_u64_u8(vld1q_u8(input+  0));
3982    m2m3 = vreinterpretq_u64_u8(vld1q_u8(input+ 16));
3983    m4m5 = vreinterpretq_u64_u8(vld1q_u8(input+ 32));
3984    m6m7 = vreinterpretq_u64_u8(vld1q_u8(input+ 48));
3985    m8m9 = vreinterpretq_u64_u8(vld1q_u8(input+ 64));
3986  m10m11 = vreinterpretq_u64_u8(vld1q_u8(input+ 80));
3987  m12m13 = vreinterpretq_u64_u8(vld1q_u8(input+ 96));
3988  m14m15 = vreinterpretq_u64_u8(vld1q_u8(input+112));
3989
3990  uint64x2_t row1l, row1h, row2l, row2h;
3991  uint64x2_t row3l, row3h, row4l, row4h;
3992  uint64x2_t b0 = {0,0}, b1 = {0,0}, t0, t1;
3993
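  // Rows 1-2 of the 4x4 working state come from the chaining value h;
  // rows 3-4 are IV words, with the counter t and the finalization flags f
  // XORed into row 4.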
3994  row1l = vld1q_u64((const uint64_t *)&state.h[0]);
3995  row1h = vld1q_u64((const uint64_t *)&state.h[2]);
3996  row2l = vld1q_u64((const uint64_t *)&state.h[4]);
3997  row2h = vld1q_u64((const uint64_t *)&state.h[6]);
3998  row3l = vld1q_u64((const uint64_t *)&BLAKE2B_IV(0));
3999  row3h = vld1q_u64((const uint64_t *)&BLAKE2B_IV(2));
4000  row4l = veorq_u64(vld1q_u64((const uint64_t *)&BLAKE2B_IV(4)), vld1q_u64((const uint64_t*)&state.t[0]));
4001  row4h = veorq_u64(vld1q_u64((const uint64_t *)&BLAKE2B_IV(6)), vld1q_u64((const uint64_t*)&state.f[0]));
4002
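  // Round 1 of 12 (sigma[0]): gather message pairs into b0/b1, run the
  // column step, diagonalize, run the diagonal step, then un-diagonalize.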
4003  b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_L64),b0,LANE_L64);
4004  b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_L64),b0,LANE_H64);
4005  b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_L64),b1,LANE_L64);
4006  b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_L64),b1,LANE_H64);
4007  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4008  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4009  row4l = veorq_u64(row4l, row1l);
4010  row4h = veorq_u64(row4h, row1h);
4011  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4012  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4013  row3l = vaddq_u64(row3l, row4l);
4014  row3h = vaddq_u64(row3h, row4h);
4015  row2l = veorq_u64(row2l, row3l);
4016  row2h = veorq_u64(row2h, row3h);
4017  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4018  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4019
4020  b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_H64),b0,LANE_L64);
4021  b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_H64),b0,LANE_H64);
4022  b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_H64),b1,LANE_L64);
4023  b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_H64),b1,LANE_H64);
4024  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4025  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4026  row4l = veorq_u64(row4l, row1l);
4027  row4h = veorq_u64(row4h, row1h);
4028  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4029  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4030  row3l = vaddq_u64(row3l, row4l);
4031  row3h = vaddq_u64(row3h, row4h);
4032  row2l = veorq_u64(row2l, row3l);
4033  row2h = veorq_u64(row2h, row3h);
4034  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4035  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4036
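  // Diagonalize: shift rows 2, 3 and 4 by one, two and three 64-bit lanes
  // so the next G applications operate on the diagonals.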
4037  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
4038  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4l,LANE_L64);
4039  row4l = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_L64),row4l,LANE_H64);
4040  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4h,LANE_H64);
4041  row4h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row4h,LANE_L64);
4042  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_H64),row2l,LANE_L64);
4043  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2l,LANE_H64);
4044  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2h,LANE_L64);
4045  row2h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row2h,LANE_H64);
4046
4047  b0 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_L64),b0,LANE_L64);
4048  b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_L64),b0,LANE_H64);
4049  b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_L64),b1,LANE_L64);
4050  b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_L64),b1,LANE_H64);
4051  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4052  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4053  row4l = veorq_u64(row4l, row1l);
4054  row4h = veorq_u64(row4h, row1h);
4055  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4056  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4057  row3l = vaddq_u64(row3l, row4l);
4058  row3h = vaddq_u64(row3h, row4h);
4059  row2l = veorq_u64(row2l, row3l);
4060  row2h = veorq_u64(row2h, row3h);
4061  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4062  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4063
4064  b0 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_H64),b0,LANE_L64);
4065  b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_H64),b0,LANE_H64);
4066  b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_H64),b1,LANE_L64);
4067  b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_H64),b1,LANE_H64);
4068  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4069  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4070  row4l = veorq_u64(row4l, row1l);
4071  row4h = veorq_u64(row4h, row1h);
4072  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4073  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4074  row3l = vaddq_u64(row3l, row4l);
4075  row3h = vaddq_u64(row3h, row4h);
4076  row2l = veorq_u64(row2l, row3l);
4077  row2h = veorq_u64(row2h, row3h);
4078  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4079  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4080
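  // Un-diagonalize: reverse the lane shifts to restore column order.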
4081  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
4082  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_L64),row2l,LANE_H64);
4083  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2l,LANE_L64);
4084  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2h,LANE_H64);
4085  row2h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row2h,LANE_L64);
4086  row4l = vsetq_lane_u64(vgetq_lane_u64(row4l,LANE_H64),row4l,LANE_L64);
4087  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4l,LANE_H64);
4088  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4h,LANE_L64);
4089  row4h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row4h,LANE_H64);
4090
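  // Round 2 of 12 (sigma[1])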
4091  b0 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_L64),b0,LANE_L64);
4092  b0 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_L64),b0,LANE_H64);
4093  b1 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_H64),b1,LANE_L64);
4094  b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_H64),b1,LANE_H64);
4095  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4096  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4097  row4l = veorq_u64(row4l, row1l);
4098  row4h = veorq_u64(row4h, row1h);
4099  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4100  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4101  row3l = vaddq_u64(row3l, row4l);
4102  row3h = vaddq_u64(row3h, row4h);
4103  row2l = veorq_u64(row2l, row3l);
4104  row2h = veorq_u64(row2h, row3h);
4105  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4106  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4107
4108  b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_L64),b0,LANE_L64);
4109  b0 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_L64),b0,LANE_H64);
4110  b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_H64),b1,LANE_L64);
4111  b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_L64),b1,LANE_H64);
4112  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4113  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4114  row4l = veorq_u64(row4l, row1l);
4115  row4h = veorq_u64(row4h, row1h);
4116  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4117  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4118  row3l = vaddq_u64(row3l, row4l);
4119  row3h = vaddq_u64(row3h, row4h);
4120  row2l = veorq_u64(row2l, row3l);
4121  row2h = veorq_u64(row2h, row3h);
4122  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4123  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4124
4125  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
4126  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4l,LANE_L64);
4127  row4l = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_L64),row4l,LANE_H64);
4128  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4h,LANE_H64);
4129  row4h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row4h,LANE_L64);
4130  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_H64),row2l,LANE_L64);
4131  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2l,LANE_H64);
4132  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2h,LANE_L64);
4133  row2h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row2h,LANE_H64);
4134
4135  b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_H64),b0,LANE_L64);
4136  b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_L64),b0,LANE_H64);
4137  b1 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_H64),b1,LANE_L64);
4138  b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_H64),b1,LANE_H64);
4139  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4140  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4141  row4l = veorq_u64(row4l, row1l);
4142  row4h = veorq_u64(row4h, row1h);
4143  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4144  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4145  row3l = vaddq_u64(row3l, row4l);
4146  row3h = vaddq_u64(row3h, row4h);
4147  row2l = veorq_u64(row2l, row3l);
4148  row2h = veorq_u64(row2h, row3h);
4149  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4150  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4151
4152  b0 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_L64),b0,LANE_L64);
4153  b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_L64),b0,LANE_H64);
4154  b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_H64),b1,LANE_L64);
4155  b1 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_H64),b1,LANE_H64);
4156  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4157  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4158  row4l = veorq_u64(row4l, row1l);
4159  row4h = veorq_u64(row4h, row1h);
4160  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4161  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4162  row3l = vaddq_u64(row3l, row4l);
4163  row3h = vaddq_u64(row3h, row4h);
4164  row2l = veorq_u64(row2l, row3l);
4165  row2h = veorq_u64(row2h, row3h);
4166  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4167  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4168
4169  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
4170  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_L64),row2l,LANE_H64);
4171  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2l,LANE_L64);
4172  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2h,LANE_H64);
4173  row2h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row2h,LANE_L64);
4174  row4l = vsetq_lane_u64(vgetq_lane_u64(row4l,LANE_H64),row4l,LANE_L64);
4175  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4l,LANE_H64);
4176  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4h,LANE_L64);
4177  row4h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row4h,LANE_H64);
4178
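  // Round 3 of 12 (sigma[2])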
4179  b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_H64),b0,LANE_L64);
4180  b0 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_L64),b0,LANE_H64);
4181  b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_H64),b1,LANE_L64);
4182  b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_H64),b1,LANE_H64);
4183  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4184  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4185  row4l = veorq_u64(row4l, row1l);
4186  row4h = veorq_u64(row4h, row1h);
4187  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4188  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4189  row3l = vaddq_u64(row3l, row4l);
4190  row3h = vaddq_u64(row3h, row4h);
4191  row2l = veorq_u64(row2l, row3l);
4192  row2h = veorq_u64(row2h, row3h);
4193  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4194  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4195
4196  b0 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_L64),b0,LANE_L64);
4197  b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_L64),b0,LANE_H64);
4198  b1 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_L64),b1,LANE_L64);
4199  b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_H64),b1,LANE_H64);
4200  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4201  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4202  row4l = veorq_u64(row4l, row1l);
4203  row4h = veorq_u64(row4h, row1h);
4204  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4205  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4206  row3l = vaddq_u64(row3l, row4l);
4207  row3h = vaddq_u64(row3h, row4h);
4208  row2l = veorq_u64(row2l, row3l);
4209  row2h = veorq_u64(row2h, row3h);
4210  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4211  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4212
4213  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
4214  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4l,LANE_L64);
4215  row4l = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_L64),row4l,LANE_H64);
4216  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4h,LANE_H64);
4217  row4h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row4h,LANE_L64);
4218  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_H64),row2l,LANE_L64);
4219  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2l,LANE_H64);
4220  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2h,LANE_L64);
4221  row2h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row2h,LANE_H64);
4222
4223  b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_L64),b0,LANE_L64);
4224  b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_H64),b0,LANE_H64);
4225  b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_H64),b1,LANE_L64);
4226  b1 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_H64),b1,LANE_H64);
4227  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4228  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4229  row4l = veorq_u64(row4l, row1l);
4230  row4h = veorq_u64(row4h, row1h);
4231  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4232  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4233  row3l = vaddq_u64(row3l, row4l);
4234  row3h = vaddq_u64(row3h, row4h);
4235  row2l = veorq_u64(row2l, row3l);
4236  row2h = veorq_u64(row2h, row3h);
4237  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4238  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4239
4240  b0 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_L64),b0,LANE_L64);
4241  b0 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_L64),b0,LANE_H64);
4242  b1 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_H64),b1,LANE_L64);
4243  b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_L64),b1,LANE_H64);
4244  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4245  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4246  row4l = veorq_u64(row4l, row1l);
4247  row4h = veorq_u64(row4h, row1h);
4248  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4249  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4250  row3l = vaddq_u64(row3l, row4l);
4251  row3h = vaddq_u64(row3h, row4h);
4252  row2l = veorq_u64(row2l, row3l);
4253  row2h = veorq_u64(row2h, row3h);
4254  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4255  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4256
4257  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
4258  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_L64),row2l,LANE_H64);
4259  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2l,LANE_L64);
4260  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2h,LANE_H64);
4261  row2h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row2h,LANE_L64);
4262  row4l = vsetq_lane_u64(vgetq_lane_u64(row4l,LANE_H64),row4l,LANE_L64);
4263  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4l,LANE_H64);
4264  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4h,LANE_L64);
4265  row4h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row4h,LANE_H64);
4266
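  // Round 4 of 12 (sigma[3])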
4267  b0 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_H64),b0,LANE_L64);
4268  b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_H64),b0,LANE_H64);
4269  b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_H64),b1,LANE_L64);
4270  b1 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_H64),b1,LANE_H64);
4271  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4272  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4273  row4l = veorq_u64(row4l, row1l);
4274  row4h = veorq_u64(row4h, row1h);
4275  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4276  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4277  row3l = vaddq_u64(row3l, row4l);
4278  row3h = vaddq_u64(row3h, row4h);
4279  row2l = veorq_u64(row2l, row3l);
4280  row2h = veorq_u64(row2h, row3h);
4281  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4282  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4283
4284  b0 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_H64),b0,LANE_L64);
4285  b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_H64),b0,LANE_H64);
4286  b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_L64),b1,LANE_L64);
4287  b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_L64),b1,LANE_H64);
4288  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4289  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4290  row4l = veorq_u64(row4l, row1l);
4291  row4h = veorq_u64(row4h, row1h);
4292  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4293  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4294  row3l = vaddq_u64(row3l, row4l);
4295  row3h = vaddq_u64(row3h, row4h);
4296  row2l = veorq_u64(row2l, row3l);
4297  row2h = veorq_u64(row2h, row3h);
4298  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4299  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4300
4301  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
4302  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4l,LANE_L64);
4303  row4l = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_L64),row4l,LANE_H64);
4304  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4h,LANE_H64);
4305  row4h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row4h,LANE_L64);
4306  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_H64),row2l,LANE_L64);
4307  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2l,LANE_H64);
4308  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2h,LANE_L64);
4309  row2h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row2h,LANE_H64);
4310
4311  b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_L64),b0,LANE_L64);
4312  b0 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_H64),b0,LANE_H64);
4313  b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_L64),b1,LANE_L64);
4314  b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_H64),b1,LANE_H64);
4315  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4316  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4317  row4l = veorq_u64(row4l, row1l);
4318  row4h = veorq_u64(row4h, row1h);
4319  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4320  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4321  row3l = vaddq_u64(row3l, row4l);
4322  row3h = vaddq_u64(row3h, row4h);
4323  row2l = veorq_u64(row2l, row3l);
4324  row2h = veorq_u64(row2h, row3h);
4325  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4326  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4327
4328  b0 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_L64),b0,LANE_L64);
4329  b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_L64),b0,LANE_H64);
4330  b1 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_L64),b1,LANE_L64);
4331  b1 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_L64),b1,LANE_H64);
4332  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4333  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4334  row4l = veorq_u64(row4l, row1l);
4335  row4h = veorq_u64(row4h, row1h);
4336  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4337  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4338  row3l = vaddq_u64(row3l, row4l);
4339  row3h = vaddq_u64(row3h, row4h);
4340  row2l = veorq_u64(row2l, row3l);
4341  row2h = veorq_u64(row2h, row3h);
4342  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4343  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4344
4345  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
4346  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_L64),row2l,LANE_H64);
4347  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2l,LANE_L64);
4348  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2h,LANE_H64);
4349  row2h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row2h,LANE_L64);
4350  row4l = vsetq_lane_u64(vgetq_lane_u64(row4l,LANE_H64),row4l,LANE_L64);
4351  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4l,LANE_H64);
4352  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4h,LANE_L64);
4353  row4h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row4h,LANE_H64);
4354
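  // Round 5 of 12 (sigma[4])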
4355  b0 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_H64),b0,LANE_L64);
4356  b0 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_H64),b0,LANE_H64);
4357  b1 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_L64),b1,LANE_L64);
4358  b1 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_L64),b1,LANE_H64);
4359  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4360  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4361  row4l = veorq_u64(row4l, row1l);
4362  row4h = veorq_u64(row4h, row1h);
4363  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4364  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4365  row3l = vaddq_u64(row3l, row4l);
4366  row3h = vaddq_u64(row3h, row4h);
4367  row2l = veorq_u64(row2l, row3l);
4368  row2h = veorq_u64(row2h, row3h);
4369  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4370  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4371
4372  b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_L64),b0,LANE_L64);
4373  b0 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_H64),b0,LANE_H64);
4374  b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_L64),b1,LANE_L64);
4375  b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_H64),b1,LANE_H64);
4376  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4377  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4378  row4l = veorq_u64(row4l, row1l);
4379  row4h = veorq_u64(row4h, row1h);
4380  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4381  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4382  row3l = vaddq_u64(row3l, row4l);
4383  row3h = vaddq_u64(row3h, row4h);
4384  row2l = veorq_u64(row2l, row3l);
4385  row2h = veorq_u64(row2h, row3h);
4386  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4387  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4388
4389  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
4390  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4l,LANE_L64);
4391  row4l = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_L64),row4l,LANE_H64);
4392  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4h,LANE_H64);
4393  row4h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row4h,LANE_L64);
4394  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_H64),row2l,LANE_L64);
4395  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2l,LANE_H64);
4396  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2h,LANE_L64);
4397  row2h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row2h,LANE_H64);
4398
4399  b0 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_L64),b0,LANE_L64);
4400  b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_H64),b0,LANE_H64);
4401  b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_L64),b1,LANE_L64);
4402  b1 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_H64),b1,LANE_H64);
4403  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4404  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4405  row4l = veorq_u64(row4l, row1l);
4406  row4h = veorq_u64(row4h, row1h);
4407  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4408  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4409  row3l = vaddq_u64(row3l, row4l);
4410  row3h = vaddq_u64(row3h, row4h);
4411  row2l = veorq_u64(row2l, row3l);
4412  row2h = veorq_u64(row2h, row3h);
4413  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4414  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4415
4416  b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_H64),b0,LANE_L64);
4417  b0 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_L64),b0,LANE_H64);
4418  b1 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_L64),b1,LANE_L64);
4419  b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_H64),b1,LANE_H64);
4420  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4421  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4422  row4l = veorq_u64(row4l, row1l);
4423  row4h = veorq_u64(row4h, row1h);
4424  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4425  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4426  row3l = vaddq_u64(row3l, row4l);
4427  row3h = vaddq_u64(row3h, row4h);
4428  row2l = veorq_u64(row2l, row3l);
4429  row2h = veorq_u64(row2h, row3h);
4430  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4431  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4432
4433  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
4434  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_L64),row2l,LANE_H64);
4435  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2l,LANE_L64);
4436  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2h,LANE_H64);
4437  row2h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row2h,LANE_L64);
4438  row4l = vsetq_lane_u64(vgetq_lane_u64(row4l,LANE_H64),row4l,LANE_L64);
4439  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4l,LANE_H64);
4440  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4h,LANE_L64);
4441  row4h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row4h,LANE_H64);
4442
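  // Round 6 of 12 (sigma[5])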
4443  b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_L64),b0,LANE_L64);
4444  b0 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_L64),b0,LANE_H64);
4445  b1 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_L64),b1,LANE_L64);
4446  b1 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_L64),b1,LANE_H64);
4447  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4448  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4449  row4l = veorq_u64(row4l, row1l);
4450  row4h = veorq_u64(row4h, row1h);
4451  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4452  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4453  row3l = vaddq_u64(row3l, row4l);
4454  row3h = vaddq_u64(row3h, row4h);
4455  row2l = veorq_u64(row2l, row3l);
4456  row2h = veorq_u64(row2h, row3h);
4457  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4458  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4459
4460  b0 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_L64),b0,LANE_L64);
4461  b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_L64),b0,LANE_H64);
4462  b1 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_H64),b1,LANE_L64);
4463  b1 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_H64),b1,LANE_H64);
4464  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4465  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4466  row4l = veorq_u64(row4l, row1l);
4467  row4h = veorq_u64(row4h, row1h);
4468  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4469  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4470  row3l = vaddq_u64(row3l, row4l);
4471  row3h = vaddq_u64(row3h, row4h);
4472  row2l = veorq_u64(row2l, row3l);
4473  row2h = veorq_u64(row2h, row3h);
4474  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4475  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4476
4477  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
4478  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4l,LANE_L64);
4479  row4l = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_L64),row4l,LANE_H64);
4480  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4h,LANE_H64);
4481  row4h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row4h,LANE_L64);
4482  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_H64),row2l,LANE_L64);
4483  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2l,LANE_H64);
4484  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2h,LANE_L64);
4485  row2h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row2h,LANE_H64);
4486
4487  b0 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_L64),b0,LANE_L64);
4488  b0 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_H64),b0,LANE_H64);
4489  b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_H64),b1,LANE_L64);
4490  b1 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_H64),b1,LANE_H64);
4491  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4492  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4493  row4l = veorq_u64(row4l, row1l);
4494  row4h = veorq_u64(row4h, row1h);
4495  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4496  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4497  row3l = vaddq_u64(row3l, row4l);
4498  row3h = vaddq_u64(row3h, row4h);
4499  row2l = veorq_u64(row2l, row3l);
4500  row2h = veorq_u64(row2h, row3h);
4501  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4502  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4503
4504  b0 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_H64),b0,LANE_L64);
4505  b0 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_H64),b0,LANE_H64);
4506  b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_L64),b1,LANE_L64);
4507  b1 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_H64),b1,LANE_H64);
4508  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4509  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4510  row4l = veorq_u64(row4l, row1l);
4511  row4h = veorq_u64(row4h, row1h);
4512  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4513  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4514  row3l = vaddq_u64(row3l, row4l);
4515  row3h = vaddq_u64(row3h, row4h);
4516  row2l = veorq_u64(row2l, row3l);
4517  row2h = veorq_u64(row2h, row3h);
4518  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4519  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4520
4521  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
4522  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_L64),row2l,LANE_H64);
4523  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2l,LANE_L64);
4524  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2h,LANE_H64);
4525  row2h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row2h,LANE_L64);
4526  row4l = vsetq_lane_u64(vgetq_lane_u64(row4l,LANE_H64),row4l,LANE_L64);
4527  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4l,LANE_H64);
4528  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4h,LANE_L64);
4529  row4h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row4h,LANE_H64);
4530
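  // Round 7 of 12 (sigma[6])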
4531  b0 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_L64),b0,LANE_L64);
4532  b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_H64),b0,LANE_H64);
4533  b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_L64),b1,LANE_L64);
4534  b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_L64),b1,LANE_H64);
4535  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4536  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4537  row4l = veorq_u64(row4l, row1l);
4538  row4h = veorq_u64(row4h, row1h);
4539  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4540  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4541  row3l = vaddq_u64(row3l, row4l);
4542  row3h = vaddq_u64(row3h, row4h);
4543  row2l = veorq_u64(row2l, row3l);
4544  row2h = veorq_u64(row2h, row3h);
4545  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4546  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4547
4548  b0 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_H64),b0,LANE_L64);
4549  b0 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_H64),b0,LANE_H64);
4550  b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_H64),b1,LANE_L64);
4551  b1 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_L64),b1,LANE_H64);
4552  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4553  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4554  row4l = veorq_u64(row4l, row1l);
4555  row4h = veorq_u64(row4h, row1h);
4556  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4557  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4558  row3l = vaddq_u64(row3l, row4l);
4559  row3h = vaddq_u64(row3h, row4h);
4560  row2l = veorq_u64(row2l, row3l);
4561  row2h = veorq_u64(row2h, row3h);
4562  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4563  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4564
4565  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
4566  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4l,LANE_L64);
4567  row4l = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_L64),row4l,LANE_H64);
4568  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4h,LANE_H64);
4569  row4h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row4h,LANE_L64);
4570  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_H64),row2l,LANE_L64);
4571  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2l,LANE_H64);
4572  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2h,LANE_L64);
4573  row2h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row2h,LANE_H64);
4574
4575  b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_L64),b0,LANE_L64);
4576  b0 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_L64),b0,LANE_H64);
4577  b1 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_H64),b1,LANE_L64);
4578  b1 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_L64),b1,LANE_H64);
4579  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4580  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4581  row4l = veorq_u64(row4l, row1l);
4582  row4h = veorq_u64(row4h, row1h);
4583  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4584  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4585  row3l = vaddq_u64(row3l, row4l);
4586  row3h = vaddq_u64(row3h, row4h);
4587  row2l = veorq_u64(row2l, row3l);
4588  row2h = veorq_u64(row2h, row3h);
4589  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4590  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4591
4592  b0 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_H64),b0,LANE_L64);
4593  b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_H64),b0,LANE_H64);
4594  b1 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_L64),b1,LANE_L64);
4595  b1 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_H64),b1,LANE_H64);
4596  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4597  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4598  row4l = veorq_u64(row4l, row1l);
4599  row4h = veorq_u64(row4h, row1h);
4600  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4601  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4602  row3l = vaddq_u64(row3l, row4l);
4603  row3h = vaddq_u64(row3h, row4h);
4604  row2l = veorq_u64(row2l, row3l);
4605  row2h = veorq_u64(row2h, row3h);
4606  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4607  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4608
4609  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
4610  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_L64),row2l,LANE_H64);
4611  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2l,LANE_L64);
4612  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2h,LANE_H64);
4613  row2h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row2h,LANE_L64);
4614  row4l = vsetq_lane_u64(vgetq_lane_u64(row4l,LANE_H64),row4l,LANE_L64);
4615  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4l,LANE_H64);
4616  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4h,LANE_L64);
4617  row4h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row4h,LANE_H64);
4618
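  // Round 8 of 12 (sigma[7])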
4619  b0 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_H64),b0,LANE_L64);
4620  b0 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_H64),b0,LANE_H64);
4621  b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_L64),b1,LANE_L64);
4622  b1 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_H64),b1,LANE_H64);
4623  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4624  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4625  row4l = veorq_u64(row4l, row1l);
4626  row4h = veorq_u64(row4h, row1h);
4627  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4628  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4629  row3l = vaddq_u64(row3l, row4l);
4630  row3h = vaddq_u64(row3h, row4h);
4631  row2l = veorq_u64(row2l, row3l);
4632  row2h = veorq_u64(row2h, row3h);
4633  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4634  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4635
4636  b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_H64),b0,LANE_L64);
4637  b0 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_L64),b0,LANE_H64);
4638  b1 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_H64),b1,LANE_L64);
4639  b1 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_H64),b1,LANE_H64);
4640  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4641  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4642  row4l = veorq_u64(row4l, row1l);
4643  row4h = veorq_u64(row4h, row1h);
4644  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4645  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4646  row3l = vaddq_u64(row3l, row4l);
4647  row3h = vaddq_u64(row3h, row4h);
4648  row2l = veorq_u64(row2l, row3l);
4649  row2h = veorq_u64(row2h, row3h);
4650  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4651  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4652
4653  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
4654  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4l,LANE_L64);
4655  row4l = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_L64),row4l,LANE_H64);
4656  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4h,LANE_H64);
4657  row4h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row4h,LANE_L64);
4658  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_H64),row2l,LANE_L64);
4659  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2l,LANE_H64);
4660  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2h,LANE_L64);
4661  row2h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row2h,LANE_H64);
4662
4663  b0 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_H64),b0,LANE_L64);
4664  b0 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_H64),b0,LANE_H64);
4665  b1 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_L64),b1,LANE_L64);
4666  b1 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_L64),b1,LANE_H64);
4667  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4668  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4669  row4l = veorq_u64(row4l, row1l);
4670  row4h = veorq_u64(row4h, row1h);
4671  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4672  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4673  row3l = vaddq_u64(row3l, row4l);
4674  row3h = vaddq_u64(row3h, row4h);
4675  row2l = veorq_u64(row2l, row3l);
4676  row2h = veorq_u64(row2h, row3h);
4677  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4678  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4679
4680  b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_L64),b0,LANE_L64);
4681  b0 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_L64),b0,LANE_H64);
4682  b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_L64),b1,LANE_L64);
4683  b1 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_L64),b1,LANE_H64);
4684  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4685  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4686  row4l = veorq_u64(row4l, row1l);
4687  row4h = veorq_u64(row4h, row1h);
4688  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4689  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4690  row3l = vaddq_u64(row3l, row4l);
4691  row3h = vaddq_u64(row3h, row4h);
4692  row2l = veorq_u64(row2l, row3l);
4693  row2h = veorq_u64(row2h, row3h);
4694  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4695  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4696
4697  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
4698  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_L64),row2l,LANE_H64);
4699  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2l,LANE_L64);
4700  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2h,LANE_H64);
4701  row2h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row2h,LANE_L64);
4702  row4l = vsetq_lane_u64(vgetq_lane_u64(row4l,LANE_H64),row4l,LANE_L64);
4703  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4l,LANE_H64);
4704  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4h,LANE_L64);
4705  row4h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row4h,LANE_H64);
4706
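  // Round 9 of 12 (sigma[8])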
4707  b0 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_L64),b0,LANE_L64);
4708  b0 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_L64),b0,LANE_H64);
4709  b1 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_H64),b1,LANE_L64);
4710  b1 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_L64),b1,LANE_H64);
4711  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4712  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4713  row4l = veorq_u64(row4l, row1l);
4714  row4h = veorq_u64(row4h, row1h);
4715  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4716  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4717  row3l = vaddq_u64(row3l, row4l);
4718  row3h = vaddq_u64(row3h, row4h);
4719  row2l = veorq_u64(row2l, row3l);
4720  row2h = veorq_u64(row2h, row3h);
4721  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4722  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4723
4724  b0 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_H64),b0,LANE_L64);
4725  b0 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_H64),b0,LANE_H64);
4726  b1 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_H64),b1,LANE_L64);
4727  b1 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_L64),b1,LANE_H64);
4728  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4729  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4730  row4l = veorq_u64(row4l, row1l);
4731  row4h = veorq_u64(row4h, row1h);
4732  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4733  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4734  row3l = vaddq_u64(row3l, row4l);
4735  row3h = vaddq_u64(row3h, row4h);
4736  row2l = veorq_u64(row2l, row3l);
4737  row2h = veorq_u64(row2h, row3h);
4738  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4739  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4740
4741  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
4742  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4l,LANE_L64);
4743  row4l = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_L64),row4l,LANE_H64);
4744  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4h,LANE_H64);
4745  row4h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row4h,LANE_L64);
4746  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_H64),row2l,LANE_L64);
4747  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2l,LANE_H64);
4748  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2h,LANE_L64);
4749  row2h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row2h,LANE_H64);
4750
4751  b0 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_L64),b0,LANE_L64);
4752  b0 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_H64),b0,LANE_H64);
4753  b1 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_H64),b1,LANE_L64);
4754  b1 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_L64),b1,LANE_H64);
4755  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4756  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4757  row4l = veorq_u64(row4l, row1l);
4758  row4h = veorq_u64(row4h, row1h);
4759  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4760  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4761  row3l = vaddq_u64(row3l, row4l);
4762  row3h = vaddq_u64(row3h, row4h);
4763  row2l = veorq_u64(row2l, row3l);
4764  row2h = veorq_u64(row2h, row3h);
4765  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4766  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4767
4768  b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_L64),b0,LANE_L64);
4769  b0 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_H64),b0,LANE_H64);
4770  b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_L64),b1,LANE_L64);
4771  b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_H64),b1,LANE_H64);
4772  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4773  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4774  row4l = veorq_u64(row4l, row1l);
4775  row4h = veorq_u64(row4h, row1h);
4776  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4777  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4778  row3l = vaddq_u64(row3l, row4l);
4779  row3h = vaddq_u64(row3h, row4h);
4780  row2l = veorq_u64(row2l, row3l);
4781  row2h = veorq_u64(row2h, row3h);
4782  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4783  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4784
4785  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
4786  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_L64),row2l,LANE_H64);
4787  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2l,LANE_L64);
4788  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2h,LANE_H64);
4789  row2h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row2h,LANE_L64);
4790  row4l = vsetq_lane_u64(vgetq_lane_u64(row4l,LANE_H64),row4l,LANE_L64);
4791  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4l,LANE_H64);
4792  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4h,LANE_L64);
4793  row4h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row4h,LANE_H64);
4794
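  // Round 10 of 12 (sigma[9])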
4795  b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_L64),b0,LANE_L64);
4796  b0 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_L64),b0,LANE_H64);
4797  b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_H64),b1,LANE_L64);
4798  b1 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_H64),b1,LANE_H64);
4799  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4800  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4801  row4l = veorq_u64(row4l, row1l);
4802  row4h = veorq_u64(row4h, row1h);
4803  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4804  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4805  row3l = vaddq_u64(row3l, row4l);
4806  row3h = vaddq_u64(row3h, row4h);
4807  row2l = veorq_u64(row2l, row3l);
4808  row2h = veorq_u64(row2h, row3h);
4809  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4810  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4811
  b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_L64),b0,LANE_L64);
  b0 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_L64),b0,LANE_H64);
  b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_L64),b1,LANE_L64);
  b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_H64),b1,LANE_H64);
  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
  row4l = veorq_u64(row4l, row1l);
  row4h = veorq_u64(row4h, row1h);
  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
  row3l = vaddq_u64(row3l, row4l);
  row3h = vaddq_u64(row3h, row4h);
  row2l = veorq_u64(row2l, row3l);
  row2h = veorq_u64(row2h, row3h);
  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));

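  // Diagonalize: shuffle rows 2-4 so the next pair of G halves mixes the diagonals.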
  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4l,LANE_L64);
  row4l = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_L64),row4l,LANE_H64);
  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4h,LANE_H64);
  row4h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row4h,LANE_L64);
  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_H64),row2l,LANE_L64);
  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2l,LANE_H64);
  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2h,LANE_L64);
  row2h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row2h,LANE_H64);

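  // Round 9, diagonal step, first half of G: message words m15,m9,m3,m13.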
  b0 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_H64),b0,LANE_L64);
  b0 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_H64),b0,LANE_H64);
  b1 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_H64),b1,LANE_L64);
  b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_H64),b1,LANE_H64);
  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
  row4l = veorq_u64(row4l, row1l);
  row4h = veorq_u64(row4h, row1h);
  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
  row3l = vaddq_u64(row3l, row4l);
  row3h = vaddq_u64(row3h, row4h);
  row2l = veorq_u64(row2l, row3l);
  row2h = veorq_u64(row2h, row3h);
  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));

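  // Round 9, diagonal step, second half of G: message words m11,m14,m12,m0.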
  b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_H64),b0,LANE_L64);
  b0 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_L64),b0,LANE_H64);
  b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_L64),b1,LANE_L64);
  b1 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_L64),b1,LANE_H64);
  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
  row4l = veorq_u64(row4l, row1l);
  row4h = veorq_u64(row4h, row1h);
  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
  row3l = vaddq_u64(row3l, row4l);
  row3h = vaddq_u64(row3h, row4h);
  row2l = veorq_u64(row2l, row3l);
  row2h = veorq_u64(row2h, row3h);
  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));

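  // Un-diagonalize, closing round 9.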
  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_L64),row2l,LANE_H64);
  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2l,LANE_L64);
  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2h,LANE_H64);
  row2h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row2h,LANE_L64);
  row4l = vsetq_lane_u64(vgetq_lane_u64(row4l,LANE_H64),row4l,LANE_L64);
  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4l,LANE_H64);
  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4h,LANE_L64);
  row4h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row4h,LANE_H64);

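  // Round 10, column step, first half of G. sigma[10] is sigma[0], the identity
  // schedule, so the words arrive in order: m0,m2,m4,m6.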
  b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_L64),b0,LANE_L64);
  b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_L64),b0,LANE_H64);
  b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_L64),b1,LANE_L64);
  b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_L64),b1,LANE_H64);
  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
  row4l = veorq_u64(row4l, row1l);
  row4h = veorq_u64(row4h, row1h);
  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
  row3l = vaddq_u64(row3l, row4l);
  row3h = vaddq_u64(row3h, row4h);
  row2l = veorq_u64(row2l, row3l);
  row2h = veorq_u64(row2h, row3h);
  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));

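  // Round 10, column step, second half of G: message words m1,m3,m5,m7.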
  b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_H64),b0,LANE_L64);
  b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_H64),b0,LANE_H64);
  b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_H64),b1,LANE_L64);
  b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_H64),b1,LANE_H64);
  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
  row4l = veorq_u64(row4l, row1l);
  row4h = veorq_u64(row4h, row1h);
  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
  row3l = vaddq_u64(row3l, row4l);
  row3h = vaddq_u64(row3h, row4h);
  row2l = veorq_u64(row2l, row3l);
  row2h = veorq_u64(row2h, row3h);
  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));

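  // Diagonalize for round 10's diagonal step.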
  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4l,LANE_L64);
  row4l = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_L64),row4l,LANE_H64);
  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4h,LANE_H64);
  row4h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row4h,LANE_L64);
  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_H64),row2l,LANE_L64);
  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2l,LANE_H64);
  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2h,LANE_L64);
  row2h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row2h,LANE_H64);

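  // Round 10, diagonal step, first half of G: message words m8,m10,m12,m14.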
  b0 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_L64),b0,LANE_L64);
  b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_L64),b0,LANE_H64);
  b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_L64),b1,LANE_L64);
  b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_L64),b1,LANE_H64);
  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
  row4l = veorq_u64(row4l, row1l);
  row4h = veorq_u64(row4h, row1h);
  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
  row3l = vaddq_u64(row3l, row4l);
  row3h = vaddq_u64(row3h, row4h);
  row2l = veorq_u64(row2l, row3l);
  row2h = veorq_u64(row2h, row3h);
  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));

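  // Round 10, diagonal step, second half of G: message words m9,m11,m13,m15.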
  b0 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_H64),b0,LANE_L64);
  b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_H64),b0,LANE_H64);
  b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_H64),b1,LANE_L64);
  b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_H64),b1,LANE_H64);
  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
  row4l = veorq_u64(row4l, row1l);
  row4h = veorq_u64(row4h, row1h);
  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
  row3l = vaddq_u64(row3l, row4l);
  row3h = vaddq_u64(row3h, row4h);
  row2l = veorq_u64(row2l, row3l);
  row2h = veorq_u64(row2h, row3h);
  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));

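  // Un-diagonalize, closing round 10.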
  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_L64),row2l,LANE_H64);
  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2l,LANE_L64);
  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2h,LANE_H64);
  row2h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row2h,LANE_L64);
  row4l = vsetq_lane_u64(vgetq_lane_u64(row4l,LANE_H64),row4l,LANE_L64);
  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4l,LANE_H64);
  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4h,LANE_L64);
  row4h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row4h,LANE_H64);

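  // Round 11 (the last of BLAKE2b's 12), column step, first half of G.
  // sigma[11] is sigma[1], so the words are m14,m4,m9,m13.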
  b0 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_L64),b0,LANE_L64);
  b0 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_L64),b0,LANE_H64);
  b1 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_H64),b1,LANE_L64);
  b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_H64),b1,LANE_H64);
  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
  row4l = veorq_u64(row4l, row1l);
  row4h = veorq_u64(row4h, row1h);
  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
  row3l = vaddq_u64(row3l, row4l);
  row3h = vaddq_u64(row3h, row4h);
  row2l = veorq_u64(row2l, row3l);
  row2h = veorq_u64(row2h, row3h);
  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));

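  // Round 11, column step, second half of G: message words m10,m8,m15,m6.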
  b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_L64),b0,LANE_L64);
  b0 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_L64),b0,LANE_H64);
  b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_H64),b1,LANE_L64);
  b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_L64),b1,LANE_H64);
  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
  row4l = veorq_u64(row4l, row1l);
  row4h = veorq_u64(row4h, row1h);
  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
  row3l = vaddq_u64(row3l, row4l);
  row3h = vaddq_u64(row3h, row4h);
  row2l = veorq_u64(row2l, row3l);
  row2h = veorq_u64(row2h, row3h);
  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));

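  // Diagonalize for the final diagonal step.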
  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4l,LANE_L64);
  row4l = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_L64),row4l,LANE_H64);
  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4h,LANE_H64);
  row4h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row4h,LANE_L64);
  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_H64),row2l,LANE_L64);
  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2l,LANE_H64);
  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2h,LANE_L64);
  row2h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row2h,LANE_H64);

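  // Round 11, diagonal step, first half of G: message words m1,m0,m11,m5.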
  b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_H64),b0,LANE_L64);
  b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_L64),b0,LANE_H64);
  b1 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_H64),b1,LANE_L64);
  b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_H64),b1,LANE_H64);
  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
  row4l = veorq_u64(row4l, row1l);
  row4h = veorq_u64(row4h, row1h);
  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
  row3l = vaddq_u64(row3l, row4l);
  row3h = vaddq_u64(row3h, row4h);
  row2l = veorq_u64(row2l, row3l);
  row2h = veorq_u64(row2h, row3h);
  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));

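  // Round 11, diagonal step, second half of G: message words m12,m2,m7,m3.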
  b0 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_L64),b0,LANE_L64);
  b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_L64),b0,LANE_H64);
  b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_H64),b1,LANE_L64);
  b1 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_H64),b1,LANE_H64);
  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
  row4l = veorq_u64(row4l, row1l);
  row4h = veorq_u64(row4h, row1h);
  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
  row3l = vaddq_u64(row3l, row4l);
  row3h = vaddq_u64(row3h, row4h);
  row2l = veorq_u64(row2l, row3l);
  row2h = veorq_u64(row2h, row3h);
  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));

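  // Un-diagonalize, closing round 11 and the round function.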
  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_L64),row2l,LANE_H64);
  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2l,LANE_L64);
  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2h,LANE_H64);
  row2h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row2h,LANE_L64);
  row4l = vsetq_lane_u64(vgetq_lane_u64(row4l,LANE_H64),row4l,LANE_L64);
  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4l,LANE_H64);
  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4h,LANE_L64);
  row4h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row4h,LANE_H64);

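  // Feed-forward: fold the working vector back into the chaining value,
  // h[i] ^= v[i] ^ v[i+8] (rows 1/2 hold v[0..7], rows 3/4 hold v[8..15]).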
  row1l = veorq_u64(row3l, row1l);
  row1h = veorq_u64(row3h, row1h);
  vst1q_u64((uint64_t*)&state.h[0], veorq_u64(vld1q_u64((const uint64_t*)&state.h[0]), row1l));
  vst1q_u64((uint64_t*)&state.h[2], veorq_u64(vld1q_u64((const uint64_t*)&state.h[2]), row1h));

  row2l = veorq_u64(row4l, row2l);
  row2h = veorq_u64(row4h, row2h);
  vst1q_u64((uint64_t*)&state.h[4], veorq_u64(vld1q_u64((const uint64_t*)&state.h[4]), row2l));
  vst1q_u64((uint64_t*)&state.h[6], veorq_u64(vld1q_u64((const uint64_t*)&state.h[6]), row2h));
}
#endif  // CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE

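// Explicit instantiations: the word32 base backs BLAKE2s, the word64 base backs BLAKE2b.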
template class BLAKE2_Base<word32, false>;
template class BLAKE2_Base<word64, true>;

NAMESPACE_END