// rijndael.cpp - modified by Chris Morgan <cmorgan@wpi.edu>
// and Wei Dai from Paulo Barreto's Rijndael implementation
// The original code and all modifications are in the public domain.

// use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM rijndael.cpp" to generate MASM code

/*
July 2010: Added support for AES-NI instructions via compiler intrinsics.
*/

/*
Feb 2009: The x86/x64 assembly code was rewritten by Wei Dai to do counter mode
caching, which was invented by Hongjun Wu and popularized by Daniel J. Bernstein
and Peter Schwabe in their paper "New AES software speed records". The round
function was also modified to include a trick similar to one in Brian Gladman's
x86 assembly code, doing an 8-bit register move to minimize the number of
register spills. Also switched to compressed tables and copying round keys to
the stack.

The C++ implementation now uses compressed tables if
CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS is defined.
*/

/*
July 2006: Defense against timing attacks was added by Wei Dai.

The code now uses smaller tables in the first and last rounds,
and preloads them into L1 cache before usage (by loading at least
one element in each cache line).

We try to delay subsequent accesses to each table (used in the first
and last rounds) until all of the table has been preloaded. Hopefully
the compiler isn't smart enough to optimize that code away.

After preloading the table, we also try not to access any memory location
other than the table and the stack, in order to prevent table entries from
being unloaded from L1 cache, until that round is finished.
(Some popular CPUs have 2-way associative caches.)
*/
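
/*
A minimal sketch of the preload idiom described above -- illustrative only,
assuming a generic table T and the line size reported by GetCacheLineSize():

    volatile word32 sink = 0;
    word32 u = sink;
    for (size_t i = 0; i < sizeof(T); i += cacheLineSize)
        u &= *(const word32 *)((const byte *)T + i);  // one load per cache line
    state |= u;  // fold u into live state so the loads cannot be elided

u stays zero (it is ANDed from loads, then ORed into the state), so the
cipher's output is unchanged; the only effect is the memory traffic that
pulls every cache line of T into L1.
*/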

// This is the original introductory comment:

/**
 * version 3.0 (December 2000)
 *
 * Optimised ANSI C code for the Rijndael cipher (now AES)
 *
 * author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be>
 * author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be>
 * author Paulo Barreto <paulo.barreto@terra.com.br>
 *
 * This code is hereby placed in the public domain.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "pch.h"
#include "config.h"

#ifndef CRYPTOPP_IMPORTS
#ifndef CRYPTOPP_GENERATE_X64_MASM

#include "rijndael.h"
#include "stdcpp.h"     // alloca
#include "misc.h"
#include "cpu.h"

NAMESPACE_BEGIN(CryptoPP)

// Hack for http://github.com/weidai11/cryptopp/issues/42 and http://github.com/weidai11/cryptopp/issues/132
#if (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS)
# define CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS 1
#endif

// Hack for SunCC, http://github.com/weidai11/cryptopp/issues/224
#if (__SUNPRO_CC >= 0x5130)
# define MAYBE_CONST
#else
# define MAYBE_CONST const
#endif

#if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
# if (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
namespace rdtable {CRYPTOPP_ALIGN_DATA(16) word64 Te[256+2];}
using namespace rdtable;
# else
static word64 Te[256];
# endif
static word64 Td[256];
#else // Not CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
# if defined(CRYPTOPP_X64_MASM_AVAILABLE)
// Unused; avoids linker error on Microsoft X64 non-AESNI platforms
namespace rdtable {CRYPTOPP_ALIGN_DATA(16) word64 Te[256+2];}
# endif
CRYPTOPP_ALIGN_DATA(16) static word32 Te[256*4];
CRYPTOPP_ALIGN_DATA(16) static word32 Td[256*4];
#endif // CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS

static volatile bool s_TeFilled = false, s_TdFilled = false;

// ************************* Portable Code ************************************

#define QUARTER_ROUND(L, T, t, a, b, c, d) \
    a ^= L(T, 3, byte(t)); t >>= 8;\
    b ^= L(T, 2, byte(t)); t >>= 8;\
    c ^= L(T, 1, byte(t)); t >>= 8;\
    d ^= L(T, 0, t);

#define QUARTER_ROUND_LE(t, a, b, c, d) \
    tempBlock[a] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
    tempBlock[b] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
    tempBlock[c] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
    tempBlock[d] = ((byte *)(Te+t))[1];

#if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
#define QUARTER_ROUND_LD(t, a, b, c, d) \
    tempBlock[a] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
    tempBlock[b] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
    tempBlock[c] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
    tempBlock[d] = ((byte *)(Td+t))[GetNativeByteOrder()*7];
#else
#define QUARTER_ROUND_LD(t, a, b, c, d) \
    tempBlock[a] = Sd[byte(t)]; t >>= 8;\
    tempBlock[b] = Sd[byte(t)]; t >>= 8;\
    tempBlock[c] = Sd[byte(t)]; t >>= 8;\
    tempBlock[d] = Sd[t];
#endif
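
// In the compressed-table build above, the least significant byte of each
// word64 Td entry is the plain Sd value, so QUARTER_ROUND_LD recovers it with
// a single byte read: index 0 on little-endian machines and 7 on big-endian
// ones (LITTLE_ENDIAN_ORDER is 0 and BIG_ENDIAN_ORDER is 1, hence the
// GetNativeByteOrder()*7 index).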

#define QUARTER_ROUND_E(t, a, b, c, d) QUARTER_ROUND(TL_M, Te, t, a, b, c, d)
#define QUARTER_ROUND_D(t, a, b, c, d) QUARTER_ROUND(TL_M, Td, t, a, b, c, d)

#ifdef IS_LITTLE_ENDIAN
#define QUARTER_ROUND_FE(t, a, b, c, d) QUARTER_ROUND(TL_F, Te, t, d, c, b, a)
#define QUARTER_ROUND_FD(t, a, b, c, d) QUARTER_ROUND(TL_F, Td, t, d, c, b, a)
#if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
#define TL_F(T, i, x) (*(word32 *)(void *)((byte *)T + x*8 + (6-i)%4+1))
#define TL_M(T, i, x) (*(word32 *)(void *)((byte *)T + x*8 + (i+3)%4+1))
#else
#define TL_F(T, i, x) rotrFixed(T[x], (3-i)*8)
#define TL_M(T, i, x) T[i*256 + x]
#endif
#else
#define QUARTER_ROUND_FE(t, a, b, c, d) QUARTER_ROUND(TL_F, Te, t, a, b, c, d)
#define QUARTER_ROUND_FD(t, a, b, c, d) QUARTER_ROUND(TL_F, Td, t, a, b, c, d)
#if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
#define TL_F(T, i, x) (*(word32 *)(void *)((byte *)T + x*8 + (4-i)%4))
#define TL_M TL_F
#else
#define TL_F(T, i, x) rotrFixed(T[x], i*8)
#define TL_M(T, i, x) T[i*256 + x]
#endif
#endif


#define f2(x) ((x<<1)^(((x>>7)&1)*0x11b))
#define f4(x) ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b))
#define f8(x) ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b))

#define f3(x) (f2(x) ^ x)
#define f9(x) (f8(x) ^ x)
#define fb(x) (f8(x) ^ f2(x) ^ x)
#define fd(x) (f8(x) ^ f4(x) ^ x)
#define fe(x) (f8(x) ^ f4(x) ^ f2(x))
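
// These macros compute GF(2^8) multiplication by small constants, reducing
// modulo the AES polynomial x^8+x^4+x^3+x+1 (0x11b). Worked example:
// f2(0x80) = (0x80<<1) ^ 0x11b = 0x100 ^ 0x11b = 0x1b, and
// f4(0x80) = (0x80<<2) ^ 2*0x11b = 0x200 ^ 0x236 = 0x36 = f2(f2(0x80)).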

void Rijndael::Base::FillEncTable()
{
    for (int i=0; i<256; i++)
    {
        byte x = Se[i];
#if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
        word32 y = word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24;
        Te[i] = word64(y | f3(x))<<32 | y;
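        // Compressed layout: the bytes of Te[i], least significant first, are
        // [0, x, x, 2x, 3x, x, x, 2x]. Unaligned 32-bit reads at byte offsets
        // 1..4 (TL_M/TL_F above) yield the four rotations of the MixColumns
        // column, so one 2 KB table replaces the four classic 1 KB tables.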
#else
        word32 y = f3(x) | word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24;
        for (int j=0; j<4; j++)
        {
            Te[i+j*256] = y;
            y = rotrFixed(y, 8);
        }
#endif
    }
#if (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
    Te[256] = Te[257] = 0;
#endif
    s_TeFilled = true;
}

void Rijndael::Base::FillDecTable()
{
    for (int i=0; i<256; i++)
    {
        byte x = Sd[i];
#if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
        word32 y = word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24;
        Td[i] = word64(y | fb(x))<<32 | y | x;
#else
        word32 y = fb(x) | word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24;
        for (int j=0; j<4; j++)
        {
            Td[i+j*256] = y;
            y = rotrFixed(y, 8);
        }
#endif
    }
    s_TdFilled = true;
}

void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, const NameValuePairs &)
{
    AssertValidKeyLength(keylen);

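    // Nk + 6 rounds per FIPS 197: 10, 12 or 14 rounds for 16-, 24- and
    // 32-byte keys respectively.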
    m_rounds = keylen/4 + 6;
    m_key.New(4*(m_rounds+1));

    word32 *rk = m_key;

#if (CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE && CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE && (!defined(_MSC_VER) || _MSC_VER >= 1600 || CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32))
    // MSVC 2008 SP1 generates bad code for _mm_extract_epi32() when compiling for X64
    if (HasAESNI() && HasSSE4())
    {
        static const word32 rcLE[] = {
            0x01, 0x02, 0x04, 0x08,
            0x10, 0x20, 0x40, 0x80,
            0x1B, 0x36, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
        };

        // Coverity finding, appears to be false positive. Assert the condition.
        const word32 *ro = rcLE, *rc = rcLE;
        CRYPTOPP_UNUSED(ro);

        __m128i temp = _mm_loadu_si128((__m128i *)(void *)(userKey+keylen-16));
        memcpy(rk, userKey, keylen);

        while (true)
        {
            // Coverity finding, appears to be false positive. Assert the condition.
            CRYPTOPP_ASSERT(rc < ro + COUNTOF(rcLE));
            rk[keylen/4] = rk[0] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 3) ^ *(rc++);
            rk[keylen/4+1] = rk[1] ^ rk[keylen/4];
            rk[keylen/4+2] = rk[2] ^ rk[keylen/4+1];
            rk[keylen/4+3] = rk[3] ^ rk[keylen/4+2];

            if (rk + keylen/4 + 4 == m_key.end())
                break;

            if (keylen == 24)
            {
                rk[10] = rk[ 4] ^ rk[ 9];
                rk[11] = rk[ 5] ^ rk[10];
                // Coverity finding, appears to be false positive. Assert the condition.
                CRYPTOPP_ASSERT(m_key.size() >= 12);
                temp = _mm_insert_epi32(temp, rk[11], 3);
            }
            else if (keylen == 32)
            {
                // Coverity finding, appears to be false positive. Assert the condition.
                CRYPTOPP_ASSERT(m_key.size() >= 12);
                temp = _mm_insert_epi32(temp, rk[11], 3);
                rk[12] = rk[ 4] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 2);
                rk[13] = rk[ 5] ^ rk[12];
                rk[14] = rk[ 6] ^ rk[13];
                rk[15] = rk[ 7] ^ rk[14];
                // Coverity finding, appears to be false positive. Assert the condition.
                CRYPTOPP_ASSERT(m_key.size() >= 16);
                temp = _mm_insert_epi32(temp, rk[15], 3);
            }
            else
            {
                // Coverity finding, appears to be false positive. Assert the condition.
                CRYPTOPP_ASSERT(m_key.size() >= 8);
                temp = _mm_insert_epi32(temp, rk[7], 3);
            }

            rk += keylen/4;
        }

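        // For decryption, reverse the schedule and run the inner round keys
        // through InvMixColumns (_mm_aesimc), matching the equivalent inverse
        // cipher that the AESDEC instructions implement (FIPS 197, 5.3.5).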
        if (!IsForwardTransformation())
        {
            rk = m_key;
            unsigned int i, j;

#if defined(__SUNPRO_CC) && (__SUNPRO_CC <= 0x5120)
            // __m128i is an unsigned long long[2], and support for swapping it was not added until C++11.
            // SunCC 12.1 - 12.3 fail to consume the swap; while SunCC 12.4 consumes it without -std=c++11.
            vec_swap(*(__m128i *)(rk), *(__m128i *)(rk+4*m_rounds));
#else
            std::swap(*(__m128i *)(void *)(rk), *(__m128i *)(void *)(rk+4*m_rounds));
#endif
            for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4)
            {
                temp = _mm_aesimc_si128(*(__m128i *)(void *)(rk+i));
                *(__m128i *)(void *)(rk+i) = _mm_aesimc_si128(*(__m128i *)(void *)(rk+j));
                *(__m128i *)(void *)(rk+j) = temp;
            }

            *(__m128i *)(void *)(rk+i) = _mm_aesimc_si128(*(__m128i *)(void *)(rk+i));
        }

        return;
    }
#endif

    GetUserKey(BIG_ENDIAN_ORDER, rk, keylen/4, userKey, keylen);
    const word32 *rc = rcon;
    word32 temp;

    while (true)
    {
        temp = rk[keylen/4-1];
        word32 x = (word32(Se[GETBYTE(temp, 2)]) << 24) ^ (word32(Se[GETBYTE(temp, 1)]) << 16) ^ (word32(Se[GETBYTE(temp, 0)]) << 8) ^ Se[GETBYTE(temp, 3)];
        rk[keylen/4] = rk[0] ^ x ^ *(rc++);
        rk[keylen/4+1] = rk[1] ^ rk[keylen/4];
        rk[keylen/4+2] = rk[2] ^ rk[keylen/4+1];
        rk[keylen/4+3] = rk[3] ^ rk[keylen/4+2];

        if (rk + keylen/4 + 4 == m_key.end())
            break;

        if (keylen == 24)
        {
            rk[10] = rk[ 4] ^ rk[ 9];
            rk[11] = rk[ 5] ^ rk[10];
        }
        else if (keylen == 32)
        {
            temp = rk[11];
            rk[12] = rk[ 4] ^ (word32(Se[GETBYTE(temp, 3)]) << 24) ^ (word32(Se[GETBYTE(temp, 2)]) << 16) ^ (word32(Se[GETBYTE(temp, 1)]) << 8) ^ Se[GETBYTE(temp, 0)];
            rk[13] = rk[ 5] ^ rk[12];
            rk[14] = rk[ 6] ^ rk[13];
            rk[15] = rk[ 7] ^ rk[14];
        }
        rk += keylen/4;
    }

    rk = m_key;

    if (IsForwardTransformation())
    {
        if (!s_TeFilled)
            FillEncTable();

        ConditionalByteReverse(BIG_ENDIAN_ORDER, rk, rk, 16);
        ConditionalByteReverse(BIG_ENDIAN_ORDER, rk + m_rounds*4, rk + m_rounds*4, 16);
    }
    else
    {
        if (!s_TdFilled)
            FillDecTable();

        unsigned int i, j;

#define InverseMixColumn(x) TL_M(Td, 0, Se[GETBYTE(x, 3)]) ^ TL_M(Td, 1, Se[GETBYTE(x, 2)]) ^ TL_M(Td, 2, Se[GETBYTE(x, 1)]) ^ TL_M(Td, 3, Se[GETBYTE(x, 0)])

        for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4)
        {
            temp = InverseMixColumn(rk[i    ]); rk[i    ] = InverseMixColumn(rk[j    ]); rk[j    ] = temp;
            temp = InverseMixColumn(rk[i + 1]); rk[i + 1] = InverseMixColumn(rk[j + 1]); rk[j + 1] = temp;
            temp = InverseMixColumn(rk[i + 2]); rk[i + 2] = InverseMixColumn(rk[j + 2]); rk[j + 2] = temp;
            temp = InverseMixColumn(rk[i + 3]); rk[i + 3] = InverseMixColumn(rk[j + 3]); rk[j + 3] = temp;
        }

        rk[i+0] = InverseMixColumn(rk[i+0]);
        rk[i+1] = InverseMixColumn(rk[i+1]);
        rk[i+2] = InverseMixColumn(rk[i+2]);
        rk[i+3] = InverseMixColumn(rk[i+3]);

        temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[0]); rk[0] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+0]); rk[4*m_rounds+0] = temp;
        temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[1]); rk[1] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+1]); rk[4*m_rounds+1] = temp;
        temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[2]); rk[2] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+2]); rk[4*m_rounds+2] = temp;
        temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[3]); rk[3] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+3]); rk[4*m_rounds+3] = temp;
    }

#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
    if (HasAESNI())
        ConditionalByteReverse(BIG_ENDIAN_ORDER, rk+4, rk+4, (m_rounds-1)*16);
#endif
}

void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
{
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) || CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
#if (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
    if (HasSSE2())
#else
    if (HasAESNI())
#endif
    {
        return (void)Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
    }
#endif

    typedef BlockGetAndPut<word32, NativeByteOrder> Block;

    word32 s0, s1, s2, s3, t0, t1, t2, t3;
    Block::Get(inBlock)(s0)(s1)(s2)(s3);

    const word32 *rk = m_key;
    s0 ^= rk[0];
    s1 ^= rk[1];
    s2 ^= rk[2];
    s3 ^= rk[3];
    t0 = rk[4];
    t1 = rk[5];
    t2 = rk[6];
    t3 = rk[7];
    rk += 8;

    // timing attack countermeasure. see comments at top for more details.
    // also see http://github.com/weidai11/cryptopp/issues/146
    const int cacheLineSize = GetCacheLineSize();
    unsigned int i;
    volatile word32 _u = 0;
    word32 u = _u;
#if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
    for (i=0; i<2048; i+=cacheLineSize)
#else
    for (i=0; i<1024; i+=cacheLineSize)
#endif
        u &= *(const word32 *)(const void *)(((const byte *)Te)+i);
    u &= Te[255];
    s0 |= u; s1 |= u; s2 |= u; s3 |= u;

    QUARTER_ROUND_FE(s3, t0, t1, t2, t3)
    QUARTER_ROUND_FE(s2, t3, t0, t1, t2)
    QUARTER_ROUND_FE(s1, t2, t3, t0, t1)
    QUARTER_ROUND_FE(s0, t1, t2, t3, t0)

    // Nr - 2 full rounds:
    unsigned int r = m_rounds/2 - 1;
    do
    {
        s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];

        QUARTER_ROUND_E(t3, s0, s1, s2, s3)
        QUARTER_ROUND_E(t2, s3, s0, s1, s2)
        QUARTER_ROUND_E(t1, s2, s3, s0, s1)
        QUARTER_ROUND_E(t0, s1, s2, s3, s0)

        t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];

        QUARTER_ROUND_E(s3, t0, t1, t2, t3)
        QUARTER_ROUND_E(s2, t3, t0, t1, t2)
        QUARTER_ROUND_E(s1, t2, t3, t0, t1)
        QUARTER_ROUND_E(s0, t1, t2, t3, t0)

        rk += 8;
    } while (--r);

    word32 tbw[4];
    byte *const tempBlock = (byte *)tbw;

    QUARTER_ROUND_LE(t2, 15, 2, 5, 8)
    QUARTER_ROUND_LE(t1, 11, 14, 1, 4)
    QUARTER_ROUND_LE(t0, 7, 10, 13, 0)
    QUARTER_ROUND_LE(t3, 3, 6, 9, 12)

    Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
}
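
// Usage sketch (illustrative, not part of this file): the public typedefs in
// aes.h reach the code above, and ProcessBlock() forwards to
// ProcessAndXorBlock() with a null xorBlock:
//
//   CryptoPP::AESEncryption enc(key, 16);  // Rijndael::Enc under the hood
//   enc.ProcessBlock(in, out);             // one 16-byte block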

void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
{
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
    if (HasAESNI())
    {
        Rijndael::Dec::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
        return;
    }
#endif

    typedef BlockGetAndPut<word32, NativeByteOrder> Block;

    word32 s0, s1, s2, s3, t0, t1, t2, t3;
    Block::Get(inBlock)(s0)(s1)(s2)(s3);

    const word32 *rk = m_key;
    s0 ^= rk[0];
    s1 ^= rk[1];
    s2 ^= rk[2];
    s3 ^= rk[3];
    t0 = rk[4];
    t1 = rk[5];
    t2 = rk[6];
    t3 = rk[7];
    rk += 8;

    // timing attack countermeasure. see comments at top for more details.
    // also see http://github.com/weidai11/cryptopp/issues/146
    const int cacheLineSize = GetCacheLineSize();
    unsigned int i;
    volatile word32 _u = 0;
    word32 u = _u;
#if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
    for (i=0; i<2048; i+=cacheLineSize)
#else
    for (i=0; i<1024; i+=cacheLineSize)
#endif
        u &= *(const word32 *)(const void *)(((const byte *)Td)+i);
    u &= Td[255];
    s0 |= u; s1 |= u; s2 |= u; s3 |= u;

    QUARTER_ROUND_FD(s3, t2, t1, t0, t3)
    QUARTER_ROUND_FD(s2, t1, t0, t3, t2)
    QUARTER_ROUND_FD(s1, t0, t3, t2, t1)
    QUARTER_ROUND_FD(s0, t3, t2, t1, t0)

    // Nr - 2 full rounds:
    unsigned int r = m_rounds/2 - 1;
    do
    {
        s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];

        QUARTER_ROUND_D(t3, s2, s1, s0, s3)
        QUARTER_ROUND_D(t2, s1, s0, s3, s2)
        QUARTER_ROUND_D(t1, s0, s3, s2, s1)
        QUARTER_ROUND_D(t0, s3, s2, s1, s0)

        t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];

        QUARTER_ROUND_D(s3, t2, t1, t0, t3)
        QUARTER_ROUND_D(s2, t1, t0, t3, t2)
        QUARTER_ROUND_D(s1, t0, t3, t2, t1)
        QUARTER_ROUND_D(s0, t3, t2, t1, t0)

        rk += 8;
    } while (--r);

#if !(defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS))
    // timing attack countermeasure. see comments at top for more details
    // If CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS is defined,
    // QUARTER_ROUND_LD will use Td, which is already preloaded.
    u = _u;
    for (i=0; i<256; i+=cacheLineSize)
        u &= *(const word32 *)(const void *)(Sd+i);
    u &= *(const word32 *)(const void *)(Sd+252);
    t0 |= u; t1 |= u; t2 |= u; t3 |= u;
#endif

    word32 tbw[4];
    byte *const tempBlock = (byte *)tbw;

    QUARTER_ROUND_LD(t2, 7, 2, 13, 8)
    QUARTER_ROUND_LD(t1, 3, 14, 9, 4)
    QUARTER_ROUND_LD(t0, 15, 10, 5, 0)
    QUARTER_ROUND_LD(t3, 11, 6, 1, 12)

    Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
}

// ************************* Assembly Code ************************************

#if CRYPTOPP_MSC_VERSION
# pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code
#endif

#endif // #ifndef CRYPTOPP_GENERATE_X64_MASM

#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)

CRYPTOPP_NAKED void CRYPTOPP_FASTCALL Rijndael_Enc_AdvancedProcessBlocks(void *locals, const word32 *k)
{
    CRYPTOPP_UNUSED(locals); CRYPTOPP_UNUSED(k);

#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32

#define L_REG esp
#define L_INDEX(i) (L_REG+768+i)
#define L_INXORBLOCKS L_INBLOCKS+4
#define L_OUTXORBLOCKS L_INBLOCKS+8
#define L_OUTBLOCKS L_INBLOCKS+12
#define L_INCREMENTS L_INDEX(16*15)
#define L_SP L_INDEX(16*16)
#define L_LENGTH L_INDEX(16*16+4)
#define L_KEYS_BEGIN L_INDEX(16*16+8)

#define MOVD movd
#define MM(i) mm##i

#define MXOR(a,b,c) \
    AS2( movzx esi, b)\
    AS2( movd mm7, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
    AS2( pxor MM(a), mm7)\

#define MMOV(a,b,c) \
    AS2( movzx esi, b)\
    AS2( movd MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\

#else

#define L_REG r8
#define L_INDEX(i) (L_REG+i)
#define L_INXORBLOCKS L_INBLOCKS+8
#define L_OUTXORBLOCKS L_INBLOCKS+16
#define L_OUTBLOCKS L_INBLOCKS+24
#define L_INCREMENTS L_INDEX(16*16)
#define L_LENGTH L_INDEX(16*18+8)
#define L_KEYS_BEGIN L_INDEX(16*19)

#define MOVD mov
#define MM_0 r9d
#define MM_1 r12d
#ifdef __GNUC__
#define MM_2 r11d
#else
#define MM_2 r10d
#endif
#define MM(i) MM_##i

#define MXOR(a,b,c) \
    AS2( movzx esi, b)\
    AS2( xor MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\

#define MMOV(a,b,c) \
    AS2( movzx esi, b)\
    AS2( mov MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\

#endif

#define L_SUBKEYS L_INDEX(0)
#define L_SAVED_X L_SUBKEYS
#define L_KEY12 L_INDEX(16*12)
#define L_LASTROUND L_INDEX(16*13)
#define L_INBLOCKS L_INDEX(16*14)
#define MAP0TO4(i) (ASM_MOD(i+3,4)+1)
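// MAP0TO4 remaps dword offset 0 to 4 and leaves 1..3 alone: byte 0 of a
// compressed Te entry is always zero, so the four usable 32-bit views of an
// entry sit at byte offsets 1 through 4 (see the layout note in FillEncTable).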

#define XOR(a,b,c) \
    AS2( movzx esi, b)\
    AS2( xor a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\

#define MOV(a,b,c) \
    AS2( movzx esi, b)\
    AS2( mov a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\

#ifdef CRYPTOPP_GENERATE_X64_MASM
ALIGN 8
Rijndael_Enc_AdvancedProcessBlocks PROC FRAME
    rex_push_reg rsi
    push_reg rdi
    push_reg rbx
    push_reg r12
    .endprolog
    mov L_REG, rcx
    mov AS_REG_7, ?Te@rdtable@CryptoPP@@3PA_KA
    mov edi, DWORD PTR [?g_cacheLineSize@CryptoPP@@3IA]
#elif defined(__GNUC__)
    __asm__ __volatile__
    (
    INTEL_NOPREFIX
#if CRYPTOPP_BOOL_X64
    AS2( mov L_REG, rcx)
#endif
    AS_PUSH_IF86(bx)
    AS_PUSH_IF86(bp)
    AS2( mov AS_REG_7, WORD_REG(si))
#else
    AS_PUSH_IF86(si)
    AS_PUSH_IF86(di)
    AS_PUSH_IF86(bx)
    AS_PUSH_IF86(bp)
    AS2( lea AS_REG_7, [Te])
    AS2( mov edi, [g_cacheLineSize])
#endif

#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
    AS2( mov [ecx+16*12+16*4], esp) // save esp to L_SP
    AS2( lea esp, [ecx-768])
#endif

    // copy subkeys to stack
    AS2( mov WORD_REG(si), [L_KEYS_BEGIN])
    AS2( mov WORD_REG(ax), 16)
    AS2( and WORD_REG(ax), WORD_REG(si))
    AS2( movdqa xmm3, XMMWORD_PTR [WORD_REG(dx)+16+WORD_REG(ax)]) // subkey 1 (non-counter) or 2 (counter)
    AS2( movdqa [L_KEY12], xmm3)
    AS2( lea WORD_REG(ax), [WORD_REG(dx)+WORD_REG(ax)+2*16])
    AS2( sub WORD_REG(ax), WORD_REG(si))
    ASL(0)
    AS2( movdqa xmm0, [WORD_REG(ax)+WORD_REG(si)])
    AS2( movdqa XMMWORD_PTR [L_SUBKEYS+WORD_REG(si)], xmm0)
    AS2( add WORD_REG(si), 16)
    AS2( cmp WORD_REG(si), 16*12)
    ATT_NOPREFIX
    ASJ( jl, 0, b)
    INTEL_NOPREFIX

    // read subkeys 0, 1 and last
    AS2( movdqa xmm4, [WORD_REG(ax)+WORD_REG(si)]) // last subkey
    AS2( movdqa xmm1, [WORD_REG(dx)]) // subkey 0
    AS2( MOVD MM(1), [WORD_REG(dx)+4*4]) // 0,1,2,3
    AS2( mov ebx, [WORD_REG(dx)+5*4]) // 4,5,6,7
    AS2( mov ecx, [WORD_REG(dx)+6*4]) // 8,9,10,11
    AS2( mov edx, [WORD_REG(dx)+7*4]) // 12,13,14,15

    // load table into cache
    AS2( xor WORD_REG(ax), WORD_REG(ax))
    ASL(9)
    AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
    AS2( add WORD_REG(ax), WORD_REG(di))
    AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
    AS2( add WORD_REG(ax), WORD_REG(di))
    AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
    AS2( add WORD_REG(ax), WORD_REG(di))
    AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
    AS2( add WORD_REG(ax), WORD_REG(di))
    AS2( cmp WORD_REG(ax), 2048)
    ATT_NOPREFIX
    ASJ( jl, 9, b)
    INTEL_NOPREFIX
    AS1( lfence)

    AS2( test DWORD PTR [L_LENGTH], 1)
    ATT_NOPREFIX
    ASJ( jz, 8, f)
    INTEL_NOPREFIX

    // counter mode one-time setup
    AS2( mov WORD_REG(si), [L_INBLOCKS])
    AS2( movdqu xmm2, [WORD_REG(si)]) // counter
    AS2( pxor xmm2, xmm1)
    AS2( psrldq xmm1, 14)
    AS2( movd eax, xmm1)
    AS2( mov al, BYTE PTR [WORD_REG(si)+15])
    AS2( MOVD MM(2), eax)
#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
    AS2( mov eax, 1)
    AS2( movd mm3, eax)
#endif

    // partial first round, in: xmm2(15,14,13,12;11,10,9,8;7,6,5,4;3,2,1,0), out: mm1, ebx, ecx, edx
    AS2( movd eax, xmm2)
    AS2( psrldq xmm2, 4)
    AS2( movd edi, xmm2)
    AS2( psrldq xmm2, 4)
    MXOR( 1, al, 0) // 0
    XOR( edx, ah, 1) // 1
    AS2( shr eax, 16)
    XOR( ecx, al, 2) // 2
    XOR( ebx, ah, 3) // 3
    AS2( mov eax, edi)
    AS2( movd edi, xmm2)
    AS2( psrldq xmm2, 4)
    XOR( ebx, al, 0) // 4
    MXOR( 1, ah, 1) // 5
    AS2( shr eax, 16)
    XOR( edx, al, 2) // 6
    XOR( ecx, ah, 3) // 7
    AS2( mov eax, edi)
    AS2( movd edi, xmm2)
    XOR( ecx, al, 0) // 8
    XOR( ebx, ah, 1) // 9
    AS2( shr eax, 16)
    MXOR( 1, al, 2) // 10
    XOR( edx, ah, 3) // 11
    AS2( mov eax, edi)
    XOR( edx, al, 0) // 12
    XOR( ecx, ah, 1) // 13
    AS2( shr eax, 16)
    XOR( ebx, al, 2) // 14
    AS2( psrldq xmm2, 3)

    // partial second round, in: ebx(4,5,6,7), ecx(8,9,10,11), edx(12,13,14,15), out: eax, ebx, edi, mm0
    AS2( mov eax, [L_KEY12+0*4])
    AS2( mov edi, [L_KEY12+2*4])
    AS2( MOVD MM(0), [L_KEY12+3*4])
    MXOR( 0, cl, 3) /* 11 */
    XOR( edi, bl, 3) /* 7 */
    MXOR( 0, bh, 2) /* 6 */
    AS2( shr ebx, 16) /* 4,5 */
    XOR( eax, bl, 1) /* 5 */
    MOV( ebx, bh, 0) /* 4 */
    AS2( xor ebx, [L_KEY12+1*4])
    XOR( eax, ch, 2) /* 10 */
    AS2( shr ecx, 16) /* 8,9 */
    XOR( eax, dl, 3) /* 15 */
    XOR( ebx, dh, 2) /* 14 */
    AS2( shr edx, 16) /* 12,13 */
    XOR( edi, ch, 0) /* 8 */
    XOR( ebx, cl, 1) /* 9 */
    XOR( edi, dl, 1) /* 13 */
    MXOR( 0, dh, 0) /* 12 */

    AS2( movd ecx, xmm2)
    AS2( MOVD edx, MM(1))
    AS2( MOVD [L_SAVED_X+3*4], MM(0))
    AS2( mov [L_SAVED_X+0*4], eax)
    AS2( mov [L_SAVED_X+1*4], ebx)
    AS2( mov [L_SAVED_X+2*4], edi)
    ATT_NOPREFIX
    ASJ( jmp, 5, f)
    INTEL_NOPREFIX
    ASL(3)
    // non-counter mode per-block setup
    AS2( MOVD MM(1), [L_KEY12+0*4]) // 0,1,2,3
    AS2( mov ebx, [L_KEY12+1*4]) // 4,5,6,7
    AS2( mov ecx, [L_KEY12+2*4]) // 8,9,10,11
    AS2( mov edx, [L_KEY12+3*4]) // 12,13,14,15
    ASL(8)
    AS2( mov WORD_REG(ax), [L_INBLOCKS])
    AS2( movdqu xmm2, [WORD_REG(ax)])
    AS2( mov WORD_REG(si), [L_INXORBLOCKS])
    AS2( movdqu xmm5, [WORD_REG(si)])
    AS2( pxor xmm2, xmm1)
    AS2( pxor xmm2, xmm5)

    // first round, in: xmm2(15,14,13,12;11,10,9,8;7,6,5,4;3,2,1,0), out: eax, ebx, ecx, edx
    AS2( movd eax, xmm2)
    AS2( psrldq xmm2, 4)
    AS2( movd edi, xmm2)
    AS2( psrldq xmm2, 4)
    MXOR( 1, al, 0) // 0
    XOR( edx, ah, 1) // 1
    AS2( shr eax, 16)
    XOR( ecx, al, 2) // 2
    XOR( ebx, ah, 3) // 3
    AS2( mov eax, edi)
    AS2( movd edi, xmm2)
    AS2( psrldq xmm2, 4)
    XOR( ebx, al, 0) // 4
    MXOR( 1, ah, 1) // 5
    AS2( shr eax, 16)
    XOR( edx, al, 2) // 6
    XOR( ecx, ah, 3) // 7
    AS2( mov eax, edi)
    AS2( movd edi, xmm2)
    XOR( ecx, al, 0) // 8
    XOR( ebx, ah, 1) // 9
    AS2( shr eax, 16)
    MXOR( 1, al, 2) // 10
    XOR( edx, ah, 3) // 11
    AS2( mov eax, edi)
    XOR( edx, al, 0) // 12
    XOR( ecx, ah, 1) // 13
    AS2( shr eax, 16)
    XOR( ebx, al, 2) // 14
    MXOR( 1, ah, 3) // 15
    AS2( MOVD eax, MM(1))

    AS2( add L_REG, [L_KEYS_BEGIN])
    AS2( add L_REG, 4*16)
    ATT_NOPREFIX
    ASJ( jmp, 2, f)
    INTEL_NOPREFIX
    ASL(1)
    // counter-mode per-block setup
    AS2( MOVD ecx, MM(2))
    AS2( MOVD edx, MM(1))
    AS2( mov eax, [L_SAVED_X+0*4])
    AS2( mov ebx, [L_SAVED_X+1*4])
    AS2( xor cl, ch)
    AS2( and WORD_REG(cx), 255)
    ASL(5)
#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
    AS2( paddb MM(2), mm3)
#else
    AS2( add MM(2), 1)
#endif
    // remaining part of second round, in: edx(previous round),esi(keyed counter byte) eax,ebx,[L_SAVED_X+2*4],[L_SAVED_X+3*4], out: eax,ebx,ecx,edx
    AS2( xor edx, DWORD PTR [AS_REG_7+WORD_REG(cx)*8+3])
    XOR( ebx, dl, 3)
    MOV( ecx, dh, 2)
    AS2( shr edx, 16)
    AS2( xor ecx, [L_SAVED_X+2*4])
    XOR( eax, dh, 0)
    MOV( edx, dl, 1)
    AS2( xor edx, [L_SAVED_X+3*4])

    AS2( add L_REG, [L_KEYS_BEGIN])
    AS2( add L_REG, 3*16)
    ATT_NOPREFIX
    ASJ( jmp, 4, f)
    INTEL_NOPREFIX

// in: eax(0,1,2,3), ebx(4,5,6,7), ecx(8,9,10,11), edx(12,13,14,15)
// out: eax, ebx, edi, mm0
#define ROUND() \
    MXOR( 0, cl, 3) /* 11 */\
    AS2( mov cl, al) /* 8,9,10,3 */\
    XOR( edi, ah, 2) /* 2 */\
    AS2( shr eax, 16) /* 0,1 */\
    XOR( edi, bl, 3) /* 7 */\
    MXOR( 0, bh, 2) /* 6 */\
    AS2( shr ebx, 16) /* 4,5 */\
    MXOR( 0, al, 1) /* 1 */\
    MOV( eax, ah, 0) /* 0 */\
    XOR( eax, bl, 1) /* 5 */\
    MOV( ebx, bh, 0) /* 4 */\
    XOR( eax, ch, 2) /* 10 */\
    XOR( ebx, cl, 3) /* 3 */\
    AS2( shr ecx, 16) /* 8,9 */\
    XOR( eax, dl, 3) /* 15 */\
    XOR( ebx, dh, 2) /* 14 */\
    AS2( shr edx, 16) /* 12,13 */\
    XOR( edi, ch, 0) /* 8 */\
    XOR( ebx, cl, 1) /* 9 */\
    XOR( edi, dl, 1) /* 13 */\
    MXOR( 0, dh, 0) /* 12 */\

    ASL(2) // 2-round loop
    AS2( MOVD MM(0), [L_SUBKEYS-4*16+3*4])
    AS2( mov edi, [L_SUBKEYS-4*16+2*4])
    ROUND()
    AS2( mov ecx, edi)
    AS2( xor eax, [L_SUBKEYS-4*16+0*4])
    AS2( xor ebx, [L_SUBKEYS-4*16+1*4])
    AS2( MOVD edx, MM(0))

    ASL(4)
    AS2( MOVD MM(0), [L_SUBKEYS-4*16+7*4])
    AS2( mov edi, [L_SUBKEYS-4*16+6*4])
    ROUND()
    AS2( mov ecx, edi)
    AS2( xor eax, [L_SUBKEYS-4*16+4*4])
    AS2( xor ebx, [L_SUBKEYS-4*16+5*4])
    AS2( MOVD edx, MM(0))

    AS2( add L_REG, 32)
    AS2( test L_REG, 255)
    ATT_NOPREFIX
    ASJ( jnz, 2, b)
    INTEL_NOPREFIX
    AS2( sub L_REG, 16*16)

#define LAST(a, b, c) \
    AS2( movzx esi, a )\
    AS2( movzx edi, BYTE PTR [AS_REG_7+WORD_REG(si)*8+1] )\
    AS2( movzx esi, b )\
    AS2( xor edi, DWORD PTR [AS_REG_7+WORD_REG(si)*8+0] )\
    AS2( mov WORD PTR [L_LASTROUND+c], di )\

    // last round
    LAST(ch, dl, 2)
    LAST(dh, al, 6)
    AS2( shr edx, 16)
    LAST(ah, bl, 10)
    AS2( shr eax, 16)
    LAST(bh, cl, 14)
    AS2( shr ebx, 16)
    LAST(dh, al, 12)
    AS2( shr ecx, 16)
    LAST(ah, bl, 0)
    LAST(bh, cl, 4)
    LAST(ch, dl, 8)

    AS2( mov WORD_REG(ax), [L_OUTXORBLOCKS])
    AS2( mov WORD_REG(bx), [L_OUTBLOCKS])

    AS2( mov WORD_REG(cx), [L_LENGTH])
    AS2( sub WORD_REG(cx), 16)

    AS2( movdqu xmm2, [WORD_REG(ax)])
    AS2( pxor xmm2, xmm4)

#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
    AS2( movdqa xmm0, [L_INCREMENTS])
    AS2( paddd xmm0, [L_INBLOCKS])
    AS2( movdqa [L_INBLOCKS], xmm0)
#else
    AS2( movdqa xmm0, [L_INCREMENTS+16])
    AS2( paddq xmm0, [L_INBLOCKS+16])
    AS2( movdqa [L_INBLOCKS+16], xmm0)
#endif

    AS2( pxor xmm2, [L_LASTROUND])
    AS2( movdqu [WORD_REG(bx)], xmm2)

    ATT_NOPREFIX
    ASJ( jle, 7, f)
    INTEL_NOPREFIX
    AS2( mov [L_LENGTH], WORD_REG(cx))
    AS2( test WORD_REG(cx), 1)
    ATT_NOPREFIX
    ASJ( jnz, 1, b)
    INTEL_NOPREFIX
#if CRYPTOPP_BOOL_X64
    AS2( movdqa xmm0, [L_INCREMENTS])
    AS2( paddq xmm0, [L_INBLOCKS])
    AS2( movdqa [L_INBLOCKS], xmm0)
#endif
    ATT_NOPREFIX
    ASJ( jmp, 3, b)
    INTEL_NOPREFIX

    ASL(7)
    // erase keys on stack
    AS2( xorps xmm0, xmm0)
    AS2( lea WORD_REG(ax), [L_SUBKEYS+7*16])
    AS2( movaps [WORD_REG(ax)-7*16], xmm0)
    AS2( movaps [WORD_REG(ax)-6*16], xmm0)
    AS2( movaps [WORD_REG(ax)-5*16], xmm0)
    AS2( movaps [WORD_REG(ax)-4*16], xmm0)
    AS2( movaps [WORD_REG(ax)-3*16], xmm0)
    AS2( movaps [WORD_REG(ax)-2*16], xmm0)
    AS2( movaps [WORD_REG(ax)-1*16], xmm0)
    AS2( movaps [WORD_REG(ax)+0*16], xmm0)
    AS2( movaps [WORD_REG(ax)+1*16], xmm0)
    AS2( movaps [WORD_REG(ax)+2*16], xmm0)
    AS2( movaps [WORD_REG(ax)+3*16], xmm0)
    AS2( movaps [WORD_REG(ax)+4*16], xmm0)
    AS2( movaps [WORD_REG(ax)+5*16], xmm0)
    AS2( movaps [WORD_REG(ax)+6*16], xmm0)
#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
    AS2( mov esp, [L_SP])
    AS1( emms)
#endif
    AS_POP_IF86(bp)
    AS_POP_IF86(bx)
#if defined(_MSC_VER) && CRYPTOPP_BOOL_X86
    AS_POP_IF86(di)
    AS_POP_IF86(si)
    AS1(ret)
#endif
#ifdef CRYPTOPP_GENERATE_X64_MASM
    pop r12
    pop rbx
    pop rdi
    pop rsi
    ret
Rijndael_Enc_AdvancedProcessBlocks ENDP
#endif
#ifdef __GNUC__
    ATT_PREFIX
    :
    : "c" (locals), "d" (k), "S" (Te), "D" (g_cacheLineSize)
    : "memory", "cc", "%eax"
#if CRYPTOPP_BOOL_X64
    , "%rbx", "%r8", "%r9", "%r10", "%r11", "%r12"
#endif
    );
#endif
}

#endif

#ifndef CRYPTOPP_GENERATE_X64_MASM

#ifdef CRYPTOPP_X64_MASM_AVAILABLE
extern "C" {
void Rijndael_Enc_AdvancedProcessBlocks(void *locals, const word32 *k);
}
#endif

#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86

// Determine whether the range between begin and end overlaps
// with the same 4k block offsets as the Te table. Logically,
// the code is trying to create the condition:
//
// Two separate memory pages:
//
//    +-----+   +-----+
//    |XXXXX|   |YYYYY|
//    |XXXXX|   |YYYYY|
//    |     |   |     |
//    |     |   |     |
//    +-----+   +-----+
//    Te Table   Locals
//
// Have a logical cache view of (X and Y may be inverted):
//
//    +-----+
//    |XXXXX|
//    |XXXXX|
//    |YYYYY|
//    |YYYYY|
//    +-----+
//
static inline bool AliasedWithTable(const byte *begin, const byte *end)
{
    ptrdiff_t s0 = uintptr_t(begin)%4096, s1 = uintptr_t(end)%4096;
    ptrdiff_t t0 = uintptr_t(Te)%4096, t1 = (uintptr_t(Te)+sizeof(Te))%4096;
    if (t1 > t0)
        return (s0 >= t0 && s0 < t1) || (s1 > t0 && s1 <= t1);
    else
        return (s0 < t1 || s1 <= t1) || (s0 >= t0 || s1 > t0);
}

#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE

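// Note: subkeys points at m_rounds+1 round keys (the initial whitening key
// plus one per round); UncheckedSetKey allocated 4*(m_rounds+1) words.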
inline void AESNI_Enc_Block(__m128i &block, MAYBE_CONST __m128i *subkeys, unsigned int rounds)
{
    block = _mm_xor_si128(block, subkeys[0]);
    for (unsigned int i=1; i<rounds-1; i+=2)
    {
        block = _mm_aesenc_si128(block, subkeys[i]);
        block = _mm_aesenc_si128(block, subkeys[i+1]);
    }
    block = _mm_aesenc_si128(block, subkeys[rounds-1]);
    block = _mm_aesenclast_si128(block, subkeys[rounds]);
}

inline void AESNI_Enc_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3, MAYBE_CONST __m128i *subkeys, unsigned int rounds)
{
    __m128i rk = subkeys[0];
    block0 = _mm_xor_si128(block0, rk);
    block1 = _mm_xor_si128(block1, rk);
    block2 = _mm_xor_si128(block2, rk);
    block3 = _mm_xor_si128(block3, rk);
    for (unsigned int i=1; i<rounds; i++)
    {
        rk = subkeys[i];
        block0 = _mm_aesenc_si128(block0, rk);
        block1 = _mm_aesenc_si128(block1, rk);
        block2 = _mm_aesenc_si128(block2, rk);
        block3 = _mm_aesenc_si128(block3, rk);
    }
    rk = subkeys[rounds];
    block0 = _mm_aesenclast_si128(block0, rk);
    block1 = _mm_aesenclast_si128(block1, rk);
    block2 = _mm_aesenclast_si128(block2, rk);
    block3 = _mm_aesenclast_si128(block3, rk);
}

inline void AESNI_Dec_Block(__m128i &block, MAYBE_CONST __m128i *subkeys, unsigned int rounds)
{
    block = _mm_xor_si128(block, subkeys[0]);
    for (unsigned int i=1; i<rounds-1; i+=2)
    {
        block = _mm_aesdec_si128(block, subkeys[i]);
        block = _mm_aesdec_si128(block, subkeys[i+1]);
    }
    block = _mm_aesdec_si128(block, subkeys[rounds-1]);
    block = _mm_aesdeclast_si128(block, subkeys[rounds]);
}

inline void AESNI_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3, MAYBE_CONST __m128i *subkeys, unsigned int rounds)
{
    __m128i rk = subkeys[0];
    block0 = _mm_xor_si128(block0, rk);
    block1 = _mm_xor_si128(block1, rk);
    block2 = _mm_xor_si128(block2, rk);
    block3 = _mm_xor_si128(block3, rk);
    for (unsigned int i=1; i<rounds; i++)
    {
        rk = subkeys[i];
        block0 = _mm_aesdec_si128(block0, rk);
        block1 = _mm_aesdec_si128(block1, rk);
        block2 = _mm_aesdec_si128(block2, rk);
        block3 = _mm_aesdec_si128(block3, rk);
    }
    rk = subkeys[rounds];
    block0 = _mm_aesdeclast_si128(block0, rk);
    block1 = _mm_aesdeclast_si128(block1, rk);
    block2 = _mm_aesdeclast_si128(block2, rk);
    block3 = _mm_aesdeclast_si128(block3, rk);
}

CRYPTOPP_ALIGN_DATA(16)
static const word32 s_one[] = {0, 0, 0, 1<<24};
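// Viewed as bytes, s_one is 00 ... 00 01: _mm_add_epi32 on the last lane
// increments the final counter byte, mirroring the scalar inBlocks[15]++
// below. As with the scalar path, carry out of that byte is left to the
// caller (CTR mode stops each batch before the low counter byte wraps).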

template <typename F1, typename F4>
inline size_t AESNI_AdvancedProcessBlocks(F1 func1, F4 func4, MAYBE_CONST __m128i *subkeys, unsigned int rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    size_t blockSize = 16;
    size_t inIncrement = (flags & (BlockTransformation::BT_InBlockIsCounter|BlockTransformation::BT_DontIncrementInOutPointers)) ? 0 : blockSize;
    size_t xorIncrement = xorBlocks ? blockSize : 0;
    size_t outIncrement = (flags & BlockTransformation::BT_DontIncrementInOutPointers) ? 0 : blockSize;

    if (flags & BlockTransformation::BT_ReverseDirection)
    {
        CRYPTOPP_ASSERT(length % blockSize == 0);
        inBlocks += length - blockSize;
        xorBlocks += length - blockSize;
        outBlocks += length - blockSize;
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BlockTransformation::BT_AllowParallel)
    {
        while (length >= 4*blockSize)
        {
            __m128i block0 = _mm_loadu_si128((const __m128i *)(const void *)inBlocks), block1, block2, block3;
            if (flags & BlockTransformation::BT_InBlockIsCounter)
            {
                const __m128i be1 = *(const __m128i *)(const void *)s_one;
                block1 = _mm_add_epi32(block0, be1);
                block2 = _mm_add_epi32(block1, be1);
                block3 = _mm_add_epi32(block2, be1);
                _mm_storeu_si128((__m128i *)(void *)inBlocks, _mm_add_epi32(block3, be1));
            }
            else
            {
                inBlocks += inIncrement;
                block1 = _mm_loadu_si128((const __m128i *)(const void *)inBlocks);
                inBlocks += inIncrement;
                block2 = _mm_loadu_si128((const __m128i *)(const void *)inBlocks);
                inBlocks += inIncrement;
                block3 = _mm_loadu_si128((const __m128i *)(const void *)inBlocks);
                inBlocks += inIncrement;
            }

            if (flags & BlockTransformation::BT_XorInput)
            {
                // Coverity finding, appears to be false positive. Assert the condition.
                CRYPTOPP_ASSERT(xorBlocks);
                block0 = _mm_xor_si128(block0, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
                xorBlocks += xorIncrement;
                block1 = _mm_xor_si128(block1, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
                xorBlocks += xorIncrement;
                block2 = _mm_xor_si128(block2, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
                xorBlocks += xorIncrement;
                block3 = _mm_xor_si128(block3, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
                xorBlocks += xorIncrement;
            }

            func4(block0, block1, block2, block3, subkeys, rounds);

            if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
                xorBlocks += xorIncrement;
                block1 = _mm_xor_si128(block1, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
                xorBlocks += xorIncrement;
                block2 = _mm_xor_si128(block2, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
                xorBlocks += xorIncrement;
                block3 = _mm_xor_si128(block3, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
                xorBlocks += xorIncrement;
            }

            _mm_storeu_si128((__m128i *)(void *)outBlocks, block0);
            outBlocks += outIncrement;
            _mm_storeu_si128((__m128i *)(void *)outBlocks, block1);
            outBlocks += outIncrement;
            _mm_storeu_si128((__m128i *)(void *)outBlocks, block2);
            outBlocks += outIncrement;
            _mm_storeu_si128((__m128i *)(void *)outBlocks, block3);
            outBlocks += outIncrement;

            length -= 4*blockSize;
        }
    }

    while (length >= blockSize)
    {
        __m128i block = _mm_loadu_si128((const __m128i *)(const void *)inBlocks);

        if (flags & BlockTransformation::BT_XorInput)
            block = _mm_xor_si128(block, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));

        if (flags & BlockTransformation::BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func1(block, subkeys, rounds);

        if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
            block = _mm_xor_si128(block, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));

        _mm_storeu_si128((__m128i *)(void *)outBlocks, block);

        inBlocks += inIncrement;
        outBlocks += outIncrement;
        xorBlocks += xorIncrement;
        length -= blockSize;
    }

    return length;
}
#endif

#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
struct Locals
{
    word32 subkeys[4*12], workspace[8];
    const byte *inBlocks, *inXorBlocks, *outXorBlocks;
    byte *outBlocks;
    size_t inIncrement, inXorIncrement, outXorIncrement, outIncrement;
    size_t regSpill, lengthAndCounterFlag, keysBegin;
};

const size_t s_aliasPageSize = 4096;
const size_t s_aliasBlockSize = 256;
const size_t s_sizeToAllocate = s_aliasPageSize + s_aliasBlockSize + sizeof(Locals);
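// One extra page plus one alignment block of slack lets AdvancedProcessBlocks
// slide the Locals area in 256-byte steps until its 4 KB page offsets no
// longer collide with those of the Te table (see AliasedWithTable above).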

Rijndael::Enc::Enc() : m_aliasBlock(s_sizeToAllocate) { }
#endif

size_t Rijndael::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
{
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
    if (HasAESNI())
        return AESNI_AdvancedProcessBlocks(AESNI_Enc_Block, AESNI_Enc_4_Blocks, (MAYBE_CONST __m128i *)(const void *)m_key.begin(), m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
#endif

#if (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
    if (HasSSE2())
    {
        if (length < BLOCKSIZE)
            return length;

        static const byte *zeros = (const byte*)(Te+256);
        byte *space = NULL, *originalSpace = const_cast<byte*>(m_aliasBlock.data());

        // round up to nearest 256 byte boundary
        space = originalSpace + (s_aliasBlockSize - (uintptr_t)originalSpace % s_aliasBlockSize) % s_aliasBlockSize;
        while (AliasedWithTable(space, space + sizeof(Locals)))
        {
            space += 256;
            CRYPTOPP_ASSERT(space < (originalSpace + s_aliasPageSize));
        }

        size_t increment = BLOCKSIZE;
        if (flags & BT_ReverseDirection)
        {
            CRYPTOPP_ASSERT(length % BLOCKSIZE == 0);
            inBlocks += length - BLOCKSIZE;
            xorBlocks += length - BLOCKSIZE;
            outBlocks += length - BLOCKSIZE;
            increment = 0-increment;
        }

        Locals &locals = *(Locals *)(void *)space;

        locals.inBlocks = inBlocks;
        locals.inXorBlocks = (flags & BT_XorInput) && xorBlocks ? xorBlocks : zeros;
        locals.outXorBlocks = (flags & BT_XorInput) || !xorBlocks ? zeros : xorBlocks;
        locals.outBlocks = outBlocks;

        locals.inIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment;
        locals.inXorIncrement = (flags & BT_XorInput) && xorBlocks ? increment : 0;
        locals.outXorIncrement = (flags & BT_XorInput) || !xorBlocks ? 0 : increment;
        locals.outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment;

        locals.lengthAndCounterFlag = length - (length%16) - bool(flags & BT_InBlockIsCounter);
        int keysToCopy = m_rounds - (flags & BT_InBlockIsCounter ? 3 : 2);
        locals.keysBegin = (12-keysToCopy)*16;

        Rijndael_Enc_AdvancedProcessBlocks(&locals, m_key);

        return length % BLOCKSIZE;
    }
#endif

    return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
}
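
// Usage sketch (illustrative): the fast paths above are normally reached via
// the mode templates in modes.h rather than by calling this directly, e.g.
//
//   CryptoPP::CTR_Mode<CryptoPP::AES>::Encryption ctr(key, 16, iv);
//   ctr.ProcessData(out, in, len);  // exercises the BT_InBlockIsCounter path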

#endif

#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE

size_t Rijndael::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
{
    if (HasAESNI())
        return AESNI_AdvancedProcessBlocks(AESNI_Dec_Block, AESNI_Dec_4_Blocks, (MAYBE_CONST __m128i *)(const void *)m_key.begin(), m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);

    return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
}

#endif // #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE

NAMESPACE_END

#endif
#endif
---|