1 // rijndael.cpp - modified by Chris Morgan <cmorgan@wpi.edu>
2 // and Wei Dai from Paulo Barreto's Rijndael implementation
3 // The original code and all modifications are in the public domain.
4 
5 // use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM rijndael.cpp" to generate MASM code
6 
7 /*
8 July 2010: Added support for AES-NI instructions via compiler intrinsics.
9 */
10 
11 /*
12 Feb 2009: The x86/x64 assembly code was rewritten by Wei Dai to do counter mode
13 caching, which was invented by Hongjun Wu and popularized by Daniel J. Bernstein
14 and Peter Schwabe in their paper "New AES software speed records". The round
15 function was also modified to include a trick similar to one in Brian Gladman's
16 x86 assembly code, doing an 8-bit register move to minimize the number of
17 register spills. Also switched to compressed tables and copying round keys to
18 the stack.
19 
20 The C++ implementation now uses compressed tables if
21 CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS is defined.
22 */
23 
24 /*
25 July 2006: Defense against timing attacks was added by Wei Dai.
26 
27 The code now uses smaller tables in the first and last rounds,
28 and preloads them into L1 cache before usage (by loading at least
29 one element in each cache line).
30 
31 We try to delay subsequent accesses to each table (used in the first
32 and last rounds) until all of the table has been preloaded. Hopefully
33 the compiler isn't smart enough to optimize that code away.
34 
35 After preloading the table, we also try not to access any memory location
36 other than the table and the stack, in order to prevent table entries from
37 being unloaded from L1 cache, until that round is finished.
38 (Some popular CPUs have 2-way associative caches.)
39 */
40 
41 // This is the original introductory comment:
42 
43 /**
44  * version 3.0 (December 2000)
45  *
46  * Optimised ANSI C code for the Rijndael cipher (now AES)
47  *
48  * author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be>
49  * author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be>
50  * author Paulo Barreto <paulo.barreto@terra.com.br>
51  *
52  * This code is hereby placed in the public domain.
53  *
54  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
55  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
56  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
58  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
59  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
60  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
61  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
62  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
63  * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
64  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
65  */
66 
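// Typical use goes through the public AES/Rijndael interface rather than the
// internals in this file. A minimal sketch (key, in, out and recovered are
// caller-supplied buffers, not defined here):
//
//   CryptoPP::AES::Encryption enc(key, 16);  // 16-, 24- or 32-byte keys give 10/12/14 rounds
//   enc.ProcessBlock(in, out);               // encrypt one 16-byte block
//
//   CryptoPP::AES::Decryption dec(key, 16);
//   dec.ProcessBlock(out, recovered);        // recover the plaintext block
//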
67 #include "pch.h"
68 
69 #ifndef CRYPTOPP_IMPORTS
70 #ifndef CRYPTOPP_GENERATE_X64_MASM
71 
72 #include "rijndael.h"
73 #include "misc.h"
74 #include "cpu.h"
75 
76 NAMESPACE_BEGIN(CryptoPP)
77 
78 #ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
79 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
80 namespace rdtable {CRYPTOPP_ALIGN_DATA(16) word64 Te[256+2];}
81 using namespace rdtable;
82 #else
83 static word64 Te[256];
84 #endif
85 static word64 Td[256];
86 #else
87 static word32 Te[256*4], Td[256*4];
88 #endif
89 static volatile bool s_TeFilled = false, s_TdFilled = false;
90 
91 // ************************* Portable Code ************************************
92 
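// QUARTER_ROUND feeds the four bytes of state word t through table T, XORing the
// looked-up columns into four different output words. The _FE/_FD variants are the
// first-round form (they index the table with a different byte offset to account
// for the byte order of the freshly loaded input words), and the _LE/_LD variants
// extract the single S-box bytes needed by the final round into tempBlock.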
93 #define QUARTER_ROUND(L, T, t, a, b, c, d) \
94  a ^= L(T, 3, byte(t)); t >>= 8;\
95  b ^= L(T, 2, byte(t)); t >>= 8;\
96  c ^= L(T, 1, byte(t)); t >>= 8;\
97  d ^= L(T, 0, t);
98 
99 #define QUARTER_ROUND_LE(t, a, b, c, d) \
100  tempBlock[a] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
101  tempBlock[b] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
102  tempBlock[c] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
103  tempBlock[d] = ((byte *)(Te+t))[1];
104 
105 #ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
106  #define QUARTER_ROUND_LD(t, a, b, c, d) \
107  tempBlock[a] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
108  tempBlock[b] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
109  tempBlock[c] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
110  tempBlock[d] = ((byte *)(Td+t))[GetNativeByteOrder()*7];
111 #else
112  #define QUARTER_ROUND_LD(t, a, b, c, d) \
113  tempBlock[a] = Sd[byte(t)]; t >>= 8;\
114  tempBlock[b] = Sd[byte(t)]; t >>= 8;\
115  tempBlock[c] = Sd[byte(t)]; t >>= 8;\
116  tempBlock[d] = Sd[t];
117 #endif
118 
119 #define QUARTER_ROUND_E(t, a, b, c, d) QUARTER_ROUND(TL_M, Te, t, a, b, c, d)
120 #define QUARTER_ROUND_D(t, a, b, c, d) QUARTER_ROUND(TL_M, Td, t, a, b, c, d)
121 
122 #ifdef IS_LITTLE_ENDIAN
123  #define QUARTER_ROUND_FE(t, a, b, c, d) QUARTER_ROUND(TL_F, Te, t, d, c, b, a)
124  #define QUARTER_ROUND_FD(t, a, b, c, d) QUARTER_ROUND(TL_F, Td, t, d, c, b, a)
125  #ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
126  #define TL_F(T, i, x) (*(word32 *)((byte *)T + x*8 + (6-i)%4+1))
127  #define TL_M(T, i, x) (*(word32 *)((byte *)T + x*8 + (i+3)%4+1))
128  #else
129  #define TL_F(T, i, x) rotrFixed(T[x], (3-i)*8)
130  #define TL_M(T, i, x) T[i*256 + x]
131  #endif
132 #else
133  #define QUARTER_ROUND_FE(t, a, b, c, d) QUARTER_ROUND(TL_F, Te, t, a, b, c, d)
134  #define QUARTER_ROUND_FD(t, a, b, c, d) QUARTER_ROUND(TL_F, Td, t, a, b, c, d)
135  #ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
136  #define TL_F(T, i, x) (*(word32 *)((byte *)T + x*8 + (4-i)%4))
137  #define TL_M TL_F
138  #else
139  #define TL_F(T, i, x) rotrFixed(T[x], i*8)
140  #define TL_M(T, i, x) T[i*256 + x]
141  #endif
142 #endif
143 
144 
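// The f* macros below multiply by small constants in GF(2^8), reducing modulo the
// AES polynomial x^8 + x^4 + x^3 + x + 1 (0x11b). For example, f2(0x80) = 0x100 ^ 0x11b
// = 0x1b, and f3(x) = f2(x) ^ x is multiplication by 3. They are used only while
// filling the encryption and decryption tables below.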
145 #define f2(x) ((x<<1)^(((x>>7)&1)*0x11b))
146 #define f4(x) ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b))
147 #define f8(x) ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b))
148 
149 #define f3(x) (f2(x) ^ x)
150 #define f9(x) (f8(x) ^ x)
151 #define fb(x) (f8(x) ^ f2(x) ^ x)
152 #define fd(x) (f8(x) ^ f4(x) ^ x)
153 #define fe(x) (f8(x) ^ f4(x) ^ f2(x))
154 
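// FillEncTable builds the combined SubBytes+MixColumns lookup table. With
// CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS, each 64-bit entry stores the bytes
// {0, x, x, 2x, 3x, x, x, 2x} with x = Se[i] (little-endian layout), so all four
// byte-rotations of the MixColumns column can be read as unaligned 32-bit loads at
// offsets 1..4; this compresses the table to 2 KB. Otherwise four 1 KB rotated
// copies (Te[i], Te[i+256], ...) are kept.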
155 void Rijndael::Base::FillEncTable()
156 {
157  for (int i=0; i<256; i++)
158  {
159  byte x = Se[i];
160 #ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
161  word32 y = word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24;
162  Te[i] = word64(y | f3(x))<<32 | y;
163 #else
164  word32 y = f3(x) | word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24;
165  for (int j=0; j<4; j++)
166  {
167  Te[i+j*256] = y;
168  y = rotrFixed(y, 8);
169  }
170 #endif
171  }
172 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
173  Te[256] = Te[257] = 0;
174 #endif
175  s_TeFilled = true;
176 }
177 
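// FillDecTable does the same for decryption, packing the InvMixColumns multiples
// {0e,0b,0d,09} of x = Sd[i] in the rotated layout; byte 0 of each entry also holds
// the plain Sd[i] value so the last round can fetch the inverse S-box output directly.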
178 void Rijndael::Base::FillDecTable()
179 {
180  for (int i=0; i<256; i++)
181  {
182  byte x = Sd[i];
183 #ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
184  word32 y = word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24;
185  Td[i] = word64(y | fb(x))<<32 | y | x;
186 #else
187  word32 y = fb(x) | word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24;
188  for (int j=0; j<4; j++)
189  {
190  Td[i+j*256] = y;
191  y = rotrFixed(y, 8);
192  }
193 #endif
194  }
195  s_TdFilled = true;
196 }
197 
198 void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, const NameValuePairs &)
199 {
200  AssertValidKeyLength(keylen);
201 
202  m_rounds = keylen/4 + 6;
203  m_key.New(4*(m_rounds+1));
204 
205  word32 *rk = m_key;
206 
207 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE && (!defined(_MSC_VER) || _MSC_VER >= 1600 || CRYPTOPP_BOOL_X86)
208  // MSVC 2008 SP1 generates bad code for _mm_extract_epi32() when compiling for X64
209  if (HasAESNI())
210  {
211  static const word32 rcLE[] = {
212  0x01, 0x02, 0x04, 0x08,
213  0x10, 0x20, 0x40, 0x80,
214  0x1B, 0x36, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
215  };
216  const word32 *rc = rcLE;
217 
218  __m128i temp = _mm_loadu_si128((__m128i *)(userKey+keylen-16));
219  memcpy(rk, userKey, keylen);
220 
221  while (true)
222  {
223  rk[keylen/4] = rk[0] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 3) ^ *(rc++);
224  rk[keylen/4+1] = rk[1] ^ rk[keylen/4];
225  rk[keylen/4+2] = rk[2] ^ rk[keylen/4+1];
226  rk[keylen/4+3] = rk[3] ^ rk[keylen/4+2];
227 
228  if (rk + keylen/4 + 4 == m_key.end())
229  break;
230 
231  if (keylen == 24)
232  {
233  rk[10] = rk[ 4] ^ rk[ 9];
234  rk[11] = rk[ 5] ^ rk[10];
235  temp = _mm_insert_epi32(temp, rk[11], 3);
236  }
237  else if (keylen == 32)
238  {
239  temp = _mm_insert_epi32(temp, rk[11], 3);
240  rk[12] = rk[ 4] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 2);
241  rk[13] = rk[ 5] ^ rk[12];
242  rk[14] = rk[ 6] ^ rk[13];
243  rk[15] = rk[ 7] ^ rk[14];
244  temp = _mm_insert_epi32(temp, rk[15], 3);
245  }
246  else
247  temp = _mm_insert_epi32(temp, rk[7], 3);
248 
249  rk += keylen/4;
250  }
251 
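 // AESDEC expects round keys in "equivalent inverse cipher" form: reverse the
 // key schedule and apply InvMixColumns (aesimc) to every round key except the
 // first and last.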
252  if (!IsForwardTransformation())
253  {
254  rk = m_key;
255  unsigned int i, j;
256 
257  std::swap(*(__m128i *)(rk), *(__m128i *)(rk+4*m_rounds));
258 
259  for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4)
260  {
261  temp = _mm_aesimc_si128(*(__m128i *)(rk+i));
262  *(__m128i *)(rk+i) = _mm_aesimc_si128(*(__m128i *)(rk+j));
263  *(__m128i *)(rk+j) = temp;
264  }
265 
266  *(__m128i *)(rk+i) = _mm_aesimc_si128(*(__m128i *)(rk+i));
267  }
268 
269  return;
270  }
271 #endif
272 
273  GetUserKey(BIG_ENDIAN_ORDER, rk, keylen/4, userKey, keylen);
274  const word32 *rc = rcon;
275  word32 temp;
276 
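 // Standard AES key expansion: each pass derives keylen/4 more round-key words.
 // The first new word is the word keylen/4 positions back XORed with
 // SubWord(RotWord(previous word)) and the rcon constant; 256-bit keys
 // additionally apply SubWord in the middle of each group.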
277  while (true)
278  {
279  temp = rk[keylen/4-1];
280  word32 x = (word32(Se[GETBYTE(temp, 2)]) << 24) ^ (word32(Se[GETBYTE(temp, 1)]) << 16) ^ (word32(Se[GETBYTE(temp, 0)]) << 8) ^ Se[GETBYTE(temp, 3)];
281  rk[keylen/4] = rk[0] ^ x ^ *(rc++);
282  rk[keylen/4+1] = rk[1] ^ rk[keylen/4];
283  rk[keylen/4+2] = rk[2] ^ rk[keylen/4+1];
284  rk[keylen/4+3] = rk[3] ^ rk[keylen/4+2];
285 
286  if (rk + keylen/4 + 4 == m_key.end())
287  break;
288 
289  if (keylen == 24)
290  {
291  rk[10] = rk[ 4] ^ rk[ 9];
292  rk[11] = rk[ 5] ^ rk[10];
293  }
294  else if (keylen == 32)
295  {
296  temp = rk[11];
297  rk[12] = rk[ 4] ^ (word32(Se[GETBYTE(temp, 3)]) << 24) ^ (word32(Se[GETBYTE(temp, 2)]) << 16) ^ (word32(Se[GETBYTE(temp, 1)]) << 8) ^ Se[GETBYTE(temp, 0)];
298  rk[13] = rk[ 5] ^ rk[12];
299  rk[14] = rk[ 6] ^ rk[13];
300  rk[15] = rk[ 7] ^ rk[14];
301  }
302  rk += keylen/4;
303  }
304 
305  rk = m_key;
306 
307  if (IsForwardTransformation())
308  {
309  if (!s_TeFilled)
310  FillEncTable();
311 
312  ConditionalByteReverse(BIG_ENDIAN_ORDER, rk, rk, 16);
313  ConditionalByteReverse(BIG_ENDIAN_ORDER, rk + m_rounds*4, rk + m_rounds*4, 16);
314  }
315  else
316  {
317  if (!s_TdFilled)
318  FillDecTable();
319 
320  unsigned int i, j;
321 
322 #define InverseMixColumn(x) TL_M(Td, 0, Se[GETBYTE(x, 3)]) ^ TL_M(Td, 1, Se[GETBYTE(x, 2)]) ^ TL_M(Td, 2, Se[GETBYTE(x, 1)]) ^ TL_M(Td, 3, Se[GETBYTE(x, 0)])
323 
324  for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4)
325  {
326  temp = InverseMixColumn(rk[i ]); rk[i ] = InverseMixColumn(rk[j ]); rk[j ] = temp;
327  temp = InverseMixColumn(rk[i + 1]); rk[i + 1] = InverseMixColumn(rk[j + 1]); rk[j + 1] = temp;
328  temp = InverseMixColumn(rk[i + 2]); rk[i + 2] = InverseMixColumn(rk[j + 2]); rk[j + 2] = temp;
329  temp = InverseMixColumn(rk[i + 3]); rk[i + 3] = InverseMixColumn(rk[j + 3]); rk[j + 3] = temp;
330  }
331 
332  rk[i+0] = InverseMixColumn(rk[i+0]);
333  rk[i+1] = InverseMixColumn(rk[i+1]);
334  rk[i+2] = InverseMixColumn(rk[i+2]);
335  rk[i+3] = InverseMixColumn(rk[i+3]);
336 
337  temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[0]); rk[0] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+0]); rk[4*m_rounds+0] = temp;
338  temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[1]); rk[1] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+1]); rk[4*m_rounds+1] = temp;
339  temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[2]); rk[2] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+2]); rk[4*m_rounds+2] = temp;
340  temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[3]); rk[3] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+3]); rk[4*m_rounds+3] = temp;
341  }
342 
343 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
344  if (HasAESNI())
345  ConditionalByteReverse(BIG_ENDIAN_ORDER, rk+4, rk+4, (m_rounds-1)*16);
346 #endif
347 }
348 
349 void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
350 {
351 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) || CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
352  if (HasSSE2())
353  {
354  Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
355  return;
356  }
357 #endif
358 
360 
361  word32 s0, s1, s2, s3, t0, t1, t2, t3;
362  Block::Get(inBlock)(s0)(s1)(s2)(s3);
363 
364  const word32 *rk = m_key;
365  s0 ^= rk[0];
366  s1 ^= rk[1];
367  s2 ^= rk[2];
368  s3 ^= rk[3];
369  t0 = rk[4];
370  t1 = rk[5];
371  t2 = rk[6];
372  t3 = rk[7];
373  rk += 8;
374 
375  // timing attack countermeasure. see comments at top for more details
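 // u is always zero (ANDing into it cannot set bits), so ORing it into the state
 // below changes nothing; the point of the loop is to touch one word in each
 // cache line of the table so that every line is resident before the first lookup.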
376  const int cacheLineSize = GetCacheLineSize();
377  unsigned int i;
378  word32 u = 0;
379 #ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
380  for (i=0; i<2048; i+=cacheLineSize)
381 #else
382  for (i=0; i<1024; i+=cacheLineSize)
383 #endif
384  u &= *(const word32 *)(((const byte *)Te)+i);
385  u &= Te[255];
386  s0 |= u; s1 |= u; s2 |= u; s3 |= u;
387 
388  QUARTER_ROUND_FE(s3, t0, t1, t2, t3)
389  QUARTER_ROUND_FE(s2, t3, t0, t1, t2)
390  QUARTER_ROUND_FE(s1, t2, t3, t0, t1)
391  QUARTER_ROUND_FE(s0, t1, t2, t3, t0)
392 
393  // Nr - 2 full rounds:
394  unsigned int r = m_rounds/2 - 1;
395  do
396  {
397  s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];
398 
399  QUARTER_ROUND_E(t3, s0, s1, s2, s3)
400  QUARTER_ROUND_E(t2, s3, s0, s1, s2)
401  QUARTER_ROUND_E(t1, s2, s3, s0, s1)
402  QUARTER_ROUND_E(t0, s1, s2, s3, s0)
403 
404  t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];
405 
406  QUARTER_ROUND_E(s3, t0, t1, t2, t3)
407  QUARTER_ROUND_E(s2, t3, t0, t1, t2)
408  QUARTER_ROUND_E(s1, t2, t3, t0, t1)
409  QUARTER_ROUND_E(s0, t1, t2, t3, t0)
410 
411  rk += 8;
412  } while (--r);
413 
414  word32 tbw[4];
415  byte *const tempBlock = (byte *)tbw;
416 
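 // The scattered destination indices below implement the final round's ShiftRows
 // while writing the S-box output bytes into tempBlock.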
417  QUARTER_ROUND_LE(t2, 15, 2, 5, 8)
418  QUARTER_ROUND_LE(t1, 11, 14, 1, 4)
419  QUARTER_ROUND_LE(t0, 7, 10, 13, 0)
420  QUARTER_ROUND_LE(t3, 3, 6, 9, 12)
421 
422  Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
423 }
424 
425 void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
426 {
427 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
428  if (HasAESNI())
429  {
430  Rijndael::Dec::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
431  return;
432  }
433 #endif
434 
436 
437  word32 s0, s1, s2, s3, t0, t1, t2, t3;
438  Block::Get(inBlock)(s0)(s1)(s2)(s3);
439 
440  const word32 *rk = m_key;
441  s0 ^= rk[0];
442  s1 ^= rk[1];
443  s2 ^= rk[2];
444  s3 ^= rk[3];
445  t0 = rk[4];
446  t1 = rk[5];
447  t2 = rk[6];
448  t3 = rk[7];
449  rk += 8;
450 
451  // timing attack countermeasure. see comments at top for more details
452  const int cacheLineSize = GetCacheLineSize();
453  unsigned int i;
454  word32 u = 0;
455 #ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
456  for (i=0; i<2048; i+=cacheLineSize)
457 #else
458  for (i=0; i<1024; i+=cacheLineSize)
459 #endif
460  u &= *(const word32 *)(((const byte *)Td)+i);
461  u &= Td[255];
462  s0 |= u; s1 |= u; s2 |= u; s3 |= u;
463 
464  QUARTER_ROUND_FD(s3, t2, t1, t0, t3)
465  QUARTER_ROUND_FD(s2, t1, t0, t3, t2)
466  QUARTER_ROUND_FD(s1, t0, t3, t2, t1)
467  QUARTER_ROUND_FD(s0, t3, t2, t1, t0)
468 
469  // Nr - 2 full rounds:
470  unsigned int r = m_rounds/2 - 1;
471  do
472  {
473  s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];
474 
475  QUARTER_ROUND_D(t3, s2, s1, s0, s3)
476  QUARTER_ROUND_D(t2, s1, s0, s3, s2)
477  QUARTER_ROUND_D(t1, s0, s3, s2, s1)
478  QUARTER_ROUND_D(t0, s3, s2, s1, s0)
479 
480  t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];
481 
482  QUARTER_ROUND_D(s3, t2, t1, t0, t3)
483  QUARTER_ROUND_D(s2, t1, t0, t3, t2)
484  QUARTER_ROUND_D(s1, t0, t3, t2, t1)
485  QUARTER_ROUND_D(s0, t3, t2, t1, t0)
486 
487  rk += 8;
488  } while (--r);
489 
490 #ifndef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
491  // timing attack countermeasure. see comments at top for more details
492  // If CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS is defined,
493  // QUARTER_ROUND_LD will use Td, which is already preloaded.
494  u = 0;
495  for (i=0; i<256; i+=cacheLineSize)
496  u &= *(const word32 *)(Sd+i);
497  u &= *(const word32 *)(Sd+252);
498  t0 |= u; t1 |= u; t2 |= u; t3 |= u;
499 #endif
500 
501  word32 tbw[4];
502  byte *const tempBlock = (byte *)tbw;
503 
504  QUARTER_ROUND_LD(t2, 7, 2, 13, 8)
505  QUARTER_ROUND_LD(t1, 3, 14, 9, 4)
506  QUARTER_ROUND_LD(t0, 15, 10, 5, 0)
507  QUARTER_ROUND_LD(t3, 11, 6, 1, 12)
508 
509  Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
510 }
511 
512 // ************************* Assembly Code ************************************
513 
514 #pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code
515 
516 #endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
517 
518 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
519 
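// Hand-written SSE2/MMX implementation, used when AES-NI is not available. The
// "locals" argument points at the aligned stack workspace prepared by
// Rijndael::Enc::AdvancedProcessBlocks below; the round keys are copied into it so
// the round loop touches only the stack and the Te table (see the timing notes at
// the top of the file).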
520 CRYPTOPP_NAKED void CRYPTOPP_FASTCALL Rijndael_Enc_AdvancedProcessBlocks(void *locals, const word32 *k)
521 {
522 #if CRYPTOPP_BOOL_X86
523 
524 #define L_REG esp
525 #define L_INDEX(i) (L_REG+512+i)
526 #define L_INXORBLOCKS L_INBLOCKS+4
527 #define L_OUTXORBLOCKS L_INBLOCKS+8
528 #define L_OUTBLOCKS L_INBLOCKS+12
529 #define L_INCREMENTS L_INDEX(16*15)
530 #define L_SP L_INDEX(16*16)
531 #define L_LENGTH L_INDEX(16*16+4)
532 #define L_KEYS_BEGIN L_INDEX(16*16+8)
533 
534 #define MOVD movd
535 #define MM(i) mm##i
536 
537 #define MXOR(a,b,c) \
538  AS2( movzx esi, b)\
539  AS2( movd mm7, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
540  AS2( pxor MM(a), mm7)\
541 
542 #define MMOV(a,b,c) \
543  AS2( movzx esi, b)\
544  AS2( movd MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
545 
546 #else
547 
548 #define L_REG r8
549 #define L_INDEX(i) (L_REG+i)
550 #define L_INXORBLOCKS L_INBLOCKS+8
551 #define L_OUTXORBLOCKS L_INBLOCKS+16
552 #define L_OUTBLOCKS L_INBLOCKS+24
553 #define L_INCREMENTS L_INDEX(16*16)
554 #define L_LENGTH L_INDEX(16*18+8)
555 #define L_KEYS_BEGIN L_INDEX(16*19)
556 
557 #define MOVD mov
558 #define MM_0 r9d
559 #define MM_1 r12d
560 #ifdef __GNUC__
561 #define MM_2 r11d
562 #else
563 #define MM_2 r10d
564 #endif
565 #define MM(i) MM_##i
566 
567 #define MXOR(a,b,c) \
568  AS2( movzx esi, b)\
569  AS2( xor MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
570 
571 #define MMOV(a,b,c) \
572  AS2( movzx esi, b)\
573  AS2( mov MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
574 
575 #endif
576 
577 #define L_SUBKEYS L_INDEX(0)
578 #define L_SAVED_X L_SUBKEYS
579 #define L_KEY12 L_INDEX(16*12)
580 #define L_LASTROUND L_INDEX(16*13)
581 #define L_INBLOCKS L_INDEX(16*14)
582 #define MAP0TO4(i) (ASM_MOD(i+3,4)+1)
583 
584 #define XOR(a,b,c) \
585  AS2( movzx esi, b)\
586  AS2( xor a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
587 
588 #define MOV(a,b,c) \
589  AS2( movzx esi, b)\
590  AS2( mov a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
591 
592 #ifdef CRYPTOPP_GENERATE_X64_MASM
593  ALIGN 8
594  Rijndael_Enc_AdvancedProcessBlocks PROC FRAME
595  rex_push_reg rsi
596  push_reg rdi
597  push_reg rbx
598  push_reg r12
599  .endprolog
600  mov L_REG, rcx
601  mov AS_REG_7, ?Te@rdtable@CryptoPP@@3PA_KA
602  mov edi, DWORD PTR [?g_cacheLineSize@CryptoPP@@3IA]
603 #elif defined(__GNUC__)
604  __asm__ __volatile__
605  (
606  ".intel_syntax noprefix;"
607  #if CRYPTOPP_BOOL_X64
608  AS2( mov L_REG, rcx)
609  #endif
610  AS_PUSH_IF86(bx)
611  AS_PUSH_IF86(bp)
612  AS2( mov AS_REG_7, WORD_REG(si))
613 #else
614  AS_PUSH_IF86(si)
615  AS_PUSH_IF86(di)
616  AS_PUSH_IF86(bx)
617  AS_PUSH_IF86(bp)
618  AS2( lea AS_REG_7, [Te])
619  AS2( mov edi, [g_cacheLineSize])
620 #endif
621 
622 #if CRYPTOPP_BOOL_X86
623  AS2( mov [ecx+16*12+16*4], esp) // save esp to L_SP
624  AS2( lea esp, [ecx-512])
625 #endif
626 
627  // copy subkeys to stack
628  AS2( mov WORD_REG(si), [L_KEYS_BEGIN])
629  AS2( mov WORD_REG(ax), 16)
630  AS2( and WORD_REG(ax), WORD_REG(si))
631  AS2( movdqa xmm3, XMMWORD_PTR [WORD_REG(dx)+16+WORD_REG(ax)]) // subkey 1 (non-counter) or 2 (counter)
632  AS2( movdqa [L_KEY12], xmm3)
633  AS2( lea WORD_REG(ax), [WORD_REG(dx)+WORD_REG(ax)+2*16])
634  AS2( sub WORD_REG(ax), WORD_REG(si))
635  ASL(0)
636  AS2( movdqa xmm0, [WORD_REG(ax)+WORD_REG(si)])
637  AS2( movdqa XMMWORD_PTR [L_SUBKEYS+WORD_REG(si)], xmm0)
638  AS2( add WORD_REG(si), 16)
639  AS2( cmp WORD_REG(si), 16*12)
640  ASJ( jl, 0, b)
641 
642  // read subkeys 0, 1 and last
643  AS2( movdqa xmm4, [WORD_REG(ax)+WORD_REG(si)]) // last subkey
644  AS2( movdqa xmm1, [WORD_REG(dx)]) // subkey 0
645  AS2( MOVD MM(1), [WORD_REG(dx)+4*4]) // 0,1,2,3
646  AS2( mov ebx, [WORD_REG(dx)+5*4]) // 4,5,6,7
647  AS2( mov ecx, [WORD_REG(dx)+6*4]) // 8,9,10,11
648  AS2( mov edx, [WORD_REG(dx)+7*4]) // 12,13,14,15
649 
650  // load table into cache
651  AS2( xor WORD_REG(ax), WORD_REG(ax))
652  ASL(9)
653  AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
654  AS2( add WORD_REG(ax), WORD_REG(di))
655  AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
656  AS2( add WORD_REG(ax), WORD_REG(di))
657  AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
658  AS2( add WORD_REG(ax), WORD_REG(di))
659  AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
660  AS2( add WORD_REG(ax), WORD_REG(di))
661  AS2( cmp WORD_REG(ax), 2048)
662  ASJ( jl, 9, b)
663  AS1( lfence)
664 
665  AS2( test DWORD PTR [L_LENGTH], 1)
666  ASJ( jz, 8, f)
667 
668  // counter mode one-time setup
669  AS2( mov WORD_REG(si), [L_INBLOCKS])
670  AS2( movdqu xmm2, [WORD_REG(si)]) // counter
671  AS2( pxor xmm2, xmm1)
672  AS2( psrldq xmm1, 14)
673  AS2( movd eax, xmm1)
674  AS2( mov al, BYTE PTR [WORD_REG(si)+15])
675  AS2( MOVD MM(2), eax)
676 #if CRYPTOPP_BOOL_X86
677  AS2( mov eax, 1)
678  AS2( movd mm3, eax)
679 #endif
680 
681  // partial first round, in: xmm2(15,14,13,12;11,10,9,8;7,6,5,4;3,2,1,0), out: mm1, ebx, ecx, edx
682  AS2( movd eax, xmm2)
683  AS2( psrldq xmm2, 4)
684  AS2( movd edi, xmm2)
685  AS2( psrldq xmm2, 4)
686  MXOR( 1, al, 0) // 0
687  XOR( edx, ah, 1) // 1
688  AS2( shr eax, 16)
689  XOR( ecx, al, 2) // 2
690  XOR( ebx, ah, 3) // 3
691  AS2( mov eax, edi)
692  AS2( movd edi, xmm2)
693  AS2( psrldq xmm2, 4)
694  XOR( ebx, al, 0) // 4
695  MXOR( 1, ah, 1) // 5
696  AS2( shr eax, 16)
697  XOR( edx, al, 2) // 6
698  XOR( ecx, ah, 3) // 7
699  AS2( mov eax, edi)
700  AS2( movd edi, xmm2)
701  XOR( ecx, al, 0) // 8
702  XOR( ebx, ah, 1) // 9
703  AS2( shr eax, 16)
704  MXOR( 1, al, 2) // 10
705  XOR( edx, ah, 3) // 11
706  AS2( mov eax, edi)
707  XOR( edx, al, 0) // 12
708  XOR( ecx, ah, 1) // 13
709  AS2( shr eax, 16)
710  XOR( ebx, al, 2) // 14
711  AS2( psrldq xmm2, 3)
712 
713  // partial second round, in: ebx(4,5,6,7), ecx(8,9,10,11), edx(12,13,14,15), out: eax, ebx, edi, mm0
714  AS2( mov eax, [L_KEY12+0*4])
715  AS2( mov edi, [L_KEY12+2*4])
716  AS2( MOVD MM(0), [L_KEY12+3*4])
717  MXOR( 0, cl, 3) /* 11 */
718  XOR( edi, bl, 3) /* 7 */
719  MXOR( 0, bh, 2) /* 6 */
720  AS2( shr ebx, 16) /* 4,5 */
721  XOR( eax, bl, 1) /* 5 */
722  MOV( ebx, bh, 0) /* 4 */
723  AS2( xor ebx, [L_KEY12+1*4])
724  XOR( eax, ch, 2) /* 10 */
725  AS2( shr ecx, 16) /* 8,9 */
726  XOR( eax, dl, 3) /* 15 */
727  XOR( ebx, dh, 2) /* 14 */
728  AS2( shr edx, 16) /* 12,13 */
729  XOR( edi, ch, 0) /* 8 */
730  XOR( ebx, cl, 1) /* 9 */
731  XOR( edi, dl, 1) /* 13 */
732  MXOR( 0, dh, 0) /* 12 */
733 
734  AS2( movd ecx, xmm2)
735  AS2( MOVD edx, MM(1))
736  AS2( MOVD [L_SAVED_X+3*4], MM(0))
737  AS2( mov [L_SAVED_X+0*4], eax)
738  AS2( mov [L_SAVED_X+1*4], ebx)
739  AS2( mov [L_SAVED_X+2*4], edi)
740  ASJ( jmp, 5, f)
741 
742  ASL(3)
743  // non-counter mode per-block setup
744  AS2( MOVD MM(1), [L_KEY12+0*4]) // 0,1,2,3
745  AS2( mov ebx, [L_KEY12+1*4]) // 4,5,6,7
746  AS2( mov ecx, [L_KEY12+2*4]) // 8,9,10,11
747  AS2( mov edx, [L_KEY12+3*4]) // 12,13,14,15
748  ASL(8)
749  AS2( mov WORD_REG(ax), [L_INBLOCKS])
750  AS2( movdqu xmm2, [WORD_REG(ax)])
751  AS2( mov WORD_REG(si), [L_INXORBLOCKS])
752  AS2( movdqu xmm5, [WORD_REG(si)])
753  AS2( pxor xmm2, xmm1)
754  AS2( pxor xmm2, xmm5)
755 
756  // first round, in: xmm2(15,14,13,12;11,10,9,8;7,6,5,4;3,2,1,0), out: eax, ebx, ecx, edx
757  AS2( movd eax, xmm2)
758  AS2( psrldq xmm2, 4)
759  AS2( movd edi, xmm2)
760  AS2( psrldq xmm2, 4)
761  MXOR( 1, al, 0) // 0
762  XOR( edx, ah, 1) // 1
763  AS2( shr eax, 16)
764  XOR( ecx, al, 2) // 2
765  XOR( ebx, ah, 3) // 3
766  AS2( mov eax, edi)
767  AS2( movd edi, xmm2)
768  AS2( psrldq xmm2, 4)
769  XOR( ebx, al, 0) // 4
770  MXOR( 1, ah, 1) // 5
771  AS2( shr eax, 16)
772  XOR( edx, al, 2) // 6
773  XOR( ecx, ah, 3) // 7
774  AS2( mov eax, edi)
775  AS2( movd edi, xmm2)
776  XOR( ecx, al, 0) // 8
777  XOR( ebx, ah, 1) // 9
778  AS2( shr eax, 16)
779  MXOR( 1, al, 2) // 10
780  XOR( edx, ah, 3) // 11
781  AS2( mov eax, edi)
782  XOR( edx, al, 0) // 12
783  XOR( ecx, ah, 1) // 13
784  AS2( shr eax, 16)
785  XOR( ebx, al, 2) // 14
786  MXOR( 1, ah, 3) // 15
787  AS2( MOVD eax, MM(1))
788 
789  AS2( add L_REG, [L_KEYS_BEGIN])
790  AS2( add L_REG, 4*16)
791  ASJ( jmp, 2, f)
792 
793  ASL(1)
794  // counter-mode per-block setup
795  AS2( MOVD ecx, MM(2))
796  AS2( MOVD edx, MM(1))
797  AS2( mov eax, [L_SAVED_X+0*4])
798  AS2( mov ebx, [L_SAVED_X+1*4])
799  AS2( xor cl, ch)
800  AS2( and WORD_REG(cx), 255)
801  ASL(5)
802 #if CRYPTOPP_BOOL_X86
803  AS2( paddb MM(2), mm3)
804 #else
805  AS2( add MM(2), 1)
806 #endif
807  // remaining part of second round, in: edx(previous round), esi(keyed counter byte), eax, ebx, [L_SAVED_X+2*4], [L_SAVED_X+3*4], out: eax,ebx,ecx,edx
808  AS2( xor edx, DWORD PTR [AS_REG_7+WORD_REG(cx)*8+3])
809  XOR( ebx, dl, 3)
810  MOV( ecx, dh, 2)
811  AS2( shr edx, 16)
812  AS2( xor ecx, [L_SAVED_X+2*4])
813  XOR( eax, dh, 0)
814  MOV( edx, dl, 1)
815  AS2( xor edx, [L_SAVED_X+3*4])
816 
817  AS2( add L_REG, [L_KEYS_BEGIN])
818  AS2( add L_REG, 3*16)
819  ASJ( jmp, 4, f)
820 
821 // in: eax(0,1,2,3), ebx(4,5,6,7), ecx(8,9,10,11), edx(12,13,14,15)
822 // out: eax, ebx, edi, mm0
823 #define ROUND() \
824  MXOR( 0, cl, 3) /* 11 */\
825  AS2( mov cl, al) /* 8,9,10,3 */\
826  XOR( edi, ah, 2) /* 2 */\
827  AS2( shr eax, 16) /* 0,1 */\
828  XOR( edi, bl, 3) /* 7 */\
829  MXOR( 0, bh, 2) /* 6 */\
830  AS2( shr ebx, 16) /* 4,5 */\
831  MXOR( 0, al, 1) /* 1 */\
832  MOV( eax, ah, 0) /* 0 */\
833  XOR( eax, bl, 1) /* 5 */\
834  MOV( ebx, bh, 0) /* 4 */\
835  XOR( eax, ch, 2) /* 10 */\
836  XOR( ebx, cl, 3) /* 3 */\
837  AS2( shr ecx, 16) /* 8,9 */\
838  XOR( eax, dl, 3) /* 15 */\
839  XOR( ebx, dh, 2) /* 14 */\
840  AS2( shr edx, 16) /* 12,13 */\
841  XOR( edi, ch, 0) /* 8 */\
842  XOR( ebx, cl, 1) /* 9 */\
843  XOR( edi, dl, 1) /* 13 */\
844  MXOR( 0, dh, 0) /* 12 */\
845 
846  ASL(2) // 2-round loop
847  AS2( MOVD MM(0), [L_SUBKEYS-4*16+3*4])
848  AS2( mov edi, [L_SUBKEYS-4*16+2*4])
849  ROUND()
850  AS2( mov ecx, edi)
851  AS2( xor eax, [L_SUBKEYS-4*16+0*4])
852  AS2( xor ebx, [L_SUBKEYS-4*16+1*4])
853  AS2( MOVD edx, MM(0))
854 
855  ASL(4)
856  AS2( MOVD MM(0), [L_SUBKEYS-4*16+7*4])
857  AS2( mov edi, [L_SUBKEYS-4*16+6*4])
858  ROUND()
859  AS2( mov ecx, edi)
860  AS2( xor eax, [L_SUBKEYS-4*16+4*4])
861  AS2( xor ebx, [L_SUBKEYS-4*16+5*4])
862  AS2( MOVD edx, MM(0))
863 
864  AS2( add L_REG, 32)
865  AS2( test L_REG, 255)
866  ASJ( jnz, 2, b)
867  AS2( sub L_REG, 16*16)
868 
869 #define LAST(a, b, c) \
870  AS2( movzx esi, a )\
871  AS2( movzx edi, BYTE PTR [AS_REG_7+WORD_REG(si)*8+1] )\
872  AS2( movzx esi, b )\
873  AS2( xor edi, DWORD PTR [AS_REG_7+WORD_REG(si)*8+0] )\
874  AS2( mov WORD PTR [L_LASTROUND+c], di )\
875 
876  // last round
877  LAST(ch, dl, 2)
878  LAST(dh, al, 6)
879  AS2( shr edx, 16)
880  LAST(ah, bl, 10)
881  AS2( shr eax, 16)
882  LAST(bh, cl, 14)
883  AS2( shr ebx, 16)
884  LAST(dh, al, 12)
885  AS2( shr ecx, 16)
886  LAST(ah, bl, 0)
887  LAST(bh, cl, 4)
888  LAST(ch, dl, 8)
889 
890  AS2( mov WORD_REG(ax), [L_OUTXORBLOCKS])
891  AS2( mov WORD_REG(bx), [L_OUTBLOCKS])
892 
893  AS2( mov WORD_REG(cx), [L_LENGTH])
894  AS2( sub WORD_REG(cx), 16)
895 
896  AS2( movdqu xmm2, [WORD_REG(ax)])
897  AS2( pxor xmm2, xmm4)
898 
899 #if CRYPTOPP_BOOL_X86
900  AS2( movdqa xmm0, [L_INCREMENTS])
901  AS2( paddd xmm0, [L_INBLOCKS])
902  AS2( movdqa [L_INBLOCKS], xmm0)
903 #else
904  AS2( movdqa xmm0, [L_INCREMENTS+16])
905  AS2( paddq xmm0, [L_INBLOCKS+16])
906  AS2( movdqa [L_INBLOCKS+16], xmm0)
907 #endif
908 
909  AS2( pxor xmm2, [L_LASTROUND])
910  AS2( movdqu [WORD_REG(bx)], xmm2)
911 
912  ASJ( jle, 7, f)
913  AS2( mov [L_LENGTH], WORD_REG(cx))
914  AS2( test WORD_REG(cx), 1)
915  ASJ( jnz, 1, b)
916 #if CRYPTOPP_BOOL_X64
917  AS2( movdqa xmm0, [L_INCREMENTS])
918  AS2( paddq xmm0, [L_INBLOCKS])
919  AS2( movdqa [L_INBLOCKS], xmm0)
920 #endif
921  ASJ( jmp, 3, b)
922 
923  ASL(7)
924  // erase keys on stack
925  AS2( xorps xmm0, xmm0)
926  AS2( lea WORD_REG(ax), [L_SUBKEYS+7*16])
927  AS2( movaps [WORD_REG(ax)-7*16], xmm0)
928  AS2( movaps [WORD_REG(ax)-6*16], xmm0)
929  AS2( movaps [WORD_REG(ax)-5*16], xmm0)
930  AS2( movaps [WORD_REG(ax)-4*16], xmm0)
931  AS2( movaps [WORD_REG(ax)-3*16], xmm0)
932  AS2( movaps [WORD_REG(ax)-2*16], xmm0)
933  AS2( movaps [WORD_REG(ax)-1*16], xmm0)
934  AS2( movaps [WORD_REG(ax)+0*16], xmm0)
935  AS2( movaps [WORD_REG(ax)+1*16], xmm0)
936  AS2( movaps [WORD_REG(ax)+2*16], xmm0)
937  AS2( movaps [WORD_REG(ax)+3*16], xmm0)
938  AS2( movaps [WORD_REG(ax)+4*16], xmm0)
939  AS2( movaps [WORD_REG(ax)+5*16], xmm0)
940  AS2( movaps [WORD_REG(ax)+6*16], xmm0)
941 #if CRYPTOPP_BOOL_X86
942  AS2( mov esp, [L_SP])
943  AS1( emms)
944 #endif
945  AS_POP_IF86(bp)
946  AS_POP_IF86(bx)
947 #if defined(_MSC_VER) && CRYPTOPP_BOOL_X86
948  AS_POP_IF86(di)
949  AS_POP_IF86(si)
950  AS1(ret)
951 #endif
952 #ifdef CRYPTOPP_GENERATE_X64_MASM
953  pop r12
954  pop rbx
955  pop rdi
956  pop rsi
957  ret
958  Rijndael_Enc_AdvancedProcessBlocks ENDP
959 #endif
960 #ifdef __GNUC__
961  ".att_syntax prefix;"
962  :
963  : "c" (locals), "d" (k), "S" (Te), "D" (g_cacheLineSize)
964  : "memory", "cc", "%eax"
965  #if CRYPTOPP_BOOL_X64
966  , "%rbx", "%r8", "%r9", "%r10", "%r11", "%r12"
967  #endif
968  );
969 #endif
970 }
971 
972 #endif
973 
974 #ifndef CRYPTOPP_GENERATE_X64_MASM
975 
976 #ifdef CRYPTOPP_X64_MASM_AVAILABLE
977 extern "C" {
978 void Rijndael_Enc_AdvancedProcessBlocks(void *locals, const word32 *k);
979 }
980 #endif
981 
982 #if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X86
983 
984 static inline bool AliasedWithTable(const byte *begin, const byte *end)
985 {
986  size_t s0 = size_t(begin)%4096, s1 = size_t(end)%4096;
987  size_t t0 = size_t(Te)%4096, t1 = (size_t(Te)+sizeof(Te))%4096;
988  if (t1 > t0)
989  return (s0 >= t0 && s0 < t1) || (s1 > t0 && s1 <= t1);
990  else
991  return (s0 < t1 || s1 <= t1) || (s0 >= t0 || s1 > t0);
992 }
993 
994 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
995 
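// One block through the AES-NI pipeline: initial whitening XOR, rounds-1 AESENC
// rounds, and a final AESENCLAST (which omits MixColumns).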
996 inline void AESNI_Enc_Block(__m128i &block, const __m128i *subkeys, unsigned int rounds)
997 {
998  block = _mm_xor_si128(block, subkeys[0]);
999  for (unsigned int i=1; i<rounds-1; i+=2)
1000  {
1001  block = _mm_aesenc_si128(block, subkeys[i]);
1002  block = _mm_aesenc_si128(block, subkeys[i+1]);
1003  }
1004  block = _mm_aesenc_si128(block, subkeys[rounds-1]);
1005  block = _mm_aesenclast_si128(block, subkeys[rounds]);
1006 }
1007 
1008 inline void AESNI_Enc_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3, const __m128i *subkeys, unsigned int rounds)
1009 {
1010  __m128i rk = subkeys[0];
1011  block0 = _mm_xor_si128(block0, rk);
1012  block1 = _mm_xor_si128(block1, rk);
1013  block2 = _mm_xor_si128(block2, rk);
1014  block3 = _mm_xor_si128(block3, rk);
1015  for (unsigned int i=1; i<rounds; i++)
1016  {
1017  rk = subkeys[i];
1018  block0 = _mm_aesenc_si128(block0, rk);
1019  block1 = _mm_aesenc_si128(block1, rk);
1020  block2 = _mm_aesenc_si128(block2, rk);
1021  block3 = _mm_aesenc_si128(block3, rk);
1022  }
1023  rk = subkeys[rounds];
1024  block0 = _mm_aesenclast_si128(block0, rk);
1025  block1 = _mm_aesenclast_si128(block1, rk);
1026  block2 = _mm_aesenclast_si128(block2, rk);
1027  block3 = _mm_aesenclast_si128(block3, rk);
1028 }
1029 
1030 inline void AESNI_Dec_Block(__m128i &block, const __m128i *subkeys, unsigned int rounds)
1031 {
1032  block = _mm_xor_si128(block, subkeys[0]);
1033  for (unsigned int i=1; i<rounds-1; i+=2)
1034  {
1035  block = _mm_aesdec_si128(block, subkeys[i]);
1036  block = _mm_aesdec_si128(block, subkeys[i+1]);
1037  }
1038  block = _mm_aesdec_si128(block, subkeys[rounds-1]);
1039  block = _mm_aesdeclast_si128(block, subkeys[rounds]);
1040 }
1041 
1042 inline void AESNI_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3, const __m128i *subkeys, unsigned int rounds)
1043 {
1044  __m128i rk = subkeys[0];
1045  block0 = _mm_xor_si128(block0, rk);
1046  block1 = _mm_xor_si128(block1, rk);
1047  block2 = _mm_xor_si128(block2, rk);
1048  block3 = _mm_xor_si128(block3, rk);
1049  for (unsigned int i=1; i<rounds; i++)
1050  {
1051  rk = subkeys[i];
1052  block0 = _mm_aesdec_si128(block0, rk);
1053  block1 = _mm_aesdec_si128(block1, rk);
1054  block2 = _mm_aesdec_si128(block2, rk);
1055  block3 = _mm_aesdec_si128(block3, rk);
1056  }
1057  rk = subkeys[rounds];
1058  block0 = _mm_aesdeclast_si128(block0, rk);
1059  block1 = _mm_aesdeclast_si128(block1, rk);
1060  block2 = _mm_aesdeclast_si128(block2, rk);
1061  block3 = _mm_aesdeclast_si128(block3, rk);
1062 }
1063 
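// The constant 1 in the last byte of a big-endian counter block; the 4-block path
// below steps the counter with 32-bit adds of this value. As in the single-block
// path (which simply does inBlocks[15]++), any carry out of that byte is not
// propagated here and is expected to be handled by the higher-level CTR code.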
1064 static CRYPTOPP_ALIGN_DATA(16) const word32 s_one[] = {0, 0, 0, 1<<24};
1065 
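// Generic AES-NI driver: func1/func4 encrypt or decrypt one or four blocks with
// the given round keys. The flags select counter incrementing, XOR of the input or
// output with xorBlocks, reverse-direction processing, and whether four blocks may
// be processed in parallel.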
1066 template <typename F1, typename F4>
1067 inline size_t AESNI_AdvancedProcessBlocks(F1 func1, F4 func4, const __m128i *subkeys, unsigned int rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1068 {
1069  size_t blockSize = 16;
1070  size_t inIncrement = (flags & (BlockTransformation::BT_InBlockIsCounter|BlockTransformation::BT_DontIncrementInOutPointers)) ? 0 : blockSize;
1071  size_t xorIncrement = xorBlocks ? blockSize : 0;
1072  size_t outIncrement = (flags & BlockTransformation::BT_DontIncrementInOutPointers) ? 0 : blockSize;
1073 
1074  if (flags & BlockTransformation::BT_ReverseDirection)
1075  {
1076  assert(length % blockSize == 0);
1077  inBlocks += length - blockSize;
1078  xorBlocks += length - blockSize;
1079  outBlocks += length - blockSize;
1080  inIncrement = 0-inIncrement;
1081  xorIncrement = 0-xorIncrement;
1082  outIncrement = 0-outIncrement;
1083  }
1084 
1085  if (flags & BlockTransformation::BT_AllowParallel)
1086  {
1087  while (length >= 4*blockSize)
1088  {
1089  __m128i block0 = _mm_loadu_si128((const __m128i *)inBlocks), block1, block2, block3;
1090  if (flags & BlockTransformation::BT_InBlockIsCounter)
1091  {
1092  const __m128i be1 = *(const __m128i *)s_one;
1093  block1 = _mm_add_epi32(block0, be1);
1094  block2 = _mm_add_epi32(block1, be1);
1095  block3 = _mm_add_epi32(block2, be1);
1096  _mm_storeu_si128((__m128i *)inBlocks, _mm_add_epi32(block3, be1));
1097  }
1098  else
1099  {
1100  inBlocks += inIncrement;
1101  block1 = _mm_loadu_si128((const __m128i *)inBlocks);
1102  inBlocks += inIncrement;
1103  block2 = _mm_loadu_si128((const __m128i *)inBlocks);
1104  inBlocks += inIncrement;
1105  block3 = _mm_loadu_si128((const __m128i *)inBlocks);
1106  inBlocks += inIncrement;
1107  }
1108 
1109  if (flags & BlockTransformation::BT_XorInput)
1110  {
1111  block0 = _mm_xor_si128(block0, _mm_loadu_si128((const __m128i *)xorBlocks));
1112  xorBlocks += xorIncrement;
1113  block1 = _mm_xor_si128(block1, _mm_loadu_si128((const __m128i *)xorBlocks));
1114  xorBlocks += xorIncrement;
1115  block2 = _mm_xor_si128(block2, _mm_loadu_si128((const __m128i *)xorBlocks));
1116  xorBlocks += xorIncrement;
1117  block3 = _mm_xor_si128(block3, _mm_loadu_si128((const __m128i *)xorBlocks));
1118  xorBlocks += xorIncrement;
1119  }
1120 
1121  func4(block0, block1, block2, block3, subkeys, rounds);
1122 
1123  if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
1124  {
1125  block0 = _mm_xor_si128(block0, _mm_loadu_si128((const __m128i *)xorBlocks));
1126  xorBlocks += xorIncrement;
1127  block1 = _mm_xor_si128(block1, _mm_loadu_si128((const __m128i *)xorBlocks));
1128  xorBlocks += xorIncrement;
1129  block2 = _mm_xor_si128(block2, _mm_loadu_si128((const __m128i *)xorBlocks));
1130  xorBlocks += xorIncrement;
1131  block3 = _mm_xor_si128(block3, _mm_loadu_si128((const __m128i *)xorBlocks));
1132  xorBlocks += xorIncrement;
1133  }
1134 
1135  _mm_storeu_si128((__m128i *)outBlocks, block0);
1136  outBlocks += outIncrement;
1137  _mm_storeu_si128((__m128i *)outBlocks, block1);
1138  outBlocks += outIncrement;
1139  _mm_storeu_si128((__m128i *)outBlocks, block2);
1140  outBlocks += outIncrement;
1141  _mm_storeu_si128((__m128i *)outBlocks, block3);
1142  outBlocks += outIncrement;
1143 
1144  length -= 4*blockSize;
1145  }
1146  }
1147 
1148  while (length >= blockSize)
1149  {
1150  __m128i block = _mm_loadu_si128((const __m128i *)inBlocks);
1151 
1152  if (flags & BlockTransformation::BT_XorInput)
1153  block = _mm_xor_si128(block, _mm_loadu_si128((const __m128i *)xorBlocks));
1154 
1155  if (flags & BlockTransformation::BT_InBlockIsCounter)
1156  const_cast<byte *>(inBlocks)[15]++;
1157 
1158  func1(block, subkeys, rounds);
1159 
1160  if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
1161  block = _mm_xor_si128(block, _mm_loadu_si128((const __m128i *)xorBlocks));
1162 
1163  _mm_storeu_si128((__m128i *)outBlocks, block);
1164 
1165  inBlocks += inIncrement;
1166  outBlocks += outIncrement;
1167  xorBlocks += xorIncrement;
1168  length -= blockSize;
1169  }
1170 
1171  return length;
1172 }
1173 #endif
1174 
1175 size_t Rijndael::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
1176 {
1177 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
1178  if (HasAESNI())
1179  return AESNI_AdvancedProcessBlocks(AESNI_Enc_Block, AESNI_Enc_4_Blocks, (const __m128i *)m_key.begin(), m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1180 #endif
1181 
1182 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
1183  if (HasSSE2())
1184  {
1185  if (length < BLOCKSIZE)
1186  return length;
1187 
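 // Parameter block handed to the assembly routine; its field layout must line up
 // with the L_* offsets defined in the assembly above.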
1188  struct Locals
1189  {
1190  word32 subkeys[4*12], workspace[8];
1191  const byte *inBlocks, *inXorBlocks, *outXorBlocks;
1192  byte *outBlocks;
1193  size_t inIncrement, inXorIncrement, outXorIncrement, outIncrement;
1194  size_t regSpill, lengthAndCounterFlag, keysBegin;
1195  };
1196 
1197  size_t increment = BLOCKSIZE;
1198  const byte* zeros = (byte *)(Te+256);
1199  byte *space;
1200 
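 // Carve a 256-byte-aligned scratch area out of the stack and retry until it does
 // not share 4 KB-page offsets with the Te table, so stack accesses cannot evict
 // the preloaded table lines (part of the timing-attack defense).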
1201  do {
1202  space = (byte *)alloca(255+sizeof(Locals));
1203  space += (256-(size_t)space%256)%256;
1204  }
1205  while (AliasedWithTable(space, space+sizeof(Locals)));
1206 
1207  if (flags & BT_ReverseDirection)
1208  {
1209  assert(length % BLOCKSIZE == 0);
1210  inBlocks += length - BLOCKSIZE;
1211  xorBlocks += length - BLOCKSIZE;
1212  outBlocks += length - BLOCKSIZE;
1213  increment = 0-increment;
1214  }
1215 
1216  Locals &locals = *(Locals *)space;
1217 
1218  locals.inBlocks = inBlocks;
1219  locals.inXorBlocks = (flags & BT_XorInput) && xorBlocks ? xorBlocks : zeros;
1220  locals.outXorBlocks = (flags & BT_XorInput) || !xorBlocks ? zeros : xorBlocks;
1221  locals.outBlocks = outBlocks;
1222 
1223  locals.inIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment;
1224  locals.inXorIncrement = (flags & BT_XorInput) && xorBlocks ? increment : 0;
1225  locals.outXorIncrement = (flags & BT_XorInput) || !xorBlocks ? 0 : increment;
1226  locals.outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment;
1227 
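 // The length is rounded down to whole blocks; bit 0 doubles as the "input block
 // is a counter" flag tested by the assembly (test [L_LENGTH], 1).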
1228  locals.lengthAndCounterFlag = length - (length%16) - bool(flags & BT_InBlockIsCounter);
1229  int keysToCopy = m_rounds - (flags & BT_InBlockIsCounter ? 3 : 2);
1230  locals.keysBegin = (12-keysToCopy)*16;
1231 
1232  Rijndael_Enc_AdvancedProcessBlocks(&locals, m_key);
1233  return length % BLOCKSIZE;
1234  }
1235 #endif
1236 
1237  return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
1238 }
1239 
1240 #endif
1241 
1242 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
1243 
1244 size_t Rijndael::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
1245 {
1246  if (HasAESNI())
1247  return AESNI_AdvancedProcessBlocks(AESNI_Dec_Block, AESNI_Dec_4_Blocks, (const __m128i *)m_key.begin(), m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1248 
1249  return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
1250 }
1251 
1252 #endif // #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
1253 
1254 NAMESPACE_END
1255 
1256 #endif
1257 #endif