Mercurial > trustbridge > nss-cmake-static
comparison nss/lib/freebl/intel-gcm-x86-masm.asm @ 0:1e5118fa0cb1
This is NSS with a CMake buildsystem.
To compile a static NSS library for Windows we've used the
Chromium-NSS fork and added a Cmake buildsystem to compile
it statically for Windows. See README.chromium for chromium
changes and README.trustbridge for our modifications.
author | Andre Heinecke <andre.heinecke@intevation.de> |
---|---|
date | Mon, 28 Jul 2014 10:47:06 +0200 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1e5118fa0cb1 |
---|---|
1 ; LICENSE: | |
2 ; This submission to NSS is to be made available under the terms of the | |
3 ; Mozilla Public License, v. 2.0. You can obtain one at http: | |
4 ; //mozilla.org/MPL/2.0/. | |
5 ;############################################################################### | |
6 ; Copyright(c) 2014, Intel Corp. | |
7 ; Developers and authors: | |
8 ; Shay Gueron and Vlad Krasnov | |
9 ; Intel Corporation, Israel Development Centre, Haifa, Israel | |
10 ; Please send feedback directly to crypto.feedback.alias@intel.com | |
11 | |
12 | |
; --- Assembler mode and constant data -----------------------------------
; 32-bit flat model with C-compatible naming; enable SSE/AVX mnemonics.
13 .MODEL FLAT, C | |
14 .XMM | |
15 | |
16 .DATA | |
17 ALIGN 16 | |
; 128-bit little-endian constants used to step the counter blocks.
18 Lone dq 1,0 | |
19 Ltwo dq 2,0 | |
; Byte-reversal mask for vpshufb (GCM blocks are big-endian on the wire).
20 Lbswap_mask db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 | |
; NOTE(review): Lshuff_mask is not referenced by any code visible in this
; file — possibly kept for parity with the x64 variant; verify upstream.
21 Lshuff_mask dq 0f0f0f0f0f0f0f0fh, 0f0f0f0f0f0f0f0fh | |
; GHASH reduction constant for the reflected representation of
; x^128 + x^7 + x^2 + x + 1 (the 0xc2... qword), used by vpclmulqdq folds.
22 Lpoly dq 01h, 0c200000000000000h | |
23 | |
24 .CODE | |
26 | |
; GFMUL DST, SRC1, SRC2: DST = SRC1 * SRC2 in GF(2^128) (GHASH field),
; reduced modulo the polynomial encoded in Lpoly.
; Method: karatsuba — TMP1 = product of low qwords, TMP4 = product of high
; qwords, TMP2 = middle term from the xored halves; the middle term is
; folded into the 256-bit product, which is then reduced in two
; vpclmulqdq/vpshufd rounds.  Clobbers TMP1..TMP4; DST may alias SRC1.
27 GFMUL MACRO DST, SRC1, SRC2, TMP1, TMP2, TMP3, TMP4 | |
28 vpclmulqdq TMP1, SRC2, SRC1, 0h | |
29 vpclmulqdq TMP4, SRC2, SRC1, 011h | |
30 | |
; build (a_hi ^ a_lo) and (b_hi ^ b_lo) via qword swap + xor
31 vpshufd TMP2, SRC2, 78 | |
32 vpshufd TMP3, SRC1, 78 | |
33 vpxor TMP2, TMP2, SRC2 | |
34 vpxor TMP3, TMP3, SRC1 | |
35 | |
; middle karatsuba term: mid = (a_hi^a_lo)*(b_hi^b_lo) ^ lo ^ hi
36 vpclmulqdq TMP2, TMP2, TMP3, 0h | |
37 vpxor TMP2, TMP2, TMP1 | |
38 vpxor TMP2, TMP2, TMP4 | |
39 | |
; fold the 128-bit middle term into the 256-bit product TMP4:TMP1
40 vpslldq TMP3, TMP2, 8 | |
41 vpsrldq TMP2, TMP2, 8 | |
42 | |
43 vpxor TMP1, TMP1, TMP3 | |
44 vpxor TMP4, TMP4, TMP2 | |
45 | |
; reduction round 1: multiply low half by Lpoly's high qword, swap halves
46 vpclmulqdq TMP2, TMP1, [Lpoly], 010h | |
47 vpshufd TMP3, TMP1, 78 | |
48 vpxor TMP1, TMP2, TMP3 | |
49 | |
; reduction round 2 (same fold again)
50 vpclmulqdq TMP2, TMP1, [Lpoly], 010h | |
51 vpshufd TMP3, TMP1, 78 | |
52 vpxor TMP1, TMP2, TMP3 | |
53 | |
; combine reduced low part with the high product
54 vpxor DST, TMP1, TMP4 | |
55 | |
56 ENDM | |
57 | |
58 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
59 ; | |
60 ; Generates the final GCM tag | |
61 ; void intel_aes_gcmTAG(unsigned char Htbl[16*16], | |
62 ; unsigned char *Tp, | |
63 ; unsigned int Mlen, | |
64 ; unsigned int Alen, | |
65 ; unsigned char* X0, | |
66 ; unsigned char* TAG); | |
67 ; | |
68 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
69 | |
70 ALIGN 16 | |
; intel_aes_gcmTAG(Htbl, Tp, Mlen, Alen, X0, TAG) — cdecl, 32-bit.
; Finalizes GHASH: folds the bit lengths of the message (Mlen) and AAD
; (Alen) into the state at *Tp, multiplies by H (Htbl[0]) via GFMUL,
; byte-swaps, xors with the encrypted first counter block *X0, and writes
; the 16-byte tag to *TAG.
71 intel_aes_gcmTAG PROC | |
72 | |
; register aliases (ebx is callee-saved and pushed below)
73 Htbl textequ <eax> | |
74 Tp textequ <ecx> | |
75 X0 textequ <edx> | |
76 TAG textequ <ebx> | |
77 | |
78 T textequ <xmm0> | |
79 TMP0 textequ <xmm1> | |
80 | |
81 push ebx | |
82 | |
; stack args after the push: [esp+8] = Htbl, Tp, Mlen, Alen, X0, TAG
83 mov Htbl, [esp + 2*4 + 0*4] | |
84 mov Tp, [esp + 2*4 + 1*4] | |
85 mov X0, [esp + 2*4 + 4*4] | |
86 mov TAG, [esp + 2*4 + 5*4] | |
87 | |
88 vzeroupper | |
89 vmovdqu T, XMMWORD PTR[Tp] | |
90 | |
; build the length block: Mlen in dword 0, Alen in dword 2, then shift
; each qword left by 3 to convert byte counts into bit counts
91 vpxor TMP0, TMP0, TMP0 | |
92 vpinsrd TMP0, TMP0, DWORD PTR[esp + 2*4 + 2*4], 0 | |
93 vpinsrd TMP0, TMP0, DWORD PTR[esp + 2*4 + 3*4], 2 | |
94 vpsllq TMP0, TMP0, 3 | |
95 | |
; T = (T ^ lengths) * H, then back to byte order and mask with E(K, Y0)
96 vpxor T, T, TMP0 | |
97 vmovdqu TMP0, XMMWORD PTR[Htbl] | |
98 GFMUL T, T, TMP0, xmm2, xmm3, xmm4, xmm5 | |
99 | |
100 vpshufb T, T, [Lbswap_mask] | |
101 vpxor T, T, [X0] | |
102 vmovdqu XMMWORD PTR[TAG], T | |
103 vzeroupper | |
104 | |
105 pop ebx | |
106 | |
107 ret | |
108 | |
109 intel_aes_gcmTAG ENDP | |
110 | |
111 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
112 ; | |
113 ; Generates the H table | |
114 ; void intel_aes_gcmINIT(unsigned char Htbl[16*16], unsigned char *KS, int NR); | |
115 ; | |
116 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
117 | |
118 ALIGN 16 | |
; intel_aes_gcmINIT(Htbl, KS, NR) — cdecl, 32-bit.
; Computes the hash key H = AES_encrypt(0) using the NR-round key schedule
; at KS, doubles it in GF(2^128), then fills Htbl:
;   Htbl[i]        = H^(i+1)                 for i = 0..7
;   Htbl[8+i]      = hi(H^(i+1)) ^ lo(H^(i+1))  (pre-folded karatsuba
;                    operand consumed by the KARATSUBA/KARATSUBA_AAD macros)
119 intel_aes_gcmINIT PROC | |
120 | |
121 Htbl textequ <eax> | |
122 KS textequ <ecx> | |
123 NR textequ <edx> | |
124 | |
125 T textequ <xmm0> | |
126 TMP0 textequ <xmm1> | |
127 | |
; cdecl args at esp+4 (no registers pushed)
128 mov Htbl, [esp + 4*1 + 0*4] | |
129 mov KS, [esp + 4*1 + 1*4] | |
130 mov NR, [esp + 4*1 + 2*4] | |
131 | |
132 vzeroupper | |
133 ; AES-ENC(0) | |
; the zero block xored with round key 0 is just round key 0 itself
134 vmovdqu T, XMMWORD PTR[KS] | |
135 lea KS, [16 + KS] | |
136 dec NR | |
; NR-1 middle rounds, then the final round below
137 Lenc_loop: | |
138 vaesenc T, T, [KS] | |
139 lea KS, [16 + KS] | |
140 dec NR | |
141 jnz Lenc_loop | |
142 | |
143 vaesenclast T, T, [KS] | |
144 vpshufb T, T, [Lbswap_mask] | |
145 | |
146 ;Calculate H` = GFMUL(H, 2) | |
; shift-left-by-one across the 128-bit value (per-dword shift with carry
; propagation via vpsrld/vpslldq) and conditionally xor Lpoly when the
; top bit overflows — doubling in the reflected GHASH representation
147 vpsrad xmm3, T, 31 | |
148 vpshufd xmm3, xmm3, 0ffh | |
149 vpand xmm5, xmm3, [Lpoly] | |
150 vpsrld xmm3, T, 31 | |
151 vpslld xmm4, T, 1 | |
152 vpslldq xmm3, xmm3, 4 | |
153 vpxor T, xmm4, xmm3 | |
154 vpxor T, T, xmm5 | |
155 | |
; TMP0 keeps H for the repeated multiplies; store H^1
156 vmovdqu TMP0, T | |
157 vmovdqu XMMWORD PTR[Htbl + 0*16], T | |
158 | |
; cache hi^lo of H^1 at Htbl+8*16 for the karatsuba middle term
159 vpshufd xmm2, T, 78 | |
160 vpxor xmm2, xmm2, T | |
161 vmovdqu XMMWORD PTR[Htbl + 8*16 + 0*16], xmm2 | |
162 | |
; assembly-time loop (unrolled by MASM): Htbl[i] = H^(i+1), i = 1..7
163 i = 1 | |
164 WHILE i LT 8 | |
165 GFMUL T, T, TMP0, xmm2, xmm3, xmm4, xmm5 | |
166 vmovdqu XMMWORD PTR[Htbl + i*16], T | |
167 vpshufd xmm2, T, 78 | |
168 vpxor xmm2, xmm2, T | |
169 vmovdqu XMMWORD PTR[Htbl + 8*16 + i*16], xmm2 | |
170 i = i+1 | |
171 ENDM | |
172 vzeroupper | |
173 ret | |
174 intel_aes_gcmINIT ENDP | |
175 | |
176 | |
177 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
178 ; | |
179 ; Authenticate only | |
180 ; void intel_aes_gcmAAD(unsigned char Htbl[16*16], unsigned char *AAD, unsigned int Alen, unsigned char *Tp); | |
181 ; | |
182 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
183 | |
184 ALIGN 16 | |
; intel_aes_gcmAAD(Htbl, AAD, Alen, Tp) — cdecl, 32-bit.
; Folds Alen bytes of additional authenticated data into the GHASH state
; at *Tp.  Blocks are aggregated 8 at a time with a single deferred
; reduction per group, using the precomputed powers H^1..H^8 in Htbl; a
; prefix of (Alen/16) mod 8 blocks is hashed first so the main loop only
; ever sees whole groups of 8.
185 intel_aes_gcmAAD PROC | |
186 | |
187 Htbl textequ <eax> | |
188 inp textequ <ecx> | |
189 len textequ <edx> | |
190 Tp textequ <ebx> | |
191 hlp0 textequ <esi> | |
192 | |
193 DATA textequ <xmm0> | |
194 T textequ <xmm1> | |
195 TMP0 textequ <xmm2> | |
196 TMP1 textequ <xmm3> | |
197 TMP2 textequ <xmm4> | |
198 TMP3 textequ <xmm5> | |
199 TMP4 textequ <xmm6> | |
200 Xhi textequ <xmm7> | |
201 | |
; accumulate DATA * Htbl[i] into the three karatsuba partial products:
; TMP0 = low, TMP1 = high, TMP2 = middle (all reduced later, at once)
202 KARATSUBA_AAD MACRO i | |
203 vpclmulqdq TMP3, DATA, [Htbl + i*16], 0h | |
204 vpxor TMP0, TMP0, TMP3 | |
205 vpclmulqdq TMP3, DATA, [Htbl + i*16], 011h | |
206 vpxor TMP1, TMP1, TMP3 | |
207 vpshufd TMP3, DATA, 78 | |
208 vpxor TMP3, TMP3, DATA | |
209 vpclmulqdq TMP3, TMP3, [Htbl + 8*16 + i*16], 0h | |
210 vpxor TMP2, TMP2, TMP3 | |
211 ENDM | |
212 | |
; early-out when Alen == 0 (no pushes yet, so args start at esp+4).
; NOTE(review): the "1*3" below breaks the 4*N offset pattern used by
; every other argument load in this file (expected 1*4 + 2*4 = esp+12 =
; Alen); as written this is a misaligned dword read at esp+11 spanning the
; AAD pointer's top byte and Alen's low bytes.  Possibly garbled in this
; export — verify against the upstream NSS source before relying on it.
213 cmp DWORD PTR[esp + 1*3 + 2*4], 0 | |
214 jnz LbeginAAD | |
215 ret | |
216 | |
217 LbeginAAD: | |
218 push ebx | |
219 push esi | |
220 | |
; args now at esp+12 (return address + 2 saved registers)
221 mov Htbl, [esp + 4*3 + 0*4] | |
222 mov inp, [esp + 4*3 + 1*4] | |
223 mov len, [esp + 4*3 + 2*4] | |
224 mov Tp, [esp + 4*3 + 3*4] | |
225 | |
226 vzeroupper | |
227 | |
228 vpxor Xhi, Xhi, Xhi | |
229 | |
230 vmovdqu T, XMMWORD PTR[Tp] | |
231 ;we hash 8 blocks each iteration, if the total amount of blocks is not a multiple of 8, we hash the first n%8 blocks first | |
232 mov hlp0, len | |
233 and hlp0, 128-1 | |
234 jz Lmod_loop | |
235 | |
; len = multiple-of-128 remainder; hlp0 = 16*(prefix_blocks-1), used
; directly as the Htbl byte offset of the highest prefix power
236 and len, -128 | |
237 sub hlp0, 16 | |
238 | |
239 ; Prefix block | |
240 vmovdqu DATA, XMMWORD PTR[inp] | |
241 vpshufb DATA, DATA, [Lbswap_mask] | |
; the first prefix block absorbs the running tag T
242 vpxor DATA, DATA, T | |
243 | |
244 vpclmulqdq TMP0, DATA, XMMWORD PTR[Htbl + hlp0], 0h | |
245 vpclmulqdq TMP1, DATA, XMMWORD PTR[Htbl + hlp0], 011h | |
246 vpshufd TMP3, DATA, 78 | |
247 vpxor TMP3, TMP3, DATA | |
248 vpclmulqdq TMP2, TMP3, XMMWORD PTR[Htbl + 8*16 + hlp0], 0h | |
249 | |
250 lea inp, [inp+16] | |
251 test hlp0, hlp0 | |
252 jnz Lpre_loop | |
253 jmp Lred1 | |
254 | |
255 ;hash remaining prefix blocks (up to 7 total prefix blocks) | |
256 Lpre_loop: | |
257 | |
; each later block pairs with the next lower power of H
258 sub hlp0, 16 | |
259 | |
260 vmovdqu DATA, XMMWORD PTR[inp] | |
261 vpshufb DATA, DATA, [Lbswap_mask] | |
262 | |
263 vpclmulqdq TMP3, DATA, XMMWORD PTR[Htbl + hlp0], 0h | |
264 vpxor TMP0, TMP0, TMP3 | |
265 vpclmulqdq TMP3, DATA, XMMWORD PTR[Htbl + hlp0], 011h | |
266 vpxor TMP1, TMP1, TMP3 | |
267 vpshufd TMP3, DATA, 78 | |
268 vpxor TMP3, TMP3, DATA | |
269 vpclmulqdq TMP3, TMP3, XMMWORD PTR[Htbl + 8*16 + hlp0], 0h | |
270 vpxor TMP2, TMP2, TMP3 | |
271 | |
272 test hlp0, hlp0 | |
273 lea inp, [inp+16] | |
274 jnz Lpre_loop | |
275 | |
; karatsuba fixup for the prefix: fold the middle term and leave the
; unreduced 256-bit result in Xhi:T (reduction is deferred)
276 Lred1: | |
277 | |
278 vpxor TMP2, TMP2, TMP0 | |
279 vpxor TMP2, TMP2, TMP1 | |
280 vpsrldq TMP3, TMP2, 8 | |
281 vpslldq TMP2, TMP2, 8 | |
282 | |
283 vpxor Xhi, TMP1, TMP3 | |
284 vpxor T, TMP0, TMP2 | |
285 | |
; main loop: 8 blocks per iteration; the reduction of the previous Xhi:T
; is interleaved between multiplications to hide pclmulqdq latency
286 Lmod_loop: | |
287 | |
288 sub len, 16*8 | |
289 jb Ldone | |
290 ; Block #0 | |
291 vmovdqu DATA, XMMWORD PTR[inp + 16*7] | |
292 vpshufb DATA, DATA, XMMWORD PTR[Lbswap_mask] | |
293 | |
; last block of the group pairs with H^1 (starts the accumulators)
294 vpclmulqdq TMP0, DATA, XMMWORD PTR[Htbl + 0*16], 0h | |
295 vpclmulqdq TMP1, DATA, XMMWORD PTR[Htbl + 0*16], 011h | |
296 vpshufd TMP3, DATA, 78 | |
297 vpxor TMP3, TMP3, DATA | |
298 vpclmulqdq TMP2, TMP3, XMMWORD PTR[Htbl + 8*16 + 0*16], 0h | |
299 | |
300 ; Block #1 | |
301 vmovdqu DATA, XMMWORD PTR[inp + 16*6] | |
302 vpshufb DATA, DATA, [Lbswap_mask] | |
303 KARATSUBA_AAD 1 | |
304 | |
305 ; Block #2 | |
306 vmovdqu DATA, XMMWORD PTR[inp + 16*5] | |
307 vpshufb DATA, DATA, [Lbswap_mask] | |
308 | |
309 vpclmulqdq TMP4, T, [Lpoly], 010h ;reduction stage 1a | |
310 vpalignr T, T, T, 8 | |
311 | |
312 KARATSUBA_AAD 2 | |
313 | |
314 vpxor T, T, TMP4 ;reduction stage 1b | |
315 | |
316 ; Block #3 | |
317 vmovdqu DATA, XMMWORD PTR[inp + 16*4] | |
318 vpshufb DATA, DATA, [Lbswap_mask] | |
319 KARATSUBA_AAD 3 | |
320 ; Block #4 | |
321 vmovdqu DATA, XMMWORD PTR[inp + 16*3] | |
322 vpshufb DATA, DATA, [Lbswap_mask] | |
323 | |
324 vpclmulqdq TMP4, T, [Lpoly], 010h ;reduction stage 2a | |
325 vpalignr T, T, T, 8 | |
326 | |
327 KARATSUBA_AAD 4 | |
328 | |
329 vpxor T, T, TMP4 ;reduction stage 2b | |
330 ; Block #5 | |
331 vmovdqu DATA, XMMWORD PTR[inp + 16*2] | |
332 vpshufb DATA, DATA, [Lbswap_mask] | |
333 KARATSUBA_AAD 5 | |
334 | |
335 vpxor T, T, Xhi ;reduction finalize | |
336 ; Block #6 | |
337 vmovdqu DATA, XMMWORD PTR[inp + 16*1] | |
338 vpshufb DATA, DATA, [Lbswap_mask] | |
339 KARATSUBA_AAD 6 | |
340 ; Block #7 | |
341 vmovdqu DATA, XMMWORD PTR[inp + 16*0] | |
342 vpshufb DATA, DATA, [Lbswap_mask] | |
; the first block of the group (lowest address) absorbs the freshly
; reduced tag T and pairs with the highest power H^8
343 vpxor DATA, DATA, T | |
344 KARATSUBA_AAD 7 | |
345 ; Aggregated 8 blocks, now karatsuba fixup | |
346 vpxor TMP2, TMP2, TMP0 | |
347 vpxor TMP2, TMP2, TMP1 | |
348 vpsrldq TMP3, TMP2, 8 | |
349 vpslldq TMP2, TMP2, 8 | |
350 | |
351 vpxor Xhi, TMP1, TMP3 | |
352 vpxor T, TMP0, TMP2 | |
353 | |
354 lea inp, [inp + 16*8] | |
355 jmp Lmod_loop | |
356 | |
; final two-round reduction of Xhi:T, then store the updated tag
357 Ldone: | |
358 vpclmulqdq TMP4, T, [Lpoly], 010h | |
359 vpalignr T, T, T, 8 | |
360 vpxor T, T, TMP4 | |
361 | |
362 vpclmulqdq TMP4, T, [Lpoly], 010h | |
363 vpalignr T, T, T, 8 | |
364 vpxor T, T, TMP4 | |
365 | |
366 vpxor T, T, Xhi | |
367 vmovdqu XMMWORD PTR[Tp], T | |
368 vzeroupper | |
369 | |
370 pop esi | |
371 pop ebx | |
372 ret | |
373 | |
374 intel_aes_gcmAAD ENDP | |
375 | |
376 | |
377 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
378 ; | |
379 ; Encrypt and Authenticate | |
380 ; void intel_aes_gcmENC(unsigned char* PT, unsigned char* CT, void *Gctx, unsigned int len); | |
381 ; | |
382 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
383 | |
384 ALIGN 16 | |
; intel_aes_gcmENC(PT, CT, Gctx, len) — cdecl, 32-bit.
; CTR-mode encrypts len bytes from PT to CT and folds the ciphertext into
; the GHASH state T stored inside Gctx, 7 blocks per pipeline iteration.
; Gctx layout as read here: Htbl at +0 (16*16 bytes), T at +16*16+1*16,
; counter block at +16*16+2*16, key-schedule pointer at +16*16+3*16.
; NR (round count) is read from the key-schedule struct at KS-40 after
; KS has been advanced by 44 to the round keys.
385 intel_aes_gcmENC PROC | |
386 | |
; note Htbl and Gctx alias edx; len lives on the caller's stack via ebp
387 PT textequ <eax> | |
388 CT textequ <ecx> | |
389 Htbl textequ <edx> | |
390 Gctx textequ <edx> | |
391 len textequ <DWORD PTR[ebp + 5*4 + 3*4]> | |
392 KS textequ <esi> | |
393 NR textequ <DWORD PTR[-40 + KS]> | |
394 | |
395 aluCTR textequ <ebx> | |
396 aluTMP textequ <edi> | |
397 | |
; T is a memory operand (the GHASH state inside Gctx) — only 8 xmm regs
; exist in 32-bit mode, so the TMPn and CTRn aliases overlap deliberately
398 T textequ <XMMWORD PTR[16*16 + 1*16 + Gctx]> | |
399 TMP0 textequ <xmm1> | |
400 TMP1 textequ <xmm2> | |
401 TMP2 textequ <xmm3> | |
402 TMP3 textequ <xmm4> | |
403 TMP4 textequ <xmm5> | |
404 TMP5 textequ <xmm6> | |
405 | |
406 CTR0 textequ <xmm0> | |
407 CTR1 textequ <xmm1> | |
408 CTR2 textequ <xmm2> | |
409 CTR3 textequ <xmm3> | |
410 CTR4 textequ <xmm4> | |
411 CTR5 textequ <xmm5> | |
412 CTR6 textequ <xmm6> | |
413 | |
; apply AES round i of the schedule to all 7 counter blocks
414 ROUND MACRO i | |
415 vmovdqu xmm7, XMMWORD PTR[i*16 + KS] | |
416 vaesenc CTR0, CTR0, xmm7 | |
417 vaesenc CTR1, CTR1, xmm7 | |
418 vaesenc CTR2, CTR2, xmm7 | |
419 vaesenc CTR3, CTR3, xmm7 | |
420 vaesenc CTR4, CTR4, xmm7 | |
421 vaesenc CTR5, CTR5, xmm7 | |
422 vaesenc CTR6, CTR6, xmm7 | |
423 ENDM | |
424 | |
; accumulate TMP5 * Htbl[i] into TMP0 (middle), TMP1 (high), TMP2 (low)
425 KARATSUBA MACRO i | |
426 vpshufd TMP4, TMP5, 78 | |
427 vpxor TMP4, TMP4, TMP5 | |
428 vpclmulqdq TMP3, TMP4, XMMWORD PTR[i*16 + 8*16 + Htbl], 000h | |
429 vpxor TMP0, TMP0, TMP3 | |
430 vmovdqu TMP4, XMMWORD PTR[i*16 + Htbl] | |
431 vpclmulqdq TMP3, TMP5, TMP4, 011h | |
432 vpxor TMP1, TMP1, TMP3 | |
433 vpclmulqdq TMP3, TMP5, TMP4, 000h | |
434 vpxor TMP2, TMP2, TMP3 | |
435 ENDM | |
436 | |
; bump the 32-bit counter and patch only the last dword of the cached
; counter block at esp+8*16+i*16; the cached blocks are stored already
; xored with round key 0, hence the xor with the key's last dword
437 NEXTCTR MACRO i | |
438 add aluCTR, 1 | |
439 mov aluTMP, aluCTR | |
440 bswap aluTMP | |
441 xor aluTMP, [3*4 + KS] | |
442 mov [3*4 + 8*16 + i*16 + esp], aluTMP | |
443 ENDM | |
444 | |
; early-out when len == 0 (args at esp+4 before any push)
445 cmp DWORD PTR[1*4 + 3*4 + esp], 0 | |
446 jne LbeginENC | |
447 ret | |
448 | |
449 LbeginENC: | |
450 | |
451 vzeroupper | |
452 push ebp | |
453 push ebx | |
454 push esi | |
455 push edi | |
456 | |
; frame: ebp = old esp; 16*16 aligned scratch — saved GHASH inputs at
; esp+1*16..6*16, cached counter^rk0 blocks at esp+8*16..14*16
457 mov ebp, esp | |
458 sub esp, 16*16 | |
459 and esp, -16 | |
460 | |
461 mov PT, [ebp + 5*4 + 0*4] | |
462 mov CT, [ebp + 5*4 + 1*4] | |
463 mov Gctx, [ebp + 5*4 + 2*4] | |
464 | |
465 mov KS, [16*16 + 3*16 + Gctx] | |
466 lea KS, [44 + KS] | |
467 | |
; keep the 32-bit counter in host byte order in ebx for cheap increments
468 mov aluCTR, [16*16 + 2*16 + 3*4 + Gctx] | |
469 bswap aluCTR | |
470 | |
471 | |
; cache counter_block ^ round_key_0 in slot 0
472 vmovdqu TMP0, XMMWORD PTR[0*16 + KS] | |
473 vpxor TMP0, TMP0, XMMWORD PTR[16*16 + 2*16 + Gctx] | |
474 vmovdqu XMMWORD PTR[8*16 + 0*16 + esp], TMP0 | |
475 | |
476 cmp len, 16*7 | |
477 jb LEncDataSingles | |
478 ; Prepare the "top" counters | |
479 vmovdqu XMMWORD PTR[8*16 + 1*16 + esp], TMP0 | |
480 vmovdqu XMMWORD PTR[8*16 + 2*16 + esp], TMP0 | |
481 vmovdqu XMMWORD PTR[8*16 + 3*16 + esp], TMP0 | |
482 vmovdqu XMMWORD PTR[8*16 + 4*16 + esp], TMP0 | |
483 vmovdqu XMMWORD PTR[8*16 + 5*16 + esp], TMP0 | |
484 vmovdqu XMMWORD PTR[8*16 + 6*16 + esp], TMP0 | |
485 | |
; first 7 counters are built with SIMD adds on the byte-swapped block
486 vmovdqu CTR0, XMMWORD PTR[16*16 + 2*16 + Gctx] | |
487 vpshufb CTR0, CTR0, XMMWORD PTR[Lbswap_mask] | |
488 ; Encrypt the initial 7 blocks | |
489 sub len, 16*7 | |
490 vpaddd CTR1, CTR0, XMMWORD PTR[Lone] | |
491 vpaddd CTR2, CTR0, XMMWORD PTR[Ltwo] | |
492 vpaddd CTR3, CTR2, XMMWORD PTR[Lone] | |
493 vpaddd CTR4, CTR2, XMMWORD PTR[Ltwo] | |
494 vpaddd CTR5, CTR4, XMMWORD PTR[Lone] | |
495 vpaddd CTR6, CTR4, XMMWORD PTR[Ltwo] | |
496 | |
497 vpshufb CTR0, CTR0, XMMWORD PTR[Lbswap_mask] | |
498 vpshufb CTR1, CTR1, XMMWORD PTR[Lbswap_mask] | |
499 vpshufb CTR2, CTR2, XMMWORD PTR[Lbswap_mask] | |
500 vpshufb CTR3, CTR3, XMMWORD PTR[Lbswap_mask] | |
501 vpshufb CTR4, CTR4, XMMWORD PTR[Lbswap_mask] | |
502 vpshufb CTR5, CTR5, XMMWORD PTR[Lbswap_mask] | |
503 vpshufb CTR6, CTR6, XMMWORD PTR[Lbswap_mask] | |
504 | |
505 vmovdqu xmm7, XMMWORD PTR[0*16 + KS] | |
506 vpxor CTR0, CTR0, xmm7 | |
507 vpxor CTR1, CTR1, xmm7 | |
508 vpxor CTR2, CTR2, xmm7 | |
509 vpxor CTR3, CTR3, xmm7 | |
510 vpxor CTR4, CTR4, xmm7 | |
511 vpxor CTR5, CTR5, xmm7 | |
512 vpxor CTR6, CTR6, xmm7 | |
513 | |
514 ROUND 1 | |
515 | |
; advance the ALU counter past the 7 in-flight blocks and refresh the
; cached stack counters, interleaved with the AES rounds
516 add aluCTR, 7 | |
517 mov aluTMP, aluCTR | |
518 bswap aluTMP | |
519 xor aluTMP, [KS + 3*4] | |
520 mov [8*16 + 0*16 + 3*4 + esp], aluTMP | |
521 | |
522 ROUND 2 | |
523 NEXTCTR 1 | |
524 ROUND 3 | |
525 NEXTCTR 2 | |
526 ROUND 4 | |
527 NEXTCTR 3 | |
528 ROUND 5 | |
529 NEXTCTR 4 | |
530 ROUND 6 | |
531 NEXTCTR 5 | |
532 ROUND 7 | |
533 NEXTCTR 6 | |
534 ROUND 8 | |
535 ROUND 9 | |
; dispatch on key size: 10/12/14 rounds for AES-128/192/256
536 vmovdqu xmm7, XMMWORD PTR[10*16 + KS] | |
537 cmp NR, 10 | |
538 je @f | |
539 | |
540 ROUND 10 | |
541 ROUND 11 | |
542 vmovdqu xmm7, XMMWORD PTR[12*16 + KS] | |
543 cmp NR, 12 | |
544 je @f | |
545 | |
546 ROUND 12 | |
547 ROUND 13 | |
548 vmovdqu xmm7, XMMWORD PTR[14*16 + KS] | |
549 @@: | |
550 vaesenclast CTR0, CTR0, xmm7 | |
551 vaesenclast CTR1, CTR1, xmm7 | |
552 vaesenclast CTR2, CTR2, xmm7 | |
553 vaesenclast CTR3, CTR3, xmm7 | |
554 vaesenclast CTR4, CTR4, xmm7 | |
555 vaesenclast CTR5, CTR5, xmm7 | |
556 vaesenclast CTR6, CTR6, xmm7 | |
557 | |
558 vpxor CTR0, CTR0, XMMWORD PTR[0*16 + PT] | |
559 vpxor CTR1, CTR1, XMMWORD PTR[1*16 + PT] | |
560 vpxor CTR2, CTR2, XMMWORD PTR[2*16 + PT] | |
561 vpxor CTR3, CTR3, XMMWORD PTR[3*16 + PT] | |
562 vpxor CTR4, CTR4, XMMWORD PTR[4*16 + PT] | |
563 vpxor CTR5, CTR5, XMMWORD PTR[5*16 + PT] | |
564 vpxor CTR6, CTR6, XMMWORD PTR[6*16 + PT] | |
565 | |
566 vmovdqu XMMWORD PTR[0*16 + CT], CTR0 | |
567 vmovdqu XMMWORD PTR[1*16 + CT], CTR1 | |
568 vmovdqu XMMWORD PTR[2*16 + CT], CTR2 | |
569 vmovdqu XMMWORD PTR[3*16 + CT], CTR3 | |
570 vmovdqu XMMWORD PTR[4*16 + CT], CTR4 | |
571 vmovdqu XMMWORD PTR[5*16 + CT], CTR5 | |
572 vmovdqu XMMWORD PTR[6*16 + CT], CTR6 | |
573 | |
; byte-swap the ciphertext for GHASH: newest block stays in TMP5, the
; other six are parked on the stack (slot 1 = next-newest ... slot 6 =
; oldest) for the deferred multiply in LEncData7 / LEndEnc7
574 vpshufb CTR0, CTR0, XMMWORD PTR[Lbswap_mask] | |
575 vpshufb CTR1, CTR1, XMMWORD PTR[Lbswap_mask] | |
576 vpshufb CTR2, CTR2, XMMWORD PTR[Lbswap_mask] | |
577 vpshufb CTR3, CTR3, XMMWORD PTR[Lbswap_mask] | |
578 vpshufb CTR4, CTR4, XMMWORD PTR[Lbswap_mask] | |
579 vpshufb CTR5, CTR5, XMMWORD PTR[Lbswap_mask] | |
580 vpshufb TMP5, CTR6, XMMWORD PTR[Lbswap_mask] | |
581 | |
582 vmovdqa XMMWORD PTR[1*16 + esp], CTR5 | |
583 vmovdqa XMMWORD PTR[2*16 + esp], CTR4 | |
584 vmovdqa XMMWORD PTR[3*16 + esp], CTR3 | |
585 vmovdqa XMMWORD PTR[4*16 + esp], CTR2 | |
586 vmovdqa XMMWORD PTR[5*16 + esp], CTR1 | |
587 vmovdqa XMMWORD PTR[6*16 + esp], CTR0 | |
588 | |
589 lea CT, [7*16 + CT] | |
590 lea PT, [7*16 + PT] | |
591 jmp LEncData7 | |
592 | |
; main loop: GHASH the previous 7 ciphertext blocks while encrypting the
; next 7 counter blocks
593 LEncData7: | |
594 cmp len, 16*7 | |
595 jb LEndEnc7 | |
596 sub len, 16*7 | |
597 | |
; open the karatsuba accumulators with TMP5 (newest block) * H^1
598 vpshufd TMP4, TMP5, 78 | |
599 vpxor TMP4, TMP4, TMP5 | |
600 vpclmulqdq TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h | |
601 vmovdqu TMP4, XMMWORD PTR[0*16 + Htbl] | |
602 vpclmulqdq TMP1, TMP5, TMP4, 011h | |
603 vpclmulqdq TMP2, TMP5, TMP4, 000h | |
604 | |
605 vmovdqu TMP5, XMMWORD PTR[1*16 + esp] | |
606 KARATSUBA 1 | |
607 vmovdqu TMP5, XMMWORD PTR[2*16 + esp] | |
608 KARATSUBA 2 | |
609 vmovdqu TMP5, XMMWORD PTR[3*16 + esp] | |
610 KARATSUBA 3 | |
611 vmovdqu TMP5, XMMWORD PTR[4*16 + esp] | |
612 KARATSUBA 4 | |
613 vmovdqu TMP5, XMMWORD PTR[5*16 + esp] | |
614 KARATSUBA 5 | |
; oldest block absorbs the running tag T and pairs with H^7
615 vmovdqu TMP5, XMMWORD PTR[6*16 + esp] | |
616 vpxor TMP5, TMP5, T | |
617 KARATSUBA 6 | |
618 | |
; karatsuba fixup into TMP4 (high) : TMP5 (low), then two-round reduce
619 vpxor TMP0, TMP0, TMP1 | |
620 vpxor TMP0, TMP0, TMP2 | |
621 vpsrldq TMP3, TMP0, 8 | |
622 vpxor TMP4, TMP1, TMP3 | |
623 vpslldq TMP3, TMP0, 8 | |
624 vpxor TMP5, TMP2, TMP3 | |
625 | |
626 vpclmulqdq TMP1, TMP5, XMMWORD PTR[Lpoly], 010h | |
627 vpalignr TMP5,TMP5,TMP5,8 | |
628 vpxor TMP5, TMP5, TMP1 | |
629 | |
630 vpclmulqdq TMP1, TMP5, XMMWORD PTR[Lpoly], 010h | |
631 vpalignr TMP5,TMP5,TMP5,8 | |
632 vpxor TMP5, TMP5, TMP1 | |
633 | |
634 vpxor TMP5, TMP5, TMP4 | |
635 vmovdqu T, TMP5 | |
636 | |
; load the next 7 cached (counter ^ rk0) blocks and run the AES pipeline,
; refreshing the cached counters between rounds
637 vmovdqa CTR0, XMMWORD PTR[8*16 + 0*16 + esp] | |
638 vmovdqa CTR1, XMMWORD PTR[8*16 + 1*16 + esp] | |
639 vmovdqa CTR2, XMMWORD PTR[8*16 + 2*16 + esp] | |
640 vmovdqa CTR3, XMMWORD PTR[8*16 + 3*16 + esp] | |
641 vmovdqa CTR4, XMMWORD PTR[8*16 + 4*16 + esp] | |
642 vmovdqa CTR5, XMMWORD PTR[8*16 + 5*16 + esp] | |
643 vmovdqa CTR6, XMMWORD PTR[8*16 + 6*16 + esp] | |
644 | |
645 ROUND 1 | |
646 NEXTCTR 0 | |
647 ROUND 2 | |
648 NEXTCTR 1 | |
649 ROUND 3 | |
650 NEXTCTR 2 | |
651 ROUND 4 | |
652 NEXTCTR 3 | |
653 ROUND 5 | |
654 NEXTCTR 4 | |
655 ROUND 6 | |
656 NEXTCTR 5 | |
657 ROUND 7 | |
658 NEXTCTR 6 | |
659 | |
660 ROUND 8 | |
661 ROUND 9 | |
662 | |
663 vmovdqu xmm7, XMMWORD PTR[10*16 + KS] | |
664 cmp NR, 10 | |
665 je @f | |
666 | |
667 ROUND 10 | |
668 ROUND 11 | |
669 vmovdqu xmm7, XMMWORD PTR[12*16 + KS] | |
670 cmp NR, 12 | |
671 je @f | |
672 | |
673 ROUND 12 | |
674 ROUND 13 | |
675 vmovdqu xmm7, XMMWORD PTR[14*16 + KS] | |
676 @@: | |
677 vaesenclast CTR0, CTR0, xmm7 | |
678 vaesenclast CTR1, CTR1, xmm7 | |
679 vaesenclast CTR2, CTR2, xmm7 | |
680 vaesenclast CTR3, CTR3, xmm7 | |
681 vaesenclast CTR4, CTR4, xmm7 | |
682 vaesenclast CTR5, CTR5, xmm7 | |
683 vaesenclast CTR6, CTR6, xmm7 | |
684 | |
685 vpxor CTR0, CTR0, XMMWORD PTR[0*16 + PT] | |
686 vpxor CTR1, CTR1, XMMWORD PTR[1*16 + PT] | |
687 vpxor CTR2, CTR2, XMMWORD PTR[2*16 + PT] | |
688 vpxor CTR3, CTR3, XMMWORD PTR[3*16 + PT] | |
689 vpxor CTR4, CTR4, XMMWORD PTR[4*16 + PT] | |
690 vpxor CTR5, CTR5, XMMWORD PTR[5*16 + PT] | |
691 vpxor CTR6, CTR6, XMMWORD PTR[6*16 + PT] | |
692 | |
693 vmovdqu XMMWORD PTR[0*16 + CT], CTR0 | |
694 vmovdqu XMMWORD PTR[1*16 + CT], CTR1 | |
695 vmovdqu XMMWORD PTR[2*16 + CT], CTR2 | |
696 vmovdqu XMMWORD PTR[3*16 + CT], CTR3 | |
697 vmovdqu XMMWORD PTR[4*16 + CT], CTR4 | |
698 vmovdqu XMMWORD PTR[5*16 + CT], CTR5 | |
699 vmovdqu XMMWORD PTR[6*16 + CT], CTR6 | |
700 | |
; park the byte-swapped ciphertext for the next GHASH pass (as above)
701 vpshufb CTR0, CTR0, XMMWORD PTR[Lbswap_mask] | |
702 vpshufb CTR1, CTR1, XMMWORD PTR[Lbswap_mask] | |
703 vpshufb CTR2, CTR2, XMMWORD PTR[Lbswap_mask] | |
704 vpshufb CTR3, CTR3, XMMWORD PTR[Lbswap_mask] | |
705 vpshufb CTR4, CTR4, XMMWORD PTR[Lbswap_mask] | |
706 vpshufb CTR5, CTR5, XMMWORD PTR[Lbswap_mask] | |
707 vpshufb TMP5, CTR6, XMMWORD PTR[Lbswap_mask] | |
708 | |
709 vmovdqa XMMWORD PTR[1*16 + esp], CTR5 | |
710 vmovdqa XMMWORD PTR[2*16 + esp], CTR4 | |
711 vmovdqa XMMWORD PTR[3*16 + esp], CTR3 | |
712 vmovdqa XMMWORD PTR[4*16 + esp], CTR2 | |
713 vmovdqa XMMWORD PTR[5*16 + esp], CTR1 | |
714 vmovdqa XMMWORD PTR[6*16 + esp], CTR0 | |
715 | |
716 lea CT, [7*16 + CT] | |
717 lea PT, [7*16 + PT] | |
718 jmp LEncData7 | |
719 | |
; GHASH the final 7 pipelined ciphertext blocks (no new encryption)
720 LEndEnc7: | |
721 | |
722 vpshufd TMP4, TMP5, 78 | |
723 vpxor TMP4, TMP4, TMP5 | |
724 vpclmulqdq TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h | |
725 vmovdqu TMP4, XMMWORD PTR[0*16 + Htbl] | |
726 vpclmulqdq TMP1, TMP5, TMP4, 011h | |
727 vpclmulqdq TMP2, TMP5, TMP4, 000h | |
728 | |
729 vmovdqu TMP5, XMMWORD PTR[1*16 + esp] | |
730 KARATSUBA 1 | |
731 vmovdqu TMP5, XMMWORD PTR[2*16 + esp] | |
732 KARATSUBA 2 | |
733 vmovdqu TMP5, XMMWORD PTR[3*16 + esp] | |
734 KARATSUBA 3 | |
735 vmovdqu TMP5, XMMWORD PTR[4*16 + esp] | |
736 KARATSUBA 4 | |
737 vmovdqu TMP5, XMMWORD PTR[5*16 + esp] | |
738 KARATSUBA 5 | |
739 vmovdqu TMP5, XMMWORD PTR[6*16 + esp] | |
740 vpxor TMP5, TMP5, T | |
741 KARATSUBA 6 | |
742 | |
743 vpxor TMP0, TMP0, TMP1 | |
744 vpxor TMP0, TMP0, TMP2 | |
745 vpsrldq TMP3, TMP0, 8 | |
746 vpxor TMP4, TMP1, TMP3 | |
747 vpslldq TMP3, TMP0, 8 | |
748 vpxor TMP5, TMP2, TMP3 | |
749 | |
750 vpclmulqdq TMP1, TMP5, XMMWORD PTR[Lpoly], 010h | |
751 vpalignr TMP5,TMP5,TMP5,8 | |
752 vpxor TMP5, TMP5, TMP1 | |
753 | |
754 vpclmulqdq TMP1, TMP5, XMMWORD PTR[Lpoly], 010h | |
755 vpalignr TMP5,TMP5,TMP5,8 | |
756 vpxor TMP5, TMP5, TMP1 | |
757 | |
758 vpxor TMP5, TMP5, TMP4 | |
759 vmovdqu T, TMP5 | |
760 | |
; the ALU counter ran 6 ahead of the blocks actually consumed — rewind
761 sub aluCTR, 6 | |
762 | |
; one block at a time for the remaining full blocks; counter slot 0 is
; the only one maintained from here on
763 LEncDataSingles: | |
764 | |
765 cmp len, 16 | |
766 jb LEncDataTail | |
767 sub len, 16 | |
768 | |
769 vmovdqa TMP1, XMMWORD PTR[8*16 + 0*16 + esp] | |
770 NEXTCTR 0 | |
771 | |
772 vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS] | |
773 vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS] | |
774 vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS] | |
775 vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS] | |
776 vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS] | |
777 vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS] | |
778 vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS] | |
779 vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS] | |
780 vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS] | |
781 vmovdqu TMP2, XMMWORD PTR[10*16 + KS] | |
782 cmp NR, 10 | |
783 je @f | |
784 vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS] | |
785 vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS] | |
786 vmovdqu TMP2, XMMWORD PTR[12*16 + KS] | |
787 cmp NR, 12 | |
788 je @f | |
789 vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS] | |
790 vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS] | |
791 vmovdqu TMP2, XMMWORD PTR[14*16 + KS] | |
792 @@: | |
793 vaesenclast TMP1, TMP1, TMP2 | |
794 vpxor TMP1, TMP1, XMMWORD PTR[PT] | |
795 vmovdqu XMMWORD PTR[CT], TMP1 | |
796 | |
797 lea PT, [16+PT] | |
798 lea CT, [16+CT] | |
799 | |
; fold the new ciphertext block into T immediately (T = (C ^ T) * H)
800 vpshufb TMP1, TMP1, XMMWORD PTR[Lbswap_mask] | |
801 vpxor TMP1, TMP1, T | |
802 | |
803 vmovdqu TMP0, XMMWORD PTR[Htbl] | |
804 GFMUL TMP1, TMP1, TMP0, TMP5, TMP2, TMP3, TMP4 | |
805 vmovdqu T, TMP1 | |
806 | |
807 jmp LEncDataSingles | |
808 | |
; final partial block (len < 16): encrypt via a zero-padded stack buffer
809 LEncDataTail: | |
810 | |
811 cmp len, 0 | |
812 je LEncDataEnd | |
813 | |
814 vmovdqa TMP1, XMMWORD PTR[8*16 + 0*16 + esp] | |
815 | |
816 vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS] | |
817 vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS] | |
818 vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS] | |
819 vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS] | |
820 vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS] | |
821 vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS] | |
822 vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS] | |
823 vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS] | |
824 vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS] | |
825 vmovdqu TMP2, XMMWORD PTR[10*16 + KS] | |
826 cmp NR, 10 | |
827 je @f | |
828 vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS] | |
829 vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS] | |
830 vmovdqu TMP2, XMMWORD PTR[12*16 + KS] | |
831 cmp NR, 12 | |
832 je @f | |
833 vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS] | |
834 vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS] | |
835 vmovdqu TMP2, XMMWORD PTR[14*16 + KS] | |
836 @@: | |
837 vaesenclast TMP1, TMP1, TMP2 | |
838 ; zero a temp location | |
839 vpxor TMP2, TMP2, TMP2 | |
840 vmovdqa XMMWORD PTR[esp], TMP2 | |
841 ; copy as many bytes as needed | |
; esi (KS) is reused as a byte index from here on (the key schedule is no
; longer needed); edx (Gctx/Htbl) is saved in edi so dl can shuttle bytes
842 xor KS, KS | |
843 mov aluTMP, edx | |
844 @@: | |
845 cmp len, KS | |
846 je @f | |
847 mov dl, BYTE PTR[PT + KS] | |
848 mov BYTE PTR[esp + KS], dl | |
849 inc KS | |
850 jmp @b | |
851 @@: | |
; encrypt the padded plaintext, then write only len ciphertext bytes out
852 vpxor TMP1, TMP1, XMMWORD PTR[esp] | |
853 vmovdqa XMMWORD PTR[esp], TMP1 | |
854 xor KS, KS | |
855 @@: | |
856 cmp len, KS | |
857 je @f | |
858 mov dl, BYTE PTR[esp + KS] | |
859 mov BYTE PTR[CT + KS], dl | |
860 inc KS | |
861 jmp @b | |
862 @@: | |
; zero the keystream bytes past len so GHASH sees the padded ciphertext
863 cmp KS, 16 | |
864 je @f | |
865 mov BYTE PTR[esp + KS], 0 | |
866 inc KS | |
867 jmp @b | |
868 @@: | |
869 mov edx, aluTMP | |
870 vmovdqa TMP1, XMMWORD PTR[esp] | |
871 vpshufb TMP1, TMP1, XMMWORD PTR[Lbswap_mask] | |
872 vpxor TMP1, TMP1, T | |
873 | |
874 vmovdqu TMP0, XMMWORD PTR[Htbl] | |
875 GFMUL TMP1, TMP1, TMP0, TMP5, TMP2, TMP3, TMP4 | |
876 vmovdqu T, TMP1 | |
877 | |
; write the next counter value back into Gctx (big-endian) and epilogue
878 LEncDataEnd: | |
879 inc aluCTR | |
880 bswap aluCTR | |
881 mov [16*16 + 2*16 + 3*4 + Gctx], aluCTR | |
882 | |
883 mov esp, ebp | |
884 pop edi | |
885 pop esi | |
886 pop ebx | |
887 pop ebp | |
888 | |
889 | |
890 vzeroupper | |
891 | |
892 ret | |
893 intel_aes_gcmENC ENDP | |
894 | |
895 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
896 ; | |
897 ; Decrypt and Authenticate | |
898 ; void intel_aes_gcmDEC(uint8_t* PT, uint8_t* CT, void *Gctx, unsigned int len); | |
899 ; | |
900 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
901 | |
902 | |
; NEXTCTR is redefined for the decrypt path: intel_aes_gcmDEC reserves only
; 8*16 stack bytes and keeps its cached (counter ^ round-key-0) blocks at
; esp+0*16.., not at esp+8*16 as in the ENC routine, so only the base
; offset of the patched dword changes.
903 NEXTCTR MACRO i | |
904 add aluCTR, 1 | |
905 mov aluTMP, aluCTR | |
906 bswap aluTMP | |
907 xor aluTMP, [3*4 + KS] | |
908 mov [3*4 + i*16 + esp], aluTMP | |
909 ENDM | |
910 | |
911 intel_aes_gcmDEC PROC | |
912 | |
913 cmp DWORD PTR[1*4 + 3*4 + esp], 0 | |
914 jne LbeginDEC | |
915 ret | |
916 | |
917 LbeginDEC: | |
918 | |
919 vzeroupper | |
920 push ebp | |
921 push ebx | |
922 push esi | |
923 push edi | |
924 | |
925 mov ebp, esp | |
926 sub esp, 8*16 | |
927 and esp, -16 | |
928 | |
929 mov CT, [ebp + 5*4 + 0*4] | |
930 mov PT, [ebp + 5*4 + 1*4] | |
931 mov Gctx, [ebp + 5*4 + 2*4] | |
932 | |
933 mov KS, [16*16 + 3*16 + Gctx] | |
934 lea KS, [44 + KS] | |
935 | |
936 mov aluCTR, [16*16 + 2*16 + 3*4 + Gctx] | |
937 bswap aluCTR | |
938 | |
939 | |
940 vmovdqu TMP0, XMMWORD PTR[0*16 + KS] | |
941 vpxor TMP0, TMP0, XMMWORD PTR[16*16 + 2*16 + Gctx] | |
942 vmovdqu XMMWORD PTR[0*16 + esp], TMP0 | |
943 | |
944 cmp len, 16*7 | |
945 jb LDecDataSingles | |
946 vmovdqu XMMWORD PTR[1*16 + esp], TMP0 | |
947 vmovdqu XMMWORD PTR[2*16 + esp], TMP0 | |
948 vmovdqu XMMWORD PTR[3*16 + esp], TMP0 | |
949 vmovdqu XMMWORD PTR[4*16 + esp], TMP0 | |
950 vmovdqu XMMWORD PTR[5*16 + esp], TMP0 | |
951 vmovdqu XMMWORD PTR[6*16 + esp], TMP0 | |
952 dec aluCTR | |
953 | |
954 LDecData7: | |
955 cmp len, 16*7 | |
956 jb LDecData7End | |
957 sub len, 16*7 | |
958 | |
959 vmovdqu TMP5, XMMWORD PTR[0*16 + CT] | |
960 vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask] | |
961 vpxor TMP5, TMP5, T | |
962 vpshufd TMP4, TMP5, 78 | |
963 vpxor TMP4, TMP4, TMP5 | |
964 vpclmulqdq TMP0, TMP4, XMMWORD PTR[6*16 + 8*16 + Htbl], 000h | |
965 vmovdqu TMP4, XMMWORD PTR[6*16 + Htbl] | |
966 vpclmulqdq TMP1, TMP5, TMP4, 011h | |
967 vpclmulqdq TMP2, TMP5, TMP4, 000h | |
968 | |
969 NEXTCTR 0 | |
970 vmovdqu TMP5, XMMWORD PTR[1*16 + CT] | |
971 vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask] | |
972 KARATSUBA 5 | |
973 NEXTCTR 1 | |
974 vmovdqu TMP5, XMMWORD PTR[2*16 + CT] | |
975 vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask] | |
976 KARATSUBA 4 | |
977 NEXTCTR 2 | |
978 vmovdqu TMP5, XMMWORD PTR[3*16 + CT] | |
979 vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask] | |
980 KARATSUBA 3 | |
981 NEXTCTR 3 | |
982 vmovdqu TMP5, XMMWORD PTR[4*16 + CT] | |
983 vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask] | |
984 KARATSUBA 2 | |
985 NEXTCTR 4 | |
986 vmovdqu TMP5, XMMWORD PTR[5*16 + CT] | |
987 vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask] | |
988 KARATSUBA 1 | |
989 NEXTCTR 5 | |
990 vmovdqu TMP5, XMMWORD PTR[6*16 + CT] | |
991 vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask] | |
992 KARATSUBA 0 | |
993 NEXTCTR 6 | |
994 | |
995 vpxor TMP0, TMP0, TMP1 | |
996 vpxor TMP0, TMP0, TMP2 | |
997 vpsrldq TMP3, TMP0, 8 | |
998 vpxor TMP4, TMP1, TMP3 | |
999 vpslldq TMP3, TMP0, 8 | |
1000 vpxor TMP5, TMP2, TMP3 | |
1001 | |
1002 vpclmulqdq TMP1, TMP5, XMMWORD PTR[Lpoly], 010h | |
1003 vpalignr TMP5,TMP5,TMP5,8 | |
1004 vpxor TMP5, TMP5, TMP1 | |
1005 | |
1006 vpclmulqdq TMP1, TMP5, XMMWORD PTR[Lpoly], 010h | |
1007 vpalignr TMP5,TMP5,TMP5,8 | |
1008 vpxor TMP5, TMP5, TMP1 | |
1009 | |
1010 vpxor TMP5, TMP5, TMP4 | |
1011 vmovdqu T, TMP5 | |
1012 | |
1013 vmovdqa CTR0, XMMWORD PTR[0*16 + esp] | |
1014 vmovdqa CTR1, XMMWORD PTR[1*16 + esp] | |
1015 vmovdqa CTR2, XMMWORD PTR[2*16 + esp] | |
1016 vmovdqa CTR3, XMMWORD PTR[3*16 + esp] | |
1017 vmovdqa CTR4, XMMWORD PTR[4*16 + esp] | |
1018 vmovdqa CTR5, XMMWORD PTR[5*16 + esp] | |
1019 vmovdqa CTR6, XMMWORD PTR[6*16 + esp] | |
1020 | |
1021 ROUND 1 | |
1022 ROUND 2 | |
1023 ROUND 3 | |
1024 ROUND 4 | |
1025 ROUND 5 | |
1026 ROUND 6 | |
1027 ROUND 7 | |
1028 ROUND 8 | |
1029 ROUND 9 | |
1030 vmovdqu xmm7, XMMWORD PTR[10*16 + KS] | |
1031 cmp NR, 10 | |
1032 je @f | |
1033 | |
1034 ROUND 10 | |
1035 ROUND 11 | |
1036 vmovdqu xmm7, XMMWORD PTR[12*16 + KS] | |
1037 cmp NR, 12 | |
1038 je @f | |
1039 | |
1040 ROUND 12 | |
1041 ROUND 13 | |
1042 vmovdqu xmm7, XMMWORD PTR[14*16 + KS] | |
1043 @@: | |
1044 vaesenclast CTR0, CTR0, xmm7 | |
1045 vaesenclast CTR1, CTR1, xmm7 | |
1046 vaesenclast CTR2, CTR2, xmm7 | |
1047 vaesenclast CTR3, CTR3, xmm7 | |
1048 vaesenclast CTR4, CTR4, xmm7 | |
1049 vaesenclast CTR5, CTR5, xmm7 | |
1050 vaesenclast CTR6, CTR6, xmm7 | |
1051 | |
1052 vpxor CTR0, CTR0, XMMWORD PTR[0*16 + CT] | |
1053 vpxor CTR1, CTR1, XMMWORD PTR[1*16 + CT] | |
1054 vpxor CTR2, CTR2, XMMWORD PTR[2*16 + CT] | |
1055 vpxor CTR3, CTR3, XMMWORD PTR[3*16 + CT] | |
1056 vpxor CTR4, CTR4, XMMWORD PTR[4*16 + CT] | |
1057 vpxor CTR5, CTR5, XMMWORD PTR[5*16 + CT] | |
1058 vpxor CTR6, CTR6, XMMWORD PTR[6*16 + CT] | |
1059 | |
1060 vmovdqu XMMWORD PTR[0*16 + PT], CTR0 | |
1061 vmovdqu XMMWORD PTR[1*16 + PT], CTR1 | |
1062 vmovdqu XMMWORD PTR[2*16 + PT], CTR2 | |
1063 vmovdqu XMMWORD PTR[3*16 + PT], CTR3 | |
1064 vmovdqu XMMWORD PTR[4*16 + PT], CTR4 | |
1065 vmovdqu XMMWORD PTR[5*16 + PT], CTR5 | |
1066 vmovdqu XMMWORD PTR[6*16 + PT], CTR6 | |
1067 | |
1068 lea CT, [7*16 + CT] | |
1069 lea PT, [7*16 + PT] | |
1070 jmp LDecData7 | |
1071 | |
1072 LDecData7End: | |
1073 | |
1074 NEXTCTR 0 | |
1075 | |
1076 LDecDataSingles: | |
1077 | |
1078 cmp len, 16 | |
1079 jb LDecDataTail | |
1080 sub len, 16 | |
1081 | |
1082 vmovdqu TMP1, XMMWORD PTR[CT] | |
1083 vpshufb TMP1, TMP1, XMMWORD PTR[Lbswap_mask] | |
1084 vpxor TMP1, TMP1, T | |
1085 | |
1086 vmovdqu TMP0, XMMWORD PTR[Htbl] | |
1087 GFMUL TMP1, TMP1, TMP0, TMP5, TMP2, TMP3, TMP4 | |
1088 vmovdqu T, TMP1 | |
1089 | |
1090 vmovdqa TMP1, XMMWORD PTR[0*16 + esp] | |
1091 NEXTCTR 0 | |
1092 | |
1093 vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS] | |
1094 vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS] | |
1095 vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS] | |
1096 vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS] | |
1097 vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS] | |
1098 vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS] | |
1099 vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS] | |
1100 vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS] | |
1101 vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS] | |
1102 vmovdqu TMP2, XMMWORD PTR[10*16 + KS] | |
1103 cmp NR, 10 | |
1104 je @f | |
1105 vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS] | |
1106 vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS] | |
1107 vmovdqu TMP2, XMMWORD PTR[12*16 + KS] | |
1108 cmp NR, 12 | |
1109 je @f | |
1110 vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS] | |
1111 vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS] | |
1112 vmovdqu TMP2, XMMWORD PTR[14*16 + KS] | |
1113 @@: | |
1114 vaesenclast TMP1, TMP1, TMP2 | |
1115 vpxor TMP1, TMP1, XMMWORD PTR[CT] | |
1116 vmovdqu XMMWORD PTR[PT], TMP1 | |
1117 | |
1118 lea PT, [16+PT] | |
1119 lea CT, [16+CT] | |
1120 jmp LDecDataSingles | |
1121 | |
1122 LDecDataTail: | |
1123 | |
1124 cmp len, 0 | |
1125 je LDecDataEnd | |
1126 | |
1127 vmovdqa TMP1, XMMWORD PTR[0*16 + esp] | |
1128 inc aluCTR | |
1129 vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS] | |
1130 vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS] | |
1131 vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS] | |
1132 vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS] | |
1133 vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS] | |
1134 vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS] | |
1135 vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS] | |
1136 vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS] | |
1137 vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS] | |
1138 vmovdqu TMP2, XMMWORD PTR[10*16 + KS] | |
1139 cmp NR, 10 | |
1140 je @f | |
1141 vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS] | |
1142 vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS] | |
1143 vmovdqu TMP2, XMMWORD PTR[12*16 + KS] | |
1144 cmp NR, 12 | |
1145 je @f | |
1146 vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS] | |
1147 vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS] | |
1148 vmovdqu TMP2, XMMWORD PTR[14*16 + KS] | |
1149 @@: | |
1150 vaesenclast xmm7, TMP1, TMP2 | |
1151 | |
1152 ; copy as many bytes as needed | |
1153 xor KS, KS | |
1154 mov aluTMP, edx | |
1155 @@: | |
1156 cmp len, KS | |
1157 je @f | |
1158 mov dl, BYTE PTR[CT + KS] | |
1159 mov BYTE PTR[esp + KS], dl | |
1160 inc KS | |
1161 jmp @b | |
1162 @@: | |
1163 cmp KS, 16 | |
1164 je @f | |
1165 mov BYTE PTR[esp + KS], 0 | |
1166 inc KS | |
1167 jmp @b | |
1168 @@: | |
1169 mov edx, aluTMP | |
1170 vmovdqa TMP1, XMMWORD PTR[esp] | |
1171 vpshufb TMP1, TMP1, XMMWORD PTR[Lbswap_mask] | |
1172 vpxor TMP1, TMP1, T | |
1173 | |
1174 vmovdqu TMP0, XMMWORD PTR[Htbl] | |
1175 GFMUL TMP1, TMP1, TMP0, TMP5, TMP2, TMP3, TMP4 | |
1176 vmovdqu T, TMP1 | |
1177 | |
1178 vpxor xmm7, xmm7, XMMWORD PTR[esp] | |
1179 vmovdqa XMMWORD PTR[esp], xmm7 | |
1180 xor KS, KS | |
1181 mov aluTMP, edx | |
1182 @@: | |
1183 cmp len, KS | |
1184 je @f | |
1185 mov dl, BYTE PTR[esp + KS] | |
1186 mov BYTE PTR[PT + KS], dl | |
1187 inc KS | |
1188 jmp @b | |
1189 @@: | |
1190 mov edx, aluTMP | |
1191 | |
1192 LDecDataEnd: | |
1193 | |
1194 bswap aluCTR | |
1195 mov [16*16 + 2*16 + 3*4 + Gctx], aluCTR | |
1196 | |
1197 mov esp, ebp | |
1198 pop edi | |
1199 pop esi | |
1200 pop ebx | |
1201 pop ebp | |
1202 | |
1203 vzeroupper | |
1204 | |
1205 ret | |
1206 intel_aes_gcmDEC ENDP | |
1207 | |
1208 | |
1209 END |