nss/lib/freebl/intel-gcm-x64-masm.asm @ 0:1e5118fa0cb1
This is NSS with a CMake buildsystem.
To compile a static NSS library for Windows we used the
Chromium NSS fork and added a CMake buildsystem to compile
it statically for Windows. See README.chromium for the Chromium
changes and README.trustbridge for our modifications.
author: Andre Heinecke <andre.heinecke@intevation.de>
date:   Mon, 28 Jul 2014 10:47:06 +0200

; LICENSE:
; This submission to NSS is to be made available under the terms of the
; Mozilla Public License, v. 2.0. You can obtain one at
; http://mozilla.org/MPL/2.0/.
;###############################################################################
; Copyright(c) 2014, Intel Corp.
; Developers and authors:
; Shay Gueron and Vlad Krasnov
; Intel Corporation, Israel Development Centre, Haifa, Israel
; Please send feedback directly to crypto.feedback.alias@intel.com


.DATA
ALIGN 16
Lone dq 1,0
Ltwo dq 2,0
Lbswap_mask db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
Lshuff_mask dq 0f0f0f0f0f0f0f0fh, 0f0f0f0f0f0f0f0fh
Lpoly dq 01h, 0c200000000000000h
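; Lone and Ltwo hold the 128-bit constants 1 and 2; once a counter block
; has been byte-reflected with Lbswap_mask they are added with vpaddd to
; step the 32-bit GCM counter. Lbswap_mask reverses the 16 bytes of an
; xmm register. Lpoly holds the GHASH reduction constant: the
; 0c200000000000000h qword is the bit-reflected form of the GCM
; polynomial x^128 + x^7 + x^2 + x + 1, consumed by the vpclmulqdq
; reduction steps below.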

.CODE


GFMUL MACRO DST, SRC1, SRC2, TMP1, TMP2, TMP3, TMP4
vpclmulqdq TMP1, SRC2, SRC1, 0h
vpclmulqdq TMP4, SRC2, SRC1, 011h

vpshufd TMP2, SRC2, 78
vpshufd TMP3, SRC1, 78
vpxor TMP2, TMP2, SRC2
vpxor TMP3, TMP3, SRC1

vpclmulqdq TMP2, TMP2, TMP3, 0h
vpxor TMP2, TMP2, TMP1
vpxor TMP2, TMP2, TMP4

vpslldq TMP3, TMP2, 8
vpsrldq TMP2, TMP2, 8

vpxor TMP1, TMP1, TMP3
vpxor TMP4, TMP4, TMP2

vpclmulqdq TMP2, TMP1, [Lpoly], 010h
vpshufd TMP3, TMP1, 78
vpxor TMP1, TMP2, TMP3

vpclmulqdq TMP2, TMP1, [Lpoly], 010h
vpshufd TMP3, TMP1, 78
vpxor TMP1, TMP2, TMP3

vpxor DST, TMP1, TMP4

ENDM
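; GFMUL computes DST = SRC1 * SRC2 in GF(2^128), Karatsuba style: the two
; vpclmulqdq with immediates 0h/011h give the low and high 64x64 products,
; the (hi xor lo) halves multiplied together yield the middle term, and
; the vpslldq/vpsrldq pair splits that middle term across the two halves
; of the 256-bit product. The two vpclmulqdq-by-[Lpoly] rounds then
; perform the two-step modular reduction against the bit-reflected GCM
; polynomial, and the final vpxor folds in the high product.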

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Generates the final GCM tag
; void intel_aes_gcmTAG(unsigned char Htbl[16*16],
;                       unsigned char *Tp,
;                       unsigned int Mlen,
;                       unsigned int Alen,
;                       unsigned char *X0,
;                       unsigned char *TAG);
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
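; Arguments follow the Microsoft x64 calling convention: Htbl, Tp, Mlen
; and Alen arrive in rcx, rdx, r8 and r9; the fifth and sixth arguments
; (X0, TAG) are read from the stack past the return address and the
; 32-byte shadow space, which is what the [rsp + 1*8 + 4*8] and
; [rsp + 1*8 + 5*8] loads below pick up.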

ALIGN 16
intel_aes_gcmTAG PROC

Htbl textequ <rcx>
Tp textequ <rdx>
Mlen textequ <r8>
Alen textequ <r9>
X0 textequ <r10>
TAG textequ <r11>

T textequ <xmm0>
TMP0 textequ <xmm1>

mov X0, [rsp + 1*8 + 4*8]
mov TAG, [rsp + 1*8 + 5*8]

vzeroupper
vmovdqu T, XMMWORD PTR[Tp]
vpxor TMP0, TMP0, TMP0

shl Mlen, 3
shl Alen, 3

;vpinsrq TMP0, TMP0, Mlen, 0
;vpinsrq TMP0, TMP0, Alen, 1
; workaround the ml64.exe vpinsrq issue
vpinsrd TMP0, TMP0, r8d, 0
vpinsrd TMP0, TMP0, r9d, 2
shr Mlen, 32
shr Alen, 32
vpinsrd TMP0, TMP0, r8d, 1
vpinsrd TMP0, TMP0, r9d, 3
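; TMP0 now holds the GCM length block: Mlen*8 and Alen*8 as two 64-bit
; little-endian lanes, assembled dword by dword because ml64.exe
; mis-handles vpinsrq (r8/r9 still alias Mlen/Alen, so r8d/r9d supply
; the low halves and, after the shifts, the high halves).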

vpxor T, T, TMP0
vmovdqu TMP0, XMMWORD PTR[Htbl]
GFMUL T, T, TMP0, xmm2, xmm3, xmm4, xmm5

vpshufb T, T, [Lbswap_mask]
vpxor T, T, [X0]
vmovdqu XMMWORD PTR[TAG], T
vzeroupper

ret

intel_aes_gcmTAG ENDP

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Generates the H table
; void intel_aes_gcmINIT(unsigned char Htbl[16*16], unsigned char *KS, int NR);
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

ALIGN 16
intel_aes_gcmINIT PROC

Htbl textequ <rcx>
KS textequ <rdx>
NR textequ <r8d>

T textequ <xmm0>
TMP0 textequ <xmm1>

vzeroupper
; AES-ENC(0)
vmovdqu T, XMMWORD PTR[KS]
lea KS, [16 + KS]
dec NR
Lenc_loop:
vaesenc T, T, [KS]
lea KS, [16 + KS]
dec NR
jnz Lenc_loop

vaesenclast T, T, [KS]
vpshufb T, T, [Lbswap_mask]

;Calculate H` = GFMUL(H, 2)
vpsrad xmm3, T, 31
vpshufd xmm3, xmm3, 0ffh
vpand xmm5, xmm3, [Lpoly]
vpsrld xmm3, T, 31
vpslld xmm4, T, 1
vpslldq xmm3, xmm3, 4
vpxor T, xmm4, xmm3
vpxor T, T, xmm5
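; The block above doubles H in the field: a 128-bit left shift by one bit
; (vpslld/vpsrld/vpslldq propagate the per-dword carries), with [Lpoly]
; xored in when the top bit was set (vpsrad/vpshufd broadcast that bit as
; a mask). H` = H*2 is the form the GFMUL reduction above expects.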

vmovdqu TMP0, T
vmovdqu XMMWORD PTR[Htbl + 0*16], T

vpshufd xmm2, T, 78
vpxor xmm2, xmm2, T
vmovdqu XMMWORD PTR[Htbl + 8*16 + 0*16], xmm2

i = 1
WHILE i LT 8
GFMUL T, T, TMP0, xmm2, xmm3, xmm4, xmm5
vmovdqu XMMWORD PTR[Htbl + i*16], T
vpshufd xmm2, T, 78
vpxor xmm2, xmm2, T
vmovdqu XMMWORD PTR[Htbl + 8*16 + i*16], xmm2
i = i+1
ENDM
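; Resulting layout: Htbl[i*16] holds H^(i+1) for i = 0..7, and
; Htbl[8*16 + i*16] caches the xor of that power's two 64-bit halves,
; the precomputed Karatsuba middle operand used by the multiply macros
; further down.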
vzeroupper
ret
intel_aes_gcmINIT ENDP


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Authenticate only
; void intel_aes_gcmAAD(unsigned char Htbl[16*16], unsigned char *AAD, unsigned int Alen, unsigned char *Tp);
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

ALIGN 16
intel_aes_gcmAAD PROC

Htbl textequ <rcx>
inp textequ <rdx>
len textequ <r8>
Tp textequ <r9>
hlp0 textequ <r10>

DATA textequ <xmm0>
T textequ <xmm1>
TMP0 textequ <xmm2>
TMP1 textequ <xmm3>
TMP2 textequ <xmm4>
TMP3 textequ <xmm5>
TMP4 textequ <xmm6>
Xhi textequ <xmm7>

KARATSUBA_AAD MACRO i
vpclmulqdq TMP3, DATA, [Htbl + i*16], 0h
vpxor TMP0, TMP0, TMP3
vpclmulqdq TMP3, DATA, [Htbl + i*16], 011h
vpxor TMP1, TMP1, TMP3
vpshufd TMP3, DATA, 78
vpxor TMP3, TMP3, DATA
vpclmulqdq TMP3, TMP3, [Htbl + 8*16 + i*16], 0h
vpxor TMP2, TMP2, TMP3
ENDM
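; KARATSUBA_AAD accumulates one block against H^(i+1) without reducing:
; TMP0 gathers the low products, TMP1 the high products and TMP2 the
; Karatsuba middle products, so eight blocks can share one reduction.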

test len, len
jnz LbeginAAD
ret

LbeginAAD:
vzeroupper

sub rsp, 2*16
vmovdqu XMMWORD PTR[rsp + 0*16], xmm6
vmovdqu XMMWORD PTR[rsp + 1*16], xmm7

vpxor Xhi, Xhi, Xhi

vmovdqu T, XMMWORD PTR[Tp]
; we hash 8 blocks each iteration; if the total number of blocks is not a
; multiple of 8, we hash the first n%8 blocks first
mov hlp0, len
and hlp0, 128-1
jz Lmod_loop

and len, -128
sub hlp0, 16

; Prefix block
vmovdqu DATA, XMMWORD PTR[inp]
vpshufb DATA, DATA, [Lbswap_mask]
vpxor DATA, DATA, T

vpclmulqdq TMP0, DATA, [Htbl + hlp0], 0h
vpclmulqdq TMP1, DATA, [Htbl + hlp0], 011h
vpshufd TMP3, DATA, 78
vpxor TMP3, TMP3, DATA
vpclmulqdq TMP2, TMP3, [Htbl + 8*16 + hlp0], 0h

lea inp, [inp+16]
test hlp0, hlp0
jnz Lpre_loop
jmp Lred1

; hash the remaining prefix blocks (up to 7 total prefix blocks)
Lpre_loop:

sub hlp0, 16

vmovdqu DATA, XMMWORD PTR[inp]
vpshufb DATA, DATA, [Lbswap_mask]

vpclmulqdq TMP3, DATA, [Htbl + hlp0], 0h
vpxor TMP0, TMP0, TMP3
vpclmulqdq TMP3, DATA, [Htbl + hlp0], 011h
vpxor TMP1, TMP1, TMP3
vpshufd TMP3, DATA, 78
vpxor TMP3, TMP3, DATA
vpclmulqdq TMP3, TMP3, [Htbl + 8*16 + hlp0], 0h
vpxor TMP2, TMP2, TMP3

test hlp0, hlp0
lea inp, [inp+16]
jnz Lpre_loop

Lred1:

vpxor TMP2, TMP2, TMP0
vpxor TMP2, TMP2, TMP1
vpsrldq TMP3, TMP2, 8
vpslldq TMP2, TMP2, 8

vpxor Xhi, TMP1, TMP3
vpxor T, TMP0, TMP2
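; Karatsuba fixup: fold the middle accumulator into the low and high
; products, leaving the unreduced 256-bit GHASH state split as Xhi:T.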


Lmod_loop:

sub len, 16*8
jb Ldone
; Block #0
vmovdqu DATA, XMMWORD PTR[inp + 16*7]
vpshufb DATA, DATA, [Lbswap_mask]

vpclmulqdq TMP0, DATA, [Htbl + 0*16], 0h
vpclmulqdq TMP1, DATA, [Htbl + 0*16], 011h
vpshufd TMP3, DATA, 78
vpxor TMP3, TMP3, DATA
vpclmulqdq TMP2, TMP3, [Htbl + 8*16 + 0*16], 0h

; Block #1
vmovdqu DATA, XMMWORD PTR[inp + 16*6]
vpshufb DATA, DATA, [Lbswap_mask]
KARATSUBA_AAD 1

; Block #2
vmovdqu DATA, XMMWORD PTR[inp + 16*5]
vpshufb DATA, DATA, [Lbswap_mask]

vpclmulqdq TMP4, T, [Lpoly], 010h ;reduction stage 1a
vpalignr T, T, T, 8

KARATSUBA_AAD 2

vpxor T, T, TMP4 ;reduction stage 1b

; Block #3
vmovdqu DATA, XMMWORD PTR[inp + 16*4]
vpshufb DATA, DATA, [Lbswap_mask]
KARATSUBA_AAD 3
; Block #4
vmovdqu DATA, XMMWORD PTR[inp + 16*3]
vpshufb DATA, DATA, [Lbswap_mask]

vpclmulqdq TMP4, T, [Lpoly], 010h ;reduction stage 2a
vpalignr T, T, T, 8

KARATSUBA_AAD 4

vpxor T, T, TMP4 ;reduction stage 2b
; Block #5
vmovdqu DATA, XMMWORD PTR[inp + 16*2]
vpshufb DATA, DATA, [Lbswap_mask]
KARATSUBA_AAD 5

vpxor T, T, Xhi ;reduction finalize
; Block #6
vmovdqu DATA, XMMWORD PTR[inp + 16*1]
vpshufb DATA, DATA, [Lbswap_mask]
KARATSUBA_AAD 6
; Block #7
vmovdqu DATA, XMMWORD PTR[inp + 16*0]
vpshufb DATA, DATA, [Lbswap_mask]
vpxor DATA, DATA, T
KARATSUBA_AAD 7
; Aggregated 8 blocks, now karatsuba fixup
vpxor TMP2, TMP2, TMP0
vpxor TMP2, TMP2, TMP1
vpsrldq TMP3, TMP2, 8
vpslldq TMP2, TMP2, 8

vpxor Xhi, TMP1, TMP3
vpxor T, TMP0, TMP2

lea inp, [inp + 16*8]
jmp Lmod_loop

Ldone:
vpclmulqdq TMP4, T, [Lpoly], 010h
vpalignr T, T, T, 8
vpxor T, T, TMP4

vpclmulqdq TMP4, T, [Lpoly], 010h
vpalignr T, T, T, 8
vpxor T, T, TMP4

vpxor T, T, Xhi
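; Final reduction: two vpclmulqdq-by-[Lpoly] folds (copies of the same
; stages interleaved into Lmod_loop above) reduce T modulo the
; polynomial, then the deferred high half Xhi is xored in.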
vmovdqu XMMWORD PTR[Tp], T
vzeroupper

vmovdqu xmm6, XMMWORD PTR[rsp + 0*16]
vmovdqu xmm7, XMMWORD PTR[rsp + 1*16]
add rsp, 16*2

ret

intel_aes_gcmAAD ENDP


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Encrypt and Authenticate
; void intel_aes_gcmENC(unsigned char* PT, unsigned char* CT, void *Gctx, unsigned int len);
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

ALIGN 16
intel_aes_gcmENC PROC

PT textequ <rcx>
CT textequ <rdx>
Htbl textequ <r8>
Gctx textequ <r8>
len textequ <r9>
KS textequ <r10>
NR textequ <eax>

aluCTR textequ <r11d>
aluKSl textequ <r12d>
aluTMP textequ <r13d>

T textequ <xmm0>
TMP0 textequ <xmm1>
TMP1 textequ <xmm2>
TMP2 textequ <xmm3>
TMP3 textequ <xmm4>
TMP4 textequ <xmm5>
TMP5 textequ <xmm6>
CTR0 textequ <xmm7>
CTR1 textequ <xmm8>
CTR2 textequ <xmm9>
CTR3 textequ <xmm10>
CTR4 textequ <xmm11>
CTR5 textequ <xmm12>
CTR6 textequ <xmm13>
CTR7 textequ <xmm14>
BSWAPMASK textequ <xmm15>

ROUND MACRO i
vmovdqu TMP3, XMMWORD PTR[i*16 + KS]
vaesenc CTR0, CTR0, TMP3
vaesenc CTR1, CTR1, TMP3
vaesenc CTR2, CTR2, TMP3
vaesenc CTR3, CTR3, TMP3
vaesenc CTR4, CTR4, TMP3
vaesenc CTR5, CTR5, TMP3
vaesenc CTR6, CTR6, TMP3
vaesenc CTR7, CTR7, TMP3
ENDM
ROUNDMUL MACRO i
vmovdqu TMP3, XMMWORD PTR[i*16 + KS]

vaesenc CTR0, CTR0, TMP3
vaesenc CTR1, CTR1, TMP3
vaesenc CTR2, CTR2, TMP3
vaesenc CTR3, CTR3, TMP3

vpshufd TMP4, TMP5, 78
vpxor TMP4, TMP4, TMP5

vaesenc CTR4, CTR4, TMP3
vaesenc CTR5, CTR5, TMP3
vaesenc CTR6, CTR6, TMP3
vaesenc CTR7, CTR7, TMP3

vpclmulqdq TMP3, TMP4, XMMWORD PTR[i*16 + 8*16 + Htbl], 000h
vpxor TMP0, TMP0, TMP3
vmovdqu TMP4, XMMWORD PTR[i*16 + Htbl]
vpclmulqdq TMP3, TMP5, TMP4, 011h
vpxor TMP1, TMP1, TMP3
vpclmulqdq TMP3, TMP5, TMP4, 000h
vpxor TMP2, TMP2, TMP3
ENDM
KARATSUBA MACRO i
vpshufd TMP4, TMP5, 78
vpxor TMP4, TMP4, TMP5
vpclmulqdq TMP3, TMP4, XMMWORD PTR[i*16 + 8*16 + Htbl], 000h
vpxor TMP0, TMP0, TMP3
vmovdqu TMP4, XMMWORD PTR[i*16 + Htbl]
vpclmulqdq TMP3, TMP5, TMP4, 011h
vpxor TMP1, TMP1, TMP3
vpclmulqdq TMP3, TMP5, TMP4, 000h
vpxor TMP2, TMP2, TMP3
ENDM
NEXTCTR MACRO i
add aluCTR, 1
mov aluTMP, aluCTR
xor aluTMP, aluKSl
bswap aluTMP
mov [3*4 + 8*16 + i*16 + rsp], aluTMP
ENDM
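; ROUND runs one AES round across all eight counter blocks; ROUNDMUL
; interleaves that round with the Karatsuba accumulation of one prior
; ciphertext block (in TMP5) against H^(i+1); KARATSUBA is the
; multiply-only variant. NEXTCTR maintains the counters on the ALU side:
; the stacked counter blocks are kept pre-xored with round key 0, so it
; stores bswap((ctr+1) xor aluKSl) into the last dword of stacked block
; i, aluKSl being the byte-swapped last dword of round key 0.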


test len, len
jnz LbeginENC
ret

LbeginENC:

vzeroupper
push r11
push r12
push r13
push rbp
sub rsp, 10*16
vmovdqu XMMWORD PTR[rsp + 0*16], xmm6
vmovdqu XMMWORD PTR[rsp + 1*16], xmm7
vmovdqu XMMWORD PTR[rsp + 2*16], xmm8
vmovdqu XMMWORD PTR[rsp + 3*16], xmm9
vmovdqu XMMWORD PTR[rsp + 4*16], xmm10
vmovdqu XMMWORD PTR[rsp + 5*16], xmm11
vmovdqu XMMWORD PTR[rsp + 6*16], xmm12
vmovdqu XMMWORD PTR[rsp + 7*16], xmm13
vmovdqu XMMWORD PTR[rsp + 8*16], xmm14
vmovdqu XMMWORD PTR[rsp + 9*16], xmm15

mov rbp, rsp
sub rsp, 16*16
and rsp, -16

vmovdqu T, XMMWORD PTR[16*16 + 1*16 + Gctx]
vmovdqu CTR0, XMMWORD PTR[16*16 + 2*16 + Gctx]
vmovdqu BSWAPMASK, XMMWORD PTR[Lbswap_mask]
mov KS, [16*16 + 3*16 + Gctx]
mov NR, [4 + KS]
lea KS, [48 + KS]

vpshufb CTR0, CTR0, BSWAPMASK

mov aluCTR, [16*16 + 2*16 + 3*4 + Gctx]
mov aluKSl, [3*4 + KS]
bswap aluCTR
bswap aluKSl

vmovdqu TMP0, XMMWORD PTR[0*16 + KS]
vpxor TMP0, TMP0, XMMWORD PTR[16*16 + 2*16 + Gctx]
vmovdqu XMMWORD PTR[8*16 + 0*16 + rsp], TMP0

cmp len, 128
jb LEncDataSingles
; Prepare the "top" counters
vmovdqu XMMWORD PTR[8*16 + 1*16 + rsp], TMP0
vmovdqu XMMWORD PTR[8*16 + 2*16 + rsp], TMP0
vmovdqu XMMWORD PTR[8*16 + 3*16 + rsp], TMP0
vmovdqu XMMWORD PTR[8*16 + 4*16 + rsp], TMP0
vmovdqu XMMWORD PTR[8*16 + 5*16 + rsp], TMP0
vmovdqu XMMWORD PTR[8*16 + 6*16 + rsp], TMP0
vmovdqu XMMWORD PTR[8*16 + 7*16 + rsp], TMP0

; Encrypt the initial 8 blocks
sub len, 128
vpaddd CTR1, CTR0, XMMWORD PTR[Lone]
vpaddd CTR2, CTR0, XMMWORD PTR[Ltwo]
vpaddd CTR3, CTR2, XMMWORD PTR[Lone]
vpaddd CTR4, CTR2, XMMWORD PTR[Ltwo]
vpaddd CTR5, CTR4, XMMWORD PTR[Lone]
vpaddd CTR6, CTR4, XMMWORD PTR[Ltwo]
vpaddd CTR7, CTR6, XMMWORD PTR[Lone]

vpshufb CTR0, CTR0, BSWAPMASK
vpshufb CTR1, CTR1, BSWAPMASK
vpshufb CTR2, CTR2, BSWAPMASK
vpshufb CTR3, CTR3, BSWAPMASK
vpshufb CTR4, CTR4, BSWAPMASK
vpshufb CTR5, CTR5, BSWAPMASK
vpshufb CTR6, CTR6, BSWAPMASK
vpshufb CTR7, CTR7, BSWAPMASK

vmovdqu TMP3, XMMWORD PTR[0*16 + KS]
vpxor CTR0, CTR0, TMP3
vpxor CTR1, CTR1, TMP3
vpxor CTR2, CTR2, TMP3
vpxor CTR3, CTR3, TMP3
vpxor CTR4, CTR4, TMP3
vpxor CTR5, CTR5, TMP3
vpxor CTR6, CTR6, TMP3
vpxor CTR7, CTR7, TMP3

ROUND 1

add aluCTR, 8
mov aluTMP, aluCTR
xor aluTMP, aluKSl
bswap aluTMP
mov [8*16 + 0*16 + 3*4 + rsp], aluTMP

ROUND 2
NEXTCTR 1
ROUND 3
NEXTCTR 2
ROUND 4
NEXTCTR 3
ROUND 5
NEXTCTR 4
ROUND 6
NEXTCTR 5
ROUND 7
NEXTCTR 6
ROUND 8
NEXTCTR 7
ROUND 9
vmovdqu TMP5, XMMWORD PTR[10*16 + KS]
cmp NR, 10
je @f

ROUND 10
ROUND 11
vmovdqu TMP5, XMMWORD PTR[12*16 + KS]
cmp NR, 12
je @f

ROUND 12
ROUND 13
vmovdqu TMP5, XMMWORD PTR[14*16 + KS]
@@:
vpxor TMP3, TMP5, XMMWORD PTR[0*16 + PT]
vaesenclast CTR0, CTR0, TMP3
vpxor TMP3, TMP5, XMMWORD PTR[1*16 + PT]
vaesenclast CTR1, CTR1, TMP3
vpxor TMP3, TMP5, XMMWORD PTR[2*16 + PT]
vaesenclast CTR2, CTR2, TMP3
vpxor TMP3, TMP5, XMMWORD PTR[3*16 + PT]
vaesenclast CTR3, CTR3, TMP3
vpxor TMP3, TMP5, XMMWORD PTR[4*16 + PT]
vaesenclast CTR4, CTR4, TMP3
vpxor TMP3, TMP5, XMMWORD PTR[5*16 + PT]
vaesenclast CTR5, CTR5, TMP3
vpxor TMP3, TMP5, XMMWORD PTR[6*16 + PT]
vaesenclast CTR6, CTR6, TMP3
vpxor TMP3, TMP5, XMMWORD PTR[7*16 + PT]
vaesenclast CTR7, CTR7, TMP3
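; The plaintext is xored into the last round key before vaesenclast, so
; AES finalization and the CTR keystream xor collapse into a single
; instruction per block.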

vmovdqu XMMWORD PTR[0*16 + CT], CTR0
vpshufb CTR0, CTR0, BSWAPMASK
vmovdqu XMMWORD PTR[1*16 + CT], CTR1
vpshufb CTR1, CTR1, BSWAPMASK
vmovdqu XMMWORD PTR[2*16 + CT], CTR2
vpshufb CTR2, CTR2, BSWAPMASK
vmovdqu XMMWORD PTR[3*16 + CT], CTR3
vpshufb CTR3, CTR3, BSWAPMASK
vmovdqu XMMWORD PTR[4*16 + CT], CTR4
vpshufb CTR4, CTR4, BSWAPMASK
vmovdqu XMMWORD PTR[5*16 + CT], CTR5
vpshufb CTR5, CTR5, BSWAPMASK
vmovdqu XMMWORD PTR[6*16 + CT], CTR6
vpshufb CTR6, CTR6, BSWAPMASK
vmovdqu XMMWORD PTR[7*16 + CT], CTR7
vpshufb TMP5, CTR7, BSWAPMASK

vmovdqa XMMWORD PTR[1*16 + rsp], CTR6
vmovdqa XMMWORD PTR[2*16 + rsp], CTR5
vmovdqa XMMWORD PTR[3*16 + rsp], CTR4
vmovdqa XMMWORD PTR[4*16 + rsp], CTR3
vmovdqa XMMWORD PTR[5*16 + rsp], CTR2
vmovdqa XMMWORD PTR[6*16 + rsp], CTR1
vmovdqa XMMWORD PTR[7*16 + rsp], CTR0

lea CT, [8*16 + CT]
lea PT, [8*16 + PT]
jmp LEncDataOctets

LEncDataOctets:
cmp len, 128
jb LEndEncOctets
sub len, 128

vmovdqa CTR0, XMMWORD PTR[8*16 + 0*16 + rsp]
vmovdqa CTR1, XMMWORD PTR[8*16 + 1*16 + rsp]
vmovdqa CTR2, XMMWORD PTR[8*16 + 2*16 + rsp]
vmovdqa CTR3, XMMWORD PTR[8*16 + 3*16 + rsp]
vmovdqa CTR4, XMMWORD PTR[8*16 + 4*16 + rsp]
vmovdqa CTR5, XMMWORD PTR[8*16 + 5*16 + rsp]
vmovdqa CTR6, XMMWORD PTR[8*16 + 6*16 + rsp]
vmovdqa CTR7, XMMWORD PTR[8*16 + 7*16 + rsp]

vpshufd TMP4, TMP5, 78
vpxor TMP4, TMP4, TMP5
vpclmulqdq TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h
vmovdqu TMP4, XMMWORD PTR[0*16 + Htbl]
vpclmulqdq TMP1, TMP5, TMP4, 011h
vpclmulqdq TMP2, TMP5, TMP4, 000h

vmovdqu TMP5, XMMWORD PTR[1*16 + rsp]
ROUNDMUL 1
NEXTCTR 0
vmovdqu TMP5, XMMWORD PTR[2*16 + rsp]
ROUNDMUL 2
NEXTCTR 1
vmovdqu TMP5, XMMWORD PTR[3*16 + rsp]
ROUNDMUL 3
NEXTCTR 2
vmovdqu TMP5, XMMWORD PTR[4*16 + rsp]
ROUNDMUL 4
NEXTCTR 3
vmovdqu TMP5, XMMWORD PTR[5*16 + rsp]
ROUNDMUL 5
NEXTCTR 4
vmovdqu TMP5, XMMWORD PTR[6*16 + rsp]
ROUNDMUL 6
NEXTCTR 5
vpxor TMP5, T, XMMWORD PTR[7*16 + rsp]
ROUNDMUL 7
NEXTCTR 6

ROUND 8
NEXTCTR 7

vpxor TMP0, TMP0, TMP1
vpxor TMP0, TMP0, TMP2
vpsrldq TMP3, TMP0, 8
vpxor TMP4, TMP1, TMP3
vpslldq TMP3, TMP0, 8
vpxor T, TMP2, TMP3

vpclmulqdq TMP1, T, XMMWORD PTR[Lpoly], 010h
vpalignr T,T,T,8
vpxor T, T, TMP1

ROUND 9

vpclmulqdq TMP1, T, XMMWORD PTR[Lpoly], 010h
vpalignr T,T,T,8
vpxor T, T, TMP1

vmovdqu TMP5, XMMWORD PTR[10*16 + KS]
cmp NR, 10
je @f

ROUND 10
ROUND 11
vmovdqu TMP5, XMMWORD PTR[12*16 + KS]
cmp NR, 12
je @f

ROUND 12
ROUND 13
vmovdqu TMP5, XMMWORD PTR[14*16 + KS]
@@:
vpxor TMP3, TMP5, XMMWORD PTR[0*16 + PT]
vaesenclast CTR0, CTR0, TMP3
vpxor TMP3, TMP5, XMMWORD PTR[1*16 + PT]
vaesenclast CTR1, CTR1, TMP3
vpxor TMP3, TMP5, XMMWORD PTR[2*16 + PT]
vaesenclast CTR2, CTR2, TMP3
vpxor TMP3, TMP5, XMMWORD PTR[3*16 + PT]
vaesenclast CTR3, CTR3, TMP3
vpxor TMP3, TMP5, XMMWORD PTR[4*16 + PT]
vaesenclast CTR4, CTR4, TMP3
vpxor TMP3, TMP5, XMMWORD PTR[5*16 + PT]
vaesenclast CTR5, CTR5, TMP3
vpxor TMP3, TMP5, XMMWORD PTR[6*16 + PT]
vaesenclast CTR6, CTR6, TMP3
vpxor TMP3, TMP5, XMMWORD PTR[7*16 + PT]
vaesenclast CTR7, CTR7, TMP3

vmovdqu XMMWORD PTR[0*16 + CT], CTR0
vpshufb CTR0, CTR0, BSWAPMASK
vmovdqu XMMWORD PTR[1*16 + CT], CTR1
vpshufb CTR1, CTR1, BSWAPMASK
vmovdqu XMMWORD PTR[2*16 + CT], CTR2
vpshufb CTR2, CTR2, BSWAPMASK
vmovdqu XMMWORD PTR[3*16 + CT], CTR3
vpshufb CTR3, CTR3, BSWAPMASK
vmovdqu XMMWORD PTR[4*16 + CT], CTR4
vpshufb CTR4, CTR4, BSWAPMASK
vmovdqu XMMWORD PTR[5*16 + CT], CTR5
vpshufb CTR5, CTR5, BSWAPMASK
vmovdqu XMMWORD PTR[6*16 + CT], CTR6
vpshufb CTR6, CTR6, BSWAPMASK
vmovdqu XMMWORD PTR[7*16 + CT], CTR7
vpshufb TMP5, CTR7, BSWAPMASK

vmovdqa XMMWORD PTR[1*16 + rsp], CTR6
vmovdqa XMMWORD PTR[2*16 + rsp], CTR5
vmovdqa XMMWORD PTR[3*16 + rsp], CTR4
vmovdqa XMMWORD PTR[4*16 + rsp], CTR3
vmovdqa XMMWORD PTR[5*16 + rsp], CTR2
vmovdqa XMMWORD PTR[6*16 + rsp], CTR1
vmovdqa XMMWORD PTR[7*16 + rsp], CTR0

vpxor T, T, TMP4

lea CT, [8*16 + CT]
lea PT, [8*16 + PT]
jmp LEncDataOctets

LEndEncOctets:

vpshufd TMP4, TMP5, 78
vpxor TMP4, TMP4, TMP5
vpclmulqdq TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h
vmovdqu TMP4, XMMWORD PTR[0*16 + Htbl]
vpclmulqdq TMP1, TMP5, TMP4, 011h
vpclmulqdq TMP2, TMP5, TMP4, 000h

vmovdqu TMP5, XMMWORD PTR[1*16 + rsp]
KARATSUBA 1
vmovdqu TMP5, XMMWORD PTR[2*16 + rsp]
KARATSUBA 2
vmovdqu TMP5, XMMWORD PTR[3*16 + rsp]
KARATSUBA 3
vmovdqu TMP5, XMMWORD PTR[4*16 + rsp]
KARATSUBA 4
vmovdqu TMP5, XMMWORD PTR[5*16 + rsp]
KARATSUBA 5
vmovdqu TMP5, XMMWORD PTR[6*16 + rsp]
KARATSUBA 6
vpxor TMP5, T, XMMWORD PTR[7*16 + rsp]
KARATSUBA 7

vpxor TMP0, TMP0, TMP1
vpxor TMP0, TMP0, TMP2
vpsrldq TMP3, TMP0, 8
vpxor TMP4, TMP1, TMP3
vpslldq TMP3, TMP0, 8
vpxor T, TMP2, TMP3

vpclmulqdq TMP1, T, XMMWORD PTR[Lpoly], 010h
vpalignr T,T,T,8
vpxor T, T, TMP1

vpclmulqdq TMP1, T, XMMWORD PTR[Lpoly], 010h
vpalignr T,T,T,8
vpxor T, T, TMP1

vpxor T, T, TMP4

sub aluCTR, 7

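; Singles loop: any remaining whole blocks are processed one at a time;
; each iteration encrypts one stacked counter block, xors in 16 bytes of
; plaintext, and folds the ciphertext into T with a full (reduced) GFMUL.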
LEncDataSingles:

cmp len, 16
jb LEncDataTail
sub len, 16

vmovdqa TMP1, XMMWORD PTR[8*16 + 0*16 + rsp]
NEXTCTR 0

vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
cmp NR, 10
je @f
vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
cmp NR, 12
je @f
vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
@@:
vaesenclast TMP1, TMP1, TMP2
vpxor TMP1, TMP1, XMMWORD PTR[PT]
vmovdqu XMMWORD PTR[CT], TMP1

lea PT, [16+PT]
lea CT, [16+CT]

vpshufb TMP1, TMP1, BSWAPMASK
vpxor T, T, TMP1
vmovdqu TMP0, XMMWORD PTR[Htbl]
GFMUL T, T, TMP0, TMP1, TMP2, TMP3, TMP4

jmp LEncDataSingles

LEncDataTail:

test len, len
jz LEncDataEnd

vmovdqa TMP1, XMMWORD PTR[8*16 + 0*16 + rsp]

vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
cmp NR, 10
je @f
vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
cmp NR, 12
je @f
vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
@@:
vaesenclast TMP1, TMP1, TMP2
; zero a temp location
vpxor TMP2, TMP2, TMP2
vmovdqa XMMWORD PTR[rsp], TMP2
; copy as many bytes as needed
xor KS, KS

@@:
cmp len, KS
je @f
mov al, [PT + KS]
mov [rsp + KS], al
inc KS
jmp @b
@@:
vpxor TMP1, TMP1, XMMWORD PTR[rsp]
vmovdqa XMMWORD PTR[rsp], TMP1
xor KS, KS
@@:
cmp len, KS
je @f
mov al, [rsp + KS]
mov [CT + KS], al
inc KS
jmp @b
@@:
cmp KS, 16
je @f
mov BYTE PTR[rsp + KS], 0
inc KS
jmp @b
@@:
BAIL:
vmovdqa TMP1, XMMWORD PTR[rsp]
vpshufb TMP1, TMP1, BSWAPMASK
vpxor T, T, TMP1
vmovdqu TMP0, XMMWORD PTR[Htbl]
GFMUL T, T, TMP0, TMP1, TMP2, TMP3, TMP4

LEncDataEnd:

vmovdqu XMMWORD PTR[16*16 + 1*16 + Gctx], T
bswap aluCTR
mov [16*16 + 2*16 + 3*4 + Gctx], aluCTR

mov rsp, rbp

vmovdqu xmm6, XMMWORD PTR[rsp + 0*16]
vmovdqu xmm7, XMMWORD PTR[rsp + 1*16]
vmovdqu xmm8, XMMWORD PTR[rsp + 2*16]
vmovdqu xmm9, XMMWORD PTR[rsp + 3*16]
vmovdqu xmm10, XMMWORD PTR[rsp + 4*16]
vmovdqu xmm11, XMMWORD PTR[rsp + 5*16]
vmovdqu xmm12, XMMWORD PTR[rsp + 6*16]
vmovdqu xmm13, XMMWORD PTR[rsp + 7*16]
vmovdqu xmm14, XMMWORD PTR[rsp + 8*16]
vmovdqu xmm15, XMMWORD PTR[rsp + 9*16]

add rsp, 10*16
pop rbp
pop r13
pop r12
pop r11

vzeroupper

ret
intel_aes_gcmENC ENDP

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Decrypt and Authenticate
; void intel_aes_gcmDEC(uint8_t* PT, uint8_t* CT, void *Gctx, unsigned int len);
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

ALIGN 16
intel_aes_gcmDEC PROC

NEXTCTR MACRO i
add aluCTR, 1
mov aluTMP, aluCTR
xor aluTMP, aluKSl
bswap aluTMP
mov [3*4 + i*16 + rsp], aluTMP
ENDM
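; The decrypt path redefines NEXTCTR (the counter blocks live at
; [rsp + 0*16] here, without the 8*16 bias used by ENC) and swaps the
; PT/CT register assignments so the shared code shape reads ciphertext
; and writes plaintext.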

PT textequ <rdx>
CT textequ <rcx>

test len, len
jnz LbeginDEC
ret

LbeginDEC:

vzeroupper
push r11
push r12
push r13
push rbp
sub rsp, 10*16
vmovdqu XMMWORD PTR[rsp + 0*16], xmm6
vmovdqu XMMWORD PTR[rsp + 1*16], xmm7
vmovdqu XMMWORD PTR[rsp + 2*16], xmm8
vmovdqu XMMWORD PTR[rsp + 3*16], xmm9
vmovdqu XMMWORD PTR[rsp + 4*16], xmm10
vmovdqu XMMWORD PTR[rsp + 5*16], xmm11
vmovdqu XMMWORD PTR[rsp + 6*16], xmm12
vmovdqu XMMWORD PTR[rsp + 7*16], xmm13
vmovdqu XMMWORD PTR[rsp + 8*16], xmm14
vmovdqu XMMWORD PTR[rsp + 9*16], xmm15

mov rbp, rsp
sub rsp, 8*16
and rsp, -16

vmovdqu T, XMMWORD PTR[16*16 + 1*16 + Gctx]
vmovdqu CTR0, XMMWORD PTR[16*16 + 2*16 + Gctx]
vmovdqu BSWAPMASK, XMMWORD PTR[Lbswap_mask]
mov KS, [16*16 + 3*16 + Gctx]
mov NR, [4 + KS]
lea KS, [48 + KS]

vpshufb CTR0, CTR0, BSWAPMASK

mov aluCTR, [16*16 + 2*16 + 3*4 + Gctx]
mov aluKSl, [3*4 + KS]
bswap aluCTR
bswap aluKSl

vmovdqu TMP0, XMMWORD PTR[0*16 + KS]
vpxor TMP0, TMP0, XMMWORD PTR[16*16 + 2*16 + Gctx]
vmovdqu XMMWORD PTR[0*16 + rsp], TMP0

cmp len, 128
jb LDecDataSingles
; Prepare the "top" counters
vmovdqu XMMWORD PTR[1*16 + rsp], TMP0
vmovdqu XMMWORD PTR[2*16 + rsp], TMP0
vmovdqu XMMWORD PTR[3*16 + rsp], TMP0
vmovdqu XMMWORD PTR[4*16 + rsp], TMP0
vmovdqu XMMWORD PTR[5*16 + rsp], TMP0
vmovdqu XMMWORD PTR[6*16 + rsp], TMP0
vmovdqu XMMWORD PTR[7*16 + rsp], TMP0

NEXTCTR 1
NEXTCTR 2
NEXTCTR 3
NEXTCTR 4
NEXTCTR 5
NEXTCTR 6
NEXTCTR 7

LDecDataOctets:
cmp len, 128
jb LEndDecOctets
sub len, 128

vmovdqa CTR0, XMMWORD PTR[0*16 + rsp]
vmovdqa CTR1, XMMWORD PTR[1*16 + rsp]
vmovdqa CTR2, XMMWORD PTR[2*16 + rsp]
vmovdqa CTR3, XMMWORD PTR[3*16 + rsp]
vmovdqa CTR4, XMMWORD PTR[4*16 + rsp]
vmovdqa CTR5, XMMWORD PTR[5*16 + rsp]
vmovdqa CTR6, XMMWORD PTR[6*16 + rsp]
vmovdqa CTR7, XMMWORD PTR[7*16 + rsp]

vmovdqu TMP5, XMMWORD PTR[7*16 + CT]
vpshufb TMP5, TMP5, BSWAPMASK
vpshufd TMP4, TMP5, 78
vpxor TMP4, TMP4, TMP5
vpclmulqdq TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h
vmovdqu TMP4, XMMWORD PTR[0*16 + Htbl]
vpclmulqdq TMP1, TMP5, TMP4, 011h
vpclmulqdq TMP2, TMP5, TMP4, 000h

vmovdqu TMP5, XMMWORD PTR[6*16 + CT]
vpshufb TMP5, TMP5, BSWAPMASK
ROUNDMUL 1
NEXTCTR 0
vmovdqu TMP5, XMMWORD PTR[5*16 + CT]
vpshufb TMP5, TMP5, BSWAPMASK
ROUNDMUL 2
NEXTCTR 1
vmovdqu TMP5, XMMWORD PTR[4*16 + CT]
vpshufb TMP5, TMP5, BSWAPMASK
ROUNDMUL 3
NEXTCTR 2
vmovdqu TMP5, XMMWORD PTR[3*16 + CT]
vpshufb TMP5, TMP5, BSWAPMASK
ROUNDMUL 4
NEXTCTR 3
vmovdqu TMP5, XMMWORD PTR[2*16 + CT]
vpshufb TMP5, TMP5, BSWAPMASK
ROUNDMUL 5
NEXTCTR 4
vmovdqu TMP5, XMMWORD PTR[1*16 + CT]
vpshufb TMP5, TMP5, BSWAPMASK
ROUNDMUL 6
NEXTCTR 5
vmovdqu TMP5, XMMWORD PTR[0*16 + CT]
vpshufb TMP5, TMP5, BSWAPMASK
vpxor TMP5, TMP5, T
ROUNDMUL 7
NEXTCTR 6

ROUND 8
NEXTCTR 7

vpxor TMP0, TMP0, TMP1
vpxor TMP0, TMP0, TMP2
vpsrldq TMP3, TMP0, 8
vpxor TMP4, TMP1, TMP3
vpslldq TMP3, TMP0, 8
vpxor T, TMP2, TMP3

vpclmulqdq TMP1, T, XMMWORD PTR[Lpoly], 010h
vpalignr T,T,T,8
vpxor T, T, TMP1

ROUND 9

vpclmulqdq TMP1, T, XMMWORD PTR[Lpoly], 010h
vpalignr T,T,T,8
vpxor T, T, TMP1

vmovdqu TMP5, XMMWORD PTR[10*16 + KS]
cmp NR, 10
je @f

ROUND 10
ROUND 11
vmovdqu TMP5, XMMWORD PTR[12*16 + KS]
cmp NR, 12
je @f

ROUND 12
ROUND 13
vmovdqu TMP5, XMMWORD PTR[14*16 + KS]
@@:
vpxor TMP3, TMP5, XMMWORD PTR[0*16 + CT]
vaesenclast CTR0, CTR0, TMP3
vpxor TMP3, TMP5, XMMWORD PTR[1*16 + CT]
vaesenclast CTR1, CTR1, TMP3
vpxor TMP3, TMP5, XMMWORD PTR[2*16 + CT]
vaesenclast CTR2, CTR2, TMP3
vpxor TMP3, TMP5, XMMWORD PTR[3*16 + CT]
vaesenclast CTR3, CTR3, TMP3
vpxor TMP3, TMP5, XMMWORD PTR[4*16 + CT]
vaesenclast CTR4, CTR4, TMP3
vpxor TMP3, TMP5, XMMWORD PTR[5*16 + CT]
vaesenclast CTR5, CTR5, TMP3
vpxor TMP3, TMP5, XMMWORD PTR[6*16 + CT]
vaesenclast CTR6, CTR6, TMP3
vpxor TMP3, TMP5, XMMWORD PTR[7*16 + CT]
vaesenclast CTR7, CTR7, TMP3

vmovdqu XMMWORD PTR[0*16 + PT], CTR0
vmovdqu XMMWORD PTR[1*16 + PT], CTR1
vmovdqu XMMWORD PTR[2*16 + PT], CTR2
vmovdqu XMMWORD PTR[3*16 + PT], CTR3
vmovdqu XMMWORD PTR[4*16 + PT], CTR4
vmovdqu XMMWORD PTR[5*16 + PT], CTR5
vmovdqu XMMWORD PTR[6*16 + PT], CTR6
vmovdqu XMMWORD PTR[7*16 + PT], CTR7

vpxor T, T, TMP4

lea CT, [8*16 + CT]
lea PT, [8*16 + PT]
jmp LDecDataOctets

LEndDecOctets:

sub aluCTR, 7

LDecDataSingles:

cmp len, 16
jb LDecDataTail
sub len, 16

vmovdqa TMP1, XMMWORD PTR[0*16 + rsp]
NEXTCTR 0

vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
cmp NR, 10
je @f
vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
cmp NR, 12
je @f
vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
@@:
vaesenclast TMP1, TMP1, TMP2

vmovdqu TMP2, XMMWORD PTR[CT]
vpxor TMP1, TMP1, TMP2
vmovdqu XMMWORD PTR[PT], TMP1

lea PT, [16+PT]
lea CT, [16+CT]

vpshufb TMP2, TMP2, BSWAPMASK
vpxor T, T, TMP2
vmovdqu TMP0, XMMWORD PTR[Htbl]
GFMUL T, T, TMP0, TMP1, TMP2, TMP3, TMP4

jmp LDecDataSingles

LDecDataTail:

test len, len
jz LDecDataEnd

vmovdqa TMP1, XMMWORD PTR[0*16 + rsp]
inc aluCTR
vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
cmp NR, 10
je @f
vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
cmp NR, 12
je @f
vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
@@:
vaesenclast TMP1, TMP1, TMP2
; copy as many bytes as needed
xor KS, KS
@@:
cmp len, KS
je @f
mov al, [CT + KS]
mov [rsp + KS], al
inc KS
jmp @b
@@:
cmp KS, 16
je @f
mov BYTE PTR[rsp + KS], 0
inc KS
jmp @b
@@:
vmovdqa TMP2, XMMWORD PTR[rsp]
vpshufb TMP2, TMP2, BSWAPMASK
vpxor T, T, TMP2
vmovdqu TMP0, XMMWORD PTR[Htbl]
GFMUL T, T, TMP0, TMP5, TMP2, TMP3, TMP4
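; Note TMP5 is passed as the first scratch register here instead of
; TMP1: TMP1 still holds the keystream block, which is consumed right
; below to decrypt the tail bytes.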


vpxor TMP1, TMP1, XMMWORD PTR[rsp]
vmovdqa XMMWORD PTR[rsp], TMP1
xor KS, KS
@@:
cmp len, KS
je @f
mov al, [rsp + KS]
mov [PT + KS], al
inc KS
jmp @b
@@:

LDecDataEnd:

vmovdqu XMMWORD PTR[16*16 + 1*16 + Gctx], T
bswap aluCTR
mov [16*16 + 2*16 + 3*4 + Gctx], aluCTR

mov rsp, rbp

vmovdqu xmm6, XMMWORD PTR[rsp + 0*16]
vmovdqu xmm7, XMMWORD PTR[rsp + 1*16]
vmovdqu xmm8, XMMWORD PTR[rsp + 2*16]
vmovdqu xmm9, XMMWORD PTR[rsp + 3*16]
vmovdqu xmm10, XMMWORD PTR[rsp + 4*16]
vmovdqu xmm11, XMMWORD PTR[rsp + 5*16]
vmovdqu xmm12, XMMWORD PTR[rsp + 6*16]
vmovdqu xmm13, XMMWORD PTR[rsp + 7*16]
vmovdqu xmm14, XMMWORD PTR[rsp + 8*16]
vmovdqu xmm15, XMMWORD PTR[rsp + 9*16]

add rsp, 10*16
pop rbp
pop r13
pop r12
pop r11

vzeroupper

ret
intel_aes_gcmDEC ENDP


END