comparison nss/lib/freebl/intel-gcm-x86-masm.asm @ 0:1e5118fa0cb1

This is NSS with a CMake buildsystem. To compile a static NSS library for Windows, we used the Chromium-NSS fork and added a CMake buildsystem so it can be compiled statically for Windows. See README.chromium for the Chromium changes and README.trustbridge for our modifications.
author Andre Heinecke <andre.heinecke@intevation.de>
date Mon, 28 Jul 2014 10:47:06 +0200
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1e5118fa0cb1
1 ; LICENSE:
2 ; This submission to NSS is to be made available under the terms of the
3 ; Mozilla Public License, v. 2.0. You can obtain one at http:
4 ; //mozilla.org/MPL/2.0/.
5 ;###############################################################################
6 ; Copyright(c) 2014, Intel Corp.
7 ; Developers and authors:
8 ; Shay Gueron and Vlad Krasnov
9 ; Intel Corporation, Israel Development Centre, Haifa, Israel
10 ; Please send feedback directly to crypto.feedback.alias@intel.com
11
12
13 .MODEL FLAT, C               ; 32-bit flat model, C (cdecl) calling convention
14 .XMM                         ; enable SSE/AVX mnemonics
15
16 .DATA
17 ALIGN 16                     ; all constants below are fetched as 16-byte XMM operands
18 Lone dq 1,0                  ; 128-bit constant 1  (counter increment)
19 Ltwo dq 2,0                  ; 128-bit constant 2  (counter increment by two)
20 Lbswap_mask db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0  ; vpshufb mask: reverse the byte order of an XMM register
21 Lshuff_mask dq 0f0f0f0f0f0f0f0fh, 0f0f0f0f0f0f0f0fh   ; nibble mask; apparently unreferenced in this file (kept for parity with other ports)
22 Lpoly dq 01h, 0c200000000000000h                      ; GHASH reduction constant for the bit-reflected polynomial x^128+x^7+x^2+x+1
23
24 .CODE
25
26
27 GFMUL MACRO DST, SRC1, SRC2, TMP1, TMP2, TMP3, TMP4  ; DST = SRC1 * SRC2 in GF(2^128): Karatsuba multiply + 2-step reduction
28 vpclmulqdq TMP1, SRC2, SRC1, 0h      ; TMP1 = a0*b0 (low 64x64 product)
29 vpclmulqdq TMP4, SRC2, SRC1, 011h    ; TMP4 = a1*b1 (high 64x64 product)
30
31 vpshufd TMP2, SRC2, 78               ; 78 = 01001110b: swap the two qwords
32 vpshufd TMP3, SRC1, 78
33 vpxor TMP2, TMP2, SRC2               ; TMP2 = (b0^b1 : b0^b1)
34 vpxor TMP3, TMP3, SRC1               ; TMP3 = (a0^a1 : a0^a1)
35
36 vpclmulqdq TMP2, TMP2, TMP3, 0h      ; middle Karatsuba product (a0^a1)*(b0^b1)
37 vpxor TMP2, TMP2, TMP1               ; subtract (xor) low ...
38 vpxor TMP2, TMP2, TMP4               ; ... and high -> true middle term
39
40 vpslldq TMP3, TMP2, 8                ; split the middle term across the 128-bit halves
41 vpsrldq TMP2, TMP2, 8
42
43 vpxor TMP1, TMP1, TMP3               ; TMP1 = low  128 bits of the 256-bit product
44 vpxor TMP4, TMP4, TMP2               ; TMP4 = high 128 bits
45
46 vpclmulqdq TMP2, TMP1, [Lpoly], 010h ; reduction round 1: fold low qword through the polynomial
47 vpshufd TMP3, TMP1, 78
48 vpxor TMP1, TMP2, TMP3
49
50 vpclmulqdq TMP2, TMP1, [Lpoly], 010h ; reduction round 2
51 vpshufd TMP3, TMP1, 78
52 vpxor TMP1, TMP2, TMP3
53
54 vpxor DST, TMP1, TMP4                ; combine reduced low half with the high half
55
56 ENDM
57
58 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
59 ;
60 ; Generates the final GCM tag
61 ; void intel_aes_gcmTAG(unsigned char Htbl[16*16],
62 ; unsigned char *Tp,
63 ; unsigned int Mlen,
64 ; unsigned int Alen,
65 ; unsigned char* X0,
66 ; unsigned char* TAG);
67 ;
68 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
69
70 ALIGN 16
71 intel_aes_gcmTAG PROC
; Produces the final GCM tag: T = GHASH(T ^ lenblock); *TAG = byteswap(T) ^ *X0.
; cdecl; arg offsets below are esp + 2*4 (return address + saved ebx):
; Htbl, Tp (running GHASH state), Mlen, Alen, X0 (= E(K,Y0)), TAG (out).
72
73 Htbl textequ <eax>
74 Tp textequ <ecx>
75 X0 textequ <edx>
76 TAG textequ <ebx>
77
78 T textequ <xmm0>
79 TMP0 textequ <xmm1>
80
81 push ebx                     ; ebx is callee-saved under cdecl
82
83 mov Htbl, [esp + 2*4 + 0*4]
84 mov Tp, [esp + 2*4 + 1*4]
85 mov X0, [esp + 2*4 + 4*4]
86 mov TAG, [esp + 2*4 + 5*4]
87
88 vzeroupper
89 vmovdqu T, XMMWORD PTR[Tp]   ; current GHASH accumulator
90
91 vpxor TMP0, TMP0, TMP0       ; build the GCM length block
92 vpinsrd TMP0, TMP0, DWORD PTR[esp + 2*4 + 2*4], 0  ; dword 0 = Mlen
93 vpinsrd TMP0, TMP0, DWORD PTR[esp + 2*4 + 3*4], 2  ; dword 2 = Alen
94 vpsllq TMP0, TMP0, 3         ; byte counts -> bit counts
95
96 vpxor T, T, TMP0             ; fold length block into the state
97 vmovdqu TMP0, XMMWORD PTR[Htbl]  ; H = first table entry
98 GFMUL T, T, TMP0, xmm2, xmm3, xmm4, xmm5
99
100 vpshufb T, T, [Lbswap_mask] ; back from reflected to byte order
101 vpxor T, T, [X0]            ; xor with the encrypted initial counter
102 vmovdqu XMMWORD PTR[TAG], T
103 vzeroupper
104
105 pop ebx
106
107 ret
108
109 intel_aes_gcmTAG ENDP
110
111 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
112 ;
113 ; Generates the H table
114 ; void intel_aes_gcmINIT(unsigned char Htbl[16*16], unsigned char *KS, int NR);
115 ;
116 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
117
118 ALIGN 16
119 intel_aes_gcmINIT PROC
; Builds the GHASH key table. H = AES_ENC_K(0^128) is computed from the key
; schedule KS with NR rounds, byte-reflected, then doubled in GF(2^128).
; Htbl[0..7] receive H^1..H^8; Htbl[8..15] receive the matching Karatsuba
; halves (hi^lo) used by the bulk-hash loops.
120
121 Htbl textequ <eax>
122 KS textequ <ecx>
123 NR textequ <edx>
124
125 T textequ <xmm0>
126 TMP0 textequ <xmm1>
127
128 mov Htbl, [esp + 4*1 + 0*4]  ; 4*1 = return address only (no pushes)
129 mov KS, [esp + 4*1 + 1*4]
130 mov NR, [esp + 4*1 + 2*4]
131
132 vzeroupper
133 ; AES-ENC(0)
134 vmovdqu T, XMMWORD PTR[KS]   ; round-0 key: T = 0 ^ K0
135 lea KS, [16 + KS]
136 dec NR                       ; NR-1 middle rounds; last round done separately
137 Lenc_loop:
138 vaesenc T, T, [KS]
139 lea KS, [16 + KS]
140 dec NR
141 jnz Lenc_loop
142
143 vaesenclast T, T, [KS]
144 vpshufb T, T, [Lbswap_mask]  ; keep H in reflected order for GHASH math
145
146 ;Calculate H` = GFMUL(H, 2)
147 vpsrad xmm3, T, 31           ; per-dword arithmetic shift -> sign masks
148 vpshufd xmm3, xmm3, 0ffh     ; broadcast the sign of H's top bit
149 vpand xmm5, xmm3, [Lpoly]    ; conditional reduction term (poly if top bit set)
150 vpsrld xmm3, T, 31           ; carries out of each dword
151 vpslld xmm4, T, 1            ; per-dword shift left by one
152 vpslldq xmm3, xmm3, 4        ; move carries up one dword
153 vpxor T, xmm4, xmm3          ; T = H << 1 (128-bit)
154 vpxor T, T, xmm5             ; fold polynomial back in
155
156 vmovdqu TMP0, T              ; TMP0 = H, the multiplier for building powers
157 vmovdqu XMMWORD PTR[Htbl + 0*16], T
158
159 vpshufd xmm2, T, 78
160 vpxor xmm2, xmm2, T          ; hi^lo Karatsuba half of H
161 vmovdqu XMMWORD PTR[Htbl + 8*16 + 0*16], xmm2
162
163 i = 1
164 WHILE i LT 8                 ; assembly-time unroll: emit H^2..H^8
165 GFMUL T, T, TMP0, xmm2, xmm3, xmm4, xmm5
166 vmovdqu XMMWORD PTR[Htbl + i*16], T
167 vpshufd xmm2, T, 78
168 vpxor xmm2, xmm2, T
169 vmovdqu XMMWORD PTR[Htbl + 8*16 + i*16], xmm2
170 i = i+1
171 ENDM
172 vzeroupper
173 ret
174 intel_aes_gcmINIT ENDP
175
176
177 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
178 ;
179 ; Authenticate only
180 ; void intel_aes_gcmAAD(unsigned char Htbl[16*16], unsigned char *AAD, unsigned int Alen, unsigned char *Tp);
181 ;
182 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
183
184 ALIGN 16
185 intel_aes_gcmAAD PROC
; GHASH-only pass over the additional authenticated data.
; cdecl args: Htbl, AAD pointer, Alen (bytes, multiple of 16), Tp (state in/out).
; Blocks are aggregated 8 at a time with deferred reduction; a prefix of
; (Alen/16 mod 8) blocks is hashed first so the main loop always sees full
; 8-block groups. Block i of a group is paired with H^(count-i) so one
; reduction covers the whole group.
186
187 Htbl textequ <eax>
188 inp textequ <ecx>
189 len textequ <edx>
190 Tp textequ <ebx>
191 hlp0 textequ <esi>
192
193 DATA textequ <xmm0>
194 T textequ <xmm1>
195 TMP0 textequ <xmm2>
196 TMP1 textequ <xmm3>
197 TMP2 textequ <xmm4>
198 TMP3 textequ <xmm5>
199 TMP4 textequ <xmm6>
200 Xhi textequ <xmm7>
201
202 KARATSUBA_AAD MACRO i        ; accumulate DATA * Htbl[i] into TMP0 (low) / TMP1 (high) / TMP2 (middle)
203 vpclmulqdq TMP3, DATA, [Htbl + i*16], 0h
204 vpxor TMP0, TMP0, TMP3
205 vpclmulqdq TMP3, DATA, [Htbl + i*16], 011h
206 vpxor TMP1, TMP1, TMP3
207 vpshufd TMP3, DATA, 78
208 vpxor TMP3, TMP3, DATA
209 vpclmulqdq TMP3, TMP3, [Htbl + 8*16 + i*16], 0h
210 vpxor TMP2, TMP2, TMP3
211 ENDM
212
213 cmp DWORD PTR[esp + 1*4 + 2*4], 0  ; Alen == 0 -> nothing to hash. FIX: was [esp + 1*3 + 2*4] (esp+11), a misaligned read straddling the AAD pointer and Alen, so the early-out could misfire; 1*4 = return-address slot, 2*4 = third argument, matching the offsets used after the pushes below
214 jnz LbeginAAD
215 ret
216
217 LbeginAAD:
218 push ebx
219 push esi
220
221 mov Htbl, [esp + 4*3 + 0*4]  ; 4*3 = return address + saved ebx + saved esi
222 mov inp, [esp + 4*3 + 1*4]
223 mov len, [esp + 4*3 + 2*4]
224 mov Tp, [esp + 4*3 + 3*4]
225
226 vzeroupper
227
228 vpxor Xhi, Xhi, Xhi          ; unreduced high half of the running product
229
230 vmovdqu T, XMMWORD PTR[Tp]
231 ;we hash 8 block each iteration, if the total amount of blocks is not a multiple of 8, we hash the first n%8 blocks first
232 mov hlp0, len
233 and hlp0, 128-1              ; hlp0 = prefix bytes (len mod 128)
234 jz Lmod_loop
235
236 and len, -128                ; len = bytes in full 8-block groups
237 sub hlp0, 16                 ; hlp0 also serves as the Htbl byte index (counts down)
238
239 ; Prefix block
240 vmovdqu DATA, XMMWORD PTR[inp]
241 vpshufb DATA, DATA, [Lbswap_mask]
242 vpxor DATA, DATA, T          ; fold the current state into the first block
243
244 vpclmulqdq TMP0, DATA, XMMWORD PTR[Htbl + hlp0], 0h
245 vpclmulqdq TMP1, DATA, XMMWORD PTR[Htbl + hlp0], 011h
246 vpshufd TMP3, DATA, 78
247 vpxor TMP3, TMP3, DATA
248 vpclmulqdq TMP2, TMP3, XMMWORD PTR[Htbl + 8*16 + hlp0], 0h
249
250 lea inp, [inp+16]
251 test hlp0, hlp0
252 jnz Lpre_loop
253 jmp Lred1
254
255 ;hash remaining prefix bocks (up to 7 total prefix blocks)
256 Lpre_loop:
257
258 sub hlp0, 16                 ; next (lower) power of H
259
260 vmovdqu DATA, XMMWORD PTR[inp]
261 vpshufb DATA, DATA, [Lbswap_mask]
262
263 vpclmulqdq TMP3, DATA, XMMWORD PTR[Htbl + hlp0], 0h
264 vpxor TMP0, TMP0, TMP3
265 vpclmulqdq TMP3, DATA, XMMWORD PTR[Htbl + hlp0], 011h
266 vpxor TMP1, TMP1, TMP3
267 vpshufd TMP3, DATA, 78
268 vpxor TMP3, TMP3, DATA
269 vpclmulqdq TMP3, TMP3, XMMWORD PTR[Htbl + 8*16 + hlp0], 0h
270 vpxor TMP2, TMP2, TMP3
271
272 test hlp0, hlp0
273 lea inp, [inp+16]
274 jnz Lpre_loop
275
276 Lred1:
277
278 vpxor TMP2, TMP2, TMP0       ; Karatsuba fixup: recover the true middle term
279 vpxor TMP2, TMP2, TMP1
280 vpsrldq TMP3, TMP2, 8
281 vpslldq TMP2, TMP2, 8
282
283 vpxor Xhi, TMP1, TMP3        ; keep high half unreduced in Xhi
284 vpxor T, TMP0, TMP2          ; low half in T; reduction interleaved with the next group
285
286 Lmod_loop:
287
288 sub len, 16*8
289 jb Ldone
290 ; Block #0                   ; blocks are read back-to-front: block at +16*7 pairs with H^1 ... block at +16*0 with H^8
291 vmovdqu DATA, XMMWORD PTR[inp + 16*7]
292 vpshufb DATA, DATA, XMMWORD PTR[Lbswap_mask]
293
294 vpclmulqdq TMP0, DATA, XMMWORD PTR[Htbl + 0*16], 0h
295 vpclmulqdq TMP1, DATA, XMMWORD PTR[Htbl + 0*16], 011h
296 vpshufd TMP3, DATA, 78
297 vpxor TMP3, TMP3, DATA
298 vpclmulqdq TMP2, TMP3, XMMWORD PTR[Htbl + 8*16 + 0*16], 0h
299
300 ; Block #1
301 vmovdqu DATA, XMMWORD PTR[inp + 16*6]
302 vpshufb DATA, DATA, [Lbswap_mask]
303 KARATSUBA_AAD 1
304
305 ; Block #2
306 vmovdqu DATA, XMMWORD PTR[inp + 16*5]
307 vpshufb DATA, DATA, [Lbswap_mask]
308
309 vpclmulqdq TMP4, T, [Lpoly], 010h ;reduction stage 1a
310 vpalignr T, T, T, 8
311
312 KARATSUBA_AAD 2
313
314 vpxor T, T, TMP4 ;reduction stage 1b
315
316 ; Block #3
317 vmovdqu DATA, XMMWORD PTR[inp + 16*4]
318 vpshufb DATA, DATA, [Lbswap_mask]
319 KARATSUBA_AAD 3
320 ; Block #4
321 vmovdqu DATA, XMMWORD PTR[inp + 16*3]
322 vpshufb DATA, DATA, [Lbswap_mask]
323
324 vpclmulqdq TMP4, T, [Lpoly], 010h ;reduction stage 2a
325 vpalignr T, T, T, 8
326
327 KARATSUBA_AAD 4
328
329 vpxor T, T, TMP4 ;reduction stage 2b
330 ; Block #5
331 vmovdqu DATA, XMMWORD PTR[inp + 16*2]
332 vpshufb DATA, DATA, [Lbswap_mask]
333 KARATSUBA_AAD 5
334
335 vpxor T, T, Xhi ;reduction finalize
336 ; Block #6
337 vmovdqu DATA, XMMWORD PTR[inp + 16*1]
338 vpshufb DATA, DATA, [Lbswap_mask]
339 KARATSUBA_AAD 6
340 ; Block #7
341 vmovdqu DATA, XMMWORD PTR[inp + 16*0]
342 vpshufb DATA, DATA, [Lbswap_mask]
343 vpxor DATA, DATA, T          ; fold the (now reduced) state into the H^8 block
344 KARATSUBA_AAD 7
345 ; Aggregated 8 blocks, now karatsuba fixup
346 vpxor TMP2, TMP2, TMP0
347 vpxor TMP2, TMP2, TMP1
348 vpsrldq TMP3, TMP2, 8
349 vpslldq TMP2, TMP2, 8
350
351 vpxor Xhi, TMP1, TMP3
352 vpxor T, TMP0, TMP2
353
354 lea inp, [inp + 16*8]
355 jmp Lmod_loop
356
357 Ldone:                       ; final two reduction rounds, then fold in the high half
358 vpclmulqdq TMP4, T, [Lpoly], 010h
359 vpalignr T, T, T, 8
360 vpxor T, T, TMP4
361
362 vpclmulqdq TMP4, T, [Lpoly], 010h
363 vpalignr T, T, T, 8
364 vpxor T, T, TMP4
365
366 vpxor T, T, Xhi
367 vmovdqu XMMWORD PTR[Tp], T   ; write the updated GHASH state back
368 vzeroupper
369
370 pop esi
371 pop ebx
372 ret
373
374 intel_aes_gcmAAD ENDP
375
376
377 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
378 ;
379 ; Encrypt and Authenticate
380 ; void intel_aes_gcmENC(unsigned char* PT, unsigned char* CT, void *Gctx, unsigned int len);
381 ;
382 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
383
384 ALIGN 16
385 intel_aes_gcmENC PROC
; Counter-mode encrypt + GHASH, pipelined 7 blocks per iteration: each
; LEncData7 pass GHASHes the ciphertext of the PREVIOUS 7 blocks while
; encrypting the next 7 (crypt-ahead), so the last batch is hashed at LEndEnc7.
; cdecl args: PT (plaintext in), CT (ciphertext out), Gctx, len (bytes).
; Gctx layout as used here: Htbl at +0, GHASH state T at +16*16+1*16,
; counter block at +16*16+2*16, pointer to the cipher context at +16*16+3*16.
; Stack frame (16-aligned): esp+1*16..7*16 = byte-swapped ciphertexts pending
; GHASH; esp+8*16..14*16 = counter blocks pre-xored with the round-0 key;
; esp+0*16 = scratch for the partial tail block.
386
387 PT textequ <eax>
388 CT textequ <ecx>
389 Htbl textequ <edx>             ; Htbl aliases Gctx: the table sits at offset 0 of Gctx
390 Gctx textequ <edx>
391 len textequ <DWORD PTR[ebp + 5*4 + 3*4]>   ; 5*4 = return address + 4 saved registers
392 KS textequ <esi>
393 NR textequ <DWORD PTR[-40 + KS]>           ; round count stored below the round keys (assumes the +44 context layout — TODO confirm against the C struct)
394
395 aluCTR textequ <ebx>           ; low counter dword, kept in native (incrementable) order
396 aluTMP textequ <edi>
397
398 T textequ <XMMWORD PTR[16*16 + 1*16 + Gctx]>  ; GHASH accumulator lives in Gctx, not a register
399 TMP0 textequ <xmm1>
400 TMP1 textequ <xmm2>
401 TMP2 textequ <xmm3>
402 TMP3 textequ <xmm4>
403 TMP4 textequ <xmm5>
404 TMP5 textequ <xmm6>
405
406 CTR0 textequ <xmm0>
407 CTR1 textequ <xmm1>
408 CTR2 textequ <xmm2>
409 CTR3 textequ <xmm3>
410 CTR4 textequ <xmm4>
411 CTR5 textequ <xmm5>
412 CTR6 textequ <xmm6>
413
414 ROUND MACRO i                 ; one AES round applied to all 7 counter blocks
415 vmovdqu xmm7, XMMWORD PTR[i*16 + KS]
416 vaesenc CTR0, CTR0, xmm7
417 vaesenc CTR1, CTR1, xmm7
418 vaesenc CTR2, CTR2, xmm7
419 vaesenc CTR3, CTR3, xmm7
420 vaesenc CTR4, CTR4, xmm7
421 vaesenc CTR5, CTR5, xmm7
422 vaesenc CTR6, CTR6, xmm7
423 ENDM
424
425 KARATSUBA MACRO i             ; accumulate TMP5 * Htbl[i] into TMP0 (mid) / TMP1 (high) / TMP2 (low)
426 vpshufd TMP4, TMP5, 78
427 vpxor TMP4, TMP4, TMP5
428 vpclmulqdq TMP3, TMP4, XMMWORD PTR[i*16 + 8*16 + Htbl], 000h
429 vpxor TMP0, TMP0, TMP3
430 vmovdqu TMP4, XMMWORD PTR[i*16 + Htbl]
431 vpclmulqdq TMP3, TMP5, TMP4, 011h
432 vpxor TMP1, TMP1, TMP3
433 vpclmulqdq TMP3, TMP5, TMP4, 000h
434 vpxor TMP2, TMP2, TMP3
435 ENDM
436
437 NEXTCTR MACRO i               ; bump the counter and refresh slot i: stored byte-swapped and pre-xored with the round-0 key dword
438 add aluCTR, 1
439 mov aluTMP, aluCTR
440 bswap aluTMP
441 xor aluTMP, [3*4 + KS]
442 mov [3*4 + 8*16 + i*16 + esp], aluTMP
443 ENDM
444
445 cmp DWORD PTR[1*4 + 3*4 + esp], 0    ; len == 0 -> nothing to do (ret addr + 3 args)
446 jne LbeginENC
447 ret
448
449 LbeginENC:
450
451 vzeroupper
452 push ebp
453 push ebx
454 push esi
455 push edi
456
457 mov ebp, esp
458 sub esp, 16*16
459 and esp, -16                  ; vmovdqa spills below need 16-byte alignment
460
461 mov PT, [ebp + 5*4 + 0*4]
462 mov CT, [ebp + 5*4 + 1*4]
463 mov Gctx, [ebp + 5*4 + 2*4]
464
465 mov KS, [16*16 + 3*16 + Gctx]
466 lea KS, [44 + KS]             ; skip the cipher-context header to the round keys (TODO confirm offset)
467
468 mov aluCTR, [16*16 + 2*16 + 3*4 + Gctx]   ; low dword of the big-endian counter
469 bswap aluCTR
470
471
472 vmovdqu TMP0, XMMWORD PTR[0*16 + KS]
473 vpxor TMP0, TMP0, XMMWORD PTR[16*16 + 2*16 + Gctx]  ; counter block pre-xored with the round-0 key
474 vmovdqu XMMWORD PTR[8*16 + 0*16 + esp], TMP0
475
476 cmp len, 16*7
477 jb LEncDataSingles            ; fewer than 7 blocks: single-block path only
478 ; Prepare the "top" counters
479 vmovdqu XMMWORD PTR[8*16 + 1*16 + esp], TMP0
480 vmovdqu XMMWORD PTR[8*16 + 2*16 + esp], TMP0
481 vmovdqu XMMWORD PTR[8*16 + 3*16 + esp], TMP0
482 vmovdqu XMMWORD PTR[8*16 + 4*16 + esp], TMP0
483 vmovdqu XMMWORD PTR[8*16 + 5*16 + esp], TMP0
484 vmovdqu XMMWORD PTR[8*16 + 6*16 + esp], TMP0
485
486 vmovdqu CTR0, XMMWORD PTR[16*16 + 2*16 + Gctx]
487 vpshufb CTR0, CTR0, XMMWORD PTR[Lbswap_mask]  ; to little-endian so vpaddd can increment
488 ; Encrypt the initial 7 blocks
489 sub len, 16*7
490 vpaddd CTR1, CTR0, XMMWORD PTR[Lone]
491 vpaddd CTR2, CTR0, XMMWORD PTR[Ltwo]
492 vpaddd CTR3, CTR2, XMMWORD PTR[Lone]
493 vpaddd CTR4, CTR2, XMMWORD PTR[Ltwo]
494 vpaddd CTR5, CTR4, XMMWORD PTR[Lone]
495 vpaddd CTR6, CTR4, XMMWORD PTR[Ltwo]
496
497 vpshufb CTR0, CTR0, XMMWORD PTR[Lbswap_mask]  ; back to big-endian wire order
498 vpshufb CTR1, CTR1, XMMWORD PTR[Lbswap_mask]
499 vpshufb CTR2, CTR2, XMMWORD PTR[Lbswap_mask]
500 vpshufb CTR3, CTR3, XMMWORD PTR[Lbswap_mask]
501 vpshufb CTR4, CTR4, XMMWORD PTR[Lbswap_mask]
502 vpshufb CTR5, CTR5, XMMWORD PTR[Lbswap_mask]
503 vpshufb CTR6, CTR6, XMMWORD PTR[Lbswap_mask]
504
505 vmovdqu xmm7, XMMWORD PTR[0*16 + KS]
506 vpxor CTR0, CTR0, xmm7
507 vpxor CTR1, CTR1, xmm7
508 vpxor CTR2, CTR2, xmm7
509 vpxor CTR3, CTR3, xmm7
510 vpxor CTR4, CTR4, xmm7
511 vpxor CTR5, CTR5, xmm7
512 vpxor CTR6, CTR6, xmm7
513
514 ROUND 1
515
516 add aluCTR, 7                 ; inlined NEXTCTR for slot 0 (skips past the 7 blocks in flight)
517 mov aluTMP, aluCTR
518 bswap aluTMP
519 xor aluTMP, [KS + 3*4]
520 mov [8*16 + 0*16 + 3*4 + esp], aluTMP
521
522 ROUND 2                       ; AES rounds interleaved with counter refresh to hide latency
523 NEXTCTR 1
524 ROUND 3
525 NEXTCTR 2
526 ROUND 4
527 NEXTCTR 3
528 ROUND 5
529 NEXTCTR 4
530 ROUND 6
531 NEXTCTR 5
532 ROUND 7
533 NEXTCTR 6
534 ROUND 8
535 ROUND 9
536 vmovdqu xmm7, XMMWORD PTR[10*16 + KS]
537 cmp NR, 10                    ; AES-128 ends here; 12/14 rounds for AES-192/256
538 je @f
539
540 ROUND 10
541 ROUND 11
542 vmovdqu xmm7, XMMWORD PTR[12*16 + KS]
543 cmp NR, 12
544 je @f
545
546 ROUND 12
547 ROUND 13
548 vmovdqu xmm7, XMMWORD PTR[14*16 + KS]
549 @@:
550 vaesenclast CTR0, CTR0, xmm7
551 vaesenclast CTR1, CTR1, xmm7
552 vaesenclast CTR2, CTR2, xmm7
553 vaesenclast CTR3, CTR3, xmm7
554 vaesenclast CTR4, CTR4, xmm7
555 vaesenclast CTR5, CTR5, xmm7
556 vaesenclast CTR6, CTR6, xmm7
557
558 vpxor CTR0, CTR0, XMMWORD PTR[0*16 + PT]
559 vpxor CTR1, CTR1, XMMWORD PTR[1*16 + PT]
560 vpxor CTR2, CTR2, XMMWORD PTR[2*16 + PT]
561 vpxor CTR3, CTR3, XMMWORD PTR[3*16 + PT]
562 vpxor CTR4, CTR4, XMMWORD PTR[4*16 + PT]
563 vpxor CTR5, CTR5, XMMWORD PTR[5*16 + PT]
564 vpxor CTR6, CTR6, XMMWORD PTR[6*16 + PT]
565
566 vmovdqu XMMWORD PTR[0*16 + CT], CTR0
567 vmovdqu XMMWORD PTR[1*16 + CT], CTR1
568 vmovdqu XMMWORD PTR[2*16 + CT], CTR2
569 vmovdqu XMMWORD PTR[3*16 + CT], CTR3
570 vmovdqu XMMWORD PTR[4*16 + CT], CTR4
571 vmovdqu XMMWORD PTR[5*16 + CT], CTR5
572 vmovdqu XMMWORD PTR[6*16 + CT], CTR6
573
574 vpshufb CTR0, CTR0, XMMWORD PTR[Lbswap_mask]  ; byte-swap ciphertext for GHASH next pass
575 vpshufb CTR1, CTR1, XMMWORD PTR[Lbswap_mask]
576 vpshufb CTR2, CTR2, XMMWORD PTR[Lbswap_mask]
577 vpshufb CTR3, CTR3, XMMWORD PTR[Lbswap_mask]
578 vpshufb CTR4, CTR4, XMMWORD PTR[Lbswap_mask]
579 vpshufb CTR5, CTR5, XMMWORD PTR[Lbswap_mask]
580 vpshufb TMP5, CTR6, XMMWORD PTR[Lbswap_mask]  ; newest block stays in TMP5 for the first KARATSUBA
581
582 vmovdqa XMMWORD PTR[1*16 + esp], CTR5  ; stash the rest, newest-to-oldest
583 vmovdqa XMMWORD PTR[2*16 + esp], CTR4
584 vmovdqa XMMWORD PTR[3*16 + esp], CTR3
585 vmovdqa XMMWORD PTR[4*16 + esp], CTR2
586 vmovdqa XMMWORD PTR[5*16 + esp], CTR1
587 vmovdqa XMMWORD PTR[6*16 + esp], CTR0
588
589 lea CT, [7*16 + CT]
590 lea PT, [7*16 + PT]
591 jmp LEncData7
592
593 LEncData7:                    ; steady state: hash previous 7 blocks, encrypt next 7
594 cmp len, 16*7
595 jb LEndEnc7
596 sub len, 16*7
597
598 vpshufd TMP4, TMP5, 78        ; open-coded KARATSUBA 0 (initializes the accumulators)
599 vpxor TMP4, TMP4, TMP5
600 vpclmulqdq TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h
601 vmovdqu TMP4, XMMWORD PTR[0*16 + Htbl]
602 vpclmulqdq TMP1, TMP5, TMP4, 011h
603 vpclmulqdq TMP2, TMP5, TMP4, 000h
604
605 vmovdqu TMP5, XMMWORD PTR[1*16 + esp]
606 KARATSUBA 1
607 vmovdqu TMP5, XMMWORD PTR[2*16 + esp]
608 KARATSUBA 2
609 vmovdqu TMP5, XMMWORD PTR[3*16 + esp]
610 KARATSUBA 3
611 vmovdqu TMP5, XMMWORD PTR[4*16 + esp]
612 KARATSUBA 4
613 vmovdqu TMP5, XMMWORD PTR[5*16 + esp]
614 KARATSUBA 5
615 vmovdqu TMP5, XMMWORD PTR[6*16 + esp]
616 vpxor TMP5, TMP5, T           ; oldest block absorbs the running state (pairs with H^7)
617 KARATSUBA 6
618
619 vpxor TMP0, TMP0, TMP1        ; Karatsuba fixup + split into 256-bit halves
620 vpxor TMP0, TMP0, TMP2
621 vpsrldq TMP3, TMP0, 8
622 vpxor TMP4, TMP1, TMP3
623 vpslldq TMP3, TMP0, 8
624 vpxor TMP5, TMP2, TMP3
625
626 vpclmulqdq TMP1, TMP5, XMMWORD PTR[Lpoly], 010h  ; reduction round 1
627 vpalignr TMP5,TMP5,TMP5,8
628 vpxor TMP5, TMP5, TMP1
629
630 vpclmulqdq TMP1, TMP5, XMMWORD PTR[Lpoly], 010h  ; reduction round 2
631 vpalignr TMP5,TMP5,TMP5,8
632 vpxor TMP5, TMP5, TMP1
633
634 vpxor TMP5, TMP5, TMP4
635 vmovdqu T, TMP5               ; updated GHASH state back into Gctx
636
637 vmovdqa CTR0, XMMWORD PTR[8*16 + 0*16 + esp]  ; load the pre-keyed counters
638 vmovdqa CTR1, XMMWORD PTR[8*16 + 1*16 + esp]
639 vmovdqa CTR2, XMMWORD PTR[8*16 + 2*16 + esp]
640 vmovdqa CTR3, XMMWORD PTR[8*16 + 3*16 + esp]
641 vmovdqa CTR4, XMMWORD PTR[8*16 + 4*16 + esp]
642 vmovdqa CTR5, XMMWORD PTR[8*16 + 5*16 + esp]
643 vmovdqa CTR6, XMMWORD PTR[8*16 + 6*16 + esp]
644
645 ROUND 1
646 NEXTCTR 0
647 ROUND 2
648 NEXTCTR 1
649 ROUND 3
650 NEXTCTR 2
651 ROUND 4
652 NEXTCTR 3
653 ROUND 5
654 NEXTCTR 4
655 ROUND 6
656 NEXTCTR 5
657 ROUND 7
658 NEXTCTR 6
659
660 ROUND 8
661 ROUND 9
662
663 vmovdqu xmm7, XMMWORD PTR[10*16 + KS]
664 cmp NR, 10
665 je @f
666
667 ROUND 10
668 ROUND 11
669 vmovdqu xmm7, XMMWORD PTR[12*16 + KS]
670 cmp NR, 12
671 je @f
672
673 ROUND 12
674 ROUND 13
675 vmovdqu xmm7, XMMWORD PTR[14*16 + KS]
676 @@:
677 vaesenclast CTR0, CTR0, xmm7
678 vaesenclast CTR1, CTR1, xmm7
679 vaesenclast CTR2, CTR2, xmm7
680 vaesenclast CTR3, CTR3, xmm7
681 vaesenclast CTR4, CTR4, xmm7
682 vaesenclast CTR5, CTR5, xmm7
683 vaesenclast CTR6, CTR6, xmm7
684
685 vpxor CTR0, CTR0, XMMWORD PTR[0*16 + PT]
686 vpxor CTR1, CTR1, XMMWORD PTR[1*16 + PT]
687 vpxor CTR2, CTR2, XMMWORD PTR[2*16 + PT]
688 vpxor CTR3, CTR3, XMMWORD PTR[3*16 + PT]
689 vpxor CTR4, CTR4, XMMWORD PTR[4*16 + PT]
690 vpxor CTR5, CTR5, XMMWORD PTR[5*16 + PT]
691 vpxor CTR6, CTR6, XMMWORD PTR[6*16 + PT]
692
693 vmovdqu XMMWORD PTR[0*16 + CT], CTR0
694 vmovdqu XMMWORD PTR[1*16 + CT], CTR1
695 vmovdqu XMMWORD PTR[2*16 + CT], CTR2
696 vmovdqu XMMWORD PTR[3*16 + CT], CTR3
697 vmovdqu XMMWORD PTR[4*16 + CT], CTR4
698 vmovdqu XMMWORD PTR[5*16 + CT], CTR5
699 vmovdqu XMMWORD PTR[6*16 + CT], CTR6
700
701 vpshufb CTR0, CTR0, XMMWORD PTR[Lbswap_mask]
702 vpshufb CTR1, CTR1, XMMWORD PTR[Lbswap_mask]
703 vpshufb CTR2, CTR2, XMMWORD PTR[Lbswap_mask]
704 vpshufb CTR3, CTR3, XMMWORD PTR[Lbswap_mask]
705 vpshufb CTR4, CTR4, XMMWORD PTR[Lbswap_mask]
706 vpshufb CTR5, CTR5, XMMWORD PTR[Lbswap_mask]
707 vpshufb TMP5, CTR6, XMMWORD PTR[Lbswap_mask]
708
709 vmovdqa XMMWORD PTR[1*16 + esp], CTR5
710 vmovdqa XMMWORD PTR[2*16 + esp], CTR4
711 vmovdqa XMMWORD PTR[3*16 + esp], CTR3
712 vmovdqa XMMWORD PTR[4*16 + esp], CTR2
713 vmovdqa XMMWORD PTR[5*16 + esp], CTR1
714 vmovdqa XMMWORD PTR[6*16 + esp], CTR0
715
716 lea CT, [7*16 + CT]
717 lea PT, [7*16 + PT]
718 jmp LEncData7
719
720 LEndEnc7:                     ; drain the pipeline: GHASH the final 7-block batch
721
722 vpshufd TMP4, TMP5, 78
723 vpxor TMP4, TMP4, TMP5
724 vpclmulqdq TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h
725 vmovdqu TMP4, XMMWORD PTR[0*16 + Htbl]
726 vpclmulqdq TMP1, TMP5, TMP4, 011h
727 vpclmulqdq TMP2, TMP5, TMP4, 000h
728
729 vmovdqu TMP5, XMMWORD PTR[1*16 + esp]
730 KARATSUBA 1
731 vmovdqu TMP5, XMMWORD PTR[2*16 + esp]
732 KARATSUBA 2
733 vmovdqu TMP5, XMMWORD PTR[3*16 + esp]
734 KARATSUBA 3
735 vmovdqu TMP5, XMMWORD PTR[4*16 + esp]
736 KARATSUBA 4
737 vmovdqu TMP5, XMMWORD PTR[5*16 + esp]
738 KARATSUBA 5
739 vmovdqu TMP5, XMMWORD PTR[6*16 + esp]
740 vpxor TMP5, TMP5, T
741 KARATSUBA 6
742
743 vpxor TMP0, TMP0, TMP1
744 vpxor TMP0, TMP0, TMP2
745 vpsrldq TMP3, TMP0, 8
746 vpxor TMP4, TMP1, TMP3
747 vpslldq TMP3, TMP0, 8
748 vpxor TMP5, TMP2, TMP3
749
750 vpclmulqdq TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
751 vpalignr TMP5,TMP5,TMP5,8
752 vpxor TMP5, TMP5, TMP1
753
754 vpclmulqdq TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
755 vpalignr TMP5,TMP5,TMP5,8
756 vpxor TMP5, TMP5, TMP1
757
758 vpxor TMP5, TMP5, TMP4
759 vmovdqu T, TMP5
760
761 sub aluCTR, 6                 ; counters for slots 1..6 were advanced but never consumed
762
763 LEncDataSingles:              ; one block at a time: encrypt, then GHASH the ciphertext
764
765 cmp len, 16
766 jb LEncDataTail
767 sub len, 16
768
769 vmovdqa TMP1, XMMWORD PTR[8*16 + 0*16 + esp]  ; pre-keyed counter block (round 0 already applied)
770 NEXTCTR 0
771
772 vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
773 vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
774 vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
775 vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
776 vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
777 vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
778 vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
779 vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
780 vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
781 vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
782 cmp NR, 10
783 je @f
784 vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
785 vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
786 vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
787 cmp NR, 12
788 je @f
789 vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
790 vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
791 vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
792 @@:
793 vaesenclast TMP1, TMP1, TMP2
794 vpxor TMP1, TMP1, XMMWORD PTR[PT]
795 vmovdqu XMMWORD PTR[CT], TMP1
796
797 lea PT, [16+PT]
798 lea CT, [16+CT]
799
800 vpshufb TMP1, TMP1, XMMWORD PTR[Lbswap_mask]
801 vpxor TMP1, TMP1, T
802
803 vmovdqu TMP0, XMMWORD PTR[Htbl]
804 GFMUL TMP1, TMP1, TMP0, TMP5, TMP2, TMP3, TMP4   ; single-block GHASH update with H
805 vmovdqu T, TMP1
806
807 jmp LEncDataSingles
808
809 LEncDataTail:                 ; final partial block (len < 16)
810
811 cmp len, 0
812 je LEncDataEnd
813
814 vmovdqa TMP1, XMMWORD PTR[8*16 + 0*16 + esp]
815
816 vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
817 vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
818 vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
819 vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
820 vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
821 vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
822 vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
823 vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
824 vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
825 vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
826 cmp NR, 10
827 je @f
828 vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
829 vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
830 vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
831 cmp NR, 12
832 je @f
833 vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
834 vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
835 vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
836 @@:
837 vaesenclast TMP1, TMP1, TMP2
838 ; zero a temp location
839 vpxor TMP2, TMP2, TMP2
840 vmovdqa XMMWORD PTR[esp], TMP2
841 ; copy as many bytes as needed
842 xor KS, KS                    ; AES is done: reuse esi as the byte index
843 mov aluTMP, edx               ; preserve edx (Gctx) while dl shuttles bytes
844 @@:
845 cmp len, KS
846 je @f
847 mov dl, BYTE PTR[PT + KS]
848 mov BYTE PTR[esp + KS], dl
849 inc KS
850 jmp @b
851 @@:
852 vpxor TMP1, TMP1, XMMWORD PTR[esp]   ; keystream ^ (plaintext padded with zeros)
853 vmovdqa XMMWORD PTR[esp], TMP1
854 xor KS, KS
855 @@:                           ; emit only len ciphertext bytes
856 cmp len, KS
857 je @f
858 mov dl, BYTE PTR[esp + KS]
859 mov BYTE PTR[CT + KS], dl
860 inc KS
861 jmp @b
862 @@:                           ; zero the unused tail so GHASH sees CT || 0..0
863 cmp KS, 16
864 je @f
865 mov BYTE PTR[esp + KS], 0
866 inc KS
867 jmp @b
868 @@:
869 mov edx, aluTMP               ; restore Gctx
870 vmovdqa TMP1, XMMWORD PTR[esp]
871 vpshufb TMP1, TMP1, XMMWORD PTR[Lbswap_mask]
872 vpxor TMP1, TMP1, T
873
874 vmovdqu TMP0, XMMWORD PTR[Htbl]
875 GFMUL TMP1, TMP1, TMP0, TMP5, TMP2, TMP3, TMP4
876 vmovdqu T, TMP1
877
878 LEncDataEnd:
879 inc aluCTR                    ; account for the tail block's counter
880 bswap aluCTR
881 mov [16*16 + 2*16 + 3*4 + Gctx], aluCTR  ; store the updated counter back into Gctx
882
883 mov esp, ebp
884 pop edi
885 pop esi
886 pop ebx
887 pop ebp
888
889
890 vzeroupper
891
892 ret
893 intel_aes_gcmENC ENDP
894
895 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
896 ;
897 ; Decrypt and Authenticate
898 ; void intel_aes_gcmDEC(uint8_t* PT, uint8_t* CT, void *Gctx, unsigned int len);
899 ;
900 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
901
902
903 NEXTCTR MACRO i
; Redefinition for intel_aes_gcmDEC: identical to the ENC version except the
; counter blocks live at [esp + i*16] (DEC's smaller frame has no 8*16 offset).
904 add aluCTR, 1
905 mov aluTMP, aluCTR
906 bswap aluTMP                 ; counters are stored byte-swapped (wire order)
907 xor aluTMP, [3*4 + KS]       ; pre-apply the round-0 key dword
908 mov [3*4 + i*16 + esp], aluTMP
909 ENDM
910
911 intel_aes_gcmDEC PROC
; Counter-mode decrypt + GHASH, 7 blocks per iteration. Reuses the register
; aliases (PT/CT/Gctx/len/KS/NR/...) and the ROUND/KARATSUBA macros declared
; in intel_aes_gcmENC above; NEXTCTR was redefined just before this PROC.
; Unlike ENC there is no crypt-ahead pipeline: GHASH runs on the INPUT
; (ciphertext), so each 7-block group is hashed before it is decrypted.
; NOTE(review): the prototype comment says (PT, CT, ...) but arg0 is loaded
; into CT (ciphertext source) and arg1 into PT (plaintext dest) — confirm
; the actual argument order against the C caller.
; Frame: esp+0*16..6*16 = pre-keyed counter blocks; [esp] doubles as scratch
; for the partial tail block after the counters are consumed.
912
913 cmp DWORD PTR[1*4 + 3*4 + esp], 0    ; len == 0 -> nothing to do
914 jne LbeginDEC
915 ret
916
917 LbeginDEC:
918
919 vzeroupper
920 push ebp
921 push ebx
922 push esi
923 push edi
924
925 mov ebp, esp
926 sub esp, 8*16                 ; smaller frame than ENC: no pending-ciphertext area needed
927 and esp, -16
928
929 mov CT, [ebp + 5*4 + 0*4]     ; arg0 = ciphertext (input)
930 mov PT, [ebp + 5*4 + 1*4]     ; arg1 = plaintext (output)
931 mov Gctx, [ebp + 5*4 + 2*4]
932
933 mov KS, [16*16 + 3*16 + Gctx]
934 lea KS, [44 + KS]             ; same context-header skip as ENC (TODO confirm offset)
935
936 mov aluCTR, [16*16 + 2*16 + 3*4 + Gctx]
937 bswap aluCTR
938
939
940 vmovdqu TMP0, XMMWORD PTR[0*16 + KS]
941 vpxor TMP0, TMP0, XMMWORD PTR[16*16 + 2*16 + Gctx]  ; counter block pre-xored with round-0 key
942 vmovdqu XMMWORD PTR[0*16 + esp], TMP0
943
944 cmp len, 16*7
945 jb LDecDataSingles
946 vmovdqu XMMWORD PTR[1*16 + esp], TMP0
947 vmovdqu XMMWORD PTR[2*16 + esp], TMP0
948 vmovdqu XMMWORD PTR[3*16 + esp], TMP0
949 vmovdqu XMMWORD PTR[4*16 + esp], TMP0
950 vmovdqu XMMWORD PTR[5*16 + esp], TMP0
951 vmovdqu XMMWORD PTR[6*16 + esp], TMP0
952 dec aluCTR                    ; NEXTCTR pre-increments; back off one so slot 0 gets the current value
953
954 LDecData7:
955 cmp len, 16*7
956 jb LDecData7End
957 sub len, 16*7
958
959 vmovdqu TMP5, XMMWORD PTR[0*16 + CT]  ; oldest block pairs with H^7 and absorbs the state
960 vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
961 vpxor TMP5, TMP5, T
962 vpshufd TMP4, TMP5, 78        ; open-coded KARATSUBA 6 (initializes the accumulators)
963 vpxor TMP4, TMP4, TMP5
964 vpclmulqdq TMP0, TMP4, XMMWORD PTR[6*16 + 8*16 + Htbl], 000h
965 vmovdqu TMP4, XMMWORD PTR[6*16 + Htbl]
966 vpclmulqdq TMP1, TMP5, TMP4, 011h
967 vpclmulqdq TMP2, TMP5, TMP4, 000h
968
969 NEXTCTR 0                     ; counter refresh interleaved with the GHASH chain
970 vmovdqu TMP5, XMMWORD PTR[1*16 + CT]
971 vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
972 KARATSUBA 5                   ; Htbl index counts down: block i pairs with H^(7-i)
973 NEXTCTR 1
974 vmovdqu TMP5, XMMWORD PTR[2*16 + CT]
975 vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
976 KARATSUBA 4
977 NEXTCTR 2
978 vmovdqu TMP5, XMMWORD PTR[3*16 + CT]
979 vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
980 KARATSUBA 3
981 NEXTCTR 3
982 vmovdqu TMP5, XMMWORD PTR[4*16 + CT]
983 vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
984 KARATSUBA 2
985 NEXTCTR 4
986 vmovdqu TMP5, XMMWORD PTR[5*16 + CT]
987 vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
988 KARATSUBA 1
989 NEXTCTR 5
990 vmovdqu TMP5, XMMWORD PTR[6*16 + CT]
991 vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
992 KARATSUBA 0
993 NEXTCTR 6
994
995 vpxor TMP0, TMP0, TMP1        ; Karatsuba fixup + split into halves
996 vpxor TMP0, TMP0, TMP2
997 vpsrldq TMP3, TMP0, 8
998 vpxor TMP4, TMP1, TMP3
999 vpslldq TMP3, TMP0, 8
1000 vpxor TMP5, TMP2, TMP3
1001
1002 vpclmulqdq TMP1, TMP5, XMMWORD PTR[Lpoly], 010h  ; reduction round 1
1003 vpalignr TMP5,TMP5,TMP5,8
1004 vpxor TMP5, TMP5, TMP1
1005
1006 vpclmulqdq TMP1, TMP5, XMMWORD PTR[Lpoly], 010h  ; reduction round 2
1007 vpalignr TMP5,TMP5,TMP5,8
1008 vpxor TMP5, TMP5, TMP1
1009
1010 vpxor TMP5, TMP5, TMP4
1011 vmovdqu T, TMP5              ; updated GHASH state into Gctx
1012
1013 vmovdqa CTR0, XMMWORD PTR[0*16 + esp]  ; now decrypt the same 7 blocks
1014 vmovdqa CTR1, XMMWORD PTR[1*16 + esp]
1015 vmovdqa CTR2, XMMWORD PTR[2*16 + esp]
1016 vmovdqa CTR3, XMMWORD PTR[3*16 + esp]
1017 vmovdqa CTR4, XMMWORD PTR[4*16 + esp]
1018 vmovdqa CTR5, XMMWORD PTR[5*16 + esp]
1019 vmovdqa CTR6, XMMWORD PTR[6*16 + esp]
1020
1021 ROUND 1
1022 ROUND 2
1023 ROUND 3
1024 ROUND 4
1025 ROUND 5
1026 ROUND 6
1027 ROUND 7
1028 ROUND 8
1029 ROUND 9
1030 vmovdqu xmm7, XMMWORD PTR[10*16 + KS]
1031 cmp NR, 10                   ; AES-128 ends here; 12/14 for AES-192/256
1032 je @f
1033
1034 ROUND 10
1035 ROUND 11
1036 vmovdqu xmm7, XMMWORD PTR[12*16 + KS]
1037 cmp NR, 12
1038 je @f
1039
1040 ROUND 12
1041 ROUND 13
1042 vmovdqu xmm7, XMMWORD PTR[14*16 + KS]
1043 @@:
1044 vaesenclast CTR0, CTR0, xmm7
1045 vaesenclast CTR1, CTR1, xmm7
1046 vaesenclast CTR2, CTR2, xmm7
1047 vaesenclast CTR3, CTR3, xmm7
1048 vaesenclast CTR4, CTR4, xmm7
1049 vaesenclast CTR5, CTR5, xmm7
1050 vaesenclast CTR6, CTR6, xmm7
1051
1052 vpxor CTR0, CTR0, XMMWORD PTR[0*16 + CT]
1053 vpxor CTR1, CTR1, XMMWORD PTR[1*16 + CT]
1054 vpxor CTR2, CTR2, XMMWORD PTR[2*16 + CT]
1055 vpxor CTR3, CTR3, XMMWORD PTR[3*16 + CT]
1056 vpxor CTR4, CTR4, XMMWORD PTR[4*16 + CT]
1057 vpxor CTR5, CTR5, XMMWORD PTR[5*16 + CT]
1058 vpxor CTR6, CTR6, XMMWORD PTR[6*16 + CT]
1059
1060 vmovdqu XMMWORD PTR[0*16 + PT], CTR0
1061 vmovdqu XMMWORD PTR[1*16 + PT], CTR1
1062 vmovdqu XMMWORD PTR[2*16 + PT], CTR2
1063 vmovdqu XMMWORD PTR[3*16 + PT], CTR3
1064 vmovdqu XMMWORD PTR[4*16 + PT], CTR4
1065 vmovdqu XMMWORD PTR[5*16 + PT], CTR5
1066 vmovdqu XMMWORD PTR[6*16 + PT], CTR6
1067
1068 lea CT, [7*16 + CT]
1069 lea PT, [7*16 + PT]
1070 jmp LDecData7
1071
1072 LDecData7End:
1073
1074 NEXTCTR 0                    ; refresh slot 0 for the single-block path
1075
1076 LDecDataSingles:             ; one block at a time: GHASH the ciphertext, then decrypt it
1077
1078 cmp len, 16
1079 jb LDecDataTail
1080 sub len, 16
1081
1082 vmovdqu TMP1, XMMWORD PTR[CT]
1083 vpshufb TMP1, TMP1, XMMWORD PTR[Lbswap_mask]
1084 vpxor TMP1, TMP1, T
1085
1086 vmovdqu TMP0, XMMWORD PTR[Htbl]
1087 GFMUL TMP1, TMP1, TMP0, TMP5, TMP2, TMP3, TMP4
1088 vmovdqu T, TMP1
1089
1090 vmovdqa TMP1, XMMWORD PTR[0*16 + esp]  ; pre-keyed counter block
1091 NEXTCTR 0
1092
1093 vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
1094 vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
1095 vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
1096 vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
1097 vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
1098 vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
1099 vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
1100 vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
1101 vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
1102 vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
1103 cmp NR, 10
1104 je @f
1105 vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
1106 vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
1107 vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
1108 cmp NR, 12
1109 je @f
1110 vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
1111 vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
1112 vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
1113 @@:
1114 vaesenclast TMP1, TMP1, TMP2
1115 vpxor TMP1, TMP1, XMMWORD PTR[CT]
1116 vmovdqu XMMWORD PTR[PT], TMP1
1117
1118 lea PT, [16+PT]
1119 lea CT, [16+CT]
1120 jmp LDecDataSingles
1121
1122 LDecDataTail:                ; final partial block (len < 16)
1123
1124 cmp len, 0
1125 je LDecDataEnd
1126
1127 vmovdqa TMP1, XMMWORD PTR[0*16 + esp]  ; counter block was already prepared by NEXTCTR
1128 inc aluCTR                   ; keep the saved counter in sync for the final store
1129 vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
1130 vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
1131 vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
1132 vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
1133 vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
1134 vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
1135 vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
1136 vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
1137 vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
1138 vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
1139 cmp NR, 10
1140 je @f
1141 vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
1142 vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
1143 vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
1144 cmp NR, 12
1145 je @f
1146 vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
1147 vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
1148 vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
1149 @@:
1150 vaesenclast xmm7, TMP1, TMP2 ; keystream parked in xmm7 while TMP1 is reused for GHASH
1151
1152 ; copy as many bytes as needed
1153 xor KS, KS                   ; AES is done: reuse esi as the byte index
1154 mov aluTMP, edx              ; preserve edx (Gctx) while dl shuttles bytes
1155 @@:                          ; stage CT bytes into [esp]
1156 cmp len, KS
1157 je @f
1158 mov dl, BYTE PTR[CT + KS]
1159 mov BYTE PTR[esp + KS], dl
1160 inc KS
1161 jmp @b
1162 @@:                          ; zero-pad so GHASH sees CT || 0..0
1163 cmp KS, 16
1164 je @f
1165 mov BYTE PTR[esp + KS], 0
1166 inc KS
1167 jmp @b
1168 @@:
1169 mov edx, aluTMP              ; restore Gctx
1170 vmovdqa TMP1, XMMWORD PTR[esp]
1171 vpshufb TMP1, TMP1, XMMWORD PTR[Lbswap_mask]
1172 vpxor TMP1, TMP1, T
1173
1174 vmovdqu TMP0, XMMWORD PTR[Htbl]
1175 GFMUL TMP1, TMP1, TMP0, TMP5, TMP2, TMP3, TMP4  ; hash the padded ciphertext block
1176 vmovdqu T, TMP1
1177
1178 vpxor xmm7, xmm7, XMMWORD PTR[esp]   ; decrypt: keystream ^ padded ciphertext
1179 vmovdqa XMMWORD PTR[esp], xmm7
1180 xor KS, KS
1181 mov aluTMP, edx
1182 @@:                          ; emit only len plaintext bytes
1183 cmp len, KS
1184 je @f
1185 mov dl, BYTE PTR[esp + KS]
1186 mov BYTE PTR[PT + KS], dl
1187 inc KS
1188 jmp @b
1189 @@:
1190 mov edx, aluTMP
1191
1192 LDecDataEnd:
1193
1194 bswap aluCTR
1195 mov [16*16 + 2*16 + 3*4 + Gctx], aluCTR  ; store the updated counter back into Gctx
1196
1197 mov esp, ebp
1198 pop edi
1199 pop esi
1200 pop ebx
1201 pop ebp
1202
1203 vzeroupper
1204
1205 ret
1206 intel_aes_gcmDEC ENDP
1207
1208
1209 END
This site is hosted by Intevation GmbH (Datenschutzerklärung und Impressum | Privacy Policy and Imprint)