; Provenance (from the Mercurial web view this file was taken from):
;   Repository : trustbridge / nss-cmake-static
;   File       : nss/lib/freebl/intel-gcm-x64-masm.asm @ 2:a945361df361
;   Changeset  : Fix NSS_LIBRARIES variable
;   Author     : Andre Heinecke <andre.heinecke@intevation.de>
;   Date       : Wed, 30 Jul 2014 16:20:44 +0200
;   Parent     : 1e5118fa0cb1
; LICENSE:
; This submission to NSS is to be made available under the terms of the
; Mozilla Public License, v. 2.0. You can obtain one at
; http://mozilla.org/MPL/2.0/.
;###############################################################################
; Copyright(c) 2014, Intel Corp.
; Developers and authors:
; Shay Gueron and Vlad Krasnov
; Intel Corporation, Israel Development Centre, Haifa, Israel
; Please send feedback directly to crypto.feedback.alias@intel.com
;
; Syntax : MASM (ml64), Intel operand order.
; ABI    : Microsoft x64 (arguments in rcx, rdx, r8, r9, then stack above the
;          32-byte shadow space; xmm6-xmm15 are preserved by the routines
;          that use them).
; ISA    : AVX-encoded AES-NI and PCLMULQDQ instructions are used throughout.
;
; NOTE(review): the procedures that adjust rsp / save non-volatile registers
; are declared with plain PROC (no FRAME / .allocstack / .savexmm128 unwind
; directives). Confirm whether unwind data is required by the consumers of
; this object before relying on exception unwinding through these routines.

.DATA
ALIGN 16
Lone            dq 1,0                          ; 128-bit constant 1 (counter increment)
Ltwo            dq 2,0                          ; 128-bit constant 2 (counter increment)
Lbswap_mask     db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0   ; vpshufb mask: reverse 16 bytes
Lshuff_mask     dq 0f0f0f0f0f0f0f0fh, 0f0f0f0f0f0f0f0fh    ; not referenced in this file
Lpoly           dq 01h, 0c200000000000000h      ; reduction constant used with vpclmulqdq

.CODE

;-------------------------------------------------------------------------------
; GFMUL DST, SRC1, SRC2, TMP1, TMP2, TMP3, TMP4
;
; DST = SRC1 * SRC2 as 128-bit carry-less product (three vpclmulqdq, Karatsuba
; style), followed by a two-step reduction using Lpoly.
; Clobbers TMP1..TMP4. DST may alias SRC1 (it is written last).
;-------------------------------------------------------------------------------
GFMUL MACRO DST, SRC1, SRC2, TMP1, TMP2, TMP3, TMP4
    vpclmulqdq  TMP1, SRC2, SRC1, 0h            ; low  x low
    vpclmulqdq  TMP4, SRC2, SRC1, 011h          ; high x high

    vpshufd     TMP2, SRC2, 78                  ; swap 64-bit halves
    vpshufd     TMP3, SRC1, 78
    vpxor       TMP2, TMP2, SRC2                ; (hi ^ lo) in both halves
    vpxor       TMP3, TMP3, SRC1

    vpclmulqdq  TMP2, TMP2, TMP3, 0h            ; middle term
    vpxor       TMP2, TMP2, TMP1
    vpxor       TMP2, TMP2, TMP4

    vpslldq     TMP3, TMP2, 8
    vpsrldq     TMP2, TMP2, 8

    vpxor       TMP1, TMP1, TMP3                ; TMP1 = low 128 bits of product
    vpxor       TMP4, TMP4, TMP2                ; TMP4 = high 128 bits of product

    vpclmulqdq  TMP2, TMP1, [Lpoly], 010h       ; reduction step 1
    vpshufd     TMP3, TMP1, 78
    vpxor       TMP1, TMP2, TMP3

    vpclmulqdq  TMP2, TMP1, [Lpoly], 010h       ; reduction step 2
    vpshufd     TMP3, TMP1, 78
    vpxor       TMP1, TMP2, TMP3

    vpxor       DST, TMP1, TMP4
ENDM

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Generates the final GCM tag
; void intel_aes_gcmTAG(unsigned char Htbl[16*16],
;                       unsigned char *Tp,
;                       unsigned int Mlen,
;                       unsigned int Alen,
;                       unsigned char *X0,
;                       unsigned char *TAG);
;
; In:    rcx = Htbl, rdx = Tp, r8d = Mlen, r9d = Alen,
;        [rsp+40] = X0, [rsp+48] = TAG (stack arguments above the return
;        address and the 32-byte shadow space)
; Out:   16 bytes written to TAG:
;        byteswap(GFMUL(T ^ (Mlen*8 | Alen*8 << 64), Htbl[0])) ^ X0
; Clobb: r8, r9, r10, r11, xmm0-xmm5, flags
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

ALIGN 16
intel_aes_gcmTAG PROC

Htbl    textequ <rcx>
Tp      textequ <rdx>
Mlen    textequ <r8>
Alen    textequ <r9>
X0      textequ <r10>
TAG     textequ <r11>

T       textequ <xmm0>
TMP0    textequ <xmm1>

    ; Mlen/Alen are 32-bit arguments; the upper halves of r8/r9 are not
    ; defined by the calling convention, and the 64-bit shifts below would
    ; otherwise carry that garbage into the length block.
    mov         r8d, r8d
    mov         r9d, r9d

    mov         X0, [rsp + 1*8 + 4*8]
    mov         TAG, [rsp + 1*8 + 5*8]

    vzeroupper
    vmovdqu     T, XMMWORD PTR[Tp]
    vpxor       TMP0, TMP0, TMP0

    shl         Mlen, 3                         ; byte lengths -> bit lengths
    shl         Alen, 3

    ;vpinsrq    TMP0, TMP0, Mlen, 0
    ;vpinsrq    TMP0, TMP0, Alen, 1
    ; workaround the ml64.exe vpinsrq issue
    vpinsrd     TMP0, TMP0, r8d, 0
    vpinsrd     TMP0, TMP0, r9d, 2
    shr         Mlen, 32
    shr         Alen, 32
    vpinsrd     TMP0, TMP0, r8d, 1
    vpinsrd     TMP0, TMP0, r9d, 3

    vpxor       T, T, TMP0
    vmovdqu     TMP0, XMMWORD PTR[Htbl]
    GFMUL       T, T, TMP0, xmm2, xmm3, xmm4, xmm5

    vpshufb     T, T, [Lbswap_mask]
    vpxor       T, T, [X0]
    vmovdqu     XMMWORD PTR[TAG], T
    vzeroupper

    ret

intel_aes_gcmTAG ENDP

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Generates the H table
; void intel_aes_gcmINIT(unsigned char Htbl[16*16], unsigned char *KS, int NR);
;
; In:    rcx = Htbl (256 bytes, written), rdx = KS (round keys, 16 bytes
;        each), r8d = NR (number of rounds)
; Out:   Htbl[0*16 .. 7*16]  : H' and its successive GFMUL products by H'
;        Htbl[8*16 .. 15*16] : for each entry above, its two 64-bit halves
;                              XORed together (Karatsuba helper values)
; Clobb: rdx, r8d, xmm0-xmm5, flags
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

ALIGN 16
intel_aes_gcmINIT PROC

Htbl    textequ <rcx>
KS      textequ <rdx>
NR      textequ <r8d>

T       textequ <xmm0>
TMP0    textequ <xmm1>

    vzeroupper
    ; AES-ENC(0): the all-zero block XOR round key 0 is round key 0 itself
    vmovdqu     T, XMMWORD PTR[KS]
    lea         KS, [16 + KS]
    dec         NR
Lenc_loop:
    vaesenc     T, T, [KS]
    lea         KS, [16 + KS]
    dec         NR
    jnz         Lenc_loop

    vaesenclast T, T, [KS]
    vpshufb     T, T, [Lbswap_mask]

    ;Calculate H` = GFMUL(H, 2)
    vpsrad      xmm3, T, 31
    vpshufd     xmm3, xmm3, 0ffh                ; broadcast sign of top dword
    vpand       xmm5, xmm3, [Lpoly]             ; conditional reduction constant
    vpsrld      xmm3, T, 31
    vpslld      xmm4, T, 1
    vpslldq     xmm3, xmm3, 4                   ; carry bits between dwords
    vpxor       T, xmm4, xmm3
    vpxor       T, T, xmm5

    vmovdqu     TMP0, T
    vmovdqu     XMMWORD PTR[Htbl + 0*16], T

    vpshufd     xmm2, T, 78
    vpxor       xmm2, xmm2, T
    vmovdqu     XMMWORD PTR[Htbl + 8*16 + 0*16], xmm2

    i = 1
    WHILE i LT 8
        GFMUL       T, T, TMP0, xmm2, xmm3, xmm4, xmm5
        vmovdqu     XMMWORD PTR[Htbl + i*16], T
        vpshufd     xmm2, T, 78
        vpxor       xmm2, xmm2, T
        vmovdqu     XMMWORD PTR[Htbl + 8*16 + i*16], xmm2
        i = i+1
    ENDM

    vzeroupper
    ret

intel_aes_gcmINIT ENDP

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Authenticate only
; void intel_aes_gcmAAD(unsigned char Htbl[16*16], unsigned char *AAD, unsigned int Alen, unsigned char *Tp);
;
; In:    rcx = Htbl, rdx = AAD input, r8d = Alen (bytes), r9 = Tp
; Out:   16-byte hash state at Tp updated in place; returns immediately when
;        Alen == 0
; Clobb: rdx, r8, r10, xmm0-xmm5, flags (xmm6/xmm7 are saved and restored)
;
; NOTE(review): the code only handles whole 16-byte blocks; it appears to
; assume Alen is a multiple of 16 -- confirm against the caller.
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

ALIGN 16
intel_aes_gcmAAD PROC

Htbl    textequ <rcx>
inp     textequ <rdx>
len     textequ <r8>
Tp      textequ <r9>
hlp0    textequ <r10>

DATA    textequ <xmm0>
T       textequ <xmm1>
TMP0    textequ <xmm2>
TMP1    textequ <xmm3>
TMP2    textequ <xmm4>
TMP3    textequ <xmm5>
TMP4    textequ <xmm6>
Xhi     textequ <xmm7>

; Accumulate DATA * Htbl[i] into the three partial products
; TMP0 (low), TMP1 (high), TMP2 (middle). Clobbers TMP3.
KARATSUBA_AAD MACRO i
    vpclmulqdq  TMP3, DATA, [Htbl + i*16], 0h
    vpxor       TMP0, TMP0, TMP3
    vpclmulqdq  TMP3, DATA, [Htbl + i*16], 011h
    vpxor       TMP1, TMP1, TMP3
    vpshufd     TMP3, DATA, 78
    vpxor       TMP3, TMP3, DATA
    vpclmulqdq  TMP3, TMP3, [Htbl + 8*16 + i*16], 0h
    vpxor       TMP2, TMP2, TMP3
ENDM

    ; Alen is a 32-bit argument; clear the undefined upper half of r8
    ; before it is used as a 64-bit byte count.
    mov         r8d, r8d

    test        len, len
    jnz         LbeginAAD
    ret

LbeginAAD:
    vzeroupper

    sub         rsp, 2*16
    vmovdqu     XMMWORD PTR[rsp + 0*16], xmm6
    vmovdqu     XMMWORD PTR[rsp + 1*16], xmm7

    vpxor       Xhi, Xhi, Xhi

    vmovdqu     T, XMMWORD PTR[Tp]
    ;we hash 8 block each iteration, if the total amount of blocks is not a multiple of 8, we hash the first n%8 blocks first
    mov         hlp0, len
    and         hlp0, 128-1
    jz          Lmod_loop

    and         len, -128
    sub         hlp0, 16                        ; hlp0 = Htbl byte offset for this block

    ; Prefix block
    vmovdqu     DATA, XMMWORD PTR[inp]
    vpshufb     DATA, DATA, [Lbswap_mask]
    vpxor       DATA, DATA, T

    vpclmulqdq  TMP0, DATA, [Htbl + hlp0], 0h
    vpclmulqdq  TMP1, DATA, [Htbl + hlp0], 011h
    vpshufd     TMP3, DATA, 78
    vpxor       TMP3, TMP3, DATA
    vpclmulqdq  TMP2, TMP3, [Htbl + 8*16 + hlp0], 0h

    lea         inp, [inp+16]
    test        hlp0, hlp0
    jnz         Lpre_loop
    jmp         Lred1

    ;hash remaining prefix blocks (up to 7 total prefix blocks)
Lpre_loop:
    sub         hlp0, 16

    vmovdqu     DATA, XMMWORD PTR[inp]
    vpshufb     DATA, DATA, [Lbswap_mask]

    vpclmulqdq  TMP3, DATA, [Htbl + hlp0], 0h
    vpxor       TMP0, TMP0, TMP3
    vpclmulqdq  TMP3, DATA, [Htbl + hlp0], 011h
    vpxor       TMP1, TMP1, TMP3
    vpshufd     TMP3, DATA, 78
    vpxor       TMP3, TMP3, DATA
    vpclmulqdq  TMP3, TMP3, [Htbl + 8*16 + hlp0], 0h
    vpxor       TMP2, TMP2, TMP3

    test        hlp0, hlp0
    lea         inp, [inp+16]                   ; lea keeps the flags from test
    jnz         Lpre_loop

Lred1:
    ; Karatsuba fixup: fold the middle term into the low/high halves
    vpxor       TMP2, TMP2, TMP0
    vpxor       TMP2, TMP2, TMP1
    vpsrldq     TMP3, TMP2, 8
    vpslldq     TMP2, TMP2, 8

    vpxor       Xhi, TMP1, TMP3
    vpxor       T, TMP0, TMP2

Lmod_loop:
    sub         len, 16*8
    jb          Ldone
    ; Block #0
    vmovdqu     DATA, XMMWORD PTR[inp + 16*7]
    vpshufb     DATA, DATA, [Lbswap_mask]

    vpclmulqdq  TMP0, DATA, [Htbl + 0*16], 0h
    vpclmulqdq  TMP1, DATA, [Htbl + 0*16], 011h
    vpshufd     TMP3, DATA, 78
    vpxor       TMP3, TMP3, DATA
    vpclmulqdq  TMP2, TMP3, [Htbl + 8*16 + 0*16], 0h

    ; Block #1
    vmovdqu     DATA, XMMWORD PTR[inp + 16*6]
    vpshufb     DATA, DATA, [Lbswap_mask]
    KARATSUBA_AAD 1

    ; Block #2
    vmovdqu     DATA, XMMWORD PTR[inp + 16*5]
    vpshufb     DATA, DATA, [Lbswap_mask]

    vpclmulqdq  TMP4, T, [Lpoly], 010h          ;reduction stage 1a
    vpalignr    T, T, T, 8

    KARATSUBA_AAD 2

    vpxor       T, T, TMP4                      ;reduction stage 1b

    ; Block #3
    vmovdqu     DATA, XMMWORD PTR[inp + 16*4]
    vpshufb     DATA, DATA, [Lbswap_mask]
    KARATSUBA_AAD 3

    ; Block #4
    vmovdqu     DATA, XMMWORD PTR[inp + 16*3]
    vpshufb     DATA, DATA, [Lbswap_mask]

    vpclmulqdq  TMP4, T, [Lpoly], 010h          ;reduction stage 2a
    vpalignr    T, T, T, 8

    KARATSUBA_AAD 4

    vpxor       T, T, TMP4                      ;reduction stage 2b

    ; Block #5
    vmovdqu     DATA, XMMWORD PTR[inp + 16*2]
    vpshufb     DATA, DATA, [Lbswap_mask]
    KARATSUBA_AAD 5

    vpxor       T, T, Xhi                       ;reduction finalize

    ; Block #6
    vmovdqu     DATA, XMMWORD PTR[inp + 16*1]
    vpshufb     DATA, DATA, [Lbswap_mask]
    KARATSUBA_AAD 6

    ; Block #7
    vmovdqu     DATA, XMMWORD PTR[inp + 16*0]
    vpshufb     DATA, DATA, [Lbswap_mask]
    vpxor       DATA, DATA, T
    KARATSUBA_AAD 7

    ; Aggregated 8 blocks, now karatsuba fixup
    vpxor       TMP2, TMP2, TMP0
    vpxor       TMP2, TMP2, TMP1
    vpsrldq     TMP3, TMP2, 8
    vpslldq     TMP2, TMP2, 8

    vpxor       Xhi, TMP1, TMP3
    vpxor       T, TMP0, TMP2

    lea         inp, [inp + 16*8]
    jmp         Lmod_loop

Ldone:
    vpclmulqdq  TMP4, T, [Lpoly], 010h
    vpalignr    T, T, T, 8
    vpxor       T, T, TMP4

    vpclmulqdq  TMP4, T, [Lpoly], 010h
    vpalignr    T, T, T, 8
    vpxor       T, T, TMP4

    vpxor       T, T, Xhi
    vmovdqu     XMMWORD PTR[Tp], T
    vzeroupper

    vmovdqu     xmm6, XMMWORD PTR[rsp + 0*16]
    vmovdqu     xmm7, XMMWORD PTR[rsp + 1*16]
    add         rsp, 16*2

    ret

intel_aes_gcmAAD ENDP

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Encrypt and Authenticate
; void intel_aes_gcmENC(unsigned char* PT, unsigned char* CT, void *Gctx, unsigned int len);
;
; In:    rcx = PT (read), rdx = CT (written), r8 = Gctx, r9d = len (bytes)
; Gctx fields accessed here (byte offsets from r8):
;        0            : Htbl (read as Htbl[0..15])
;        16*16 + 1*16 : hash state T (loaded on entry, stored on exit)
;        16*16 + 2*16 : counter block; its last dword is written back on exit
;        16*16 + 3*16 : pointer to the key structure; the round count is read
;                       from offset 4 of it and round keys start at offset 48
; Out:   len bytes written to CT; T and counter updated in Gctx.
;        Returns immediately when len == 0.
; Clobb: rax, rcx, rdx, r9, r10, xmm0-xmm5, flags
;        (r11-r13, rbp and xmm6-xmm15 are saved and restored)
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

ALIGN 16
intel_aes_gcmENC PROC

PT      textequ <rcx>
CT      textequ <rdx>
Htbl    textequ <r8>
Gctx    textequ <r8>
len     textequ <r9>
KS      textequ <r10>
NR      textequ <eax>

aluCTR  textequ <r11d>
aluKSl  textequ <r12d>
aluTMP  textequ <r13d>

T       textequ <xmm0>
TMP0    textequ <xmm1>
TMP1    textequ <xmm2>
TMP2    textequ <xmm3>
TMP3    textequ <xmm4>
TMP4    textequ <xmm5>
TMP5    textequ <xmm6>
CTR0    textequ <xmm7>
CTR1    textequ <xmm8>
CTR2    textequ <xmm9>
CTR3    textequ <xmm10>
CTR4    textequ <xmm11>
CTR5    textequ <xmm12>
CTR6    textequ <xmm13>
CTR7    textequ <xmm14>
BSWAPMASK   textequ <xmm15>

; One AES round (round key i) over the eight counter blocks. Clobbers TMP3.
ROUND MACRO i
    vmovdqu     TMP3, XMMWORD PTR[i*16 + KS]
    vaesenc     CTR0, CTR0, TMP3
    vaesenc     CTR1, CTR1, TMP3
    vaesenc     CTR2, CTR2, TMP3
    vaesenc     CTR3, CTR3, TMP3
    vaesenc     CTR4, CTR4, TMP3
    vaesenc     CTR5, CTR5, TMP3
    vaesenc     CTR6, CTR6, TMP3
    vaesenc     CTR7, CTR7, TMP3
ENDM

; One AES round (round key i) interleaved with accumulating TMP5 * Htbl[i]
; into TMP0 (middle), TMP1 (high), TMP2 (low). Clobbers TMP3, TMP4.
ROUNDMUL MACRO i
    vmovdqu     TMP3, XMMWORD PTR[i*16 + KS]

    vaesenc     CTR0, CTR0, TMP3
    vaesenc     CTR1, CTR1, TMP3
    vaesenc     CTR2, CTR2, TMP3
    vaesenc     CTR3, CTR3, TMP3

    vpshufd     TMP4, TMP5, 78
    vpxor       TMP4, TMP4, TMP5

    vaesenc     CTR4, CTR4, TMP3
    vaesenc     CTR5, CTR5, TMP3
    vaesenc     CTR6, CTR6, TMP3
    vaesenc     CTR7, CTR7, TMP3

    vpclmulqdq  TMP3, TMP4, XMMWORD PTR[i*16 + 8*16 + Htbl], 000h
    vpxor       TMP0, TMP0, TMP3
    vmovdqu     TMP4, XMMWORD PTR[i*16 + Htbl]
    vpclmulqdq  TMP3, TMP5, TMP4, 011h
    vpxor       TMP1, TMP1, TMP3
    vpclmulqdq  TMP3, TMP5, TMP4, 000h
    vpxor       TMP2, TMP2, TMP3
ENDM

; Accumulate TMP5 * Htbl[i] into TMP0 (middle), TMP1 (high), TMP2 (low).
; Clobbers TMP3, TMP4.
KARATSUBA MACRO i
    vpshufd     TMP4, TMP5, 78
    vpxor       TMP4, TMP4, TMP5
    vpclmulqdq  TMP3, TMP4, XMMWORD PTR[i*16 + 8*16 + Htbl], 000h
    vpxor       TMP0, TMP0, TMP3
    vmovdqu     TMP4, XMMWORD PTR[i*16 + Htbl]
    vpclmulqdq  TMP3, TMP5, TMP4, 011h
    vpxor       TMP1, TMP1, TMP3
    vpclmulqdq  TMP3, TMP5, TMP4, 000h
    vpxor       TMP2, TMP2, TMP3
ENDM

; Advance the 32-bit counter and patch the last dword of the pre-whitened
; counter block in stack slot i (counter slots start at rsp + 8*16 here).
NEXTCTR MACRO i
    add         aluCTR, 1
    mov         aluTMP, aluCTR
    xor         aluTMP, aluKSl
    bswap       aluTMP
    mov         [3*4 + 8*16 + i*16 + rsp], aluTMP
ENDM

    ; len is a 32-bit argument; clear the undefined upper half of r9
    ; before it is used as a 64-bit byte count.
    mov         r9d, r9d

    test        len, len
    jnz         LbeginENC
    ret

LbeginENC:
    vzeroupper
    push        r11
    push        r12
    push        r13
    push        rbp
    sub         rsp, 10*16
    vmovdqu     XMMWORD PTR[rsp + 0*16], xmm6
    vmovdqu     XMMWORD PTR[rsp + 1*16], xmm7
    vmovdqu     XMMWORD PTR[rsp + 2*16], xmm8
    vmovdqu     XMMWORD PTR[rsp + 3*16], xmm9
    vmovdqu     XMMWORD PTR[rsp + 4*16], xmm10
    vmovdqu     XMMWORD PTR[rsp + 5*16], xmm11
    vmovdqu     XMMWORD PTR[rsp + 6*16], xmm12
    vmovdqu     XMMWORD PTR[rsp + 7*16], xmm13
    vmovdqu     XMMWORD PTR[rsp + 8*16], xmm14
    vmovdqu     XMMWORD PTR[rsp + 9*16], xmm15

    ; Scratch area: slots 0-7 hold byte-swapped ciphertext awaiting hashing,
    ; slots 8-15 hold the pre-whitened counter blocks. rbp remembers the
    ; unaligned rsp so the epilogue can find the saved registers.
    mov         rbp, rsp
    sub         rsp, 16*16
    and         rsp, -16

    vmovdqu     T, XMMWORD PTR[16*16 + 1*16 + Gctx]
    vmovdqu     CTR0, XMMWORD PTR[16*16 + 2*16 + Gctx]
    vmovdqu     BSWAPMASK, XMMWORD PTR[Lbswap_mask]
    mov         KS, [16*16 + 3*16 + Gctx]
    mov         NR, [4 + KS]
    lea         KS, [48 + KS]

    vpshufb     CTR0, CTR0, BSWAPMASK

    mov         aluCTR, [16*16 + 2*16 + 3*4 + Gctx]
    mov         aluKSl, [3*4 + KS]
    bswap       aluCTR
    bswap       aluKSl

    ; counter block XOR round key 0, kept in counter slot 0
    vmovdqu     TMP0, XMMWORD PTR[0*16 + KS]
    vpxor       TMP0, TMP0, XMMWORD PTR[16*16 + 2*16 + Gctx]
    vmovdqu     XMMWORD PTR[8*16 + 0*16 + rsp], TMP0

    cmp         len, 128
    jb          LEncDataSingles
; Prepare the "top" counters
    vmovdqu     XMMWORD PTR[8*16 + 1*16 + rsp], TMP0
    vmovdqu     XMMWORD PTR[8*16 + 2*16 + rsp], TMP0
    vmovdqu     XMMWORD PTR[8*16 + 3*16 + rsp], TMP0
    vmovdqu     XMMWORD PTR[8*16 + 4*16 + rsp], TMP0
    vmovdqu     XMMWORD PTR[8*16 + 5*16 + rsp], TMP0
    vmovdqu     XMMWORD PTR[8*16 + 6*16 + rsp], TMP0
    vmovdqu     XMMWORD PTR[8*16 + 7*16 + rsp], TMP0

; Encrypt the initial 8 blocks
    sub         len, 128
    vpaddd      CTR1, CTR0, XMMWORD PTR[Lone]
    vpaddd      CTR2, CTR0, XMMWORD PTR[Ltwo]
    vpaddd      CTR3, CTR2, XMMWORD PTR[Lone]
    vpaddd      CTR4, CTR2, XMMWORD PTR[Ltwo]
    vpaddd      CTR5, CTR4, XMMWORD PTR[Lone]
    vpaddd      CTR6, CTR4, XMMWORD PTR[Ltwo]
    vpaddd      CTR7, CTR6, XMMWORD PTR[Lone]

    vpshufb     CTR0, CTR0, BSWAPMASK
    vpshufb     CTR1, CTR1, BSWAPMASK
    vpshufb     CTR2, CTR2, BSWAPMASK
    vpshufb     CTR3, CTR3, BSWAPMASK
    vpshufb     CTR4, CTR4, BSWAPMASK
    vpshufb     CTR5, CTR5, BSWAPMASK
    vpshufb     CTR6, CTR6, BSWAPMASK
    vpshufb     CTR7, CTR7, BSWAPMASK

    vmovdqu     TMP3, XMMWORD PTR[0*16 + KS]
    vpxor       CTR0, CTR0, TMP3
    vpxor       CTR1, CTR1, TMP3
    vpxor       CTR2, CTR2, TMP3
    vpxor       CTR3, CTR3, TMP3
    vpxor       CTR4, CTR4, TMP3
    vpxor       CTR5, CTR5, TMP3
    vpxor       CTR6, CTR6, TMP3
    vpxor       CTR7, CTR7, TMP3

    ROUND       1

    ; counter slot 0 for the next group of eight blocks
    add         aluCTR, 8
    mov         aluTMP, aluCTR
    xor         aluTMP, aluKSl
    bswap       aluTMP
    mov         [8*16 + 0*16 + 3*4 + rsp], aluTMP

    ROUND       2
    NEXTCTR     1
    ROUND       3
    NEXTCTR     2
    ROUND       4
    NEXTCTR     3
    ROUND       5
    NEXTCTR     4
    ROUND       6
    NEXTCTR     5
    ROUND       7
    NEXTCTR     6
    ROUND       8
    NEXTCTR     7
    ROUND       9
    vmovdqu     TMP5, XMMWORD PTR[10*16 + KS]
    cmp         NR, 10
    je          @f

    ROUND       10
    ROUND       11
    vmovdqu     TMP5, XMMWORD PTR[12*16 + KS]
    cmp         NR, 12
    je          @f

    ROUND       12
    ROUND       13
    vmovdqu     TMP5, XMMWORD PTR[14*16 + KS]
@@:
    ; last round key XOR plaintext, so vaesenclast yields the ciphertext
    vpxor       TMP3, TMP5, XMMWORD PTR[0*16 + PT]
    vaesenclast CTR0, CTR0, TMP3
    vpxor       TMP3, TMP5, XMMWORD PTR[1*16 + PT]
    vaesenclast CTR1, CTR1, TMP3
    vpxor       TMP3, TMP5, XMMWORD PTR[2*16 + PT]
    vaesenclast CTR2, CTR2, TMP3
    vpxor       TMP3, TMP5, XMMWORD PTR[3*16 + PT]
    vaesenclast CTR3, CTR3, TMP3
    vpxor       TMP3, TMP5, XMMWORD PTR[4*16 + PT]
    vaesenclast CTR4, CTR4, TMP3
    vpxor       TMP3, TMP5, XMMWORD PTR[5*16 + PT]
    vaesenclast CTR5, CTR5, TMP3
    vpxor       TMP3, TMP5, XMMWORD PTR[6*16 + PT]
    vaesenclast CTR6, CTR6, TMP3
    vpxor       TMP3, TMP5, XMMWORD PTR[7*16 + PT]
    vaesenclast CTR7, CTR7, TMP3

    vmovdqu     XMMWORD PTR[0*16 + CT], CTR0
    vpshufb     CTR0, CTR0, BSWAPMASK
    vmovdqu     XMMWORD PTR[1*16 + CT], CTR1
    vpshufb     CTR1, CTR1, BSWAPMASK
    vmovdqu     XMMWORD PTR[2*16 + CT], CTR2
    vpshufb     CTR2, CTR2, BSWAPMASK
    vmovdqu     XMMWORD PTR[3*16 + CT], CTR3
    vpshufb     CTR3, CTR3, BSWAPMASK
    vmovdqu     XMMWORD PTR[4*16 + CT], CTR4
    vpshufb     CTR4, CTR4, BSWAPMASK
    vmovdqu     XMMWORD PTR[5*16 + CT], CTR5
    vpshufb     CTR5, CTR5, BSWAPMASK
    vmovdqu     XMMWORD PTR[6*16 + CT], CTR6
    vpshufb     CTR6, CTR6, BSWAPMASK
    vmovdqu     XMMWORD PTR[7*16 + CT], CTR7
    vpshufb     TMP5, CTR7, BSWAPMASK

    ; park the byte-swapped ciphertext; it is hashed during the next pass
    vmovdqa     XMMWORD PTR[1*16 + rsp], CTR6
    vmovdqa     XMMWORD PTR[2*16 + rsp], CTR5
    vmovdqa     XMMWORD PTR[3*16 + rsp], CTR4
    vmovdqa     XMMWORD PTR[4*16 + rsp], CTR3
    vmovdqa     XMMWORD PTR[5*16 + rsp], CTR2
    vmovdqa     XMMWORD PTR[6*16 + rsp], CTR1
    vmovdqa     XMMWORD PTR[7*16 + rsp], CTR0

    lea         CT, [8*16 + CT]
    lea         PT, [8*16 + PT]
    ; falls through into the 8-block loop

LEncDataOctets:
    cmp         len, 128
    jb          LEndEncOctets
    sub         len, 128

    vmovdqa     CTR0, XMMWORD PTR[8*16 + 0*16 + rsp]
    vmovdqa     CTR1, XMMWORD PTR[8*16 + 1*16 + rsp]
    vmovdqa     CTR2, XMMWORD PTR[8*16 + 2*16 + rsp]
    vmovdqa     CTR3, XMMWORD PTR[8*16 + 3*16 + rsp]
    vmovdqa     CTR4, XMMWORD PTR[8*16 + 4*16 + rsp]
    vmovdqa     CTR5, XMMWORD PTR[8*16 + 5*16 + rsp]
    vmovdqa     CTR6, XMMWORD PTR[8*16 + 6*16 + rsp]
    vmovdqa     CTR7, XMMWORD PTR[8*16 + 7*16 + rsp]

    ; hash of the previous eight ciphertext blocks, interleaved with the
    ; AES rounds of the current eight counter blocks
    vpshufd     TMP4, TMP5, 78
    vpxor       TMP4, TMP4, TMP5
    vpclmulqdq  TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h
    vmovdqu     TMP4, XMMWORD PTR[0*16 + Htbl]
    vpclmulqdq  TMP1, TMP5, TMP4, 011h
    vpclmulqdq  TMP2, TMP5, TMP4, 000h

    vmovdqu     TMP5, XMMWORD PTR[1*16 + rsp]
    ROUNDMUL    1
    NEXTCTR     0
    vmovdqu     TMP5, XMMWORD PTR[2*16 + rsp]
    ROUNDMUL    2
    NEXTCTR     1
    vmovdqu     TMP5, XMMWORD PTR[3*16 + rsp]
    ROUNDMUL    3
    NEXTCTR     2
    vmovdqu     TMP5, XMMWORD PTR[4*16 + rsp]
    ROUNDMUL    4
    NEXTCTR     3
    vmovdqu     TMP5, XMMWORD PTR[5*16 + rsp]
    ROUNDMUL    5
    NEXTCTR     4
    vmovdqu     TMP5, XMMWORD PTR[6*16 + rsp]
    ROUNDMUL    6
    NEXTCTR     5
    vpxor       TMP5, T, XMMWORD PTR[7*16 + rsp]
    ROUNDMUL    7
    NEXTCTR     6

    ROUND       8
    NEXTCTR     7

    vpxor       TMP0, TMP0, TMP1
    vpxor       TMP0, TMP0, TMP2
    vpsrldq     TMP3, TMP0, 8
    vpxor       TMP4, TMP1, TMP3                ; TMP4 = high half, folded in after the loop body
    vpslldq     TMP3, TMP0, 8
    vpxor       T, TMP2, TMP3

    vpclmulqdq  TMP1, T, XMMWORD PTR[Lpoly], 010h
    vpalignr    T,T,T,8
    vpxor       T, T, TMP1

    ROUND       9

    vpclmulqdq  TMP1, T, XMMWORD PTR[Lpoly], 010h
    vpalignr    T,T,T,8
    vpxor       T, T, TMP1

    vmovdqu     TMP5, XMMWORD PTR[10*16 + KS]
    cmp         NR, 10
    je          @f

    ROUND       10
    ROUND       11
    vmovdqu     TMP5, XMMWORD PTR[12*16 + KS]
    cmp         NR, 12
    je          @f

    ROUND       12
    ROUND       13
    vmovdqu     TMP5, XMMWORD PTR[14*16 + KS]
@@:
    vpxor       TMP3, TMP5, XMMWORD PTR[0*16 + PT]
    vaesenclast CTR0, CTR0, TMP3
    vpxor       TMP3, TMP5, XMMWORD PTR[1*16 + PT]
    vaesenclast CTR1, CTR1, TMP3
    vpxor       TMP3, TMP5, XMMWORD PTR[2*16 + PT]
    vaesenclast CTR2, CTR2, TMP3
    vpxor       TMP3, TMP5, XMMWORD PTR[3*16 + PT]
    vaesenclast CTR3, CTR3, TMP3
    vpxor       TMP3, TMP5, XMMWORD PTR[4*16 + PT]
    vaesenclast CTR4, CTR4, TMP3
    vpxor       TMP3, TMP5, XMMWORD PTR[5*16 + PT]
    vaesenclast CTR5, CTR5, TMP3
    vpxor       TMP3, TMP5, XMMWORD PTR[6*16 + PT]
    vaesenclast CTR6, CTR6, TMP3
    vpxor       TMP3, TMP5, XMMWORD PTR[7*16 + PT]
    vaesenclast CTR7, CTR7, TMP3

    vmovdqu     XMMWORD PTR[0*16 + CT], CTR0
    vpshufb     CTR0, CTR0, BSWAPMASK
    vmovdqu     XMMWORD PTR[1*16 + CT], CTR1
    vpshufb     CTR1, CTR1, BSWAPMASK
    vmovdqu     XMMWORD PTR[2*16 + CT], CTR2
    vpshufb     CTR2, CTR2, BSWAPMASK
    vmovdqu     XMMWORD PTR[3*16 + CT], CTR3
    vpshufb     CTR3, CTR3, BSWAPMASK
    vmovdqu     XMMWORD PTR[4*16 + CT], CTR4
    vpshufb     CTR4, CTR4, BSWAPMASK
    vmovdqu     XMMWORD PTR[5*16 + CT], CTR5
    vpshufb     CTR5, CTR5, BSWAPMASK
    vmovdqu     XMMWORD PTR[6*16 + CT], CTR6
    vpshufb     CTR6, CTR6, BSWAPMASK
    vmovdqu     XMMWORD PTR[7*16 + CT], CTR7
    vpshufb     TMP5, CTR7, BSWAPMASK

    vmovdqa     XMMWORD PTR[1*16 + rsp], CTR6
    vmovdqa     XMMWORD PTR[2*16 + rsp], CTR5
    vmovdqa     XMMWORD PTR[3*16 + rsp], CTR4
    vmovdqa     XMMWORD PTR[4*16 + rsp], CTR3
    vmovdqa     XMMWORD PTR[5*16 + rsp], CTR2
    vmovdqa     XMMWORD PTR[6*16 + rsp], CTR1
    vmovdqa     XMMWORD PTR[7*16 + rsp], CTR0

    vpxor       T, T, TMP4

    lea         CT, [8*16 + CT]
    lea         PT, [8*16 + PT]
    jmp         LEncDataOctets

LEndEncOctets:
    ; hash the last eight ciphertext blocks still parked on the stack
    vpshufd     TMP4, TMP5, 78
    vpxor       TMP4, TMP4, TMP5
    vpclmulqdq  TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h
    vmovdqu     TMP4, XMMWORD PTR[0*16 + Htbl]
    vpclmulqdq  TMP1, TMP5, TMP4, 011h
    vpclmulqdq  TMP2, TMP5, TMP4, 000h

    vmovdqu     TMP5, XMMWORD PTR[1*16 + rsp]
    KARATSUBA   1
    vmovdqu     TMP5, XMMWORD PTR[2*16 + rsp]
    KARATSUBA   2
    vmovdqu     TMP5, XMMWORD PTR[3*16 + rsp]
    KARATSUBA   3
    vmovdqu     TMP5, XMMWORD PTR[4*16 + rsp]
    KARATSUBA   4
    vmovdqu     TMP5, XMMWORD PTR[5*16 + rsp]
    KARATSUBA   5
    vmovdqu     TMP5, XMMWORD PTR[6*16 + rsp]
    KARATSUBA   6
    vpxor       TMP5, T, XMMWORD PTR[7*16 + rsp]
    KARATSUBA   7

    vpxor       TMP0, TMP0, TMP1
    vpxor       TMP0, TMP0, TMP2
    vpsrldq     TMP3, TMP0, 8
    vpxor       TMP4, TMP1, TMP3
    vpslldq     TMP3, TMP0, 8
    vpxor       T, TMP2, TMP3

    vpclmulqdq  TMP1, T, XMMWORD PTR[Lpoly], 010h
    vpalignr    T,T,T,8
    vpxor       T, T, TMP1

    vpclmulqdq  TMP1, T, XMMWORD PTR[Lpoly], 010h
    vpalignr    T,T,T,8
    vpxor       T, T, TMP1

    vpxor       T, T, TMP4

    ; aluCTR tracked counter slot 7; step back so it matches slot 0 again
    sub         aluCTR, 7

LEncDataSingles:
    cmp         len, 16
    jb          LEncDataTail
    sub         len, 16

    vmovdqa     TMP1, XMMWORD PTR[8*16 + 0*16 + rsp]
    NEXTCTR     0

    vaesenc     TMP1, TMP1, XMMWORD PTR[1*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[2*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[3*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[4*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[5*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[6*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[7*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[8*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[9*16 + KS]
    vmovdqu     TMP2, XMMWORD PTR[10*16 + KS]
    cmp         NR, 10
    je          @f
    vaesenc     TMP1, TMP1, XMMWORD PTR[10*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[11*16 + KS]
    vmovdqu     TMP2, XMMWORD PTR[12*16 + KS]
    cmp         NR, 12
    je          @f
    vaesenc     TMP1, TMP1, XMMWORD PTR[12*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[13*16 + KS]
    vmovdqu     TMP2, XMMWORD PTR[14*16 + KS]
@@:
    vaesenclast TMP1, TMP1, TMP2
    vpxor       TMP1, TMP1, XMMWORD PTR[PT]
    vmovdqu     XMMWORD PTR[CT], TMP1

    lea         PT, [16+PT]
    lea         CT, [16+CT]

    vpshufb     TMP1, TMP1, BSWAPMASK
    vpxor       T, T, TMP1
    vmovdqu     TMP0, XMMWORD PTR[Htbl]
    GFMUL       T, T, TMP0, TMP1, TMP2, TMP3, TMP4

    jmp         LEncDataSingles

LEncDataTail:
    test        len, len
    jz          LEncDataEnd

    ; NOTE(review): unlike the matching path in intel_aes_gcmDEC, aluCTR is
    ; not incremented here, so the counter written back to Gctx after a
    ; partial final block differs between the two routines. Left as-is;
    ; confirm which behaviour the caller expects.
    vmovdqa     TMP1, XMMWORD PTR[8*16 + 0*16 + rsp]

    vaesenc     TMP1, TMP1, XMMWORD PTR[1*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[2*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[3*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[4*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[5*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[6*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[7*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[8*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[9*16 + KS]
    vmovdqu     TMP2, XMMWORD PTR[10*16 + KS]
    cmp         NR, 10
    je          @f
    vaesenc     TMP1, TMP1, XMMWORD PTR[10*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[11*16 + KS]
    vmovdqu     TMP2, XMMWORD PTR[12*16 + KS]
    cmp         NR, 12
    je          @f
    vaesenc     TMP1, TMP1, XMMWORD PTR[12*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[13*16 + KS]
    vmovdqu     TMP2, XMMWORD PTR[14*16 + KS]
@@:
    vaesenclast TMP1, TMP1, TMP2
; zero a temp location
    vpxor       TMP2, TMP2, TMP2
    vmovdqa     XMMWORD PTR[rsp], TMP2
; copy as many bytes as needed
; (KS and al/NR are free to reuse: all AES rounds are done)
    xor         KS, KS

@@:
    cmp         len, KS
    je          @f
    mov         al, [PT + KS]
    mov         [rsp + KS], al
    inc         KS
    jmp         @b
@@:
    vpxor       TMP1, TMP1, XMMWORD PTR[rsp]
    vmovdqa     XMMWORD PTR[rsp], TMP1
    xor         KS, KS
@@:
    cmp         len, KS
    je          @f
    mov         al, [rsp + KS]
    mov         [CT + KS], al
    inc         KS
    jmp         @b
@@:
    ; zero-pad the partial ciphertext block to 16 bytes before hashing
    cmp         KS, 16
    je          @f
    mov         BYTE PTR[rsp + KS], 0
    inc         KS
    jmp         @b
@@:
    vmovdqa     TMP1, XMMWORD PTR[rsp]
    vpshufb     TMP1, TMP1, BSWAPMASK
    vpxor       T, T, TMP1
    vmovdqu     TMP0, XMMWORD PTR[Htbl]
    GFMUL       T, T, TMP0, TMP1, TMP2, TMP3, TMP4

LEncDataEnd:
    vmovdqu     XMMWORD PTR[16*16 + 1*16 + Gctx], T
    bswap       aluCTR
    mov         [16*16 + 2*16 + 3*4 + Gctx], aluCTR

    mov         rsp, rbp

    vmovdqu     xmm6, XMMWORD PTR[rsp + 0*16]
    vmovdqu     xmm7, XMMWORD PTR[rsp + 1*16]
    vmovdqu     xmm8, XMMWORD PTR[rsp + 2*16]
    vmovdqu     xmm9, XMMWORD PTR[rsp + 3*16]
    vmovdqu     xmm10, XMMWORD PTR[rsp + 4*16]
    vmovdqu     xmm11, XMMWORD PTR[rsp + 5*16]
    vmovdqu     xmm12, XMMWORD PTR[rsp + 6*16]
    vmovdqu     xmm13, XMMWORD PTR[rsp + 7*16]
    vmovdqu     xmm14, XMMWORD PTR[rsp + 8*16]
    vmovdqu     xmm15, XMMWORD PTR[rsp + 9*16]

    add         rsp, 10*16
    pop         rbp
    pop         r13
    pop         r12
    pop         r11

    vzeroupper

    ret
intel_aes_gcmENC ENDP

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Decrypt and Authenticate
; void intel_aes_gcmDEC(uint8_t* CT, uint8_t* PT, void *Gctx, unsigned int len);
;
; In:    rcx = CT (read), rdx = PT (written), r8 = Gctx, r9d = len (bytes)
;        Gctx is accessed at the same offsets as in intel_aes_gcmENC.
; Out:   len bytes written to PT; T and counter updated in Gctx.
;        Returns immediately when len == 0.
; Clobb: rax, rcx, rdx, r9, r10, xmm0-xmm5, flags
;        (r11-r13, rbp and xmm6-xmm15 are saved and restored)
;
; Reuses the register equates and the ROUND / ROUNDMUL macros defined in
; intel_aes_gcmENC; only PT/CT and NEXTCTR are redefined below.
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

ALIGN 16
intel_aes_gcmDEC PROC

; Same as the ENC variant, but the counter slots start at rsp + 0 here.
NEXTCTR MACRO i
    add         aluCTR, 1
    mov         aluTMP, aluCTR
    xor         aluTMP, aluKSl
    bswap       aluTMP
    mov         [3*4 + i*16 + rsp], aluTMP
ENDM

PT      textequ <rdx>
CT      textequ <rcx>

    ; len is a 32-bit argument; clear the undefined upper half of r9
    ; before it is used as a 64-bit byte count.
    mov         r9d, r9d

    test        len, len
    jnz         LbeginDEC
    ret

LbeginDEC:
    vzeroupper
    push        r11
    push        r12
    push        r13
    push        rbp
    sub         rsp, 10*16
    vmovdqu     XMMWORD PTR[rsp + 0*16], xmm6
    vmovdqu     XMMWORD PTR[rsp + 1*16], xmm7
    vmovdqu     XMMWORD PTR[rsp + 2*16], xmm8
    vmovdqu     XMMWORD PTR[rsp + 3*16], xmm9
    vmovdqu     XMMWORD PTR[rsp + 4*16], xmm10
    vmovdqu     XMMWORD PTR[rsp + 5*16], xmm11
    vmovdqu     XMMWORD PTR[rsp + 6*16], xmm12
    vmovdqu     XMMWORD PTR[rsp + 7*16], xmm13
    vmovdqu     XMMWORD PTR[rsp + 8*16], xmm14
    vmovdqu     XMMWORD PTR[rsp + 9*16], xmm15

    ; Scratch area: eight 16-byte counter slots, 16-byte aligned.
    mov         rbp, rsp
    sub         rsp, 8*16
    and         rsp, -16

    vmovdqu     T, XMMWORD PTR[16*16 + 1*16 + Gctx]
    vmovdqu     CTR0, XMMWORD PTR[16*16 + 2*16 + Gctx]
    vmovdqu     BSWAPMASK, XMMWORD PTR[Lbswap_mask]
    mov         KS, [16*16 + 3*16 + Gctx]
    mov         NR, [4 + KS]
    lea         KS, [48 + KS]

    vpshufb     CTR0, CTR0, BSWAPMASK

    mov         aluCTR, [16*16 + 2*16 + 3*4 + Gctx]
    mov         aluKSl, [3*4 + KS]
    bswap       aluCTR
    bswap       aluKSl

    ; counter block XOR round key 0, kept in counter slot 0
    vmovdqu     TMP0, XMMWORD PTR[0*16 + KS]
    vpxor       TMP0, TMP0, XMMWORD PTR[16*16 + 2*16 + Gctx]
    vmovdqu     XMMWORD PTR[0*16 + rsp], TMP0

    cmp         len, 128
    jb          LDecDataSingles
; Prepare the "top" counters
    vmovdqu     XMMWORD PTR[1*16 + rsp], TMP0
    vmovdqu     XMMWORD PTR[2*16 + rsp], TMP0
    vmovdqu     XMMWORD PTR[3*16 + rsp], TMP0
    vmovdqu     XMMWORD PTR[4*16 + rsp], TMP0
    vmovdqu     XMMWORD PTR[5*16 + rsp], TMP0
    vmovdqu     XMMWORD PTR[6*16 + rsp], TMP0
    vmovdqu     XMMWORD PTR[7*16 + rsp], TMP0

    NEXTCTR     1
    NEXTCTR     2
    NEXTCTR     3
    NEXTCTR     4
    NEXTCTR     5
    NEXTCTR     6
    NEXTCTR     7

LDecDataOctets:
    cmp         len, 128
    jb          LEndDecOctets
    sub         len, 128

    vmovdqa     CTR0, XMMWORD PTR[0*16 + rsp]
    vmovdqa     CTR1, XMMWORD PTR[1*16 + rsp]
    vmovdqa     CTR2, XMMWORD PTR[2*16 + rsp]
    vmovdqa     CTR3, XMMWORD PTR[3*16 + rsp]
    vmovdqa     CTR4, XMMWORD PTR[4*16 + rsp]
    vmovdqa     CTR5, XMMWORD PTR[5*16 + rsp]
    vmovdqa     CTR6, XMMWORD PTR[6*16 + rsp]
    vmovdqa     CTR7, XMMWORD PTR[7*16 + rsp]

    ; hash the eight ciphertext blocks of this pass (last block first),
    ; interleaved with the AES rounds of their counter blocks
    vmovdqu     TMP5, XMMWORD PTR[7*16 + CT]
    vpshufb     TMP5, TMP5, BSWAPMASK
    vpshufd     TMP4, TMP5, 78
    vpxor       TMP4, TMP4, TMP5
    vpclmulqdq  TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h
    vmovdqu     TMP4, XMMWORD PTR[0*16 + Htbl]
    vpclmulqdq  TMP1, TMP5, TMP4, 011h
    vpclmulqdq  TMP2, TMP5, TMP4, 000h

    vmovdqu     TMP5, XMMWORD PTR[6*16 + CT]
    vpshufb     TMP5, TMP5, BSWAPMASK
    ROUNDMUL    1
    NEXTCTR     0
    vmovdqu     TMP5, XMMWORD PTR[5*16 + CT]
    vpshufb     TMP5, TMP5, BSWAPMASK
    ROUNDMUL    2
    NEXTCTR     1
    vmovdqu     TMP5, XMMWORD PTR[4*16 + CT]
    vpshufb     TMP5, TMP5, BSWAPMASK
    ROUNDMUL    3
    NEXTCTR     2
    vmovdqu     TMP5, XMMWORD PTR[3*16 + CT]
    vpshufb     TMP5, TMP5, BSWAPMASK
    ROUNDMUL    4
    NEXTCTR     3
    vmovdqu     TMP5, XMMWORD PTR[2*16 + CT]
    vpshufb     TMP5, TMP5, BSWAPMASK
    ROUNDMUL    5
    NEXTCTR     4
    vmovdqu     TMP5, XMMWORD PTR[1*16 + CT]
    vpshufb     TMP5, TMP5, BSWAPMASK
    ROUNDMUL    6
    NEXTCTR     5
    vmovdqu     TMP5, XMMWORD PTR[0*16 + CT]
    vpshufb     TMP5, TMP5, BSWAPMASK
    vpxor       TMP5, TMP5, T
    ROUNDMUL    7
    NEXTCTR     6

    ROUND       8
    NEXTCTR     7

    vpxor       TMP0, TMP0, TMP1
    vpxor       TMP0, TMP0, TMP2
    vpsrldq     TMP3, TMP0, 8
    vpxor       TMP4, TMP1, TMP3                ; TMP4 = high half, folded in after the loop body
    vpslldq     TMP3, TMP0, 8
    vpxor       T, TMP2, TMP3

    vpclmulqdq  TMP1, T, XMMWORD PTR[Lpoly], 010h
    vpalignr    T,T,T,8
    vpxor       T, T, TMP1

    ROUND       9

    vpclmulqdq  TMP1, T, XMMWORD PTR[Lpoly], 010h
    vpalignr    T,T,T,8
    vpxor       T, T, TMP1

    vmovdqu     TMP5, XMMWORD PTR[10*16 + KS]
    cmp         NR, 10
    je          @f

    ROUND       10
    ROUND       11
    vmovdqu     TMP5, XMMWORD PTR[12*16 + KS]
    cmp         NR, 12
    je          @f

    ROUND       12
    ROUND       13
    vmovdqu     TMP5, XMMWORD PTR[14*16 + KS]
@@:
    ; last round key XOR ciphertext, so vaesenclast yields the plaintext
    vpxor       TMP3, TMP5, XMMWORD PTR[0*16 + CT]
    vaesenclast CTR0, CTR0, TMP3
    vpxor       TMP3, TMP5, XMMWORD PTR[1*16 + CT]
    vaesenclast CTR1, CTR1, TMP3
    vpxor       TMP3, TMP5, XMMWORD PTR[2*16 + CT]
    vaesenclast CTR2, CTR2, TMP3
    vpxor       TMP3, TMP5, XMMWORD PTR[3*16 + CT]
    vaesenclast CTR3, CTR3, TMP3
    vpxor       TMP3, TMP5, XMMWORD PTR[4*16 + CT]
    vaesenclast CTR4, CTR4, TMP3
    vpxor       TMP3, TMP5, XMMWORD PTR[5*16 + CT]
    vaesenclast CTR5, CTR5, TMP3
    vpxor       TMP3, TMP5, XMMWORD PTR[6*16 + CT]
    vaesenclast CTR6, CTR6, TMP3
    vpxor       TMP3, TMP5, XMMWORD PTR[7*16 + CT]
    vaesenclast CTR7, CTR7, TMP3

    vmovdqu     XMMWORD PTR[0*16 + PT], CTR0
    vmovdqu     XMMWORD PTR[1*16 + PT], CTR1
    vmovdqu     XMMWORD PTR[2*16 + PT], CTR2
    vmovdqu     XMMWORD PTR[3*16 + PT], CTR3
    vmovdqu     XMMWORD PTR[4*16 + PT], CTR4
    vmovdqu     XMMWORD PTR[5*16 + PT], CTR5
    vmovdqu     XMMWORD PTR[6*16 + PT], CTR6
    vmovdqu     XMMWORD PTR[7*16 + PT], CTR7

    vpxor       T, T, TMP4

    lea         CT, [8*16 + CT]
    lea         PT, [8*16 + PT]
    jmp         LDecDataOctets

LEndDecOctets:
    ; aluCTR tracked counter slot 7; step back so it matches slot 0 again
    sub         aluCTR, 7

LDecDataSingles:
    cmp         len, 16
    jb          LDecDataTail
    sub         len, 16

    vmovdqa     TMP1, XMMWORD PTR[0*16 + rsp]
    NEXTCTR     0

    vaesenc     TMP1, TMP1, XMMWORD PTR[1*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[2*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[3*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[4*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[5*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[6*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[7*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[8*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[9*16 + KS]
    vmovdqu     TMP2, XMMWORD PTR[10*16 + KS]
    cmp         NR, 10
    je          @f
    vaesenc     TMP1, TMP1, XMMWORD PTR[10*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[11*16 + KS]
    vmovdqu     TMP2, XMMWORD PTR[12*16 + KS]
    cmp         NR, 12
    je          @f
    vaesenc     TMP1, TMP1, XMMWORD PTR[12*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[13*16 + KS]
    vmovdqu     TMP2, XMMWORD PTR[14*16 + KS]
@@:
    vaesenclast TMP1, TMP1, TMP2

    vmovdqu     TMP2, XMMWORD PTR[CT]
    vpxor       TMP1, TMP1, TMP2
    vmovdqu     XMMWORD PTR[PT], TMP1

    lea         PT, [16+PT]
    lea         CT, [16+CT]

    vpshufb     TMP2, TMP2, BSWAPMASK
    vpxor       T, T, TMP2
    vmovdqu     TMP0, XMMWORD PTR[Htbl]
    GFMUL       T, T, TMP0, TMP1, TMP2, TMP3, TMP4

    jmp         LDecDataSingles

LDecDataTail:
    test        len, len
    jz          LDecDataEnd

    vmovdqa     TMP1, XMMWORD PTR[0*16 + rsp]
    inc         aluCTR
    vaesenc     TMP1, TMP1, XMMWORD PTR[1*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[2*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[3*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[4*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[5*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[6*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[7*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[8*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[9*16 + KS]
    vmovdqu     TMP2, XMMWORD PTR[10*16 + KS]
    cmp         NR, 10
    je          @f
    vaesenc     TMP1, TMP1, XMMWORD PTR[10*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[11*16 + KS]
    vmovdqu     TMP2, XMMWORD PTR[12*16 + KS]
    cmp         NR, 12
    je          @f
    vaesenc     TMP1, TMP1, XMMWORD PTR[12*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[13*16 + KS]
    vmovdqu     TMP2, XMMWORD PTR[14*16 + KS]
@@:
    vaesenclast TMP1, TMP1, TMP2
; copy as many bytes as needed
; (KS and al/NR are free to reuse: all AES rounds are done)
    xor         KS, KS
@@:
    cmp         len, KS
    je          @f
    mov         al, [CT + KS]
    mov         [rsp + KS], al
    inc         KS
    jmp         @b
@@:
    ; zero-pad the partial ciphertext block to 16 bytes before hashing
    cmp         KS, 16
    je          @f
    mov         BYTE PTR[rsp + KS], 0
    inc         KS
    jmp         @b
@@:
    vmovdqa     TMP2, XMMWORD PTR[rsp]
    vpshufb     TMP2, TMP2, BSWAPMASK
    vpxor       T, T, TMP2
    vmovdqu     TMP0, XMMWORD PTR[Htbl]
    ; TMP5 is used as scratch here so the keystream in TMP1 survives
    GFMUL       T, T, TMP0, TMP5, TMP2, TMP3, TMP4

    vpxor       TMP1, TMP1, XMMWORD PTR[rsp]
    vmovdqa     XMMWORD PTR[rsp], TMP1
    xor         KS, KS
@@:
    cmp         len, KS
    je          @f
    mov         al, [rsp + KS]
    mov         [PT + KS], al
    inc         KS
    jmp         @b
@@:

LDecDataEnd:
    vmovdqu     XMMWORD PTR[16*16 + 1*16 + Gctx], T
    bswap       aluCTR
    mov         [16*16 + 2*16 + 3*4 + Gctx], aluCTR

    mov         rsp, rbp

    vmovdqu     xmm6, XMMWORD PTR[rsp + 0*16]
    vmovdqu     xmm7, XMMWORD PTR[rsp + 1*16]
    vmovdqu     xmm8, XMMWORD PTR[rsp + 2*16]
    vmovdqu     xmm9, XMMWORD PTR[rsp + 3*16]
    vmovdqu     xmm10, XMMWORD PTR[rsp + 4*16]
    vmovdqu     xmm11, XMMWORD PTR[rsp + 5*16]
    vmovdqu     xmm12, XMMWORD PTR[rsp + 6*16]
    vmovdqu     xmm13, XMMWORD PTR[rsp + 7*16]
    vmovdqu     xmm14, XMMWORD PTR[rsp + 8*16]
    vmovdqu     xmm15, XMMWORD PTR[rsp + 9*16]

    add         rsp, 10*16
    pop         rbp
    pop         r13
    pop         r12
    pop         r11

    vzeroupper

    ret
intel_aes_gcmDEC ENDP

END