andre@0: ; LICENSE:
andre@0: ; This submission to NSS is to be made available under the terms of the
andre@0: ; Mozilla Public License, v. 2.0. You can obtain one at
andre@0: ; http://mozilla.org/MPL/2.0/.
andre@0: ;###############################################################################
andre@0: ; Copyright(c) 2014, Intel Corp.
andre@0: ; Developers and authors:
andre@0: ; Shay Gueron and Vlad Krasnov
andre@0: ; Intel Corporation, Israel Development Centre, Haifa, Israel
andre@0: ; Please send feedback directly to crypto.feedback.alias@intel.com
andre@0: 
andre@0: 
andre@0: .DATA
andre@0: ALIGN 16
andre@0: ; pshufb control word: every output dword takes source bytes 13,14,15,12,
andre@0: ; i.e. the highest dword rotated by one byte and replicated into all lanes.
andre@0: Lmask dd 0c0f0e0dh,0c0f0e0dh,0c0f0e0dh,0c0f0e0dh
andre@0: ; pshufb control word: every output dword takes source bytes 5,6,7,4
andre@0: ; (dword 1 rotated by one byte); used by the 192-bit key expansion.
andre@0: Lmask192 dd 004070605h, 004070605h, 004070605h, 004070605h
andre@0: ; Same byte selection as Lmask; used by the 256-bit key expansion.
andre@0: Lmask256 dd 00c0f0e0dh, 00c0f0e0dh, 00c0f0e0dh, 00c0f0e0dh
andre@0: ; Starting round constant 01h in every dword; the key expansions double it
andre@0: ; with pslld after each use.
andre@0: Lcon1 dd 1,1,1,1
andre@0: ; Round constant 1bh in every dword; loaded by the 128-bit key expansion
andre@0: ; for its last two round keys (1bh, then 36h after one doubling).
andre@0: Lcon2 dd 1bh,1bh,1bh,1bh
andre@0: 
andre@0: .CODE
andre@0: 
andre@0: ; Register aliases used by the ECB, CBC and CTR code generators below.
andre@0: ; ctx and output are used as they arrive in rcx and rdx; input and inputLen
andre@0: ; are loaded from the caller's stack by each generated routine before use.
andre@0: ; NOTE(review): the stack offsets used for those loads match the Microsoft
andre@0: ; x64 calling convention (fifth and sixth arguments) - confirm with callers.
andre@0: ctx     textequ <rcx>
andre@0: output  textequ <rdx>
andre@0: input   textequ <r8>
andre@0: inputLen textequ <r9d>
andre@0: 
andre@0: 
andre@0: ; aes_rnd rkidx
andre@0: ;   One middle encryption round on the eight blocks held in xmm0-xmm7.
andre@0: ;   The round key is fetched from [rkidx*16 + ctx] into xmm8 (clobbered).
andre@0: aes_rnd MACRO rkidx
andre@0:     movdqu  xmm8, [rkidx*16 + ctx]
andre@0:     FOR blk, <xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7>
andre@0:         aesenc  blk, xmm8
andre@0:     ENDM
andre@0:     ENDM
andre@0: 
andre@0: ; aes_last_rnd rkidx
andre@0: ;   Final encryption round on the eight blocks held in xmm0-xmm7.
andre@0: ;   The round key is fetched from [rkidx*16 + ctx] into xmm8 (clobbered).
andre@0: aes_last_rnd MACRO rkidx
andre@0:     movdqu  xmm8, [rkidx*16 + ctx]
andre@0:     FOR blk, <xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7>
andre@0:         aesenclast  blk, xmm8
andre@0:     ENDM
andre@0:     ENDM
andre@0: 
andre@0: ; aes_dec_rnd rkidx
andre@0: ;   One middle decryption round on the eight blocks held in xmm0-xmm7.
andre@0: ;   The round key is fetched from [rkidx*16 + ctx] into xmm8 (clobbered).
andre@0: aes_dec_rnd MACRO rkidx
andre@0:     movdqu  xmm8, [rkidx*16 + ctx]
andre@0:     FOR blk, <xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7>
andre@0:         aesdec  blk, xmm8
andre@0:     ENDM
andre@0:     ENDM
andre@0: 
andre@0: ; aes_dec_last_rnd rkidx
andre@0: ;   Final decryption round on the eight blocks held in xmm0-xmm7.
andre@0: ;   The round key is fetched from [rkidx*16 + ctx] into xmm8 (clobbered).
andre@0: aes_dec_last_rnd MACRO rkidx
andre@0:     movdqu  xmm8, [rkidx*16 + ctx]
andre@0:     FOR blk, <xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7>
andre@0:         aesdeclast  blk, xmm8
andre@0:     ENDM
andre@0:     ENDM
andre@0: 
andre@0: 
andre@0: ; gen_aes_ecb_func enc, rnds
andre@0: ;   Body of one AES-ECB routine; expand it between PROC and ENDP.
andre@0: ;     enc  = 1 selects the encryption instructions, any other value the
andre@0: ;            decryption instructions
andre@0: ;     rnds = number of AES rounds (10, 12 or 14 in this file)
andre@0: ;   On entry: ctx (rcx) = cipher context, output (rdx) = destination,
andre@0: ;             [rsp+40] = input pointer, [rsp+48] = input length in bytes.
andre@0: ;   Round keys are read from ctx+48 onwards. Whole 16-byte blocks are
andre@0: ;   processed eight at a time, then one at a time; a trailing partial block
andre@0: ;   is neither read nor written. Returns 0 in rax.
andre@0: ;   Clobbers rax, rcx, rdx, r8, r9, xmm0-xmm5 and flags; xmm6-xmm8 are
andre@0: ;   saved on the stack and restored before returning.
andre@0: gen_aes_ecb_func MACRO enc, rnds
andre@0: 
andre@0: LOCAL   loop8
andre@0: LOCAL   loop1
andre@0: LOCAL   bail
andre@0: 
andre@0: ; Assembly-time choice of round macros and instructions; emits no code.
andre@0: IF enc eq 1
andre@0:         rnd textequ <aes_rnd>
andre@0:         lastrnd textequ <aes_last_rnd>
andre@0:         aesinst textequ <aesenc>
andre@0:         aeslastinst textequ <aesenclast>
andre@0: ELSE
andre@0:         rnd textequ <aes_dec_rnd>
andre@0:         lastrnd textequ <aes_dec_last_rnd>
andre@0:         aesinst textequ <aesdec>
andre@0:         aeslastinst textequ <aesdeclast>
andre@0: ENDIF
andre@0: 
andre@0:         ; Stack arguments sit above the return address and four 8-byte slots.
andre@0:         ; The 32-bit load of the length zero-extends into r9, so no separate
andre@0:         ; clearing of the register is needed.
andre@0:         mov     input,      [rsp + 1*8 + 8*4]
andre@0:         mov     inputLen,   [rsp + 1*8 + 8*5]
andre@0: 
andre@0:         sub     rsp, 3*16
andre@0: 
andre@0:         movdqu  [rsp + 0*16], xmm6
andre@0:         movdqu  [rsp + 1*16], xmm7
andre@0:         movdqu  [rsp + 2*16], xmm8
andre@0: 
andre@0:         lea     ctx, [48+ctx]               ; ctx now points at round key 0
andre@0: 
andre@0: loop8:
andre@0:         cmp     inputLen, 8*16
andre@0:         jb      loop1                       ; fewer than eight blocks left
andre@0: 
andre@0:         movdqu  xmm0, [0*16 + input]
andre@0:         movdqu  xmm1, [1*16 + input]
andre@0:         movdqu  xmm2, [2*16 + input]
andre@0:         movdqu  xmm3, [3*16 + input]
andre@0:         movdqu  xmm4, [4*16 + input]
andre@0:         movdqu  xmm5, [5*16 + input]
andre@0:         movdqu  xmm6, [6*16 + input]
andre@0:         movdqu  xmm7, [7*16 + input]
andre@0: 
andre@0:         ; Initial whitening with round key 0.
andre@0:         movdqu  xmm8, [0*16 + ctx]
andre@0:         pxor    xmm0, xmm8
andre@0:         pxor    xmm1, xmm8
andre@0:         pxor    xmm2, xmm8
andre@0:         pxor    xmm3, xmm8
andre@0:         pxor    xmm4, xmm8
andre@0:         pxor    xmm5, xmm8
andre@0:         pxor    xmm6, xmm8
andre@0:         pxor    xmm7, xmm8
andre@0: 
andre@0:         ; Rounds 1 .. rnds-1 on all eight blocks, then the final round.
andre@0:         i = 1
andre@0:         WHILE i LT rnds
andre@0:             rnd i
andre@0:             i = i+1
andre@0:             ENDM
andre@0:         lastrnd rnds
andre@0: 
andre@0:         movdqu  [0*16 + output], xmm0
andre@0:         movdqu  [1*16 + output], xmm1
andre@0:         movdqu  [2*16 + output], xmm2
andre@0:         movdqu  [3*16 + output], xmm3
andre@0:         movdqu  [4*16 + output], xmm4
andre@0:         movdqu  [5*16 + output], xmm5
andre@0:         movdqu  [6*16 + output], xmm6
andre@0:         movdqu  [7*16 + output], xmm7
andre@0: 
andre@0:         lea input, [8*16 + input]
andre@0:         lea output, [8*16 + output]
andre@0:         sub inputLen, 8*16
andre@0:         jmp loop8
andre@0: 
andre@0: loop1:
andre@0:         cmp     inputLen, 1*16
andre@0:         jb      bail                        ; no whole block left
andre@0: 
andre@0:         movdqu  xmm0, [input]
andre@0:         movdqu  xmm7, [0*16 + ctx]
andre@0:         pxor    xmm0, xmm7
andre@0: 
andre@0:         i = 1
andre@0:     WHILE i LT rnds
andre@0:             movdqu  xmm7, [i*16 + ctx]
andre@0:             aesinst  xmm0, xmm7
andre@0:             i = i+1
andre@0:         ENDM
andre@0:         movdqu  xmm7, [rnds*16 + ctx]
andre@0:         aeslastinst xmm0, xmm7
andre@0: 
andre@0:         movdqu  [output], xmm0
andre@0: 
andre@0:         lea input, [1*16 + input]
andre@0:         lea output, [1*16 + output]
andre@0:         sub inputLen, 1*16
andre@0:         jmp loop1
andre@0: 
andre@0: bail:
andre@0:         xor rax, rax                        ; return value 0
andre@0: 
andre@0:         movdqu  xmm6, [rsp + 0*16]
andre@0:         movdqu  xmm7, [rsp + 1*16]
andre@0:         movdqu  xmm8, [rsp + 2*16]
andre@0:         add     rsp, 3*16
andre@0:         ret
andre@0: ENDM
andre@0: 
andre@0: ; ECB encryption entry point, 10 rounds (128-bit key schedule).
andre@0: intel_aes_encrypt_ecb_128 PROC
andre@0: gen_aes_ecb_func 1, 10
andre@0: intel_aes_encrypt_ecb_128 ENDP
andre@0: 
andre@0: ; ECB encryption entry point, 12 rounds (192-bit key schedule).
andre@0: intel_aes_encrypt_ecb_192 PROC
andre@0: gen_aes_ecb_func 1, 12
andre@0: intel_aes_encrypt_ecb_192 ENDP
andre@0: 
andre@0: ; ECB encryption entry point, 14 rounds (256-bit key schedule).
andre@0: intel_aes_encrypt_ecb_256 PROC
andre@0: gen_aes_ecb_func 1, 14
andre@0: intel_aes_encrypt_ecb_256 ENDP
andre@0: 
andre@0: ; ECB decryption entry point, 10 rounds (128-bit key schedule).
andre@0: intel_aes_decrypt_ecb_128 PROC
andre@0: gen_aes_ecb_func 0, 10
andre@0: intel_aes_decrypt_ecb_128 ENDP
andre@0: 
andre@0: ; ECB decryption entry point, 12 rounds (192-bit key schedule).
andre@0: intel_aes_decrypt_ecb_192 PROC
andre@0: gen_aes_ecb_func 0, 12
andre@0: intel_aes_decrypt_ecb_192 ENDP
andre@0: 
andre@0: ; ECB decryption entry point, 14 rounds (256-bit key schedule).
andre@0: intel_aes_decrypt_ecb_256 PROC
andre@0: gen_aes_ecb_func 0, 14
andre@0: intel_aes_decrypt_ecb_256 ENDP
andre@0: 
andre@0: 
andre@0: ; Register aliases for the key-schedule routines below.
andre@0: ;   KEY = pointer to the raw cipher key (read only by these routines)
andre@0: ;   KS  = pointer to the round-key array being written
andre@0: ;   ITR = scratch: constant addresses, key words and loop counters
andre@0: KEY textequ <rcx>
andre@0: KS  textequ <rdx>
andre@0: ITR textequ <r8>
andre@0: 
andre@0: ; intel_aes_encrypt_init_128
andre@0: ;   Expands a 128-bit key into 11 encryption round keys (176 bytes).
andre@0: ;   In:    KEY (rcx) = 16-byte cipher key
andre@0: ;          KS  (rdx) = destination for round keys 0..10
andre@0: ;   Out:   round keys stored at the original KS; KS itself is advanced by
andre@0: ;          128 bytes on return
andre@0: ;   Clobb: rdx, r8, xmm0-xmm4, flags
andre@0: ;   Each step forms (rotated, S-boxed last word xor round constant) in every
andre@0: ;   lane of xmm2 using pshufb + aesenclast, then xors it into the running
andre@0: ;   prefix-xor of the previous round key held in xmm1.
andre@0: intel_aes_encrypt_init_128  PROC
andre@0: 
andre@0:     movdqu  xmm1, [KEY]                     ; round key 0 is the raw key
andre@0:     movdqu  [KS], xmm1
andre@0:     movdqa  xmm2, xmm1
andre@0: 
andre@0:     lea ITR, Lcon1
andre@0:     movdqa  xmm0, [ITR]                     ; round constant, starts at 01h
andre@0:     lea ITR, Lmask
andre@0:     movdqa  xmm4, [ITR]                     ; rotate-and-broadcast shuffle
andre@0: 
andre@0:     mov ITR, 8                              ; round keys 1..8: 01h .. 80h
andre@0: 
andre@0: Lenc_128_ks_loop:
andre@0:         lea KS, [16 + KS]
andre@0: 
andre@0:         pshufb  xmm2, xmm4
andre@0:         aesenclast  xmm2, xmm0
andre@0:         pslld   xmm0, 1                     ; double the round constant
andre@0:         movdqa  xmm3, xmm1
andre@0:         pslldq  xmm3, 4
andre@0:         pxor    xmm1, xmm3
andre@0:         pslldq  xmm3, 4
andre@0:         pxor    xmm1, xmm3
andre@0:         pslldq  xmm3, 4
andre@0:         pxor    xmm1, xmm3
andre@0:         pxor    xmm1, xmm2
andre@0:         movdqu  [KS], xmm1
andre@0:         movdqa  xmm2, xmm1
andre@0: 
andre@0:         ; Counter update sits directly before the branch so the loop does
andre@0:         ; not depend on the SIMD instructions leaving the flags untouched.
andre@0:         dec ITR
andre@0:         jnz Lenc_128_ks_loop
andre@0: 
andre@0:     ; Round keys 9 and 10 use constants 1bh and 36h, which do not follow
andre@0:     ; from doubling 80h inside a 32-bit lane.
andre@0:     lea ITR, Lcon2
andre@0:     movdqa  xmm0, [ITR]
andre@0: 
andre@0:     pshufb  xmm2, xmm4
andre@0:     aesenclast  xmm2, xmm0
andre@0:     pslld   xmm0, 1
andre@0:     movdqa  xmm3, xmm1
andre@0:     pslldq  xmm3, 4
andre@0:     pxor    xmm1, xmm3
andre@0:     pslldq  xmm3, 4
andre@0:     pxor    xmm1, xmm3
andre@0:     pslldq  xmm3, 4
andre@0:     pxor    xmm1, xmm3
andre@0:     pxor    xmm1, xmm2
andre@0:     movdqu  [16 + KS], xmm1                 ; round key 9
andre@0:     movdqa  xmm2, xmm1
andre@0: 
andre@0:     pshufb  xmm2, xmm4
andre@0:     aesenclast  xmm2, xmm0
andre@0:     movdqa  xmm3, xmm1
andre@0:     pslldq  xmm3, 4
andre@0:     pxor    xmm1, xmm3
andre@0:     pslldq  xmm3, 4
andre@0:     pxor    xmm1, xmm3
andre@0:     pslldq  xmm3, 4
andre@0:     pxor    xmm1, xmm3
andre@0:     pxor    xmm1, xmm2
andre@0:     movdqu  [32 + KS], xmm1                 ; round key 10
andre@0: 
andre@0:     ret
andre@0: intel_aes_encrypt_init_128  ENDP
andre@0: 
andre@0: 
andre@0: ; intel_aes_decrypt_init_128
andre@0: ;   Builds the 11-entry decryption key schedule for a 128-bit key.
andre@0: ;   In:    KEY (rcx) = cipher key, KS (rdx) = destination round-key array
andre@0: ;   The encryption schedule is generated first, then reversed in place:
andre@0: ;   entries 0 and 10 are exchanged unchanged, every other entry is passed
andre@0: ;   through aesimc and moved to its mirrored slot.
andre@0: ;   Clobb: r8, xmm0-xmm4, flags
andre@0: intel_aes_decrypt_init_128  PROC
andre@0: 
andre@0:     ; The expansion routine advances KS, so both pointers are kept on the
andre@0:     ; stack across the call.
andre@0:     push    KS
andre@0:     push    KEY
andre@0: 
andre@0:     call    intel_aes_encrypt_init_128
andre@0: 
andre@0:     pop     KEY
andre@0:     pop     KS
andre@0: 
andre@0:     ; Outermost pair: plain exchange, no transform.
andre@0:     movdqu  xmm0, [0*16 + KS]
andre@0:     movdqu  xmm1, [10*16 + KS]
andre@0:     movdqu  [10*16 + KS], xmm0
andre@0:     movdqu  [0*16 + KS], xmm1
andre@0: 
andre@0:     ; Inner pairs (1,9) (2,8) (3,7) (4,6): transform and exchange.
andre@0:     FOR rkidx, <1, 2, 3, 4>
andre@0:         movdqu  xmm0, [rkidx*16 + KS]
andre@0:         movdqu  xmm1, [(10-rkidx)*16 + KS]
andre@0: 
andre@0:         aesimc  xmm0, xmm0
andre@0:         aesimc  xmm1, xmm1
andre@0: 
andre@0:         movdqu  [(10-rkidx)*16 + KS], xmm0
andre@0:         movdqu  [rkidx*16 + KS], xmm1
andre@0:     ENDM
andre@0: 
andre@0:     ; Middle entry is its own mirror: transform in place.
andre@0:     movdqu  xmm0, [5*16 + KS]
andre@0:     aesimc  xmm0, xmm0
andre@0:     movdqu  [5*16 + KS], xmm0
andre@0:     ret
andre@0: intel_aes_decrypt_init_128  ENDP
andre@0: 
andre@0: 
andre@0: ; intel_aes_encrypt_init_192
andre@0: ;   Expands a 192-bit key into 13 encryption round keys (208 bytes).
andre@0: ;   In:    KEY (rcx) = 24-byte cipher key
andre@0: ;          KS  (rdx) = destination for round keys 0..12
andre@0: ;   Out:   round keys stored at the original KS; KS itself is advanced by
andre@0: ;          192 bytes on return
andre@0: ;   Clobb: rdx, r8, xmm0-xmm5, flags (xmm6 and xmm7 are saved and restored)
andre@0: ;   Register roles inside the loop:
andre@0: ;     xmm1 = the four most recent full words, xmm3 = the next two words in
andre@0: ;     its low half (the high half is scratch), xmm5 = two words carried over
andre@0: ;     to start the next stored round key, xmm0 = round constant,
andre@0: ;     xmm4 = shuffle mask.
andre@0: ;   Each iteration produces twelve key words, i.e. three stored round keys.
andre@0: intel_aes_encrypt_init_192  PROC
andre@0: 
andre@0:     sub     rsp, 16*2
andre@0:     movdqu  [16*0 + rsp], xmm6
andre@0:     movdqu  [16*1 + rsp], xmm7
andre@0: 
andre@0:     movdqu  xmm1, [KEY]                     ; key words 0..3
andre@0:     mov     ITR, [16 + KEY]                 ; key words 4..5
andre@0:     movd    xmm3, ITR
andre@0: 
andre@0:     movdqu  [KS], xmm1                      ; round key 0
andre@0:     movdqa  xmm5, xmm3
andre@0: 
andre@0:     lea ITR, Lcon1
andre@0:     movdqu  xmm0, [ITR]
andre@0:     lea ITR, Lmask192
andre@0:     movdqu  xmm4, [ITR]
andre@0: 
andre@0:     mov ITR, 4                              ; 4 iterations x 3 round keys
andre@0: 
andre@0: Lenc_192_ks_loop:
andre@0:         ; First half: six new words from the current six.
andre@0:         movdqa  xmm2, xmm3
andre@0:         pshufb  xmm2, xmm4
andre@0:         aesenclast xmm2, xmm0
andre@0:         pslld   xmm0, 1
andre@0: 
andre@0:         movdqa  xmm6, xmm1
andre@0:         movdqa  xmm7, xmm3
andre@0:         pslldq  xmm6, 4
andre@0:         pslldq  xmm7, 4
andre@0:         pxor    xmm1, xmm6
andre@0:         pxor    xmm3, xmm7
andre@0:         pslldq  xmm6, 4
andre@0:         pxor    xmm1, xmm6
andre@0:         pslldq  xmm6, 4
andre@0:         pxor    xmm1, xmm6
andre@0:         pxor    xmm1, xmm2
andre@0:         pshufd  xmm2, xmm1, 0ffh            ; broadcast newest word
andre@0:         pxor    xmm3, xmm2
andre@0: 
andre@0:         ; Repack the 64-bit halves into two 128-bit round keys.
andre@0:         movdqa  xmm6, xmm1
andre@0:         shufpd  xmm5, xmm1, 00h
andre@0:         shufpd  xmm6, xmm3, 01h
andre@0: 
andre@0:         movdqu  [16 + KS], xmm5
andre@0:         movdqu  [32 + KS], xmm6
andre@0: 
andre@0:         ; Second half: six more words; the first four form a round key.
andre@0:         movdqa  xmm2, xmm3
andre@0:         pshufb  xmm2, xmm4
andre@0:         aesenclast  xmm2, xmm0
andre@0:         pslld   xmm0, 1
andre@0: 
andre@0:         movdqa  xmm6, xmm1
andre@0:         movdqa  xmm7, xmm3
andre@0:         pslldq  xmm6, 4
andre@0:         pslldq  xmm7, 4
andre@0:         pxor    xmm1, xmm6
andre@0:         pxor    xmm3, xmm7
andre@0:         pslldq  xmm6, 4
andre@0:         pxor    xmm1, xmm6
andre@0:         pslldq  xmm6, 4
andre@0:         pxor    xmm1, xmm6
andre@0:         pxor    xmm1, xmm2
andre@0:         pshufd  xmm2, xmm1, 0ffh
andre@0:         pxor    xmm3, xmm2
andre@0: 
andre@0:         movdqu  [48 + KS], xmm1
andre@0:         movdqa  xmm5, xmm3                  ; carry two words forward
andre@0: 
andre@0:         lea KS, [48 + KS]
andre@0: 
andre@0:         dec ITR
andre@0:         jnz Lenc_192_ks_loop
andre@0: 
andre@0:     ; Round keys 0..12 are complete here. The two words left in xmm5 belong
andre@0:     ; to a 14th slot that a 12-round cipher never reads, so they are not
andre@0:     ; stored; the schedule occupies exactly 13*16 bytes.
andre@0: 
andre@0:     movdqu  xmm7, [16*1 + rsp]
andre@0:     movdqu  xmm6, [16*0 + rsp]
andre@0:     add rsp, 16*2
andre@0:     ret
andre@0: intel_aes_encrypt_init_192  ENDP
andre@0: 
andre@0: ; intel_aes_decrypt_init_192
andre@0: ;   Builds the 13-entry decryption key schedule for a 192-bit key.
andre@0: ;   In:    KEY (rcx) = cipher key, KS (rdx) = destination round-key array
andre@0: ;   The encryption schedule is generated first, then reversed in place:
andre@0: ;   entries 0 and 12 are exchanged unchanged, every other entry is passed
andre@0: ;   through aesimc and moved to its mirrored slot.
andre@0: intel_aes_decrypt_init_192  PROC
andre@0:     ; The expansion routine advances KS, so both pointers are kept on the
andre@0:     ; stack across the call.
andre@0:     push    KS
andre@0:     push    KEY
andre@0: 
andre@0:     call    intel_aes_encrypt_init_192
andre@0: 
andre@0:     pop     KEY
andre@0:     pop     KS
andre@0: 
andre@0:     ; Outermost pair: plain exchange, no transform.
andre@0:     movdqu  xmm0, [0*16 + KS]
andre@0:     movdqu  xmm1, [12*16 + KS]
andre@0:     movdqu  [12*16 + KS], xmm0
andre@0:     movdqu  [0*16 + KS], xmm1
andre@0: 
andre@0:     ; Inner pairs (1,11) .. (5,7): transform and exchange.
andre@0:     FOR rkidx, <1, 2, 3, 4, 5>
andre@0:         movdqu  xmm0, [rkidx*16 + KS]
andre@0:         movdqu  xmm1, [(12-rkidx)*16 + KS]
andre@0: 
andre@0:         aesimc  xmm0, xmm0
andre@0:         aesimc  xmm1, xmm1
andre@0: 
andre@0:         movdqu  [(12-rkidx)*16 + KS], xmm0
andre@0:         movdqu  [rkidx*16 + KS], xmm1
andre@0:     ENDM
andre@0: 
andre@0:     ; Middle entry is its own mirror: transform in place.
andre@0:     movdqu  xmm0, [6*16 + KS]
andre@0:     aesimc  xmm0, xmm0
andre@0:     movdqu  [6*16 + KS], xmm0
andre@0:     ret
andre@0: intel_aes_decrypt_init_192  ENDP
andre@0: 
andre@0: 
andre@0: ; intel_aes_encrypt_init_256
andre@0: ;   Expands a 256-bit key into 15 encryption round keys (240 bytes).
andre@0: ;   In:    KEY (rcx) = 32-byte cipher key
andre@0: ;          KS  (rdx) = destination for round keys 0..14
andre@0: ;   Out:   round keys stored at the original KS; KS itself is advanced by
andre@0: ;          192 bytes on return
andre@0: ;   Clobb: rdx, r8, xmm0-xmm5, flags (xmm6 and xmm7 are saved and restored)
andre@0: ;   xmm1 and xmm3 hold the two most recent round keys (even and odd slot),
andre@0: ;   xmm0 the round constant, xmm5 the shuffle mask, xmm6 stays all-zero.
andre@0: intel_aes_encrypt_init_256  PROC
andre@0:     sub     rsp, 16*2
andre@0:     movdqu  [16*0 + rsp], xmm6
andre@0:     movdqu  [16*1 + rsp], xmm7              ; xmm7 is not otherwise used here
andre@0: 
andre@0:     movdqu  xmm1, [16*0 + KEY]
andre@0:     movdqu  xmm3, [16*1 + KEY]
andre@0: 
andre@0:     movdqu  [16*0 + KS], xmm1               ; round keys 0 and 1 are the raw key
andre@0:     movdqu  [16*1 + KS], xmm3
andre@0: 
andre@0:     lea ITR, Lcon1
andre@0:     movdqu  xmm0, [ITR]
andre@0:     lea ITR, Lmask256
andre@0:     movdqu  xmm5, [ITR]
andre@0: 
andre@0:     pxor    xmm6, xmm6                      ; zero key for the constant-free step
andre@0: 
andre@0:     mov ITR, 6                              ; 6 iterations x 2 round keys = 2..13
andre@0: 
andre@0: Lenc_256_ks_loop:
andre@0: 
andre@0:         ; Even round key: rotated and substituted last word of xmm3, xored
andre@0:         ; with the round constant, folded into the prefix-xor of xmm1.
andre@0:         movdqa  xmm2, xmm3
andre@0:         pshufb  xmm2, xmm5
andre@0:         aesenclast  xmm2, xmm0
andre@0:         pslld   xmm0, 1                     ; double the round constant
andre@0:         movdqa  xmm4, xmm1
andre@0:         pslldq  xmm4, 4
andre@0:         pxor    xmm1, xmm4
andre@0:         pslldq  xmm4, 4
andre@0:         pxor    xmm1, xmm4
andre@0:         pslldq  xmm4, 4
andre@0:         pxor    xmm1, xmm4
andre@0:         pxor    xmm1, xmm2
andre@0:         movdqu  [16*2 + KS], xmm1
andre@0: 
andre@0:         ; Odd round key: last word of xmm1 broadcast without rotation and run
andre@0:         ; through aesenclast with the zero key, folded into xmm3.
andre@0:         pshufd  xmm2, xmm1, 0ffh
andre@0:         aesenclast  xmm2, xmm6
andre@0:         movdqa  xmm4, xmm3
andre@0:         pslldq  xmm4, 4
andre@0:         pxor    xmm3, xmm4
andre@0:         pslldq  xmm4, 4
andre@0:         pxor    xmm3, xmm4
andre@0:         pslldq  xmm4, 4
andre@0:         pxor    xmm3, xmm4
andre@0:         pxor    xmm3, xmm2
andre@0:         movdqu  [16*3 + KS], xmm3
andre@0: 
andre@0:         lea KS, [32 + KS]
andre@0:         dec ITR
andre@0:         jnz Lenc_256_ks_loop
andre@0: 
andre@0:     ; Round key 14: one more even step with no odd partner.
andre@0:     movdqa  xmm2, xmm3
andre@0:     pshufb  xmm2, xmm5
andre@0:     aesenclast  xmm2, xmm0
andre@0:     movdqa  xmm4, xmm1
andre@0:     pslldq  xmm4, 4
andre@0:     pxor    xmm1, xmm4
andre@0:     pslldq  xmm4, 4
andre@0:     pxor    xmm1, xmm4
andre@0:     pslldq  xmm4, 4
andre@0:     pxor    xmm1, xmm4
andre@0:     pxor    xmm1, xmm2
andre@0:     movdqu  [16*2 + KS], xmm1
andre@0: 
andre@0:     movdqu  xmm7, [16*1 + rsp]
andre@0:     movdqu  xmm6, [16*0 + rsp]
andre@0:     add rsp, 16*2
andre@0:     ret
andre@0: 
andre@0: intel_aes_encrypt_init_256  ENDP
andre@0: 
andre@0: 
andre@0: ; intel_aes_decrypt_init_256
andre@0: ;   Builds the 15-entry decryption key schedule for a 256-bit key.
andre@0: ;   In:    KEY (rcx) = cipher key, KS (rdx) = destination round-key array
andre@0: ;   The encryption schedule is generated first, then reversed in place:
andre@0: ;   entries 0 and 14 are exchanged unchanged, every other entry is passed
andre@0: ;   through aesimc and moved to its mirrored slot.
andre@0: intel_aes_decrypt_init_256  PROC
andre@0:     ; The expansion routine advances KS, so both pointers are kept on the
andre@0:     ; stack across the call.
andre@0:     push    KS
andre@0:     push    KEY
andre@0: 
andre@0:     call    intel_aes_encrypt_init_256
andre@0: 
andre@0:     pop     KEY
andre@0:     pop     KS
andre@0: 
andre@0:     ; Outermost pair: plain exchange, no transform.
andre@0:     movdqu  xmm0, [0*16 + KS]
andre@0:     movdqu  xmm1, [14*16 + KS]
andre@0:     movdqu  [14*16 + KS], xmm0
andre@0:     movdqu  [0*16 + KS], xmm1
andre@0: 
andre@0:     ; Inner pairs (1,13) .. (6,8): transform and exchange.
andre@0:     FOR rkidx, <1, 2, 3, 4, 5, 6>
andre@0:         movdqu  xmm0, [rkidx*16 + KS]
andre@0:         movdqu  xmm1, [(14-rkidx)*16 + KS]
andre@0: 
andre@0:         aesimc  xmm0, xmm0
andre@0:         aesimc  xmm1, xmm1
andre@0: 
andre@0:         movdqu  [(14-rkidx)*16 + KS], xmm0
andre@0:         movdqu  [rkidx*16 + KS], xmm1
andre@0:     ENDM
andre@0: 
andre@0:     ; Middle entry is its own mirror: transform in place.
andre@0:     movdqu  xmm0, [7*16 + KS]
andre@0:     aesimc  xmm0, xmm0
andre@0:     movdqu  [7*16 + KS], xmm0
andre@0:     ret
andre@0: intel_aes_decrypt_init_256  ENDP
andre@0: 
andre@0: 
andre@0: 
andre@0: ; gen_aes_cbc_enc_func rnds
andre@0: ;   Body of one AES-CBC encryption routine; expand it between PROC and ENDP.
andre@0: ;     rnds = number of AES rounds (10, 12 or 14 in this file)
andre@0: ;   On entry: ctx (rcx) = cipher context, output (rdx) = destination,
andre@0: ;             [rsp+40] = input pointer, [rsp+48] = input length in bytes.
andre@0: ;   The chaining value is read from ctx+16 and written back there on exit;
andre@0: ;   round keys are read from ctx+48 onwards. Blocks are processed one at a
andre@0: ;   time; a trailing partial block is ignored.
andre@0: ;   Returns 0 in rax. xmm6-xmm8 are saved on the stack and restored.
andre@0: gen_aes_cbc_enc_func MACRO rnds
andre@0: 
andre@0: LOCAL   loop1
andre@0: LOCAL   bail
andre@0: 
andre@0:         mov     input,      [rsp + 1*8 + 8*4]
andre@0:         mov     inputLen,   [rsp + 1*8 + 8*5]
andre@0: 
andre@0:         sub     rsp, 3*16
andre@0: 
andre@0:         movdqu  [rsp + 0*16], xmm6
andre@0:         movdqu  [rsp + 1*16], xmm7
andre@0:         movdqu  [rsp + 2*16], xmm8
andre@0: 
andre@0:         lea     ctx, [48+ctx]               ; ctx now points at round key 0
andre@0: 
andre@0:         movdqu  xmm0, [-32+ctx]             ; xmm0 = chaining value
andre@0: 
andre@0:         ; Round keys 0..5 stay in registers for the whole call.
andre@0:         movdqu  xmm2, [0*16 + ctx]
andre@0:         movdqu  xmm3, [1*16 + ctx]
andre@0:         movdqu  xmm4, [2*16 + ctx]
andre@0:         movdqu  xmm5, [3*16 + ctx]
andre@0:         movdqu  xmm6, [4*16 + ctx]
andre@0:         movdqu  xmm7, [5*16 + ctx]
andre@0: 
andre@0: loop1:
andre@0:         cmp     inputLen, 1*16
andre@0:         jb      bail
andre@0: 
andre@0:         ; xmm0 = previous ciphertext xor plaintext xor round key 0
andre@0:         movdqu  xmm1, [input]
andre@0:         pxor    xmm1, xmm2
andre@0:         pxor    xmm0, xmm1
andre@0: 
andre@0:         aesenc  xmm0, xmm3
andre@0:         aesenc  xmm0, xmm4
andre@0:         aesenc  xmm0, xmm5
andre@0:         aesenc  xmm0, xmm6
andre@0:         aesenc  xmm0, xmm7
andre@0: 
andre@0:         ; Remaining round keys are fetched from memory each block.
andre@0:         i = 6
andre@0:     WHILE i LT rnds
andre@0:             movdqu  xmm8, [i*16 + ctx]
andre@0:             aesenc  xmm0, xmm8
andre@0:             i = i+1
andre@0:         ENDM
andre@0:         movdqu  xmm8, [rnds*16 + ctx]
andre@0:         aesenclast xmm0, xmm8
andre@0: 
andre@0:         movdqu  [output], xmm0              ; also the next chaining value
andre@0: 
andre@0:         lea input, [1*16 + input]
andre@0:         lea output, [1*16 + output]
andre@0:         sub inputLen, 1*16
andre@0:         jmp loop1
andre@0: 
andre@0: bail:
andre@0:         movdqu  [-32+ctx], xmm0             ; persist chaining value for the next call
andre@0: 
andre@0:         xor rax, rax
andre@0: 
andre@0:         movdqu  xmm6, [rsp + 0*16]
andre@0:         movdqu  xmm7, [rsp + 1*16]
andre@0:         movdqu  xmm8, [rsp + 2*16]
andre@0:         add     rsp, 3*16
andre@0:         ret
andre@0: 
andre@0: ENDM
andre@0: 
andre@0: ; gen_aes_cbc_dec_func rnds
andre@0: ;   Body of one AES-CBC decryption routine; expand it between PROC and ENDP.
andre@0: ;     rnds = number of AES rounds (10, 12 or 14 in this file)
andre@0: ;   On entry: ctx (rcx) = cipher context, output (rdx) = destination,
andre@0: ;             [rsp+40] = input pointer, [rsp+48] = input length in bytes.
andre@0: ;   The chaining value lives at ctx+16 and is updated to the last ciphertext
andre@0: ;   block consumed; round keys are read from ctx+48 onwards. Blocks are
andre@0: ;   processed eight at a time, then one at a time; a trailing partial block
andre@0: ;   is ignored. Every ciphertext block is reloaded or copied before the
andre@0: ;   matching output store, so input and output may be the same buffer.
andre@0: ;   Returns 0 in rax. xmm6-xmm8 are saved on the stack and restored.
andre@0: gen_aes_cbc_dec_func MACRO rnds
andre@0: 
andre@0: LOCAL   loop8
andre@0: LOCAL   loop1
andre@0: LOCAL   dec1
andre@0: LOCAL   bail
andre@0: 
andre@0:         mov     input,      [rsp + 1*8 + 8*4]
andre@0:         mov     inputLen,   [rsp + 1*8 + 8*5]
andre@0: 
andre@0:         sub     rsp, 3*16
andre@0: 
andre@0:         movdqu  [rsp + 0*16], xmm6
andre@0:         movdqu  [rsp + 1*16], xmm7
andre@0:         movdqu  [rsp + 2*16], xmm8
andre@0: 
andre@0:         lea     ctx, [48+ctx]               ; ctx now points at round key 0
andre@0: 
andre@0: loop8:
andre@0:         cmp     inputLen, 8*16
andre@0:         jb      dec1
andre@0: 
andre@0:         movdqu  xmm0, [0*16 + input]
andre@0:         movdqu  xmm1, [1*16 + input]
andre@0:         movdqu  xmm2, [2*16 + input]
andre@0:         movdqu  xmm3, [3*16 + input]
andre@0:         movdqu  xmm4, [4*16 + input]
andre@0:         movdqu  xmm5, [5*16 + input]
andre@0:         movdqu  xmm6, [6*16 + input]
andre@0:         movdqu  xmm7, [7*16 + input]
andre@0: 
andre@0:         ; Initial whitening with round key 0.
andre@0:         movdqu  xmm8, [0*16 + ctx]
andre@0:         pxor    xmm0, xmm8
andre@0:         pxor    xmm1, xmm8
andre@0:         pxor    xmm2, xmm8
andre@0:         pxor    xmm3, xmm8
andre@0:         pxor    xmm4, xmm8
andre@0:         pxor    xmm5, xmm8
andre@0:         pxor    xmm6, xmm8
andre@0:         pxor    xmm7, xmm8
andre@0: 
andre@0:         i = 1
andre@0:         WHILE i LT rnds
andre@0:             aes_dec_rnd i
andre@0:             i = i+1
andre@0:             ENDM
andre@0:         aes_dec_last_rnd rnds
andre@0: 
andre@0:         ; Undo the chaining: block 0 uses the stored chaining value, block k
andre@0:         ; uses ciphertext block k-1, reloaded from the input buffer.
andre@0:         movdqu  xmm8, [-32 + ctx]
andre@0:         pxor    xmm0, xmm8
andre@0:         movdqu  xmm8, [0*16 + input]
andre@0:         pxor    xmm1, xmm8
andre@0:         movdqu  xmm8, [1*16 + input]
andre@0:         pxor    xmm2, xmm8
andre@0:         movdqu  xmm8, [2*16 + input]
andre@0:         pxor    xmm3, xmm8
andre@0:         movdqu  xmm8, [3*16 + input]
andre@0:         pxor    xmm4, xmm8
andre@0:         movdqu  xmm8, [4*16 + input]
andre@0:         pxor    xmm5, xmm8
andre@0:         movdqu  xmm8, [5*16 + input]
andre@0:         pxor    xmm6, xmm8
andre@0:         movdqu  xmm8, [6*16 + input]
andre@0:         pxor    xmm7, xmm8
andre@0:         movdqu  xmm8, [7*16 + input]        ; last ciphertext block = new chaining value
andre@0: 
andre@0:         movdqu  [0*16 + output], xmm0
andre@0:         movdqu  [1*16 + output], xmm1
andre@0:         movdqu  [2*16 + output], xmm2
andre@0:         movdqu  [3*16 + output], xmm3
andre@0:         movdqu  [4*16 + output], xmm4
andre@0:         movdqu  [5*16 + output], xmm5
andre@0:         movdqu  [6*16 + output], xmm6
andre@0:         movdqu  [7*16 + output], xmm7
andre@0:         movdqu  [-32 + ctx], xmm8
andre@0: 
andre@0:         lea input, [8*16 + input]
andre@0:         lea output, [8*16 + output]
andre@0:         sub inputLen, 8*16
andre@0:         jmp loop8
andre@0: dec1:
andre@0: 
andre@0:         movdqu  xmm3, [-32 + ctx]           ; xmm3 = chaining value for the single-block loop
andre@0: 
andre@0: loop1:
andre@0:         cmp     inputLen, 1*16
andre@0:         jb      bail
andre@0: 
andre@0:         movdqu  xmm0, [input]
andre@0:         movdqa  xmm4, xmm0                  ; keep the ciphertext for the next block
andre@0:         movdqu  xmm7, [0*16 + ctx]
andre@0:         pxor    xmm0, xmm7
andre@0: 
andre@0:         i = 1
andre@0:     WHILE i LT rnds
andre@0:             movdqu  xmm7, [i*16 + ctx]
andre@0:             aesdec  xmm0, xmm7
andre@0:             i = i+1
andre@0:         ENDM
andre@0:         movdqu  xmm7, [rnds*16 + ctx]
andre@0:         aesdeclast xmm0, xmm7
andre@0:         pxor    xmm3, xmm0                  ; plaintext = decrypted block xor chaining value
andre@0: 
andre@0:         movdqu  [output], xmm3
andre@0:         movdqa  xmm3, xmm4
andre@0: 
andre@0:         lea input, [1*16 + input]
andre@0:         lea output, [1*16 + output]
andre@0:         sub inputLen, 1*16
andre@0:         jmp loop1
andre@0: 
andre@0: bail:
andre@0:         movdqu  [-32 + ctx], xmm3           ; persist chaining value for the next call
andre@0:         xor rax, rax
andre@0: 
andre@0:         movdqu  xmm6, [rsp + 0*16]
andre@0:         movdqu  xmm7, [rsp + 1*16]
andre@0:         movdqu  xmm8, [rsp + 2*16]
andre@0:         add     rsp, 3*16
andre@0:         ret
andre@0: ENDM
andre@0: 
andre@0: ; CBC encryption entry point, 10 rounds (128-bit key schedule).
andre@0: intel_aes_encrypt_cbc_128 PROC
andre@0: gen_aes_cbc_enc_func  10
andre@0: intel_aes_encrypt_cbc_128 ENDP
andre@0: 
andre@0: ; CBC encryption entry point, 12 rounds (192-bit key schedule).
andre@0: intel_aes_encrypt_cbc_192 PROC
andre@0: gen_aes_cbc_enc_func  12
andre@0: intel_aes_encrypt_cbc_192 ENDP
andre@0: 
andre@0: ; CBC encryption entry point, 14 rounds (256-bit key schedule).
andre@0: intel_aes_encrypt_cbc_256 PROC
andre@0: gen_aes_cbc_enc_func  14
andre@0: intel_aes_encrypt_cbc_256 ENDP
andre@0: 
andre@0: ; CBC decryption entry point, 10 rounds (128-bit key schedule).
andre@0: intel_aes_decrypt_cbc_128 PROC
andre@0: gen_aes_cbc_dec_func  10
andre@0: intel_aes_decrypt_cbc_128 ENDP
andre@0: 
andre@0: ; CBC decryption entry point, 12 rounds (192-bit key schedule).
andre@0: intel_aes_decrypt_cbc_192 PROC
andre@0: gen_aes_cbc_dec_func  12
andre@0: intel_aes_decrypt_cbc_192 ENDP
andre@0: 
andre@0: ; CBC decryption entry point, 14 rounds (256-bit key schedule).
andre@0: intel_aes_decrypt_cbc_256 PROC
andre@0: gen_aes_cbc_dec_func  14
andre@0: intel_aes_decrypt_cbc_256 ENDP
andre@0: 
andre@0: 
andre@0: 
andre@0: ; Register aliases for the CTR generator below.
andre@0: ;   ctrCtx  = copy of the first argument (the CTR-mode context)
andre@0: ;   CTR     = scratch for the byte-swapped, key-masked counter word
andre@0: ;   CTRSave = running counter value in native byte order
andre@0: ctrCtx textequ <r10>
andre@0: CTR textequ <r11d>
andre@0: CTRSave textequ <eax>
andre@0: 
andre@0: ; gen_aes_ctr_func rnds
andre@0: ;   Body of one AES-CTR routine; expand it between PROC and ENDP.
andre@0: ;     rnds = number of AES rounds (10, 12 or 14 in this file)
andre@0: ;   On entry: rcx = CTR context, output (rdx) = destination,
andre@0: ;             [rsp+40] = input pointer, [rsp+48] = input length in bytes.
andre@0: ;   The CTR context supplies a pointer to the cipher context at offset 8 and
andre@0: ;   the 16-byte counter block at offset 16; round keys are read from the
andre@0: ;   cipher context at offset 48 onwards.
andre@0: ;   Eight counter blocks, already xored with round key 0, are kept in a
andre@0: ;   16-byte aligned stack area. Only the last 32-bit word of the counter
andre@0: ;   block is incremented (big-endian); no carry reaches the other 12 bytes.
andre@0: ;   Whole blocks are processed eight at a time, then one at a time; a
andre@0: ;   trailing partial block is ignored. The next unused counter block is
andre@0: ;   written back to the CTR context on exit.
andre@0: ;   Returns 0 in rax. rbp and xmm6-xmm8 are saved and restored.
andre@0: gen_aes_ctr_func MACRO rnds
andre@0: 
andre@0: LOCAL   loop8
andre@0: LOCAL   loop1
andre@0: LOCAL   enc1
andre@0: LOCAL   bail
andre@0: 
andre@0:         mov     input,      [rsp + 8*1 + 4*8]
andre@0:         mov     inputLen,   [rsp + 8*1 + 5*8]
andre@0: 
andre@0:         mov     ctrCtx, ctx
andre@0:         mov     ctx, [8+ctrCtx]             ; ctx = cipher context pointer
andre@0:         lea     ctx, [48+ctx]               ; ctx now points at round key 0
andre@0: 
andre@0:         sub     rsp, 3*16
andre@0:         movdqu  [rsp + 0*16], xmm6
andre@0:         movdqu  [rsp + 1*16], xmm7
andre@0:         movdqu  [rsp + 2*16], xmm8
andre@0: 
andre@0: 
andre@0:         ; rbp remembers the unaligned stack pointer; rsp is then aligned so
andre@0:         ; the counter area can be written with movdqa.
andre@0:         push    rbp
andre@0:         mov     rbp, rsp
andre@0:         sub     rsp, 8*16
andre@0:         and     rsp, -16
andre@0: 
andre@0: 
andre@0:         movdqu  xmm0, [16+ctrCtx]
andre@0:         mov     CTRSave, DWORD PTR [ctrCtx + 16 + 3*4]
andre@0:         bswap   CTRSave                     ; counter word in native order
andre@0:         movdqu  xmm1, [ctx + 0*16]
andre@0: 
andre@0:         pxor    xmm0, xmm1                  ; counter block xor round key 0
andre@0: 
andre@0:         ; Seed all eight slots with the current block; slots 1..7 then get
andre@0:         ; their own counter word patched in below.
andre@0:         movdqa  [rsp + 0*16], xmm0
andre@0:         movdqa  [rsp + 1*16], xmm0
andre@0:         movdqa  [rsp + 2*16], xmm0
andre@0:         movdqa  [rsp + 3*16], xmm0
andre@0:         movdqa  [rsp + 4*16], xmm0
andre@0:         movdqa  [rsp + 5*16], xmm0
andre@0:         movdqa  [rsp + 6*16], xmm0
andre@0:         movdqa  [rsp + 7*16], xmm0
andre@0: 
andre@0:         ; Each group: next counter, back to big-endian, xor with the last
andre@0:         ; word of round key 0, store into the last word of the slot.
andre@0:         inc     CTRSave
andre@0:         mov     CTR, CTRSave
andre@0:         bswap   CTR
andre@0:         xor     CTR, DWORD PTR [ctx + 3*4]
andre@0:         mov     DWORD PTR [rsp + 1*16 + 3*4], CTR
andre@0: 
andre@0:         inc     CTRSave
andre@0:         mov     CTR, CTRSave
andre@0:         bswap   CTR
andre@0:         xor     CTR, DWORD PTR [ctx + 3*4]
andre@0:         mov     DWORD PTR [rsp + 2*16 + 3*4], CTR
andre@0: 
andre@0:         inc     CTRSave
andre@0:         mov     CTR, CTRSave
andre@0:         bswap   CTR
andre@0:         xor     CTR, DWORD PTR [ctx + 3*4]
andre@0:         mov     DWORD PTR [rsp + 3*16 + 3*4], CTR
andre@0: 
andre@0:         inc     CTRSave
andre@0:         mov     CTR, CTRSave
andre@0:         bswap   CTR
andre@0:         xor     CTR, DWORD PTR [ctx + 3*4]
andre@0:         mov     DWORD PTR [rsp + 4*16 + 3*4], CTR
andre@0: 
andre@0:         inc     CTRSave
andre@0:         mov     CTR, CTRSave
andre@0:         bswap   CTR
andre@0:         xor     CTR, DWORD PTR [ctx + 3*4]
andre@0:         mov     DWORD PTR [rsp + 5*16 + 3*4], CTR
andre@0: 
andre@0:         inc     CTRSave
andre@0:         mov     CTR, CTRSave
andre@0:         bswap   CTR
andre@0:         xor     CTR, DWORD PTR [ctx + 3*4]
andre@0:         mov     DWORD PTR [rsp + 6*16 + 3*4], CTR
andre@0: 
andre@0:         inc     CTRSave
andre@0:         mov     CTR, CTRSave
andre@0:         bswap   CTR
andre@0:         xor     CTR, DWORD PTR [ctx + 3*4]
andre@0:         mov     DWORD PTR [rsp + 7*16 + 3*4], CTR
andre@0: 
andre@0: 
andre@0: loop8:
andre@0:         cmp     inputLen, 8*16
andre@0:         jb      loop1
andre@0: 
andre@0:         movdqu  xmm0, [0*16 + rsp]
andre@0:         movdqu  xmm1, [1*16 + rsp]
andre@0:         movdqu  xmm2, [2*16 + rsp]
andre@0:         movdqu  xmm3, [3*16 + rsp]
andre@0:         movdqu  xmm4, [4*16 + rsp]
andre@0:         movdqu  xmm5, [5*16 + rsp]
andre@0:         movdqu  xmm6, [6*16 + rsp]
andre@0:         movdqu  xmm7, [7*16 + rsp]
andre@0: 
andre@0:         ; Rounds 1..8 are interleaved with refreshing slot i-1 with the
andre@0:         ; counter value for the following batch of eight blocks.
andre@0:         i = 1
andre@0:         WHILE i LE 8
andre@0:             aes_rnd i
andre@0: 
andre@0:             inc     CTRSave
andre@0:             mov     CTR, CTRSave
andre@0:             bswap   CTR
andre@0:             xor     CTR, DWORD PTR [ctx + 3*4]
andre@0:             mov     DWORD PTR [rsp + (i-1)*16 + 3*4], CTR
andre@0: 
andre@0:             i = i+1
andre@0:         ENDM
andre@0:         ; Remaining middle rounds continue from i = 9.
andre@0:         WHILE i LT rnds
andre@0:             aes_rnd i
andre@0:             i = i+1
andre@0:             ENDM
andre@0:         aes_last_rnd rnds
andre@0: 
andre@0:         ; Keystream xor input.
andre@0:         movdqu  xmm8, [0*16 + input]
andre@0:         pxor    xmm0, xmm8
andre@0:         movdqu  xmm8, [1*16 + input]
andre@0:         pxor    xmm1, xmm8
andre@0:         movdqu  xmm8, [2*16 + input]
andre@0:         pxor    xmm2, xmm8
andre@0:         movdqu  xmm8, [3*16 + input]
andre@0:         pxor    xmm3, xmm8
andre@0:         movdqu  xmm8, [4*16 + input]
andre@0:         pxor    xmm4, xmm8
andre@0:         movdqu  xmm8, [5*16 + input]
andre@0:         pxor    xmm5, xmm8
andre@0:         movdqu  xmm8, [6*16 + input]
andre@0:         pxor    xmm6, xmm8
andre@0:         movdqu  xmm8, [7*16 + input]
andre@0:         pxor    xmm7, xmm8
andre@0: 
andre@0:         movdqu  [0*16 + output], xmm0
andre@0:         movdqu  [1*16 + output], xmm1
andre@0:         movdqu  [2*16 + output], xmm2
andre@0:         movdqu  [3*16 + output], xmm3
andre@0:         movdqu  [4*16 + output], xmm4
andre@0:         movdqu  [5*16 + output], xmm5
andre@0:         movdqu  [6*16 + output], xmm6
andre@0:         movdqu  [7*16 + output], xmm7
andre@0: 
andre@0:         lea input, [8*16 + input]
andre@0:         lea output, [8*16 + output]
andre@0:         sub inputLen, 8*16
andre@0:         jmp loop8
andre@0: 
andre@0: 
andre@0: loop1:
andre@0:         cmp     inputLen, 1*16
andre@0:         jb      bail
andre@0: 
andre@0:         ; rsp doubles as a cursor over the prepared counter slots: each
andre@0:         ; single block consumes one slot. At most seven blocks reach this
andre@0:         ; loop, so the cursor stays inside the eight-slot area.
andre@0:         movdqu  xmm0, [rsp]
andre@0:         add     rsp, 16
andre@0: 
andre@0:         i = 1
andre@0:     WHILE i LT rnds
andre@0:             movdqu  xmm7, [i*16 + ctx]
andre@0:             aesenc  xmm0, xmm7
andre@0:             i = i+1
andre@0:         ENDM
andre@0:         movdqu  xmm7, [rnds*16 + ctx]
andre@0:         aesenclast xmm0, xmm7
andre@0: 
andre@0:         movdqu  xmm7, [input]
andre@0:         pxor    xmm0, xmm7
andre@0:         movdqu  [output], xmm0
andre@0: 
andre@0:         lea input, [1*16 + input]
andre@0:         lea output, [1*16 + output]
andre@0:         sub inputLen, 1*16
andre@0:         jmp loop1
andre@0: 
andre@0: bail:
andre@0: 
andre@0:         ; The slot at rsp is the first unused counter block; strip round
andre@0:         ; key 0 again and store it as the new counter.
andre@0:         movdqu  xmm0, [rsp]
andre@0:         movdqu  xmm1, [ctx + 0*16]
andre@0:         pxor    xmm0, xmm1
andre@0:         movdqu  [16+ctrCtx], xmm0
andre@0: 
andre@0: 
andre@0:         xor     rax, rax
andre@0:         mov     rsp, rbp                    ; drop the aligned counter area
andre@0:         pop     rbp
andre@0: 
andre@0:         movdqu  xmm6, [rsp + 0*16]
andre@0:         movdqu  xmm7, [rsp + 1*16]
andre@0:         movdqu  xmm8, [rsp + 2*16]
andre@0:         add     rsp, 3*16
andre@0: 
andre@0:         ret
andre@0: ENDM
andre@0: 
andre@0: 
andre@0: ; CTR entry point, 10 rounds (128-bit key schedule).
andre@0: intel_aes_encrypt_ctr_128 PROC
andre@0: gen_aes_ctr_func  10
andre@0: intel_aes_encrypt_ctr_128 ENDP
andre@0: 
andre@0: ; CTR entry point, 12 rounds (192-bit key schedule).
andre@0: intel_aes_encrypt_ctr_192 PROC
andre@0: gen_aes_ctr_func  12
andre@0: intel_aes_encrypt_ctr_192 ENDP
andre@0: 
andre@0: ; CTR entry point, 14 rounds (256-bit key schedule).
andre@0: intel_aes_encrypt_ctr_256 PROC
andre@0: gen_aes_ctr_func  14
andre@0: intel_aes_encrypt_ctr_256 ENDP
andre@0: 
andre@0: 
andre@0: END