Mercurial > trustbridge > nss-cmake-static
diff nss/lib/freebl/intel-aes-x64-masm.asm @ 0:1e5118fa0cb1
This is NSS with a CMake build system
To compile a static NSS library for Windows, we used the
Chromium-NSS fork and added a CMake build system that builds
it statically. See README.chromium for the Chromium
changes and README.trustbridge for our modifications.
author | Andre Heinecke <andre.heinecke@intevation.de> |
---|---|
date | Mon, 28 Jul 2014 10:47:06 +0200 |
parents | |
children |
line wrap: on
line diff
; Provenance (from the changeset view): new file b/nss/lib/freebl/intel-aes-x64-masm.asm,
; diffed against /dev/null (Thu Jan 01 00:00:00 1970 +0000), dated
; Mon Jul 28 10:47:06 2014 +0200, hunk @@ -0,0 +1,971 @@.
;
; LICENSE:
; This submission to NSS is to be made available under the terms of the
; Mozilla Public License, v. 2.0. You can obtain one at
; http://mozilla.org/MPL/2.0/.
;###############################################################################
; Copyright(c) 2014, Intel Corp.
; Developers and authors:
; Shay Gueron and Vlad Krasnov
; Intel Corporation, Israel Development Centre, Haifa, Israel
; Please send feedback directly to crypto.feedback.alias@intel.com


.DATA
ALIGN 16
; pshufb control masks used by the key-expansion routines below.
Lmask       dd 0c0f0e0dh, 0c0f0e0dh, 0c0f0e0dh, 0c0f0e0dh
Lmask192    dd 004070605h, 004070605h, 004070605h, 004070605h
Lmask256    dd 00c0f0e0dh, 00c0f0e0dh, 00c0f0e0dh, 00c0f0e0dh
; Constants fed to aesenclast by the key schedules; the code doubles them
; with "pslld xmm0, 1" between steps.
Lcon1       dd 1, 1, 1, 1
Lcon2       dd 1bh, 1bh, 1bh, 1bh

.CODE

; Register roles for the ECB/CBC bodies.  rcx and rdx arrive as the first two
; register arguments; r8 and r9d are reloaded from the stack by each body.
ctx         textequ <rcx>
output      textequ <rdx>
input       textequ <r8>
inputLen    textequ <r9d>


; aes_rnd i
;   One aesenc round on the eight blocks in xmm0..xmm7 using the 16-byte
;   round key at [i*16 + ctx].  Overwrites xmm8.
aes_rnd MACRO i
        movdqu      xmm8, [i*16 + ctx]
        FOR blk, <xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7>
        aesenc      blk, xmm8
        ENDM
        ENDM

; aes_last_rnd i
;   Same as aes_rnd but with aesenclast.  Overwrites xmm8.
aes_last_rnd MACRO i
        movdqu      xmm8, [i*16 + ctx]
        FOR blk, <xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7>
        aesenclast  blk, xmm8
        ENDM
        ENDM

; aes_dec_rnd i
;   One aesdec round on xmm0..xmm7 with the key at [i*16 + ctx].
;   Overwrites xmm8.
aes_dec_rnd MACRO i
        movdqu      xmm8, [i*16 + ctx]
        FOR blk, <xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7>
        aesdec      blk, xmm8
        ENDM
        ENDM

; aes_dec_last_rnd i
;   Same as aes_dec_rnd but with aesdeclast.  Overwrites xmm8.
aes_dec_last_rnd MACRO i
        movdqu      xmm8, [i*16 + ctx]
        FOR blk, <xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7>
        aesdeclast  blk, xmm8
        ENDM
        ENDM


;-------------------------------------------------------------------------------
; gen_aes_ecb_func enc, rnds
;   Emits the body of an ECB routine; the caller wraps it in PROC/ENDP.
;   enc  : 1 selects aesenc/aesenclast, any other value aesdec/aesdeclast.
;   rnds : round count; round keys 0..rnds are read starting at ctx+48.
;   In   : rcx = ctx, rdx = output,
;          [rsp+40] = input pointer, [rsp+48] = input length in bytes
;          (the 5th/6th argument slots of the Microsoft x64 convention).
;   Out  : rax = 0.
;   Saves/restores xmm6-xmm8 on the stack; overwrites rcx, rdx, r8, r9,
;   xmm0-xmm5 and flags.
;   Works on whole 16-byte blocks only: a trailing remainder shorter than
;   16 bytes is left untouched.
;-------------------------------------------------------------------------------
gen_aes_ecb_func MACRO enc, rnds
LOCAL ecb_bulk, ecb_tail, ecb_done

; Assembly-time selection of the round primitives for this expansion.
IF enc eq 1
        rnd         textequ <aes_rnd>
        lastrnd     textequ <aes_last_rnd>
        aesinst     textequ <aesenc>
        aeslastinst textequ <aesenclast>
ELSE
        rnd         textequ <aes_dec_rnd>
        lastrnd     textequ <aes_dec_last_rnd>
        aesinst     textequ <aesdec>
        aeslastinst textequ <aesdeclast>
ENDIF

        xor         inputLen, inputLen              ; cleared, then overwritten by the load below
        mov         input, [rsp + 1*8 + 8*4]        ; skip return address + 4 home slots
        mov         inputLen, [rsp + 1*8 + 8*5]

        ; Spill the non-volatile XMM registers this body uses.
        sub         rsp, 3*16
        movdqu      [rsp + 0*16], xmm6
        movdqu      [rsp + 1*16], xmm7
        movdqu      [rsp + 2*16], xmm8

        lea         ctx, [48+ctx]                   ; [i*16 + ctx] now addresses round key i

; Eight blocks per iteration while at least 128 bytes remain.
ecb_bulk:
        cmp         inputLen, 8*16
        jb          ecb_tail

        movdqu      xmm0, [0*16 + input]
        movdqu      xmm1, [1*16 + input]
        movdqu      xmm2, [2*16 + input]
        movdqu      xmm3, [3*16 + input]
        movdqu      xmm4, [4*16 + input]
        movdqu      xmm5, [5*16 + input]
        movdqu      xmm6, [6*16 + input]
        movdqu      xmm7, [7*16 + input]

        ; XOR in round key 0.
        movdqu      xmm8, [0*16 + ctx]
        FOR blk, <xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7>
        pxor        blk, xmm8
        ENDM

        ; Rounds 1..rnds-1, then the final round.
        i = 1
        WHILE i LT rnds
            rnd i
            i = i+1
        ENDM
        lastrnd rnds

        movdqu      [0*16 + output], xmm0
        movdqu      [1*16 + output], xmm1
        movdqu      [2*16 + output], xmm2
        movdqu      [3*16 + output], xmm3
        movdqu      [4*16 + output], xmm4
        movdqu      [5*16 + output], xmm5
        movdqu      [6*16 + output], xmm6
        movdqu      [7*16 + output], xmm7

        lea         input, [8*16 + input]
        lea         output, [8*16 + output]
        sub         inputLen, 8*16
        jmp         ecb_bulk

; One block per iteration; xmm7 serves as the round-key scratch register.
ecb_tail:
        cmp         inputLen, 1*16
        jb          ecb_done

        movdqu      xmm0, [input]
        movdqu      xmm7, [0*16 + ctx]
        pxor        xmm0, xmm7

        i = 1
        WHILE i LT rnds
            movdqu      xmm7, [i*16 + ctx]
            aesinst     xmm0, xmm7
            i = i+1
        ENDM
        movdqu      xmm7, [rnds*16 + ctx]
        aeslastinst xmm0, xmm7

        movdqu      [output], xmm0

        lea         input, [1*16 + input]
        lea         output, [1*16 + output]
        sub         inputLen, 1*16
        jmp         ecb_tail

ecb_done:
        xor         rax, rax                        ; return 0

        movdqu      xmm6, [rsp + 0*16]
        movdqu      xmm7, [rsp + 1*16]
        movdqu      xmm8, [rsp + 2*16]
        add         rsp, 3*16
        ret
ENDM

; ECB entry points: (enc, rounds) = (1|0, 10|12|14).
intel_aes_encrypt_ecb_128 PROC
        gen_aes_ecb_func 1, 10
intel_aes_encrypt_ecb_128 ENDP

intel_aes_encrypt_ecb_192 PROC
        gen_aes_ecb_func 1, 12
intel_aes_encrypt_ecb_192 ENDP

intel_aes_encrypt_ecb_256 PROC
        gen_aes_ecb_func 1, 14
intel_aes_encrypt_ecb_256 ENDP

intel_aes_decrypt_ecb_128 PROC
        gen_aes_ecb_func 0, 10
intel_aes_decrypt_ecb_128 ENDP

intel_aes_decrypt_ecb_192 PROC
        gen_aes_ecb_func 0, 12
intel_aes_decrypt_ecb_192 ENDP

intel_aes_decrypt_ecb_256 PROC
        gen_aes_ecb_func 0, 14
intel_aes_decrypt_ecb_256 ENDP


; Register roles for the key-schedule routines.
KEY         textequ <rcx>
KS          textequ <rdx>
ITR         textequ <r8>

;-------------------------------------------------------------------------------
; intel_aes_encrypt_init_128
;   In  : rcx = KEY (16 bytes are read), rdx = KS (destination).
;   Writes 11 consecutive 16-byte round keys starting at KS.
;   Overwrites rdx, r8, xmm0-xmm4 and flags; does not set rax.
;-------------------------------------------------------------------------------
intel_aes_encrypt_init_128 PROC

        movdqu      xmm1, [KEY]
        movdqu      [KS], xmm1                      ; round key 0 = the key itself
        movdqa      xmm2, xmm1

        lea         ITR, Lcon1
        movdqa      xmm0, [ITR]
        lea         ITR, Lmask
        movdqa      xmm4, [ITR]

        mov         ITR, 8                          ; eight keys from the Lcon1 sequence

Lenc_128_ks_loop:
        lea         KS, [16 + KS]
        dec         ITR                             ; flags survive to the jne: only SSE ops follow

        pshufb      xmm2, xmm4
        aesenclast  xmm2, xmm0
        pslld       xmm0, 1
        movdqa      xmm3, xmm1
        pslldq      xmm3, 4
        pxor        xmm1, xmm3
        pslldq      xmm3, 4
        pxor        xmm1, xmm3
        pslldq      xmm3, 4
        pxor        xmm1, xmm3
        pxor        xmm1, xmm2
        movdqu      [KS], xmm1
        movdqa      xmm2, xmm1

        jne         Lenc_128_ks_loop

        ; Last two keys use the Lcon2 sequence (1bh, then 1bh shifted left once).
        lea         ITR, Lcon2
        movdqa      xmm0, [ITR]

        pshufb      xmm2, xmm4
        aesenclast  xmm2, xmm0
        pslld       xmm0, 1
        movdqa      xmm3, xmm1
        pslldq      xmm3, 4
        pxor        xmm1, xmm3
        pslldq      xmm3, 4
        pxor        xmm1, xmm3
        pslldq      xmm3, 4
        pxor        xmm1, xmm3
        pxor        xmm1, xmm2
        movdqu      [16 + KS], xmm1
        movdqa      xmm2, xmm1

        pshufb      xmm2, xmm4
        aesenclast  xmm2, xmm0
        movdqa      xmm3, xmm1
        pslldq      xmm3, 4
        pxor        xmm1, xmm3
        pslldq      xmm3, 4
        pxor        xmm1, xmm3
        pslldq      xmm3, 4
        pxor        xmm1, xmm3
        pxor        xmm1, xmm2
        movdqu      [32 + KS], xmm1
        movdqa      xmm2, xmm1

        ret
intel_aes_encrypt_init_128 ENDP


;-------------------------------------------------------------------------------
; intel_aes_decrypt_init_128
;   In  : rcx = KEY, rdx = KS.
;   Runs intel_aes_encrypt_init_128, then reverses the order of the 11 round
;   keys in place and applies aesimc to keys 1..9 (keys 0 and 10 are only
;   swapped).
;   Overwrites r8, xmm0-xmm4 and flags; rcx/rdx are restored after the call.
;-------------------------------------------------------------------------------
intel_aes_decrypt_init_128 PROC

        ; The callee advances KS, so keep the original pointers on the stack.
        push        KS
        push        KEY

        call        intel_aes_encrypt_init_128

        pop         KEY
        pop         KS

        ; Swap the outermost keys unchanged.
        movdqu      xmm0, [0*16 + KS]
        movdqu      xmm1, [10*16 + KS]
        movdqu      [10*16 + KS], xmm0
        movdqu      [0*16 + KS], xmm1

        ; Swap key i with key 10-i, transforming both with aesimc.
        i = 1
        WHILE i LT 5
            movdqu      xmm0, [i*16 + KS]
            movdqu      xmm1, [(10-i)*16 + KS]

            aesimc      xmm0, xmm0
            aesimc      xmm1, xmm1

            movdqu      [(10-i)*16 + KS], xmm0
            movdqu      [i*16 + KS], xmm1

            i = i+1
        ENDM

        ; The middle key stays in place and is only transformed.
        movdqu      xmm0, [5*16 + KS]
        aesimc      xmm0, xmm0
        movdqu      [5*16 + KS], xmm0
        ret
intel_aes_decrypt_init_128 ENDP


;-------------------------------------------------------------------------------
; intel_aes_encrypt_init_192
;   In  : rcx = KEY (24 bytes are read), rdx = KS (destination).
;   Performs 14 16-byte stores covering offsets 0..223 from the incoming KS;
;   the final store lands at offset 208, so the destination needs room for
;   224 bytes.
;   Saves/restores xmm6 and xmm7; overwrites rdx, r8, xmm0-xmm5 and flags;
;   does not set rax.
;-------------------------------------------------------------------------------
intel_aes_encrypt_init_192 PROC

        sub         rsp, 16*2
        movdqu      [16*0 + rsp], xmm6
        movdqu      [16*1 + rsp], xmm7

        movdqu      xmm1, [KEY]                     ; key bytes 0..15
        mov         ITR, [16 + KEY]                 ; key bytes 16..23
        movd        xmm3, ITR

        movdqu      [KS], xmm1
        movdqa      xmm5, xmm3

        lea         ITR, Lcon1
        movdqu      xmm0, [ITR]
        lea         ITR, Lmask192
        movdqu      xmm4, [ITR]

        mov         ITR, 4                          ; each pass emits 48 bytes of schedule

Lenc_192_ks_loop:
        ; First expansion step of the pass.
        movdqa      xmm2, xmm3
        pshufb      xmm2, xmm4
        aesenclast  xmm2, xmm0
        pslld       xmm0, 1

        movdqa      xmm6, xmm1
        movdqa      xmm7, xmm3
        pslldq      xmm6, 4
        pslldq      xmm7, 4
        pxor        xmm1, xmm6
        pxor        xmm3, xmm7
        pslldq      xmm6, 4
        pxor        xmm1, xmm6
        pslldq      xmm6, 4
        pxor        xmm1, xmm6
        pxor        xmm1, xmm2
        pshufd      xmm2, xmm1, 0ffh
        pxor        xmm3, xmm2

        ; Repack the 64-bit halves into two 16-byte stores.
        movdqa      xmm6, xmm1
        shufpd      xmm5, xmm1, 00h
        shufpd      xmm6, xmm3, 01h

        movdqu      [16 + KS], xmm5
        movdqu      [32 + KS], xmm6

        ; Second expansion step of the pass.
        movdqa      xmm2, xmm3
        pshufb      xmm2, xmm4
        aesenclast  xmm2, xmm0
        pslld       xmm0, 1

        movdqa      xmm6, xmm1
        movdqa      xmm7, xmm3
        pslldq      xmm6, 4
        pslldq      xmm7, 4
        pxor        xmm1, xmm6
        pxor        xmm3, xmm7
        pslldq      xmm6, 4
        pxor        xmm1, xmm6
        pslldq      xmm6, 4
        pxor        xmm1, xmm6
        pxor        xmm1, xmm2
        pshufd      xmm2, xmm1, 0ffh
        pxor        xmm3, xmm2

        movdqu      [48 + KS], xmm1
        movdqa      xmm5, xmm3

        lea         KS, [48 + KS]

        dec         ITR
        jnz         Lenc_192_ks_loop

        movdqu      [16 + KS], xmm5                 ; final store, offset 208 from the original KS

        movdqu      xmm7, [16*1 + rsp]
        movdqu      xmm6, [16*0 + rsp]
        add         rsp, 16*2
        ret
intel_aes_encrypt_init_192 ENDP

;-------------------------------------------------------------------------------
; intel_aes_decrypt_init_192
;   In  : rcx = KEY, rdx = KS.
;   Runs intel_aes_encrypt_init_192, then reverses round keys 0..12 in place
;   and applies aesimc to keys 1..11 (keys 0 and 12 are only swapped).
;   Overwrites r8, xmm0-xmm5 and flags; rcx/rdx are restored after the call.
;-------------------------------------------------------------------------------
intel_aes_decrypt_init_192 PROC
        ; The callee advances KS, so keep the original pointers on the stack.
        push        KS
        push        KEY

        call        intel_aes_encrypt_init_192

        pop         KEY
        pop         KS

        movdqu      xmm0, [0*16 + KS]
        movdqu      xmm1, [12*16 + KS]
        movdqu      [12*16 + KS], xmm0
        movdqu      [0*16 + KS], xmm1

        ; Swap key i with key 12-i, transforming both with aesimc.
        i = 1
        WHILE i LT 6
            movdqu      xmm0, [i*16 + KS]
            movdqu      xmm1, [(12-i)*16 + KS]

            aesimc      xmm0, xmm0
            aesimc      xmm1, xmm1

            movdqu      [(12-i)*16 + KS], xmm0
            movdqu      [i*16 + KS], xmm1

            i = i+1
        ENDM

        ; The middle key stays in place and is only transformed.
        movdqu      xmm0, [6*16 + KS]
        aesimc      xmm0, xmm0
        movdqu      [6*16 + KS], xmm0
        ret
intel_aes_decrypt_init_192 ENDP


;-------------------------------------------------------------------------------
; intel_aes_encrypt_init_256
;   In  : rcx = KEY (32 bytes are read), rdx = KS (destination).
;   Writes 15 consecutive 16-byte round keys (240 bytes) starting at the
;   incoming KS.
;   Saves/restores xmm6 and xmm7 (xmm7 is not otherwise referenced here);
;   overwrites rdx, r8, xmm0-xmm5 and flags; does not set rax.
;-------------------------------------------------------------------------------
intel_aes_encrypt_init_256 PROC
        sub         rsp, 16*2
        movdqu      [16*0 + rsp], xmm6
        movdqu      [16*1 + rsp], xmm7

        movdqu      xmm1, [16*0 + KEY]
        movdqu      xmm3, [16*1 + KEY]

        ; Round keys 0 and 1 are the two key halves.
        movdqu      [16*0 + KS], xmm1
        movdqu      [16*1 + KS], xmm3

        lea         ITR, Lcon1
        movdqu      xmm0, [ITR]
        lea         ITR, Lmask256
        movdqu      xmm5, [ITR]

        pxor        xmm6, xmm6                      ; all-zero operand for the second aesenclast

        mov         ITR, 6                          ; each pass emits two round keys

Lenc_256_ks_loop:

        ; Even-numbered key: derived from xmm3 with the running constant in xmm0.
        movdqa      xmm2, xmm3
        pshufb      xmm2, xmm5
        aesenclast  xmm2, xmm0
        pslld       xmm0, 1
        movdqa      xmm4, xmm1
        pslldq      xmm4, 4
        pxor        xmm1, xmm4
        pslldq      xmm4, 4
        pxor        xmm1, xmm4
        pslldq      xmm4, 4
        pxor        xmm1, xmm4
        pxor        xmm1, xmm2
        movdqu      [16*2 + KS], xmm1

        ; Odd-numbered key: derived from xmm1 with a zero constant.
        pshufd      xmm2, xmm1, 0ffh
        aesenclast  xmm2, xmm6
        movdqa      xmm4, xmm3
        pslldq      xmm4, 4
        pxor        xmm3, xmm4
        pslldq      xmm4, 4
        pxor        xmm3, xmm4
        pslldq      xmm4, 4
        pxor        xmm3, xmm4
        pxor        xmm3, xmm2
        movdqu      [16*3 + KS], xmm3

        lea         KS, [32 + KS]
        dec         ITR
        jnz         Lenc_256_ks_loop

        ; Final (15th) round key.
        movdqa      xmm2, xmm3
        pshufb      xmm2, xmm5
        aesenclast  xmm2, xmm0
        movdqa      xmm4, xmm1
        pslldq      xmm4, 4
        pxor        xmm1, xmm4
        pslldq      xmm4, 4
        pxor        xmm1, xmm4
        pslldq      xmm4, 4
        pxor        xmm1, xmm4
        pxor        xmm1, xmm2
        movdqu      [16*2 + KS], xmm1

        movdqu      xmm7, [16*1 + rsp]
        movdqu      xmm6, [16*0 + rsp]
        add         rsp, 16*2
        ret

intel_aes_encrypt_init_256 ENDP


;-------------------------------------------------------------------------------
; intel_aes_decrypt_init_256
;   In  : rcx = KEY, rdx = KS.
;   Runs intel_aes_encrypt_init_256, then reverses round keys 0..14 in place
;   and applies aesimc to keys 1..13 (keys 0 and 14 are only swapped).
;   Overwrites r8, xmm0-xmm5 and flags; rcx/rdx are restored after the call.
;-------------------------------------------------------------------------------
intel_aes_decrypt_init_256 PROC
        ; The callee advances KS, so keep the original pointers on the stack.
        push        KS
        push        KEY

        call        intel_aes_encrypt_init_256

        pop         KEY
        pop         KS

        movdqu      xmm0, [0*16 + KS]
        movdqu      xmm1, [14*16 + KS]
        movdqu      [14*16 + KS], xmm0
        movdqu      [0*16 + KS], xmm1

        ; Swap key i with key 14-i, transforming both with aesimc.
        i = 1
        WHILE i LT 7
            movdqu      xmm0, [i*16 + KS]
            movdqu      xmm1, [(14-i)*16 + KS]

            aesimc      xmm0, xmm0
            aesimc      xmm1, xmm1

            movdqu      [(14-i)*16 + KS], xmm0
            movdqu      [i*16 + KS], xmm1

            i = i+1
        ENDM

        ; The middle key stays in place and is only transformed.
        movdqu      xmm0, [7*16 + KS]
        aesimc      xmm0, xmm0
        movdqu      [7*16 + KS], xmm0
        ret
intel_aes_decrypt_init_256 ENDP



;-------------------------------------------------------------------------------
; gen_aes_cbc_enc_func rnds
;   Emits the body of a CBC-encrypt routine; the caller wraps it in PROC/ENDP.
;   rnds : round count; round keys 0..rnds are read starting at ctx+48.
;   In   : rcx = ctx, rdx = output,
;          [rsp+40] = input pointer, [rsp+48] = input length in bytes.
;   The 16 bytes at ctx+16 are used as the chaining block: loaded on entry and
;   overwritten with the last ciphertext block on exit.
;   Out  : rax = 0.
;   Saves/restores xmm6-xmm8; overwrites rcx, rdx, r8, r9, xmm0-xmm5, flags.
;   Only whole 16-byte blocks are processed.
;-------------------------------------------------------------------------------
gen_aes_cbc_enc_func MACRO rnds
LOCAL cbce_block, cbce_done

        mov         input, [rsp + 1*8 + 8*4]        ; skip return address + 4 home slots
        mov         inputLen, [rsp + 1*8 + 8*5]

        sub         rsp, 3*16
        movdqu      [rsp + 0*16], xmm6
        movdqu      [rsp + 1*16], xmm7
        movdqu      [rsp + 2*16], xmm8

        lea         ctx, [48+ctx]                   ; [i*16 + ctx] now addresses round key i

        movdqu      xmm0, [-32+ctx]                 ; chaining block (original ctx+16)

        ; Keep round keys 0..5 resident in xmm2..xmm7 for the whole loop.
        movdqu      xmm2, [0*16 + ctx]
        movdqu      xmm3, [1*16 + ctx]
        movdqu      xmm4, [2*16 + ctx]
        movdqu      xmm5, [3*16 + ctx]
        movdqu      xmm6, [4*16 + ctx]
        movdqu      xmm7, [5*16 + ctx]

cbce_block:
        cmp         inputLen, 1*16
        jb          cbce_done

        ; xmm0 = previous ciphertext XOR plaintext XOR round key 0.
        movdqu      xmm1, [input]
        pxor        xmm1, xmm2
        pxor        xmm0, xmm1

        aesenc      xmm0, xmm3
        aesenc      xmm0, xmm4
        aesenc      xmm0, xmm5
        aesenc      xmm0, xmm6
        aesenc      xmm0, xmm7

        ; Remaining round keys are fetched from memory through xmm8.
        i = 6
        WHILE i LT rnds
            movdqu      xmm8, [i*16 + ctx]
            aesenc      xmm0, xmm8
            i = i+1
        ENDM
        movdqu      xmm8, [rnds*16 + ctx]
        aesenclast  xmm0, xmm8

        movdqu      [output], xmm0

        lea         input, [1*16 + input]
        lea         output, [1*16 + output]
        sub         inputLen, 1*16
        jmp         cbce_block

cbce_done:
        movdqu      [-32+ctx], xmm0                 ; write the chaining block back

        xor         rax, rax                        ; return 0

        movdqu      xmm6, [rsp + 0*16]
        movdqu      xmm7, [rsp + 1*16]
        movdqu      xmm8, [rsp + 2*16]
        add         rsp, 3*16
        ret

ENDM

;-------------------------------------------------------------------------------
; gen_aes_cbc_dec_func rnds
;   Emits the body of a CBC-decrypt routine; the caller wraps it in PROC/ENDP.
;   rnds : round count; round keys 0..rnds are read starting at ctx+48.
;   In   : rcx = ctx, rdx = output,
;          [rsp+40] = input pointer, [rsp+48] = input length in bytes.
;   The 16 bytes at ctx+16 hold the chaining block and are updated with the
;   last ciphertext block consumed.
;   Out  : rax = 0.
;   Saves/restores xmm6-xmm8; overwrites rcx, rdx, r8, r9, xmm0-xmm5, flags.
;   Only whole 16-byte blocks are processed.
;-------------------------------------------------------------------------------
gen_aes_cbc_dec_func MACRO rnds
LOCAL cbcd_bulk, cbcd_tail, cbcd_tail_setup, cbcd_done

        mov         input, [rsp + 1*8 + 8*4]        ; skip return address + 4 home slots
        mov         inputLen, [rsp + 1*8 + 8*5]

        sub         rsp, 3*16
        movdqu      [rsp + 0*16], xmm6
        movdqu      [rsp + 1*16], xmm7
        movdqu      [rsp + 2*16], xmm8

        lea         ctx, [48+ctx]                   ; [i*16 + ctx] now addresses round key i

; Eight blocks per iteration while at least 128 bytes remain.
cbcd_bulk:
        cmp         inputLen, 8*16
        jb          cbcd_tail_setup

        movdqu      xmm0, [0*16 + input]
        movdqu      xmm1, [1*16 + input]
        movdqu      xmm2, [2*16 + input]
        movdqu      xmm3, [3*16 + input]
        movdqu      xmm4, [4*16 + input]
        movdqu      xmm5, [5*16 + input]
        movdqu      xmm6, [6*16 + input]
        movdqu      xmm7, [7*16 + input]

        ; XOR in round key 0.
        movdqu      xmm8, [0*16 + ctx]
        FOR blk, <xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7>
        pxor        blk, xmm8
        ENDM

        i = 1
        WHILE i LT rnds
            aes_dec_rnd i
            i = i+1
        ENDM
        aes_dec_last_rnd rnds

        ; Undo the chaining: block 0 uses the saved chaining block, block k
        ; uses ciphertext block k-1, re-read from the input buffer.
        movdqu      xmm8, [-32 + ctx]
        pxor        xmm0, xmm8
        movdqu      xmm8, [0*16 + input]
        pxor        xmm1, xmm8
        movdqu      xmm8, [1*16 + input]
        pxor        xmm2, xmm8
        movdqu      xmm8, [2*16 + input]
        pxor        xmm3, xmm8
        movdqu      xmm8, [3*16 + input]
        pxor        xmm4, xmm8
        movdqu      xmm8, [4*16 + input]
        pxor        xmm5, xmm8
        movdqu      xmm8, [5*16 + input]
        pxor        xmm6, xmm8
        movdqu      xmm8, [6*16 + input]
        pxor        xmm7, xmm8
        movdqu      xmm8, [7*16 + input]            ; next chaining block, read before any store

        movdqu      [0*16 + output], xmm0
        movdqu      [1*16 + output], xmm1
        movdqu      [2*16 + output], xmm2
        movdqu      [3*16 + output], xmm3
        movdqu      [4*16 + output], xmm4
        movdqu      [5*16 + output], xmm5
        movdqu      [6*16 + output], xmm6
        movdqu      [7*16 + output], xmm7
        movdqu      [-32 + ctx], xmm8

        lea         input, [8*16 + input]
        lea         output, [8*16 + output]
        sub         inputLen, 8*16
        jmp         cbcd_bulk

cbcd_tail_setup:

        movdqu      xmm3, [-32 + ctx]               ; xmm3 carries the chaining block in the tail loop

; One block per iteration.
cbcd_tail:
        cmp         inputLen, 1*16
        jb          cbcd_done

        movdqu      xmm0, [input]
        movdqa      xmm4, xmm0                      ; keep the ciphertext for the next iteration
        movdqu      xmm7, [0*16 + ctx]
        pxor        xmm0, xmm7

        i = 1
        WHILE i LT rnds
            movdqu      xmm7, [i*16 + ctx]
            aesdec      xmm0, xmm7
            i = i+1
        ENDM
        movdqu      xmm7, [rnds*16 + ctx]
        aesdeclast  xmm0, xmm7
        pxor        xmm3, xmm0

        movdqu      [output], xmm3
        movdqa      xmm3, xmm4

        lea         input, [1*16 + input]
        lea         output, [1*16 + output]
        sub         inputLen, 1*16
        jmp         cbcd_tail

cbcd_done:
        movdqu      [-32 + ctx], xmm3               ; write the chaining block back
        xor         rax, rax                        ; return 0

        movdqu      xmm6, [rsp + 0*16]
        movdqu      xmm7, [rsp + 1*16]
        movdqu      xmm8, [rsp + 2*16]
        add         rsp, 3*16
        ret
ENDM

; CBC entry points; the macro argument is the round count.
intel_aes_encrypt_cbc_128 PROC
        gen_aes_cbc_enc_func 10
intel_aes_encrypt_cbc_128 ENDP

intel_aes_encrypt_cbc_192 PROC
        gen_aes_cbc_enc_func 12
intel_aes_encrypt_cbc_192 ENDP

intel_aes_encrypt_cbc_256 PROC
        gen_aes_cbc_enc_func 14
intel_aes_encrypt_cbc_256 ENDP

intel_aes_decrypt_cbc_128 PROC
        gen_aes_cbc_dec_func 10
intel_aes_decrypt_cbc_128 ENDP

intel_aes_decrypt_cbc_192 PROC
        gen_aes_cbc_dec_func 12
intel_aes_decrypt_cbc_192 ENDP

intel_aes_decrypt_cbc_256 PROC
        gen_aes_cbc_dec_func 14
intel_aes_decrypt_cbc_256 ENDP



; Register roles specific to the CTR body.
ctrCtx      textequ <r10>
CTR         textequ <r11d>
CTRSave     textequ <eax>

;-------------------------------------------------------------------------------
; gen_aes_ctr_func rnds
;   Emits the body of a CTR routine; the caller wraps it in PROC/ENDP.
;   rnds : round count; round keys 0..rnds are read starting 48 bytes into the
;          cipher context.
;   In   : rcx = CTR context (copied to r10), rdx = output,
;          [rsp+40] = input pointer, [rsp+48] = input length in bytes.
;          [ctrCtx+8]  = pointer to the cipher context holding the round keys,
;          [ctrCtx+16] = 16-byte counter block.
;   Out  : rax = 0; the counter block at [ctrCtx+16] is advanced by the number
;          of blocks processed.
;   Only the last 4 bytes of the counter block are incremented (as a
;   byte-swapped 32-bit value); no carry reaches the other 12 bytes.
;   A 16-byte-aligned stack scratch area holds eight upcoming counter blocks,
;   each already XORed with round key 0.
;   Saves/restores xmm6-xmm8 and rbp; overwrites rax, rcx, rdx, r8-r11,
;   xmm0-xmm5 and flags.  Only whole 16-byte blocks are processed.
;-------------------------------------------------------------------------------
gen_aes_ctr_func MACRO rnds
LOCAL ctr_bulk, ctr_tail, ctr_done

        mov         input, [rsp + 8*1 + 4*8]        ; skip return address + 4 home slots
        mov         inputLen, [rsp + 8*1 + 5*8]

        mov         ctrCtx, ctx
        mov         ctx, [8+ctrCtx]                 ; cipher context pointer
        lea         ctx, [48+ctx]                   ; [i*16 + ctx] now addresses round key i

        sub         rsp, 3*16
        movdqu      [rsp + 0*16], xmm6
        movdqu      [rsp + 1*16], xmm7
        movdqu      [rsp + 2*16], xmm8

        ; rbp remembers rsp so the aligned scratch area can be released later.
        push        rbp
        mov         rbp, rsp
        sub         rsp, 8*16
        and         rsp, -16

        movdqu      xmm0, [16+ctrCtx]
        mov         CTRSave, DWORD PTR [ctrCtx + 16 + 3*4]
        bswap       CTRSave                         ; counter word in native byte order
        movdqu      xmm1, [ctx + 0*16]

        pxor        xmm0, xmm1                      ; counter block XOR round key 0

        ; Fill all eight slots with the current counter block ...
        i = 0
        WHILE i LT 8
            movdqa      [rsp + i*16], xmm0
            i = i+1
        ENDM

        ; ... then patch the last dword of slots 1..7 with counter+1..counter+7,
        ; each re-XORed with the matching dword of round key 0.
        i = 1
        WHILE i LT 8
            inc         CTRSave
            mov         CTR, CTRSave
            bswap       CTR
            xor         CTR, DWORD PTR [ctx + 3*4]
            mov         DWORD PTR [rsp + i*16 + 3*4], CTR
            i = i+1
        ENDM


; Eight blocks per iteration while at least 128 bytes remain.
ctr_bulk:
        cmp         inputLen, 8*16
        jb          ctr_tail

        movdqu      xmm0, [0*16 + rsp]
        movdqu      xmm1, [1*16 + rsp]
        movdqu      xmm2, [2*16 + rsp]
        movdqu      xmm3, [3*16 + rsp]
        movdqu      xmm4, [4*16 + rsp]
        movdqu      xmm5, [5*16 + rsp]
        movdqu      xmm6, [6*16 + rsp]
        movdqu      xmm7, [7*16 + rsp]

        ; Rounds 1..8 are interleaved with preparing the next eight counters.
        i = 1
        WHILE i LE 8
            aes_rnd i

            inc         CTRSave
            mov         CTR, CTRSave
            bswap       CTR
            xor         CTR, DWORD PTR [ctx + 3*4]
            mov         DWORD PTR [rsp + (i-1)*16 + 3*4], CTR

            i = i+1
        ENDM
        WHILE i LT rnds
            aes_rnd i
            i = i+1
        ENDM
        aes_last_rnd rnds

        ; XOR the keystream with the input.
        movdqu      xmm8, [0*16 + input]
        pxor        xmm0, xmm8
        movdqu      xmm8, [1*16 + input]
        pxor        xmm1, xmm8
        movdqu      xmm8, [2*16 + input]
        pxor        xmm2, xmm8
        movdqu      xmm8, [3*16 + input]
        pxor        xmm3, xmm8
        movdqu      xmm8, [4*16 + input]
        pxor        xmm4, xmm8
        movdqu      xmm8, [5*16 + input]
        pxor        xmm5, xmm8
        movdqu      xmm8, [6*16 + input]
        pxor        xmm6, xmm8
        movdqu      xmm8, [7*16 + input]
        pxor        xmm7, xmm8

        movdqu      [0*16 + output], xmm0
        movdqu      [1*16 + output], xmm1
        movdqu      [2*16 + output], xmm2
        movdqu      [3*16 + output], xmm3
        movdqu      [4*16 + output], xmm4
        movdqu      [5*16 + output], xmm5
        movdqu      [6*16 + output], xmm6
        movdqu      [7*16 + output], xmm7

        lea         input, [8*16 + input]
        lea         output, [8*16 + output]
        sub         inputLen, 8*16
        jmp         ctr_bulk


; One block per iteration.  rsp itself walks through the prepared counter
; slots; fewer than eight blocks remain here, so it stays inside the scratch area.
ctr_tail:
        cmp         inputLen, 1*16
        jb          ctr_done

        movdqu      xmm0, [rsp]
        add         rsp, 16

        i = 1
        WHILE i LT rnds
            movdqu      xmm7, [i*16 + ctx]
            aesenc      xmm0, xmm7
            i = i+1
        ENDM
        movdqu      xmm7, [rnds*16 + ctx]
        aesenclast  xmm0, xmm7

        movdqu      xmm7, [input]
        pxor        xmm0, xmm7
        movdqu      [output], xmm0

        lea         input, [1*16 + input]
        lea         output, [1*16 + output]
        sub         inputLen, 1*16
        jmp         ctr_tail

ctr_done:

        ; [rsp] is the next unused counter slot; strip round key 0 again and
        ; store it back as the new counter block.
        movdqu      xmm0, [rsp]
        movdqu      xmm1, [ctx + 0*16]
        pxor        xmm0, xmm1
        movdqu      [16+ctrCtx], xmm0


        xor         rax, rax                        ; return 0
        mov         rsp, rbp                        ; release the scratch area
        pop         rbp

        movdqu      xmm6, [rsp + 0*16]
        movdqu      xmm7, [rsp + 1*16]
        movdqu      xmm8, [rsp + 2*16]
        add         rsp, 3*16

        ret
ENDM


; CTR entry points; the macro argument is the round count.
intel_aes_encrypt_ctr_128 PROC
        gen_aes_ctr_func 10
intel_aes_encrypt_ctr_128 ENDP

intel_aes_encrypt_ctr_192 PROC
        gen_aes_ctr_func 12
intel_aes_encrypt_ctr_192 ENDP

intel_aes_encrypt_ctr_256 PROC
        gen_aes_ctr_func 14
intel_aes_encrypt_ctr_256 ENDP


END