Mercurial > trustbridge > nss-cmake-static
diff nss/lib/freebl/mpi/mpi_x86_asm.c @ 0:1e5118fa0cb1
This is NSS with a CMake build system.
To compile a static NSS library for Windows we've used the
Chromium-NSS fork and added a Cmake buildsystem to compile
it statically for Windows. See README.chromium for chromium
changes and README.trustbridge for our modifications.
author | Andre Heinecke <andre.heinecke@intevation.de> |
---|---|
date | Mon, 28 Jul 2014 10:47:06 +0200 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/nss/lib/freebl/mpi/mpi_x86_asm.c Mon Jul 28 10:47:06 2014 +0200
@@ -0,0 +1,535 @@
+/*
+ * mpi_x86_asm.c - MSVC inline assembly implementation of s_mpv_ functions.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "mpi-priv.h"
+
+static int is_sse = -1; /* cached s_mpi_is_sse2() result; negative = not probed yet */
+extern unsigned long s_mpi_is_sse2();
+
+/*
+ * s_mpv_mul_d: c[i] = low digit of (a[i] * b + carry) for i < a_len; the
+ * final carry digit is stored to c[a_len].
+ * is_sse selects the path: 0 = x86, > 0 = SSE2; a negative value first
+ * calls s_mpi_is_sse2() and caches its result in is_sse.
+ * x86 path frame (after "sub esp,28" and the three pushes):
+ * ebp - 40: caller's ebx
+ * ebp - 36: caller's esi
+ * ebp - 32: caller's edi
+ * ebp - 28 .. ebp - 4: reserved, never referenced
+ * ebp + 0: caller's ebp
+ * ebp + 4: return address
+ * ebp + 8: a argument
+ * ebp + 12: a_len argument
+ * ebp + 16: b argument
+ * ebp + 20: c argument
+ * SSE2 path: same arguments; edi saved at ebp - 4, esi at ebp - 8;
+ * mm0 = a[i], then product; mm1 = b; mm2 = running carry.
+ * registers (x86 path):
+ * eax: a[i], then low half of product
+ * ebx: carry          ecx: a_len (loop counter)
+ * edx: b, then high half of product
+ * esi: a ptr          edi: c ptr
+ */
+__declspec(naked) void
+s_mpv_mul_d(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
+{
+  __asm {
+    mov eax, is_sse
+    cmp eax, 0
+    je s_mpv_mul_d_x86
+    jg s_mpv_mul_d_sse2
+    call s_mpi_is_sse2
+    mov is_sse, eax
+    cmp eax, 0
+    jg s_mpv_mul_d_sse2
+s_mpv_mul_d_x86:
+    push ebp
+    mov ebp,esp
+    sub esp,28
+    push edi
+    push esi
+    push ebx
+    mov ebx,0 ; carry = 0
+    mov ecx,[ebp+12] ; ecx = a_len
+    mov edi,[ebp+20]
+    cmp ecx,0
+    je L_2 ; jmp if a_len == 0
+    mov esi,[ebp+8] ; esi = a
+    cld
+L_1:
+    lodsd ; eax = [ds:esi]; esi += 4
+    mov edx,[ebp+16] ; edx = b
+    mul edx ; edx:eax = Phi:Plo = a_i * b
+
+    add eax,ebx ; add carry (ebx) to edx:eax
+    adc edx,0
+    mov ebx,edx ; high half of product becomes next carry
+
+    stosd ; [es:edi] = ax; edi += 4;
+    dec ecx ; --a_len
+    jnz L_1 ; jmp if a_len != 0
+L_2:
+    mov [edi],ebx ; *c = carry
+    pop ebx
+    pop esi
+    pop edi
+    leave
+    ret
+    nop
+s_mpv_mul_d_sse2:
+    push ebp
+    mov ebp, esp
+    push edi
+    push esi
+    psubq mm2, mm2 ; carry = 0
+    mov ecx, [ebp+12] ; ecx = a_len
+    movd mm1, [ebp+16] ; mm1 = b
+    mov edi, [ebp+20]
+    cmp ecx, 0
+    je L_6 ; jmp if a_len == 0
+    mov esi, [ebp+8] ; esi = a
+    cld
+L_5:
+    movd mm0, [esi] ; mm0 = *a++
+    add esi, 4
+    pmuludq mm0, mm1 ; mm0 = b * *a++
+    paddq mm2, mm0 ; add the carry
+    movd [edi], mm2 ; store the 32bit result
+    add edi, 4
+    psrlq mm2, 32 ; save the carry
+    dec ecx ; --a_len
+    jnz L_5 ; jmp if a_len != 0
+L_6:
+    movd [edi], mm2 ; *c = carry
+    emms
+    pop esi
+    pop edi
+    leave
+    ret
+    nop
+  }
+}
+
+/*
+ * s_mpv_mul_d_add: c[i] = low digit of (a[i] * b + c[i] + carry) for
+ * i < a_len; the final carry digit is stored to (overwrites) c[a_len].
+ * Path selection via is_sse works as in s_mpv_mul_d.
+ * x86 path frame (after "sub esp,28" and the three pushes):
+ * ebp - 40: caller's ebx
+ * ebp - 36: caller's esi
+ * ebp - 32: caller's edi
+ * ebp - 28 .. ebp - 4: reserved, never referenced
+ * ebp + 0: caller's ebp
+ * ebp + 4: return address
+ * ebp + 8: a argument
+ * ebp + 12: a_len argument
+ * ebp + 16: b argument
+ * ebp + 20: c argument
+ * SSE2 path: same arguments; edi saved at ebp - 4, esi at ebp - 8;
+ * mm0 = a[i] * b, then c[i]; mm1 = b; mm2 = running carry.
+ * registers (x86 path):
+ * eax: a[i], then low half of product
+ * ebx: carry, also scratch for the current c[i]
+ * ecx: a_len (loop counter)
+ * edx: b, then high half of product
+ * esi: a ptr          edi: c ptr
+ */
+__declspec(naked) void
+s_mpv_mul_d_add(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
+{
+  __asm {
+    mov eax, is_sse
+    cmp eax, 0
+    je s_mpv_mul_d_add_x86
+    jg s_mpv_mul_d_add_sse2
+    call s_mpi_is_sse2
+    mov is_sse, eax
+    cmp eax, 0
+    jg s_mpv_mul_d_add_sse2
+s_mpv_mul_d_add_x86:
+    push ebp
+    mov ebp,esp
+    sub esp,28
+    push edi
+    push esi
+    push ebx
+    mov ebx,0 ; carry = 0
+    mov ecx,[ebp+12] ; ecx = a_len
+    mov edi,[ebp+20]
+    cmp ecx,0
+    je L_11 ; jmp if a_len == 0
+    mov esi,[ebp+8] ; esi = a
+    cld
+L_10:
+    lodsd ; eax = [ds:esi]; esi += 4
+    mov edx,[ebp+16] ; edx = b
+    mul edx ; edx:eax = Phi:Plo = a_i * b
+
+    add eax,ebx ; add carry (ebx) to edx:eax
+    adc edx,0
+    mov ebx,[edi] ; add in current word from *c
+    add eax,ebx
+    adc edx,0
+    mov ebx,edx ; high half of product becomes next carry
+
+    stosd ; [es:edi] = ax; edi += 4;
+    dec ecx ; --a_len
+    jnz L_10 ; jmp if a_len != 0
+L_11:
+    mov [edi],ebx ; *c = carry
+    pop ebx
+    pop esi
+    pop edi
+    leave
+    ret
+    nop
+s_mpv_mul_d_add_sse2:
+    push ebp
+    mov ebp, esp
+    push edi
+    push esi
+    psubq mm2, mm2 ; carry = 0
+    mov ecx, [ebp+12] ; ecx = a_len
+    movd mm1, [ebp+16] ; mm1 = b
+    mov edi, [ebp+20]
+    cmp ecx, 0
+    je L_16 ; jmp if a_len == 0
+    mov esi, [ebp+8] ; esi = a
+    cld
+L_15:
+    movd mm0, [esi] ; mm0 = *a++
+    add esi, 4
+    pmuludq mm0, mm1 ; mm0 = b * *a++
+    paddq mm2, mm0 ; add the carry
+    movd mm0, [edi]
+    paddq mm2, mm0 ; add the carry
+    movd [edi], mm2 ; store the 32bit result
+    add edi, 4
+    psrlq mm2, 32 ; save the carry
+    dec ecx ; --a_len
+    jnz L_15 ; jmp if a_len != 0
+L_16:
+    movd [edi], mm2 ; *c = carry
+    emms
+    pop esi
+    pop edi
+    leave
+    ret
+    nop
+  }
+}
+
+/*
+ * s_mpv_mul_d_add_prop: c[i] = low digit of (a[i] * b + c[i] + carry) for
+ * i < a_len; the final carry is then added into c[a_len] and upward for as
+ * long as a carry-out results. NOTE(review): that walk is not bounded here;
+ * the caller presumably guarantees c is long enough - confirm.
+ * Path selection via is_sse works as in s_mpv_mul_d.
+ * x86 path frame (after "sub esp,28" and the three pushes):
+ * ebp - 40: caller's ebx
+ * ebp - 36: caller's esi
+ * ebp - 32: caller's edi
+ * ebp - 28 .. ebp - 4: reserved, never referenced
+ * ebp + 0: caller's ebp
+ * ebp + 4: return address
+ * ebp + 8: a argument
+ * ebp + 12: a_len argument
+ * ebp + 16: b argument
+ * ebp + 20: c argument
+ * SSE2 path: edi, esi, ebx saved at ebp - 4, - 8, - 12; mm1 = b, mm2 = carry.
+ * registers (x86 path):
+ * eax: a[i], then low half of product; c digit during propagation
+ * ebx: carry, also scratch for the current c[i]
+ * ecx: a_len (loop counter)   edx: b, then high half of product
+ * esi: a ptr                  edi: c ptr
+ */
+__declspec(naked) void
+s_mpv_mul_d_add_prop(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
+{
+  __asm {
+    mov eax, is_sse
+    cmp eax, 0
+    je s_mpv_mul_d_add_prop_x86
+    jg s_mpv_mul_d_add_prop_sse2
+    call s_mpi_is_sse2
+    mov is_sse, eax
+    cmp eax, 0
+    jg s_mpv_mul_d_add_prop_sse2
+s_mpv_mul_d_add_prop_x86:
+    push ebp
+    mov ebp,esp
+    sub esp,28
+    push edi
+    push esi
+    push ebx
+    mov ebx,0 ; carry = 0
+    mov ecx,[ebp+12] ; ecx = a_len
+    mov edi,[ebp+20]
+    cmp ecx,0
+    je L_21 ; jmp if a_len == 0
+    cld
+    mov esi,[ebp+8] ; esi = a
+L_20:
+    lodsd ; eax = [ds:esi]; esi += 4
+    mov edx,[ebp+16] ; edx = b
+    mul edx ; edx:eax = Phi:Plo = a_i * b
+
+    add eax,ebx ; add carry (ebx) to edx:eax
+    adc edx,0
+    mov ebx,[edi] ; add in current word from *c
+    add eax,ebx
+    adc edx,0
+    mov ebx,edx ; high half of product becomes next carry
+
+    stosd ; [es:edi] = ax; edi += 4;
+    dec ecx ; --a_len
+    jnz L_20 ; jmp if a_len != 0
+L_21:
+    cmp ebx,0 ; is carry zero?
+    jz L_23
+    mov eax,[edi] ; add in current word from *c
+    add eax,ebx
+    stosd ; [es:edi] = ax; edi += 4;
+    jnc L_23
+L_22:
+    mov eax,[edi] ; add in current word from *c
+    adc eax,0
+    stosd ; [es:edi] = ax; edi += 4;
+    jc L_22
+L_23:
+    pop ebx
+    pop esi
+    pop edi
+    leave
+    ret
+    nop
+s_mpv_mul_d_add_prop_sse2:
+    push ebp
+    mov ebp, esp
+    push edi
+    push esi
+    push ebx
+    psubq mm2, mm2 ; carry = 0
+    mov ecx, [ebp+12] ; ecx = a_len
+    movd mm1, [ebp+16] ; mm1 = b
+    mov edi, [ebp+20]
+    cmp ecx, 0
+    je L_26 ; jmp if a_len == 0
+    mov esi, [ebp+8] ; esi = a
+    cld
+L_25:
+    movd mm0, [esi] ; mm0 = *a++
+    movd mm3, [edi] ; fetch the sum
+    add esi, 4
+    pmuludq mm0, mm1 ; mm0 = b * *a++
+    paddq mm2, mm0 ; add the carry
+    paddq mm2, mm3 ; add *c++
+    movd [edi], mm2 ; store the 32bit result
+    add edi, 4
+    psrlq mm2, 32 ; save the carry
+    dec ecx ; --a_len
+    jnz L_25 ; jmp if a_len != 0
+L_26:
+    movd ebx, mm2
+    cmp ebx, 0 ; is carry zero?
+    jz L_28
+    mov eax, [edi]
+    add eax, ebx
+    stosd
+    jnc L_28
+L_27:
+    mov eax, [edi] ; add in current word from *c
+    adc eax, 0
+    stosd ; [es:edi] = ax; edi += 4;
+    jc L_27
+L_28:
+    emms
+    pop ebx
+    pop esi
+    pop edi
+    leave
+    ret
+    nop
+  }
+}
+
+/*
+ * s_mpv_sqr_add_prop: for each i < a_len, adds a[i] * a[i] plus the running
+ * carry into the digit pair sqrs[2i], sqrs[2i+1]; the final carry is then
+ * added to the following digits for as long as a carry-out results.
+ * ebp - 24: caller's ebx (x86 path frame, after "sub esp,12" + 3 pushes)
+ * ebp - 20: caller's esi
+ * ebp - 16: caller's edi
+ * ebp - 12 .. ebp - 4: reserved, never referenced
+ * ebp + 0: caller's ebp
+ * ebp + 4: return address
+ * ebp + 8: a argument (pa)
+ * ebp + 12: a_len argument
+ * ebp + 16: sqrs argument (ps)
+ * SSE2 path: edi, esi, ebx saved at ebp - 4, - 8, - 12; mm2 = carry.
+ * registers (x86 path):
+ * eax: a[i], then low/high half of the sum being stored
+ * ebx: carry, also scratch for the current sqrs digits
+ * ecx: a_len (loop counter)   edx: high half of the square
+ * esi: a ptr                  edi: sqrs ptr
+ */
+__declspec(naked) void
+s_mpv_sqr_add_prop(const mp_digit *a, mp_size a_len, mp_digit *sqrs)
+{
+  __asm {
+    mov eax, is_sse
+    cmp eax, 0
+    je s_mpv_sqr_add_prop_x86
+    jg s_mpv_sqr_add_prop_sse2
+    call s_mpi_is_sse2
+    mov is_sse, eax
+    cmp eax, 0
+    jg s_mpv_sqr_add_prop_sse2
+s_mpv_sqr_add_prop_x86:
+    push ebp
+    mov ebp,esp
+    sub esp,12
+    push edi
+    push esi
+    push ebx
+    mov ebx,0 ; carry = 0
+    mov ecx,[ebp+12] ; a_len
+    mov edi,[ebp+16] ; edi = ps
+    cmp ecx,0
+    je L_31 ; jump if a_len == 0
+    cld
+    mov esi,[ebp+8] ; esi = pa
+L_30:
+    lodsd ; eax = [ds:si]; si += 4;
+    mul eax
+
+    add eax,ebx ; add "carry"
+    adc edx,0
+    mov ebx,[edi]
+    add eax,ebx ; add low word from result
+    mov ebx,[edi+4]
+    stosd ; [es:di] = eax; di += 4;
+    adc edx,ebx ; add high word from result
+    mov ebx,0
+    mov eax,edx
+    adc ebx,0
+    stosd ; [es:di] = eax; di += 4;
+    dec ecx ; --a_len
+    jnz L_30 ; jmp if a_len != 0
+L_31:
+    cmp ebx,0 ; is carry zero?
+    jz L_34
+    mov eax,[edi] ; add in current word from *c
+    add eax,ebx
+    stosd ; [es:edi] = ax; edi += 4;
+    jnc L_34
+L_32:
+    mov eax,[edi] ; add in current word from *c
+    adc eax,0
+    stosd ; [es:edi] = ax; edi += 4;
+    jc L_32
+L_34:
+    pop ebx
+    pop esi
+    pop edi
+    leave
+    ret
+    nop
+s_mpv_sqr_add_prop_sse2:
+    push ebp
+    mov ebp, esp
+    push edi
+    push esi
+    push ebx
+    psubq mm2, mm2 ; carry = 0
+    mov ecx, [ebp+12] ; ecx = a_len
+    mov edi, [ebp+16]
+    cmp ecx, 0
+    je L_36 ; jmp if a_len == 0
+    mov esi, [ebp+8] ; esi = a
+    cld
+L_35:
+    movd mm0, [esi] ; mm0 = *a
+    movd mm3, [edi] ; fetch the sum
+    add esi, 4
+    pmuludq mm0, mm0 ; mm0 = sqr(a)
+    paddq mm2, mm0 ; add the carry
+    paddq mm2, mm3 ; add the low word
+    movd mm3, [edi+4]
+    movd [edi], mm2 ; store the 32bit result
+    psrlq mm2, 32
+    paddq mm2, mm3 ; add the high word
+    movd [edi+4], mm2 ; store the 32bit result
+    psrlq mm2, 32 ; save the carry.
+    add edi, 8
+    dec ecx ; --a_len
+    jnz L_35 ; jmp if a_len != 0
+L_36:
+    movd ebx, mm2
+    cmp ebx, 0 ; is carry zero?
+    jz L_38
+    mov eax, [edi]
+    add eax, ebx
+    stosd
+    jnc L_38
+L_37:
+    mov eax, [edi] ; add in current word from *c
+    adc eax, 0
+    stosd ; [es:edi] = ax; edi += 4;
+    jc L_37
+L_38:
+    emms
+    pop ebx
+    pop esi
+    pop edi
+    leave
+    ret
+    nop
+  }
+}
+
+/*
+ * Divide 64-bit (Nhi,Nlo) by 32-bit divisor, which must be normalized
+ * so its high bit is 1. This code is from NSPR.
+ * Stores the quotient to *qp and the remainder to *rp; always returns 0.
+ * NOTE(review): x86 "div" raises a divide error when the quotient does not
+ * fit in 32 bits; callers presumably guarantee Nhi < divisor - confirm.
+ *
+ * Dump of assembler code for function s_mpv_div_2dx1d:
+ *
+ * esp + 0: Caller's ebx
+ * esp + 4: return address
+ * esp + 8: Nhi argument
+ * esp + 12: Nlo argument
+ * esp + 16: divisor argument
+ * esp + 20: qp argument
+ * esp + 24: rp argument
+ * registers:
+ * eax: Nlo, then quotient; finally the return value 0
+ * ebx: divisor, then qp, then rp
+ * edx: Nhi, then remainder
+ */
+__declspec(naked) mp_err
+s_mpv_div_2dx1d(mp_digit Nhi, mp_digit Nlo, mp_digit divisor,
+                mp_digit *qp, mp_digit *rp)
+{
+  __asm {
+    push ebx
+    mov edx,[esp+8]
+    mov eax,[esp+12]
+    mov ebx,[esp+16]
+    div ebx
+    mov ebx,[esp+20]
+    mov [ebx],eax
+    mov ebx,[esp+24]
+    mov [ebx],edx
+    xor eax,eax ; return zero
+    pop ebx
+    ret
+    nop
+  }
+}