andre@0: /* andre@0: * mpi_x86_asm.c - MSVC inline assembly implementation of s_mpv_ functions. andre@0: * andre@0: * This Source Code Form is subject to the terms of the Mozilla Public andre@0: * License, v. 2.0. If a copy of the MPL was not distributed with this andre@0: * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ andre@0: andre@0: #include "mpi-priv.h" andre@0: andre@0: static int is_sse = -1; andre@0: extern unsigned long s_mpi_is_sse2(); andre@0: andre@0: /* andre@0: * ebp - 36: caller's esi andre@0: * ebp - 32: caller's edi andre@0: * ebp - 28: andre@0: * ebp - 24: andre@0: * ebp - 20: andre@0: * ebp - 16: andre@0: * ebp - 12: andre@0: * ebp - 8: andre@0: * ebp - 4: andre@0: * ebp + 0: caller's ebp andre@0: * ebp + 4: return address andre@0: * ebp + 8: a argument andre@0: * ebp + 12: a_len argument andre@0: * ebp + 16: b argument andre@0: * ebp + 20: c argument andre@0: * registers: andre@0: * eax: andre@0: * ebx: carry andre@0: * ecx: a_len andre@0: * edx: andre@0: * esi: a ptr andre@0: * edi: c ptr andre@0: */ andre@0: __declspec(naked) void andre@0: s_mpv_mul_d(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c) andre@0: { andre@0: __asm { andre@0: mov eax, is_sse andre@0: cmp eax, 0 andre@0: je s_mpv_mul_d_x86 andre@0: jg s_mpv_mul_d_sse2 andre@0: call s_mpi_is_sse2 andre@0: mov is_sse, eax andre@0: cmp eax, 0 andre@0: jg s_mpv_mul_d_sse2 andre@0: s_mpv_mul_d_x86: andre@0: push ebp andre@0: mov ebp,esp andre@0: sub esp,28 andre@0: push edi andre@0: push esi andre@0: push ebx andre@0: mov ebx,0 ; carry = 0 andre@0: mov ecx,[ebp+12] ; ecx = a_len andre@0: mov edi,[ebp+20] andre@0: cmp ecx,0 andre@0: je L_2 ; jmp if a_len == 0 andre@0: mov esi,[ebp+8] ; esi = a andre@0: cld andre@0: L_1: andre@0: lodsd ; eax = [ds:esi]; esi += 4 andre@0: mov edx,[ebp+16] ; edx = b andre@0: mul edx ; edx:eax = Phi:Plo = a_i * b andre@0: andre@0: add eax,ebx ; add carry (ebx) to edx:eax andre@0: adc edx,0 andre@0: mov ebx,edx ; high half of product becomes next carry andre@0: andre@0: stosd ; [es:edi] = ax; edi += 4; andre@0: dec ecx ; --a_len andre@0: jnz L_1 ; jmp if a_len != 0 andre@0: L_2: andre@0: mov [edi],ebx ; *c = carry andre@0: pop ebx andre@0: pop esi andre@0: pop edi andre@0: leave andre@0: ret andre@0: nop andre@0: s_mpv_mul_d_sse2: andre@0: push ebp andre@0: mov ebp, esp andre@0: push edi andre@0: push esi andre@0: psubq mm2, mm2 ; carry = 0 andre@0: mov ecx, [ebp+12] ; ecx = a_len andre@0: movd mm1, [ebp+16] ; mm1 = b andre@0: mov edi, [ebp+20] andre@0: cmp ecx, 0 andre@0: je L_6 ; jmp if a_len == 0 andre@0: mov esi, [ebp+8] ; esi = a andre@0: cld andre@0: L_5: andre@0: movd mm0, [esi] ; mm0 = *a++ andre@0: add esi, 4 andre@0: pmuludq mm0, mm1 ; mm0 = b * *a++ andre@0: paddq mm2, mm0 ; add the carry andre@0: movd [edi], mm2 ; store the 32bit result andre@0: add edi, 4 andre@0: psrlq mm2, 32 ; save the carry andre@0: dec ecx ; --a_len andre@0: jnz L_5 ; jmp if a_len != 0 andre@0: L_6: andre@0: movd [edi], mm2 ; *c = carry andre@0: emms andre@0: pop esi andre@0: pop edi andre@0: leave andre@0: ret andre@0: nop andre@0: } andre@0: } andre@0: andre@0: /* andre@0: * ebp - 36: caller's esi andre@0: * ebp - 32: caller's edi andre@0: * ebp - 28: andre@0: * ebp - 24: andre@0: * ebp - 20: andre@0: * ebp - 16: andre@0: * ebp - 12: andre@0: * ebp - 8: andre@0: * ebp - 4: andre@0: * ebp + 0: caller's ebp andre@0: * ebp + 4: return address andre@0: * ebp + 8: a argument andre@0: * ebp + 12: a_len argument andre@0: * ebp + 16: b argument andre@0: * ebp + 20: c argument andre@0: * registers: andre@0: * eax: andre@0: * ebx: carry andre@0: * ecx: a_len andre@0: * edx: andre@0: * esi: a ptr andre@0: * edi: c ptr andre@0: */ andre@0: __declspec(naked) void andre@0: s_mpv_mul_d_add(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c) andre@0: { andre@0: __asm { andre@0: mov eax, is_sse andre@0: cmp eax, 0 andre@0: je s_mpv_mul_d_add_x86 andre@0: jg s_mpv_mul_d_add_sse2 andre@0: call s_mpi_is_sse2 andre@0: mov is_sse, eax andre@0: cmp eax, 0 andre@0: jg s_mpv_mul_d_add_sse2 andre@0: s_mpv_mul_d_add_x86: andre@0: push ebp andre@0: mov ebp,esp andre@0: sub esp,28 andre@0: push edi andre@0: push esi andre@0: push ebx andre@0: mov ebx,0 ; carry = 0 andre@0: mov ecx,[ebp+12] ; ecx = a_len andre@0: mov edi,[ebp+20] andre@0: cmp ecx,0 andre@0: je L_11 ; jmp if a_len == 0 andre@0: mov esi,[ebp+8] ; esi = a andre@0: cld andre@0: L_10: andre@0: lodsd ; eax = [ds:esi]; esi += 4 andre@0: mov edx,[ebp+16] ; edx = b andre@0: mul edx ; edx:eax = Phi:Plo = a_i * b andre@0: andre@0: add eax,ebx ; add carry (ebx) to edx:eax andre@0: adc edx,0 andre@0: mov ebx,[edi] ; add in current word from *c andre@0: add eax,ebx andre@0: adc edx,0 andre@0: mov ebx,edx ; high half of product becomes next carry andre@0: andre@0: stosd ; [es:edi] = ax; edi += 4; andre@0: dec ecx ; --a_len andre@0: jnz L_10 ; jmp if a_len != 0 andre@0: L_11: andre@0: mov [edi],ebx ; *c = carry andre@0: pop ebx andre@0: pop esi andre@0: pop edi andre@0: leave andre@0: ret andre@0: nop andre@0: s_mpv_mul_d_add_sse2: andre@0: push ebp andre@0: mov ebp, esp andre@0: push edi andre@0: push esi andre@0: psubq mm2, mm2 ; carry = 0 andre@0: mov ecx, [ebp+12] ; ecx = a_len andre@0: movd mm1, [ebp+16] ; mm1 = b andre@0: mov edi, [ebp+20] andre@0: cmp ecx, 0 andre@0: je L_16 ; jmp if a_len == 0 andre@0: mov esi, [ebp+8] ; esi = a andre@0: cld andre@0: L_15: andre@0: movd mm0, [esi] ; mm0 = *a++ andre@0: add esi, 4 andre@0: pmuludq mm0, mm1 ; mm0 = b * *a++ andre@0: paddq mm2, mm0 ; add the carry andre@0: movd mm0, [edi] andre@0: paddq mm2, mm0 ; add the carry andre@0: movd [edi], mm2 ; store the 32bit result andre@0: add edi, 4 andre@0: psrlq mm2, 32 ; save the carry andre@0: dec ecx ; --a_len andre@0: jnz L_15 ; jmp if a_len != 0 andre@0: L_16: andre@0: movd [edi], mm2 ; *c = carry andre@0: emms andre@0: pop esi andre@0: pop edi andre@0: leave andre@0: ret andre@0: nop andre@0: } andre@0: } andre@0: andre@0: /* andre@0: * ebp - 36: caller's esi andre@0: * ebp - 32: caller's edi andre@0: * ebp - 28: andre@0: * ebp - 24: andre@0: * ebp - 20: andre@0: * ebp - 16: andre@0: * ebp - 12: andre@0: * ebp - 8: andre@0: * ebp - 4: andre@0: * ebp + 0: caller's ebp andre@0: * ebp + 4: return address andre@0: * ebp + 8: a argument andre@0: * ebp + 12: a_len argument andre@0: * ebp + 16: b argument andre@0: * ebp + 20: c argument andre@0: * registers: andre@0: * eax: andre@0: * ebx: carry andre@0: * ecx: a_len andre@0: * edx: andre@0: * esi: a ptr andre@0: * edi: c ptr andre@0: */ andre@0: __declspec(naked) void andre@0: s_mpv_mul_d_add_prop(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c) andre@0: { andre@0: __asm { andre@0: mov eax, is_sse andre@0: cmp eax, 0 andre@0: je s_mpv_mul_d_add_prop_x86 andre@0: jg s_mpv_mul_d_add_prop_sse2 andre@0: call s_mpi_is_sse2 andre@0: mov is_sse, eax andre@0: cmp eax, 0 andre@0: jg s_mpv_mul_d_add_prop_sse2 andre@0: s_mpv_mul_d_add_prop_x86: andre@0: push ebp andre@0: mov ebp,esp andre@0: sub esp,28 andre@0: push edi andre@0: push esi andre@0: push ebx andre@0: mov ebx,0 ; carry = 0 andre@0: mov ecx,[ebp+12] ; ecx = a_len andre@0: mov edi,[ebp+20] andre@0: cmp ecx,0 andre@0: je L_21 ; jmp if a_len == 0 andre@0: cld andre@0: mov esi,[ebp+8] ; esi = a andre@0: L_20: andre@0: lodsd ; eax = [ds:esi]; esi += 4 andre@0: mov edx,[ebp+16] ; edx = b andre@0: mul edx ; edx:eax = Phi:Plo = a_i * b andre@0: andre@0: add eax,ebx ; add carry (ebx) to edx:eax andre@0: adc edx,0 andre@0: mov ebx,[edi] ; add in current word from *c andre@0: add eax,ebx andre@0: adc edx,0 andre@0: mov ebx,edx ; high half of product becomes next carry andre@0: andre@0: stosd ; [es:edi] = ax; edi += 4; andre@0: dec ecx ; --a_len andre@0: jnz L_20 ; jmp if a_len != 0 andre@0: L_21: andre@0: cmp ebx,0 ; is carry zero? andre@0: jz L_23 andre@0: mov eax,[edi] ; add in current word from *c andre@0: add eax,ebx andre@0: stosd ; [es:edi] = ax; edi += 4; andre@0: jnc L_23 andre@0: L_22: andre@0: mov eax,[edi] ; add in current word from *c andre@0: adc eax,0 andre@0: stosd ; [es:edi] = ax; edi += 4; andre@0: jc L_22 andre@0: L_23: andre@0: pop ebx andre@0: pop esi andre@0: pop edi andre@0: leave andre@0: ret andre@0: nop andre@0: s_mpv_mul_d_add_prop_sse2: andre@0: push ebp andre@0: mov ebp, esp andre@0: push edi andre@0: push esi andre@0: push ebx andre@0: psubq mm2, mm2 ; carry = 0 andre@0: mov ecx, [ebp+12] ; ecx = a_len andre@0: movd mm1, [ebp+16] ; mm1 = b andre@0: mov edi, [ebp+20] andre@0: cmp ecx, 0 andre@0: je L_26 ; jmp if a_len == 0 andre@0: mov esi, [ebp+8] ; esi = a andre@0: cld andre@0: L_25: andre@0: movd mm0, [esi] ; mm0 = *a++ andre@0: movd mm3, [edi] ; fetch the sum andre@0: add esi, 4 andre@0: pmuludq mm0, mm1 ; mm0 = b * *a++ andre@0: paddq mm2, mm0 ; add the carry andre@0: paddq mm2, mm3 ; add *c++ andre@0: movd [edi], mm2 ; store the 32bit result andre@0: add edi, 4 andre@0: psrlq mm2, 32 ; save the carry andre@0: dec ecx ; --a_len andre@0: jnz L_25 ; jmp if a_len != 0 andre@0: L_26: andre@0: movd ebx, mm2 andre@0: cmp ebx, 0 ; is carry zero? andre@0: jz L_28 andre@0: mov eax, [edi] andre@0: add eax, ebx andre@0: stosd andre@0: jnc L_28 andre@0: L_27: andre@0: mov eax, [edi] ; add in current word from *c andre@0: adc eax, 0 andre@0: stosd ; [es:edi] = ax; edi += 4; andre@0: jc L_27 andre@0: L_28: andre@0: emms andre@0: pop ebx andre@0: pop esi andre@0: pop edi andre@0: leave andre@0: ret andre@0: nop andre@0: } andre@0: } andre@0: andre@0: /* andre@0: * ebp - 20: caller's esi andre@0: * ebp - 16: caller's edi andre@0: * ebp - 12: andre@0: * ebp - 8: carry andre@0: * ebp - 4: a_len local andre@0: * ebp + 0: caller's ebp andre@0: * ebp + 4: return address andre@0: * ebp + 8: pa argument andre@0: * ebp + 12: a_len argument andre@0: * ebp + 16: ps argument andre@0: * ebp + 20: andre@0: * registers: andre@0: * eax: andre@0: * ebx: carry andre@0: * ecx: a_len andre@0: * edx: andre@0: * esi: a ptr andre@0: * edi: c ptr andre@0: */ andre@0: __declspec(naked) void andre@0: s_mpv_sqr_add_prop(const mp_digit *a, mp_size a_len, mp_digit *sqrs) andre@0: { andre@0: __asm { andre@0: mov eax, is_sse andre@0: cmp eax, 0 andre@0: je s_mpv_sqr_add_prop_x86 andre@0: jg s_mpv_sqr_add_prop_sse2 andre@0: call s_mpi_is_sse2 andre@0: mov is_sse, eax andre@0: cmp eax, 0 andre@0: jg s_mpv_sqr_add_prop_sse2 andre@0: s_mpv_sqr_add_prop_x86: andre@0: push ebp andre@0: mov ebp,esp andre@0: sub esp,12 andre@0: push edi andre@0: push esi andre@0: push ebx andre@0: mov ebx,0 ; carry = 0 andre@0: mov ecx,[ebp+12] ; a_len andre@0: mov edi,[ebp+16] ; edi = ps andre@0: cmp ecx,0 andre@0: je L_31 ; jump if a_len == 0 andre@0: cld andre@0: mov esi,[ebp+8] ; esi = pa andre@0: L_30: andre@0: lodsd ; eax = [ds:si]; si += 4; andre@0: mul eax andre@0: andre@0: add eax,ebx ; add "carry" andre@0: adc edx,0 andre@0: mov ebx,[edi] andre@0: add eax,ebx ; add low word from result andre@0: mov ebx,[edi+4] andre@0: stosd ; [es:di] = eax; di += 4; andre@0: adc edx,ebx ; add high word from result andre@0: mov ebx,0 andre@0: mov eax,edx andre@0: adc ebx,0 andre@0: stosd ; [es:di] = eax; di += 4; andre@0: dec ecx ; --a_len andre@0: jnz L_30 ; jmp if a_len != 0 andre@0: L_31: andre@0: cmp ebx,0 ; is carry zero? andre@0: jz L_34 andre@0: mov eax,[edi] ; add in current word from *c andre@0: add eax,ebx andre@0: stosd ; [es:edi] = ax; edi += 4; andre@0: jnc L_34 andre@0: L_32: andre@0: mov eax,[edi] ; add in current word from *c andre@0: adc eax,0 andre@0: stosd ; [es:edi] = ax; edi += 4; andre@0: jc L_32 andre@0: L_34: andre@0: pop ebx andre@0: pop esi andre@0: pop edi andre@0: leave andre@0: ret andre@0: nop andre@0: s_mpv_sqr_add_prop_sse2: andre@0: push ebp andre@0: mov ebp, esp andre@0: push edi andre@0: push esi andre@0: push ebx andre@0: psubq mm2, mm2 ; carry = 0 andre@0: mov ecx, [ebp+12] ; ecx = a_len andre@0: mov edi, [ebp+16] andre@0: cmp ecx, 0 andre@0: je L_36 ; jmp if a_len == 0 andre@0: mov esi, [ebp+8] ; esi = a andre@0: cld andre@0: L_35: andre@0: movd mm0, [esi] ; mm0 = *a andre@0: movd mm3, [edi] ; fetch the sum andre@0: add esi, 4 andre@0: pmuludq mm0, mm0 ; mm0 = sqr(a) andre@0: paddq mm2, mm0 ; add the carry andre@0: paddq mm2, mm3 ; add the low word andre@0: movd mm3, [edi+4] andre@0: movd [edi], mm2 ; store the 32bit result andre@0: psrlq mm2, 32 andre@0: paddq mm2, mm3 ; add the high word andre@0: movd [edi+4], mm2 ; store the 32bit result andre@0: psrlq mm2, 32 ; save the carry. andre@0: add edi, 8 andre@0: dec ecx ; --a_len andre@0: jnz L_35 ; jmp if a_len != 0 andre@0: L_36: andre@0: movd ebx, mm2 andre@0: cmp ebx, 0 ; is carry zero? andre@0: jz L_38 andre@0: mov eax, [edi] andre@0: add eax, ebx andre@0: stosd andre@0: jnc L_38 andre@0: L_37: andre@0: mov eax, [edi] ; add in current word from *c andre@0: adc eax, 0 andre@0: stosd ; [es:edi] = ax; edi += 4; andre@0: jc L_37 andre@0: L_38: andre@0: emms andre@0: pop ebx andre@0: pop esi andre@0: pop edi andre@0: leave andre@0: ret andre@0: nop andre@0: } andre@0: } andre@0: andre@0: /* andre@0: * Divide 64-bit (Nhi,Nlo) by 32-bit divisor, which must be normalized andre@0: * so its high bit is 1. This code is from NSPR. andre@0: * andre@0: * Dump of assembler code for function s_mpv_div_2dx1d: andre@0: * andre@0: * esp + 0: Caller's ebx andre@0: * esp + 4: return address andre@0: * esp + 8: Nhi argument andre@0: * esp + 12: Nlo argument andre@0: * esp + 16: divisor argument andre@0: * esp + 20: qp argument andre@0: * esp + 24: rp argument andre@0: * registers: andre@0: * eax: andre@0: * ebx: carry andre@0: * ecx: a_len andre@0: * edx: andre@0: * esi: a ptr andre@0: * edi: c ptr andre@0: */ andre@0: __declspec(naked) mp_err andre@0: s_mpv_div_2dx1d(mp_digit Nhi, mp_digit Nlo, mp_digit divisor, andre@0: mp_digit *qp, mp_digit *rp) andre@0: { andre@0: __asm { andre@0: push ebx andre@0: mov edx,[esp+8] andre@0: mov eax,[esp+12] andre@0: mov ebx,[esp+16] andre@0: div ebx andre@0: mov ebx,[esp+20] andre@0: mov [ebx],eax andre@0: mov ebx,[esp+24] andre@0: mov [ebx],edx andre@0: xor eax,eax ; return zero andre@0: pop ebx andre@0: ret andre@0: nop andre@0: } andre@0: }