mirror of https://github.com/mirror/busybox.git
tls: P256: improve x86_64 multiplication asm code
gcc is being rather silly. Usues suboptimal registers, and does not realize that i and j are never negative, thus usese even _more_ registers for temporaries to sign-extend i/j to 64-bit offsets. function old new delta sp_256_mont_mul_8 155 132 -23 Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>master
parent
d745852f13
commit
999e290ef6
|
@ -411,10 +411,10 @@ static void sp_256_sub_8_p256_mod(sp_digit* r)
|
||||||
"\n subl $0xffffffff, (%0)"
|
"\n subl $0xffffffff, (%0)"
|
||||||
"\n sbbl $0xffffffff, 1*4(%0)"
|
"\n sbbl $0xffffffff, 1*4(%0)"
|
||||||
"\n sbbl $0xffffffff, 2*4(%0)"
|
"\n sbbl $0xffffffff, 2*4(%0)"
|
||||||
"\n sbbl $0, 3*4(%0)"
|
"\n sbbl $0x00000000, 3*4(%0)"
|
||||||
"\n sbbl $0, 4*4(%0)"
|
"\n sbbl $0x00000000, 4*4(%0)"
|
||||||
"\n sbbl $0, 5*4(%0)"
|
"\n sbbl $0x00000000, 5*4(%0)"
|
||||||
"\n sbbl $1, 6*4(%0)"
|
"\n sbbl $0x00000001, 6*4(%0)"
|
||||||
"\n sbbl $0xffffffff, 7*4(%0)"
|
"\n sbbl $0xffffffff, 7*4(%0)"
|
||||||
"\n"
|
"\n"
|
||||||
: "=r" (r)
|
: "=r" (r)
|
||||||
|
@ -433,10 +433,10 @@ static void sp_256_sub_8_p256_mod(sp_digit* r)
|
||||||
uint64_t ooff;
|
uint64_t ooff;
|
||||||
asm volatile (
|
asm volatile (
|
||||||
"\n subq $0xffffffffffffffff, (%0)"
|
"\n subq $0xffffffffffffffff, (%0)"
|
||||||
"\n sbbq %1, 1*8(%0)"
|
"\n sbbq %1, 1*8(%0)" // %1 = 00000000ffffffff
|
||||||
"\n sbbq $0, 2*8(%0)"
|
"\n sbbq $0x0000000000000000, 2*8(%0)"
|
||||||
"\n movq 3*8(%0), %2"
|
"\n movq 3*8(%0), %2"
|
||||||
"\n sbbq $0, %2" // subtract carry
|
"\n sbbq $0x0, %2" // subtract carry
|
||||||
"\n addq %1, %2" // adding 00000000ffffffff (in %1)
|
"\n addq %1, %2" // adding 00000000ffffffff (in %1)
|
||||||
"\n" // is the same as subtracting ffffffff00000001
|
"\n" // is the same as subtracting ffffffff00000001
|
||||||
"\n movq %2, 3*8(%0)"
|
"\n movq %2, 3*8(%0)"
|
||||||
|
@ -452,9 +452,9 @@ static void sp_256_sub_8_p256_mod(sp_digit* r)
|
||||||
"\n orl $0xffffffff, %%eax" // %1 (rax) = 00000000ffffffff
|
"\n orl $0xffffffff, %%eax" // %1 (rax) = 00000000ffffffff
|
||||||
"\n subq $0xffffffffffffffff, (%0)"
|
"\n subq $0xffffffffffffffff, (%0)"
|
||||||
"\n sbbq %1, 1*8(%0)"
|
"\n sbbq %1, 1*8(%0)"
|
||||||
"\n sbbq $0, 2*8(%0)"
|
"\n sbbq $0x0000000000000000, 2*8(%0)"
|
||||||
"\n movq 3*8(%0), %2"
|
"\n movq 3*8(%0), %2"
|
||||||
"\n sbbq $0, %2" // subtract carry
|
"\n sbbq $0x0, %2" // subtract carry
|
||||||
"\n addq %1, %2" // adding 00000000ffffffff (in %1)
|
"\n addq %1, %2" // adding 00000000ffffffff (in %1)
|
||||||
"\n" // is the same as subtracting ffffffff00000001
|
"\n" // is the same as subtracting ffffffff00000001
|
||||||
"\n movq %2, 3*8(%0)"
|
"\n movq %2, 3*8(%0)"
|
||||||
|
@ -495,15 +495,23 @@ static void sp_256to512_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b)
|
||||||
////////////////////////
|
////////////////////////
|
||||||
// uint64_t m = ((uint64_t)a[i]) * b[j];
|
// uint64_t m = ((uint64_t)a[i]) * b[j];
|
||||||
// acc_hi:acch:accl += m;
|
// acc_hi:acch:accl += m;
|
||||||
|
long eax_clobbered;
|
||||||
asm volatile (
|
asm volatile (
|
||||||
// a[i] is already loaded in %%eax
|
// a[i] is already loaded in %%eax
|
||||||
"\n mull %7"
|
"\n mull %8"
|
||||||
"\n addl %%eax, %0"
|
"\n addl %%eax, %0"
|
||||||
"\n adcl %%edx, %1"
|
"\n adcl %%edx, %1"
|
||||||
"\n adcl $0, %2"
|
"\n adcl $0x0, %2"
|
||||||
: "=rm" (accl), "=rm" (acch), "=rm" (acc_hi)
|
: "=rm" (accl), "=rm" (acch), "=rm" (acc_hi), "=a" (eax_clobbered)
|
||||||
: "0" (accl), "1" (acch), "2" (acc_hi), "a" (a[i]), "m" (b[j])
|
: "0" (accl), "1" (acch), "2" (acc_hi), "3" (a[i]), "m" (b[j])
|
||||||
: "cc", "dx"
|
: "cc", "dx"
|
||||||
|
// What is "eax_clobbered"? gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html:
|
||||||
|
// "Do not modify the contents of input-only operands (except for inputs tied
|
||||||
|
// to outputs). The compiler assumes that on exit from the asm statement these
|
||||||
|
// operands contain the same values as they had before executing the statement.
|
||||||
|
// It is not possible to use clobbers to inform the compiler that the values
|
||||||
|
// in these inputs are changing. One common work-around is to tie the changing
|
||||||
|
// input variable to an output variable that never gets used."
|
||||||
);
|
);
|
||||||
////////////////////////
|
////////////////////////
|
||||||
j--;
|
j--;
|
||||||
|
@ -519,15 +527,20 @@ static void sp_256to512_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b)
|
||||||
const uint64_t* bb = (const void*)b;
|
const uint64_t* bb = (const void*)b;
|
||||||
uint64_t* rr = (void*)r;
|
uint64_t* rr = (void*)r;
|
||||||
int k;
|
int k;
|
||||||
uint64_t accl;
|
register uint64_t accl asm("r8");
|
||||||
uint64_t acch;
|
register uint64_t acch asm("r9");
|
||||||
|
/* ^^^ ask gcc to not use rax/rdx/input arg regs for accumulator variables */
|
||||||
|
/* (or else it may generate lots of silly mov's and even xchg's!) */
|
||||||
|
|
||||||
acch = accl = 0;
|
acch = accl = 0;
|
||||||
for (k = 0; k < 7; k++) {
|
for (k = 0; k < 7; k++) {
|
||||||
int i, j;
|
unsigned i, j;
|
||||||
uint64_t acc_hi;
|
/* ^^^^^ not signed "int",
|
||||||
|
* or gcc can use a temp register to sign-extend i,j for aa[i],bb[j] */
|
||||||
|
register uint64_t acc_hi asm("r10");
|
||||||
|
/* ^^^ ask gcc to not use rax/rdx/input arg regs for accumulators */
|
||||||
i = k - 3;
|
i = k - 3;
|
||||||
if (i < 0)
|
if ((int)i < 0)
|
||||||
i = 0;
|
i = 0;
|
||||||
j = k - i;
|
j = k - i;
|
||||||
acc_hi = 0;
|
acc_hi = 0;
|
||||||
|
@ -535,14 +548,15 @@ static void sp_256to512_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b)
|
||||||
////////////////////////
|
////////////////////////
|
||||||
// uint128_t m = ((uint128_t)a[i]) * b[j];
|
// uint128_t m = ((uint128_t)a[i]) * b[j];
|
||||||
// acc_hi:acch:accl += m;
|
// acc_hi:acch:accl += m;
|
||||||
|
long rax_clobbered;
|
||||||
asm volatile (
|
asm volatile (
|
||||||
// aa[i] is already loaded in %%rax
|
// aa[i] is already loaded in %%rax
|
||||||
"\n mulq %7"
|
"\n mulq %8"
|
||||||
"\n addq %%rax, %0"
|
"\n addq %%rax, %0"
|
||||||
"\n adcq %%rdx, %1"
|
"\n adcq %%rdx, %1"
|
||||||
"\n adcq $0, %2"
|
"\n adcq $0x0, %2"
|
||||||
: "=rm" (accl), "=rm" (acch), "=rm" (acc_hi)
|
: "=rm" (accl), "=rm" (acch), "=rm" (acc_hi), "=a" (rax_clobbered)
|
||||||
: "0" (accl), "1" (acch), "2" (acc_hi), "a" (aa[i]), "m" (bb[j])
|
: "0" (accl), "1" (acch), "2" (acc_hi), "3" (aa[i]), "m" (bb[j])
|
||||||
: "cc", "dx"
|
: "cc", "dx"
|
||||||
);
|
);
|
||||||
////////////////////////
|
////////////////////////
|
||||||
|
|
Loading…
Reference in New Issue