tls: P256: improve x86_64 multiplication asm code

gcc is being rather silly. It uses suboptimal registers,
and does not realize that i and j are never negative,
thus it uses even _more_ registers for temporaries
to sign-extend i/j to 64-bit offsets.

function                                             old     new   delta
sp_256_mont_mul_8                                    155     132     -23

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
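
To illustrate the sign-extension point (a standalone sketch, not code from this
patch; names are made up): any 32-bit operation on x86_64 clears the upper half
of the destination register, so an unsigned 32-bit index can address aa[i]
directly, while a signed int index that gcc cannot prove non-negative first
needs a movslq into a 64-bit temporary.

#include <stdint.h>

/* Hypothetical column loop in the style of sp_256to512_mul_8 (64-bit path,
 * 4 limbs): i and j = k - i are never negative, but gcc cannot see that when
 * they are plain "int". Making the index unsigned lets aa[i]/bb[j] be
 * addressed without a sign-extended 64-bit copy of the index.
 */
uint64_t column_sum(const uint64_t *aa, const uint64_t *bb, int k)
{
	unsigned i = (k >= 3) ? k - 3 : 0;
	uint64_t sum = 0;

	for (; i < 4 && i <= (unsigned)k; i++)
		sum += aa[i] * bb[k - i];	/* j = k - i, also never negative */
	return sum;
}
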
branch: master
Denys Vlasenko 2024-07-12 11:08:08 +02:00
parent d745852f13
commit 999e290ef6
1 changed file with 36 additions and 22 deletions

@@ -411,10 +411,10 @@ static void sp_256_sub_8_p256_mod(sp_digit* r)
 "\n subl $0xffffffff, (%0)"
 "\n sbbl $0xffffffff, 1*4(%0)"
 "\n sbbl $0xffffffff, 2*4(%0)"
-"\n sbbl $0, 3*4(%0)"
-"\n sbbl $0, 4*4(%0)"
-"\n sbbl $0, 5*4(%0)"
-"\n sbbl $1, 6*4(%0)"
+"\n sbbl $0x00000000, 3*4(%0)"
+"\n sbbl $0x00000000, 4*4(%0)"
+"\n sbbl $0x00000000, 5*4(%0)"
+"\n sbbl $0x00000001, 6*4(%0)"
 "\n sbbl $0xffffffff, 7*4(%0)"
 "\n"
 : "=r" (r)
@@ -433,10 +433,10 @@ static void sp_256_sub_8_p256_mod(sp_digit* r)
 uint64_t ooff;
 asm volatile (
 "\n subq $0xffffffffffffffff, (%0)"
-"\n sbbq %1, 1*8(%0)"
-"\n sbbq $0, 2*8(%0)"
+"\n sbbq %1, 1*8(%0)" // %1 = 00000000ffffffff
+"\n sbbq $0x0000000000000000, 2*8(%0)"
 "\n movq 3*8(%0), %2"
-"\n sbbq $0, %2" // subtract carry
+"\n sbbq $0x0, %2" // subtract carry
 "\n addq %1, %2" // adding 00000000ffffffff (in %1)
 "\n" // is the same as subtracting ffffffff00000001
 "\n movq %2, 3*8(%0)"
@@ -452,9 +452,9 @@ static void sp_256_sub_8_p256_mod(sp_digit* r)
 "\n orl $0xffffffff, %%eax" // %1 (rax) = 00000000ffffffff
 "\n subq $0xffffffffffffffff, (%0)"
 "\n sbbq %1, 1*8(%0)"
-"\n sbbq $0, 2*8(%0)"
+"\n sbbq $0x0000000000000000, 2*8(%0)"
 "\n movq 3*8(%0), %2"
-"\n sbbq $0, %2" // subtract carry
+"\n sbbq $0x0, %2" // subtract carry
 "\n addq %1, %2" // adding 00000000ffffffff (in %1)
 "\n" // is the same as subtracting ffffffff00000001
 "\n movq %2, 3*8(%0)"
@@ -495,15 +495,23 @@ static void sp_256to512_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b)
 ////////////////////////
 // uint64_t m = ((uint64_t)a[i]) * b[j];
 // acc_hi:acch:accl += m;
+long eax_clobbered;
 asm volatile (
 // a[i] is already loaded in %%eax
-"\n mull %7"
+"\n mull %8"
 "\n addl %%eax, %0"
 "\n adcl %%edx, %1"
-"\n adcl $0, %2"
-: "=rm" (accl), "=rm" (acch), "=rm" (acc_hi)
-: "0" (accl), "1" (acch), "2" (acc_hi), "a" (a[i]), "m" (b[j])
+"\n adcl $0x0, %2"
+: "=rm" (accl), "=rm" (acch), "=rm" (acc_hi), "=a" (eax_clobbered)
+: "0" (accl), "1" (acch), "2" (acc_hi), "3" (a[i]), "m" (b[j])
 : "cc", "dx"
+// What is "eax_clobbered"? gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html:
+// "Do not modify the contents of input-only operands (except for inputs tied
+// to outputs). The compiler assumes that on exit from the asm statement these
+// operands contain the same values as they had before executing the statement.
+// It is not possible to use clobbers to inform the compiler that the values
+// in these inputs are changing. One common work-around is to tie the changing
+// input variable to an output variable that never gets used."
 );
 ////////////////////////
 j--;
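
The manual text quoted in the new comment explains the "=a" (eax_clobbered)
dummy output: mull overwrites %eax, and an input operand may not simply be
listed in the clobber list, so the a[i] input is now tied to an output that is
never read (and the mull operand shifts from %7 to %8 accordingly). A minimal
standalone example of the same idiom, with made-up names:

#include <stdint.h>

/* Return the high 32 bits of a * b. mull also overwrites %eax (the low half),
 * which we do not need, so the input living in %eax is tied to a dummy "=a"
 * output instead of being clobbered (which is not allowed for inputs).
 */
static uint32_t mulhi32(uint32_t a, uint32_t b)
{
	uint32_t hi;
	uint32_t eax_clobbered;	/* never read after the asm */

	asm ("mull %3"		/* %edx:%eax = %eax * b */
		: "=d" (hi), "=a" (eax_clobbered)
		: "1" (a), "rm" (b)
		: "cc"
	);
	return hi;
}
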
@@ -519,15 +527,20 @@ static void sp_256to512_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b)
 const uint64_t* bb = (const void*)b;
 uint64_t* rr = (void*)r;
 int k;
-uint64_t accl;
-uint64_t acch;
+register uint64_t accl asm("r8");
+register uint64_t acch asm("r9");
+/* ^^^ ask gcc to not use rax/rdx/input arg regs for accumulator variables */
+/* (or else it may generate lots of silly mov's and even xchg's!) */
 acch = accl = 0;
 for (k = 0; k < 7; k++) {
-int i, j;
-uint64_t acc_hi;
+unsigned i, j;
+/* ^^^^^ not signed "int",
+ * or gcc can use a temp register to sign-extend i,j for aa[i],bb[j] */
+register uint64_t acc_hi asm("r10");
+/* ^^^ ask gcc to not use rax/rdx/input arg regs for accumulators */
 i = k - 3;
-if (i < 0)
+if ((int)i < 0)
 i = 0;
 j = k - i;
 acc_hi = 0;
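
The register ... asm("r8") declarations use GCC's local register variable
extension: when such a variable appears as an extended-asm operand it is kept
in the named register, which steers the accumulators away from rax/rdx (both
needed by mul) and from the argument registers. A standalone sketch of the
construct, not taken from the tree:

#include <stdint.h>

/* acc += high 64 bits of (a * b). The accumulator is pinned to r10 so that
 * gcc cannot choose rax or rdx for it - the same trick the patch applies to
 * its accl/acch/acc_hi variables.
 */
static uint64_t mulhi_acc(uint64_t acc, uint64_t a, uint64_t b)
{
	register uint64_t sum asm("r10") = acc;
	uint64_t rax_clobbered;			/* dummy output, never read */

	asm ("mulq %4"				/* %rdx:%rax = %rax * b */
		"\n addq %%rdx, %0"		/* accumulate the high half */
		: "=r" (sum), "=a" (rax_clobbered)
		: "0" (sum), "1" (a), "rm" (b)
		: "cc", "dx"
	);
	return sum;
}
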
@@ -535,14 +548,15 @@ static void sp_256to512_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b)
 ////////////////////////
 // uint128_t m = ((uint128_t)a[i]) * b[j];
 // acc_hi:acch:accl += m;
+long rax_clobbered;
 asm volatile (
 // aa[i] is already loaded in %%rax
-"\n mulq %7"
+"\n mulq %8"
 "\n addq %%rax, %0"
 "\n adcq %%rdx, %1"
-"\n adcq $0, %2"
-: "=rm" (accl), "=rm" (acch), "=rm" (acc_hi)
-: "0" (accl), "1" (acch), "2" (acc_hi), "a" (aa[i]), "m" (bb[j])
+"\n adcq $0x0, %2"
+: "=rm" (accl), "=rm" (acch), "=rm" (acc_hi), "=a" (rax_clobbered)
+: "0" (accl), "1" (acch), "2" (acc_hi), "3" (aa[i]), "m" (bb[j])
 : "cc", "dx"
 );
 ////////////////////////
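
For reference, the operation each of these asm blocks performs, written as
portable C with unsigned __int128 (a sketch of the semantics only; the helper
name is made up):

#include <stdint.h>

typedef unsigned __int128 uint128_t;

/* acc_hi:acch:accl += (uint128_t)a * b, i.e. a 64x64->128 multiply added
 * into a 192-bit accumulator held as three 64-bit words.
 */
static void mul_add_192(uint64_t *accl, uint64_t *acch, uint64_t *acc_hi,
			uint64_t a, uint64_t b)
{
	uint128_t m = (uint128_t)a * b;
	uint128_t t;

	t = (uint128_t)*accl + (uint64_t)m;	/* low word + low(m) */
	*accl = (uint64_t)t;
	t = (uint128_t)*acch + (uint64_t)(m >> 64) + (uint64_t)(t >> 64);
	*acch = (uint64_t)t;			/* middle word + high(m) + carry */
	*acc_hi += (uint64_t)(t >> 64);		/* final carry */
}

Current compilers typically turn such a helper into a similar mulq/addq/adcq
sequence; the point of the hand-written asm here is control over which
registers hold the accumulator words, as the commit message explains.
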