AES-GCM AArch64: Store swapped Htable values (#1403)

AArch64 assembly implementations of AES-GCM in AWS-LC use an "H-Table"
to precompute and cache common computations across multiple invocations
of AES-GCM using the same key, thereby improving performance.

The main example of such common precomputation is the computation of
powers of the H-value used in the GHASH algorithm -- giving the H-Table
its name. However, despite the name, the structure of the H-Table is
opaque to the code invoking AES-GCM, and implementations are free to
populate it with arbitrary data.
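
For orientation, a minimal illustration (based on the standard GHASH definition, not on text from this commit) of why consecutive powers of H are worth caching: with H^1..H^4 precomputed, four ciphertext blocks C1..C4 can be folded into the accumulator X0 with independent multiplications instead of a serial chain (⊕ is XOR, · is multiplication in GF(2^128)):

    X4 = ((((X0 ⊕ C1)·H ⊕ C2)·H ⊕ C3)·H ⊕ C4)·H
       = (X0 ⊕ C1)·H^4 ⊕ C2·H^3 ⊕ C3·H^2 ⊕ C4·H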

This freedom is already being leveraged: currently, the AArch64
implementation of AES-GCM stores not only the powers of H (H1-H8 in the
code) in the HTable, but also their 'Karatsuba pre-processings', i.e.
the EORs of their low and high halves. These values arise naturally
when Karatsuba's algorithm is used to reduce a 128-bit polynomial
multiplication over GF(2) to three 64-bit polynomial multiplications.
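
The following is a minimal C sketch of this decomposition (illustrative only, not AWS-LC code; clmul64 is a portable stand-in for the PMULL/PMULL2 instructions). It shows where the EORs of the low and high halves -- the values cached in the HTable -- come from:

    #include <stdint.h>

    typedef struct { uint64_t lo, hi; } u128;

    /* Portable 64x64 -> 128-bit carry-less multiply (stand-in for PMULL). */
    static u128 clmul64(uint64_t a, uint64_t b) {
        u128 r = {0, 0};
        for (int i = 0; i < 64; i++) {
            if ((b >> i) & 1) {
                r.lo ^= a << i;
                if (i) r.hi ^= a >> (64 - i);
            }
        }
        return r;
    }

    /* 128x128-bit carry-less multiply via Karatsuba: 3 clmuls instead of 4.
     * The 256-bit product is lo + mid*x^64 + hi*x^128, which is then reduced
     * modulo the GHASH polynomial (reduction not shown). */
    static void clmul128(uint64_t a_lo, uint64_t a_hi,
                         uint64_t b_lo, uint64_t b_hi,
                         u128 *lo, u128 *mid, u128 *hi) {
        *lo  = clmul64(a_lo, b_lo);                 /* PMULL  */
        *hi  = clmul64(a_hi, b_hi);                 /* PMULL2 */
        /* Karatsuba pre-processing: EOR of low and high halves.
         * For the fixed operand H, a_lo ^ a_hi is what the HTable caches. */
        *mid = clmul64(a_lo ^ a_hi, b_lo ^ b_hi);
        /* Karatsuba post-processing: fold low and high products back in. */
        mid->lo ^= lo->lo ^ hi->lo;
        mid->hi ^= lo->hi ^ hi->hi;
    }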

This commit slightly changes the structure of the H-Table used by the
AArch64 implementations of AES-GCM to obtain a small performance gain:

Every time a power of H (H1-H8) is loaded from the H-Table, the first
operation applied to it in both aesv8-gcm-armv8.pl and
aesv8-gcm-armv8-unroll8.pl is a swap of its low and high halves via
`ext arg.16b, arg.16b, arg.16b, #8`. These swaps can be precomputed and
the H1-H8 values stored in swapped form in the HTable, thereby
eliminating the swaps from the critical loop of AES-GCM.
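
As a minimal sketch (illustrative only, not AWS-LC code) of what that instruction does and where the swap moves to:

    #include <stdint.h>

    typedef struct { uint64_t lo, hi; } u128;

    /* Effect of `ext v.16b, v.16b, v.16b, #8`: rotate the 128-bit vector by
     * 8 bytes, i.e. exchange its 64-bit halves. */
    static u128 swap_halves(u128 h) {
        u128 r = { h.hi, h.lo };
        return r;
    }

    /* Before this commit (conceptually): swap on every load in the loop,
     *     h = swap_halves(load(&Htable[i]));
     * After this commit: swap once while populating the table,
     *     store(&Htable[i], swap_halves(h));
     * so the critical loop uses the loaded value directly. */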

This gives a small performance gain for AES-GCM on Graviton3, at the
cost of slightly slower one-off initialization. For Graviton2, the
AES-GCM AArch64 assembly loads the H-table only once, outside of the
critical loop; hence, there is no performance benefit.
hanno-becker authored Jul 11, 2024
1 parent a0e8da9 commit 90315e2
Showing 14 changed files with 188 additions and 742 deletions.
144 changes: 0 additions & 144 deletions crypto/fipsmodule/modes/asm/aesv8-gcm-armv8-unroll8.pl

Large diffs are not rendered by default.

8 changes: 0 additions & 8 deletions crypto/fipsmodule/modes/asm/aesv8-gcm-armv8.pl
@@ -353,18 +353,15 @@
ldr $rk5q, [$cc, #80] // load rk5
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 1
ldr $h3q, [$Htable, #48] // load h3l | h3h
ext $h3b, $h3b, $h3b, #8
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 0
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 1
ldr $rk4q, [$cc, #64] // load rk4
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 2
ldr $h2q, [$Htable, #32] // load h2l | h2h
ext $h2b, $h2b, $h2b, #8
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 1
ldr $rk12q, [$cc, #192] // load rk12
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 2
ldr $h4q, [$Htable, #80] // load h4l | h4h
ext $h4b, $h4b, $h4b, #8
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 3
ldr $rk11q, [$cc, #176] // load rk11
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 2
@@ -391,7 +388,6 @@
ldr $rk9q, [$cc, #144] // load rk9
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 6
ldr $h1q, [$Htable] // load h1l | h1h
ext $h1b, $h1b, $h1b, #8
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 6
ldr $rk10q, [$cc, #160] // load rk10
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 7
@@ -962,13 +958,10 @@
ldr $rk1q, [$cc, #16] // load rk1
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 0
ldr $h3q, [$Htable, #48] // load h3l | h3h
ext $h3b, $h3b, $h3b, #8
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 0
ldr $h4q, [$Htable, #80] // load h4l | h4h
ext $h4b, $h4b, $h4b, #8
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 0
ldr $h2q, [$Htable, #32] // load h2l | h2h
ext $h2b, $h2b, $h2b, #8
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 0
ldr $rk2q, [$cc, #32] // load rk2
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 1
@@ -982,7 +975,6 @@
ldr $rk12q, [$cc, #192] // load rk12
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 2
ldr $h1q, [$Htable] // load h1l | h1h
ext $h1b, $h1b, $h1b, #8
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 2
ldr $rk10q, [$cc, #160] // load rk10
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 2
75 changes: 43 additions & 32 deletions crypto/fipsmodule/modes/asm/ghashv8-armx.pl
@@ -113,13 +113,14 @@
vand $t0,$t0,$t1
vorr $IN,$IN,$t2 @ H<<<=1
veor $H,$IN,$t0 @ twisted H
vext.8 $H, $H, $H, #8
vst1.64 {$H},[x0],#16 @ store Htable[0]
@ calculate H^2
@ calculate H^2
vext.8 $t0,$H,$H,#8 @ Karatsuba pre-processing
vpmull.p64 $Xl,$H,$H
vpmull2.p64 $Xl,$H,$H
veor $t0,$t0,$H
vpmull2.p64 $Xh,$H,$H
vpmull.p64 $Xh,$H,$H
vpmull.p64 $Xm,$t0,$t0
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
@@ -135,23 +136,25 @@
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase
vpmull.p64 $Xl,$Xl,$xC2
veor $t2,$t2,$Xh
veor $H2,$Xl,$t2
veor $t1,$Xl,$t2
vext.8 $t1,$H2,$H2,#8 @ Karatsuba pre-processing
vext.8 $H2,$t1,$t1,#8 @ Karatsuba pre-processing
veor $t1,$t1,$H2
vext.8 $Hhl,$t0,$t1,#8 @ pack Karatsuba pre-processed
vst1.64 {$Hhl-$H2},[x0],#32 @ store Htable[1..2]
vst1.64 {$Hhl},[x0],#16 @ store Htable[1..2]
vst1.64 {$H2},[x0],#16 @ store Htable[1..2]
___
if ($flavour =~ /64/) {
my ($t3,$Yl,$Ym,$Yh) = map("q$_",(4..7));
my ($H3,$H34k,$H4,$H5,$H56k,$H6,$H7,$H78k,$H8) = map("q$_",(15..23));

$code.=<<___;
@ calculate H^3 and H^4
vpmull.p64 $Xl,$H, $H2
vpmull.p64 $Yl,$H2,$H2
vpmull2.p64 $Xh,$H, $H2
vpmull2.p64 $Yh,$H2,$H2
vpmull2.p64 $Xl,$H, $H2
vpmull2.p64 $Yl,$H2,$H2
vpmull.p64 $Xh,$H, $H2
vpmull.p64 $Yh,$H2,$H2
vpmull.p64 $Xm,$t0,$t1
vpmull.p64 $Ym,$t1,$t1
@@ -180,23 +183,23 @@
veor $t2,$t2,$Xh
veor $t3,$t3,$Yh
veor $H3, $Xl,$t2 @ H^3
veor $H4,$Yl,$t3 @ H^4
veor $t0, $Xl,$t2 @ H^3
veor $t1, $Yl,$t3 @ H^4
vext.8 $t0,$H3, $H3,#8 @ Karatsuba pre-processing
vext.8 $t1,$H4,$H4,#8
vext.8 $H3,$t0,$t0,#8 @ Karatsuba pre-processing
vext.8 $H4,$t1,$t1,#8
vext.8 $t2,$H2,$H2,#8
veor $t0,$t0,$H3
veor $t1,$t1,$H4
veor $t2,$t2,$H2
vext.8 $H34k,$t0,$t1,#8 @ pack Karatsuba pre-processed
vext.8 $H34k,$t0,$t1,#8 @ pack Karatsuba pre-processed
vst1.64 {$H3-$H4},[x0],#48 @ store Htable[3..5]
@ calculate H^5 and H^6
vpmull.p64 $Xl,$H2, $H3
vpmull.p64 $Yl,$H3,$H3
vpmull2.p64 $Xh,$H2, $H3
vpmull2.p64 $Yh,$H3,$H3
vpmull2.p64 $Xl,$H2, $H3
vpmull2.p64 $Yl,$H3,$H3
vpmull.p64 $Xh,$H2, $H3
vpmull.p64 $Yh,$H3,$H3
vpmull.p64 $Xm,$t0,$t2
vpmull.p64 $Ym,$t0,$t0
@@ -223,12 +226,13 @@
vpmull.p64 $Xl,$Xl,$xC2
vpmull.p64 $Yl,$Yl,$xC2
veor $t2,$t2,$Xh
veor $t3,$t3,$Yh
veor $H5,$Xl,$t2 @ H^5
veor $H6,$Yl,$t3 @ H^6
veor $t3,$t3,$Yh
veor $t0,$Xl,$t2 @ H^5
veor $t1,$Yl,$t3 @ H^6
vext.8 $t0,$H5, $H5,#8 @ Karatsuba pre-processing
vext.8 $t1,$H6,$H6,#8
vext.8 $H5, $t0, $t0,#8 @ Karatsuba pre-processing
vext.8 $H6, $t1, $t1,#8
vext.8 $t2,$H2,$H2,#8
veor $t0,$t0,$H5
veor $t1,$t1,$H6
@@ -237,10 +241,10 @@
vst1.64 {$H5-$H6},[x0],#48 @ store Htable[6..8]
@ calculate H^7 and H^8
vpmull.p64 $Xl,$H2,$H5
vpmull.p64 $Yl,$H2,$H6
vpmull2.p64 $Xh,$H2,$H5
vpmull2.p64 $Yh,$H2,$H6
vpmull2.p64 $Xl,$H2,$H5
vpmull2.p64 $Yl,$H2,$H6
vpmull.p64 $Xh,$H2,$H5
vpmull.p64 $Yh,$H2,$H6
vpmull.p64 $Xm,$t0,$t2
vpmull.p64 $Ym,$t1,$t2
@@ -268,11 +272,11 @@
vpmull.p64 $Yl,$Yl,$xC2
veor $t2,$t2,$Xh
veor $t3,$t3,$Yh
veor $H7,$Xl,$t2 @ H^7
veor $H8,$Yl,$t3 @ H^8
veor $t0,$Xl,$t2 @ H^7
veor $t1,$Yl,$t3 @ H^8
vext.8 $t0,$H7,$H7,#8 @ Karatsuba pre-processing
vext.8 $t1,$H8,$H8,#8
vext.8 $H7,$t0,$t0,#8 @ Karatsuba pre-processing
vext.8 $H8,$t1,$t1,#8
veor $t0,$t0,$H7
veor $t1,$t1,$H8
vext.8 $H78k,$t0,$t1,#8 @ pack Karatsuba pre-processed
@@ -299,6 +303,7 @@
vld1.64 {$t1},[$Xi] @ load Xi
vmov.i8 $xC2,#0xe1
vld1.64 {$H-$Hhl},[$Htbl] @ load twisted H, ...
vext.8 $H,$H,$H,#8
vshl.u64 $xC2,$xC2,#57
#ifndef __ARMEB__
vrev64.8 $t1,$t1
@@ -375,8 +380,10 @@
@ loaded twice, but last
@ copy is not processed
vld1.64 {$H-$Hhl},[$Htbl],#32 @ load twisted H, ..., H^2
vext.8 $H,$H,$H,#8
vmov.i8 $xC2,#0xe1
vld1.64 {$H2},[$Htbl]
vext.8 $H2,$H2,$H2,#8
cclr $inc,eq @ is it time to zero $inc?
vext.8 $Xl,$Xl,$Xl,#8 @ rotate Xi
vld1.64 {$t0},[$inp],#16 @ load [rotated] I[0]
@@ -513,8 +520,12 @@
.Lgcm_ghash_v8_4x:
vld1.64 {$Xl},[$Xi] @ load [rotated] Xi
vld1.64 {$H-$H2},[$Htbl],#48 @ load twisted H, ..., H^2
vext.8 $H,$H,$H,#8
vext.8 $H2,$H2,$H2,#8
vmov.i8 $xC2,#0xe1
vld1.64 {$H3-$H4},[$Htbl] @ load twisted H^3, ..., H^4
vext.8 $H3,$H3,$H3,#8
vext.8 $H4,$H4,$H4,#8
vshl.u64 $xC2,$xC2,#57 @ compose 0xc2.0 constant
vld1.64 {$I0-$j3},[$inp],#64