Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add inverse NTTs for Kyber & Dilithium #37

Merged
merged 29 commits into from
Apr 11, 2024
Merged
Show file tree
Hide file tree
Changes from 21 commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
1b84425
Dilithium invNTT
dop-amin Mar 5, 2024
d7c6180
manual_ld4 for Dilithium invNTT
dop-amin Mar 5, 2024
141a287
Kyber invNTT
dop-amin Mar 6, 2024
f3adf51
Kyber invNTT opt A72
dop-amin Mar 7, 2024
b16a319
Introduce more Kyber invNTT reductions
dop-amin Mar 7, 2024
242c68f
Kyber clean invNTT
dop-amin Mar 8, 2024
e66d9ff
Add invNTT to example.py
dop-amin Mar 11, 2024
dacc5ed
Add Kyber invNTT manual_ld4
dop-amin Mar 14, 2024
18562da
Add invNTTs to example.py
dop-amin Mar 14, 2024
e21d54f
Adjust models to optimize invNTTs
dop-amin Mar 14, 2024
e4d0834
Fix invNTT macro syntax
dop-amin Mar 14, 2024
2334d1e
Optimized Kyber invNTTs
dop-amin Mar 15, 2024
7a44d1f
Fix reduction in Dilithium 123-45678 invNTTs
dop-amin Mar 19, 2024
8313232
Optimized Dilithium invNTTs
dop-amin Mar 20, 2024
3787897
Fix modulus use invNTT Dilithium
dop-amin Mar 21, 2024
53117d8
Fix manual_ld4 invNTT Dilithium
dop-amin Mar 21, 2024
836f521
Final (?) update for invNTTs
dop-amin Mar 23, 2024
ec6e6ce
Match Kyber invNTT Barrett reduction to pqclean
dop-amin Mar 28, 2024
21d429f
New no-unfold syntax
dop-amin Apr 2, 2024
1c6ec62
Add more optimized code
dop-amin Apr 2, 2024
53ee2d3
Add simd compare class to aarch64
dop-amin Apr 2, 2024
88dc5e9
Fixed Dilithium invNTTs (ld4 ordering)
dop-amin Apr 3, 2024
7aabde4
cmge -> ASimdCompare
dop-amin Apr 3, 2024
6659a6d
Copy example config to preserve --dry-run
dop-amin Apr 6, 2024
ee53b4b
rm duplicate code
dop-amin Apr 7, 2024
8cb796e
allow src=dst in mulmod macro
dop-amin Apr 7, 2024
c7c10a9
Adjust mulmod{q} macro for fwd NTTs as well, add comments
dop-amin Apr 7, 2024
f37cffe
Fix invNTT reduction macro names
dop-amin Apr 9, 2024
d5c0d02
Add note about output range to invNTTs
dop-amin Apr 9, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
126 changes: 125 additions & 1 deletion example.py
Original file line number Diff line number Diff line change
Expand Up @@ -601,6 +601,28 @@ def core(self, slothy):
slothy.optimize_loop("layer123_start")
slothy.optimize_loop("layer4567_start")

class intt_kyber_123_4567(Example):
    """Example driver for the Kyber inverse NTT with a 123/4567 layer split.

    The input file name is ``intt_kyber_123_4567`` followed by an optional
    ``_<var>`` suffix. The output name is the input name tagged with the
    label of the selected target.
    """

    def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA55, timeout=None):
        # Input file: base name, optionally extended by the variant.
        infile = "intt_kyber_123_4567"
        if var != "":
            infile = f"{infile}_{var}"
        # Output name: same as the input file plus the target label.
        name = f"{infile}_{target_label_dict[target]}"

        super().__init__(infile, name, rename=True, arch=arch, target=target, timeout=timeout)

    def core(self, slothy):
        """Configure SLOTHY once, then optimize layers 4-7 followed by layers 1-3."""
        cfg = slothy.config
        pipelining = cfg.sw_pipelining

        pipelining.enabled = True
        cfg.inputs_are_outputs = True
        pipelining.minimize_overlapping = False
        cfg.variable_size = True
        # x0..x6, the link register and the stack pointer are kept out of renaming.
        cfg.reserved_regs = [f"x{idx}" for idx in range(7)] + ["x30", "sp"]
        cfg.constraints.stalls_first_attempt = 64

        # The same configuration is used for both loops.
        for loop_label in ("layer4567_start", "layer123_start"):
            slothy.optimize_loop(loop_label)


class ntt_kyber_123(Example):
def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA55):
Expand Down Expand Up @@ -1030,6 +1052,39 @@ def core(self, slothy):
slothy.optimize_loop("layer45678_start")


class intt_dilithium_123_45678(Example):
    """Example driver for the Dilithium inverse NTT with a 123/45678 layer split.

    The input file name is ``intt_dilithium_123_45678`` followed by an
    optional ``_<var>`` suffix. The output name additionally carries the
    label of the selected target.
    """

    def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA55, timeout=None):
        # Plain string literal: the original used an f-string with no placeholders.
        name = "intt_dilithium_123_45678"
        infile = name

        if var != "":
            name += f"_{var}"
            infile += f"_{var}"
        name += f"_{target_label_dict[target]}"

        super().__init__(infile, name, rename=True, arch=arch, target=target, timeout=timeout)

    def core(self, slothy):
        """Optimize layers 4-8 first, then layers 1-3, with per-pass stall budgets."""

        def configure_pass(stalls_first_attempt):
            # Settings applied identically before each pass; only the initial
            # stall budget differs between the two loops.
            slothy.config.inputs_are_outputs = True
            slothy.config.reserved_regs = [
                f"x{i}" for i in range(0, 7)] + ["v8", "x30", "sp"]
            slothy.config.reserved_regs += self.target_reserved
            slothy.config.constraints.stalls_first_attempt = stalls_first_attempt

        slothy.config.sw_pipelining.enabled = True
        slothy.config.sw_pipelining.minimize_overlapping = False

        configure_pass(40)
        slothy.optimize_loop("layer45678_start")

        configure_pass(110)
        slothy.optimize_loop("layer123_start")




class ntt_dilithium_123(Example):
def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA55):
name = "ntt_dilithium_123"
Expand Down Expand Up @@ -1112,7 +1167,33 @@ def core(self, slothy):

if self.timeout is not None:
dop-amin marked this conversation as resolved.
Show resolved Hide resolved
slothy.config.timeout = self.timeout * 12
if self.timeout is not None:
slothy.config.timeout = self.timeout * 12

slothy.config.reserved_regs = [
f"x{i}" for i in range(0, 6)] + ["x30", "sp"]
slothy.config.inputs_are_outputs = True
slothy.config.reserved_regs += self.target_reserved
slothy.config.sw_pipelining.enabled = True
slothy.config.sw_pipelining.minimize_overlapping = False
slothy.config.sw_pipelining.halving_heuristic = False
slothy.config.split_heuristic = False
slothy.optimize_loop("layer5678_start")


class intt_dilithium_1234_5678(Example):
def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA72, timeout=None):
name = f"intt_dilithium_1234_5678"
infile = name

if var != "":
name += f"_{var}"
infile += f"_{var}"
name += f"_{target_label_dict[target]}"

super().__init__(infile, name, rename=True, arch=arch, target=target, timeout=timeout)

def core(self, slothy):
slothy.config.reserved_regs = [
f"x{i}" for i in range(0, 6)] + ["x30", "sp"]
slothy.config.inputs_are_outputs = True
Expand All @@ -1123,6 +1204,25 @@ def core(self, slothy):
slothy.config.split_heuristic = False
slothy.optimize_loop("layer5678_start")

slothy.config = Config(self.arch, self.target)
hanno-becker marked this conversation as resolved.
Show resolved Hide resolved

if self.timeout is not None:
slothy.config.timeout = self.timeout // 12

slothy.config.sw_pipelining.enabled = True
slothy.config.sw_pipelining.minimize_overlapping = False
slothy.config.reserved_regs = [
f"x{i}" for i in range(0, 6)] + ["x30", "sp"]
slothy.config.reserved_regs += self.target_reserved
slothy.config.inputs_are_outputs = True
slothy.config.sw_pipelining.halving_heuristic = True
slothy.config.split_heuristic = True
slothy.config.split_heuristic_factor = 2
slothy.config.split_heuristic_repeat = 4
slothy.config.split_heuristic_stepsize = 0.1
slothy.config.constraints.stalls_first_attempt = 14
slothy.optimize_loop("layer1234_start")


class ntt_dilithium_1234(Example):
def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA72):
Expand Down Expand Up @@ -1296,13 +1396,17 @@ def main():
ntt_kyber_123_4567(var="scalar_load_store"),
ntt_kyber_123_4567(var="manual_st4"),
ntt_kyber_1234_567(),
intt_kyber_123_4567(),
intt_kyber_123_4567(var="manual_ld4"),
# Cortex-A72
ntt_kyber_123_4567(target=Target_CortexA72),
ntt_kyber_123_4567(var="scalar_load", target=Target_CortexA72),
ntt_kyber_123_4567(var="scalar_store", target=Target_CortexA72),
ntt_kyber_123_4567(var="scalar_load_store", target=Target_CortexA72),
ntt_kyber_123_4567(var="manual_st4", target=Target_CortexA72),
ntt_kyber_1234_567(target=Target_CortexA72),
intt_kyber_123_4567(target=Target_CortexA72),
intt_kyber_123_4567(var="manual_ld4", target=Target_CortexA72),
# # Apple M1 Firestorm
ntt_kyber_123_4567(target=Target_AppleM1_firestorm, timeout=3600),
ntt_kyber_123_4567(var="scalar_load", target=Target_AppleM1_firestorm, timeout=3600),
Expand All @@ -1311,6 +1415,8 @@ def main():
ntt_kyber_123_4567(var="manual_st4", target=Target_AppleM1_firestorm, timeout=3600),
ntt_kyber_1234_567(target=Target_AppleM1_firestorm, timeout=300),
ntt_kyber_1234_567(var="manual_st4", target=Target_AppleM1_firestorm, timeout=300),
intt_kyber_123_4567(target=Target_AppleM1_firestorm, timeout=3600),
intt_kyber_123_4567(var="manual_ld4", target=Target_AppleM1_firestorm, timeout=3600),
# Apple M1 Icestorm
ntt_kyber_123_4567(target=Target_AppleM1_icestorm, timeout=3600),
ntt_kyber_123_4567(var="scalar_load", target=Target_AppleM1_icestorm, timeout=3600),
Expand All @@ -1319,6 +1425,8 @@ def main():
ntt_kyber_123_4567(var="manual_st4", target=Target_AppleM1_icestorm, timeout=3600),
ntt_kyber_1234_567(target=Target_AppleM1_icestorm, timeout=300),
ntt_kyber_1234_567(var="manual_st4", target=Target_AppleM1_icestorm, timeout=300),
intt_kyber_123_4567(target=Target_AppleM1_icestorm, timeout=3600),
intt_kyber_123_4567(var="manual_ld4", target=Target_AppleM1_icestorm, timeout=3600),
# Kyber InvNTT
# Cortex-M55
intt_kyber_1_23_45_67(),
Expand All @@ -1340,24 +1448,40 @@ def main():
ntt_dilithium_123_45678(var="manual_st4"),
ntt_dilithium_1234_5678(),
ntt_dilithium_1234_5678(var="manual_st4"),
intt_dilithium_123_45678(),
intt_dilithium_123_45678(var="manual_ld4"),
intt_dilithium_1234_5678(),
intt_dilithium_1234_5678(var="manual_ld4"),
# Cortex-A72
ntt_dilithium_123_45678(target=Target_CortexA72),
ntt_dilithium_123_45678(var="w_scalar", target=Target_CortexA72),
ntt_dilithium_123_45678(var="manual_st4", target=Target_CortexA72),
ntt_dilithium_1234_5678(target=Target_CortexA72),
ntt_dilithium_1234_5678(var="manual_st4", target=Target_CortexA72),
intt_dilithium_123_45678(target=Target_CortexA72),
intt_dilithium_123_45678(var="manual_ld4", target=Target_CortexA72),
intt_dilithium_1234_5678(target=Target_CortexA72),
intt_dilithium_1234_5678(var="manual_ld4", target=Target_CortexA72),
# Apple M1 Firestorm
ntt_dilithium_123_45678(target=Target_AppleM1_firestorm, timeout=3600),
ntt_dilithium_123_45678(target=Target_AppleM1_firestorm, timeout=3600),
ntt_dilithium_123_45678(var="w_scalar", target=Target_AppleM1_firestorm, timeout=3600),
ntt_dilithium_123_45678(var="manual_st4", target=Target_AppleM1_firestorm, timeout=3600),
ntt_dilithium_1234_5678(target=Target_AppleM1_firestorm, timeout=300),
ntt_dilithium_1234_5678(var="manual_st4", target=Target_AppleM1_firestorm, timeout=300),
intt_dilithium_123_45678(target=Target_AppleM1_firestorm, timeout=3600),
intt_dilithium_123_45678(var="manual_ld4", target=Target_AppleM1_firestorm, timeout=3600),
intt_dilithium_1234_5678(target=Target_AppleM1_firestorm, timeout=3600),
intt_dilithium_1234_5678(var="manual_ld4", target=Target_AppleM1_firestorm, timeout=3600),
# Apple M1 Icestorm
ntt_dilithium_123_45678(target=Target_AppleM1_icestorm, timeout=3600),
ntt_dilithium_123_45678(var="w_scalar", target=Target_AppleM1_icestorm, timeout=3600),
ntt_dilithium_123_45678(var="manual_st4", target=Target_AppleM1_icestorm, timeout=3600),
ntt_dilithium_1234_5678(target=Target_AppleM1_icestorm, timeout=300),
ntt_dilithium_1234_5678(var="manual_st4", target=Target_AppleM1_icestorm, timeout=300),
intt_dilithium_123_45678(target=Target_AppleM1_icestorm, timeout=3600),
intt_dilithium_123_45678(var="manual_ld4", target=Target_AppleM1_icestorm, timeout=3600),
intt_dilithium_1234_5678(target=Target_AppleM1_icestorm, timeout=3600),
intt_dilithium_1234_5678(var="manual_ld4", target=Target_AppleM1_icestorm, timeout=3600),
# Dilithium invNTT
# Cortex-M55
intt_dilithium_12_34_56_78(),
Expand Down
12 changes: 12 additions & 0 deletions examples/misc/gen_roots.py
Original file line number Diff line number Diff line change
Expand Up @@ -405,6 +405,13 @@ def _main():
ntt_kyber_l123.export("../naive/ntt_kyber_123_45_67_twiddles.s")
ntt_kyber_l123.export("../opt/ntt_kyber_123_45_67_twiddles.s")

# For intt_kyber_123_4567.s
intt_kyber_l123 = NttRootGen(size=256,modulus=3329,root=17,layers=7,iters=[(0,3),(3,2),(5,2)],
pad=[0,3], print_label=True, widen_single_twiddles_to_words=False,
inverse=True)
intt_kyber_l123.export("../naive/aarch64/intt_kyber_123_45_67_twiddles.s")
intt_kyber_l123.export("../opt/aarch64/intt_kyber_123_45_67_twiddles.s")

ntt_kyber = NttRootGen(size=256,modulus=3329,root=17,layers=7)
ntt_kyber.export("../naive/ntt_kyber_1_23_45_67_twiddles.s")
ntt_kyber.export("../opt/ntt_kyber_1_23_45_67_twiddles.s")
Expand All @@ -428,6 +435,11 @@ def _main():
ntt_dilithium_l123.export("../naive/ntt_dilithium_123_456_78_twiddles.s")
ntt_dilithium_l123.export("../opt/ntt_dilithium_123_456_78_twiddles.s")

intt_dilithium_l123 = NttRootGen(size=256,inverse=True,bitsize=32,modulus=8380417,root=1753,layers=8,
print_label=True, pad=[0,3], iters=[(0,3),(3,3),(6,2)])
intt_dilithium_l123.export("../naive/aarch64/intt_dilithium_123_456_78_twiddles.s")
intt_dilithium_l123.export("../opt/aarch64/intt_dilithium_123_456_78_twiddles.s")

ntt_dilithium_l123 = NttRootGen(size=256,bitsize=32,modulus=8380417,root=1753,layers=8,
print_label=True, pad=[0,3], iters=[(0,3),(3,3),(6,2)])
ntt_dilithium_l123.export("../naive/aarch64/ntt_dilithium_123_456_78_twiddles.s")
Expand Down
12 changes: 8 additions & 4 deletions examples/naive/aarch64/intt_dilithium_1234_5678.s
Original file line number Diff line number Diff line change
Expand Up @@ -334,10 +334,14 @@ _intt_dilithium_1234_5678:

.p2align 2
layer5678_start:
ldr_vo data0, inp, (16*0)
ldr_vo data1, inp, (16*1)
ldr_vo data2, inp, (16*2)
ldr_vo data3, inp, (16*3)
// manual_ld4
// ldr_vo data0, inp, (16*0)
// ldr_vo data1, inp, (16*1)
// ldr_vo data2, inp, (16*2)
// ldr_vo data3, inp, (16*3)
// transpose4 data

ld4 {data0.4S, data1.4S, data2.4S, data3.4S}, [inp]

load_next_roots_78 root0, root0_tw, root1, root1_tw, root2, root2_tw, r_ptr0

Expand Down
Loading
Loading