From 87979154350fddf07cf564d35bf984a0f596478a Mon Sep 17 00:00:00 2001 From: Einar Rasmussen Date: Fri, 8 Sep 2023 12:29:34 +0800 Subject: [PATCH 01/11] Insert MSM and FFT code and their benchmarks. Resolves taikoxyz/zkevm-circuits#150. --- Cargo.toml | 17 ++++- benches/fft.rs | 24 +++++++ benches/msm-alt.rs | 56 +++++++++++++++++ benches/msm.rs | 34 ++++++++++ src/fft.rs | 134 +++++++++++++++++++++++++++++++++++++++ src/lib.rs | 3 + src/msm.rs | 153 +++++++++++++++++++++++++++++++++++++++++++++ src/multicore.rs | 16 +++++ 8 files changed, 436 insertions(+), 1 deletion(-) create mode 100644 benches/fft.rs create mode 100644 benches/msm-alt.rs create mode 100644 benches/msm.rs create mode 100644 src/fft.rs create mode 100644 src/msm.rs create mode 100644 src/multicore.rs diff --git a/Cargo.toml b/Cargo.toml index f29c917e..121552ec 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,6 +14,7 @@ criterion = { version = "0.3", features = ["html_reports"] } rand_xorshift = "0.3" ark-std = { version = "0.3" } bincode = "1.3.3" +halo2_proofs = { git = "https://github.com/privacy-scaling-explorations/halo2.git", rev="7a21656" } [dependencies] subtle = "2.4" @@ -31,9 +32,11 @@ paste = "1.0.11" serde = { version = "1.0", default-features = false, optional = true } serde_arrays = { version = "0.1.0", optional = true } blake2b_simd = "1" +maybe-rayon = { version = "0.1.0", default-features = false } [features] -default = ["reexport", "bits"] +default = ["reexport", "bits", "multicore"] +multicore = ["maybe-rayon/threads"] asm = [] bits = ["ff/bits"] bn256-table = [] @@ -67,3 +70,15 @@ harness = false [[bench]] name = "hash_to_curve" harness = false + +[[bench]] +name = "fft" +harness = false + +[[bench]] +name = "msm" +harness = false + +[[bench]] +name = "msm-alt" +harness = false diff --git a/benches/fft.rs b/benches/fft.rs new file mode 100644 index 00000000..459b9494 --- /dev/null +++ b/benches/fft.rs @@ -0,0 +1,24 @@ +#[macro_use] +extern crate criterion; + +use 
group::ff::Field; +use halo2curves::{fft::best_fft, pasta::Fp}; + +use criterion::{BenchmarkId, Criterion}; +use rand_core::OsRng; + +fn criterion_benchmark(c: &mut Criterion) { + let mut group = c.benchmark_group("fft"); + for k in 3..19 { + group.bench_function(BenchmarkId::new("k", k), |b| { + let mut a = (0..(1 << k)).map(|_| Fp::random(OsRng)).collect::>(); + let omega = Fp::random(OsRng); // would be weird if this mattered + b.iter(|| { + best_fft(&mut a, omega, k as u32); + }); + }); + } +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/benches/msm-alt.rs b/benches/msm-alt.rs new file mode 100644 index 00000000..4c452530 --- /dev/null +++ b/benches/msm-alt.rs @@ -0,0 +1,56 @@ +//! This benchmark allows testing msm without depending on the `halo2_proofs` +//! crate. This code originates in an older version of `halo2_proofs` from +//! before the `hash_to_curve` method was implemented. It currently only uses +//! curve `Secp256k1Affine` + +#[macro_use] +extern crate criterion; + +use criterion::{black_box, BenchmarkId, Criterion}; +use ff::Field; +use halo2_proofs::arithmetic::small_multiexp; +use halo2curves::secp256k1::Fq as Scalar; +use halo2curves::secp256k1::Secp256k1Affine; +use halo2curves::CurveAffine; +use rand_core::OsRng; +use rand_core::SeedableRng; +use rand_xorshift::XorShiftRng; +use std::iter::zip; + +fn random_curve_points(k: u8) -> Vec { + debug_assert!(k < 64); + let n: u64 = 1 << k; + + let mut rng = XorShiftRng::from_seed([ + 0x59, 0x62, 0xbe, 0x5d, 0x76, 0x3d, 0x31, 0x8d, 0x17, 0xdb, 0x37, 0x32, 0x54, 0x06, 0xbc, + 0xe5, + ]); + + (0..n).map(|_n| Secp256k1Affine::random(&mut rng)).collect() +} + +fn criterion_benchmark(c: &mut Criterion) { + let mut group = c.benchmark_group("msm-alt"); + for k in 8..16 { + group + .bench_function(BenchmarkId::new("k", k), |b| { + let rng = OsRng; + + let mut g = random_curve_points::(k); + let half_len = g.len() / 2; + let (g_lo, g_hi) = 
g.split_at_mut(half_len); + let coeff_1 = Scalar::random(rng); + let coeff_2 = Scalar::random(rng); + + b.iter(|| { + for (g_lo, g_hi) in zip(g_lo.iter(), g_hi.iter()) { + small_multiexp(&[black_box(coeff_1), black_box(coeff_2)], &[*g_lo, *g_hi]); + } + }) + }) + .sample_size(30); + } +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/benches/msm.rs b/benches/msm.rs new file mode 100644 index 00000000..419816a3 --- /dev/null +++ b/benches/msm.rs @@ -0,0 +1,34 @@ +#[macro_use] +extern crate criterion; +use criterion::{black_box, Criterion}; +use ff::Field; +use halo2_proofs::poly::{commitment::ParamsProver, ipa::commitment::ParamsIPA}; +use halo2curves::msm::small_multiexp; +use pasta_curves::{EqAffine, Fp}; +use rand_core::OsRng; + +fn criterion_benchmark(c: &mut Criterion) { + let rng = OsRng; + + // small multiexp + { + let params: ParamsIPA = ParamsIPA::new(5); + let g = &mut params.get_g().to_vec(); + let len = g.len() / 2; + let (g_lo, g_hi) = g.split_at_mut(len); + + let coeff_1 = Fp::random(rng); + let coeff_2 = Fp::random(rng); + + c.bench_function("double-and-add", |b| { + b.iter(|| { + for (g_lo, g_hi) in g_lo.iter().zip(g_hi.iter()) { + small_multiexp(&[black_box(coeff_1), black_box(coeff_2)], &[*g_lo, *g_hi]); + } + }) + }); + } +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/src/fft.rs b/src/fft.rs new file mode 100644 index 00000000..6eb3487e --- /dev/null +++ b/src/fft.rs @@ -0,0 +1,134 @@ +use crate::multicore; +pub use crate::{CurveAffine, CurveExt}; +use ff::Field; +use group::{GroupOpsOwned, ScalarMulOwned}; + +/// This represents an element of a group with basic operations that can be +/// performed. This allows an FFT implementation (for example) to operate +/// generically over either a field or elliptic curve group. 
+pub trait FftGroup<Scalar: Field>: + Copy + Send + Sync + 'static + GroupOpsOwned + ScalarMulOwned<Scalar> +{ +} + +impl<T, Scalar> FftGroup<Scalar> for T +where + Scalar: Field, + T: Copy + Send + Sync + 'static + GroupOpsOwned + ScalarMulOwned<Scalar>, +{ +} + +/// Performs a radix-$2$ Fast-Fourier Transformation (FFT) on a vector of size +/// $n = 2^k$, when provided `log_n` = $k$ and an element of multiplicative +/// order $n$ called `omega` ($\omega$). The result is that the vector `a`, when +/// interpreted as the coefficients of a polynomial of degree $n - 1$, is +/// transformed into the evaluations of this polynomial at each of the $n$ +/// distinct powers of $\omega$. This transformation is invertible by providing +/// $\omega^{-1}$ in place of $\omega$ and dividing each resulting field element +/// by $n$. +/// +/// This will use multithreading if beneficial. +pub fn best_fft<Scalar: Field, G: FftGroup<Scalar>>(a: &mut [G], omega: Scalar, log_n: u32) { + fn bitreverse(mut n: usize, l: usize) -> usize { + let mut r = 0; + for _ in 0..l { + r = (r << 1) | (n & 1); + n >>= 1; + } + r + } + + let threads = multicore::current_num_threads(); + let log_threads = threads.ilog2(); + let n = a.len(); + assert_eq!(n, 1 << log_n); + + for k in 0..n { + let rk = bitreverse(k, log_n as usize); + if k < rk { + a.swap(rk, k); + } + } + + // precompute twiddle factors + let twiddles: Vec<_> = (0..(n / 2)) + .scan(Scalar::ONE, |w, _| { + let tw = *w; + *w *= &omega; + Some(tw) + }) + .collect(); + + if log_n <= log_threads { + let mut chunk = 2_usize; + let mut twiddle_chunk = n / 2; + for _ in 0..log_n { + a.chunks_mut(chunk).for_each(|coeffs| { + let (left, right) = coeffs.split_at_mut(chunk / 2); + + // case when twiddle factor is one + let (a, left) = left.split_at_mut(1); + let (b, right) = right.split_at_mut(1); + let t = b[0]; + b[0] = a[0]; + a[0] += &t; + b[0] -= &t; + + left.iter_mut() + .zip(right.iter_mut()) + .enumerate() + .for_each(|(i, (a, b))| { + let mut t = *b; + t *= &twiddles[(i + 1) * twiddle_chunk]; + *b = *a; + *a += &t; + *b -= &t; +
}); + }); + chunk *= 2; + twiddle_chunk /= 2; + } + } else { + recursive_butterfly_arithmetic(a, n, 1, &twiddles) + } +} + +/// This performs recursive butterfly arithmetic +pub fn recursive_butterfly_arithmetic<Scalar: Field, G: FftGroup<Scalar>>( + a: &mut [G], + n: usize, + twiddle_chunk: usize, + twiddles: &[Scalar], +) { + if n == 2 { + let t = a[1]; + a[1] = a[0]; + a[0] += &t; + a[1] -= &t; + } else { + let (left, right) = a.split_at_mut(n / 2); + multicore::join( + || recursive_butterfly_arithmetic(left, n / 2, twiddle_chunk * 2, twiddles), + || recursive_butterfly_arithmetic(right, n / 2, twiddle_chunk * 2, twiddles), + ); + + // case when twiddle factor is one + let (a, left) = left.split_at_mut(1); + let (b, right) = right.split_at_mut(1); + let t = b[0]; + b[0] = a[0]; + a[0] += &t; + b[0] -= &t; + + left.iter_mut() + .zip(right.iter_mut()) + .enumerate() + .for_each(|(i, (a, b))| { + let mut t = *b; + t *= &twiddles[(i + 1) * twiddle_chunk]; + *b = *a; + *a += &t; + *b -= &t; + }); + } +} diff --git a/src/lib.rs b/src/lib.rs index 3fa8e98f..670a6448 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,5 +1,8 @@ mod arithmetic; +pub mod fft; pub mod hash_to_curve; +pub mod msm; +pub mod multicore; #[macro_use] pub mod legendre; pub mod serde; diff --git a/src/msm.rs b/src/msm.rs new file mode 100644 index 00000000..de30be55 --- /dev/null +++ b/src/msm.rs @@ -0,0 +1,153 @@ +use ff::PrimeField; +use group::Group; +use pasta_curves::arithmetic::CurveAffine; + +use crate::multicore; + +pub fn multiexp_serial<C: CurveAffine>(coeffs: &[C::Scalar], bases: &[C], acc: &mut C::Curve) { + let coeffs: Vec<_> = coeffs.iter().map(|a| a.to_repr()).collect(); + + let c = if bases.len() < 4 { + 1 + } else if bases.len() < 32 { + 3 + } else { + (f64::from(bases.len() as u32)).ln().ceil() as usize + }; + + fn get_at<F: PrimeField>(segment: usize, c: usize, bytes: &F::Repr) -> usize { + let skip_bits = segment * c; + let skip_bytes = skip_bits / 8; + + if skip_bytes >= 32 { + return 0; + } + + let mut v = [0; 8]; + for (v, o) in
v.iter_mut().zip(bytes.as_ref()[skip_bytes..].iter()) { + *v = *o; + } + + let mut tmp = u64::from_le_bytes(v); + tmp >>= skip_bits - (skip_bytes * 8); + tmp %= 1 << c; + + tmp as usize + } + + let segments = (256 / c) + 1; + + for current_segment in (0..segments).rev() { + for _ in 0..c { + *acc = acc.double(); + } + + #[derive(Clone, Copy)] + enum Bucket { + None, + Affine(C), + Projective(C::Curve), + } + + impl Bucket { + fn add_assign(&mut self, other: &C) { + *self = match *self { + Bucket::None => Bucket::Affine(*other), + Bucket::Affine(a) => Bucket::Projective(a + *other), + Bucket::Projective(mut a) => { + a += *other; + Bucket::Projective(a) + } + } + } + + fn add(self, mut other: C::Curve) -> C::Curve { + match self { + Bucket::None => other, + Bucket::Affine(a) => { + other += a; + other + } + Bucket::Projective(a) => other + a, + } + } + } + + let mut buckets: Vec> = vec![Bucket::None; (1 << c) - 1]; + + for (coeff, base) in coeffs.iter().zip(bases.iter()) { + let coeff = get_at::(current_segment, c, coeff); + if coeff != 0 { + buckets[coeff - 1].add_assign(base); + } + } + + // Summation by parts + // e.g. 3a + 2b + 1c = a + + // (a) + b + + // ((a) + b) + c + let mut running_sum = C::Curve::identity(); + for exp in buckets.into_iter().rev() { + running_sum = exp.add(running_sum); + *acc += &running_sum; + } + } +} + +/// Performs a small multi-exponentiation operation. +/// Uses the double-and-add algorithm with doublings shared across points. 
+pub fn small_multiexp(coeffs: &[C::Scalar], bases: &[C]) -> C::Curve { + let coeffs: Vec<_> = coeffs.iter().map(|a| a.to_repr()).collect(); + let mut acc = C::Curve::identity(); + + // for byte idx + for byte_idx in (0..32).rev() { + // for bit idx + for bit_idx in (0..8).rev() { + acc = acc.double(); + // for each coeff + for coeff_idx in 0..coeffs.len() { + let byte = coeffs[coeff_idx].as_ref()[byte_idx]; + if ((byte >> bit_idx) & 1) != 0 { + acc += bases[coeff_idx]; + } + } + } + } + + acc +} + +/// Performs a multi-exponentiation operation. +/// +/// This function will panic if coeffs and bases have a different length. +/// +/// This will use multithreading if beneficial. +pub fn best_multiexp(coeffs: &[C::Scalar], bases: &[C]) -> C::Curve { + assert_eq!(coeffs.len(), bases.len()); + + let num_threads = multicore::current_num_threads(); + if coeffs.len() > num_threads { + let chunk = coeffs.len() / num_threads; + let num_chunks = coeffs.chunks(chunk).len(); + let mut results = vec![C::Curve::identity(); num_chunks]; + multicore::scope(|scope| { + let chunk = coeffs.len() / num_threads; + + for ((coeffs, bases), acc) in coeffs + .chunks(chunk) + .zip(bases.chunks(chunk)) + .zip(results.iter_mut()) + { + scope.spawn(move |_| { + multiexp_serial(coeffs, bases, acc); + }); + } + }); + results.iter().fold(C::Curve::identity(), |a, b| a + b) + } else { + let mut acc = C::Curve::identity(); + multiexp_serial(coeffs, bases, &mut acc); + acc + } +} diff --git a/src/multicore.rs b/src/multicore.rs new file mode 100644 index 00000000..d8323553 --- /dev/null +++ b/src/multicore.rs @@ -0,0 +1,16 @@ +pub use maybe_rayon::{ + iter::{IntoParallelIterator, IntoParallelRefMutIterator, ParallelIterator}, + join, scope, Scope, +}; + +#[cfg(feature = "multicore")] +pub use maybe_rayon::{ + current_num_threads, + iter::{IndexedParallelIterator, IntoParallelRefIterator}, + slice::ParallelSliceMut, +}; + +#[cfg(not(feature = "multicore"))] +pub fn current_num_threads() -> usize { + 1 
+} From 77b98f25ed67af56e3d091241230477cb79a7ca7 Mon Sep 17 00:00:00 2001 From: Einar Rasmussen Date: Fri, 8 Sep 2023 20:15:33 +0800 Subject: [PATCH 02/11] feedback --- Cargo.toml | 5 ---- benches/msm-alt.rs | 56 ------------------------------------- benches/msm.rs | 69 ++++++++++++++++++++++++++++++++-------------- 3 files changed, 49 insertions(+), 81 deletions(-) delete mode 100644 benches/msm-alt.rs diff --git a/Cargo.toml b/Cargo.toml index 121552ec..b722272d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,7 +14,6 @@ criterion = { version = "0.3", features = ["html_reports"] } rand_xorshift = "0.3" ark-std = { version = "0.3" } bincode = "1.3.3" -halo2_proofs = { git = "https://github.com/privacy-scaling-explorations/halo2.git", rev="7a21656" } [dependencies] subtle = "2.4" @@ -78,7 +77,3 @@ harness = false [[bench]] name = "msm" harness = false - -[[bench]] -name = "msm-alt" -harness = false diff --git a/benches/msm-alt.rs b/benches/msm-alt.rs deleted file mode 100644 index 4c452530..00000000 --- a/benches/msm-alt.rs +++ /dev/null @@ -1,56 +0,0 @@ -//! This benchmark allows testing msm without depending on the `halo2_proofs` -//! crate. This code originates in an older version of `halo2_proofs` from -//! before the `hash_to_curve` method was implemented. It currently only uses -//! 
curve `Secp256k1Affine` - -#[macro_use] -extern crate criterion; - -use criterion::{black_box, BenchmarkId, Criterion}; -use ff::Field; -use halo2_proofs::arithmetic::small_multiexp; -use halo2curves::secp256k1::Fq as Scalar; -use halo2curves::secp256k1::Secp256k1Affine; -use halo2curves::CurveAffine; -use rand_core::OsRng; -use rand_core::SeedableRng; -use rand_xorshift::XorShiftRng; -use std::iter::zip; - -fn random_curve_points(k: u8) -> Vec { - debug_assert!(k < 64); - let n: u64 = 1 << k; - - let mut rng = XorShiftRng::from_seed([ - 0x59, 0x62, 0xbe, 0x5d, 0x76, 0x3d, 0x31, 0x8d, 0x17, 0xdb, 0x37, 0x32, 0x54, 0x06, 0xbc, - 0xe5, - ]); - - (0..n).map(|_n| Secp256k1Affine::random(&mut rng)).collect() -} - -fn criterion_benchmark(c: &mut Criterion) { - let mut group = c.benchmark_group("msm-alt"); - for k in 8..16 { - group - .bench_function(BenchmarkId::new("k", k), |b| { - let rng = OsRng; - - let mut g = random_curve_points::(k); - let half_len = g.len() / 2; - let (g_lo, g_hi) = g.split_at_mut(half_len); - let coeff_1 = Scalar::random(rng); - let coeff_2 = Scalar::random(rng); - - b.iter(|| { - for (g_lo, g_hi) in zip(g_lo.iter(), g_hi.iter()) { - small_multiexp(&[black_box(coeff_1), black_box(coeff_2)], &[*g_lo, *g_hi]); - } - }) - }) - .sample_size(30); - } -} - -criterion_group!(benches, criterion_benchmark); -criterion_main!(benches); diff --git a/benches/msm.rs b/benches/msm.rs index 419816a3..6dc36245 100644 --- a/benches/msm.rs +++ b/benches/msm.rs @@ -1,32 +1,61 @@ +//! This benchmark allows testing msm without depending on the `halo2_proofs` +//! crate. This code originates in an older version of `halo2_proofs` from +//! before the `hash_to_curve` method was implemented. It currently only uses +//! 
curve `Secp256k1Affine` + #[macro_use] extern crate criterion; -use criterion::{black_box, Criterion}; + +use criterion::{black_box, BenchmarkId, Criterion}; use ff::Field; -use halo2_proofs::poly::{commitment::ParamsProver, ipa::commitment::ParamsIPA}; -use halo2curves::msm::small_multiexp; -use pasta_curves::{EqAffine, Fp}; +use halo2curves::bn256::Fr as Scalar; +use halo2curves::bn256::G1Affine; +use halo2curves::msm::best_multiexp; +use halo2curves::CurveAffine; use rand_core::OsRng; +use rand_core::SeedableRng; +use rand_xorshift::XorShiftRng; +use std::iter::zip; + +fn random_curve_points(k: u8) -> Vec { + debug_assert!(k < 64); + let n: u64 = 1 << k; + + let mut rng = XorShiftRng::from_seed([ + 0x59, 0x62, 0xbe, 0x5d, 0x76, 0x3d, 0x31, 0x8d, 0x17, 0xdb, 0x37, 0x32, 0x54, 0x06, 0xbc, + 0xe5, + ]); + + (0..n).map(|_n| G1Affine::random(&mut rng)).collect() +} + +#[cfg(not(feature = "multicore"))] +const RANGE: [u8; 6] = [3, 8, 10, 12 /*(Ethereum KZG / EIP 4844)*/, 14, 16]; +#[cfg(feature = "multicore")] +const RANGE: [u8; 9] = [ + 3, 8, 10, 12, /*(Ethereum KZG / EIP 4844)*/ + 14, 16, 18, 20, 21, +]; fn criterion_benchmark(c: &mut Criterion) { + let mut group = c.benchmark_group("msm"); let rng = OsRng; + for k in RANGE { + group + .bench_function(BenchmarkId::new("k", k), |b| { + let mut g = random_curve_points::(k); + let half_len = g.len() / 2; + let (g_lo, g_hi) = g.split_at_mut(half_len); + let coeff_1 = Scalar::random(rng); + let coeff_2 = Scalar::random(rng); - // small multiexp - { - let params: ParamsIPA = ParamsIPA::new(5); - let g = &mut params.get_g().to_vec(); - let len = g.len() / 2; - let (g_lo, g_hi) = g.split_at_mut(len); - - let coeff_1 = Fp::random(rng); - let coeff_2 = Fp::random(rng); - - c.bench_function("double-and-add", |b| { - b.iter(|| { - for (g_lo, g_hi) in g_lo.iter().zip(g_hi.iter()) { - small_multiexp(&[black_box(coeff_1), black_box(coeff_2)], &[*g_lo, *g_hi]); - } + b.iter(|| { + for (g_lo, g_hi) in zip(g_lo.iter(), g_hi.iter()) { 
+ best_multiexp(&[black_box(coeff_1), black_box(coeff_2)], &[*g_lo, *g_hi]); + } + }) }) - }); + .sample_size(10); } } From 2b269848f2640fdd62205f9aa5dd81db03516afc Mon Sep 17 00:00:00 2001 From: Einar Rasmussen Date: Fri, 8 Sep 2023 20:31:46 +0800 Subject: [PATCH 03/11] Add instructions --- benches/msm.rs | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/benches/msm.rs b/benches/msm.rs index 6dc36245..213456eb 100644 --- a/benches/msm.rs +++ b/benches/msm.rs @@ -1,7 +1,13 @@ -//! This benchmark allows testing msm without depending on the `halo2_proofs` -//! crate. This code originates in an older version of `halo2_proofs` from -//! before the `hash_to_curve` method was implemented. It currently only uses -//! curve `Secp256k1Affine` +//! This benchmarks Multi Scalar Multiplication (MSM). +//! It measures `G1` from the BN256 curve. +//! +//! Benchmark with default feature `multicore` enabled: +//! +//! cargo bench -- msm +//! +//! To run with as singlecore: +//! +//! cargo bench --no-default-features -- msm #[macro_use] extern crate criterion; From 1977dc029357a49eed675cefdff5233700991002 Mon Sep 17 00:00:00 2001 From: Einar Rasmussen Date: Fri, 8 Sep 2023 21:15:59 +0800 Subject: [PATCH 04/11] feeback --- benches/msm.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benches/msm.rs b/benches/msm.rs index 213456eb..a3c2e759 100644 --- a/benches/msm.rs +++ b/benches/msm.rs @@ -40,7 +40,7 @@ const RANGE: [u8; 6] = [3, 8, 10, 12 /*(Ethereum KZG / EIP 4844)*/, 14, 16]; #[cfg(feature = "multicore")] const RANGE: [u8; 9] = [ 3, 8, 10, 12, /*(Ethereum KZG / EIP 4844)*/ - 14, 16, 18, 20, 21, + 14, 16, 18, 20, 22, ]; fn criterion_benchmark(c: &mut Criterion) { From 68f41d3321c3abbf1f8b536462e2f2003a1e496a Mon Sep 17 00:00:00 2001 From: Einar Rasmussen Date: Fri, 15 Sep 2023 20:16:08 +0800 Subject: [PATCH 05/11] Implement feedback: Actually supply the correct arguments to `best_multiexp`. 
Split into `singlecore` and `multicore` benchmarks so Criterion's result caching and comparison over multiple runs makes sense. Rewrite point and scalar generation. --- benches/msm.rs | 78 ++++++++++++++++++++++++++------------------------ 1 file changed, 40 insertions(+), 38 deletions(-) diff --git a/benches/msm.rs b/benches/msm.rs index a3c2e759..9c9c8b3a 100644 --- a/benches/msm.rs +++ b/benches/msm.rs @@ -1,69 +1,71 @@ //! This benchmarks Multi Scalar Multiplication (MSM). //! It measures `G1` from the BN256 curve. //! -//! Benchmark with default feature `multicore` enabled: +//! To run this benchmark: //! //! cargo bench -- msm //! -//! To run with as singlecore: +//! Caveat: `multicore` should be read as _allowing_ for multicore computation -- +//! not enforcing it. //! -//! cargo bench --no-default-features -- msm #[macro_use] extern crate criterion; use criterion::{black_box, BenchmarkId, Criterion}; use ff::Field; -use halo2curves::bn256::Fr as Scalar; -use halo2curves::bn256::G1Affine; -use halo2curves::msm::best_multiexp; -use halo2curves::CurveAffine; -use rand_core::OsRng; +use group::prime::PrimeCurveAffine; +use halo2curves::bn256::{Fr as Scalar, G1Affine as Point}; +use halo2curves::msm::{best_multiexp, multiexp_serial}; use rand_core::SeedableRng; use rand_xorshift::XorShiftRng; -use std::iter::zip; -fn random_curve_points(k: u8) -> Vec { - debug_assert!(k < 64); - let n: u64 = 1 << k; +const SEED: [u8; 16] = [ + 0x59, 0x62, 0xbe, 0x5d, 0x76, 0x3d, 0x31, 0x8d, 0x17, 0xdb, 0x37, 0x32, 0x54, 0x06, 0xbc, 0xe5, +]; - let mut rng = XorShiftRng::from_seed([ - 0x59, 0x62, 0xbe, 0x5d, 0x76, 0x3d, 0x31, 0x8d, 0x17, 0xdb, 0x37, 0x32, 0x54, 0x06, 0xbc, - 0xe5, - ]); +const SINGLECORE_RANGE: [u8; 6] = [3, 8, 10, 12, 14, 16]; - (0..n).map(|_n| G1Affine::random(&mut rng)).collect() -} +const MULTICORE_RANGE: [u8; 9] = [3, 8, 10, 12, 14, 16, 18, 20, 22]; -#[cfg(not(feature = "multicore"))] -const RANGE: [u8; 6] = [3, 8, 10, 12 /*(Ethereum KZG / EIP 4844)*/, 14, 
16]; -#[cfg(feature = "multicore")] -const RANGE: [u8; 9] = [ - 3, 8, 10, 12, /*(Ethereum KZG / EIP 4844)*/ - 14, 16, 18, 20, 22, -]; +fn singlecore(c: &mut Criterion) { + let mut group = c.benchmark_group("msm/singlecore"); + let mut rng = XorShiftRng::from_seed(SEED); + for k in SINGLECORE_RANGE { + group + .bench_function(BenchmarkId::new("k", k), |b| { + assert!(k < 64); + let n: u64 = 1 << k; + + let bases: Vec<_> = (0..n).map(|_| Point::random(&mut rng)).collect(); + let coeffs: Vec<_> = (0..n).map(|_| Scalar::random(&mut rng)).collect(); + let mut acc = Point::identity().into(); -fn criterion_benchmark(c: &mut Criterion) { - let mut group = c.benchmark_group("msm"); - let rng = OsRng; - for k in RANGE { + b.iter(|| multiexp_serial(&coeffs, &bases, &mut black_box(acc))); + }) + .sample_size(10); + } +} + +fn multicore(c: &mut Criterion) { + let mut group = c.benchmark_group("msm/multicore"); + let mut rng = XorShiftRng::from_seed(SEED); + for k in MULTICORE_RANGE { group .bench_function(BenchmarkId::new("k", k), |b| { - let mut g = random_curve_points::(k); - let half_len = g.len() / 2; - let (g_lo, g_hi) = g.split_at_mut(half_len); - let coeff_1 = Scalar::random(rng); - let coeff_2 = Scalar::random(rng); + assert!(k < 64); + let n: u64 = 1 << k; + + let bases: Vec<_> = (0..n).map(|_| Point::random(&mut rng)).collect(); + let coeffs: Vec<_> = (0..n).map(|_| Scalar::random(&mut rng)).collect(); b.iter(|| { - for (g_lo, g_hi) in zip(g_lo.iter(), g_hi.iter()) { - best_multiexp(&[black_box(coeff_1), black_box(coeff_2)], &[*g_lo, *g_hi]); - } + best_multiexp(&coeffs, &bases); }) }) .sample_size(10); } } -criterion_group!(benches, criterion_benchmark); +criterion_group!(benches, singlecore, multicore); criterion_main!(benches); From 2bc3c1750ecf45c39591da082135d09d3dd32732 Mon Sep 17 00:00:00 2001 From: Einar Rasmussen Date: Tue, 19 Sep 2023 16:38:42 +0800 Subject: [PATCH 06/11] Use slicing and parallelism to to decrease running time. 
Laptop measurements: k=22: 109 sec k=16: 1 sec --- benches/msm.rs | 73 ++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 62 insertions(+), 11 deletions(-) diff --git a/benches/msm.rs b/benches/msm.rs index 9c9c8b3a..d075a0b5 100644 --- a/benches/msm.rs +++ b/benches/msm.rs @@ -17,8 +17,11 @@ use ff::Field; use group::prime::PrimeCurveAffine; use halo2curves::bn256::{Fr as Scalar, G1Affine as Point}; use halo2curves::msm::{best_multiexp, multiexp_serial}; +use maybe_rayon::current_thread_index; +use maybe_rayon::prelude::{IntoParallelIterator, ParallelIterator}; use rand_core::SeedableRng; use rand_xorshift::XorShiftRng; +use std::time::{Duration, SystemTime}; const SEED: [u8; 16] = [ 0x59, 0x62, 0xbe, 0x5d, 0x76, 0x3d, 0x31, 0x8d, 0x17, 0xdb, 0x37, 0x32, 0x54, 0x06, 0xbc, 0xe5, @@ -28,20 +31,71 @@ const SINGLECORE_RANGE: [u8; 6] = [3, 8, 10, 12, 14, 16]; const MULTICORE_RANGE: [u8; 9] = [3, 8, 10, 12, 14, 16, 18, 20, 22]; +/// This do get called twice, but the total running time entirely dominated by the larger instance. +fn get_data(k: u8) -> (Vec, Vec) { + let n: u64 = { + assert!(k < 64); + 1 << k + }; + + println!( + "\n\nCoefficient and curve point generation starting. 
{} coefficient-points pairs needed", + n + ); + let timer = SystemTime::now(); + let coeffs = (0..n) + .into_par_iter() + .map_init( + || { + let mut thread_seed = SEED.clone(); + let uniq = current_thread_index().unwrap().to_ne_bytes(); + assert!(std::mem::size_of::() == 8); + for i in 0..uniq.len() { + thread_seed[i] += uniq[i]; + thread_seed[i + 8] += uniq[i]; + } + XorShiftRng::from_seed(thread_seed) + }, + |mut rng, _| Scalar::random(&mut rng), + ) + .collect(); + let bases = (0..n) + .into_par_iter() + .map_init( + || { + let mut thread_seed = SEED.clone(); + let uniq = current_thread_index().unwrap().to_ne_bytes(); + assert!(std::mem::size_of::() == 8); + for i in 0..uniq.len() { + thread_seed[i] += uniq[i]; + thread_seed[i + 8] += uniq[i]; + } + XorShiftRng::from_seed(thread_seed) + }, + |mut rng, _| Point::random(&mut rng), + ) + .collect(); + let end = timer.elapsed().unwrap(); + println!( + "Coefficient and curve point generation took: {} sec.\n\n", + end.as_secs() + ); + + return (coeffs, bases); +} + fn singlecore(c: &mut Criterion) { let mut group = c.benchmark_group("msm/singlecore"); - let mut rng = XorShiftRng::from_seed(SEED); + let (coeffs, bases) = get_data(*SINGLECORE_RANGE.iter().max().unwrap()); for k in SINGLECORE_RANGE { group .bench_function(BenchmarkId::new("k", k), |b| { assert!(k < 64); - let n: u64 = 1 << k; + let n: usize = 1 << k; - let bases: Vec<_> = (0..n).map(|_| Point::random(&mut rng)).collect(); - let coeffs: Vec<_> = (0..n).map(|_| Scalar::random(&mut rng)).collect(); let mut acc = Point::identity().into(); - b.iter(|| multiexp_serial(&coeffs, &bases, &mut black_box(acc))); + b.iter(|| multiexp_serial(&coeffs[..n], &bases[..n], &mut black_box(acc))); }) .sample_size(10); } @@ -49,18 +103,15 @@ fn singlecore(c: &mut Criterion) { fn multicore(c: &mut Criterion) { let mut group = c.benchmark_group("msm/multicore"); - let mut rng = XorShiftRng::from_seed(SEED); + let (coeffs, bases) = 
get_data(*MULTICORE_RANGE.iter().max().unwrap()); for k in MULTICORE_RANGE { group .bench_function(BenchmarkId::new("k", k), |b| { assert!(k < 64); - let n: u64 = 1 << k; - - let bases: Vec<_> = (0..n).map(|_| Point::random(&mut rng)).collect(); - let coeffs: Vec<_> = (0..n).map(|_| Scalar::random(&mut rng)).collect(); + let n: usize = 1 << k; b.iter(|| { - best_multiexp(&coeffs, &bases); + best_multiexp(&coeffs[..n], &bases[..n]); }) }) .sample_size(10); From 2621efe0b281735779711fc9de59e0c5e7cd1f0f Mon Sep 17 00:00:00 2001 From: Einar Rasmussen Date: Wed, 20 Sep 2023 14:06:14 +0800 Subject: [PATCH 07/11] Refactor msm --- benches/msm.rs | 63 ++++++++++++++++++++++---------------------------- 1 file changed, 28 insertions(+), 35 deletions(-) diff --git a/benches/msm.rs b/benches/msm.rs index d075a0b5..d40994ab 100644 --- a/benches/msm.rs +++ b/benches/msm.rs @@ -12,7 +12,7 @@ #[macro_use] extern crate criterion; -use criterion::{black_box, BenchmarkId, Criterion}; +use criterion::{BenchmarkId, Criterion}; use ff::Field; use group::prime::PrimeCurveAffine; use halo2curves::bn256::{Fr as Scalar, G1Affine as Point}; @@ -21,33 +21,28 @@ use maybe_rayon::current_thread_index; use maybe_rayon::prelude::{IntoParallelIterator, ParallelIterator}; use rand_core::SeedableRng; use rand_xorshift::XorShiftRng; -use std::time::{Duration, SystemTime}; +use std::time::SystemTime; +const SAMPLE_SIZE: usize = 10; +const SINGLECORE_RANGE: [u8; 6] = [3, 8, 10, 12, 14, 16]; +const MULTICORE_RANGE: [u8; 9] = [3, 8, 10, 12, 14, 16, 18, 20, 22]; const SEED: [u8; 16] = [ 0x59, 0x62, 0xbe, 0x5d, 0x76, 0x3d, 0x31, 0x8d, 0x17, 0xdb, 0x37, 0x32, 0x54, 0x06, 0xbc, 0xe5, ]; -const SINGLECORE_RANGE: [u8; 6] = [3, 8, 10, 12, 14, 16]; - -const MULTICORE_RANGE: [u8; 9] = [3, 8, 10, 12, 14, 16, 18, 20, 22]; - -/// This do get called twice, but the total running time entirely dominated by the larger instance. 
-fn get_data(k: u8) -> (Vec, Vec) { +fn generate_coefficients_and_curvepoints(k: u8) -> (Vec, Vec) { let n: u64 = { assert!(k < 64); 1 << k }; - println!( - "\n\nCoefficient and curve point generation starting. {} coefficient-points pairs needed", - n - ); + println!("\n\nGenerating 2^{k} = {n} coefficients and curve points..",); let timer = SystemTime::now(); let coeffs = (0..n) .into_par_iter() .map_init( || { - let mut thread_seed = SEED.clone(); + let mut thread_seed = SEED; let uniq = current_thread_index().unwrap().to_ne_bytes(); assert!(std::mem::size_of::() == 8); for i in 0..uniq.len() { @@ -56,14 +51,14 @@ fn get_data(k: u8) -> (Vec, Vec) { } XorShiftRng::from_seed(thread_seed) }, - |mut rng, _| Scalar::random(&mut rng), + |rng, _| Scalar::random(rng), ) .collect(); let bases = (0..n) .into_par_iter() .map_init( || { - let mut thread_seed = SEED.clone(); + let mut thread_seed = SEED; let uniq = current_thread_index().unwrap().to_ne_bytes(); assert!(std::mem::size_of::() == 8); for i in 0..uniq.len() { @@ -72,51 +67,49 @@ fn get_data(k: u8) -> (Vec, Vec) { } XorShiftRng::from_seed(thread_seed) }, - |mut rng, _| Point::random(&mut rng), + |rng, _| Point::random(rng), ) .collect(); let end = timer.elapsed().unwrap(); - println!( - "Coefficient and curve point generation took: {} sec.\n\n", + println!("Generating 2^{k} = {n} coefficients and curve points took: {} sec.\n\n", end.as_secs() ); - return (coeffs, bases); + (coeffs, bases) } -fn singlecore(c: &mut Criterion) { - let mut group = c.benchmark_group("msm/singlecore"); - let (coeffs, bases) = get_data(*SINGLECORE_RANGE.iter().max().unwrap()); +fn msm(c: &mut Criterion) { + let mut group = c.benchmark_group("msm"); + let max_k = *SINGLECORE_RANGE + .iter() + .chain(MULTICORE_RANGE.iter()) + .max() + .unwrap_or(&16); + let (coeffs, bases) = generate_coefficients_and_curvepoints(max_k); + for k in SINGLECORE_RANGE { group - .bench_function(BenchmarkId::new("k", k), |b| { + 
.bench_function(BenchmarkId::new("singlecore", k), |b| { assert!(k < 64); let n: usize = 1 << k; - let mut acc = Point::identity().into(); - - b.iter(|| multiexp_serial(&coeffs[..n], &bases[..n], &mut black_box(acc))); + b.iter(|| multiexp_serial(&coeffs[..n], &bases[..n], &mut acc)); }) .sample_size(10); } -} - -fn multicore(c: &mut Criterion) { - let mut group = c.benchmark_group("msm/multicore"); - let (coeffs, bases) = get_data(*MULTICORE_RANGE.iter().max().unwrap()); for k in MULTICORE_RANGE { group - .bench_function(BenchmarkId::new("k", k), |b| { + .bench_function(BenchmarkId::new("multicore", k), |b| { assert!(k < 64); let n: usize = 1 << k; - b.iter(|| { best_multiexp(&coeffs[..n], &bases[..n]); }) }) - .sample_size(10); + .sample_size(SAMPLE_SIZE); } + group.finish(); } -criterion_group!(benches, singlecore, multicore); +criterion_group!(benches, msm); criterion_main!(benches); From 16ae1468cdcebd32a6f00375efd40c7a002b0b2a Mon Sep 17 00:00:00 2001 From: Einar Rasmussen Date: Wed, 20 Sep 2023 14:06:44 +0800 Subject: [PATCH 08/11] Refactor fft --- benches/fft.rs | 39 ++++++++++++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 9 deletions(-) diff --git a/benches/fft.rs b/benches/fft.rs index 459b9494..7dff3f0f 100644 --- a/benches/fft.rs +++ b/benches/fft.rs @@ -1,24 +1,45 @@ #[macro_use] extern crate criterion; -use group::ff::Field; -use halo2curves::{fft::best_fft, pasta::Fp}; - use criterion::{BenchmarkId, Criterion}; +use group::ff::Field; +use halo2curves::bn256::Fr as Scalar; +use halo2curves::fft::best_fft; use rand_core::OsRng; +use std::ops::Range; +use std::time::SystemTime; + +const RANGE: Range = 3..19; + +fn generate_data(k: u32) -> Vec { + let n = 1 << k; + let timer = SystemTime::now(); + println!("\n\nGenerating 2^{k} = {n} values..",); + let data: Vec = (0..n).map(|_| Scalar::random(OsRng)).collect(); + let end = timer.elapsed().unwrap(); + println!( + "Generating 2^{k} = {n} values took: {} sec.\n\n", + end.as_secs() + ); + 
data +} -fn criterion_benchmark(c: &mut Criterion) { +fn fft(c: &mut Criterion) { + let max_k = RANGE.max().unwrap_or(16); + let mut data = generate_data(max_k); + let omega = Scalar::random(OsRng); let mut group = c.benchmark_group("fft"); - for k in 3..19 { + for k in RANGE { group.bench_function(BenchmarkId::new("k", k), |b| { - let mut a = (0..(1 << k)).map(|_| Fp::random(OsRng)).collect::>(); - let omega = Fp::random(OsRng); // would be weird if this mattered + let n = 1 << k; + assert!(n <= data.len()); b.iter(|| { - best_fft(&mut a, omega, k as u32); + best_fft(&mut data[..n], omega, k); }); }); } + group.finish(); } -criterion_group!(benches, criterion_benchmark); +criterion_group!(benches, fft); criterion_main!(benches); From a5eab13a7ff76a172cafd38b034bc60b9fa61339 Mon Sep 17 00:00:00 2001 From: Einar Rasmussen Date: Wed, 20 Sep 2023 14:31:41 +0800 Subject: [PATCH 09/11] Update module comments --- benches/fft.rs | 12 ++++++++++++ benches/msm.rs | 6 +++--- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/benches/fft.rs b/benches/fft.rs index 7dff3f0f..a250308d 100644 --- a/benches/fft.rs +++ b/benches/fft.rs @@ -1,3 +1,15 @@ +//! This benchmarks the Fast Fourier Transform (FFT). +//! Since it is over a finite field, it is actually the Number Theoretic +//! Transform (NTT). It uses the `Fr` scalar field from the BN256 curve. +//! +//! To run this benchmark: +//! +//! cargo bench -- fft +//! +//! Caveat: The multicore benchmark assumes: +//! 1. a multi-core system +//! 2. that the `multicore` feature is enabled. It is by default. + #[macro_use] extern crate criterion; diff --git a/benches/msm.rs b/benches/msm.rs index d40994ab..3d7f7581 100644 --- a/benches/msm.rs +++ b/benches/msm.rs @@ -5,9 +5,9 @@ //! //! cargo bench -- msm //! -//! Caveat: `multicore` should be read as _allowing_ for multicore computation -- -//! not enforcing it. -//! +//! Caveat: The multicore benchmark assumes: +//! 1. a multi-core system +//! 2. 
that the `multicore` feature is enabled. It is by default. #[macro_use] extern crate criterion; From 714e164da60303eb006e2087fad732023b2e1223 Mon Sep 17 00:00:00 2001 From: Einar Rasmussen Date: Wed, 20 Sep 2023 18:09:07 +0800 Subject: [PATCH 10/11] Fix formatting --- benches/msm.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benches/msm.rs b/benches/msm.rs index 3d7f7581..c78952b7 100644 --- a/benches/msm.rs +++ b/benches/msm.rs @@ -71,7 +71,8 @@ fn generate_coefficients_and_curvepoints(k: u8) -> (Vec, Vec) { ) .collect(); let end = timer.elapsed().unwrap(); - println!("Generating 2^{k} = {n} coefficients and curve points took: {} sec.\n\n", + println!( + "Generating 2^{k} = {n} coefficients and curve points took: {} sec.\n\n", end.as_secs() ); From 70924514ac84cc48dcff6e93111b7ff527ccb061 Mon Sep 17 00:00:00 2001 From: Einar Rasmussen Date: Thu, 21 Sep 2023 17:58:53 +0800 Subject: [PATCH 11/11] Implement suggestion for fixing CI --- Cargo.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/Cargo.toml b/Cargo.toml index b722272d..06edc850 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -77,3 +77,4 @@ harness = false [[bench]] name = "msm" harness = false +required-features = ["multicore"]