diff --git a/Cargo.toml b/Cargo.toml
index f29c917e..121552ec 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -14,6 +14,7 @@ criterion = { version = "0.3", features = ["html_reports"] }
 rand_xorshift = "0.3"
 ark-std = { version = "0.3" }
 bincode = "1.3.3"
+halo2_proofs = { git = "https://github.com/privacy-scaling-explorations/halo2.git", rev = "7a21656" }
 
 [dependencies]
 subtle = "2.4"
@@ -31,9 +32,11 @@ paste = "1.0.11"
 serde = { version = "1.0", default-features = false, optional = true }
 serde_arrays = { version = "0.1.0", optional = true }
 blake2b_simd = "1"
+maybe-rayon = { version = "0.1.0", default-features = false }
 
 [features]
-default = ["reexport", "bits"]
+default = ["reexport", "bits", "multicore"]
+multicore = ["maybe-rayon/threads"]
 asm = []
 bits = ["ff/bits"]
 bn256-table = []
@@ -67,3 +70,15 @@ harness = false
 [[bench]]
 name = "hash_to_curve"
 harness = false
+
+[[bench]]
+name = "fft"
+harness = false
+
+[[bench]]
+name = "msm"
+harness = false
+
+[[bench]]
+name = "msm-alt"
+harness = false
diff --git a/benches/fft.rs b/benches/fft.rs
new file mode 100644
index 00000000..459b9494
--- /dev/null
+++ b/benches/fft.rs
@@ -0,0 +1,26 @@
+#[macro_use]
+extern crate criterion;
+
+use group::ff::Field;
+use halo2curves::{fft::best_fft, pasta::Fp};
+
+use criterion::{BenchmarkId, Criterion};
+use rand_core::OsRng;
+
+/// Benchmarks `best_fft` over `Fp` for input sizes `2^k` with `k` in `3..19`.
+fn criterion_benchmark(c: &mut Criterion) {
+    let mut group = c.benchmark_group("fft");
+    for k in 3..19 {
+        group.bench_function(BenchmarkId::new("k", k), |b| {
+            // The input is built once per size and transformed in place by every iteration.
+            let mut a = (0..(1 << k)).map(|_| Fp::random(OsRng)).collect::<Vec<_>>();
+            let omega = Fp::random(OsRng); // would be weird if this mattered
+            b.iter(|| {
+                best_fft(&mut a, omega, k as u32);
+            });
+        });
+    }
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/benches/msm-alt.rs b/benches/msm-alt.rs
new file mode 100644
index 00000000..3c12c73c
--- /dev/null
+++ b/benches/msm-alt.rs
@@ -0,0 +1,55 @@
+//! 
This benchmark allows testing msm without depending on the `halo2_proofs`
+//! crate. This code originates in an older version of `halo2_proofs` from
+//! before the `hash_to_curve` method was implemented.
+
+#[macro_use]
+extern crate criterion;
+
+use criterion::{black_box, BenchmarkId, Criterion};
+use ff::Field;
+// Benchmark this crate's own `small_multiexp`, so that the benchmark really is
+// independent of `halo2_proofs`, as the module documentation states.
+use halo2curves::msm::small_multiexp;
+use halo2curves::secp256k1::Fq as Scalar;
+use halo2curves::secp256k1::Secp256k1Affine;
+use halo2curves::CurveAffine;
+use rand_core::OsRng;
+use rand_core::SeedableRng;
+use rand_xorshift::XorShiftRng;
+use std::iter::zip;
+
+/// Returns `2^k` secp256k1 affine points drawn from an `XorShiftRng` with a
+/// fixed seed, so the same points are generated on every run.
+// NOTE(review): the generic parameter list and the element type of the
+// returned `Vec` were lost when this patch was extracted. They are
+// reconstructed from the `CurveAffine` import, the turbofish at the call site
+// and the function body -- confirm against the upstream file.
+fn random_curve_points<C: CurveAffine>(k: u8) -> Vec<Secp256k1Affine> {
+    debug_assert!(k < 64);
+    let n: u64 = 1 << k;
+
+    let mut rng = XorShiftRng::from_seed([
+        0x59, 0x62, 0xbe, 0x5d, 0x76, 0x3d, 0x31, 0x8d, 0x17, 0xdb, 0x37, 0x32, 0x54, 0x06, 0xbc,
+        0xe5,
+    ]);
+
+    (0..n).map(|_n| Secp256k1Affine::random(&mut rng)).collect()
+}
+
+/// Benchmarks two-term `small_multiexp` calls over pairs of random points, for
+/// `2^k` points with `k` in `8..16`.
+fn criterion_benchmark(c: &mut Criterion) {
+    let mut group = c.benchmark_group("msm-alt");
+    // `bench_function` runs the benchmark immediately, so the sample size has
+    // to be configured before the first call for it to apply to every `k`.
+    group.sample_size(30);
+    for k in 8..16 {
+        group.bench_function(BenchmarkId::new("k", k), |b| {
+            let rng = OsRng;
+
+            let g = random_curve_points::<Secp256k1Affine>(k);
+            let half_len = g.len() / 2;
+            let (g_lo, g_hi) = g.split_at(half_len);
+            let coeff_1 = Scalar::random(rng);
+            let coeff_2 = Scalar::random(rng);
+
+            b.iter(|| {
+                for (g_lo, g_hi) in zip(g_lo.iter(), g_hi.iter()) {
+                    small_multiexp(&[black_box(coeff_1), black_box(coeff_2)], &[*g_lo, *g_hi]);
+                }
+            })
+        });
+    }
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/benches/msm.rs b/benches/msm.rs
new file mode 100644
index 00000000..419816a3
--- /dev/null
+++ b/benches/msm.rs
@@ -0,0 +1,34 @@
+#[macro_use]
+extern crate criterion;
+use criterion::{black_box, Criterion};
+use ff::Field;
+use halo2_proofs::poly::{commitment::ParamsProver, ipa::commitment::ParamsIPA};
+use halo2curves::msm::small_multiexp;
+use pasta_curves::{EqAffine, Fp};
+use rand_core::OsRng;
+
+fn criterion_benchmark(c: &mut 
Criterion) {
+    let rng = OsRng;
+
+    // small multiexp
+    {
+        let params: ParamsIPA<EqAffine> = ParamsIPA::new(5);
+        let g = &mut params.get_g().to_vec();
+        let len = g.len() / 2;
+        let (g_lo, g_hi) = g.split_at_mut(len);
+
+        let coeff_1 = Fp::random(rng);
+        let coeff_2 = Fp::random(rng);
+
+        c.bench_function("double-and-add", |b| {
+            b.iter(|| {
+                for (g_lo, g_hi) in g_lo.iter().zip(g_hi.iter()) {
+                    small_multiexp(&[black_box(coeff_1), black_box(coeff_2)], &[*g_lo, *g_hi]);
+                }
+            })
+        });
+    }
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/src/fft.rs b/src/fft.rs
new file mode 100644
index 00000000..e0240cff
--- /dev/null
+++ b/src/fft.rs
@@ -0,0 +1,180 @@
+use crate::{msm::multiexp_serial, multicore};
+pub use crate::{CurveAffine, CurveExt};
+use ff::Field;
+use group::{Group, GroupOpsOwned, ScalarMulOwned};
+
+/// Returns `floor(log2(num))`.
+///
+/// # Panics
+///
+/// Panics if `num` is zero.
+fn log2_floor(num: usize) -> u32 {
+    assert!(num > 0);
+
+    // Index of the highest set bit. Unlike repeatedly comparing against
+    // `1 << (pow + 1)`, this cannot overflow the shift when the top bit of
+    // `num` is set.
+    usize::BITS - 1 - num.leading_zeros()
+}
+
+/// This represents an element of a group with basic operations that can be
+/// performed. This allows an FFT implementation (for example) to operate
+/// generically over either a field or elliptic curve group.
+pub trait FftGroup<Scalar: Field>:
+    Copy + Send + Sync + 'static + GroupOpsOwned + ScalarMulOwned<Scalar>
+{
+}
+
+// Blanket implementation: every type that satisfies the bounds is an `FftGroup`.
+impl<T, Scalar> FftGroup<Scalar> for T
+where
+    Scalar: Field,
+    T: Copy + Send + Sync + 'static + GroupOpsOwned + ScalarMulOwned<Scalar>,
+{
+}
+
+/// Performs a multi-exponentiation operation.
+///
+/// This function will panic if coeffs and bases have a different length.
+///
+/// This will use multithreading if beneficial.
+pub fn best_multiexp<C: CurveAffine>(coeffs: &[C::Scalar], bases: &[C]) -> C::Curve {
+    assert_eq!(coeffs.len(), bases.len());
+
+    let num_threads = multicore::current_num_threads();
+    if coeffs.len() > num_threads {
+        // More terms than threads: split the input into chunks of
+        // `len / num_threads` terms (non-zero because `len > num_threads`),
+        // spawn one `multiexp_serial` per chunk on the `multicore` scope, each
+        // writing into its own slot of `results`, then add up the partial sums.
+        let chunk = coeffs.len() / num_threads;
+        let num_chunks = coeffs.chunks(chunk).len();
+        let mut results = vec![C::Curve::identity(); num_chunks];
+        multicore::scope(|scope| {
+            for ((coeffs, bases), acc) in coeffs
+                .chunks(chunk)
+                .zip(bases.chunks(chunk))
+                .zip(results.iter_mut())
+            {
+                scope.spawn(move |_| {
+                    multiexp_serial(coeffs, bases, acc);
+                });
+            }
+        });
+        results.iter().fold(C::Curve::identity(), |a, b| a + b)
+    } else {
+        // No more terms than threads: compute everything in one serial call.
+        let mut acc = C::Curve::identity();
+        multiexp_serial(coeffs, bases, &mut acc);
+        acc
+    }
+}
+
+/// Performs a radix-$2$ Fast-Fourier Transformation (FFT) on a vector of size
+/// $n = 2^k$, when provided `log_n` = $k$ and an element of multiplicative
+/// order $n$ called `omega` ($\omega$). The result is that the vector `a`, when
+/// interpreted as the coefficients of a polynomial of degree $n - 1$, is
+/// transformed into the evaluations of this polynomial at each of the $n$
+/// distinct powers of $\omega$. This transformation is invertible by providing
+/// $\omega^{-1}$ in place of $\omega$ and dividing each resulting field element
+/// by $n$.
+///
+/// This will use multithreading if beneficial.
+pub fn best_fft<Scalar: Field, G: FftGroup<Scalar>>(a: &mut [G], omega: Scalar, log_n: u32) {
+    // Reverses the order of the low `l` bits of `n`.
+    fn bitreverse(mut n: usize, l: usize) -> usize {
+        let mut r = 0;
+        for _ in 0..l {
+            r = (r << 1) | (n & 1);
+            n >>= 1;
+        }
+        r
+    }
+
+    let threads = multicore::current_num_threads();
+    let log_threads = log2_floor(threads);
+    let n = a.len();
+    assert_eq!(n, 1 << log_n);
+
+    // Put the input into bit-reversed order; `k < rk` swaps each pair once.
+    for k in 0..n {
+        let rk = bitreverse(k, log_n as usize);
+        if k < rk {
+            a.swap(rk, k);
+        }
+    }
+
+    // precompute twiddle factors: omega^0, omega^1, ..., omega^(n/2 - 1)
+    let twiddles: Vec<_> = (0..(n / 2))
+        .scan(Scalar::ONE, |w, _| {
+            let tw = *w;
+            *w *= &omega;
+            Some(tw)
+        })
+        .collect();
+
+    if log_n <= log_threads {
+        // Small input relative to the thread count: run the `log_n` butterfly
+        // layers iteratively, doubling the chunk size after each layer.
+        let mut chunk = 2_usize;
+        let mut twiddle_chunk = n / 2;
+        for _ in 0..log_n {
+            a.chunks_mut(chunk).for_each(|coeffs| {
+                let (left, right) = coeffs.split_at_mut(chunk / 2);
+
+                // case when twiddle factor is one
+                let (a, left) = left.split_at_mut(1);
+                let (b, right) = right.split_at_mut(1);
+                let t = b[0];
+                b[0] = a[0];
+                a[0] += &t;
+                b[0] -= &t;
+
+                left.iter_mut()
+                    .zip(right.iter_mut())
+                    .enumerate()
+                    .for_each(|(i, (a, b))| {
+                        let mut t = *b;
+                        t *= &twiddles[(i + 1) * twiddle_chunk];
+                        *b = *a;
+                        *a += &t;
+                        *b -= &t;
+                    });
+            });
+            chunk *= 2;
+            twiddle_chunk /= 2;
+        }
+    } else {
+        recursive_butterfly_arithmetic(a, n, 1, &twiddles)
+    }
+}
+
+/// Performs the butterfly arithmetic of the FFT recursively.
+///
+/// The two halves of `a` are processed through `multicore::join` and then
+/// combined using every `twiddle_chunk`-th entry of `twiddles`. `best_fft`
+/// calls this with `n = a.len()` and `twiddle_chunk = 1` after it has put the
+/// input into bit-reversed order.
+pub fn recursive_butterfly_arithmetic<Scalar: Field, G: FftGroup<Scalar>>(
+    a: &mut [G],
+    n: usize,
+    twiddle_chunk: usize,
+    twiddles: &[Scalar],
+) {
+    if n == 2 {
+        let t = a[1];
+        a[1] = a[0];
+        a[0] += &t;
+        a[1] -= &t;
+    } else {
+        let (left, right) = a.split_at_mut(n / 2);
+        multicore::join(
+            || recursive_butterfly_arithmetic(left, n / 2, twiddle_chunk * 2, twiddles),
+            || recursive_butterfly_arithmetic(right, n / 2, twiddle_chunk * 2, twiddles),
+        );
+
+        // case when twiddle factor is one
+        let (a, left) = left.split_at_mut(1);
+        let (b, right) = right.split_at_mut(1);
+        let t = b[0];
+        b[0] = a[0];
+        a[0] += &t;
+        b[0] -= &t;
+
+        left.iter_mut()
+            .zip(right.iter_mut())
+            .enumerate()
+            .for_each(|(i, (a, b))| {
+                let mut t = *b;
+                t *= &twiddles[(i + 1) * twiddle_chunk];
+                *b = *a;
+                *a += &t;
+                *b -= &t;
+            });
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index 3fa8e98f..670a6448 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,5 +1,8 @@
 mod arithmetic;
+pub mod fft;
 pub mod hash_to_curve;
+pub mod msm;
+pub mod multicore;
 #[macro_use]
 pub mod legendre;
 pub mod serde;
diff --git a/src/msm.rs b/src/msm.rs
new file mode 100644
index 00000000..de30be55
--- /dev/null
+++ b/src/msm.rs
@@ -0,0 +1,153 @@
+use ff::PrimeField;
+use group::Group;
+use pasta_curves::arithmetic::CurveAffine;
+
+use crate::multicore;
+
+/// Serial multi-exponentiation of `bases` by `coeffs`, accumulated into `acc`
+/// with a bucket (windowed) method.
+///
+/// `acc` is doubled `c` times before each window is processed, so callers that
+/// want the plain sum pass in the identity, as `best_multiexp` does.
+///
+/// Coefficients and bases are paired with `zip`, so surplus entries of the
+/// longer slice are ignored.
+pub fn multiexp_serial<C: CurveAffine>(coeffs: &[C::Scalar], bases: &[C], acc: &mut C::Curve) {
+    let coeffs: Vec<_> = coeffs.iter().map(|a| a.to_repr()).collect();
+
+    // Window width in bits, chosen from the number of bases.
+    let c = if bases.len() < 4 {
+        1
+    } else if bases.len() < 32 {
+        3
+    } else {
+        (f64::from(bases.len() as u32)).ln().ceil() as usize
+    };
+
+    // Extracts the `c`-bit window number `segment` from `bytes`, reading the
+    // representation as a little-endian integer. Windows that start past the
+    // end of the representation are zero.
+    // NOTE(review): assumes `F::Repr` is little-endian -- confirm for every
+    // field this is used with.
+    fn get_at<F: PrimeField>(segment: usize, c: usize, bytes: &F::Repr) -> usize {
+        let skip_bits = segment * c;
+        let skip_bytes = skip_bits / 8;
+
+        // Use the actual representation length instead of a hard-coded 32 bytes.
+        if skip_bytes >= bytes.as_ref().len() {
+            return 0;
+        }
+
+        let mut v = [0; 8];
+        for (v, o) in v.iter_mut().zip(bytes.as_ref()[skip_bytes..].iter()) {
+            *v = *o;
+        }
+
+        let mut tmp = u64::from_le_bytes(v);
+        tmp >>= skip_bits - (skip_bytes * 8);
+        tmp %= 1 << c;
+
+        tmp as usize
+    }
+
+    // Byte length of a scalar's `Repr` (32 for a 32-byte representation, which
+    // gives the former `256 / c + 1` windows); derived from the type instead of
+    // being hard-coded.
+    let repr_bytes = <C::Scalar as PrimeField>::Repr::default().as_ref().len();
+    let segments = (repr_bytes * 8 / c) + 1;
+
+    for current_segment in (0..segments).rev() {
+        for _ in 0..c {
+            *acc = acc.double();
+        }
+
+        // A bucket starts empty, holds an affine point after the first
+        // addition and a projective point after any further one.
+        #[derive(Clone, Copy)]
+        enum Bucket<C: CurveAffine> {
+            None,
+            Affine(C),
+            Projective(C::Curve),
+        }
+
+        impl<C: CurveAffine> Bucket<C> {
+            // Adds the affine point `other` into this bucket.
+            fn add_assign(&mut self, other: &C) {
+                *self = match *self {
+                    Bucket::None => Bucket::Affine(*other),
+                    Bucket::Affine(a) => Bucket::Projective(a + *other),
+                    Bucket::Projective(mut a) => {
+                        a += *other;
+                        Bucket::Projective(a)
+                    }
+                }
+            }
+
+            // Returns `other` plus the content of this bucket.
+            fn add(self, mut other: C::Curve) -> C::Curve {
+                match self {
+                    Bucket::None => other,
+                    Bucket::Affine(a) => {
+                        other += a;
+                        other
+                    }
+                    Bucket::Projective(a) => 
other + a,
+                }
+            }
+        }
+
+        // One bucket per non-zero window value; value `v` lives at index `v - 1`.
+        let mut buckets: Vec<Bucket<C>> = vec![Bucket::None; (1 << c) - 1];
+
+        for (coeff, base) in coeffs.iter().zip(bases.iter()) {
+            let coeff = get_at::<C::Scalar>(current_segment, c, coeff);
+            if coeff != 0 {
+                buckets[coeff - 1].add_assign(base);
+            }
+        }
+
+        // Summation by parts
+        // e.g. 3a + 2b + 1c = a +
+        //                    (a) + b +
+        //                    ((a) + b) + c
+        let mut running_sum = C::Curve::identity();
+        for exp in buckets.into_iter().rev() {
+            running_sum = exp.add(running_sum);
+            *acc += &running_sum;
+        }
+    }
+}
+
+/// Performs a small multi-exponentiation operation.
+/// Uses the double-and-add algorithm with doublings shared across points.
+///
+/// `bases` is indexed with the positions of `coeffs`, so it must hold at least
+/// `coeffs.len()` elements; a missing base causes a panic as soon as the
+/// corresponding coefficient has a set bit.
+pub fn small_multiexp<C: CurveAffine>(coeffs: &[C::Scalar], bases: &[C]) -> C::Curve {
+    let coeffs: Vec<_> = coeffs.iter().map(|a| a.to_repr()).collect();
+    let mut acc = C::Curve::identity();
+
+    // Byte length of a scalar's `Repr`; derived from the type instead of a
+    // hard-coded 32 so that other representation sizes are covered as well.
+    let repr_bytes = <C::Scalar as PrimeField>::Repr::default().as_ref().len();
+
+    // for byte idx, from the last byte of the representation down to the first
+    for byte_idx in (0..repr_bytes).rev() {
+        // for bit idx
+        for bit_idx in (0..8).rev() {
+            acc = acc.double();
+            // for each coeff
+            for coeff_idx in 0..coeffs.len() {
+                let byte = coeffs[coeff_idx].as_ref()[byte_idx];
+                if ((byte >> bit_idx) & 1) != 0 {
+                    acc += bases[coeff_idx];
+                }
+            }
+        }
+    }
+
+    acc
+}
+
+/// Performs a multi-exponentiation operation.
+///
+/// This function will panic if coeffs and bases have a different length.
+///
+/// This will use multithreading if beneficial.
+pub fn best_multiexp<C: CurveAffine>(coeffs: &[C::Scalar], bases: &[C]) -> C::Curve {
+    assert_eq!(coeffs.len(), bases.len());
+
+    let num_threads = multicore::current_num_threads();
+    if coeffs.len() > num_threads {
+        // More terms than threads: split the input into chunks of
+        // `len / num_threads` terms (non-zero because `len > num_threads`),
+        // spawn one `multiexp_serial` per chunk on the `multicore` scope, each
+        // writing into its own slot of `results`, then add up the partial sums.
+        let chunk = coeffs.len() / num_threads;
+        let num_chunks = coeffs.chunks(chunk).len();
+        let mut results = vec![C::Curve::identity(); num_chunks];
+        multicore::scope(|scope| {
+            for ((coeffs, bases), acc) in coeffs
+                .chunks(chunk)
+                .zip(bases.chunks(chunk))
+                .zip(results.iter_mut())
+            {
+                scope.spawn(move |_| {
+                    multiexp_serial(coeffs, bases, acc);
+                });
+            }
+        });
+        results.iter().fold(C::Curve::identity(), |a, b| a + b)
+    } else {
+        // No more terms than threads: compute everything in one serial call.
+        let mut acc = C::Curve::identity();
+        multiexp_serial(coeffs, bases, &mut acc);
+        acc
+    }
+}
diff --git a/src/multicore.rs b/src/multicore.rs
new file mode 100644
index 00000000..d8323553
--- /dev/null
+++ b/src/multicore.rs
@@ -0,0 +1,17 @@
+pub use maybe_rayon::{
+    iter::{IntoParallelIterator, IntoParallelRefMutIterator, ParallelIterator},
+    join, scope, Scope,
+};
+
+#[cfg(feature = "multicore")]
+pub use maybe_rayon::{
+    current_num_threads,
+    iter::{IndexedParallelIterator, IntoParallelRefIterator},
+    slice::ParallelSliceMut,
+};
+
+/// Fallback used when the `multicore` feature is disabled: always reports one thread.
+#[cfg(not(feature = "multicore"))]
+pub fn current_num_threads() -> usize {
+    1
+}