Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Add batched MSM operations to speed up small circuits #6

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion halo2_proofs/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ lazy_static = { version = "1", optional = true }
env_logger = "0.10.0"

# GPU Icicle integration
icicle = { git = "https://github.com/ingonyama-zk/icicle.git", branch = "rust/large-bucket-factor-msm", optional = true }
icicle = { git = "https://github.com/ingonyama-zk/icicle.git", tag = "v0.1.0", optional = true }
rustacuda = { version = "0.1", optional = true }

# Developer tooling dependencies
Expand Down Expand Up @@ -100,6 +100,7 @@ batch = ["rand_core/getrandom"]
circuit-params = []
counter = ["lazy_static"]
icicle_gpu = ["icicle", "rustacuda"]
profile=[]

[lib]
bench = false
Expand Down
26 changes: 24 additions & 2 deletions halo2_proofs/src/arithmetic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -146,9 +146,31 @@ pub fn small_multiexp<C: CurveAffine>(coeffs: &[C::Scalar], bases: &[C]) -> C::C
#[cfg(feature = "icicle_gpu")]
/// Performs a multi-exponentiation operation on GPU using Icicle library
///
/// The scalars in `coeffs` are first copied into a device buffer with
/// `icicle::copy_scalars_to_device`; that buffer and the `is_lagrange` flag are then handed to
/// `icicle::multiexp_on_device`, whose result is returned unchanged.
///
/// No bases are passed in here: which bases are used is decided by `multiexp_on_device` from
/// `is_lagrange`.
pub fn best_multiexp_gpu<C: CurveAffine>(coeffs: &[C::Scalar], is_lagrange: bool) -> C::Curve {
let scalars_ptr: DeviceBuffer<::icicle::curves::bn254::ScalarField_BN254> = icicle::copy_scalars_to_device::<C>(coeffs);
let scalars_ptr: DeviceBuffer<::icicle::curves::bn254::ScalarField_BN254> =
icicle::copy_scalars_to_device::<C>(coeffs);

return icicle::multiexp_on_device::<C>(scalars_ptr, is_lagrange);
icicle::multiexp_on_device::<C>(scalars_ptr, is_lagrange)
}

#[cfg(feature = "icicle_gpu")]
/// Performs a batch multi-exponentiation operation on GPU using Icicle library
///
/// `bases` is replicated `batch_size` times back to back, so that the device receives exactly one
/// base per scalar; `coeffs` must therefore hold `bases.len() * batch_size` scalars
/// (presumably `batch_size` consecutive scalar vectors, one per MSM — confirm against the
/// caller). The replicated bases and the scalars are copied to the device and passed to
/// `icicle::batch_multiexp_on_device`, whose result vector is returned.
///
/// # Panics
///
/// Panics if `coeffs.len() != bases.len() * batch_size`.
pub fn best_batch_multiexp_gpu<C: CurveAffine>(
    coeffs: &[C::Scalar],
    bases: &[C],
    batch_size: usize,
) -> Vec<C::Curve> {
    // Replicate the bases on the host and validate the shapes *before* touching the device,
    // so a malformed call fails without allocating any device memory.
    let all_bases = bases.repeat(batch_size);
    assert_eq!(
        coeffs.len(),
        all_bases.len(),
        "batched MSM needs exactly one scalar per replicated base"
    );

    let scalars_ptr: DeviceBuffer<::icicle::curves::bn254::ScalarField_BN254> =
        icicle::copy_scalars_to_device::<C>(coeffs);
    let bases_ptr: DeviceBuffer<::icicle::curves::bn254::PointAffineNoInfinity_BN254> =
        icicle::copy_points_to_device::<C>(all_bases.as_slice());

    icicle::batch_multiexp_on_device::<C>(scalars_ptr, bases_ptr, batch_size)
}

/// Performs a multi-exponentiation operation.
Expand Down
129 changes: 89 additions & 40 deletions halo2_proofs/src/icicle.rs
Original file line number Diff line number Diff line change
@@ -1,21 +1,30 @@
use group::ff::PrimeField;
use icicle::{curves::bn254::{Point_BN254, ScalarField_BN254}, test_bn254::commit_bn254};
use icicle::{
curves::bn254::{Point_BN254, ScalarField_BN254},
test_bn254::{commit_batch_bn254, commit_bn254},
};
use std::sync::{Arc, Once};

pub use icicle::curves::bn254::PointAffineNoInfinity_BN254;
use rustacuda::memory::CopyDestination;
use rustacuda::prelude::*;

pub use halo2curves::CurveAffine;
use std::{mem, env};
use log::info;
use std::{env, mem};

static mut GPU_CONTEXT: Option<Context> = None;
static mut GPU_G: Option<DeviceBuffer<PointAffineNoInfinity_BN254>> = None;
static mut GPU_G_LAGRANGE: Option<DeviceBuffer<PointAffineNoInfinity_BN254>> = None;
static GPU_INIT: Once = Once::new();

pub fn should_use_cpu_msm(size: usize) -> bool {
size <= (1 << u8::from_str_radix(&env::var("ICICLE_SMALL_K").unwrap_or("8".to_string()), 10).unwrap())
/// Reports whether `size` is at most the "small circuit" threshold `2^k`.
///
/// `k` is read from the `ICICLE_SMALL_CIRCUIT` environment variable on every call and defaults
/// to `8` when the variable cannot be read. When `k` is at least the bit width of `usize`, the
/// threshold saturates to `usize::MAX` (every size counts as small) instead of overflowing the
/// shift.
///
/// # Panics
///
/// Panics if `ICICLE_SMALL_CIRCUIT` is set but is not a decimal integer that fits in a `u8`.
pub fn is_small_circuit(size: usize) -> bool {
    let log2_threshold: u8 = env::var("ICICLE_SMALL_CIRCUIT")
        .map_or(Ok(8), |raw| raw.parse::<u8>())
        .expect("ICICLE_SMALL_CIRCUIT must be a decimal integer in 0..=255");

    // `1 << k` would panic (debug) or wrap the shift amount (release) once `k` reaches the bit
    // width of `usize`; `checked_shl` lets us saturate instead.
    let threshold = 1usize
        .checked_shl(u32::from(log2_threshold))
        .unwrap_or(usize::MAX);

    size <= threshold
}

pub fn init_gpu<C: CurveAffine>(g: &[C], g_lagrange: &[C]) {
Expand All @@ -24,20 +33,27 @@ pub fn init_gpu<C: CurveAffine>(g: &[C], g_lagrange: &[C]) {
GPU_CONTEXT = Some(rustacuda::quick_init().unwrap());
GPU_G = Some(copy_points_to_device(g));
GPU_G_LAGRANGE = Some(copy_points_to_device(g_lagrange));
info!("GPU initialized");
});
}
}

fn u32_from_u8(u8_arr: &[u8;32]) -> [u32;8]{
let mut t = [0u32;8];
for i in 0..8{
t[i] = u32::from_le_bytes([u8_arr[4*i],u8_arr[4*i+1],u8_arr[4*i+2],u8_arr[4*i+3]]);
/// Packs 32 bytes into eight `u32` limbs.
///
/// Limb `i` is built from bytes `4*i..4*i+4` of `u8_arr`, interpreted in little-endian order.
fn u32_from_u8(u8_arr: &[u8; 32]) -> [u32; 8] {
    let mut limbs = [0u32; 8];
    for (limb, quad) in limbs.iter_mut().zip(u8_arr.chunks_exact(4)) {
        *limb = u32::from_le_bytes([quad[0], quad[1], quad[2], quad[3]]);
    }
    limbs
}

fn repr_from_u32<C: CurveAffine>(u32_arr: &[u32;8]) -> <C as CurveAffine>::Base {
let t : &[<<C as CurveAffine>::Base as PrimeField>::Repr] = unsafe { mem::transmute(&u32_arr[..]) };
/// Builds a base-field element of `C` from eight 32-bit limbs.
///
/// Limb `i` is written to bytes `4*i..4*i+4` of the field's byte representation in
/// little-endian order (the inverse of `u32_from_u8`), and the result is decoded with
/// `PrimeField::from_repr`.
///
/// This replaces a `transmute` of `&[u32]` into `&[Repr]`, which kept the slice length (8) while
/// the element size grew, producing a reference that claimed memory well beyond the 32-byte
/// input. On little-endian hosts the bytes handed to `from_repr` are the same as before.
///
/// NOTE(review): assumes `Repr` is (at least) 32 bytes and that `Repr::default()` is all zeroes,
/// as is the case for array-backed representations — confirm for any curve other than BN254.
///
/// # Panics
///
/// Panics if the bytes are not a valid representation of a field element (`from_repr` returns
/// none).
fn repr_from_u32<C: CurveAffine>(u32_arr: &[u32; 8]) -> <C as CurveAffine>::Base {
    let mut repr: <<C as CurveAffine>::Base as PrimeField>::Repr = Default::default();
    for (bytes, limb) in repr.as_mut().chunks_exact_mut(4).zip(u32_arr.iter()) {
        bytes.copy_from_slice(&limb.to_le_bytes());
    }
    PrimeField::from_repr(repr).unwrap()
}

Expand All @@ -50,54 +66,74 @@ fn icicle_scalars_from_c<C: CurveAffine>(coeffs: &[C::Scalar]) -> Vec<ScalarFiel
let _coeffs = [Arc::new(
coeffs.iter().map(|x| x.to_repr()).collect::<Vec<_>>(),
)];

let _coeffs: &Arc<Vec<[u32;8]>> = unsafe { mem::transmute(&_coeffs) };
_coeffs.iter().map(|x| {
ScalarField_BN254::from_limbs(x)
}).collect::<Vec<_>>()

let _coeffs: &Arc<Vec<[u32; 8]>> = unsafe { mem::transmute(&_coeffs) };
_coeffs
.iter()
.map(|x| ScalarField_BN254::from_limbs(x))
.collect::<Vec<_>>()
}

pub fn copy_scalars_to_device<C: CurveAffine>(coeffs: &[C::Scalar]) -> DeviceBuffer<ScalarField_BN254> {
/// Converts `coeffs` with `icicle_scalars_from_c` and creates a `DeviceBuffer` from the result.
///
/// # Panics
///
/// Panics if `DeviceBuffer::from_slice` returns an error.
pub fn copy_scalars_to_device<C: CurveAffine>(
    coeffs: &[C::Scalar],
) -> DeviceBuffer<ScalarField_BN254> {
    let host_scalars = icicle_scalars_from_c::<C>(coeffs);
    let device_scalars = DeviceBuffer::from_slice(host_scalars.as_slice());
    device_scalars.unwrap()
}

/// Converts a slice of curve points into Icicle `PointAffineNoInfinity_BN254` values.
///
/// For every base, `coordinates().unwrap()` is taken and the `x` and `y` coordinates are
/// serialized with `to_repr()`. The collected representations are then reinterpreted (via
/// `mem::transmute` on a reference) as pairs of `[u8; 32]`, each half is packed into `u32`
/// limbs with `u32_from_u8`, and the limbs are passed to
/// `PointAffineNoInfinity_BN254::from_limbs`.
///
/// NOTE(review): the `unwrap()` on `coordinates()` presumably means the point at infinity is
/// not accepted here — confirm. The transmute relies on the field representation being exactly
/// `[u8; 32]`; verify before using with another curve.
fn icicle_points_from_c<C: CurveAffine>(bases: &[C]) -> Vec<PointAffineNoInfinity_BN254> {
let _bases = [Arc::new(
bases.iter().map(|p| {
let coordinates = p.coordinates().unwrap();
[coordinates.x().to_repr(),coordinates.y().to_repr()]
}).collect::<Vec<_>>(),
bases
.iter()
.map(|p| {
let coordinates = p.coordinates().unwrap();
[coordinates.x().to_repr(), coordinates.y().to_repr()]
})
.collect::<Vec<_>>(),
)];

let _bases: &Arc<Vec<[[u8;32];2]>> = unsafe { mem::transmute(&_bases) };
_bases.iter().map(|x| {
let tx = u32_from_u8(&x[0]);
let ty = u32_from_u8(&x[1]);
PointAffineNoInfinity_BN254::from_limbs(&tx,&ty)
}).collect::<Vec<_>>()

let _bases: &Arc<Vec<[[u8; 32]; 2]>> = unsafe { mem::transmute(&_bases) };
_bases
.iter()
.map(|x| {
let tx = u32_from_u8(&x[0]);
let ty = u32_from_u8(&x[1]);
PointAffineNoInfinity_BN254::from_limbs(&tx, &ty)
})
.collect::<Vec<_>>()
}

pub fn copy_points_to_device<C: CurveAffine>(bases: &[C]) -> DeviceBuffer<PointAffineNoInfinity_BN254> {
/// Converts `bases` with `icicle_points_from_c` and creates a `DeviceBuffer` from the result.
///
/// # Panics
///
/// Panics if `DeviceBuffer::from_slice` returns an error.
pub fn copy_points_to_device<C: CurveAffine>(
    bases: &[C],
) -> DeviceBuffer<PointAffineNoInfinity_BN254> {
    let host_points = icicle_points_from_c(bases);
    let device_points = DeviceBuffer::from_slice(host_points.as_slice());
    device_points.unwrap()
}

fn c_from_icicle_point<C: CurveAffine>(commit_res: Point_BN254) -> C::Curve {
let (x , y) = if is_infinity_point(commit_res){
(repr_from_u32::<C>(&[0u32;8]), repr_from_u32::<C>(&[0u32;8]))
} else{
/// Converts an Icicle `Point_BN254` result into a `C::Curve` point.
///
/// When `is_infinity_point(commit_res)` holds, both coordinates are built from all-zero limbs;
/// otherwise the point is converted with `to_affine()` and the limb arrays `x.s` / `y.s` are
/// decoded with `repr_from_u32`. The coordinates are then passed to `C::from_xy`, and the
/// resulting affine point is returned via `to_curve()`.
///
/// # Panics
///
/// Panics if `C::from_xy(x, y)` yields no point (the `unwrap()` below).
fn c_from_icicle_point<C: CurveAffine>(commit_res: Point_BN254) -> C::Curve {
let (x, y) = if is_infinity_point(commit_res) {
(
repr_from_u32::<C>(&[0u32; 8]),
repr_from_u32::<C>(&[0u32; 8]),
)
} else {
let affine_res_from_cuda = commit_res.to_affine();
(repr_from_u32::<C>(&affine_res_from_cuda.x.s), repr_from_u32::<C>(&affine_res_from_cuda.y.s))
(
repr_from_u32::<C>(&affine_res_from_cuda.x.s),
repr_from_u32::<C>(&affine_res_from_cuda.y.s),
)
};

let affine = C::from_xy(x,y).unwrap();
let affine = C::from_xy(x, y).unwrap();
return affine.to_curve();
}

pub fn multiexp_on_device<C: CurveAffine>(mut coeffs: DeviceBuffer<ScalarField_BN254>, is_lagrange: bool) -> C::Curve {
pub fn multiexp_on_device<C: CurveAffine>(
mut coeffs: DeviceBuffer<ScalarField_BN254>,
is_lagrange: bool,
) -> C::Curve {
let base_ptr: &mut DeviceBuffer<PointAffineNoInfinity_BN254>;
unsafe {
if is_lagrange {
Expand All @@ -110,10 +146,23 @@ pub fn multiexp_on_device<C: CurveAffine>(mut coeffs: DeviceBuffer<ScalarField_B
let d_commit_result = commit_bn254(base_ptr, &mut coeffs, 10);

let mut h_commit_result = Point_BN254::zero();
d_commit_result
.copy_to(&mut h_commit_result)
.unwrap();
d_commit_result.copy_to(&mut h_commit_result).unwrap();

c_from_icicle_point::<C>(h_commit_result)
}

pub fn batch_multiexp_on_device<C: CurveAffine>(
mut coeffs: DeviceBuffer<ScalarField_BN254>,
mut bases: DeviceBuffer<PointAffineNoInfinity_BN254>,
batch_size: usize,
) -> Vec<C::Curve> {
let d_commit_result = commit_batch_bn254(&mut bases, &mut coeffs, batch_size);
let mut h_commit_result: Vec<Point_BN254> =
(0..batch_size).map(|_| Point_BN254::zero()).collect();
d_commit_result.copy_to(&mut h_commit_result[..]).unwrap();

h_commit_result
.iter()
.map(|commit_result| c_from_icicle_point::<C>(*commit_result))
.collect()
}
50 changes: 39 additions & 11 deletions halo2_proofs/src/plonk/permutation/prover.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ use crate::{
transcript::{EncodedChallenge, TranscriptWrite},
};

#[cfg(feature = "icicle_gpu")]
use crate::icicle;

pub(crate) struct CommittedSet<C: CurveAffine> {
pub(crate) permutation_product_poly: Polynomial<C::Scalar, Coeff>,
pub(crate) permutation_product_coset: Polynomial<C::Scalar, ExtendedLagrangeCoeff>,
Expand Down Expand Up @@ -80,6 +83,8 @@ impl Argument {
let mut last_z = C::Scalar::ONE;

let mut sets = vec![];
let mut z_set = vec![];
let mut blind_set = vec![];

for (columns, permutations) in self
.columns
Expand Down Expand Up @@ -165,21 +170,13 @@ impl Argument {
}
// Set new last_z
last_z = z[params.n() as usize - (blinding_factors + 1)];

let blind = Blind(C::Scalar::random(&mut rng));

let permutation_product_commitment_projective = params.commit_lagrange(&z, blind);
let permutation_product_blind = blind;
z_set.push(z.clone());
blind_set.push(blind);
let z = domain.lagrange_to_coeff(z);
let permutation_product_poly = z.clone();

let permutation_product_coset = domain.coeff_to_extended(z.clone());

let permutation_product_commitment =
permutation_product_commitment_projective.to_affine();

// Hash the permutation product commitment
transcript.write_point(permutation_product_commitment)?;
let permutation_product_blind = blind;

sets.push(CommittedSet {
permutation_product_poly,
Expand All @@ -188,6 +185,37 @@ impl Argument {
});
}

#[cfg(feature = "icicle_gpu")]
if std::env::var("ENABLE_ICICLE_GPU").is_ok() && icicle::is_small_circuit(z_set[0].len()) {
let permutation_product_commitment_projectives =
params.commit_lagrange_batch(&z_set, &blind_set);
permutation_product_commitment_projectives
.iter()
.for_each(|commitment_projective| {
let permutation_product_commitment = commitment_projective.to_affine();

// Hash the permutation product commitment
transcript
.write_point(permutation_product_commitment)
.unwrap();
});

return Ok(Committed { sets });
}

// NOTE: `commit_lagrange` checks for the `icicle_gpu` feature internally, so the decision to fall back to the CPU
// is left to it rather than duplicating that logic here for builds where `icicle_gpu` is not enabled.
z_set.iter().zip(blind_set.iter()).for_each(|(z, blind)| {
let permutation_product_commitment_projective = params.commit_lagrange(&z, *blind);
let permutation_product_commitment =
permutation_product_commitment_projective.to_affine();

// Hash the permutation product commitment
transcript
.write_point(permutation_product_commitment)
.unwrap();
});

Ok(Committed { sets })
}
}
Expand Down
Loading
Loading