Skip to content

Commit

Permalink
icicle v2 msm and ntt integration
Browse files Browse the repository at this point in the history
  • Loading branch information
emirsoyturk committed Sep 4, 2024
1 parent 930970a commit bca2899
Show file tree
Hide file tree
Showing 10 changed files with 167 additions and 177 deletions.
6 changes: 4 additions & 2 deletions halo2_proofs/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,9 @@ env_logger = "0.10.0"
rustc-hash = "2.0.0"
lazy_static = "1.4.0"
# GPU Icicle integration
icicle = { git = "https://github.com/ingonyama-zk/icicle.git", branch = "rust/large-bucket-factor-msm", optional = true }
icicle-core = { git = "https://github.com/ingonyama-zk/icicle", branch="ezkl-icicle2", package="icicle-core", optional = true }
icicle-bn254 = { git = "https://github.com/ingonyama-zk/icicle", branch="ezkl-icicle2", package="icicle-bn254", optional = true }
icicle-cuda-runtime = { git = "https://github.com/ingonyama-zk/icicle", branch="ezkl-icicle2", package="icicle-cuda-runtime", optional = true }
rustacuda = { version = "0.1", optional = true }
serde_derive = { version = "1", optional = true}
bincode = { version = "1.3.3", default_features = false }
Expand Down Expand Up @@ -107,7 +109,7 @@ sanity-checks = []
batch = ["rand_core/getrandom"]
circuit-params = []
counter = []
icicle_gpu = ["icicle", "rustacuda"]
icicle_gpu = ["icicle-cuda-runtime", "icicle-core", "icicle-bn254"]
mv-lookup = []
cost-estimator = ["serde_derive"]
derive_serde = ["halo2curves/derive_serde"]
Expand Down
4 changes: 2 additions & 2 deletions halo2_proofs/benches/arithmetic.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#[macro_use]
extern crate criterion;

use crate::arithmetic::best_multiexp_cpu;
use crate::arithmetic::best_multiexp;
use crate::halo2curves::pasta::{EqAffine, Fp};
use group::ff::Field;
use halo2_proofs::*;
Expand All @@ -27,7 +27,7 @@ fn criterion_benchmark(c: &mut Criterion) {
c.bench_function("double-and-add", |b| {
b.iter(|| {
for (g_lo, g_hi) in g_lo.iter().zip(g_hi.iter()) {
best_multiexp_cpu(&[black_box(coeff_1), black_box(coeff_2)], &[*g_lo, *g_hi]);
best_multiexp(&[black_box(coeff_1), black_box(coeff_2)], &[*g_lo, *g_hi]);
}
})
});
Expand Down
67 changes: 58 additions & 9 deletions halo2_proofs/src/arithmetic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,15 @@

#[cfg(feature = "icicle_gpu")]
use super::icicle;
#[cfg(feature = "icicle_gpu")]
use std::env;
use super::multicore;
pub use ff::Field;
use group::{
ff::{BatchInvert, PrimeField},
prime::PrimeCurveAffine,
Curve, GroupOpsOwned, ScalarMulOwned,
};
#[cfg(feature = "icicle_gpu")]
use rustacuda::prelude::DeviceBuffer;

use halo2curves::msm::msm_best;
pub use halo2curves::{CurveAffine, CurveExt};
Expand All @@ -31,6 +31,24 @@ where
{
}

/// Computes the multi-scalar multiplication of `bases` by `coeffs`, picking a backend.
///
/// In builds with the `icicle_gpu` feature, the GPU path (`best_multiexp_gpu`) is
/// taken only when all three conditions hold:
///
/// * the `ENABLE_ICICLE_GPU` environment variable can be read,
/// * `icicle::should_use_cpu_msm` does not claim the input for the CPU, and
/// * `icicle::is_gpu_supported_field` accepts the scalar type.
///
/// In every other situation, including builds without the feature, the work is
/// delegated to `best_multiexp_cpu`.
pub fn best_multiexp<C: CurveAffine>(
    coeffs: &[C::Scalar], bases: &[C]
) -> C::Curve {
    #[cfg(feature = "icicle_gpu")]
    {
        // The conditions are chained with `&&`, so `coeffs[0]` is only touched
        // once the size check has already let the input through.
        let gpu_eligible = env::var("ENABLE_ICICLE_GPU").is_ok()
            && !icicle::should_use_cpu_msm(coeffs.len())
            && icicle::is_gpu_supported_field(&coeffs[0]);

        if gpu_eligible {
            return best_multiexp_gpu(coeffs, bases);
        }
    }

    best_multiexp_cpu(coeffs, bases)
}

// [JPW] Keep this adapter to halo2curves to minimize code changes.
/// Performs a multi-exponentiation operation.
///
Expand All @@ -43,15 +61,12 @@ pub fn best_multiexp_cpu<C: CurveAffine>(coeffs: &[C::Scalar], bases: &[C]) -> C

#[cfg(feature = "icicle_gpu")]
/// Performs a multi-exponentiation operation on GPU using Icicle library
pub fn best_multiexp_gpu<C: CurveAffine>(coeffs: &[C::Scalar], is_lagrange: bool) -> C::Curve {
let scalars_ptr: DeviceBuffer<::icicle::curves::bn254::ScalarField_BN254> =
icicle::copy_scalars_to_device::<C>(coeffs);

return icicle::multiexp_on_device::<C>(scalars_ptr, is_lagrange);
pub fn best_multiexp_gpu<C: CurveAffine>(coeffs: &[C::Scalar], g: &[C]) -> C::Curve {
icicle::multiexp_on_device::<C>(coeffs, g)
}

/// Dispatcher
pub fn best_fft<Scalar: Field, G: FftGroup<Scalar>>(
pub fn best_fft_cpu<Scalar: Field, G: FftGroup<Scalar>>(
a: &mut [G],
omega: Scalar,
log_n: u32,
Expand All @@ -61,6 +76,40 @@ pub fn best_fft<Scalar: Field, G: FftGroup<Scalar>>(
fft::fft(a, omega, log_n, data, inverse);
}

/// Runs an in-place FFT over `scalars`, picking a backend.
///
/// In builds with the `icicle_gpu` feature, the GPU path (`best_fft_gpu`) is taken
/// only when all three conditions hold:
///
/// * the `ENABLE_ICICLE_GPU` environment variable can be read,
/// * `icicle::should_use_cpu_fft` does not claim the input for the CPU, and
/// * `icicle::is_gpu_supported_field` accepts the type of `omega`.
///
/// In every other situation, including builds without the feature, the work is
/// delegated to `best_fft_cpu`. Note that `data` is only consumed by the CPU path.
pub fn best_fft<Scalar: Field + ff::PrimeField, G: FftGroup<Scalar> + ff::PrimeField>(
    scalars: &mut [G],
    omega: Scalar,
    log_n: u32,
    data: &FFTData<Scalar>,
    inverse: bool,
) {
    #[cfg(feature = "icicle_gpu")]
    {
        let gpu_eligible = env::var("ENABLE_ICICLE_GPU").is_ok()
            && !icicle::should_use_cpu_fft(scalars.len())
            && icicle::is_gpu_supported_field(&omega);

        if gpu_eligible {
            best_fft_gpu(scalars, omega, log_n, inverse);
            return;
        }
    }

    best_fft_cpu(scalars, omega, log_n, data, inverse);
}

/// Performs an NTT on the GPU using the Icicle library.
///
/// This is a thin adapter: `a`, `omega`, `log_n` and `inverse` are forwarded
/// unchanged to `icicle::fft_on_device`, which transforms `a` in place.
/// Nothing is written to stdout, so callers that emit machine-readable output
/// are not disturbed by the FFT backend.
#[cfg(feature = "icicle_gpu")]
pub fn best_fft_gpu<Scalar: Field + ff::PrimeField, G: FftGroup<Scalar> + ff::PrimeField>(
    a: &mut [G],
    omega: Scalar,
    log_n: u32,
    inverse: bool,
) {
    icicle::fft_on_device::<Scalar, G>(a, omega, log_n, inverse);
}

/// Convert coefficient bases group elements to lagrange basis by inverse FFT.
pub fn g_to_lagrange<C: PrimeCurveAffine>(g_projective: Vec<C::Curve>, k: u32) -> Vec<C> {
let n_inv = C::Scalar::TWO_INV.pow_vartime([k as u64, 0, 0, 0]);
Expand All @@ -74,7 +123,7 @@ pub fn g_to_lagrange<C: PrimeCurveAffine>(g_projective: Vec<C::Curve>, k: u32) -
let n = g_lagrange_projective.len();
let fft_data = FFTData::new(n, omega, omega_inv);

best_fft(&mut g_lagrange_projective, omega_inv, k, &fft_data, true);
best_fft_cpu(&mut g_lagrange_projective, omega_inv, k, &fft_data, true);
parallelize(&mut g_lagrange_projective, |g, _| {
for g in g.iter_mut() {
*g *= n_inv;
Expand Down
195 changes: 86 additions & 109 deletions halo2_proofs/src/icicle.rs
Original file line number Diff line number Diff line change
@@ -1,48 +1,34 @@
use group::ff::PrimeField;
use icicle::{
curves::bn254::{Point_BN254, ScalarField_BN254},
test_bn254::commit_bn254,
};
use std::sync::{Arc, Once};

pub use icicle::curves::bn254::PointAffineNoInfinity_BN254;
use rustacuda::memory::CopyDestination;
use rustacuda::prelude::*;

use icicle_bn254::curve::{CurveCfg, G1Projective, ScalarField};
use halo2curves::bn256::Fr as Bn256Fr;
use icicle_cuda_runtime::memory::{DeviceVec, HostSlice};
use crate::arithmetic::FftGroup;
use std::any::{TypeId, Any};
pub use halo2curves::CurveAffine;
use icicle_core::{
curve::Affine,
msm,
ntt::{initialize_domain, ntt_inplace, NTTConfig, NTTDir},
};
use maybe_rayon::iter::IntoParallelRefIterator;
use maybe_rayon::iter::ParallelIterator;
use std::{env, mem};

static mut GPU_CONTEXT: Option<Context> = None;
static mut GPU_G: Option<DeviceBuffer<PointAffineNoInfinity_BN254>> = None;
static mut GPU_G_LAGRANGE: Option<DeviceBuffer<PointAffineNoInfinity_BN254>> = None;
static GPU_INIT: Once = Once::new();

pub fn should_use_cpu_msm(size: usize) -> bool {
size <= (1
<< u8::from_str_radix(&env::var("ICICLE_SMALL_K").unwrap_or("8".to_string()), 10).unwrap())
<< u8::from_str_radix(&env::var("ICICLE_SMALL_K").unwrap_or("2".to_string()), 10).unwrap())
}

pub fn init_gpu<C: CurveAffine>(g: &[C], g_lagrange: &[C]) {
unsafe {
GPU_INIT.call_once(|| {
GPU_CONTEXT = Some(rustacuda::quick_init().unwrap());
GPU_G = Some(copy_points_to_device(g));
GPU_G_LAGRANGE = Some(copy_points_to_device(g_lagrange));
});
}
pub fn should_use_cpu_fft(size: usize) -> bool {
size <= (1
<< u8::from_str_radix(&env::var("ICICLE_SMALL_K_FFT").unwrap_or("2".to_string()), 10).unwrap())
}

fn u32_from_u8(u8_arr: &[u8; 32]) -> [u32; 8] {
let mut t = [0u32; 8];
for i in 0..8 {
t[i] = u32::from_le_bytes([
u8_arr[4 * i],
u8_arr[4 * i + 1],
u8_arr[4 * i + 2],
u8_arr[4 * i + 3],
]);
pub fn is_gpu_supported_field<G: Any>(_sample_element: &G) -> bool {
match TypeId::of::<G>() {
id if id == TypeId::of::<Bn256Fr>() => true,
_ => false,
}
return t;
}

fn repr_from_u32<C: CurveAffine>(u32_arr: &[u32; 8]) -> <C as CurveAffine>::Base {
Expand All @@ -51,96 +37,87 @@ fn repr_from_u32<C: CurveAffine>(u32_arr: &[u32; 8]) -> <C as CurveAffine>::Base
return PrimeField::from_repr(t[0]).unwrap();
}

fn is_infinity_point(point: Point_BN254) -> bool {
let inf_point = Point_BN254::infinity();
point.z.s.eq(&inf_point.z.s)
/// Converts a slice of `PrimeField` elements into Icicle `ScalarField` values.
///
/// Each element's `to_repr()` output is bit-copied into a `[u32; 8]` and wrapped
/// with `ScalarField::from`. The conversion runs over the slice with `par_iter`.
fn icicle_scalars_from_c_scalars<G: PrimeField>(coeffs: &[G]) -> Vec<ScalarField> {
coeffs.par_iter().map(|coef| {
// SAFETY (assumption, TODO confirm): `transmute_copy` reads 32 bytes from the
// repr, so `G::Repr` must be at least 32 bytes wide. The limb order produced
// here presumably relies on a little-endian repr on a little-endian host.
let repr: [u32; 8] = unsafe { mem::transmute_copy(&coef.to_repr()) };
ScalarField::from(repr)
}).collect()
}

fn icicle_scalars_from_c<C: CurveAffine>(coeffs: &[C::Scalar]) -> Vec<ScalarField_BN254> {
let _coeffs = [Arc::new(
coeffs.iter().map(|x| x.to_repr()).collect::<Vec<_>>(),
)];

let _coeffs: &Arc<Vec<[u32; 8]>> = unsafe { mem::transmute(&_coeffs) };
_coeffs
.iter()
.map(|x| ScalarField_BN254::from_limbs(x))
.collect::<Vec<_>>()
/// Converts Icicle `ScalarField` values back into `PrimeField` elements of type `G`.
///
/// Each scalar is bit-copied into a `G::Repr` and decoded with `G::from_repr`.
/// The conversion runs over the slice with `par_iter`.
///
/// # Panics
///
/// Panics (via `unwrap`) if `G::from_repr` rejects the copied representation.
fn c_scalars_from_icicle_scalars<G: PrimeField>(scalars: &[ScalarField]) -> Vec<G> {
scalars.par_iter().map(|scalar| {
// SAFETY (assumption, TODO confirm): `transmute_copy` reads `size_of::<G::Repr>()`
// bytes from `scalar`, so `ScalarField` must be at least that large and share
// the byte layout `G::from_repr` expects.
let repr: G::Repr = unsafe { mem::transmute_copy(scalar) };
G::from_repr(repr).unwrap()
}).collect()
}

pub fn copy_scalars_to_device<C: CurveAffine>(
coeffs: &[C::Scalar],
) -> DeviceBuffer<ScalarField_BN254> {
let scalars = icicle_scalars_from_c::<C>(coeffs);

DeviceBuffer::from_slice(scalars.as_slice()).unwrap()
}
fn icicle_points_from_c<C: CurveAffine>(bases: &[C]) -> Vec<Affine<CurveCfg>> {
bases.par_iter().map(|p| {
let coordinates = p.coordinates().unwrap();
let x_repr: [u32; 8] = unsafe { mem::transmute_copy(&coordinates.x().to_repr()) };
let y_repr: [u32; 8] = unsafe { mem::transmute_copy(&coordinates.y().to_repr()) };

fn icicle_points_from_c<C: CurveAffine>(bases: &[C]) -> Vec<PointAffineNoInfinity_BN254> {
let _bases = [Arc::new(
bases
.iter()
.map(|p| {
let coordinates = p.coordinates().unwrap();
[coordinates.x().to_repr(), coordinates.y().to_repr()]
})
.collect::<Vec<_>>(),
)];

let _bases: &Arc<Vec<[[u8; 32]; 2]>> = unsafe { mem::transmute(&_bases) };
_bases
.iter()
.map(|x| {
let tx = u32_from_u8(&x[0]);
let ty = u32_from_u8(&x[1]);
PointAffineNoInfinity_BN254::from_limbs(&tx, &ty)
})
.collect::<Vec<_>>()
Affine::<CurveCfg>::from_limbs(x_repr, y_repr)
}).collect()
}

pub fn copy_points_to_device<C: CurveAffine>(
bases: &[C],
) -> DeviceBuffer<PointAffineNoInfinity_BN254> {
let points = icicle_points_from_c(bases);
fn c_from_icicle_point<C: CurveAffine>(point: &G1Projective) -> C::Curve {
let (x, y) = {
let affine: Affine<CurveCfg> = Affine::<CurveCfg>::from(*point);

DeviceBuffer::from_slice(points.as_slice()).unwrap()
}

fn c_from_icicle_point<C: CurveAffine>(commit_res: Point_BN254) -> C::Curve {
let (x, y) = if is_infinity_point(commit_res) {
(
repr_from_u32::<C>(&[0u32; 8]),
repr_from_u32::<C>(&[0u32; 8]),
)
} else {
let affine_res_from_cuda = commit_res.to_affine();
(
repr_from_u32::<C>(&affine_res_from_cuda.x.s),
repr_from_u32::<C>(&affine_res_from_cuda.y.s),
repr_from_u32::<C>(&affine.x.into()),
repr_from_u32::<C>(&affine.y.into()),
)
};

let affine = C::from_xy(x, y).unwrap();
return affine.to_curve();
let affine = C::from_xy(x, y);

return affine.unwrap().to_curve();
}

pub fn multiexp_on_device<C: CurveAffine>(
mut coeffs: DeviceBuffer<ScalarField_BN254>,
is_lagrange: bool,
) -> C::Curve {
let base_ptr: &mut DeviceBuffer<PointAffineNoInfinity_BN254>;
unsafe {
if is_lagrange {
base_ptr = GPU_G_LAGRANGE.as_mut().unwrap();
} else {
base_ptr = GPU_G.as_mut().unwrap();
};
}
pub fn multiexp_on_device<C: CurveAffine>(coeffs: &[C::Scalar], bases: &[C]) -> C::Curve {
let binding = icicle_scalars_from_c_scalars::<C::ScalarExt>(coeffs);
let coeffs = HostSlice::from_slice(&binding[..]);
let binding = icicle_points_from_c(bases);
let bases = HostSlice::from_slice(&binding[..]);

let mut msm_results = DeviceVec::<G1Projective>::cuda_malloc(1).unwrap();
let cfg = msm::MSMConfig::default();

let d_commit_result = commit_bn254(base_ptr, &mut coeffs, 10);
msm::msm(coeffs, bases, &cfg, &mut msm_results[..]).unwrap();

let mut h_commit_result = Point_BN254::zero();
d_commit_result.copy_to(&mut h_commit_result).unwrap();
let mut msm_host_result = vec![G1Projective::zero(); 1];
msm_results
.copy_to_host(HostSlice::from_mut_slice(&mut msm_host_result[..]))
.unwrap();

c_from_icicle_point::<C>(h_commit_result)
let msm_point = c_from_icicle_point::<C>(&msm_host_result[0]);

msm_point
}

/// Runs an in-place NTT over `scalars` through Icicle and writes the result back.
///
/// Steps, in order:
/// 1. build a default `NTTConfig` and choose the direction from `inverse`;
/// 2. convert `omega` to an Icicle scalar and call `initialize_domain` with it;
/// 3. convert `scalars` to Icicle scalars and transform them with `ntt_inplace`;
/// 4. convert the transformed values back to `G` and copy them into `scalars`.
///
/// `_log_n` is accepted for signature compatibility but is not used.
///
/// # Panics
///
/// Panics if `initialize_domain` or `ntt_inplace` returns an error, or if a
/// transformed value cannot be converted back into `G`.
pub fn fft_on_device<Scalar: ff::PrimeField, G: FftGroup<Scalar> + ff::PrimeField>(
    scalars: &mut [G],
    omega: Scalar,
    _log_n: u32,
    inverse: bool
) {
    let ntt_cfg = NTTConfig::<'_, ScalarField>::default();
    let direction = match inverse {
        true => NTTDir::kInverse,
        false => NTTDir::kForward,
    };

    // The domain is set up on every call from the root of unity handed in.
    let root = icicle_scalars_from_c_scalars(&[omega]);
    initialize_domain(root[0], &ntt_cfg.ctx, true).unwrap();

    // Work on a converted copy; the caller's slice is only overwritten at the end.
    let mut icicle_values: Vec<ScalarField> = icicle_scalars_from_c_scalars(scalars);
    let host_view = HostSlice::from_mut_slice(&mut icicle_values);

    ntt_inplace::<ScalarField, ScalarField>(host_view, direction, &ntt_cfg).unwrap();

    let converted = c_scalars_from_icicle_scalars::<G>(host_view.as_slice());
    scalars.copy_from_slice(&converted);
}
Loading

0 comments on commit bca2899

Please sign in to comment.