Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ICICLE MSM and NTT integration #18

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions halo2_proofs/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,9 @@ env_logger = "0.10.0"
rustc-hash = "2.0.0"
lazy_static = "1.4.0"
# GPU Icicle integration
icicle = { git = "https://github.com/ingonyama-zk/icicle.git", branch = "rust/large-bucket-factor-msm", optional = true }
icicle-core = { git = "https://github.com/ingonyama-zk/icicle", branch="ezkl-icicle2", package="icicle-core", optional = true }
icicle-bn254 = { git = "https://github.com/ingonyama-zk/icicle", branch="ezkl-icicle2", package="icicle-bn254", optional = true }
icicle-cuda-runtime = { git = "https://github.com/ingonyama-zk/icicle", branch="ezkl-icicle2", package="icicle-cuda-runtime", optional = true }
rustacuda = { version = "0.1", optional = true }
serde_derive = { version = "1", optional = true}
bincode = { version = "1.3.3", default_features = false }
Expand Down Expand Up @@ -107,7 +109,7 @@ sanity-checks = []
batch = ["rand_core/getrandom"]
circuit-params = []
counter = []
icicle_gpu = ["icicle", "rustacuda"]
icicle_gpu = ["icicle-cuda-runtime", "icicle-core", "icicle-bn254"]
mv-lookup = []
cost-estimator = ["serde_derive"]
derive_serde = ["halo2curves/derive_serde"]
Expand Down
4 changes: 2 additions & 2 deletions halo2_proofs/benches/arithmetic.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#[macro_use]
extern crate criterion;

use crate::arithmetic::best_multiexp_cpu;
use crate::arithmetic::best_multiexp;
use crate::halo2curves::pasta::{EqAffine, Fp};
use group::ff::Field;
use halo2_proofs::*;
Expand All @@ -27,7 +27,7 @@ fn criterion_benchmark(c: &mut Criterion) {
c.bench_function("double-and-add", |b| {
b.iter(|| {
for (g_lo, g_hi) in g_lo.iter().zip(g_hi.iter()) {
best_multiexp_cpu(&[black_box(coeff_1), black_box(coeff_2)], &[*g_lo, *g_hi]);
best_multiexp(&[black_box(coeff_1), black_box(coeff_2)], &[*g_lo, *g_hi]);
}
})
});
Expand Down
67 changes: 58 additions & 9 deletions halo2_proofs/src/arithmetic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,15 @@

#[cfg(feature = "icicle_gpu")]
use super::icicle;
#[cfg(feature = "icicle_gpu")]
use std::env;
use super::multicore;
pub use ff::Field;
use group::{
ff::{BatchInvert, PrimeField},
prime::PrimeCurveAffine,
Curve, GroupOpsOwned, ScalarMulOwned,
};
#[cfg(feature = "icicle_gpu")]
use rustacuda::prelude::DeviceBuffer;

use halo2curves::msm::msm_best;
pub use halo2curves::{CurveAffine, CurveExt};
Expand All @@ -31,6 +31,24 @@ where
{
}

/// Computes the multi-scalar multiplication of `bases` by `coeffs`, picking a backend at run time.
///
/// When the crate is compiled with the `icicle_gpu` feature, `best_multiexp_gpu` is used only
/// if all of the following hold, evaluated in this order:
///
/// 1. the `ENABLE_ICICLE_GPU` environment variable is set,
/// 2. `icicle::should_use_cpu_msm` does not request the CPU for `coeffs.len()`,
/// 3. `icicle::is_gpu_supported_field` accepts the scalar type.
///
/// In every other case, and always when the feature is disabled, the call is forwarded to
/// `best_multiexp_cpu`.
pub fn best_multiexp<C: CurveAffine>(coeffs: &[C::Scalar], bases: &[C]) -> C::Curve {
    #[cfg(feature = "icicle_gpu")]
    {
        // `&&` short-circuits, so `coeffs[0]` is only indexed after the two earlier checks passed.
        let gpu_selected = env::var("ENABLE_ICICLE_GPU").is_ok()
            && !icicle::should_use_cpu_msm(coeffs.len())
            && icicle::is_gpu_supported_field(&coeffs[0]);
        if gpu_selected {
            return best_multiexp_gpu(coeffs, bases);
        }
    }

    best_multiexp_cpu(coeffs, bases)
}

// [JPW] Keep this adapter to halo2curves to minimize code changes.
/// Performs a multi-exponentiation operation.
///
Expand All @@ -43,15 +61,12 @@ pub fn best_multiexp_cpu<C: CurveAffine>(coeffs: &[C::Scalar], bases: &[C]) -> C

#[cfg(feature = "icicle_gpu")]
/// Performs a multi-exponentiation operation on GPU using Icicle library
pub fn best_multiexp_gpu<C: CurveAffine>(coeffs: &[C::Scalar], is_lagrange: bool) -> C::Curve {
let scalars_ptr: DeviceBuffer<::icicle::curves::bn254::ScalarField_BN254> =
icicle::copy_scalars_to_device::<C>(coeffs);

return icicle::multiexp_on_device::<C>(scalars_ptr, is_lagrange);
pub fn best_multiexp_gpu<C: CurveAffine>(coeffs: &[C::Scalar], g: &[C]) -> C::Curve {
icicle::multiexp_on_device::<C>(coeffs, g)
}

/// Dispatcher
pub fn best_fft<Scalar: Field, G: FftGroup<Scalar>>(
pub fn best_fft_cpu<Scalar: Field, G: FftGroup<Scalar>>(
a: &mut [G],
omega: Scalar,
log_n: u32,
Expand All @@ -61,6 +76,40 @@ pub fn best_fft<Scalar: Field, G: FftGroup<Scalar>>(
fft::fft(a, omega, log_n, data, inverse);
}

/// Runs an in-place FFT over `scalars`, picking a backend at run time.
///
/// When the crate is compiled with the `icicle_gpu` feature, `best_fft_gpu` is used only if all
/// of the following hold, evaluated in this order:
///
/// 1. the `ENABLE_ICICLE_GPU` environment variable is set,
/// 2. `icicle::should_use_cpu_fft` does not request the CPU for `scalars.len()`,
/// 3. `icicle::is_gpu_supported_field` accepts the type of `omega`.
///
/// In every other case, and always when the feature is disabled, the call is forwarded to
/// `best_fft_cpu`. `data` is only consumed by the CPU path.
pub fn best_fft<Scalar: Field + ff::PrimeField, G: FftGroup<Scalar> + ff::PrimeField>(
    scalars: &mut [G],
    omega: Scalar,
    log_n: u32,
    data: &FFTData<Scalar>,
    inverse: bool,
) {
    #[cfg(feature = "icicle_gpu")]
    {
        let gpu_selected = env::var("ENABLE_ICICLE_GPU").is_ok()
            && !icicle::should_use_cpu_fft(scalars.len())
            && icicle::is_gpu_supported_field(&omega);
        if gpu_selected {
            best_fft_gpu(scalars, omega, log_n, inverse);
            return;
        }
    }

    best_fft_cpu(scalars, omega, log_n, data, inverse);
}

/// Performs an NTT on the GPU using the Icicle library.
///
/// Thin adapter that forwards `a`, `omega`, `log_n` and `inverse` unchanged to
/// `icicle::fft_on_device`, which transforms `a` in place.
///
/// This function deliberately emits no output: it sits on the proving hot path, so
/// unconditional printing to stdout from library code is avoided.
#[cfg(feature = "icicle_gpu")]
pub fn best_fft_gpu<Scalar: Field + ff::PrimeField, G: FftGroup<Scalar> + ff::PrimeField>(
    a: &mut [G],
    omega: Scalar,
    log_n: u32,
    inverse: bool,
) {
    icicle::fft_on_device::<Scalar, G>(a, omega, log_n, inverse);
}

/// Convert coefficient bases group elements to lagrange basis by inverse FFT.
pub fn g_to_lagrange<C: PrimeCurveAffine>(g_projective: Vec<C::Curve>, k: u32) -> Vec<C> {
let n_inv = C::Scalar::TWO_INV.pow_vartime([k as u64, 0, 0, 0]);
Expand All @@ -74,7 +123,7 @@ pub fn g_to_lagrange<C: PrimeCurveAffine>(g_projective: Vec<C::Curve>, k: u32) -
let n = g_lagrange_projective.len();
let fft_data = FFTData::new(n, omega, omega_inv);

best_fft(&mut g_lagrange_projective, omega_inv, k, &fft_data, true);
best_fft_cpu(&mut g_lagrange_projective, omega_inv, k, &fft_data, true);
parallelize(&mut g_lagrange_projective, |g, _| {
for g in g.iter_mut() {
*g *= n_inv;
Expand Down
195 changes: 86 additions & 109 deletions halo2_proofs/src/icicle.rs
Original file line number Diff line number Diff line change
@@ -1,48 +1,34 @@
use group::ff::PrimeField;
use icicle::{
curves::bn254::{Point_BN254, ScalarField_BN254},
test_bn254::commit_bn254,
};
use std::sync::{Arc, Once};

pub use icicle::curves::bn254::PointAffineNoInfinity_BN254;
use rustacuda::memory::CopyDestination;
use rustacuda::prelude::*;

use icicle_bn254::curve::{CurveCfg, G1Projective, ScalarField};
use halo2curves::bn256::Fr as Bn256Fr;
use icicle_cuda_runtime::memory::{DeviceVec, HostSlice};
use crate::arithmetic::FftGroup;
use std::any::{TypeId, Any};
pub use halo2curves::CurveAffine;
use icicle_core::{
curve::Affine,
msm,
ntt::{initialize_domain, ntt_inplace, NTTConfig, NTTDir},
};
use maybe_rayon::iter::IntoParallelRefIterator;
use maybe_rayon::iter::ParallelIterator;
use std::{env, mem};

static mut GPU_CONTEXT: Option<Context> = None;
static mut GPU_G: Option<DeviceBuffer<PointAffineNoInfinity_BN254>> = None;
static mut GPU_G_LAGRANGE: Option<DeviceBuffer<PointAffineNoInfinity_BN254>> = None;
static GPU_INIT: Once = Once::new();

pub fn should_use_cpu_msm(size: usize) -> bool {
size <= (1
<< u8::from_str_radix(&env::var("ICICLE_SMALL_K").unwrap_or("8".to_string()), 10).unwrap())
<< u8::from_str_radix(&env::var("ICICLE_SMALL_K").unwrap_or("2".to_string()), 10).unwrap())
}

pub fn init_gpu<C: CurveAffine>(g: &[C], g_lagrange: &[C]) {
unsafe {
GPU_INIT.call_once(|| {
GPU_CONTEXT = Some(rustacuda::quick_init().unwrap());
GPU_G = Some(copy_points_to_device(g));
GPU_G_LAGRANGE = Some(copy_points_to_device(g_lagrange));
});
}
/// Returns `true` when an FFT over `size` elements should be kept on the CPU.
///
/// The cut-off is `2^k` elements, inclusive. `k` is read from the `ICICLE_SMALL_K_FFT`
/// environment variable on every call and defaults to `2` when the variable is unset.
///
/// A value that does not parse as a decimal `u8` is ignored and the default is used, so a
/// misconfigured environment cannot panic the caller. If `k` is at least the bit width of
/// `usize`, `2^k` is not representable; every input is then below the cut-off and the
/// function returns `true`.
pub fn should_use_cpu_fft(size: usize) -> bool {
    const DEFAULT_SMALL_K: u8 = 2;

    let small_k = env::var("ICICLE_SMALL_K_FFT")
        .ok()
        .and_then(|raw| raw.parse::<u8>().ok())
        .unwrap_or(DEFAULT_SMALL_K);

    // `checked_shl` returns `None` instead of overflowing when the shift is too wide.
    match 1usize.checked_shl(u32::from(small_k)) {
        Some(threshold) => size <= threshold,
        None => true,
    }
}

fn u32_from_u8(u8_arr: &[u8; 32]) -> [u32; 8] {
let mut t = [0u32; 8];
for i in 0..8 {
t[i] = u32::from_le_bytes([
u8_arr[4 * i],
u8_arr[4 * i + 1],
u8_arr[4 * i + 2],
u8_arr[4 * i + 3],
]);
pub fn is_gpu_supported_field<G: Any>(_sample_element: &G) -> bool {
match TypeId::of::<G>() {
id if id == TypeId::of::<Bn256Fr>() => true,
_ => false,
}
return t;
}

fn repr_from_u32<C: CurveAffine>(u32_arr: &[u32; 8]) -> <C as CurveAffine>::Base {
Expand All @@ -51,96 +37,87 @@ fn repr_from_u32<C: CurveAffine>(u32_arr: &[u32; 8]) -> <C as CurveAffine>::Base
return PrimeField::from_repr(t[0]).unwrap();
}

fn is_infinity_point(point: Point_BN254) -> bool {
let inf_point = Point_BN254::infinity();
point.z.s.eq(&inf_point.z.s)
/// Converts a slice of `PrimeField` elements into Icicle `ScalarField` values.
///
/// Each element's `to_repr()` bytes are split into eight `u32` limbs, four bytes per limb,
/// decoded as little-endian, and handed to `ScalarField::from`. On a little-endian host this
/// yields the same limbs as reinterpreting the representation's memory directly, but without
/// `unsafe` and without depending on the host byte order.
///
/// NOTE(review): this assumes `G::Repr` is a 32-byte little-endian encoding, as the limb
/// layout requires — confirm for every field type routed to the GPU.
///
/// # Panics
///
/// Panics if the representation of an element is not exactly 32 bytes long.
fn icicle_scalars_from_c_scalars<G: PrimeField>(coeffs: &[G]) -> Vec<ScalarField> {
    coeffs
        .par_iter()
        .map(|coef| {
            let repr = coef.to_repr();
            let bytes: &[u8] = repr.as_ref();
            assert_eq!(
                bytes.len(),
                32,
                "field representation must be 32 bytes to map onto eight u32 limbs"
            );

            let mut limbs = [0u32; 8];
            for (limb, chunk) in limbs.iter_mut().zip(bytes.chunks_exact(4)) {
                *limb = u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]);
            }
            ScalarField::from(limbs)
        })
        .collect()
}

fn icicle_scalars_from_c<C: CurveAffine>(coeffs: &[C::Scalar]) -> Vec<ScalarField_BN254> {
let _coeffs = [Arc::new(
coeffs.iter().map(|x| x.to_repr()).collect::<Vec<_>>(),
)];

let _coeffs: &Arc<Vec<[u32; 8]>> = unsafe { mem::transmute(&_coeffs) };
_coeffs
.iter()
.map(|x| ScalarField_BN254::from_limbs(x))
.collect::<Vec<_>>()
/// Converts Icicle `ScalarField` values back into `PrimeField` elements of type `G`.
///
/// Every scalar is bit-copied into a `G::Repr` and then decoded with `G::from_repr`.
/// The result of `from_repr` is unwrapped, so a representation that `G` rejects panics.
fn c_scalars_from_icicle_scalars<G: PrimeField>(scalars: &[ScalarField]) -> Vec<G> {
scalars.par_iter().map(|scalar| {
// SAFETY (NOTE(review)): `transmute_copy` reads `size_of::<G::Repr>()` bytes starting at
// `scalar`. This presumably relies on `ScalarField` being at least that large and laid out
// as the same byte sequence `G::Repr` expects — confirm for every `G` used with this helper.
let repr: G::Repr = unsafe { mem::transmute_copy(scalar) };
G::from_repr(repr).unwrap()
}).collect()
}

pub fn copy_scalars_to_device<C: CurveAffine>(
coeffs: &[C::Scalar],
) -> DeviceBuffer<ScalarField_BN254> {
let scalars = icicle_scalars_from_c::<C>(coeffs);

DeviceBuffer::from_slice(scalars.as_slice()).unwrap()
}
fn icicle_points_from_c<C: CurveAffine>(bases: &[C]) -> Vec<Affine<CurveCfg>> {
bases.par_iter().map(|p| {
let coordinates = p.coordinates().unwrap();
let x_repr: [u32; 8] = unsafe { mem::transmute_copy(&coordinates.x().to_repr()) };
let y_repr: [u32; 8] = unsafe { mem::transmute_copy(&coordinates.y().to_repr()) };

fn icicle_points_from_c<C: CurveAffine>(bases: &[C]) -> Vec<PointAffineNoInfinity_BN254> {
let _bases = [Arc::new(
bases
.iter()
.map(|p| {
let coordinates = p.coordinates().unwrap();
[coordinates.x().to_repr(), coordinates.y().to_repr()]
})
.collect::<Vec<_>>(),
)];

let _bases: &Arc<Vec<[[u8; 32]; 2]>> = unsafe { mem::transmute(&_bases) };
_bases
.iter()
.map(|x| {
let tx = u32_from_u8(&x[0]);
let ty = u32_from_u8(&x[1]);
PointAffineNoInfinity_BN254::from_limbs(&tx, &ty)
})
.collect::<Vec<_>>()
Affine::<CurveCfg>::from_limbs(x_repr, y_repr)
}).collect()
}

pub fn copy_points_to_device<C: CurveAffine>(
bases: &[C],
) -> DeviceBuffer<PointAffineNoInfinity_BN254> {
let points = icicle_points_from_c(bases);
fn c_from_icicle_point<C: CurveAffine>(point: &G1Projective) -> C::Curve {
let (x, y) = {
let affine: Affine<CurveCfg> = Affine::<CurveCfg>::from(*point);

DeviceBuffer::from_slice(points.as_slice()).unwrap()
}

fn c_from_icicle_point<C: CurveAffine>(commit_res: Point_BN254) -> C::Curve {
let (x, y) = if is_infinity_point(commit_res) {
(
repr_from_u32::<C>(&[0u32; 8]),
repr_from_u32::<C>(&[0u32; 8]),
)
} else {
let affine_res_from_cuda = commit_res.to_affine();
(
repr_from_u32::<C>(&affine_res_from_cuda.x.s),
repr_from_u32::<C>(&affine_res_from_cuda.y.s),
repr_from_u32::<C>(&affine.x.into()),
repr_from_u32::<C>(&affine.y.into()),
)
};

let affine = C::from_xy(x, y).unwrap();
return affine.to_curve();
let affine = C::from_xy(x, y);

return affine.unwrap().to_curve();
}

pub fn multiexp_on_device<C: CurveAffine>(
mut coeffs: DeviceBuffer<ScalarField_BN254>,
is_lagrange: bool,
) -> C::Curve {
let base_ptr: &mut DeviceBuffer<PointAffineNoInfinity_BN254>;
unsafe {
if is_lagrange {
base_ptr = GPU_G_LAGRANGE.as_mut().unwrap();
} else {
base_ptr = GPU_G.as_mut().unwrap();
};
}
pub fn multiexp_on_device<C: CurveAffine>(coeffs: &[C::Scalar], bases: &[C]) -> C::Curve {
let binding = icicle_scalars_from_c_scalars::<C::ScalarExt>(coeffs);
let coeffs = HostSlice::from_slice(&binding[..]);
let binding = icicle_points_from_c(bases);
let bases = HostSlice::from_slice(&binding[..]);

let mut msm_results = DeviceVec::<G1Projective>::cuda_malloc(1).unwrap();
let cfg = msm::MSMConfig::default();

let d_commit_result = commit_bn254(base_ptr, &mut coeffs, 10);
msm::msm(coeffs, bases, &cfg, &mut msm_results[..]).unwrap();

let mut h_commit_result = Point_BN254::zero();
d_commit_result.copy_to(&mut h_commit_result).unwrap();
let mut msm_host_result = vec![G1Projective::zero(); 1];
msm_results
.copy_to_host(HostSlice::from_mut_slice(&mut msm_host_result[..]))
.unwrap();

c_from_icicle_point::<C>(h_commit_result)
let msm_point = c_from_icicle_point::<C>(&msm_host_result[0]);

msm_point
}

/// Runs an in-place NTT over `scalars` through Icicle and writes the result back.
///
/// The steps are:
///
/// 1. convert `omega` to an Icicle scalar and initialise the Icicle NTT domain with it,
/// 2. convert `scalars` into a temporary `Vec<ScalarField>`,
/// 3. call `ntt_inplace` on that buffer, in the inverse direction when `inverse` is `true`
///    and the forward direction otherwise,
/// 4. convert the buffer back to `G` and copy it over `scalars`.
///
/// `_log_n` is accepted but not used by this function.
///
/// # Panics
///
/// Panics if domain initialisation or the NTT call returns an error; both are unwrapped.
pub fn fft_on_device<Scalar: ff::PrimeField, G: FftGroup<Scalar> + ff::PrimeField>(
    scalars: &mut [G],
    omega: Scalar,
    _log_n: u32,
    inverse: bool,
) {
    let cfg = NTTConfig::<'_, ScalarField>::default();
    let dir = if inverse {
        NTTDir::kInverse
    } else {
        NTTDir::kForward
    };

    // NOTE(review): the domain is initialised on every call; confirm against the Icicle NTT
    // documentation that repeated initialisation is intended.
    let domain_root = icicle_scalars_from_c_scalars(&[omega])[0];
    initialize_domain(domain_root, &cfg.ctx, true).unwrap();

    let mut ntt_buffer: Vec<ScalarField> = icicle_scalars_from_c_scalars(scalars);
    let host_scalars = HostSlice::from_mut_slice(&mut ntt_buffer);
    ntt_inplace::<ScalarField, ScalarField>(host_scalars, dir, &cfg).unwrap();

    let transformed = c_scalars_from_icicle_scalars::<G>(host_scalars.as_slice());
    scalars.copy_from_slice(&transformed);
}
Loading
Loading