zkonduit · emirsoyturk · Sep 4, 2024
diff --git a/halo2_proofs/Cargo.toml b/halo2_proofs/Cargo.toml
@@ -63,7 +63,9 @@ env_logger = "0.10.0"
 rustc-hash = "2.0.0"
 lazy_static = "1.4.0"
 # GPU Icicle integration
-icicle = { git = "https://github.com/ingonyama-zk/icicle.git", branch = "rust/large-bucket-factor-msm", optional = true }
+icicle-core = { git = "https://github.com/ingonyama-zk/icicle", branch="ezkl-icicle2", package="icicle-core", optional = true }
+icicle-bn254 = { git = "https://github.com/ingonyama-zk/icicle", branch="ezkl-icicle2", package="icicle-bn254", optional = true }
+icicle-cuda-runtime = { git = "https://github.com/ingonyama-zk/icicle", branch="ezkl-icicle2", package="icicle-cuda-runtime", optional = true }
 rustacuda = { version = "0.1", optional = true }
 serde_derive = { version = "1", optional = true}
 bincode = { version = "1.3.3", default_features = false }
@@ -107,7 +109,7 @@ sanity-checks = []
 batch = ["rand_core/getrandom"]
 circuit-params = []
 counter = []
-icicle_gpu = ["icicle", "rustacuda"]
+icicle_gpu = ["icicle-cuda-runtime", "icicle-core", "icicle-bn254"]
 mv-lookup = []
 cost-estimator = ["serde_derive"]
 derive_serde = ["halo2curves/derive_serde"]

diff --git a/halo2_proofs/benches/arithmetic.rs b/halo2_proofs/benches/arithmetic.rs
@@ -1,7 +1,7 @@
 #[macro_use]
 extern crate criterion;
 
-use crate::arithmetic::best_multiexp_cpu;
+use crate::arithmetic::best_multiexp;
 use crate::halo2curves::pasta::{EqAffine, Fp};
 use group::ff::Field;
 use halo2_proofs::*;
@@ -27,7 +27,7 @@ fn criterion_benchmark(c: &mut Criterion) {
         c.bench_function("double-and-add", |b| {
             b.iter(|| {
                 for (g_lo, g_hi) in g_lo.iter().zip(g_hi.iter()) {
-                    best_multiexp_cpu(&[black_box(coeff_1), black_box(coeff_2)], &[*g_lo, *g_hi]);
+                    best_multiexp(&[black_box(coeff_1), black_box(coeff_2)], &[*g_lo, *g_hi]);
                 }
             })
         });

diff --git a/halo2_proofs/src/arithmetic.rs b/halo2_proofs/src/arithmetic.rs
@@ -3,15 +3,15 @@
 
 #[cfg(feature = "icicle_gpu")]
 use super::icicle;
+#[cfg(feature = "icicle_gpu")]
+use std::env;
 use super::multicore;
 pub use ff::Field;
 use group::{
     ff::{BatchInvert, PrimeField},
     prime::PrimeCurveAffine,
     Curve, GroupOpsOwned, ScalarMulOwned,
 };
-#[cfg(feature = "icicle_gpu")]
-use rustacuda::prelude::DeviceBuffer;
 
 use halo2curves::msm::msm_best;
 pub use halo2curves::{CurveAffine, CurveExt};
@@ -31,6 +31,24 @@ where
 {
 }
 
+/// Multi-scalar multiplication entry point: uses the Icicle GPU backend when it is
+/// compiled in, enabled via `ENABLE_ICICLE_GPU`, and applicable; otherwise the CPU.
+pub fn best_multiexp<C: CurveAffine>(coeffs: &[C::Scalar], bases: &[C]) -> C::Curve {
+    #[cfg(feature = "icicle_gpu")]
+    {
+        // Checks short-circuit left to right; `first()` keeps the field probe panic-free.
+        let use_gpu = env::var("ENABLE_ICICLE_GPU").is_ok()
+            && !icicle::should_use_cpu_msm(coeffs.len())
+            && coeffs.first().map_or(false, |c| icicle::is_gpu_supported_field(c));
+        if use_gpu {
+            return best_multiexp_gpu(coeffs, bases);
+        }
+    }
+
+    // CPU fallback (and the only path when `icicle_gpu` is not compiled in).
+    best_multiexp_cpu(coeffs, bases)
+}
+
 // [JPW] Keep this adapter to halo2curves to minimize code changes.
 /// Performs a multi-exponentiation operation.
 ///
@@ -43,15 +61,12 @@ pub fn best_multiexp_cpu<C: CurveAffine>(coeffs: &[C::Scalar], bases: &[C]) -> C
 
 #[cfg(feature = "icicle_gpu")]
 /// Performs a multi-exponentiation operation on GPU using Icicle library
-pub fn best_multiexp_gpu<C: CurveAffine>(coeffs: &[C::Scalar], is_lagrange: bool) -> C::Curve {
-    let scalars_ptr: DeviceBuffer<::icicle::curves::bn254::ScalarField_BN254> =
-        icicle::copy_scalars_to_device::<C>(coeffs);
-
-    return icicle::multiexp_on_device::<C>(scalars_ptr, is_lagrange);
+pub fn best_multiexp_gpu<C: CurveAffine>(coeffs: &[C::Scalar], g: &[C]) -> C::Curve {
+    icicle::multiexp_on_device::<C>(coeffs, g) // `g` is forwarded as the MSM bases for `coeffs`
 }
 
 /// Dispatcher
-pub fn best_fft<Scalar: Field, G: FftGroup<Scalar>>(
+pub fn best_fft_cpu<Scalar: Field, G: FftGroup<Scalar>>(
     a: &mut [G],
     omega: Scalar,
     log_n: u32,
@@ -61,6 +76,40 @@ pub fn best_fft<Scalar: Field, G: FftGroup<Scalar>>(
     fft::fft(a, omega, log_n, data, inverse);
 }
 
+/// FFT entry point for field elements: routes to the Icicle GPU NTT when it is compiled
+/// in, enabled via `ENABLE_ICICLE_GPU`, and applicable; otherwise runs `best_fft_cpu`.
+pub fn best_fft<Scalar: Field + ff::PrimeField, G: FftGroup<Scalar> + ff::PrimeField>(
+    scalars: &mut [G],
+    omega: Scalar,
+    log_n: u32,
+    data: &FFTData<Scalar>,
+    inverse: bool,
+) {
+    #[cfg(feature = "icicle_gpu")]
+    {
+        let gpu_enabled = env::var("ENABLE_ICICLE_GPU").is_ok();
+        let large_enough = gpu_enabled && !icicle::should_use_cpu_fft(scalars.len());
+        if large_enough && icicle::is_gpu_supported_field(&omega) {
+            return best_fft_gpu(scalars, omega, log_n, inverse);
+        }
+    }
+
+    // CPU path: also the only path when the `icicle_gpu` feature is not compiled in.
+    best_fft_cpu(scalars, omega, log_n, data, inverse);
+}
+
+/// Runs the NTT (inverse when `inverse` is set) over `a` on the GPU via the Icicle bindings.
+#[cfg(feature = "icicle_gpu")]
+pub fn best_fft_gpu<Scalar: Field + ff::PrimeField, G: FftGroup<Scalar> + ff::PrimeField>(
+    a: &mut [G],
+    omega: Scalar,
+    log_n: u32,
+    inverse: bool,
+) {
+    // No stdout tracing here: this is library code and runs once per GPU transform.
+    icicle::fft_on_device::<Scalar, G>(a, omega, log_n, inverse);
+}
+
 /// Convert coefficient bases group elements to lagrange basis by inverse FFT.
 pub fn g_to_lagrange<C: PrimeCurveAffine>(g_projective: Vec<C::Curve>, k: u32) -> Vec<C> {
     let n_inv = C::Scalar::TWO_INV.pow_vartime([k as u64, 0, 0, 0]);
@@ -74,7 +123,7 @@ pub fn g_to_lagrange<C: PrimeCurveAffine>(g_projective: Vec<C::Curve>, k: u32) -
     let n = g_lagrange_projective.len();
     let fft_data = FFTData::new(n, omega, omega_inv);
 
-    best_fft(&mut g_lagrange_projective, omega_inv, k, &fft_data, true);
+    best_fft_cpu(&mut g_lagrange_projective, omega_inv, k, &fft_data, true);
     parallelize(&mut g_lagrange_projective, |g, _| {
         for g in g.iter_mut() {
             *g *= n_inv;

diff --git a/halo2_proofs/src/icicle.rs b/halo2_proofs/src/icicle.rs
@@ -1,48 +1,34 @@
 use group::ff::PrimeField;
-use icicle::{
-    curves::bn254::{Point_BN254, ScalarField_BN254},
-    test_bn254::commit_bn254,
-};
-use std::sync::{Arc, Once};
-
-pub use icicle::curves::bn254::PointAffineNoInfinity_BN254;
-use rustacuda::memory::CopyDestination;
-use rustacuda::prelude::*;
-
+use icicle_bn254::curve::{CurveCfg, G1Projective, ScalarField};
+use halo2curves::bn256::Fr as Bn256Fr;
+use icicle_cuda_runtime::memory::{DeviceVec, HostSlice};
+use crate::arithmetic::FftGroup;
+use std::any::{TypeId, Any};
 pub use halo2curves::CurveAffine;
+use icicle_core::{
+    curve::Affine,
+    msm,
+    ntt::{initialize_domain, ntt_inplace, NTTConfig, NTTDir},
+};
+use maybe_rayon::iter::IntoParallelRefIterator;
+use maybe_rayon::iter::ParallelIterator;
 use std::{env, mem};
 
-static mut GPU_CONTEXT: Option<Context> = None;
-static mut GPU_G: Option<DeviceBuffer<PointAffineNoInfinity_BN254>> = None;
-static mut GPU_G_LAGRANGE: Option<DeviceBuffer<PointAffineNoInfinity_BN254>> = None;
-static GPU_INIT: Once = Once::new();
-
 pub fn should_use_cpu_msm(size: usize) -> bool {
-    size <= (1
-        << u8::from_str_radix(&env::var("ICICLE_SMALL_K").unwrap_or("8".to_string()), 10).unwrap())
+    let k: u8 = env::var("ICICLE_SMALL_K").ok().and_then(|v| v.parse().ok()).unwrap_or(2); // unset/invalid => 2
+    1usize.checked_shl(k.into()).map_or(true, |limit| size <= limit) // shift overflow => always CPU
 }
 
-pub fn init_gpu<C: CurveAffine>(g: &[C], g_lagrange: &[C]) {
-    unsafe {
-        GPU_INIT.call_once(|| {
-            GPU_CONTEXT = Some(rustacuda::quick_init().unwrap());
-            GPU_G = Some(copy_points_to_device(g));
-            GPU_G_LAGRANGE = Some(copy_points_to_device(g_lagrange));
-        });
-    }
+pub fn should_use_cpu_fft(size: usize) -> bool {
+    let k: u8 = env::var("ICICLE_SMALL_K_FFT").ok().and_then(|v| v.parse().ok()).unwrap_or(2); // unset/invalid => 2
+    1usize.checked_shl(k.into()).map_or(true, |limit| size <= limit) // shift overflow => always CPU
 }
 
-fn u32_from_u8(u8_arr: &[u8; 32]) -> [u32; 8] {
-    let mut t = [0u32; 8];
-    for i in 0..8 {
-        t[i] = u32::from_le_bytes([
-            u8_arr[4 * i],
-            u8_arr[4 * i + 1],
-            u8_arr[4 * i + 2],
-            u8_arr[4 * i + 3],
-        ]);
+pub fn is_gpu_supported_field<G: Any>(_sample_element: &G) -> bool { // the argument only pins the type `G`
+    match TypeId::of::<G>() {
+        id if id == TypeId::of::<Bn256Fr>() => true, // `halo2curves::bn256::Fr` is the only accepted type
+        _ => false,
     }
-    return t;
 }
 
 fn repr_from_u32<C: CurveAffine>(u32_arr: &[u32; 8]) -> <C as CurveAffine>::Base {
@@ -51,96 +37,87 @@ fn repr_from_u32<C: CurveAffine>(u32_arr: &[u32; 8]) -> <C as CurveAffine>::Base
     return PrimeField::from_repr(t[0]).unwrap();
 }
 
-fn is_infinity_point(point: Point_BN254) -> bool {
-    let inf_point = Point_BN254::infinity();
-    point.z.s.eq(&inf_point.z.s)
+/// Reinterprets each element's 32-byte repr as Icicle limbs (`[u32; 8]`); panics on other sizes.
+fn icicle_scalars_from_c_scalars<G: PrimeField>(coeffs: &[G]) -> Vec<ScalarField> {
+    assert_eq!(mem::size_of::<G::Repr>(), 32, "unsupported repr size"); // keeps the 32-byte read in bounds
+    let to_limbs = |coef: &G| -> [u32; 8] { unsafe { mem::transmute_copy(&coef.to_repr()) } };
+    coeffs.par_iter().map(|coef| ScalarField::from(to_limbs(coef))).collect()
 }
 
-fn icicle_scalars_from_c<C: CurveAffine>(coeffs: &[C::Scalar]) -> Vec<ScalarField_BN254> {
-    let _coeffs = [Arc::new(
-        coeffs.iter().map(|x| x.to_repr()).collect::<Vec<_>>(),
-    )];
-
-    let _coeffs: &Arc<Vec<[u32; 8]>> = unsafe { mem::transmute(&_coeffs) };
-    _coeffs
-        .iter()
-        .map(|x| ScalarField_BN254::from_limbs(x))
-        .collect::<Vec<_>>()
+/// Rebuilds `G` elements from Icicle scalars via their raw bytes; panics if `from_repr` rejects one.
+fn c_scalars_from_icicle_scalars<G: PrimeField>(scalars: &[ScalarField]) -> Vec<G> {
+    assert_eq!(mem::size_of::<G::Repr>(), mem::size_of::<ScalarField>(), "unsupported repr size");
+    let to_repr = |scalar: &ScalarField| -> G::Repr { unsafe { mem::transmute_copy(scalar) } };
+    scalars.par_iter().map(|scalar| G::from_repr(to_repr(scalar)).unwrap()).collect()
 }
 
-pub fn copy_scalars_to_device<C: CurveAffine>(
-    coeffs: &[C::Scalar],
-) -> DeviceBuffer<ScalarField_BN254> {
-    let scalars = icicle_scalars_from_c::<C>(coeffs);
-
-    DeviceBuffer::from_slice(scalars.as_slice()).unwrap()
-}
+fn icicle_points_from_c<C: CurveAffine>(bases: &[C]) -> Vec<Affine<CurveCfg>> {
+    bases.par_iter().map(|p| {
+        let coordinates = p.coordinates().unwrap(); // panics if there are no coordinates — presumably the identity; TODO confirm
+        let x_repr: [u32; 8] = unsafe { mem::transmute_copy(&coordinates.x().to_repr()) }; // NOTE(review): assumes a repr of at least 32 bytes
+        let y_repr: [u32; 8] = unsafe { mem::transmute_copy(&coordinates.y().to_repr()) };
 
-fn icicle_points_from_c<C: CurveAffine>(bases: &[C]) -> Vec<PointAffineNoInfinity_BN254> {
-    let _bases = [Arc::new(
-        bases
-            .iter()
-            .map(|p| {
-                let coordinates = p.coordinates().unwrap();
-                [coordinates.x().to_repr(), coordinates.y().to_repr()]
-            })
-            .collect::<Vec<_>>(),
-    )];
-
-    let _bases: &Arc<Vec<[[u8; 32]; 2]>> = unsafe { mem::transmute(&_bases) };
-    _bases
-        .iter()
-        .map(|x| {
-            let tx = u32_from_u8(&x[0]);
-            let ty = u32_from_u8(&x[1]);
-            PointAffineNoInfinity_BN254::from_limbs(&tx, &ty)
-        })
-        .collect::<Vec<_>>()
+        Affine::<CurveCfg>::from_limbs(x_repr, y_repr)
+    }).collect()
 }
 
-pub fn copy_points_to_device<C: CurveAffine>(
-    bases: &[C],
-) -> DeviceBuffer<PointAffineNoInfinity_BN254> {
-    let points = icicle_points_from_c(bases);
+fn c_from_icicle_point<C: CurveAffine>(point: &G1Projective) -> C::Curve {
+    let (x, y) = {
+        let affine: Affine<CurveCfg> = Affine::<CurveCfg>::from(*point); // projective -> Icicle affine
 
-    DeviceBuffer::from_slice(points.as_slice()).unwrap()
-}
-
-fn c_from_icicle_point<C: CurveAffine>(commit_res: Point_BN254) -> C::Curve {
-    let (x, y) = if is_infinity_point(commit_res) {
         (
-            repr_from_u32::<C>(&[0u32; 8]),
-            repr_from_u32::<C>(&[0u32; 8]),
-        )
-    } else {
-        let affine_res_from_cuda = commit_res.to_affine();
-        (
-            repr_from_u32::<C>(&affine_res_from_cuda.x.s),
-            repr_from_u32::<C>(&affine_res_from_cuda.y.s),
+            repr_from_u32::<C>(&affine.x.into()), // each coordinate is turned into `C::Base` by `repr_from_u32`
+            repr_from_u32::<C>(&affine.y.into()),
         )
     };
 
-    let affine = C::from_xy(x, y).unwrap();
-    return affine.to_curve();
+    let affine = C::from_xy(x, y);
+
+    return affine.unwrap().to_curve(); // panics if `from_xy` rejected the coordinates
 }
 
-pub fn multiexp_on_device<C: CurveAffine>(
-    mut coeffs: DeviceBuffer<ScalarField_BN254>,
-    is_lagrange: bool,
-) -> C::Curve {
-    let base_ptr: &mut DeviceBuffer<PointAffineNoInfinity_BN254>;
-    unsafe {
-        if is_lagrange {
-            base_ptr = GPU_G_LAGRANGE.as_mut().unwrap();
-        } else {
-            base_ptr = GPU_G.as_mut().unwrap();
-        };
-    }
+/// Computes the MSM of `coeffs` against `bases` with Icicle and returns it as `C::Curve`.
+pub fn multiexp_on_device<C: CurveAffine>(coeffs: &[C::Scalar], bases: &[C]) -> C::Curve {
+    assert_eq!(coeffs.len(), bases.len(), "MSM needs exactly one base per scalar");
+    let scalars = icicle_scalars_from_c_scalars::<C::ScalarExt>(coeffs);
+    let points = icicle_points_from_c(bases);
+    let (coeffs, bases) = (HostSlice::from_slice(&scalars), HostSlice::from_slice(&points));
+    let mut msm_results = DeviceVec::<G1Projective>::cuda_malloc(1).unwrap();
+    let cfg = msm::MSMConfig::default();
 
-    let d_commit_result = commit_bn254(base_ptr, &mut coeffs, 10);
+    msm::msm(coeffs, bases, &cfg, &mut msm_results[..]).unwrap();
 
-    let mut h_commit_result = Point_BN254::zero();
-    d_commit_result.copy_to(&mut h_commit_result).unwrap();
+    // Copy the single result point from device memory back to the host.
+    let mut msm_host_result = vec![G1Projective::zero(); 1];
+    let host_out = HostSlice::from_mut_slice(&mut msm_host_result[..]);
+    msm_results.copy_to_host(host_out).unwrap();
 
-    c_from_icicle_point::<C>(h_commit_result)
+    // Hand the point back in the caller's curve representation.
+    let icicle_point = &msm_host_result[0];
+    c_from_icicle_point::<C>(icicle_point)
 }
+
+/// Runs an in-place NTT over `scalars` with Icicle and writes the result back into `scalars`.
+/// `inverse` selects `NTTDir::kInverse`; otherwise the forward direction is used.
+/// `_log_n` is currently unused.
+pub fn fft_on_device<Scalar: ff::PrimeField, G: FftGroup<Scalar> + ff::PrimeField>(
+    scalars: &mut [G],
+    omega: Scalar,
+    _log_n: u32,
+    inverse: bool,
+) {
+    let cfg = NTTConfig::<'_, ScalarField>::default();
+    let dir = if inverse { NTTDir::kInverse } else { NTTDir::kForward };
+
+    // NOTE(review): the domain is initialised from `omega` on every call — confirm cost/validity.
+    let omega = icicle_scalars_from_c_scalars(&[omega]);
+    initialize_domain(omega[0], &cfg.ctx, true).unwrap();
+
+    // Convert to Icicle's representation, transform in place, then convert back.
+    let mut icicle_scalars: Vec<ScalarField> = icicle_scalars_from_c_scalars(scalars);
+    let host_scalars = HostSlice::from_mut_slice(&mut icicle_scalars);
+    ntt_inplace::<ScalarField, ScalarField>(host_scalars, dir, &cfg).unwrap();
+
+    let c_scalars = c_scalars_from_icicle_scalars::<G>(&icicle_scalars);
+    scalars.copy_from_slice(&c_scalars);
+}