diff --git a/halo2_proofs/Cargo.toml b/halo2_proofs/Cargo.toml
index 6477a14ca4..02fad45510 100644
--- a/halo2_proofs/Cargo.toml
+++ b/halo2_proofs/Cargo.toml
@@ -63,7 +63,7 @@ lazy_static = { version = "1", optional = true }
 env_logger = "0.10.0"
 
 # GPU Icicle integration
-icicle = { git = "https://github.com/ingonyama-zk/icicle.git", branch = "rust/large-bucket-factor-msm", optional = true }
+icicle = { git = "https://github.com/ingonyama-zk/icicle.git", tag = "v0.1.0", optional = true }
 rustacuda = { version = "0.1", optional = true }
 
 # Developer tooling dependencies
@@ -100,6 +100,7 @@ batch = ["rand_core/getrandom"]
 circuit-params = []
 counter = ["lazy_static"]
 icicle_gpu = ["icicle", "rustacuda"]
+profile=[]
 
 [lib]
 bench = false
diff --git a/halo2_proofs/src/arithmetic.rs b/halo2_proofs/src/arithmetic.rs
index 8e090248a8..c29874971f 100644
--- a/halo2_proofs/src/arithmetic.rs
+++ b/halo2_proofs/src/arithmetic.rs
@@ -146,9 +146,31 @@ pub fn small_multiexp<C: CurveAffine>(coeffs: &[C::Scalar], bases: &[C]) -> C::C
 #[cfg(feature = "icicle_gpu")]
 /// Performs a multi-exponentiation operation on GPU using Icicle library
 pub fn best_multiexp_gpu<C: CurveAffine>(coeffs: &[C::Scalar], is_lagrange: bool) -> C::Curve {
-    let scalars_ptr: DeviceBuffer<::icicle::curves::bn254::ScalarField_BN254> = icicle::copy_scalars_to_device::<C>(coeffs);
+    let scalars_ptr: DeviceBuffer<::icicle::curves::bn254::ScalarField_BN254> =
+        icicle::copy_scalars_to_device::<C>(coeffs);
 
-    return icicle::multiexp_on_device::<C>(scalars_ptr, is_lagrange);
+    icicle::multiexp_on_device::<C>(scalars_ptr, is_lagrange)
+}
+
+#[cfg(feature = "icicle_gpu")]
+/// Performs `batch_size` multi-exponentiations on the GPU using the Icicle library.
+///
+/// `bases` is tiled `batch_size` times so that its total length matches `coeffs`, i.e. the
+/// same bases are paired with every `bases.len()`-sized run of `coeffs`.
+///
+/// # Panics
+/// Panics if `coeffs.len() != bases.len() * batch_size`.
+pub fn best_batch_multiexp_gpu<C: CurveAffine>(
+    coeffs: &[C::Scalar],
+    bases: &[C],
+    batch_size: usize,
+) -> Vec<C::Curve> {
+    // Validate on the host before paying for any host-to-device transfer.
+    assert_eq!(coeffs.len(), bases.len() * batch_size);
+    let scalars_ptr = icicle::copy_scalars_to_device::<C>(coeffs);
+    // Tile the bases once per multi-exponentiation in the batch.
+    let bases_ptr = icicle::copy_points_to_device::<C>(&bases.repeat(batch_size));
+    icicle::batch_multiexp_on_device::<C>(scalars_ptr, bases_ptr, batch_size)
 }
 
 /// Performs a multi-exponentiation operation.
diff --git a/halo2_proofs/src/icicle.rs b/halo2_proofs/src/icicle.rs
index 191dfad012..222d95ef57 100644
--- a/halo2_proofs/src/icicle.rs
+++ b/halo2_proofs/src/icicle.rs
@@ -1,5 +1,8 @@
 use group::ff::PrimeField;
-use icicle::{curves::bn254::{Point_BN254, ScalarField_BN254}, test_bn254::commit_bn254};
+use icicle::{
+    curves::bn254::{Point_BN254, ScalarField_BN254},
+    test_bn254::{commit_batch_bn254, commit_bn254},
+};
 use std::sync::{Arc, Once};
 
 pub use icicle::curves::bn254::PointAffineNoInfinity_BN254;
@@ -7,15 +10,21 @@ use rustacuda::memory::CopyDestination;
 use rustacuda::prelude::*;
 
 pub use halo2curves::CurveAffine;
-use std::{mem, env};
+use log::info;
+use std::{env, mem};
 
 static mut GPU_CONTEXT: Option<Context> = None;
 static mut GPU_G: Option<DeviceBuffer<PointAffineNoInfinity_BN254>> = None;
 static mut GPU_G_LAGRANGE: Option<DeviceBuffer<PointAffineNoInfinity_BN254>> = None;
 static GPU_INIT: Once = Once::new();
 
-pub fn should_use_cpu_msm(size: usize) -> bool {
-    size <= (1 << u8::from_str_radix(&env::var("ICICLE_SMALL_K").unwrap_or("8".to_string()), 10).unwrap())
+/// Returns `true` when `size <= 2^k`, where `k` comes from the `ICICLE_SMALL_CIRCUIT`
+/// environment variable and falls back to 8 when it is unset or not a valid number.
+pub fn is_small_circuit(size: usize) -> bool {
+    let small_k = env::var("ICICLE_SMALL_CIRCUIT")
+        .map_or(8, |k| k.parse::<u32>().unwrap_or(8));
+    // `2^k` with `k` at or above the pointer width exceeds every `usize`: all sizes are small.
+    1usize.checked_shl(small_k).map_or(true, |limit| size <= limit)
+}
 
 pub fn init_gpu<C: CurveAffine>(g: &[C], g_lagrange: &[C]) {
@@ -24,20 +33,27 @@ pub fn init_gpu<C: CurveAffine>(g: &[C], g_lagrange: &[C]) {
             GPU_CONTEXT = Some(rustacuda::quick_init().unwrap());
             GPU_G = Some(copy_points_to_device(g));
             GPU_G_LAGRANGE = Some(copy_points_to_device(g_lagrange));
+            info!("GPU initialized");
         });
     }
 }
 
-fn u32_from_u8(u8_arr: &[u8;32]) -> [u32;8]{
-    let mut t = [0u32;8];
-    for i in 0..8{
-        t[i] = u32::from_le_bytes([u8_arr[4*i],u8_arr[4*i+1],u8_arr[4*i+2],u8_arr[4*i+3]]);
+fn u32_from_u8(u8_arr: &[u8; 32]) -> [u32; 8] {
+    let mut t = [0u32; 8];
+    for i in 0..8 {
+        t[i] = u32::from_le_bytes([
+            u8_arr[4 * i],
+            u8_arr[4 * i + 1],
+            u8_arr[4 * i + 2],
+            u8_arr[4 * i + 3],
+        ]);
     }
-    return t; 
+    return t;
 }
 
-fn repr_from_u32<C: CurveAffine>(u32_arr: &[u32;8]) -> <C as CurveAffine>::Base {
-    let t : &[<<C as CurveAffine>::Base as PrimeField>::Repr] = unsafe { mem::transmute(&u32_arr[..]) };
+fn repr_from_u32<C: CurveAffine>(u32_arr: &[u32; 8]) -> <C as CurveAffine>::Base {
+    let t: &[<<C as CurveAffine>::Base as PrimeField>::Repr] =
+        unsafe { mem::transmute(&u32_arr[..]) };
     return PrimeField::from_repr(t[0]).unwrap();
 }
 
@@ -50,14 +66,17 @@ fn icicle_scalars_from_c<C: CurveAffine>(coeffs: &[C::Scalar]) -> Vec<ScalarFiel
     let _coeffs = [Arc::new(
         coeffs.iter().map(|x| x.to_repr()).collect::<Vec<_>>(),
     )];
-    
-    let _coeffs: &Arc<Vec<[u32;8]>> = unsafe { mem::transmute(&_coeffs) };
-    _coeffs.iter().map(|x| {
-        ScalarField_BN254::from_limbs(x)
-    }).collect::<Vec<_>>()
+
+    let _coeffs: &Arc<Vec<[u32; 8]>> = unsafe { mem::transmute(&_coeffs) };
+    _coeffs
+        .iter()
+        .map(|x| ScalarField_BN254::from_limbs(x))
+        .collect::<Vec<_>>()
 }
 
-pub fn copy_scalars_to_device<C: CurveAffine>(coeffs: &[C::Scalar]) -> DeviceBuffer<ScalarField_BN254> {
+pub fn copy_scalars_to_device<C: CurveAffine>(
+    coeffs: &[C::Scalar],
+) -> DeviceBuffer<ScalarField_BN254> {
     let scalars = icicle_scalars_from_c::<C>(coeffs);
 
     DeviceBuffer::from_slice(scalars.as_slice()).unwrap()
@@ -65,39 +84,56 @@ pub fn copy_scalars_to_device<C: CurveAffine>(coeffs: &[C::Scalar]) -> DeviceBuf
 
 fn icicle_points_from_c<C: CurveAffine>(bases: &[C]) -> Vec<PointAffineNoInfinity_BN254> {
     let _bases = [Arc::new(
-        bases.iter().map(|p| {
-            let coordinates = p.coordinates().unwrap();
-            [coordinates.x().to_repr(),coordinates.y().to_repr()]
-        }).collect::<Vec<_>>(),
+        bases
+            .iter()
+            .map(|p| {
+                let coordinates = p.coordinates().unwrap();
+                [coordinates.x().to_repr(), coordinates.y().to_repr()]
+            })
+            .collect::<Vec<_>>(),
     )];
-    
-    let _bases: &Arc<Vec<[[u8;32];2]>> = unsafe { mem::transmute(&_bases) };
-    _bases.iter().map(|x| {
-        let tx = u32_from_u8(&x[0]);
-        let ty = u32_from_u8(&x[1]);
-        PointAffineNoInfinity_BN254::from_limbs(&tx,&ty)
-    }).collect::<Vec<_>>()
+
+    let _bases: &Arc<Vec<[[u8; 32]; 2]>> = unsafe { mem::transmute(&_bases) };
+    _bases
+        .iter()
+        .map(|x| {
+            let tx = u32_from_u8(&x[0]);
+            let ty = u32_from_u8(&x[1]);
+            PointAffineNoInfinity_BN254::from_limbs(&tx, &ty)
+        })
+        .collect::<Vec<_>>()
 }
 
-pub fn copy_points_to_device<C: CurveAffine>(bases: &[C]) -> DeviceBuffer<PointAffineNoInfinity_BN254> {
+pub fn copy_points_to_device<C: CurveAffine>(
+    bases: &[C],
+) -> DeviceBuffer<PointAffineNoInfinity_BN254> {
     let points = icicle_points_from_c(bases);
-    
+
     DeviceBuffer::from_slice(points.as_slice()).unwrap()
 }
 
-fn c_from_icicle_point<C: CurveAffine>(commit_res: Point_BN254) -> C::Curve {    
-    let (x , y) = if is_infinity_point(commit_res){
-        (repr_from_u32::<C>(&[0u32;8]), repr_from_u32::<C>(&[0u32;8]))
-    } else{
+fn c_from_icicle_point<C: CurveAffine>(commit_res: Point_BN254) -> C::Curve {
+    let (x, y) = if is_infinity_point(commit_res) {
+        (
+            repr_from_u32::<C>(&[0u32; 8]),
+            repr_from_u32::<C>(&[0u32; 8]),
+        )
+    } else {
         let affine_res_from_cuda = commit_res.to_affine();
-        (repr_from_u32::<C>(&affine_res_from_cuda.x.s), repr_from_u32::<C>(&affine_res_from_cuda.y.s))
+        (
+            repr_from_u32::<C>(&affine_res_from_cuda.x.s),
+            repr_from_u32::<C>(&affine_res_from_cuda.y.s),
+        )
     };
 
-    let affine = C::from_xy(x,y).unwrap();
+    let affine = C::from_xy(x, y).unwrap();
     return affine.to_curve();
 }
 
-pub fn multiexp_on_device<C: CurveAffine>(mut coeffs: DeviceBuffer<ScalarField_BN254>, is_lagrange: bool) -> C::Curve {    
+pub fn multiexp_on_device<C: CurveAffine>(
+    mut coeffs: DeviceBuffer<ScalarField_BN254>,
+    is_lagrange: bool,
+) -> C::Curve {
     let base_ptr: &mut DeviceBuffer<PointAffineNoInfinity_BN254>;
     unsafe {
         if is_lagrange {
@@ -110,10 +146,23 @@ pub fn multiexp_on_device<C: CurveAffine>(mut coeffs: DeviceBuffer<ScalarField_B
     let d_commit_result = commit_bn254(base_ptr, &mut coeffs, 10);
 
     let mut h_commit_result = Point_BN254::zero();
-    d_commit_result
-        .copy_to(&mut h_commit_result)
-        .unwrap();
+    d_commit_result.copy_to(&mut h_commit_result).unwrap();
 
     c_from_icicle_point::<C>(h_commit_result)
 }
 
+/// Runs a batched commitment on the device and converts each result to a curve point.
+pub fn batch_multiexp_on_device<C: CurveAffine>(
+    mut coeffs: DeviceBuffer<ScalarField_BN254>,
+    mut bases: DeviceBuffer<PointAffineNoInfinity_BN254>,
+    batch_size: usize,
+) -> Vec<C::Curve> {
+    let device_results = commit_batch_bn254(&mut bases, &mut coeffs, batch_size);
+    // Host-side buffer with one zero-initialised slot per batch entry.
+    let mut host_results = vec![Point_BN254::zero(); batch_size];
+    device_results.copy_to(&mut host_results[..]).unwrap();
+    host_results
+        .into_iter()
+        .map(c_from_icicle_point::<C>)
+        .collect()
+}
diff --git a/halo2_proofs/src/plonk/permutation/prover.rs b/halo2_proofs/src/plonk/permutation/prover.rs
index d6b108554d..9e3d97d628 100644
--- a/halo2_proofs/src/plonk/permutation/prover.rs
+++ b/halo2_proofs/src/plonk/permutation/prover.rs
@@ -18,6 +18,9 @@ use crate::{
     transcript::{EncodedChallenge, TranscriptWrite},
 };
 
+#[cfg(feature = "icicle_gpu")]
+use crate::icicle;
+
 pub(crate) struct CommittedSet<C: CurveAffine> {
     pub(crate) permutation_product_poly: Polynomial<C::Scalar, Coeff>,
     pub(crate) permutation_product_coset: Polynomial<C::Scalar, ExtendedLagrangeCoeff>,
@@ -80,6 +83,8 @@ impl Argument {
         let mut last_z = C::Scalar::ONE;
 
         let mut sets = vec![];
+        let mut z_set = vec![];
+        let mut blind_set = vec![];
 
         for (columns, permutations) in self
             .columns
@@ -165,21 +170,13 @@ impl Argument {
             }
             // Set new last_z
             last_z = z[params.n() as usize - (blinding_factors + 1)];
-
             let blind = Blind(C::Scalar::random(&mut rng));
-
-            let permutation_product_commitment_projective = params.commit_lagrange(&z, blind);
-            let permutation_product_blind = blind;
+            z_set.push(z.clone());
+            blind_set.push(blind);
             let z = domain.lagrange_to_coeff(z);
             let permutation_product_poly = z.clone();
-
             let permutation_product_coset = domain.coeff_to_extended(z.clone());
-
-            let permutation_product_commitment =
-                permutation_product_commitment_projective.to_affine();
-
-            // Hash the permutation product commitment
-            transcript.write_point(permutation_product_commitment)?;
+            let permutation_product_blind = blind;
 
             sets.push(CommittedSet {
                 permutation_product_poly,
@@ -188,6 +185,37 @@ impl Argument {
             });
         }
 
+        #[cfg(feature = "icicle_gpu")]
+        if std::env::var("ENABLE_ICICLE_GPU").is_ok()
+            && z_set
+                .first()
+                .map_or(false, |z| icicle::is_small_circuit(z.len()))
+        {
+            // Batch path for small circuits. `first()` keeps an argument with no permutation
+            // columns (empty `z_set`) off this path instead of panicking on `z_set[0]`.
+            let permutation_product_commitment_projectives =
+                params.commit_lagrange_batch(&z_set, &blind_set);
+            for commitment_projective in permutation_product_commitment_projectives.iter() {
+                let permutation_product_commitment = commitment_projective.to_affine();
+
+                // Hash the permutation product commitment, propagating transcript errors
+                // to the caller rather than panicking.
+                transcript.write_point(permutation_product_commitment)?;
+            }
+
+            return Ok(Committed { sets });
+        }
+
+        //NOTE: Since commit_lagrange checks for icicle_gpu feature internally, we can delegate the decision to fall back
+        // to CPU to it instead of duplicating code here for when icicle_gpu is not enabled
+        for (z, blind) in z_set.iter().zip(blind_set.iter()) {
+            let permutation_product_commitment = params.commit_lagrange(z, *blind).to_affine();
+
+            // Hash the permutation product commitment, propagating transcript errors
+            // to the caller rather than panicking.
+            transcript.write_point(permutation_product_commitment)?;
+        }
+
         Ok(Committed { sets })
     }
 }
diff --git a/halo2_proofs/src/plonk/prover.rs b/halo2_proofs/src/plonk/prover.rs
index abe3b6e40e..a9a5f4ec19 100644
--- a/halo2_proofs/src/plonk/prover.rs
+++ b/halo2_proofs/src/plonk/prover.rs
@@ -30,6 +30,11 @@ use crate::{
 };
 use group::prime::PrimeCurveAffine;
 
+#[cfg(feature = "icicle_gpu")]
+use crate::icicle;
+use log::{debug, info};
+use std::time::Instant;
+
 /// This creates a proof for the provided `circuit` when given the public
 /// parameters `params` and the proving key [`ProvingKey`] that was
 /// generated previously for the same circuit. The provided `instances`
@@ -298,6 +303,8 @@ where
         }
     }
 
+    #[cfg(feature = "profile")]
+    let start = std::time::Instant::now();
     let (advice, challenges) = {
         let mut advice = vec![
             AdviceSingle::<Scheme::Curve, LagrangeCoeff> {
@@ -387,11 +394,42 @@ where
                         }
                     })
                     .collect();
+
+                let now = std::time::Instant::now();
+                // Batch on the GPU only for small circuits; an empty column set skips that path.
+                #[cfg(feature = "icicle_gpu")]
+                let use_gpu_batch = std::env::var("ENABLE_ICICLE_GPU").is_ok()
+                    && !advice_values.is_empty()
+                    && icicle::is_small_circuit(advice_values[0].len());
+                #[cfg(feature = "icicle_gpu")]
+                let advice_commitments_projective: Vec<_> = if use_gpu_batch {
+                    let commitments = params.commit_lagrange_batch(&advice_values, &blinds);
+                    debug!(
+                        "GPU: advice_commitments_projective of length {} took: {}",
+                        commitments.len(),
+                        now.elapsed().as_millis()
+                    );
+                    commitments
+                } else {
+                    advice_values
+                        .iter()
+                        .zip(blinds.iter())
+                        .map(|(poly, blind)| params.commit_lagrange(poly, *blind))
+                        .collect()
+                };
+                #[cfg(not(feature = "icicle_gpu"))]
                 let advice_commitments_projective: Vec<_> = advice_values
                     .iter()
                     .zip(blinds.iter())
                     .map(|(poly, blind)| params.commit_lagrange(poly, *blind))
                     .collect();
+                #[cfg(not(feature = "icicle_gpu"))]
+                debug!(
+                    "CPU: advice_commitments_projective of length {} took: {}",
+                    advice_commitments_projective.len(),
+                    now.elapsed().as_millis()
+                );
+
                 let mut advice_commitments =
                     vec![Scheme::Curve::identity(); advice_commitments_projective.len()];
                 <Scheme::Curve as CurveAffine>::CurveExt::batch_normalize(
@@ -428,10 +466,21 @@ where
 
         (advice, challenges)
     };
+    #[cfg(feature = "profile")]
+    info!(
+        "Advice and Challenge generation: {} ms",
+        start.elapsed().as_millis()
+    );
 
     // Sample theta challenge for keeping lookup columns linearly independent
+    #[cfg(feature = "profile")]
+    let start = std::time::Instant::now();
     let theta: ChallengeTheta<_> = transcript.squeeze_challenge_scalar();
+    #[cfg(feature = "profile")]
+    info!("theta generation: {} ms", start.elapsed().as_millis());
 
+    #[cfg(feature = "profile")]
+    let start = std::time::Instant::now();
     let lookups: Vec<Vec<mv_lookup::prover::Prepared<Scheme::Curve>>> = instance
         .iter()
         .zip(advice.iter())
@@ -458,14 +507,25 @@ where
                 .collect()
         })
         .collect::<Result<Vec<_>, _>>()?;
-
+    #[cfg(feature = "profile")]
+    info!("Lookups prepare: {} ms", start.elapsed().as_millis());
     // Sample beta challenge
+    #[cfg(feature = "profile")]
+    let start = std::time::Instant::now();
     let beta: ChallengeBeta<_> = transcript.squeeze_challenge_scalar();
+    #[cfg(feature = "profile")]
+    info!("beta generation: {} ms", start.elapsed().as_millis());
 
     // Sample gamma challenge
+    #[cfg(feature = "profile")]
+    let start = std::time::Instant::now();
     let gamma: ChallengeGamma<_> = transcript.squeeze_challenge_scalar();
+    #[cfg(feature = "profile")]
+    info!("gamma generation: {} ms", start.elapsed().as_millis());
 
     // Commit to permutations.
+    #[cfg(feature = "profile")]
+    let start = std::time::Instant::now();
     let permutations: Vec<permutation::prover::Committed<Scheme::Curve>> = instance
         .iter()
         .zip(advice.iter())
@@ -484,7 +544,11 @@ where
             )
         })
         .collect::<Result<Vec<_>, _>>()?;
+    #[cfg(feature = "profile")]
+    info!("permutation commit: {} ms", start.elapsed().as_millis());
 
+    #[cfg(feature = "profile")]
+    let start = std::time::Instant::now();
     let lookups: Vec<Vec<mv_lookup::prover::Committed<Scheme::Curve>>> = lookups
         .into_iter()
         .map(|lookups| -> Result<Vec<_>, _> {
@@ -495,7 +559,14 @@ where
                 .collect::<Result<Vec<_>, _>>()
         })
         .collect::<Result<Vec<_>, _>>()?;
+    #[cfg(feature = "profile")]
+    info!(
+        "lookups commit_grand_sum: {} ms",
+        start.elapsed().as_millis()
+    );
 
+    #[cfg(feature = "profile")]
+    let start = std::time::Instant::now();
     let shuffles: Vec<Vec<shuffle::prover::Committed<Scheme::Curve>>> = instance
         .iter()
         .zip(advice.iter())
@@ -524,13 +595,26 @@ where
         })
         .collect::<Result<Vec<_>, _>>()?;
 
+    #[cfg(feature = "profile")]
+    info!("shuffle commit_product: {} ms", start.elapsed().as_millis());
+
     // Commit to the vanishing argument's random polynomial for blinding h(x_3)
+    #[cfg(feature = "profile")]
+    let start = std::time::Instant::now();
     let vanishing = vanishing::Argument::commit(params, domain, &mut rng, transcript)?;
+    #[cfg(feature = "profile")]
+    info!("vanishing commit: {} ms", start.elapsed().as_millis());
 
     // Obtain challenge for keeping all separate gates linearly independent
+    #[cfg(feature = "profile")]
+    let start = std::time::Instant::now();
     let y: ChallengeY<_> = transcript.squeeze_challenge_scalar();
+    #[cfg(feature = "profile")]
+    info!("y generation: {} ms", start.elapsed().as_millis());
 
     // Calculate the advice polys
+    #[cfg(feature = "profile")]
+    let start = std::time::Instant::now();
     let advice: Vec<AdviceSingle<Scheme::Curve, Coeff>> = advice
         .into_iter()
         .map(
@@ -548,8 +632,14 @@ where
             },
         )
         .collect();
-
+    #[cfg(feature = "profile")]
+    info!(
+        "advice lagrange_to_coeff: {} ms",
+        start.elapsed().as_millis()
+    );
     // Evaluate the h(X) polynomial
+    #[cfg(feature = "profile")]
+    let start = std::time::Instant::now();
     let h_poly = pk.ev.evaluate_h(
         pk,
         &advice
@@ -569,9 +659,15 @@ where
         &shuffles,
         &permutations,
     );
+    #[cfg(feature = "profile")]
+    info!("h_poly: {} ms", start.elapsed().as_millis());
 
     // Construct the vanishing argument's h(X) commitments
+    #[cfg(feature = "profile")]
+    let start = std::time::Instant::now();
     let vanishing = vanishing.construct(params, domain, h_poly, &mut rng, transcript)?;
+    #[cfg(feature = "profile")]
+    info!("vanishing construction: {} ms", start.elapsed().as_millis());
 
     let x: ChallengeX<_> = transcript.squeeze_challenge_scalar();
     let xn = x.pow([params.n()]);
@@ -598,6 +694,8 @@ where
         }
     }
 
+    #[cfg(feature = "profile")]
+    let start = std::time::Instant::now();
     // Compute and hash advice evals for each circuit instance
     for advice in advice.iter() {
         // Evaluate polynomials at omega^i x
@@ -715,6 +813,9 @@ where
         // We query the h(X) polynomial at x
         .chain(vanishing.open(x));
 
+    #[cfg(feature = "profile")]
+    info!("evaluations: {} ms", start.elapsed().as_millis());
+
     #[cfg(feature = "counter")]
     {
         use crate::{FFT_COUNTER, MSM_COUNTER};
@@ -727,10 +828,15 @@ where
         *FFT_COUNTER.lock().unwrap() = BTreeMap::new();
     }
 
+    #[cfg(feature = "profile")]
+    let start = std::time::Instant::now();
     let prover = P::new(params);
-    prover
+    let proof = prover
         .create_proof(rng, transcript, instances)
-        .map_err(|_| Error::ConstraintSystemFailure)
+        .map_err(|_| Error::ConstraintSystemFailure);
+    #[cfg(feature = "profile")]
+    info!("prover.create_proof : {} ms", start.elapsed().as_millis());
+    proof
 }
 
 #[test]
diff --git a/halo2_proofs/src/plonk/vanishing/prover.rs b/halo2_proofs/src/plonk/vanishing/prover.rs
index 7943086826..fa6a2d9e8e 100644
--- a/halo2_proofs/src/plonk/vanishing/prover.rs
+++ b/halo2_proofs/src/plonk/vanishing/prover.rs
@@ -17,6 +17,9 @@ use crate::{
     transcript::{EncodedChallenge, TranscriptWrite},
 };
 
+#[cfg(feature = "icicle_gpu")]
+use crate::icicle;
+
 pub(in crate::plonk) struct Committed<C: CurveAffine> {
     random_poly: Polynomial<C::Scalar, Coeff>,
     random_blind: Blind<C::Scalar>,
@@ -126,11 +129,28 @@ impl<C: CurveAffine> Committed<C> {
             .collect();
 
         // Compute commitments to each h(X) piece
+        #[cfg(feature = "icicle_gpu")]
+        let mut h_commitments_projective: Vec<_>;
+        #[cfg(feature = "icicle_gpu")]
+        if std::env::var("ENABLE_ICICLE_GPU").is_ok()
+            && icicle::is_small_circuit(params.n() as usize)
+        {
+            h_commitments_projective = params.commit_batch(&h_pieces, &h_blinds);
+        } else {
+            h_commitments_projective = h_pieces
+                .iter()
+                .zip(h_blinds.iter())
+                .map(|(h_piece, blind)| params.commit(h_piece, *blind))
+                .collect();
+        }
+
+        #[cfg(not(feature = "icicle_gpu"))]
         let h_commitments_projective: Vec<_> = h_pieces
             .iter()
             .zip(h_blinds.iter())
             .map(|(h_piece, blind)| params.commit(h_piece, *blind))
             .collect();
+
         let mut h_commitments = vec![C::identity(); h_commitments_projective.len()];
         C::Curve::batch_normalize(&h_commitments_projective, &mut h_commitments);
         let h_commitments = h_commitments;
diff --git a/halo2_proofs/src/poly/commitment.rs b/halo2_proofs/src/poly/commitment.rs
index 590767e68e..9e5d1891e7 100644
--- a/halo2_proofs/src/poly/commitment.rs
+++ b/halo2_proofs/src/poly/commitment.rs
@@ -66,6 +66,16 @@ pub trait Params<'params, C: CurveAffine>: Sized + Clone {
         r: Blind<C::ScalarExt>,
     ) -> C::CurveExt;
 
+    #[cfg(feature = "icicle_gpu")]
+    /// Commits to a batch of polynomials given by their evaluations over the $2^k$ size
+    /// evaluation domain, returning one commitment per polynomial. `rs` carries one blinding
+    /// factor per polynomial; an implementation may ignore it (the KZG one does).
+    fn commit_lagrange_batch(
+        &self,
+        polys: &Vec<Polynomial<C::ScalarExt, LagrangeCoeff>>,
+        rs: &Vec<Blind<C::ScalarExt>>,
+    ) -> Vec<C::CurveExt>;
+
     /// Writes params to a buffer.
     fn write<W: io::Write>(&self, writer: &mut W) -> io::Result<()>;
 
@@ -87,6 +97,16 @@ pub trait ParamsProver<'params, C: CurveAffine>: Params<'params, C> {
     fn commit(&self, poly: &Polynomial<C::ScalarExt, Coeff>, r: Blind<C::ScalarExt>)
         -> C::CurveExt;
 
+    #[cfg(feature = "icicle_gpu")]
+    /// Commits to a batch of polynomials, each described by its slice of coefficients,
+    /// returning one commitment per polynomial. `rs` carries one blinding factor per
+    /// polynomial; an implementation may ignore it (the KZG one does).
+    fn commit_batch(
+        &self,
+        polys: &Vec<Polynomial<C::ScalarExt, Coeff>>,
+        rs: &Vec<Blind<C::ScalarExt>>,
+    ) -> Vec<C::CurveExt>;
+
     /// Getter for g generators
     fn get_g(&self) -> &[C];
 
diff --git a/halo2_proofs/src/poly/ipa/commitment.rs b/halo2_proofs/src/poly/ipa/commitment.rs
index 96c98d5fbc..095d780313 100644
--- a/halo2_proofs/src/poly/ipa/commitment.rs
+++ b/halo2_proofs/src/poly/ipa/commitment.rs
@@ -102,6 +102,31 @@ impl<'params, C: CurveAffine> Params<'params, C> for ParamsIPA<C> {
         best_multiexp_cpu::<C>(&tmp_scalars, &tmp_bases)
     }
 
+    #[cfg(feature = "icicle_gpu")]
+    /// Falls back to one CPU MSM per polynomial.
+    fn commit_lagrange_batch(
+        &self,
+        polys: &Vec<Polynomial<C::Scalar, LagrangeCoeff>>,
+        rs: &Vec<Blind<C::Scalar>>,
+    ) -> Vec<C::Curve> {
+        // Every MSM uses the same bases (`g_lagrange` followed by `w`), so build them once
+        // instead of re-collecting them for each polynomial.
+        let mut bases = Vec::with_capacity(self.g_lagrange.len() + 1);
+        bases.extend(self.g_lagrange.iter());
+        bases.push(self.w);
+        polys
+            .iter()
+            .zip(rs.iter())
+            .map(|(poly, r)| {
+                // Scalars: the polynomial's values followed by its blinding factor.
+                let mut scalars = Vec::with_capacity(poly.len() + 1);
+                scalars.extend(poly.iter());
+                scalars.push(r.0);
+                best_multiexp_cpu::<C>(&scalars, &bases)
+            })
+            .collect()
+    }
+
     /// Writes params to a buffer.
     fn write<W: io::Write>(&self, writer: &mut W) -> io::Result<()> {
         writer.write_all(&self.k.to_le_bytes())?;
@@ -222,6 +247,31 @@ impl<'params, C: CurveAffine> ParamsProver<'params, C> for ParamsIPA<C> {
         best_multiexp_cpu::<C>(&tmp_scalars, &tmp_bases)
     }
 
+    #[cfg(feature = "icicle_gpu")]
+    /// Falls back to one CPU MSM per polynomial.
+    fn commit_batch(
+        &self,
+        polys: &Vec<Polynomial<C::Scalar, Coeff>>,
+        rs: &Vec<Blind<C::Scalar>>,
+    ) -> Vec<C::Curve> {
+        // Every MSM uses the same bases (`g` followed by `w`), so build them once instead
+        // of re-collecting them for each polynomial.
+        let mut bases = Vec::with_capacity(self.g.len() + 1);
+        bases.extend(self.g.iter());
+        bases.push(self.w);
+        polys
+            .iter()
+            .zip(rs.iter())
+            .map(|(poly, r)| {
+                // Scalars: the polynomial's coefficients followed by its blinding factor.
+                let mut scalars = Vec::with_capacity(poly.len() + 1);
+                scalars.extend(poly.iter());
+                scalars.push(r.0);
+                best_multiexp_cpu::<C>(&scalars, &bases)
+            })
+            .collect()
+    }
+
     fn get_g(&self) -> &[C] {
         &self.g
     }
diff --git a/halo2_proofs/src/poly/ipa/commitment/prover.rs b/halo2_proofs/src/poly/ipa/commitment/prover.rs
index 24394f1e56..ab3b895fb8 100644
--- a/halo2_proofs/src/poly/ipa/commitment/prover.rs
+++ b/halo2_proofs/src/poly/ipa/commitment/prover.rs
@@ -112,8 +112,10 @@ pub fn create_proof<
         let value_r_j = compute_inner_product(&p_prime[0..half], &b[half..]);
         let l_j_randomness = C::Scalar::random(&mut rng);
         let r_j_randomness = C::Scalar::random(&mut rng);
-        let l_j = l_j + &best_multiexp_cpu(&[value_l_j * &z, l_j_randomness], &[params.u, params.w]);
-        let r_j = r_j + &best_multiexp_cpu(&[value_r_j * &z, r_j_randomness], &[params.u, params.w]);
+        let l_j =
+            l_j + &best_multiexp_cpu(&[value_l_j * &z, l_j_randomness], &[params.u, params.w]);
+        let r_j =
+            r_j + &best_multiexp_cpu(&[value_r_j * &z, r_j_randomness], &[params.u, params.w]);
         let l_j = l_j.to_affine();
         let r_j = r_j.to_affine();
 
diff --git a/halo2_proofs/src/poly/kzg/commitment.rs b/halo2_proofs/src/poly/kzg/commitment.rs
index 1ce330dff1..00ab582159 100644
--- a/halo2_proofs/src/poly/kzg/commitment.rs
+++ b/halo2_proofs/src/poly/kzg/commitment.rs
@@ -1,7 +1,7 @@
 use crate::arithmetic::{best_multiexp_cpu, g_to_lagrange, parallelize};
 
 #[cfg(feature = "icicle_gpu")]
-use crate::arithmetic::best_multiexp_gpu;
+use crate::arithmetic::{best_batch_multiexp_gpu, best_multiexp_gpu};
 #[cfg(feature = "icicle_gpu")]
 use crate::icicle;
 
@@ -17,7 +17,7 @@ use rand_core::{OsRng, RngCore};
 use std::fmt::Debug;
 use std::marker::PhantomData;
 
-use std::{io, env};
+use std::{env, io};
 
 use super::msm::MSMKZG;
 
@@ -149,7 +149,6 @@ where
         g2: E::G2Affine,
         s_g2: E::G2Affine,
     ) -> Self {
-
         // let g_lagrange = if let Some(g_l) = g_lagrange {
         //     g_l
         // } else {
@@ -344,7 +343,7 @@ where
         assert!(bases.len() >= size);
 
         #[cfg(feature = "icicle_gpu")]
-        if env::var("ENABLE_ICICLE_GPU").is_ok() && !icicle::should_use_cpu_msm(size) {
+        if env::var("ENABLE_ICICLE_GPU").is_ok() && !icicle::is_small_circuit(size) {
             best_multiexp_gpu::<E::G1Affine>(&scalars, true)
         } else {
             best_multiexp_cpu(&scalars, &bases[0..size])
@@ -354,6 +353,29 @@ where
         best_multiexp_cpu(&scalars, &bases[0..size])
     }
 
+    #[cfg(feature = "icicle_gpu")]
+    fn commit_lagrange_batch(
+        &self,
+        polys: &Vec<Polynomial<E::Scalar, LagrangeCoeff>>,
+        _: &Vec<Blind<E::Scalar>>,
+    ) -> Vec<E::G1> {
+        // An empty batch has nothing to commit to (and no first polynomial to size from).
+        let size = match polys.first() {
+            Some(poly) => poly.len(),
+            None => return Vec::new(),
+        };
+        let batch_size = polys.len();
+        let mut scalars = Vec::with_capacity(size * batch_size);
+        for poly in polys {
+            scalars.extend(poly.iter());
+        }
+        log::info!(
+            "Running batch icicle with size {} and batch_size {}",
+            size, batch_size
+        );
+        best_batch_multiexp_gpu::<E::G1Affine>(&scalars, &self.g_lagrange, batch_size)
+    }
+
     /// Writes params to a buffer.
     fn write<W: io::Write>(&self, writer: &mut W) -> io::Result<()> {
         self.write_custom(writer, SerdeFormat::RawBytes)
@@ -397,7 +419,7 @@ where
         assert!(bases.len() >= size);
 
         #[cfg(feature = "icicle_gpu")]
-        if env::var("ENABLE_ICICLE_GPU").is_ok() && !icicle::should_use_cpu_msm(size) {
+        if env::var("ENABLE_ICICLE_GPU").is_ok() && !icicle::is_small_circuit(size) {
             best_multiexp_gpu::<E::G1Affine>(&scalars, false)
         } else {
             best_multiexp_cpu(&scalars, &bases[0..size])
@@ -407,6 +429,29 @@ where
         best_multiexp_cpu(&scalars, &bases[0..size])
     }
 
+    #[cfg(feature = "icicle_gpu")]
+    fn commit_batch(
+        &self,
+        polys: &Vec<Polynomial<E::Scalar, Coeff>>,
+        _rs: &Vec<Blind<E::Scalar>>,
+    ) -> Vec<E::G1> {
+        // An empty batch has nothing to commit to (and no first polynomial to size from).
+        let size = match polys.first() {
+            Some(poly) => poly.len(),
+            None => return Vec::new(),
+        };
+        let batch_size = polys.len();
+        let mut scalars = Vec::with_capacity(size * batch_size);
+        for poly in polys {
+            scalars.extend(poly.iter());
+        }
+        log::info!(
+            "Running batch icicle with size {} and batch_size {}",
+            size, batch_size
+        );
+        best_batch_multiexp_gpu::<E::G1Affine>(&scalars, &self.g, batch_size)
+    }
+
     fn get_g(&self) -> &[E::G1Affine] {
         &self.g
     }