diff --git a/crates/accelerate/src/sparse_pauli_op.rs b/crates/accelerate/src/sparse_pauli_op.rs index 73c4ab7a73d5..1a85daf036d9 100644 --- a/crates/accelerate/src/sparse_pauli_op.rs +++ b/crates/accelerate/src/sparse_pauli_op.rs @@ -20,11 +20,13 @@ use numpy::prelude::*; use numpy::{PyArray1, PyArray2, PyReadonlyArray1, PyReadonlyArray2, PyUntypedArrayMethods}; use hashbrown::HashMap; -use ndarray::{s, Array1, Array2, ArrayView1, ArrayView2, Axis}; +use ndarray::{s, ArrayView1, ArrayView2, Axis}; use num_complex::Complex64; use num_traits::Zero; -use qiskit_circuit::util::{c64, C_ONE, C_ZERO}; use rayon::prelude::*; +use thiserror::Error; + +use qiskit_circuit::util::{c64, C_ZERO}; use crate::rayon_ext::*; @@ -70,14 +72,6 @@ pub fn unordered_unique(py: Python, array: PyReadonlyArray2) -> (PyObject, ) } -#[derive(Clone, Copy)] -enum Pauli { - I, - X, - Y, - Z, -} - /// Pack a 2D array of Booleans into a given width. Returns an error if the input array is /// too large to be packed into u64. fn pack_bits(bool_arr: ArrayView2) -> Result, ()> { @@ -188,10 +182,9 @@ impl ZXPaulis { } /// Intermediate structure that represents readonly views onto the Python-space sparse Pauli data. -/// This is used in the chained methods so that the syntactical temporary lifetime extension can -/// occur; we can't have the readonly array temporaries only live within a method that returns -/// [ZXPaulisView], because otherwise the lifetimes of the [PyReadonlyArray] elements will be too -/// short. +/// This is used in the chained methods so that the lifetime extension can occur; we can't have the +/// readonly array temporaries only live within a method that returns [ZXPaulisView], because +/// otherwise the lifetimes of the [PyReadonlyArray] elements will be too short. pub struct ZXPaulisReadonly<'a> { x: PyReadonlyArray2<'a, bool>, z: PyReadonlyArray2<'a, bool>, @@ -325,175 +318,609 @@ impl MatrixCompressedPaulis { } } +#[derive(Clone, Debug)] +struct DecomposeOut { + z: Vec, + x: Vec, + phases: Vec, + coeffs: Vec, + scale: f64, + tol: f64, + num_qubits: usize, +} + +#[derive(Error, Debug)] +enum DecomposeError { + #[error("operators must have two dimensions, not {0}")] + BadDimension(usize), + #[error("operators must be square with a power-of-two side length, not {0:?}")] + BadShape([usize; 2]), +} +impl From for PyErr { + fn from(value: DecomposeError) -> PyErr { + PyValueError::new_err(value.to_string()) + } +} + /// Decompose a dense complex operator into the symplectic Pauli representation in the /// ZX-convention. /// /// This is an implementation of the "tensorized Pauli decomposition" presented in /// `Hantzko, Binkowski and Gupta (2023) `__. +/// +/// Implementation +/// -------------- +/// +/// The original algorithm was described recurisvely, allocating new matrices for each of the +/// block-wise sums (e.g. `op[top_left] + op[bottom_right]`). This implementation differs in two +/// major ways: +/// +/// - We do not allocate new matrices recursively, but instead produce a single copy of the input +/// and repeatedly overwrite subblocks of it at each point of the decomposition. +/// - The implementation is rewritten as an iteration rather than a recursion. The current "state" +/// of the iteration is encoded in a single machine word (the `PauliLocation` struct below). +/// +/// We do the decomposition in three "stages", with the stage changing whenever we need to change +/// the input/output types. 
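+/// The three stages correspond to the helper functions [decompose_first_level],
+/// [decompose_middle_levels] and [decompose_last_level] below.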
The first level is mathematically the same as the middle levels, it +/// just gets handled separately because it does the double duty of moving the data out of the +/// Python-space strided array into a Rust-space contiguous array that we can modify in-place. +/// The middle levels all act in-place on this newly created scratch space. Finally, at the last +/// level, we've completed the decomposition and need to be writing the result into the output +/// data structures rather than into the scratch space. +/// +/// Each "level" is handling one qubit in the operator, equivalently to the recursive procedure +/// described in the paper referenced in the docstring. This implementation is iterative +/// stack-based and in place, rather than recursive. +/// +/// We can get away with overwriting our scratch-space matrix at each point, because each +/// element of a given subblock is used exactly twice during each decomposition - once for the `a + +/// b` case, and once for the `a - b` case. The second operand is the same in both cases. +/// Illustratively, at each step we're decomposing a submatrix blockwise, where we label the blocks +/// like this: +/// +/// +---------+---------+ +---------+---------+ +/// | | | | | | +/// | I | X | | I + Z | X + Y | +/// | | | | | | +/// +---------+---------+ =====> +---------+---------+ +/// | | | | | | +/// | Y | Z | | X - Y | I - Z | +/// | | | | | | +/// +---------+---------+ +---------+---------+ +/// +/// Each addition or subtraction is done elementwise, so as long as we iterate through the two pairs +/// of coupled blocks in order in lockstep, we can write out the answers together without +/// overwriting anything we need again. We ignore all factors of 1/2 until the very last step, and +/// apply them all at once. This minimises the number of floating-point operations we have to do. +/// +/// We store the iteration order as a stack of `PauliLocation`s, whose own docstring explains how it +/// tracks the top-left corner and the size of the submatrix it represents. #[pyfunction] pub fn decompose_dense( py: Python, operator: PyReadonlyArray2, tolerance: f64, ) -> PyResult { - let num_qubits = operator.shape()[0].ilog2() as usize; - let size = 1 << num_qubits; - if operator.shape() != [size, size] { - return Err(PyValueError::new_err(format!( - "input with shape {:?} cannot be interpreted as a multiqubit operator", - operator.shape() - ))); - } - let mut paulis = vec![]; - let mut coeffs = vec![]; - if num_qubits > 0 { - decompose_dense_inner( - C_ONE, - num_qubits, - &[], - operator.as_array(), - &mut paulis, - &mut coeffs, - tolerance * tolerance, - ); - } - if coeffs.is_empty() { - Ok(ZXPaulis { - z: PyArray2::zeros_bound(py, [0, num_qubits], false).into(), - x: PyArray2::zeros_bound(py, [0, num_qubits], false).into(), - phases: PyArray1::zeros_bound(py, [0], false).into(), - coeffs: PyArray1::zeros_bound(py, [0], false).into(), - }) - } else { - // Constructing several arrays of different shapes at once is rather awkward in iterator - // logic, so we just loop manually. 
- let mut z = Array2::::uninit([paulis.len(), num_qubits]); - let mut x = Array2::::uninit([paulis.len(), num_qubits]); - let mut phases = Array1::::uninit(paulis.len()); - for (i, paulis) in paulis.drain(..).enumerate() { - let mut phase = 0u8; - for (j, pauli) in paulis.into_iter().rev().enumerate() { - match pauli { - Pauli::I => { - z[[i, j]].write(false); - x[[i, j]].write(false); - } - Pauli::X => { - z[[i, j]].write(false); - x[[i, j]].write(true); - } - Pauli::Y => { - z[[i, j]].write(true); - x[[i, j]].write(true); - phase = phase.wrapping_add(1); - } - Pauli::Z => { - z[[i, j]].write(true); - x[[i, j]].write(false); - } + let array_view = operator.as_array(); + let out = py.allow_threads(|| decompose_dense_inner(array_view, tolerance))?; + Ok(ZXPaulis { + z: PyArray1::from_vec_bound(py, out.z) + .reshape([out.phases.len(), out.num_qubits])? + .into(), + x: PyArray1::from_vec_bound(py, out.x) + .reshape([out.phases.len(), out.num_qubits])? + .into(), + phases: PyArray1::from_vec_bound(py, out.phases).into(), + coeffs: PyArray1::from_vec_bound(py, out.coeffs).into(), + }) +} + +/// Rust-only inner component of the `SparsePauliOp` decomposition. +/// +/// See the top-level documentation of [decompose_dense] for more information on the internal +/// algorithm at play. +fn decompose_dense_inner( + operator: ArrayView2, + tolerance: f64, +) -> Result { + let op_shape = match operator.shape() { + [a, b] => [*a, *b], + shape => return Err(DecomposeError::BadDimension(shape.len())), + }; + if op_shape[0].is_zero() { + return Err(DecomposeError::BadShape(op_shape)); + } + let num_qubits = op_shape[0].ilog2() as usize; + let side = 1 << num_qubits; + if op_shape != [side, side] { + return Err(DecomposeError::BadShape(op_shape)); + } + if num_qubits.is_zero() { + // We have to special-case the zero-qubit operator because our `decompose_last_level` still + // needs to "consume" a qubit. + return Ok(DecomposeOut { + z: vec![], + x: vec![], + phases: vec![], + coeffs: vec![operator[[0, 0]]], + scale: 1.0, + tol: tolerance, + num_qubits: 0, + }); + } + let (stack, mut out_list, mut scratch) = decompose_first_level(operator, num_qubits); + decompose_middle_levels(stack, &mut out_list, &mut scratch, num_qubits); + Ok(decompose_last_level( + &mut out_list, + &scratch, + num_qubits, + tolerance, + )) +} + +/// Apply the matrix-addition decomposition at the first level. +/// +/// This is split out from the middle levels because it acts on an `ArrayView2`, and is responsible +/// for copying the operator over into the contiguous scratch space. We can't write over the +/// operator the user gave us (it's not ours to do that to), and anyway, we want to drop to a chunk +/// of memory that we can 100% guarantee is contiguous, so we can elide all the stride checking. +/// We split this out so we can do the first decomposition at the same time as scanning over the +/// operator to copy it. +/// +/// # Panics +/// +/// If the number of qubits in the operator is zero. +fn decompose_first_level( + in_op: ArrayView2, + num_qubits: usize, +) -> (Vec, Vec, Vec) { + let side = 1 << num_qubits; + let mut stack = Vec::::with_capacity(4); + let mut out_list = Vec::::new(); + let mut scratch = Vec::::with_capacity(side * side); + match num_qubits { + 0 => panic!("number of qubits must be greater than zero"), + 1 => { + // If we've only got one qubit, we just want to copy the data over in the correct + // continuity and let the base case of the iteration take care of outputting it. 
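+            // (`in_op.iter()` yields the elements of the view in logical row-major order,
+            // so this copy already has the layout the 2x2 base case expects.)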
+ scratch.extend(in_op.iter()); + out_list.push(PauliLocation::begin(num_qubits)); + } + _ => { + // We don't write out the operator in contiguous-index order, but we can easily + // guarantee that we'll write to each index exactly once without reading it - we still + // visit every index, just in 2x2 blockwise order, not row-by-row. + unsafe { scratch.set_len(scratch.capacity()) }; + let mut ptr = 0usize; + + let cur_qubit = num_qubits - 1; + let mid = 1 << cur_qubit; + let loc = PauliLocation::begin(num_qubits); + let mut i_nonzero = false; + let mut x_nonzero = false; + let mut y_nonzero = false; + let mut z_nonzero = false; + + let i_row_0 = loc.row(); + let i_col_0 = loc.col(); + + let x_row_0 = loc.row(); + let x_col_0 = loc.col() + mid; + + let y_row_0 = loc.row() + mid; + let y_col_0 = loc.col(); + + let z_row_0 = loc.row() + mid; + let z_col_0 = loc.col() + mid; + + for off_row in 0..mid { + let i_row = i_row_0 + off_row; + let z_row = z_row_0 + off_row; + for off_col in 0..mid { + let i_col = i_col_0 + off_col; + let z_col = z_col_0 + off_col; + let value = in_op[[i_row, i_col]] + in_op[[z_row, z_col]]; + scratch[ptr] = value; + ptr += 1; + i_nonzero = i_nonzero || (value != C_ZERO); + } + + let x_row = x_row_0 + off_row; + let y_row = y_row_0 + off_row; + for off_col in 0..mid { + let x_col = x_col_0 + off_col; + let y_col = y_col_0 + off_col; + let value = in_op[[x_row, x_col]] + in_op[[y_row, y_col]]; + scratch[ptr] = value; + ptr += 1; + x_nonzero = x_nonzero || (value != C_ZERO); } } - phases[i].write(phase % 4); + for off_row in 0..mid { + let x_row = x_row_0 + off_row; + let y_row = y_row_0 + off_row; + for off_col in 0..mid { + let x_col = x_col_0 + off_col; + let y_col = y_col_0 + off_col; + let value = in_op[[x_row, x_col]] - in_op[[y_row, y_col]]; + scratch[ptr] = value; + ptr += 1; + y_nonzero = y_nonzero || (value != C_ZERO); + } + let i_row = i_row_0 + off_row; + let z_row = z_row_0 + off_row; + for off_col in 0..mid { + let i_col = i_col_0 + off_col; + let z_col = z_col_0 + off_col; + let value = in_op[[i_row, i_col]] - in_op[[z_row, z_col]]; + scratch[ptr] = value; + ptr += 1; + z_nonzero = z_nonzero || (value != C_ZERO); + } + } + // The middle-levels `stack` is a LIFO, so if we push in this order, we'll consider the + // Pauli terms in lexicographical order, which is the canonical order from + // `SparsePauliOp.sort`. Populating the `out_list` (an initially empty `Vec`) + // effectively reverses the stack, so we want to push its elements in the IXYZ order. + if loc.qubit() == 1 { + i_nonzero.then(|| out_list.push(loc.push_i())); + x_nonzero.then(|| out_list.push(loc.push_x())); + y_nonzero.then(|| out_list.push(loc.push_y())); + z_nonzero.then(|| out_list.push(loc.push_z())); + } else { + z_nonzero.then(|| stack.push(loc.push_z())); + y_nonzero.then(|| stack.push(loc.push_y())); + x_nonzero.then(|| stack.push(loc.push_x())); + i_nonzero.then(|| stack.push(loc.push_i())); + } } - // These are safe because the above loops write into every element. It's guaranteed that - // each of the elements of the `paulis` vec will have `num_qubits` because they're all - // reading from the same base array. 
- let z = unsafe { z.assume_init() }; - let x = unsafe { x.assume_init() }; - let phases = unsafe { phases.assume_init() }; - Ok(ZXPaulis { - z: z.into_pyarray_bound(py).into(), - x: x.into_pyarray_bound(py).into(), - phases: phases.into_pyarray_bound(py).into(), - coeffs: PyArray1::from_vec_bound(py, coeffs).into(), - }) } + (stack, out_list, scratch) } -/// Recurse worker routine of `decompose_dense`. Should be called with at least one qubit. -fn decompose_dense_inner( - factor: Complex64, +/// Iteratively decompose the matrix at all levels other than the first and last. +/// +/// This populates the `out_list` with locations. This is mathematically the same as the first +/// level of the decomposition, except now we're acting in-place on our Rust-space contiguous +/// scratch space, rather than the strided Python-space array we were originally given. +fn decompose_middle_levels( + mut stack: Vec, + out_list: &mut Vec, + scratch: &mut [Complex64], num_qubits: usize, - paulis: &[Pauli], - block: ArrayView2, - out_paulis: &mut Vec>, - out_coeffs: &mut Vec, - square_tolerance: f64, ) { - if num_qubits == 0 { - // It would be safe to `return` here, but if it's unreachable then LLVM is allowed to - // optimize out this branch entirely in release mode, which is good for a ~2% speedup. - unreachable!("should not call this with an empty operator") - } - // Base recursion case. - if num_qubits == 1 { - let mut push_if_nonzero = |extra: Pauli, value: Complex64| { - if value.norm_sqr() <= square_tolerance { - return; + let side = 1 << num_qubits; + // The stack is a LIFO, which is how we implement the depth-first iteration. Depth-first + // means `stack` never grows very large; it reaches at most `3*num_qubits - 2` elements (if all + // terms are zero all the way through the first subblock decomposition). `out_list`, on the + // other hand, can be `4 ** (num_qubits - 1)` entries in the worst-case scenario of a + // completely dense (in Pauli terms) operator. + while let Some(loc) = stack.pop() { + // Here we work pairwise, writing out the new values into both I and Z simultaneously (etc + // for X and Y) so we can re-use their scratch space and avoid re-allocating. We're doing + // the multiple assignment `(I, Z) = (I + Z, I - Z)`. + // + // See the documentation of `decompose_dense` for more information on how this works. 
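+        // (For example, when `loc.qubit() == 1` the submatrix is 4x4 and `mid == 2`; each
+        // pair of partner 2x2 blocks is combined elementwise and overwritten in place in the
+        // loops below.)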
+ let mid = 1 << loc.qubit(); + let mut i_nonzero = false; + let mut z_nonzero = false; + let i_row_0 = loc.row(); + let i_col_0 = loc.col(); + let z_row_0 = loc.row() + mid; + let z_col_0 = loc.col() + mid; + for off_row in 0..mid { + let i_loc_0 = (i_row_0 + off_row) * side + i_col_0; + let z_loc_0 = (z_row_0 + off_row) * side + z_col_0; + for off_col in 0..mid { + let i_loc = i_loc_0 + off_col; + let z_loc = z_loc_0 + off_col; + let add = scratch[i_loc] + scratch[z_loc]; + let sub = scratch[i_loc] - scratch[z_loc]; + scratch[i_loc] = add; + scratch[z_loc] = sub; + i_nonzero = i_nonzero || (add != C_ZERO); + z_nonzero = z_nonzero || (sub != C_ZERO); } - let paulis = { - let mut vec = Vec::with_capacity(paulis.len() + 1); - vec.extend_from_slice(paulis); - vec.push(extra); - vec - }; - out_paulis.push(paulis); - out_coeffs.push(value); - }; - push_if_nonzero(Pauli::I, 0.5 * factor * (block[[0, 0]] + block[[1, 1]])); - push_if_nonzero(Pauli::X, 0.5 * factor * (block[[0, 1]] + block[[1, 0]])); - push_if_nonzero( - Pauli::Y, - 0.5 * Complex64::i() * factor * (block[[0, 1]] - block[[1, 0]]), - ); - push_if_nonzero(Pauli::Z, 0.5 * factor * (block[[0, 0]] - block[[1, 1]])); - return; - } - let mut recurse_if_nonzero = |extra: Pauli, factor: Complex64, values: Array2| { - let mut is_zero = true; - for value in values.iter() { - if !value.is_zero() { - is_zero = false; - break; + } + + let mut x_nonzero = false; + let mut y_nonzero = false; + let x_row_0 = loc.row(); + let x_col_0 = loc.col() + mid; + let y_row_0 = loc.row() + mid; + let y_col_0 = loc.col(); + for off_row in 0..mid { + let x_loc_0 = (x_row_0 + off_row) * side + x_col_0; + let y_loc_0 = (y_row_0 + off_row) * side + y_col_0; + for off_col in 0..mid { + let x_loc = x_loc_0 + off_col; + let y_loc = y_loc_0 + off_col; + let add = scratch[x_loc] + scratch[y_loc]; + let sub = scratch[x_loc] - scratch[y_loc]; + scratch[x_loc] = add; + scratch[y_loc] = sub; + x_nonzero = x_nonzero || (add != C_ZERO); + y_nonzero = y_nonzero || (sub != C_ZERO); } } - if is_zero { - return; + // The middle-levels `stack` is a LIFO, so if we push in this order, we'll consider the + // Pauli terms in lexicographical order, which is the canonical order from + // `SparsePauliOp.sort`. Populating the `out_list` (an initially empty `Vec`) effectively + // reverses the stack, so we want to push its elements in the IXYZ order. + if loc.qubit() == 1 { + i_nonzero.then(|| out_list.push(loc.push_i())); + x_nonzero.then(|| out_list.push(loc.push_x())); + y_nonzero.then(|| out_list.push(loc.push_y())); + z_nonzero.then(|| out_list.push(loc.push_z())); + } else { + z_nonzero.then(|| stack.push(loc.push_z())); + y_nonzero.then(|| stack.push(loc.push_y())); + x_nonzero.then(|| stack.push(loc.push_x())); + i_nonzero.then(|| stack.push(loc.push_i())); } - let mut new_paulis = Vec::with_capacity(paulis.len() + 1); - new_paulis.extend_from_slice(paulis); - new_paulis.push(extra); - decompose_dense_inner( - factor, - num_qubits - 1, - &new_paulis, - values.view(), - out_paulis, - out_coeffs, - square_tolerance, - ); + } +} + +/// Write out the results of the final decomposition into the Pauli ZX form. +/// +/// The calculation here is the same as the previous two sets of decomposers, but we don't want to +/// write the result out into the scratch space to iterate needlessly once more; we want to +/// associate each non-zero coefficient with the final Pauli in the ZX format. 
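+///
+/// The accumulated `row`/`col` bits of each location encode the Paulis chosen at the higher
+/// levels: `push_x` set only the column bit, `push_y` only the row bit, and `push_z` both, so
+/// `x = row ^ col` and `z = row` below recover the symplectic X and Z bitvectors. The final
+/// qubit is then appended directly in ZX form: `(x, z)` of `(0, 0)` is I, `(1, 0)` is X,
+/// `(0, 1)` is Z and `(1, 1)` is Y, with each Y contributing one unit to the phase.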
+/// +/// This function applies all the factors of 1/2 that we've been skipping during the intermediate +/// decompositions. This means that the factors are applied to the output with `2 * output_len` +/// floating-point operations (real and imaginary), which is a huge reduction compared to repeatedly +/// doing it during the decomposition. +fn decompose_last_level( + out_list: &mut Vec, + scratch: &[Complex64], + num_qubits: usize, + tolerance: f64, +) -> DecomposeOut { + let side = 1 << num_qubits; + let scale = 0.5f64.powi(num_qubits as i32); + // Pessimistically allocate assuming that there will be no zero terms in the out list. We + // don't really pay much cost if we overallocate, but underallocating means that all four + // outputs have to copy their data across to a new allocation. + let mut out = DecomposeOut { + z: Vec::with_capacity(4 * num_qubits * out_list.len()), + x: Vec::with_capacity(4 * num_qubits * out_list.len()), + phases: Vec::with_capacity(4 * out_list.len()), + coeffs: Vec::with_capacity(4 * out_list.len()), + scale, + tol: (tolerance * tolerance) / (scale * scale), + num_qubits, }; - let mid = 1usize << (num_qubits - 1); - recurse_if_nonzero( - Pauli::I, - 0.5 * factor, - &block.slice(s![..mid, ..mid]) + &block.slice(s![mid.., mid..]), - ); - recurse_if_nonzero( - Pauli::X, - 0.5 * factor, - &block.slice(s![..mid, mid..]) + &block.slice(s![mid.., ..mid]), - ); - recurse_if_nonzero( - Pauli::Y, - 0.5 * Complex64::i() * factor, - &block.slice(s![..mid, mid..]) - &block.slice(s![mid.., ..mid]), - ); - recurse_if_nonzero( - Pauli::Z, - 0.5 * factor, - &block.slice(s![..mid, ..mid]) - &block.slice(s![mid.., mid..]), - ); + + for loc in out_list.drain(..) { + let row = loc.row(); + let col = loc.col(); + let base = row * side + col; + let i_value = scratch[base] + scratch[base + side + 1]; + let z_value = scratch[base] - scratch[base + side + 1]; + let x_value = scratch[base + 1] + scratch[base + side]; + let y_value = scratch[base + 1] - scratch[base + side]; + + let x = row ^ col; + let z = row; + let phase = (x & z).count_ones() as u8; + // Pushing the last Pauli onto the `loc` happens "forwards" to maintain lexicographical + // ordering in `out`, since this is the construction of the final object. + push_pauli_if_nonzero(x, z, phase, i_value, &mut out); + push_pauli_if_nonzero(x | 1, z, phase, x_value, &mut out); + push_pauli_if_nonzero(x | 1, z | 1, phase + 1, y_value, &mut out); + push_pauli_if_nonzero(x, z | 1, phase, z_value, &mut out); + } + // If we _wildly_ overallocated, then shrink back to a sensible size to avoid tying up too much + // memory as we return to Python space. + if out.z.capacity() / 4 > out.z.len() { + out.z.shrink_to_fit(); + out.x.shrink_to_fit(); + out.phases.shrink_to_fit(); + out.coeffs.shrink_to_fit(); + } + out +} + +// This generates lookup tables of the form +// const LOOKUP: [[bool; 2] 4] = [[false, false], [true, false], [false, true], [true, true]]; +// when called `pauli_lookup!(LOOKUP, 2, [_, _])`. The last argument is like a dummy version of +// an individual lookup rule, which is consumed to make an inner "loop" with a declarative macro. +macro_rules! 
pauli_lookup { + ($name:ident, $n:literal, [$head:expr$ (, $($tail:expr),*)?]) => { + static $name: [[bool; $n]; 1<<$n] = pauli_lookup!(@acc, [$($($tail),*)?], [[false], [true]]); + }; + (@acc, [$head:expr $(, $($tail:expr),*)?], [$([$($bools:tt),*]),+]) => { + pauli_lookup!(@acc, [$($($tail),*)?], [$([$($bools),*, false]),+, $([$($bools),*, true]),+]) + }; + (@acc, [], $init:expr) => { $init }; +} +pauli_lookup!(PAULI_LOOKUP_2, 2, [(), ()]); +pauli_lookup!(PAULI_LOOKUP_4, 4, [(), (), (), ()]); +pauli_lookup!(PAULI_LOOKUP_8, 8, [(), (), (), (), (), (), (), ()]); + +/// Push a complete Pauli chain into the output (`out`), if the corresponding entry is non-zero. +/// +/// `x` and `z` represent the symplectic X and Z bitvectors, packed into `usize`, where LSb n +/// corresponds to qubit `n`. +fn push_pauli_if_nonzero( + mut x: usize, + mut z: usize, + phase: u8, + value: Complex64, + out: &mut DecomposeOut, +) { + if value.norm_sqr() <= out.tol { + return; + } + + // This set of `extend` calls is effectively an 8-fold unrolling of the "natural" loop through + // each bit, where the initial `if` statements are handling the remainder (the up-to 7 + // least-significant bits). In practice, it's probably unlikely that people are decomposing + // 16q+ operators, since that's a pretty huge matrix already. + // + // The 8-fold loop unrolling is because going bit-by-bit all the way would be dominated by loop + // and bitwise-operation overhead. + + if out.num_qubits & 1 == 1 { + out.x.push(x & 1 == 1); + out.z.push(z & 1 == 1); + x >>= 1; + z >>= 1; + } + if out.num_qubits & 2 == 2 { + out.x.extend(&PAULI_LOOKUP_2[x & 0b11]); + out.z.extend(&PAULI_LOOKUP_2[z & 0b11]); + x >>= 2; + z >>= 2; + } + if out.num_qubits & 4 == 4 { + out.x.extend(&PAULI_LOOKUP_4[x & 0b1111]); + out.z.extend(&PAULI_LOOKUP_4[z & 0b1111]); + x >>= 4; + z >>= 4; + } + for _ in 0..(out.num_qubits / 8) { + out.x.extend(&PAULI_LOOKUP_8[x & 0b1111_1111]); + out.z.extend(&PAULI_LOOKUP_8[z & 0b1111_1111]); + x >>= 8; + z >>= 8; + } + + let phase = phase % 4; + let value = match phase { + 0 => Complex64::new(out.scale, 0.0) * value, + 1 => Complex64::new(0.0, out.scale) * value, + 2 => Complex64::new(-out.scale, 0.0) * value, + 3 => Complex64::new(0.0, -out.scale) * value, + _ => unreachable!("'x % 4' has only four possible values"), + }; + out.phases.push(phase); + out.coeffs.push(value); +} + +/// The "state" of an iteration step of the dense-operator decomposition routine. +/// +/// Pack the information about which row, column and qubit we're considering into a single `usize`. +/// Complex64 data is 16 bytes long and the operators are square and must be addressable in memory, +/// so the row and column are hardware limited to be of width `usize::BITS / 2 - 2` each. However, +/// we don't need to store at a granularity of 1, because the last 2x2 block we handle manually, so +/// we can remove an extra least significant bit from the row and column. Regardless of the width +/// of `usize`, we can therefore track the state for up to 30 qubits losslessly, which is greater +/// than the maximum addressable memory on a 64-bit system. 
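+/// (A 30-qubit operator is a `2^30 x 2^30` matrix of 16-byte `Complex64` values, i.e. `2^64`
+/// bytes of data.)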
+/// +/// For a 64-bit usize, the bit pattern is stored like this: +/// +/// 0b__000101__11111111111111111111111110000__11111111111111111111111110000 +/// <-6--> <------------29-------------> <------------29-------------> +/// | | | +/// | uint of the input row uint of the input column +/// | (once a 0 is appended) (once a 0 is appended) +/// | +/// current qubit under consideration +/// +/// The `qubit` field encodes the depth in the call stack that the user of the `PauliLocation` +/// should consider. When the stack is initialised (before any calculation is done), it starts at +/// the highest qubit index (`num_qubits - 1`) and decreases from there until 0. +/// +/// The `row` and `col` methods form the top-left corner of a `(2**(qubit + 1), 2**(qubit + 1))` +/// submatrix (where the top row and leftmost column are 0). The least significant `qubit + 1` +/// bits of the of row and column are therefore always zero; the 0-indexed qubit still corresponds +/// to a 2x2 block. This is why we needn't store it. +#[derive(Debug, Clone, Copy)] +struct PauliLocation(usize); + +impl PauliLocation { + // These shifts and masks are used to access the three components of the bit-packed state. + const QUBIT_SHIFT: u32 = usize::BITS - 6; + const QUBIT_MASK: usize = (usize::MAX >> Self::QUBIT_SHIFT) << Self::QUBIT_SHIFT; + const ROW_SHIFT: u32 = usize::BITS / 2 - 3; + const ROW_MASK: usize = + ((usize::MAX >> Self::ROW_SHIFT) << Self::ROW_SHIFT) & !Self::QUBIT_MASK; + const COL_SHIFT: u32 = 0; // Just for consistency. + const COL_MASK: usize = usize::MAX & !Self::ROW_MASK & !Self::QUBIT_MASK; + + /// Create the base `PauliLocation` for an entire matrix with `num_qubits` qubits. The initial + /// Pauli chain is empty. + #[inline(always)] + fn begin(num_qubits: usize) -> Self { + Self::new(0, 0, num_qubits - 1) + } + + /// Manually create a new `PauliLocation` with the given information. The logic in the rest of + /// the class assumes that `row` and `col` will end with at least `qubit + 1` zeros, since + /// these are the only valid locations. + #[inline(always)] + fn new(row: usize, col: usize, qubit: usize) -> Self { + debug_assert!(row & 1 == 0); + debug_assert!(col & 1 == 0); + debug_assert!(row < 2 * Self::ROW_SHIFT as usize); + debug_assert!(col < 2 * Self::ROW_SHIFT as usize); + debug_assert!(qubit < 64); + Self( + (qubit << Self::QUBIT_SHIFT) + | (row << Self::ROW_SHIFT >> 1) + | (col << Self::COL_SHIFT >> 1), + ) + } + + /// The row in the dense matrix that this location corresponds to. + #[inline(always)] + fn row(&self) -> usize { + ((self.0 & Self::ROW_MASK) >> Self::ROW_SHIFT) << 1 + } + + /// The column in the dense matrix that this location corresponds to. + #[inline(always)] + fn col(&self) -> usize { + ((self.0 & Self::COL_MASK) >> Self::COL_SHIFT) << 1 + } + + /// Which qubit in the Pauli chain we're currently considering. + #[inline(always)] + fn qubit(&self) -> usize { + (self.0 & Self::QUBIT_MASK) >> Self::QUBIT_SHIFT + } + + /// Create a new location corresponding to the Pauli chain so far, plus an identity on the + /// currently considered qubit. + #[inline(always)] + fn push_i(&self) -> Self { + Self::new(self.row(), self.col(), self.qubit() - 1) + } + + /// Create a new location corresponding to the Pauli chain so far, plus an X on the currently + /// considered qubit. 
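+    /// (X is the top-right block at each level, so only the column coordinate gains the
+    /// `1 << qubit` offset; compare the block diagram in the [decompose_dense] docs.)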
+ #[inline(always)] + fn push_x(&self) -> Self { + Self::new( + self.row(), + self.col() | (1 << self.qubit()), + self.qubit() - 1, + ) + } + + /// Create a new location corresponding to the Pauli chain so far, plus a Y on the currently + /// considered qubit. + #[inline(always)] + fn push_y(&self) -> Self { + Self::new( + self.row() | (1 << self.qubit()), + self.col(), + self.qubit() - 1, + ) + } + + /// Create a new location corresponding to the Pauli chain so far, plus a Z on the currently + /// considered qubit. + #[inline(always)] + fn push_z(&self) -> Self { + Self::new( + self.row() | (1 << self.qubit()), + self.col() | (1 << self.qubit()), + self.qubit() - 1, + ) + } } /// Convert the given [ZXPaulis] object to a dense 2D Numpy matrix. @@ -830,11 +1257,13 @@ pub fn sparse_pauli_op(m: &Bound) -> PyResult<()> { #[cfg(test)] mod tests { + use ndarray::{aview2, Array1}; + use super::*; use crate::test::*; - // The purpose of these tests is more about exercising the `unsafe` code; we test for full - // correctness from Python space. + // The purpose of these tests is more about exercising the `unsafe` code under Miri; we test for + // full numerical correctness from Python space. fn example_paulis() -> MatrixCompressedPaulis { MatrixCompressedPaulis { @@ -853,6 +1282,166 @@ mod tests { } } + /// Helper struct for the decomposition testing. This is a subset of the `DecomposeOut` + /// struct, skipping the unnecessary algorithm-state components of it. + /// + /// If we add a more Rust-friendly interface to `SparsePauliOp` in the future, hopefully this + /// can be removed. + #[derive(Clone, PartialEq, Debug)] + struct DecomposeMinimal { + z: Vec, + x: Vec, + phases: Vec, + coeffs: Vec, + num_qubits: usize, + } + impl From for DecomposeMinimal { + fn from(value: DecomposeOut) -> Self { + Self { + z: value.z, + x: value.x, + phases: value.phases, + coeffs: value.coeffs, + num_qubits: value.num_qubits, + } + } + } + impl From for DecomposeMinimal { + fn from(value: MatrixCompressedPaulis) -> Self { + let phases = value + .z_like + .iter() + .zip(value.x_like.iter()) + .map(|(z, x)| ((z & x).count_ones() % 4) as u8) + .collect::>(); + let coeffs = value + .coeffs + .iter() + .zip(phases.iter()) + .map(|(c, phase)| match phase { + 0 => *c, + 1 => Complex64::new(-c.im, c.re), + 2 => Complex64::new(-c.re, -c.im), + 3 => Complex64::new(c.im, -c.re), + _ => panic!("phase should only in [0, 4)"), + }) + .collect(); + let z = value + .z_like + .iter() + .flat_map(|digit| (0..value.num_qubits).map(move |i| (digit & (1 << i)) != 0)) + .collect(); + let x = value + .x_like + .iter() + .flat_map(|digit| (0..value.num_qubits).map(move |i| (digit & (1 << i)) != 0)) + .collect(); + Self { + z, + x, + phases, + coeffs, + num_qubits: value.num_qubits as usize, + } + } + } + + #[test] + fn decompose_empty_operator_fails() { + assert!(matches!( + decompose_dense_inner(aview2::(&[]), 0.0), + Err(DecomposeError::BadShape(_)), + )); + } + + #[test] + fn decompose_0q_operator() { + let coeff = Complex64::new(1.5, -0.5); + let arr = [[coeff]]; + let out = decompose_dense_inner(aview2(&arr), 0.0).unwrap(); + let expected = DecomposeMinimal { + z: vec![], + x: vec![], + phases: vec![], + coeffs: vec![coeff], + num_qubits: 0, + }; + assert_eq!(DecomposeMinimal::from(out), expected); + } + + #[test] + fn decompose_1q_operator() { + // Be sure that any sums are given in canonical order of the output, or there will be + // spurious test failures. 
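+        // Each entry is an `(x_like, z_like)` pair of bit patterns for
+        // `MatrixCompressedPaulis`; the trailing comments name the Pauli terms they encode.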
+ let paulis = [ + (vec![0], vec![0]), // I + (vec![1], vec![0]), // X + (vec![1], vec![1]), // Y + (vec![0], vec![1]), // Z + (vec![0, 1], vec![0, 0]), // I, X + (vec![0, 1], vec![0, 1]), // I, Y + (vec![0, 0], vec![0, 1]), // I, Z + (vec![1, 1], vec![0, 1]), // X, Y + (vec![1, 0], vec![1, 1]), // X, Z + (vec![1, 0], vec![1, 1]), // Y, Z + (vec![1, 1, 0], vec![0, 1, 1]), // X, Y, Z + ]; + let coeffs = [ + Complex64::new(1.5, -0.5), + Complex64::new(-0.25, 2.0), + Complex64::new(0.75, 0.75), + ]; + for (x_like, z_like) in paulis { + let paulis = MatrixCompressedPaulis { + num_qubits: 1, + coeffs: coeffs[0..x_like.len()].to_owned(), + x_like, + z_like, + }; + let arr = Array1::from_vec(to_matrix_dense_inner(&paulis, false)) + .into_shape((2, 2)) + .unwrap(); + let expected: DecomposeMinimal = paulis.into(); + let actual: DecomposeMinimal = decompose_dense_inner(arr.view(), 0.0).unwrap().into(); + assert_eq!(actual, expected); + } + } + + #[test] + fn decompose_3q_operator() { + // Be sure that any sums are given in canonical order of the output, or there will be + // spurious test failures. + let paulis = [ + (vec![0], vec![0]), // III + (vec![1], vec![0]), // IIX + (vec![2], vec![2]), // IYI + (vec![0], vec![4]), // ZII + (vec![6], vec![6]), // YYI + (vec![7], vec![7]), // YYY + (vec![1, 6, 7], vec![1, 6, 7]), // IIY, YYI, YYY + (vec![1, 2, 0], vec![0, 2, 4]), // IIX, IYI, ZII + ]; + let coeffs = [ + Complex64::new(1.5, -0.5), + Complex64::new(-0.25, 2.0), + Complex64::new(0.75, 0.75), + ]; + for (x_like, z_like) in paulis { + let paulis = MatrixCompressedPaulis { + num_qubits: 3, + coeffs: coeffs[0..x_like.len()].to_owned(), + x_like, + z_like, + }; + let arr = Array1::from_vec(to_matrix_dense_inner(&paulis, false)) + .into_shape((8, 8)) + .unwrap(); + let expected: DecomposeMinimal = paulis.into(); + let actual: DecomposeMinimal = decompose_dense_inner(arr.view(), 0.0).unwrap().into(); + assert_eq!(actual, expected); + } + } + #[test] fn dense_threaded_and_serial_equal() { let paulis = example_paulis(); diff --git a/releasenotes/notes/faster-pauli-decomposition-faf2be01a6e75fff.yaml b/releasenotes/notes/faster-pauli-decomposition-faf2be01a6e75fff.yaml new file mode 100644 index 000000000000..56ad1a725f9a --- /dev/null +++ b/releasenotes/notes/faster-pauli-decomposition-faf2be01a6e75fff.yaml @@ -0,0 +1,7 @@ +--- +features_quantum_info: + - | + The performance of :meth:`.SparsePauliOp.from_operator` has been optimized on top of the + algorithm improvements methods introduced in Qiskit 1.0. It is now approximately five times + faster than before for fully dense matrices, taking approximately 40ms to decompose a 10q + operator involving all Pauli terms. 
diff --git a/test/python/quantum_info/operators/symplectic/test_sparse_pauli_op.py b/test/python/quantum_info/operators/symplectic/test_sparse_pauli_op.py index 65f19eb8e44c..3f96cd32e15f 100644 --- a/test/python/quantum_info/operators/symplectic/test_sparse_pauli_op.py +++ b/test/python/quantum_info/operators/symplectic/test_sparse_pauli_op.py @@ -19,7 +19,7 @@ import numpy as np import rustworkx as rx import scipy.sparse -from ddt import ddt +import ddt from qiskit import QiskitError from qiskit.circuit import Parameter, ParameterExpression, ParameterVector @@ -141,19 +141,49 @@ def test_sparse_pauli_op_init(self): self.assertEqual(spp_op, ref_op) -@ddt +@ddt.ddt class TestSparsePauliOpConversions(QiskitTestCase): """Tests SparsePauliOp representation conversions.""" - def test_from_operator(self): + @ddt.data(1, 2, 4) + def test_from_operator_single(self, num_qubits): """Test from_operator methods.""" - for tup in it.product(["I", "X", "Y", "Z"], repeat=2): + for tup in it.product(["I", "X", "Y", "Z"], repeat=num_qubits): label = "".join(tup) with self.subTest(msg=label): spp_op = SparsePauliOp.from_operator(Operator(pauli_mat(label))) np.testing.assert_array_equal(spp_op.coeffs, [1]) self.assertEqual(spp_op.paulis, PauliList(label)) + @ddt.data( + SparsePauliOp.from_sparse_list([("", (), 1.0), ("X", (0,), -2.0j)], num_qubits=1), + SparsePauliOp.from_sparse_list([("", (), 1.0), ("Y", (0,), -2.0j)], num_qubits=1), + SparsePauliOp.from_sparse_list([("Y", (0,), 1.0), ("Z", (0,), -2.0j)], num_qubits=1), + SparsePauliOp.from_sparse_list( + [("Y", (0,), 1.0), ("YY", (1, 0), -0.5), ("YYY", (2, 1, 0), 1j)], num_qubits=3 + ), + SparsePauliOp.from_sparse_list( + [("XZ", (2, 0), 1.0), ("YZ", (1, 0), -0.5), ("ZZ", (2, 1), 1j)], num_qubits=3 + ), + ) + def test_from_operator_roundtrip(self, op): + """Test `SparsePauliOp.from_operator` roundtrips things correctly.""" + # Ensure canonical order of the input. Part of this test is ensuring that the output is + # given in canonical order too. The coefficients in the inputs are chosen to be simple + # multiples of powers of two, so there are no floating-point rounding or associativity + # concerns. + op = op.simplify().sort() + roundtrip = SparsePauliOp.from_operator(op.to_matrix()) + self.assertEqual(roundtrip, op) + + def test_from_operator_tolerance(self): + """Test that terms whose coefficient falls below the tolerance are removed.""" + operator = SparsePauliOp.from_list( + [("IIXI", 0.25), ("IIZI", -0.25j), ("IXYI", 0.5j)] + ).to_matrix() + expected = SparsePauliOp.from_list([("IXYI", 0.5j)]) + self.assertEqual(SparsePauliOp.from_operator(operator, 0.26), expected) + def test_from_list(self): """Test from_list method.""" labels = ["XXZ", "IXI", "YZZ", "III"] @@ -416,7 +446,7 @@ def bind_one(a): return np.vectorize(bind_one, otypes=[complex])(array) -@ddt +@ddt.ddt class TestSparsePauliOpMethods(QiskitTestCase): """Tests for SparsePauliOp operator methods."""
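As an illustration of the Python-space behaviour these tests pin down (a sketch for the reader, not part of the diff), the new round-trip and tolerance cases amount to:

```python
from qiskit.quantum_info import SparsePauliOp

# Round-trip: decomposing the dense matrix of an operator recovers the same terms,
# in the canonical (sorted) order used by `SparsePauliOp.sort`.
op = SparsePauliOp.from_sparse_list(
    [("Y", (0,), 1.0), ("YY", (1, 0), -0.5), ("YYY", (2, 1, 0), 1j)], num_qubits=3
)
op = op.simplify().sort()
assert SparsePauliOp.from_operator(op.to_matrix()) == op

# Tolerance: terms whose coefficient magnitude is at or below the tolerance are dropped.
dense = SparsePauliOp.from_list([("IIXI", 0.25), ("IIZI", -0.25j), ("IXYI", 0.5j)]).to_matrix()
assert SparsePauliOp.from_operator(dense, 0.26) == SparsePauliOp.from_list([("IXYI", 0.5j)])
```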