Skip to content

Commit 11fe9d8

Browse files
authored
fix(python): Fix Series.to_numpy for Array types with nulls and nested Arrays (pola-rs#16230)
1 parent: 092e3da · commit: 11fe9d8

File tree

6 files changed

+160
-100
lines changed

6 files changed

+160
-100
lines changed

crates/polars-core/src/chunked_array/array/mod.rs

+4 -12
Original file line number | Diff line number | Diff line change
@@ -31,18 +31,10 @@ impl ArrayChunked {
3131

3232
/// Get the inner values as `Series`
3333
pub fn get_inner(&self) -> Series {
34-
let ca = self.rechunk();
35-
let field = self.inner_dtype().to_arrow_field("item", true);
36-
let arr = ca.downcast_iter().next().unwrap();
37-
unsafe {
38-
Series::_try_from_arrow_unchecked_with_md(
39-
self.name(),
40-
vec![(arr.values()).clone()],
41-
&field.data_type,
42-
Some(&field.metadata),
43-
)
44-
.unwrap()
45-
}
34+
let chunks: Vec<_> = self.downcast_iter().map(|c| c.values().clone()).collect();
35+
36+
// SAFETY: Data type of arrays matches because they are chunks from the same array.
37+
unsafe { Series::from_chunks_and_dtype_unchecked(self.name(), chunks, &self.inner_dtype()) }
4638
}
4739

4840
/// Ignore the list indices and apply `func` to the inner type as [`Series`].

crates/polars-core/src/chunked_array/list/mod.rs

+4 -11
Original file line number | Diff line number | Diff line change
@@ -41,17 +41,10 @@ impl ListChunked {
4141

4242
/// Get the inner values as [`Series`], ignoring the list offsets.
4343
pub fn get_inner(&self) -> Series {
44-
let ca = self.rechunk();
45-
let arr = ca.downcast_iter().next().unwrap();
46-
// SAFETY:
47-
// Inner dtype is passed correctly
48-
unsafe {
49-
Series::from_chunks_and_dtype_unchecked(
50-
self.name(),
51-
vec![arr.values().clone()],
52-
&ca.inner_dtype(),
53-
)
54-
}
44+
let chunks: Vec<_> = self.downcast_iter().map(|c| c.values().clone()).collect();
45+
46+
// SAFETY: Data type of arrays matches because they are chunks from the same array.
47+
unsafe { Series::from_chunks_and_dtype_unchecked(self.name(), chunks, &self.inner_dtype()) }
5548
}
5649

5750
/// Ignore the list indices and apply `func` to the inner type as [`Series`].

py-polars/polars/series/series.py

+1 -10
Original file line number | Diff line number | Diff line change
@@ -4460,7 +4460,7 @@ def to_numpy(
44604460
if (
44614461
use_pyarrow
44624462
and _PYARROW_AVAILABLE
4463-
and self.dtype not in (Object, Datetime, Duration, Date, Array)
4463+
and self.dtype not in (Date, Datetime, Duration, Array, Object)
44644464
):
44654465
if not allow_copy and self.n_chunks() > 1 and not self.is_empty():
44664466
msg = "cannot return a zero-copy array"
@@ -4470,15 +4470,6 @@ def to_numpy(
44704470
zero_copy_only=not allow_copy, writable=writable
44714471
)
44724472

4473-
if self.dtype == Array:
4474-
np_array = self.explode().to_numpy(
4475-
allow_copy=allow_copy,
4476-
writable=writable,
4477-
use_pyarrow=use_pyarrow,
4478-
)
4479-
np_array.shape = (self.len(), self.dtype.width) # type: ignore[attr-defined]
4480-
return np_array
4481-
44824473
return self._s.to_numpy(allow_copy=allow_copy, writable=writable)
44834474

44844475
def to_torch(self) -> torch.Tensor:

py-polars/src/series/export.rs

+25 -11
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,7 @@ use pyo3::types::PyList;
99
use crate::conversion::chunked_array::{decimal_to_pyobject_iter, time_to_pyobject_iter};
1010
use crate::error::PyPolarsErr;
1111
use crate::prelude::*;
12+
use crate::to_numpy::{reshape_numpy_array, series_to_numpy_view};
1213
use crate::{arrow_interop, raise_err, PySeries};
1314

1415
#[pymethods]
@@ -169,20 +170,20 @@ impl PySeries {
169170
fn to_numpy(&self, py: Python, allow_copy: bool, writable: bool) -> PyResult<PyObject> {
170171
if self.series.is_empty() {
171172
// Take this path to ensure a writable array.
172-
// This does not actually copy for empty Series.
173+
// This does not actually copy data for empty Series.
173174
return series_to_numpy_with_copy(py, &self.series);
174-
} else if self.series.null_count() == 0 {
175-
if let Some(mut arr) = self.to_numpy_view(py) {
176-
if writable {
177-
if !allow_copy {
178-
return Err(PyValueError::new_err(
179-
"cannot return a zero-copy writable array",
180-
));
181-
}
182-
arr = arr.call_method0(py, intern!(py, "copy"))?;
175+
}
176+
177+
if let Some(mut arr) = series_to_numpy_view(py, &self.series, false) {
178+
if writable {
179+
if !allow_copy {
180+
return Err(PyValueError::new_err(
181+
"cannot return a zero-copy writable array",
182+
));
183183
}
184-
return Ok(arr);
184+
arr = arr.call_method0(py, intern!(py, "copy"))?;
185185
}
186+
return Ok(arr);
186187
}
187188

188189
if !allow_copy {
@@ -264,6 +265,7 @@ fn series_to_numpy_with_copy(py: Python, s: &Series) -> PyResult<PyObject> {
264265
let values = decimal_to_pyobject_iter(py, ca).map(|v| v.into_py(py));
265266
PyArray1::from_iter_bound(py, values).into_py(py)
266267
},
268+
Array(_, _) => array_series_to_numpy(py, s),
267269
#[cfg(feature = "object")]
268270
Object(_, _) => {
269271
let ca = s
@@ -352,3 +354,15 @@ where
352354
let values = ca.iter().map(|v| v.unwrap_or(i64::MIN).into());
353355
PyArray1::<T>::from_iter_bound(py, values).into_py(py)
354356
}
357+
/// Convert arrays by flattening first, converting the flat Series, and then reshaping.
358+
fn array_series_to_numpy(py: Python, s: &Series) -> PyObject {
359+
let ca = s.array().unwrap();
360+
let s_inner = ca.get_inner();
361+
let np_array_flat = series_to_numpy_with_copy(py, &s_inner).unwrap();
362+
363+
// Reshape to the original shape.
364+
let DataType::Array(_, width) = s.dtype() else {
365+
unreachable!()
366+
};
367+
reshape_numpy_array(py, np_array_flat, ca.len(), *width)
368+
}

py-polars/src/to_numpy.rs

+98 -49
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,9 @@ use numpy::{
88
use polars_core::prelude::*;
99
use polars_core::utils::try_get_supertype;
1010
use polars_core::with_match_physical_numeric_polars_type;
11+
use pyo3::intern;
1112
use pyo3::prelude::*;
13+
use pyo3::types::PyTuple;
1214

1315
use crate::conversion::Wrap;
1416
use crate::dataframe::PyDataFrame;
@@ -58,59 +60,66 @@ impl PySeries {
5860
/// appropriately.
5961
#[allow(clippy::wrong_self_convention)]
6062
pub fn to_numpy_view(&self, py: Python) -> Option<PyObject> {
61-
// NumPy arrays are always contiguous
62-
if self.series.n_chunks() > 1 {
63-
return None;
64-
}
63+
series_to_numpy_view(py, &self.series, true)
64+
}
65+
}
6566

66-
match self.series.dtype() {
67-
dt if dt.is_numeric() => {
68-
let dims = [self.series.len()].into_dimension();
69-
let owner = self.clone().into_py(py); // Keep the Series memory alive.
70-
with_match_physical_numeric_polars_type!(dt, |$T| {
71-
let np_dtype = <$T as PolarsNumericType>::Native::get_dtype_bound(py);
72-
let ca: &ChunkedArray<$T> = self.series.unpack::<$T>().unwrap();
73-
let slice = ca.data_views().next().unwrap();
74-
75-
let view = unsafe {
76-
create_borrowed_np_array::<_>(
77-
py,
78-
np_dtype,
79-
dims,
80-
flags::NPY_ARRAY_FARRAY_RO,
81-
slice.as_ptr() as _,
82-
owner,
83-
)
84-
};
85-
Some(view)
86-
})
87-
},
88-
dt @ (DataType::Datetime(_, _) | DataType::Duration(_)) => {
89-
let np_dtype = polars_dtype_to_np_temporal_dtype(py, dt);
90-
91-
let phys = self.series.to_physical_repr();
92-
let ca = phys.i64().unwrap();
93-
let slice = ca.data_views().next().unwrap();
94-
let dims = [self.series.len()].into_dimension();
95-
let owner = self.clone().into_py(py);
96-
97-
let view = unsafe {
98-
create_borrowed_np_array::<_>(
99-
py,
100-
np_dtype,
101-
dims,
102-
flags::NPY_ARRAY_FARRAY_RO,
103-
slice.as_ptr() as _,
104-
owner,
105-
)
106-
};
107-
Some(view)
108-
},
109-
_ => None,
110-
}
67+
pub(crate) fn series_to_numpy_view(py: Python, s: &Series, allow_nulls: bool) -> Option<PyObject> {
68+
// A view borrows a single chunk's buffer, so a multi-chunk Series cannot be viewed without copying.
69+
if s.n_chunks() > 1 {
70+
return None;
71+
}
72+
if !allow_nulls && s.null_count() > 0 {
73+
return None;
11174
}
75+
let view = match s.dtype() {
76+
dt if dt.is_numeric() => numeric_series_to_numpy_view(py, s),
77+
DataType::Datetime(_, _) | DataType::Duration(_) => temporal_series_to_numpy_view(py, s),
78+
DataType::Array(_, _) => array_series_to_numpy_view(py, s, allow_nulls)?,
79+
_ => return None,
80+
};
81+
Some(view)
82+
}
83+
fn numeric_series_to_numpy_view(py: Python, s: &Series) -> PyObject {
84+
let dims = [s.len()].into_dimension();
85+
let owner = PySeries::from(s.clone()).into_py(py); // Keep the Series memory alive.
86+
with_match_physical_numeric_polars_type!(s.dtype(), |$T| {
87+
let np_dtype = <$T as PolarsNumericType>::Native::get_dtype_bound(py);
88+
let ca: &ChunkedArray<$T> = s.unpack::<$T>().unwrap();
89+
let slice = ca.data_views().next().unwrap();
90+
91+
unsafe {
92+
create_borrowed_np_array::<_>(
93+
py,
94+
np_dtype,
95+
dims,
96+
flags::NPY_ARRAY_FARRAY_RO,
97+
slice.as_ptr() as _,
98+
owner,
99+
)
100+
}
101+
})
112102
}
103+
fn temporal_series_to_numpy_view(py: Python, s: &Series) -> PyObject {
104+
let np_dtype = polars_dtype_to_np_temporal_dtype(py, s.dtype());
105+
106+
let phys = s.to_physical_repr();
107+
let ca = phys.i64().unwrap();
108+
let slice = ca.data_views().next().unwrap();
109+
let dims = [s.len()].into_dimension();
110+
let owner = PySeries::from(s.clone()).into_py(py); // Keep the Series memory alive.
113111

112+
unsafe {
113+
create_borrowed_np_array::<_>(
114+
py,
115+
np_dtype,
116+
dims,
117+
flags::NPY_ARRAY_FARRAY_RO,
118+
slice.as_ptr() as _,
119+
owner,
120+
)
121+
}
122+
}
114123
/// Get the NumPy temporal data type associated with the given Polars [`DataType`].
115124
fn polars_dtype_to_np_temporal_dtype<'a>(
116125
py: Python<'a>,
@@ -139,6 +148,46 @@ fn polars_dtype_to_np_temporal_dtype<'a>(
139148
_ => panic!("only Datetime/Duration inputs supported, got {}", dtype),
140149
}
141150
}
151+
fn array_series_to_numpy_view(py: Python, s: &Series, allow_nulls: bool) -> Option<PyObject> {
152+
let ca = s.array().unwrap();
153+
let s_inner = ca.get_inner();
154+
let np_array_flat = series_to_numpy_view(py, &s_inner, allow_nulls)?;
155+
156+
// Reshape to the original shape.
157+
let DataType::Array(_, width) = s.dtype() else {
158+
unreachable!()
159+
};
160+
let view = reshape_numpy_array(py, np_array_flat, ca.len(), *width);
161+
Some(view)
162+
}
163+
/// Reshape the first dimension of a NumPy array to the given height and width.
164+
pub(crate) fn reshape_numpy_array(
165+
py: Python,
166+
arr: PyObject,
167+
height: usize,
168+
width: usize,
169+
) -> PyObject {
170+
let shape = arr
171+
.getattr(py, intern!(py, "shape"))
172+
.unwrap()
173+
.extract::<Vec<usize>>(py)
174+
.unwrap();
175+
176+
if shape.len() == 1 {
177+
// In this case we can avoid allocating a Vec.
178+
let new_shape = (height, width);
179+
arr.call_method1(py, intern!(py, "reshape"), new_shape)
180+
.unwrap()
181+
} else {
182+
let mut new_shape_vec = vec![height, width];
183+
for v in &shape[1..] {
184+
new_shape_vec.push(*v)
185+
}
186+
let new_shape = PyTuple::new_bound(py, new_shape_vec);
187+
arr.call_method1(py, intern!(py, "reshape"), new_shape)
188+
.unwrap()
189+
}
190+
}
142191

143192
#[pymethods]
144193
#[allow(clippy::wrong_self_convention)]

py-polars/tests/unit/interop/numpy/test_to_numpy_series.py

+28 -7
Original file line number | Diff line number | Diff line change
@@ -223,13 +223,14 @@ def test_series_to_numpy_bool_with_nulls() -> None:
223223

224224

225225
def test_series_to_numpy_array_of_int() -> None:
226-
values = [[1, 2], [3, 4], [5, 6]]
227-
s = pl.Series(values, dtype=pl.Array(pl.Int64, 2))
228-
result = s.to_numpy(use_pyarrow=False)
226+
values = [[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]
227+
s = pl.Series(values, dtype=pl.Array(pl.Array(pl.Int8, 3), 2))
228+
result = s.to_numpy(use_pyarrow=False, allow_copy=False)
229229

230230
expected = np.array(values)
231231
assert_array_equal(result, expected)
232-
assert result.dtype == np.int64
232+
assert result.dtype == np.int8
233+
assert result.shape == (2, 2, 3)
233234

234235

235236
def test_series_to_numpy_array_of_str() -> None:
@@ -240,9 +241,6 @@ def test_series_to_numpy_array_of_str() -> None:
240241
assert result.dtype == np.object_
241242

242243

243-
@pytest.mark.skip(
244-
reason="Currently bugged, see: https://github.com/pola-rs/polars/issues/14268"
245-
)
246244
def test_series_to_numpy_array_with_nulls() -> None:
247245
values = [[1, 2], [3, 4], None]
248246
s = pl.Series(values, dtype=pl.Array(pl.Int64, 2))
@@ -254,6 +252,29 @@ def test_series_to_numpy_array_with_nulls() -> None:
254252
assert_allow_copy_false_raises(s)
255253

256254

255+
def test_series_to_numpy_array_with_nested_nulls() -> None:
256+
values = [[None, 2], [3, 4], [5, None]]
257+
s = pl.Series(values, dtype=pl.Array(pl.Int64, 2))
258+
result = s.to_numpy(use_pyarrow=False)
259+
260+
expected = np.array([[np.nan, 2.0], [3.0, 4.0], [5.0, np.nan]])
261+
assert_array_equal(result, expected)
262+
assert result.dtype == np.float64
263+
assert_allow_copy_false_raises(s)
264+
265+
266+
def test_series_to_numpy_array_of_arrays() -> None:
267+
values = [[[None, 2], [3, 4]], [None, [7, 8]]]
268+
s = pl.Series(values, dtype=pl.Array(pl.Array(pl.Int64, 2), 2))
269+
result = s.to_numpy(use_pyarrow=False)
270+
271+
expected = np.array([[[np.nan, 2], [3, 4]], [[np.nan, np.nan], [7, 8]]])
272+
assert_array_equal(result, expected)
273+
assert result.dtype == np.float64
274+
assert result.shape == (2, 2, 2)
275+
assert_allow_copy_false_raises(s)
276+
277+
257278
def test_to_numpy_null() -> None:
258279
s = pl.Series([None, None], dtype=pl.Null)
259280
result = s.to_numpy(use_pyarrow=False)

0 commit comments

Comments
 (0)