Skip to content

Commit b57f8ea

Browse files
authored
fix(python): Fix Series constructor failure for Array types for large integers (pola-rs#16050)
1 parent e17ef46 commit b57f8ea

File tree

2 files changed

+146
-141
lines changed

2 files changed

+146
-141
lines changed

py-polars/polars/_utils/construction/series.py

+137 -141
Original file line number | Diff line number | Diff line change
@@ -32,6 +32,7 @@
3232
from polars.datatypes import (
3333
INTEGER_DTYPES,
3434
TEMPORAL_DTYPES,
35+
Array,
3536
Boolean,
3637
Categorical,
3738
Date,
@@ -96,7 +97,7 @@ def sequence_to_pyseries(
9697
dtype = Null
9798

9899
# lists defer to subsequent handling; identify nested type
99-
elif dtype == List:
100+
elif dtype in (List, Array):
100101
python_dtype = list
101102

102103
# infer temporal type handling
@@ -130,8 +131,9 @@ def sequence_to_pyseries(
130131
# flat data
131132
if (
132133
dtype is not None
133-
and dtype not in (List, Struct, Unknown)
134134
and is_polars_dtype(dtype)
135+
and not dtype.is_nested()
136+
and dtype != Unknown
135137
and (python_dtype is None)
136138
):
137139
constructor = polars_type_to_constructor(dtype)
@@ -160,159 +162,153 @@ def sequence_to_pyseries(
160162
schema=struct_schema,
161163
orient="row",
162164
).to_struct(name)
163-
else:
164-
if python_dtype is None:
165-
if value is None:
166-
constructor = polars_type_to_constructor(Null)
167-
return constructor(name, values, strict)
168-
169-
# generic default dtype
170-
python_dtype = type(value)
171-
172-
# temporal branch
173-
if python_dtype in py_temporal_types:
174-
if dtype is None:
175-
dtype = py_type_to_dtype(python_dtype) # construct from integer
176-
elif dtype in py_temporal_types:
177-
dtype = py_type_to_dtype(dtype)
178-
179-
values_dtype = (
180-
None
181-
if value is None
182-
else py_type_to_dtype(type(value), raise_unmatched=False)
165+
166+
if python_dtype is None:
167+
if value is None:
168+
constructor = polars_type_to_constructor(Null)
169+
return constructor(name, values, strict)
170+
171+
# generic default dtype
172+
python_dtype = type(value)
173+
174+
# temporal branch
175+
if python_dtype in py_temporal_types:
176+
if dtype is None:
177+
dtype = py_type_to_dtype(python_dtype) # construct from integer
178+
elif dtype in py_temporal_types:
179+
dtype = py_type_to_dtype(dtype)
180+
181+
values_dtype = (
182+
None
183+
if value is None
184+
else py_type_to_dtype(type(value), raise_unmatched=False)
185+
)
186+
if values_dtype is not None and values_dtype.is_float():
187+
msg = f"'float' object cannot be interpreted as a {python_dtype.__name__!r}"
188+
raise TypeError(
189+
# we do not accept float values as temporal; if this is
190+
# required, the caller should explicitly cast to int first.
191+
msg
183192
)
184-
if values_dtype is not None and values_dtype.is_float():
185-
msg = f"'float' object cannot be interpreted as a {python_dtype.__name__!r}"
186-
raise TypeError(
187-
# we do not accept float values as temporal; if this is
188-
# required, the caller should explicitly cast to int first.
189-
msg
190-
)
191193

192-
# We use the AnyValue builder to create the datetime array
193-
# We store the values internally as UTC and set the timezone
194-
py_series = PySeries.new_from_any_values(name, values, strict)
194+
# We use the AnyValue builder to create the datetime array
195+
# We store the values internally as UTC and set the timezone
196+
py_series = PySeries.new_from_any_values(name, values, strict)
195197

196-
time_unit = getattr(dtype, "time_unit", None)
197-
time_zone = getattr(dtype, "time_zone", None)
198+
time_unit = getattr(dtype, "time_unit", None)
199+
time_zone = getattr(dtype, "time_zone", None)
198200

199-
if time_unit is None or values_dtype == Date:
200-
s = wrap_s(py_series)
201-
else:
202-
s = wrap_s(py_series).dt.cast_time_unit(time_unit)
201+
if time_unit is None or values_dtype == Date:
202+
s = wrap_s(py_series)
203+
else:
204+
s = wrap_s(py_series).dt.cast_time_unit(time_unit)
203205

204-
if (values_dtype == Date) & (dtype == Datetime):
205-
return (
206-
s.cast(Datetime(time_unit or "us"))
207-
.dt.replace_time_zone(time_zone)
208-
._s
206+
if (values_dtype == Date) & (dtype == Datetime):
207+
return (
208+
s.cast(Datetime(time_unit or "us")).dt.replace_time_zone(time_zone)._s
209+
)
210+
211+
if (dtype == Datetime) and (value.tzinfo is not None or time_zone is not None):
212+
values_tz = str(value.tzinfo) if value.tzinfo is not None else None
213+
dtype_tz = dtype.time_zone # type: ignore[union-attr]
214+
if values_tz is not None and (dtype_tz is not None and dtype_tz != "UTC"):
215+
msg = (
216+
"time-zone-aware datetimes are converted to UTC"
217+
"\n\nPlease either drop the time zone from the dtype, or set it to 'UTC'."
218+
" To convert to a different time zone, please use `.dt.convert_time_zone`."
219+
)
220+
raise ValueError(msg)
221+
if values_tz != "UTC" and dtype_tz is None:
222+
warnings.warn(
223+
"Constructing a Series with time-zone-aware "
224+
"datetimes results in a Series with UTC time zone. "
225+
"To silence this warning, you can filter "
226+
"warnings of class TimeZoneAwareConstructorWarning, or "
227+
"set 'UTC' as the time zone of your datatype.",
228+
TimeZoneAwareConstructorWarning,
229+
stacklevel=find_stacklevel(),
209230
)
231+
return s.dt.replace_time_zone(dtype_tz or "UTC")._s
232+
return s._s
210233

211-
if (dtype == Datetime) and (
212-
value.tzinfo is not None or time_zone is not None
213-
):
214-
values_tz = str(value.tzinfo) if value.tzinfo is not None else None
215-
dtype_tz = dtype.time_zone # type: ignore[union-attr]
216-
if values_tz is not None and (
217-
dtype_tz is not None and dtype_tz != "UTC"
218-
):
219-
msg = (
220-
"time-zone-aware datetimes are converted to UTC"
221-
"\n\nPlease either drop the time zone from the dtype, or set it to 'UTC'."
222-
" To convert to a different time zone, please use `.dt.convert_time_zone`."
223-
)
224-
raise ValueError(msg)
225-
if values_tz != "UTC" and dtype_tz is None:
226-
warnings.warn(
227-
"Constructing a Series with time-zone-aware "
228-
"datetimes results in a Series with UTC time zone. "
229-
"To silence this warning, you can filter "
230-
"warnings of class TimeZoneAwareConstructorWarning, or "
231-
"set 'UTC' as the time zone of your datatype.",
232-
TimeZoneAwareConstructorWarning,
233-
stacklevel=find_stacklevel(),
234-
)
235-
return s.dt.replace_time_zone(dtype_tz or "UTC")._s
236-
return s._s
234+
elif (
235+
_check_for_numpy(value)
236+
and isinstance(value, np.ndarray)
237+
and len(value.shape) == 1
238+
):
239+
n_elems = len(value)
240+
if all(len(v) == n_elems for v in values):
241+
# can take (much) faster path if all lists are the same length
242+
return numpy_to_pyseries(
243+
name,
244+
np.vstack(values),
245+
strict=strict,
246+
nan_to_null=nan_to_null,
247+
)
248+
else:
249+
return PySeries.new_series_list(
250+
name,
251+
[
252+
numpy_to_pyseries("", v, strict=strict, nan_to_null=nan_to_null)
253+
for v in values
254+
],
255+
strict,
256+
)
237257

238-
elif (
239-
_check_for_numpy(value)
240-
and isinstance(value, np.ndarray)
241-
and len(value.shape) == 1
242-
):
243-
n_elems = len(value)
244-
if all(len(v) == n_elems for v in values):
245-
# can take (much) faster path if all lists are the same length
246-
return numpy_to_pyseries(
247-
name,
248-
np.vstack(values),
249-
strict=strict,
250-
nan_to_null=nan_to_null,
251-
)
258+
elif python_dtype in (list, tuple):
259+
if dtype is None:
260+
return PySeries.new_from_any_values(name, values, strict=strict)
261+
elif dtype == Object:
262+
return PySeries.new_object(name, values, strict)
263+
else:
264+
if (inner_dtype := getattr(dtype, "inner", None)) is not None:
265+
pyseries_list = [
266+
None
267+
if value is None
268+
else sequence_to_pyseries(
269+
"",
270+
value,
271+
inner_dtype,
272+
strict=strict,
273+
nan_to_null=nan_to_null,
274+
)
275+
for value in values
276+
]
277+
pyseries = PySeries.new_series_list(name, pyseries_list, strict)
252278
else:
253-
return PySeries.new_series_list(
254-
name,
255-
[
256-
numpy_to_pyseries("", v, strict=strict, nan_to_null=nan_to_null)
257-
for v in values
258-
],
259-
strict,
279+
pyseries = PySeries.new_from_any_values_and_dtype(
280+
name, values, dtype, strict=strict
260281
)
282+
if dtype != pyseries.dtype():
283+
pyseries = pyseries.cast(dtype, strict=False)
284+
return pyseries
261285

262-
elif python_dtype in (list, tuple):
263-
if dtype is None:
264-
return PySeries.new_from_any_values(name, values, strict=strict)
265-
elif dtype == Object:
266-
return PySeries.new_object(name, values, strict)
267-
else:
268-
if (inner_dtype := getattr(dtype, "inner", None)) is not None:
269-
pyseries_list = [
270-
None
271-
if value is None
272-
else sequence_to_pyseries(
273-
"",
274-
value,
275-
inner_dtype,
276-
strict=strict,
277-
nan_to_null=nan_to_null,
278-
)
279-
for value in values
280-
]
281-
pyseries = PySeries.new_series_list(name, pyseries_list, strict)
286+
elif python_dtype == pl.Series:
287+
return PySeries.new_series_list(
288+
name, [v._s if v is not None else None for v in values], strict
289+
)
290+
291+
elif python_dtype == PySeries:
292+
return PySeries.new_series_list(name, values, strict)
293+
else:
294+
constructor = py_type_to_constructor(python_dtype)
295+
if constructor == PySeries.new_object:
296+
try:
297+
srs = PySeries.new_from_any_values(name, values, strict)
298+
if _check_for_numpy(python_dtype, check_type=False) and isinstance(
299+
np.bool_(True), np.generic
300+
):
301+
dtype = numpy_char_code_to_dtype(np.dtype(python_dtype).char)
302+
return srs.cast(dtype, strict=strict)
282303
else:
283-
pyseries = PySeries.new_from_any_values_and_dtype(
284-
name, values, dtype, strict=strict
285-
)
286-
if dtype != pyseries.dtype():
287-
pyseries = pyseries.cast(dtype, strict=False)
288-
return pyseries
304+
return srs
289305

290-
elif python_dtype == pl.Series:
291-
return PySeries.new_series_list(
292-
name, [v._s if v is not None else None for v in values], strict
293-
)
306+
except RuntimeError:
307+
return PySeries.new_from_any_values(name, values, strict=strict)
294308

295-
elif python_dtype == PySeries:
296-
return PySeries.new_series_list(name, values, strict)
297-
else:
298-
constructor = py_type_to_constructor(python_dtype)
299-
if constructor == PySeries.new_object:
300-
try:
301-
srs = PySeries.new_from_any_values(name, values, strict)
302-
if _check_for_numpy(python_dtype, check_type=False) and isinstance(
303-
np.bool_(True), np.generic
304-
):
305-
dtype = numpy_char_code_to_dtype(np.dtype(python_dtype).char)
306-
return srs.cast(dtype, strict=strict)
307-
else:
308-
return srs
309-
310-
except RuntimeError:
311-
return PySeries.new_from_any_values(name, values, strict=strict)
312-
313-
return _construct_series_with_fallbacks(
314-
constructor, name, values, dtype, strict=strict
315-
)
309+
return _construct_series_with_fallbacks(
310+
constructor, name, values, dtype, strict=strict
311+
)
316312

317313

318314
def _construct_series_with_fallbacks(

py-polars/tests/unit/constructors/test_series.py

+9
Original file line number | Diff line number | Diff line change
@@ -88,3 +88,12 @@ def test_large_timedelta(dtype: pl.DataType | None) -> None:
8888
# Microsecond precision is lost
8989
expected = [timedelta.min, timedelta.max - timedelta(microseconds=999)]
9090
assert s.to_list() == expected
91+
92+
93+
def test_array_large_u64() -> None:
94+
u64_max = 2**64 - 1
95+
values = [[u64_max]]
96+
dtype = pl.Array(pl.UInt64, 1)
97+
s = pl.Series(values, dtype=dtype)
98+
assert s.dtype == dtype
99+
assert s.to_list() == values

0 commit comments

Comments
 (0)