Skip to content

Commit

Permalink
Add DataFrame.deserialize
Browse files Browse the repository at this point in the history
  • Loading branch information
stinodego committed May 28, 2024
1 parent 333acfa commit e27c80d
Show file tree
Hide file tree
Showing 4 changed files with 70 additions and 12 deletions.
42 changes: 41 additions & 1 deletion py-polars/polars/dataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@
TooManyRowsReturnedError,
)
from polars.functions import col, lit
from polars.polars import PyDataFrame
from polars.selectors import _expand_selector_dicts, _expand_selectors
from polars.type_aliases import DbWriteMode, JaxExportType, TorchExportType

Expand All @@ -121,7 +122,6 @@
from polars import DataType, Expr, LazyFrame, Series
from polars.interchange.dataframe import PolarsDataFrame
from polars.ml.torch import PolarsDataset
from polars.polars import PyDataFrame
from polars.type_aliases import (
AsofJoinStrategy,
AvroCompression,
Expand Down Expand Up @@ -418,6 +418,46 @@ def __init__(
)
raise TypeError(msg)

@classmethod
def deserialize(cls, source: str | Path | IOBase) -> Self:
    """
    Read a serialized DataFrame from a file.

    Parameters
    ----------
    source
        Path to a file, or a file-like object. A file-like object is any
        object exposing a `read()` method, for example a handle returned by
        the builtin `open` function or a `BytesIO` buffer.

    See Also
    --------
    DataFrame.serialize

    Examples
    --------
    >>> import io
    >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]})
    >>> json = df.serialize()
    >>> pl.DataFrame.deserialize(io.StringIO(json))
    shape: (3, 2)
    ┌─────┬─────┐
    │ a   ┆ b   │
    │ --- ┆ --- │
    │ i64 ┆ f64 │
    ╞═════╪═════╡
    │ 1   ┆ 4.0 │
    │ 2   ┆ 5.0 │
    │ 3   ┆ 6.0 │
    └─────┴─────┘
    """
    if isinstance(source, (str, Path)):
        # Path-like input: normalise it before passing it on.
        source = normalize_filepath(source)
    elif isinstance(source, StringIO):
        # In-memory text: re-wrap its contents as an encoded bytes buffer
        # before handing it to the native deserializer.
        source = BytesIO(source.getvalue().encode())

    py_df = PyDataFrame.deserialize(source)
    return cls._from_pydf(py_df)

@classmethod
def _from_pydf(cls, py_df: PyDataFrame) -> Self:
"""Construct Polars DataFrame from FFI PyDataFrame object."""
Expand Down
9 changes: 4 additions & 5 deletions py-polars/polars/expr/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -340,7 +340,7 @@ def function(s: Series) -> Series: # pragma: no cover
@classmethod
def deserialize(cls, source: str | Path | IOBase) -> Self:
"""
Read an expression from a JSON file.
Read a serialized expression from a file.
Parameters
----------
Expand All @@ -351,10 +351,9 @@ def deserialize(cls, source: str | Path | IOBase) -> Self:
Warnings
--------
This function uses :mod:`pickle` under some circumstances, and as
such inherits the security implications. Deserializing can execute
arbitrary code so it should only be attempted on trusted data.
pickle is only used when the logical plan contains python UDFs.
This function uses :mod:`pickle` when the logical plan contains Python UDFs,
and as such inherits the security implications. Deserializing can execute
arbitrary code, so it should only be attempted on trusted data.
See Also
--------
Expand Down
10 changes: 4 additions & 6 deletions py-polars/polars/lazyframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -348,7 +348,7 @@ def _scan_python_function(
@classmethod
def deserialize(cls, source: str | Path | IOBase) -> Self:
"""
Read a logical plan from a JSON file to construct a LazyFrame.
Read a logical plan from a file to construct a LazyFrame.
Parameters
----------
Expand All @@ -359,11 +359,9 @@ def deserialize(cls, source: str | Path | IOBase) -> Self:
Warnings
--------
This function uses :mod:`pickle` under some circumstances, and as
such inherits the security implications. Deserializing can execute
arbitrary code so it should only be attempted on trusted data.
pickle is only used when the logical plan contains python UDFs.
This function uses :mod:`pickle` when the logical plan contains Python UDFs,
and as such inherits the security implications. Deserializing can execute
arbitrary code, so it should only be attempted on trusted data.
See Also
--------
Expand Down
21 changes: 21 additions & 0 deletions py-polars/src/dataframe/io.rs
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,27 @@ impl PyDataFrame {
Ok(PyDataFrame::new(df))
}

// Build a `PyDataFrame` from a JSON-serialized `DataFrame`.
//
// `py_f` is the Python-side source object. It is first passed through
// `read_if_bytesio` and then wrapped by `get_mmap_bytes_reader`; a failure
// to obtain that reader is returned to the caller as-is.
//
// JSON decoding runs inside `allow_threads`, i.e. with the GIL released.
// Any `serde_json` failure is reported as a `PolarsError::ComputeError`
// carrying serde's own message, converted into a Python exception.
#[staticmethod]
#[cfg(feature = "json")]
pub fn deserialize(py: Python, py_f: Bound<PyAny>) -> PyResult<Self> {
    use crate::file::read_if_bytesio;

    let py_f = read_if_bytesio(py_f);
    let reader = get_mmap_bytes_reader(&py_f)?;

    py.allow_threads(move || {
        let raw: ReaderBytes = (&reader).into();
        let parsed = serde_json::from_slice::<DataFrame>(raw.deref());

        parsed.map(Into::into).map_err(|err| {
            let polars_err = PolarsError::ComputeError(err.to_string().into());
            PyErr::from(PyPolarsErr::from(polars_err))
        })
    })
}

#[staticmethod]
#[cfg(feature = "json")]
pub fn read_json(
Expand Down

0 comments on commit e27c80d

Please sign in to comment.