From dd269913b1edcb95409ef9d5bf6d7819ebb7e91a Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Tue, 28 May 2024 11:26:47 +0200 Subject: [PATCH] Update tests --- py-polars/pyproject.toml | 3 + py-polars/tests/unit/dataframe/test_serde.py | 136 ++++++++++++ py-polars/tests/unit/io/test_json.py | 216 ++++--------------- 3 files changed, 181 insertions(+), 174 deletions(-) diff --git a/py-polars/pyproject.toml b/py-polars/pyproject.toml index aff0066a4c10..b925da325933 100644 --- a/py-polars/pyproject.toml +++ b/py-polars/pyproject.toml @@ -238,6 +238,9 @@ filterwarnings = [ # TODO: Remove when behavior is updated # https://github.com/pola-rs/polars/issues/13441 "ignore:.*default coalesce behavior of left join.*:DeprecationWarning", + # TODO: Remove when default is updated + # https://github.com/pola-rs/polars/issues/14526 + "ignore:.*will only write row-oriented JSON.*:DeprecationWarning", ] xfail_strict = true diff --git a/py-polars/tests/unit/dataframe/test_serde.py b/py-polars/tests/unit/dataframe/test_serde.py index 8f806e244801..d649c6b4c3b2 100644 --- a/py-polars/tests/unit/dataframe/test_serde.py +++ b/py-polars/tests/unit/dataframe/test_serde.py @@ -1,6 +1,16 @@ +from __future__ import annotations + +import io +from datetime import date, datetime, timedelta +from typing import TYPE_CHECKING, Any + import pytest import polars as pl +from polars.testing import assert_frame_equal + +if TYPE_CHECKING: + from pathlib import Path def test_df_serialize() -> None: @@ -10,6 +20,132 @@ def test_df_serialize() -> None: assert result == expected +@pytest.mark.parametrize("buf", [io.BytesIO(), io.StringIO()]) +def test_to_from_buffer(df: pl.DataFrame, buf: io.IOBase) -> None: + df.serialize(buf) + buf.seek(0) + read_df = pl.DataFrame.deserialize(buf) + assert_frame_equal(df, read_df, categorical_as_str=True) + + +@pytest.mark.write_disk() +def test_to_from_file(df: pl.DataFrame, tmp_path: Path) -> None: + tmp_path.mkdir(exist_ok=True) + + file_path = tmp_path / "small.json" + df.serialize(file_path) + out = pl.DataFrame.deserialize(file_path) + + assert_frame_equal(df, out, categorical_as_str=True) + + +def test_write_json_to_string() -> None: + # Tests if it runs if no arg given + df = pl.DataFrame({"a": [1, 2, 3]}) + expected_str = '{"columns":[{"name":"a","datatype":"Int64","bit_settings":"","values":[1,2,3]}]}' + assert df.serialize() == expected_str + + +def test_write_json(df: pl.DataFrame) -> None: + # Text-based conversion loses time info + df = df.select(pl.all().exclude(["cat", "time"])) + s = df.serialize() + f = io.BytesIO() + f.write(s.encode()) + f.seek(0) + out = pl.DataFrame.deserialize(f) + assert_frame_equal(out, df) + + file = io.BytesIO() + df.serialize(file) + file.seek(0) + out = pl.DataFrame.deserialize(file) + assert_frame_equal(out, df) + + +def test_df_serde_enum() -> None: + dtype = pl.Enum(["foo", "bar", "ham"]) + df = pl.DataFrame([pl.Series("e", ["foo", "bar", "ham"], dtype=dtype)]) + buf = io.StringIO() + df.serialize(buf) + buf.seek(0) + df_in = pl.DataFrame.deserialize(buf) + assert df_in.schema["e"] == dtype + + +@pytest.mark.parametrize( + ("data", "dtype"), + [ + ([[1, 2, 3], [None, None, None], [1, None, 3]], pl.Array(pl.Int32(), width=3)), + ([["a", "b"], [None, None]], pl.Array(pl.Utf8, width=2)), + ([[True, False, None], [None, None, None]], pl.Array(pl.Utf8, width=3)), + ( + [[[1, 2, 3], [4, None, 5]], None, [[None, None, 2]]], + pl.List(pl.Array(pl.Int32(), width=3)), + ), + ( + [ + [datetime(1991, 1, 1), datetime(1991, 1, 1), None], + [None, None, None], + ], + pl.Array(pl.Datetime, width=3), + ), + ], +) +def test_write_read_json_array(data: Any, dtype: pl.DataType) -> None: + df = pl.DataFrame({"foo": data}, schema={"foo": dtype}) + buf = io.StringIO() + df.serialize(buf) + buf.seek(0) + deserialized_df = pl.DataFrame.deserialize(buf) + assert_frame_equal(deserialized_df, df) + + +@pytest.mark.parametrize( + ("data", "dtype"), + [ + ( + [ + [ + datetime(1997, 10, 1), + datetime(2000, 1, 2, 10, 30, 1), + ], + [None, None], + ], + pl.Array(pl.Datetime, width=2), + ), + ( + [[date(1997, 10, 1), date(2000, 1, 1)], [None, None]], + pl.Array(pl.Date, width=2), + ), + ( + [ + [timedelta(seconds=1), timedelta(seconds=10)], + [None, None], + ], + pl.Array(pl.Duration, width=2), + ), + ], +) +def test_write_read_json_array_logical_inner_type( + data: Any, dtype: pl.DataType +) -> None: + df = pl.DataFrame({"foo": data}, schema={"foo": dtype}) + buf = io.StringIO() + df.serialize(buf) + buf.seek(0) + deserialized_df = pl.DataFrame.deserialize(buf) + assert deserialized_df.dtypes == df.dtypes + assert deserialized_df.to_dict(as_series=False) == df.to_dict(as_series=False) + + +def test_json_deserialize_empty_list_10458() -> None: + schema = {"LIST_OF_STRINGS": pl.List(pl.String)} + serialized_schema = pl.DataFrame(schema=schema).serialize() + df = pl.DataFrame.deserialize(io.StringIO(serialized_schema)) + assert df.schema == schema + + def test_df_write_json_deprecated() -> None: df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) with pytest.deprecated_call(): diff --git a/py-polars/tests/unit/io/test_json.py b/py-polars/tests/unit/io/test_json.py index 9acbb061a63c..cefe2ab171d0 100644 --- a/py-polars/tests/unit/io/test_json.py +++ b/py-polars/tests/unit/io/test_json.py @@ -1,38 +1,62 @@ from __future__ import annotations -import datetime import io import json from collections import OrderedDict from io import BytesIO -from typing import TYPE_CHECKING, Any import pytest import polars as pl from polars.testing import assert_frame_equal -if TYPE_CHECKING: - from pathlib import Path +def test_write_json_row_oriented() -> None: + df = pl.DataFrame({"a": [1, 2, 3], "b": ["a", "b", None]}) + out = df.write_json(row_oriented=True) + assert out == '[{"a":1,"b":"a"},{"a":2,"b":"b"},{"a":3,"b":null}]' -@pytest.mark.parametrize("buf", [io.BytesIO(), io.StringIO()]) -def test_to_from_buffer(df: pl.DataFrame, buf: io.IOBase) -> None: - df.write_json(buf) - buf.seek(0) - read_df = pl.read_json(buf) - assert_frame_equal(df, read_df, categorical_as_str=True) + # Test round trip + f = io.BytesIO() + f.write(out.encode()) + f.seek(0) + result = pl.read_json(f) + assert_frame_equal(result, df) -@pytest.mark.write_disk() -def test_to_from_file(df: pl.DataFrame, tmp_path: Path) -> None: - tmp_path.mkdir(exist_ok=True) +def test_write_json_categoricals() -> None: + data = {"column": ["test1", "test2", "test3", "test4"]} + df = pl.DataFrame(data).with_columns(pl.col("column").cast(pl.Categorical)) - file_path = tmp_path / "small.json" - df.write_json(file_path) - out = pl.read_json(file_path) + assert ( + df.write_json(row_oriented=True, file=None) + == '[{"column":"test1"},{"column":"test2"},{"column":"test3"},{"column":"test4"}]' + ) - assert_frame_equal(df, out, categorical_as_str=True) + +def test_write_json_duration() -> None: + df = pl.DataFrame( + { + "a": pl.Series( + [91762939, 91762890, 6020836], dtype=pl.Duration(time_unit="ms") + ) + } + ) + + # we don't guarantee a format, just round-circling + value = str(df.write_json(row_oriented=True)) + assert value == """[{"a":"PT91762.939S"},{"a":"PT91762.89S"},{"a":"PT6020.836S"}]""" + + +def test_json_infer_schema_length_11148() -> None: + response = [{"col1": 1}] * 2 + [{"col1": 1, "col2": 2}] * 1 + result = pl.read_json(json.dumps(response).encode(), infer_schema_length=2) + with pytest.raises(AssertionError): + assert set(result.columns) == {"col1", "col2"} + + response = [{"col1": 1}] * 2 + [{"col1": 1, "col2": 2}] * 1 + result = pl.read_json(json.dumps(response).encode(), infer_schema_length=3) + assert set(result.columns) == {"col1", "col2"} def test_to_from_buffer_arraywise_schema() -> None: @@ -84,43 +108,6 @@ def test_to_from_buffer_arraywise_schema_override() -> None: ) -def test_write_json_to_string() -> None: - # Tests if it runs if no arg given - df = pl.DataFrame({"a": [1, 2, 3]}) - expected_str = '{"columns":[{"name":"a","datatype":"Int64","bit_settings":"","values":[1,2,3]}]}' - assert df.write_json() == expected_str - - -def test_write_json(df: pl.DataFrame) -> None: - # Text-based conversion loses time info - df = df.select(pl.all().exclude(["cat", "time"])) - s = df.write_json() - f = io.BytesIO() - f.write(s.encode()) - f.seek(0) - out = pl.read_json(f) - assert_frame_equal(out, df) - - file = io.BytesIO() - df.write_json(file) - file.seek(0) - out = pl.read_json(file) - assert_frame_equal(out, df) - - -def test_write_json_row_oriented() -> None: - df = pl.DataFrame({"a": [1, 2, 3], "b": ["a", "b", None]}) - out = df.write_json(row_oriented=True) - assert out == '[{"a":1,"b":"a"},{"a":2,"b":"b"},{"a":3,"b":null}]' - - # Test round trip - f = io.BytesIO() - f.write(out.encode()) - f.seek(0) - result = pl.read_json(f) - assert_frame_equal(result, df) - - def test_write_ndjson() -> None: df = pl.DataFrame({"a": [1, 2, 3], "b": ["a", "b", None]}) out = df.write_ndjson() @@ -165,16 +152,6 @@ def test_ndjson_nested_string_int() -> None: } -def test_write_json_categoricals() -> None: - data = {"column": ["test1", "test2", "test3", "test4"]} - df = pl.DataFrame(data).with_columns(pl.col("column").cast(pl.Categorical)) - - assert ( - df.write_json(row_oriented=True, file=None) - == '[{"column":"test1"},{"column":"test2"},{"column":"test3"},{"column":"test4"}]' - ) - - def test_json_supertype_infer() -> None: json_string = """[ {"c":[{"b": [], "a": "1"}]}, @@ -186,7 +163,7 @@ def test_json_supertype_infer() -> None: assert_frame_equal(python_infer, polars_infer) -def test_json_sliced_list_serialization() -> None: +def test_ndjson_sliced_list_serialization() -> None: data = {"col1": [0, 2], "col2": [[3, 4, 5], [6, 7, 8]]} df = pl.DataFrame(data) f = io.BytesIO() @@ -195,13 +172,6 @@ def test_json_sliced_list_serialization() -> None: assert f.getvalue() == b'{"col1":2,"col2":[6,7,8]}\n' -def test_json_deserialize_empty_list_10458() -> None: - schema = {"LIST_OF_STRINGS": pl.List(pl.String)} - serialized_schema = pl.DataFrame(schema=schema).write_json() - df = pl.read_json(io.StringIO(serialized_schema)) - assert df.schema == schema - - def test_json_deserialize_9687() -> None: response = { "volume": [0.0, 0.0, 0.0], @@ -216,17 +186,6 @@ def test_json_deserialize_9687() -> None: assert result.to_dict(as_series=False) == {k: [v] for k, v in response.items()} -def test_json_infer_schema_length_11148() -> None: - response = [{"col1": 1}] * 2 + [{"col1": 1, "col2": 2}] * 1 - result = pl.read_json(json.dumps(response).encode(), infer_schema_length=2) - with pytest.raises(AssertionError): - assert set(result.columns) == {"col1", "col2"} - - response = [{"col1": 1}] * 2 + [{"col1": 1, "col2": 2}] * 1 - result = pl.read_json(json.dumps(response).encode(), infer_schema_length=3) - assert set(result.columns) == {"col1", "col2"} - - def test_ndjson_ignore_errors() -> None: # this schema is inconsistent as "value" is string and object jsonl = r"""{"Type":"insert","Key":[1],"SeqNo":1,"Timestamp":1,"Fields":[{"Name":"added_id","Value":2},{"Name":"body","Value":{"a": 1}}]} @@ -276,86 +235,6 @@ def test_ndjson_ignore_errors() -> None: assert result.to_dict(as_series=False) == expected -def test_write_json_duration() -> None: - df = pl.DataFrame( - { - "a": pl.Series( - [91762939, 91762890, 6020836], dtype=pl.Duration(time_unit="ms") - ) - } - ) - - # we don't guarantee a format, just round-circling - value = str(df.write_json(row_oriented=True)) - assert value == """[{"a":"PT91762.939S"},{"a":"PT91762.89S"},{"a":"PT6020.836S"}]""" - - -@pytest.mark.parametrize( - ("data", "dtype"), - [ - ([[1, 2, 3], [None, None, None], [1, None, 3]], pl.Array(pl.Int32(), width=3)), - ([["a", "b"], [None, None]], pl.Array(pl.Utf8, width=2)), - ([[True, False, None], [None, None, None]], pl.Array(pl.Utf8, width=3)), - ( - [[[1, 2, 3], [4, None, 5]], None, [[None, None, 2]]], - pl.List(pl.Array(pl.Int32(), width=3)), - ), - ( - [ - [datetime.datetime(1991, 1, 1), datetime.datetime(1991, 1, 1), None], - [None, None, None], - ], - pl.Array(pl.Datetime, width=3), - ), - ], -) -def test_write_read_json_array(data: Any, dtype: pl.DataType) -> None: - df = pl.DataFrame({"foo": data}, schema={"foo": dtype}) - buf = io.StringIO() - df.write_json(buf) - buf.seek(0) - deserialized_df = pl.read_json(buf) - assert_frame_equal(deserialized_df, df) - - -@pytest.mark.parametrize( - ("data", "dtype"), - [ - ( - [ - [ - datetime.datetime(1997, 10, 1), - datetime.datetime(2000, 1, 2, 10, 30, 1), - ], - [None, None], - ], - pl.Array(pl.Datetime, width=2), - ), - ( - [[datetime.date(1997, 10, 1), datetime.date(2000, 1, 1)], [None, None]], - pl.Array(pl.Date, width=2), - ), - ( - [ - [datetime.timedelta(seconds=1), datetime.timedelta(seconds=10)], - [None, None], - ], - pl.Array(pl.Duration, width=2), - ), - ], -) -def test_write_read_json_array_logical_inner_type( - data: Any, dtype: pl.DataType -) -> None: - df = pl.DataFrame({"foo": data}, schema={"foo": dtype}) - buf = io.StringIO() - df.write_json(buf) - buf.seek(0) - deserialized_df = pl.read_json(buf) - assert deserialized_df.dtypes == df.dtypes - assert deserialized_df.to_dict(as_series=False) == df.to_dict(as_series=False) - - def test_json_null_infer() -> None: json = BytesIO( bytes( @@ -408,14 +287,3 @@ def test_ndjson_null_inference_13183() -> None: "start_time": [0.795, 1.6239999999999999, 2.184, None], "end_time": [1.495, 2.0540000000000003, 2.645, None], } - - -@pytest.mark.parametrize("pretty", [True, False]) -def test_json_enum(pretty: bool) -> None: - dtype = pl.Enum(["foo", "bar", "ham"]) - df = pl.DataFrame([pl.Series("e", ["foo", "bar", "ham"], dtype=dtype)]) - buf = io.StringIO() - df.write_json(buf, pretty=pretty) - buf.seek(0) - df_in = pl.read_json(buf) - assert df_in.schema["e"] == dtype