From dd269913b1edcb95409ef9d5bf6d7819ebb7e91a Mon Sep 17 00:00:00 2001
From: Stijn de Gooijer <stijndegooijer@gmail.com>
Date: Tue, 28 May 2024 11:26:47 +0200
Subject: [PATCH] Move DataFrame serde round-trip tests from test_json.py to test_serde.py

---
 py-polars/pyproject.toml                     |   3 +
 py-polars/tests/unit/dataframe/test_serde.py | 136 ++++++++++++
 py-polars/tests/unit/io/test_json.py         | 216 ++++---------------
 3 files changed, 181 insertions(+), 174 deletions(-)

diff --git a/py-polars/pyproject.toml b/py-polars/pyproject.toml
index aff0066a4c10..b925da325933 100644
--- a/py-polars/pyproject.toml
+++ b/py-polars/pyproject.toml
@@ -238,6 +238,9 @@ filterwarnings = [
   # TODO: Remove when behavior is updated
   # https://github.com/pola-rs/polars/issues/13441
   "ignore:.*default coalesce behavior of left join.*:DeprecationWarning",
+  # TODO: Remove when default is updated
+  # https://github.com/pola-rs/polars/issues/14526
+  "ignore:.*will only write row-oriented JSON.*:DeprecationWarning",
 ]
 xfail_strict = true
 
diff --git a/py-polars/tests/unit/dataframe/test_serde.py b/py-polars/tests/unit/dataframe/test_serde.py
index 8f806e244801..d649c6b4c3b2 100644
--- a/py-polars/tests/unit/dataframe/test_serde.py
+++ b/py-polars/tests/unit/dataframe/test_serde.py
@@ -1,6 +1,16 @@
+from __future__ import annotations
+
+import io
+from datetime import date, datetime, timedelta
+from typing import TYPE_CHECKING, Any
+
 import pytest
 
 import polars as pl
+from polars.testing import assert_frame_equal
+
+if TYPE_CHECKING:
+    from pathlib import Path
 
 
 def test_df_serialize() -> None:
@@ -10,6 +20,132 @@ def test_df_serialize() -> None:
     assert result == expected
 
 
+@pytest.mark.parametrize("buf", [io.BytesIO(), io.StringIO()])
+def test_to_from_buffer(df: pl.DataFrame, buf: io.IOBase) -> None:
+    df.serialize(buf)
+    buf.seek(0)
+    read_df = pl.DataFrame.deserialize(buf)
+    assert_frame_equal(df, read_df, categorical_as_str=True)
+
+
+@pytest.mark.write_disk()
+def test_to_from_file(df: pl.DataFrame, tmp_path: Path) -> None:
+    tmp_path.mkdir(exist_ok=True)
+
+    file_path = tmp_path / "small.json"
+    df.serialize(file_path)
+    out = pl.DataFrame.deserialize(file_path)
+
+    assert_frame_equal(df, out, categorical_as_str=True)
+
+
+def test_write_json_to_string() -> None:
+    # Check that serialize() returns the JSON string when no file argument is given
+    df = pl.DataFrame({"a": [1, 2, 3]})
+    expected_str = '{"columns":[{"name":"a","datatype":"Int64","bit_settings":"","values":[1,2,3]}]}'
+    assert df.serialize() == expected_str
+
+
+def test_write_json(df: pl.DataFrame) -> None:
+    # Text-based conversion loses time info
+    df = df.select(pl.all().exclude(["cat", "time"]))
+    s = df.serialize()
+    f = io.BytesIO()
+    f.write(s.encode())
+    f.seek(0)
+    out = pl.DataFrame.deserialize(f)
+    assert_frame_equal(out, df)
+
+    file = io.BytesIO()
+    df.serialize(file)
+    file.seek(0)
+    out = pl.DataFrame.deserialize(file)
+    assert_frame_equal(out, df)
+
+
+def test_df_serde_enum() -> None:
+    dtype = pl.Enum(["foo", "bar", "ham"])
+    df = pl.DataFrame([pl.Series("e", ["foo", "bar", "ham"], dtype=dtype)])
+    buf = io.StringIO()
+    df.serialize(buf)
+    buf.seek(0)
+    df_in = pl.DataFrame.deserialize(buf)
+    assert df_in.schema["e"] == dtype
+
+
+@pytest.mark.parametrize(
+    ("data", "dtype"),
+    [
+        ([[1, 2, 3], [None, None, None], [1, None, 3]], pl.Array(pl.Int32(), width=3)),
+        ([["a", "b"], [None, None]], pl.Array(pl.Utf8, width=2)),
+        ([[True, False, None], [None, None, None]], pl.Array(pl.Boolean, width=3)),
+        (
+            [[[1, 2, 3], [4, None, 5]], None, [[None, None, 2]]],
+            pl.List(pl.Array(pl.Int32(), width=3)),
+        ),
+        (
+            [
+                [datetime(1991, 1, 1), datetime(1991, 1, 1), None],
+                [None, None, None],
+            ],
+            pl.Array(pl.Datetime, width=3),
+        ),
+    ],
+)
+def test_write_read_json_array(data: Any, dtype: pl.DataType) -> None:
+    # Array columns must survive a serialize/deserialize round trip unchanged.
+    df = pl.DataFrame({"foo": data}, schema={"foo": dtype})
+    buf = io.StringIO()
+    df.serialize(buf)
+    buf.seek(0)
+    assert_frame_equal(pl.DataFrame.deserialize(buf), df)
+
+
+@pytest.mark.parametrize(
+    ("data", "dtype"),
+    [
+        (
+            [
+                [
+                    datetime(1997, 10, 1),
+                    datetime(2000, 1, 2, 10, 30, 1),
+                ],
+                [None, None],
+            ],
+            pl.Array(pl.Datetime, width=2),
+        ),
+        (
+            [[date(1997, 10, 1), date(2000, 1, 1)], [None, None]],
+            pl.Array(pl.Date, width=2),
+        ),
+        (
+            [
+                [timedelta(seconds=1), timedelta(seconds=10)],
+                [None, None],
+            ],
+            pl.Array(pl.Duration, width=2),
+        ),
+    ],
+)
+def test_write_read_json_array_logical_inner_type(
+    data: Any, dtype: pl.DataType
+) -> None:
+    df = pl.DataFrame({"foo": data}, schema={"foo": dtype})
+    buf = io.StringIO()
+    df.serialize(buf)
+    buf.seek(0)
+    deserialized_df = pl.DataFrame.deserialize(buf)
+    assert deserialized_df.dtypes == df.dtypes
+    assert deserialized_df.to_dict(as_series=False) == df.to_dict(as_series=False)
+
+
+def test_json_deserialize_empty_list_10458() -> None:
+    schema = {"LIST_OF_STRINGS": pl.List(pl.String)}
+    serialized_schema = pl.DataFrame(schema=schema).serialize()
+    df = pl.DataFrame.deserialize(io.StringIO(serialized_schema))
+    assert df.schema == schema
+
+
 def test_df_write_json_deprecated() -> None:
     df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
     with pytest.deprecated_call():
diff --git a/py-polars/tests/unit/io/test_json.py b/py-polars/tests/unit/io/test_json.py
index 9acbb061a63c..cefe2ab171d0 100644
--- a/py-polars/tests/unit/io/test_json.py
+++ b/py-polars/tests/unit/io/test_json.py
@@ -1,38 +1,62 @@
 from __future__ import annotations
 
-import datetime
 import io
 import json
 from collections import OrderedDict
 from io import BytesIO
-from typing import TYPE_CHECKING, Any
 
 import pytest
 
 import polars as pl
 from polars.testing import assert_frame_equal
 
-if TYPE_CHECKING:
-    from pathlib import Path
 
+def test_write_json_row_oriented() -> None:
+    df = pl.DataFrame({"a": [1, 2, 3], "b": ["a", "b", None]})
+    out = df.write_json(row_oriented=True)
+    assert out == '[{"a":1,"b":"a"},{"a":2,"b":"b"},{"a":3,"b":null}]'
 
-@pytest.mark.parametrize("buf", [io.BytesIO(), io.StringIO()])
-def test_to_from_buffer(df: pl.DataFrame, buf: io.IOBase) -> None:
-    df.write_json(buf)
-    buf.seek(0)
-    read_df = pl.read_json(buf)
-    assert_frame_equal(df, read_df, categorical_as_str=True)
+    # Test round trip
+    f = io.BytesIO()
+    f.write(out.encode())
+    f.seek(0)
+    result = pl.read_json(f)
+    assert_frame_equal(result, df)
 
 
-@pytest.mark.write_disk()
-def test_to_from_file(df: pl.DataFrame, tmp_path: Path) -> None:
-    tmp_path.mkdir(exist_ok=True)
+def test_write_json_categoricals() -> None:
+    data = {"column": ["test1", "test2", "test3", "test4"]}
+    df = pl.DataFrame(data).with_columns(pl.col("column").cast(pl.Categorical))
 
-    file_path = tmp_path / "small.json"
-    df.write_json(file_path)
-    out = pl.read_json(file_path)
+    assert (
+        df.write_json(row_oriented=True, file=None)
+        == '[{"column":"test1"},{"column":"test2"},{"column":"test3"},{"column":"test4"}]'
+    )
 
-    assert_frame_equal(df, out, categorical_as_str=True)
+
+def test_write_json_duration() -> None:
+    df = pl.DataFrame(
+        {
+            "a": pl.Series(
+                [91762939, 91762890, 6020836], dtype=pl.Duration(time_unit="ms")
+            )
+        }
+    )
+
+    # we don't guarantee a format, just round-tripping
+    value = str(df.write_json(row_oriented=True))
+    assert value == """[{"a":"PT91762.939S"},{"a":"PT91762.89S"},{"a":"PT6020.836S"}]"""
+
+
+def test_json_infer_schema_length_11148() -> None:
+    response = [{"col1": 1}] * 2 + [{"col1": 1, "col2": 2}] * 1
+    result = pl.read_json(json.dumps(response).encode(), infer_schema_length=2)
+    with pytest.raises(AssertionError):
+        assert set(result.columns) == {"col1", "col2"}
+
+    response = [{"col1": 1}] * 2 + [{"col1": 1, "col2": 2}] * 1
+    result = pl.read_json(json.dumps(response).encode(), infer_schema_length=3)
+    assert set(result.columns) == {"col1", "col2"}
 
 
 def test_to_from_buffer_arraywise_schema() -> None:
@@ -84,43 +108,6 @@ def test_to_from_buffer_arraywise_schema_override() -> None:
     )
 
 
-def test_write_json_to_string() -> None:
-    # Tests if it runs if no arg given
-    df = pl.DataFrame({"a": [1, 2, 3]})
-    expected_str = '{"columns":[{"name":"a","datatype":"Int64","bit_settings":"","values":[1,2,3]}]}'
-    assert df.write_json() == expected_str
-
-
-def test_write_json(df: pl.DataFrame) -> None:
-    # Text-based conversion loses time info
-    df = df.select(pl.all().exclude(["cat", "time"]))
-    s = df.write_json()
-    f = io.BytesIO()
-    f.write(s.encode())
-    f.seek(0)
-    out = pl.read_json(f)
-    assert_frame_equal(out, df)
-
-    file = io.BytesIO()
-    df.write_json(file)
-    file.seek(0)
-    out = pl.read_json(file)
-    assert_frame_equal(out, df)
-
-
-def test_write_json_row_oriented() -> None:
-    df = pl.DataFrame({"a": [1, 2, 3], "b": ["a", "b", None]})
-    out = df.write_json(row_oriented=True)
-    assert out == '[{"a":1,"b":"a"},{"a":2,"b":"b"},{"a":3,"b":null}]'
-
-    # Test round trip
-    f = io.BytesIO()
-    f.write(out.encode())
-    f.seek(0)
-    result = pl.read_json(f)
-    assert_frame_equal(result, df)
-
-
 def test_write_ndjson() -> None:
     df = pl.DataFrame({"a": [1, 2, 3], "b": ["a", "b", None]})
     out = df.write_ndjson()
@@ -165,16 +152,6 @@ def test_ndjson_nested_string_int() -> None:
     }
 
 
-def test_write_json_categoricals() -> None:
-    data = {"column": ["test1", "test2", "test3", "test4"]}
-    df = pl.DataFrame(data).with_columns(pl.col("column").cast(pl.Categorical))
-
-    assert (
-        df.write_json(row_oriented=True, file=None)
-        == '[{"column":"test1"},{"column":"test2"},{"column":"test3"},{"column":"test4"}]'
-    )
-
-
 def test_json_supertype_infer() -> None:
     json_string = """[
 {"c":[{"b": [], "a": "1"}]},
@@ -186,7 +163,7 @@ def test_json_supertype_infer() -> None:
     assert_frame_equal(python_infer, polars_infer)
 
 
-def test_json_sliced_list_serialization() -> None:
+def test_ndjson_sliced_list_serialization() -> None:
     data = {"col1": [0, 2], "col2": [[3, 4, 5], [6, 7, 8]]}
     df = pl.DataFrame(data)
     f = io.BytesIO()
@@ -195,13 +172,6 @@ def test_json_sliced_list_serialization() -> None:
     assert f.getvalue() == b'{"col1":2,"col2":[6,7,8]}\n'
 
 
-def test_json_deserialize_empty_list_10458() -> None:
-    schema = {"LIST_OF_STRINGS": pl.List(pl.String)}
-    serialized_schema = pl.DataFrame(schema=schema).write_json()
-    df = pl.read_json(io.StringIO(serialized_schema))
-    assert df.schema == schema
-
-
 def test_json_deserialize_9687() -> None:
     response = {
         "volume": [0.0, 0.0, 0.0],
@@ -216,17 +186,6 @@ def test_json_deserialize_9687() -> None:
     assert result.to_dict(as_series=False) == {k: [v] for k, v in response.items()}
 
 
-def test_json_infer_schema_length_11148() -> None:
-    response = [{"col1": 1}] * 2 + [{"col1": 1, "col2": 2}] * 1
-    result = pl.read_json(json.dumps(response).encode(), infer_schema_length=2)
-    with pytest.raises(AssertionError):
-        assert set(result.columns) == {"col1", "col2"}
-
-    response = [{"col1": 1}] * 2 + [{"col1": 1, "col2": 2}] * 1
-    result = pl.read_json(json.dumps(response).encode(), infer_schema_length=3)
-    assert set(result.columns) == {"col1", "col2"}
-
-
 def test_ndjson_ignore_errors() -> None:
     # this schema is inconsistent as "value" is string and object
     jsonl = r"""{"Type":"insert","Key":[1],"SeqNo":1,"Timestamp":1,"Fields":[{"Name":"added_id","Value":2},{"Name":"body","Value":{"a": 1}}]}
@@ -276,86 +235,6 @@ def test_ndjson_ignore_errors() -> None:
     assert result.to_dict(as_series=False) == expected
 
 
-def test_write_json_duration() -> None:
-    df = pl.DataFrame(
-        {
-            "a": pl.Series(
-                [91762939, 91762890, 6020836], dtype=pl.Duration(time_unit="ms")
-            )
-        }
-    )
-
-    # we don't guarantee a format, just round-circling
-    value = str(df.write_json(row_oriented=True))
-    assert value == """[{"a":"PT91762.939S"},{"a":"PT91762.89S"},{"a":"PT6020.836S"}]"""
-
-
-@pytest.mark.parametrize(
-    ("data", "dtype"),
-    [
-        ([[1, 2, 3], [None, None, None], [1, None, 3]], pl.Array(pl.Int32(), width=3)),
-        ([["a", "b"], [None, None]], pl.Array(pl.Utf8, width=2)),
-        ([[True, False, None], [None, None, None]], pl.Array(pl.Utf8, width=3)),
-        (
-            [[[1, 2, 3], [4, None, 5]], None, [[None, None, 2]]],
-            pl.List(pl.Array(pl.Int32(), width=3)),
-        ),
-        (
-            [
-                [datetime.datetime(1991, 1, 1), datetime.datetime(1991, 1, 1), None],
-                [None, None, None],
-            ],
-            pl.Array(pl.Datetime, width=3),
-        ),
-    ],
-)
-def test_write_read_json_array(data: Any, dtype: pl.DataType) -> None:
-    df = pl.DataFrame({"foo": data}, schema={"foo": dtype})
-    buf = io.StringIO()
-    df.write_json(buf)
-    buf.seek(0)
-    deserialized_df = pl.read_json(buf)
-    assert_frame_equal(deserialized_df, df)
-
-
-@pytest.mark.parametrize(
-    ("data", "dtype"),
-    [
-        (
-            [
-                [
-                    datetime.datetime(1997, 10, 1),
-                    datetime.datetime(2000, 1, 2, 10, 30, 1),
-                ],
-                [None, None],
-            ],
-            pl.Array(pl.Datetime, width=2),
-        ),
-        (
-            [[datetime.date(1997, 10, 1), datetime.date(2000, 1, 1)], [None, None]],
-            pl.Array(pl.Date, width=2),
-        ),
-        (
-            [
-                [datetime.timedelta(seconds=1), datetime.timedelta(seconds=10)],
-                [None, None],
-            ],
-            pl.Array(pl.Duration, width=2),
-        ),
-    ],
-)
-def test_write_read_json_array_logical_inner_type(
-    data: Any, dtype: pl.DataType
-) -> None:
-    df = pl.DataFrame({"foo": data}, schema={"foo": dtype})
-    buf = io.StringIO()
-    df.write_json(buf)
-    buf.seek(0)
-    deserialized_df = pl.read_json(buf)
-    assert deserialized_df.dtypes == df.dtypes
-    assert deserialized_df.to_dict(as_series=False) == df.to_dict(as_series=False)
-
-
 def test_json_null_infer() -> None:
     json = BytesIO(
         bytes(
@@ -408,14 +287,3 @@ def test_ndjson_null_inference_13183() -> None:
         "start_time": [0.795, 1.6239999999999999, 2.184, None],
         "end_time": [1.495, 2.0540000000000003, 2.645, None],
     }
-
-
-@pytest.mark.parametrize("pretty", [True, False])
-def test_json_enum(pretty: bool) -> None:
-    dtype = pl.Enum(["foo", "bar", "ham"])
-    df = pl.DataFrame([pl.Series("e", ["foo", "bar", "ham"], dtype=dtype)])
-    buf = io.StringIO()
-    df.write_json(buf, pretty=pretty)
-    buf.seek(0)
-    df_in = pl.read_json(buf)
-    assert df_in.schema["e"] == dtype