Move long-running tests that read from or write to disk from tests/unit to tests/func

iterative · Jan 20, 2025 · d90cae9 · d90cae9
1 parent aadbfbd
commit d90cae9
Show file tree

Hide file tree

Showing 4 changed files with 93 additions and 85 deletions.
diff --git a/tests/func/test_datachain.py b/tests/func/test_datachain.py
@@ -1,4 +1,5 @@
 import functools
+import json
 import math
 import os
 import pickle
@@ -1700,6 +1701,47 @@ def test_to_from_parquet_partitioned_remote(cloud_test_catalog_upload):
     assert df_equal(df1, df)
 
 
+# These deprecation warnings occur in the datamodel-code-generator package.
+@pytest.mark.filterwarnings("ignore::pydantic.warnings.PydanticDeprecatedSince20")
+def test_to_from_json(tmp_dir, test_session):
+    df = pd.DataFrame(DF_DATA)
+    dc_to = DataChain.from_pandas(df, session=test_session)
+    path = tmp_dir / "test.json"
+    dc_to.order_by("first_name", "age").to_json(path)
+
+    with open(path) as f:
+        values = json.load(f)
+    assert values == [
+        {"first_name": n, "age": a, "city": c}
+        for n, a, c in zip(DF_DATA["first_name"], DF_DATA["age"], DF_DATA["city"])
+    ]
+
+    dc_from = DataChain.from_json(path.as_uri(), session=test_session)
+    df1 = dc_from.select("json.first_name", "json.age", "json.city").to_pandas()
+    df1 = df1["json"]
+    assert df_equal(df1, df)
+
+
+# These deprecation warnings occur in the datamodel-code-generator package.
+@pytest.mark.filterwarnings("ignore::pydantic.warnings.PydanticDeprecatedSince20")
+def test_from_json_jmespath(tmp_dir, test_session):
+    df = pd.DataFrame(DF_DATA)
+    values = [
+        {"first_name": n, "age": a, "city": c}
+        for n, a, c in zip(DF_DATA["first_name"], DF_DATA["age"], DF_DATA["city"])
+    ]
+    path = tmp_dir / "test.json"
+    with open(path, "w") as f:
+        json.dump({"author": "Test User", "version": 5, "values": values}, f)
+
+    dc_from = DataChain.from_json(
+        path.as_uri(), jmespath="values", session=test_session
+    )
+    df1 = dc_from.select("values.first_name", "values.age", "values.city").to_pandas()
+    df1 = df1["values"]
+    assert df_equal(df1, df)
+
+
 # These deprecation warnings occur in the datamodel-code-generator package.
 @pytest.mark.filterwarnings("ignore::pydantic.warnings.PydanticDeprecatedSince20")
 def test_to_from_json_remote(cloud_test_catalog_upload):

diff --git a/tests/func/test_hf.py b/tests/func/test_hf.py
@@ -0,0 +1,50 @@
+import numpy as np
+from datasets import load_dataset
+from datasets.features.image import image_to_bytes
+from PIL import Image
+from scipy.io.wavfile import write
+
+from datachain.lib.data_model import dict_to_data_model
+from datachain.lib.hf import (
+    HFGenerator,
+    HFImage,
+    get_output_schema,
+)
+
+
+def test_hf_image(tmp_path):
+    train_dir = tmp_path / "train"
+    train_dir.mkdir()
+    img = Image.new(mode="RGB", size=(64, 64))
+    img.save(train_dir / "img1.png")
+
+    ds = load_dataset("imagefolder", data_dir=tmp_path)
+    schema = {"split": str} | get_output_schema(ds["train"].features)
+    assert schema["image"] is HFImage
+
+    gen = HFGenerator(ds, dict_to_data_model("", schema))
+    gen.setup()
+    row = next(iter(gen.process("train")))
+    assert row.image.img == image_to_bytes(img)
+
+
+def test_hf_audio(tmp_path):
+    # See https://stackoverflow.com/questions/66191480/how-to-convert-a-numpy-array-to-a-mp3-file
+    samplerate = 44100
+    fs = 100
+    t = np.linspace(0.0, 1.0, samplerate)
+    amplitude = np.iinfo(np.int16).max
+    data = amplitude * np.sin(2.0 * np.pi * fs * t)
+    train_dir = tmp_path / "train"
+    train_dir.mkdir()
+    write(train_dir / "example.wav", samplerate, data.astype(np.int16))
+
+    ds = load_dataset("audiofolder", data_dir=tmp_path)
+    schema = {"split": str} | get_output_schema(ds["train"].features)
+
+    gen = HFGenerator(ds, dict_to_data_model("", schema))
+    gen.setup()
+    row = next(iter(gen.process("train")))
+    assert row.audio.path == str(train_dir / "example.wav")
+    assert np.allclose(row.audio.array, data / amplitude, atol=1e-4)
+    assert row.audio.sampling_rate == samplerate
diff --git a/tests/unit/lib/test_datachain.py b/tests/unit/lib/test_datachain.py
@@ -1423,47 +1423,6 @@ def test_explode_raises_on_wrong_column_type(test_session):
         dc.explode("f1.count")
 
 
-# These deprecation warnings occur in the datamodel-code-generator package.
-@pytest.mark.filterwarnings("ignore::pydantic.warnings.PydanticDeprecatedSince20")
-def test_to_from_json(tmp_dir, test_session):
-    df = pd.DataFrame(DF_DATA)
-    dc_to = DataChain.from_pandas(df, session=test_session)
-    path = tmp_dir / "test.json"
-    dc_to.order_by("first_name", "age").to_json(path)
-
-    with open(path) as f:
-        values = json.load(f)
-    assert values == [
-        {"first_name": n, "age": a, "city": c}
-        for n, a, c in zip(DF_DATA["first_name"], DF_DATA["age"], DF_DATA["city"])
-    ]
-
-    dc_from = DataChain.from_json(path.as_uri(), session=test_session)
-    df1 = dc_from.select("json.first_name", "json.age", "json.city").to_pandas()
-    df1 = df1["json"]
-    assert df_equal(df1, df)
-
-
-# These deprecation warnings occur in the datamodel-code-generator package.
-@pytest.mark.filterwarnings("ignore::pydantic.warnings.PydanticDeprecatedSince20")
-def test_from_json_jmespath(tmp_dir, test_session):
-    df = pd.DataFrame(DF_DATA)
-    values = [
-        {"first_name": n, "age": a, "city": c}
-        for n, a, c in zip(DF_DATA["first_name"], DF_DATA["age"], DF_DATA["city"])
-    ]
-    path = tmp_dir / "test.json"
-    with open(path, "w") as f:
-        json.dump({"author": "Test User", "version": 5, "values": values}, f)
-
-    dc_from = DataChain.from_json(
-        path.as_uri(), jmespath="values", session=test_session
-    )
-    df1 = dc_from.select("values.first_name", "values.age", "values.city").to_pandas()
-    df1 = df1["values"]
-    assert df_equal(df1, df)
-
-
 def test_to_json_features(tmp_dir, test_session):
     dc_to = DataChain.from_values(
         f1=features, num=range(len(features)), session=test_session

diff --git a/tests/unit/lib/test_hf.py b/tests/unit/lib/test_hf.py
@@ -1,14 +1,9 @@
-import numpy as np
-from datasets import Array2D, Dataset, DatasetDict, Sequence, Value, load_dataset
-from datasets.features.image import image_to_bytes
-from PIL import Image
-from scipy.io.wavfile import write
+from datasets import Array2D, Dataset, DatasetDict, Sequence, Value
 
 from datachain.lib.data_model import dict_to_data_model
 from datachain.lib.hf import (
     HFClassLabel,
     HFGenerator,
-    HFImage,
     get_output_schema,
     stream_splits,
 )
@@ -92,41 +87,3 @@ def test_hf_array():
     gen.setup()
     row = next(iter(gen.process()))
     assert row.arr == [[0, 1], [2, 3]]
-
-
-def test_hf_image(tmp_path):
-    train_dir = tmp_path / "train"
-    train_dir.mkdir()
-    img = Image.new(mode="RGB", size=(64, 64))
-    img.save(train_dir / "img1.png")
-
-    ds = load_dataset("imagefolder", data_dir=tmp_path)
-    schema = {"split": str} | get_output_schema(ds["train"].features)
-    assert schema["image"] is HFImage
-
-    gen = HFGenerator(ds, dict_to_data_model("", schema))
-    gen.setup()
-    row = next(iter(gen.process("train")))
-    assert row.image.img == image_to_bytes(img)
-
-
-def test_hf_audio(tmp_path):
-    # See https://stackoverflow.com/questions/66191480/how-to-convert-a-numpy-array-to-a-mp3-file
-    samplerate = 44100
-    fs = 100
-    t = np.linspace(0.0, 1.0, samplerate)
-    amplitude = np.iinfo(np.int16).max
-    data = amplitude * np.sin(2.0 * np.pi * fs * t)
-    train_dir = tmp_path / "train"
-    train_dir.mkdir()
-    write(train_dir / "example.wav", samplerate, data.astype(np.int16))
-
-    ds = load_dataset("audiofolder", data_dir=tmp_path)
-    schema = {"split": str} | get_output_schema(ds["train"].features)
-
-    gen = HFGenerator(ds, dict_to_data_model("", schema))
-    gen.setup()
-    row = next(iter(gen.process("train")))
-    assert row.audio.path == str(train_dir / "example.wav")
-    assert np.allclose(row.audio.array, data / amplitude, atol=1e-4)
-    assert row.audio.sampling_rate == samplerate