Skip to content

Commit

Permalink
move long running tests that read/write from disk
Browse files Browse the repository at this point in the history
  • Loading branch information
mattseddon committed Jan 20, 2025
1 parent aadbfbd commit d90cae9
Show file tree
Hide file tree
Showing 4 changed files with 93 additions and 85 deletions.
42 changes: 42 additions & 0 deletions tests/func/test_datachain.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import functools
import json
import math
import os
import pickle
Expand Down Expand Up @@ -1700,6 +1701,47 @@ def test_to_from_parquet_partitioned_remote(cloud_test_catalog_upload):
assert df_equal(df1, df)


# These deprecation warnings occur in the datamodel-code-generator package.
@pytest.mark.filterwarnings("ignore::pydantic.warnings.PydanticDeprecatedSince20")
def test_to_from_json(tmp_dir, test_session):
    """Write DF_DATA to a local JSON file and read it back.

    Checks both the raw JSON written by ``to_json`` and the frame
    recovered through ``from_json``.
    """
    expected_df = pd.DataFrame(DF_DATA)
    json_path = tmp_dir / "test.json"

    chain = DataChain.from_pandas(expected_df, session=test_session)
    chain.order_by("first_name", "age").to_json(json_path)

    # The file on disk must be a plain list of row dicts.
    expected_rows = [
        {"first_name": first, "age": age, "city": city}
        for first, age, city in zip(
            DF_DATA["first_name"], DF_DATA["age"], DF_DATA["city"]
        )
    ]
    with open(json_path) as fh:
        written_rows = json.load(fh)
    assert written_rows == expected_rows

    # Reading back nests the columns under the "json" prefix.
    loaded = DataChain.from_json(json_path.as_uri(), session=test_session)
    loaded_df = loaded.select("json.first_name", "json.age", "json.city").to_pandas()
    assert df_equal(loaded_df["json"], expected_df)


# These deprecation warnings occur in the datamodel-code-generator package.
@pytest.mark.filterwarnings("ignore::pydantic.warnings.PydanticDeprecatedSince20")
def test_from_json_jmespath(tmp_dir, test_session):
    """Read only the nested ``values`` list of a JSON document via ``jmespath``."""
    expected_df = pd.DataFrame(DF_DATA)
    records = [
        {"first_name": first, "age": age, "city": city}
        for first, age, city in zip(
            DF_DATA["first_name"], DF_DATA["age"], DF_DATA["city"]
        )
    ]
    # Wrap the records in extra top-level keys so the jmespath selection matters.
    document = {"author": "Test User", "version": 5, "values": records}

    json_path = tmp_dir / "test.json"
    with open(json_path, "w") as fh:
        json.dump(document, fh)

    loaded = DataChain.from_json(
        json_path.as_uri(), jmespath="values", session=test_session
    )
    loaded_df = loaded.select(
        "values.first_name", "values.age", "values.city"
    ).to_pandas()
    assert df_equal(loaded_df["values"], expected_df)


# These deprecation warnings occur in the datamodel-code-generator package.
@pytest.mark.filterwarnings("ignore::pydantic.warnings.PydanticDeprecatedSince20")
def test_to_from_json_remote(cloud_test_catalog_upload):
Expand Down
50 changes: 50 additions & 0 deletions tests/func/test_hf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import numpy as np
from datasets import load_dataset
from datasets.features.image import image_to_bytes
from PIL import Image
from scipy.io.wavfile import write

from datachain.lib.data_model import dict_to_data_model
from datachain.lib.hf import (
HFGenerator,
HFImage,
get_output_schema,
)


def test_hf_image(tmp_path):
    """Load a one-image ``imagefolder`` dataset and check the generated row."""
    image = Image.new(mode="RGB", size=(64, 64))
    split_dir = tmp_path / "train"
    split_dir.mkdir()
    image.save(split_dir / "img1.png")

    dataset = load_dataset("imagefolder", data_dir=tmp_path)
    feature_schema = get_output_schema(dataset["train"].features)
    schema = {"split": str} | feature_schema
    assert schema["image"] is HFImage

    generator = HFGenerator(dataset, dict_to_data_model("", schema))
    generator.setup()
    first_row = next(iter(generator.process("train")))
    # The row must carry the same bytes that image_to_bytes yields for the source.
    assert first_row.image.img == image_to_bytes(image)


def test_hf_audio(tmp_path):
    """Load a one-file ``audiofolder`` dataset and check the generated row.

    A 100 Hz sine sampled at 44100 Hz is written as 16-bit WAV; the row's
    path, sample array and sampling rate are then checked.
    """
    # See https://stackoverflow.com/questions/66191480/how-to-convert-a-numpy-array-to-a-mp3-file
    sample_rate = 44100
    tone_hz = 100
    peak = np.iinfo(np.int16).max
    timeline = np.linspace(0.0, 1.0, sample_rate)
    waveform = peak * np.sin(2.0 * np.pi * tone_hz * timeline)

    split_dir = tmp_path / "train"
    split_dir.mkdir()
    wav_path = split_dir / "example.wav"
    write(wav_path, sample_rate, waveform.astype(np.int16))

    dataset = load_dataset("audiofolder", data_dir=tmp_path)
    feature_schema = get_output_schema(dataset["train"].features)
    schema = {"split": str} | feature_schema

    generator = HFGenerator(dataset, dict_to_data_model("", schema))
    generator.setup()
    first_row = next(iter(generator.process("train")))

    audio = first_row.audio
    assert audio.path == str(wav_path)
    # Compared against the waveform scaled by the int16 peak, within a small tolerance.
    assert np.allclose(audio.array, waveform / peak, atol=1e-4)
    assert audio.sampling_rate == sample_rate
41 changes: 0 additions & 41 deletions tests/unit/lib/test_datachain.py
Original file line number Diff line number Diff line change
Expand Up @@ -1423,47 +1423,6 @@ def test_explode_raises_on_wrong_column_type(test_session):
dc.explode("f1.count")


# These deprecation warnings occur in the datamodel-code-generator package.
@pytest.mark.filterwarnings("ignore::pydantic.warnings.PydanticDeprecatedSince20")
def test_to_from_json(tmp_dir, test_session):
    """Write DF_DATA to a local JSON file and read it back.

    Checks both the raw JSON written by ``to_json`` and the frame
    recovered through ``from_json``.
    """
    expected_df = pd.DataFrame(DF_DATA)
    json_path = tmp_dir / "test.json"

    chain = DataChain.from_pandas(expected_df, session=test_session)
    chain.order_by("first_name", "age").to_json(json_path)

    # The file on disk must be a plain list of row dicts.
    expected_rows = [
        {"first_name": first, "age": age, "city": city}
        for first, age, city in zip(
            DF_DATA["first_name"], DF_DATA["age"], DF_DATA["city"]
        )
    ]
    with open(json_path) as fh:
        written_rows = json.load(fh)
    assert written_rows == expected_rows

    # Reading back nests the columns under the "json" prefix.
    loaded = DataChain.from_json(json_path.as_uri(), session=test_session)
    loaded_df = loaded.select("json.first_name", "json.age", "json.city").to_pandas()
    assert df_equal(loaded_df["json"], expected_df)


# These deprecation warnings occur in the datamodel-code-generator package.
@pytest.mark.filterwarnings("ignore::pydantic.warnings.PydanticDeprecatedSince20")
def test_from_json_jmespath(tmp_dir, test_session):
    """Read only the nested ``values`` list of a JSON document via ``jmespath``."""
    expected_df = pd.DataFrame(DF_DATA)
    records = [
        {"first_name": first, "age": age, "city": city}
        for first, age, city in zip(
            DF_DATA["first_name"], DF_DATA["age"], DF_DATA["city"]
        )
    ]
    # Wrap the records in extra top-level keys so the jmespath selection matters.
    document = {"author": "Test User", "version": 5, "values": records}

    json_path = tmp_dir / "test.json"
    with open(json_path, "w") as fh:
        json.dump(document, fh)

    loaded = DataChain.from_json(
        json_path.as_uri(), jmespath="values", session=test_session
    )
    loaded_df = loaded.select(
        "values.first_name", "values.age", "values.city"
    ).to_pandas()
    assert df_equal(loaded_df["values"], expected_df)


def test_to_json_features(tmp_dir, test_session):
dc_to = DataChain.from_values(
f1=features, num=range(len(features)), session=test_session
Expand Down
45 changes: 1 addition & 44 deletions tests/unit/lib/test_hf.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,9 @@
import numpy as np
from datasets import Array2D, Dataset, DatasetDict, Sequence, Value, load_dataset
from datasets.features.image import image_to_bytes
from PIL import Image
from scipy.io.wavfile import write
from datasets import Array2D, Dataset, DatasetDict, Sequence, Value

from datachain.lib.data_model import dict_to_data_model
from datachain.lib.hf import (
HFClassLabel,
HFGenerator,
HFImage,
get_output_schema,
stream_splits,
)
Expand Down Expand Up @@ -92,41 +87,3 @@ def test_hf_array():
gen.setup()
row = next(iter(gen.process()))
assert row.arr == [[0, 1], [2, 3]]


def test_hf_image(tmp_path):
    """Load a one-image ``imagefolder`` dataset and check the generated row."""
    image = Image.new(mode="RGB", size=(64, 64))
    split_dir = tmp_path / "train"
    split_dir.mkdir()
    image.save(split_dir / "img1.png")

    dataset = load_dataset("imagefolder", data_dir=tmp_path)
    feature_schema = get_output_schema(dataset["train"].features)
    schema = {"split": str} | feature_schema
    assert schema["image"] is HFImage

    generator = HFGenerator(dataset, dict_to_data_model("", schema))
    generator.setup()
    first_row = next(iter(generator.process("train")))
    # The row must carry the same bytes that image_to_bytes yields for the source.
    assert first_row.image.img == image_to_bytes(image)


def test_hf_audio(tmp_path):
    """Load a one-file ``audiofolder`` dataset and check the generated row.

    A 100 Hz sine sampled at 44100 Hz is written as 16-bit WAV; the row's
    path, sample array and sampling rate are then checked.
    """
    # See https://stackoverflow.com/questions/66191480/how-to-convert-a-numpy-array-to-a-mp3-file
    sample_rate = 44100
    tone_hz = 100
    peak = np.iinfo(np.int16).max
    timeline = np.linspace(0.0, 1.0, sample_rate)
    waveform = peak * np.sin(2.0 * np.pi * tone_hz * timeline)

    split_dir = tmp_path / "train"
    split_dir.mkdir()
    wav_path = split_dir / "example.wav"
    write(wav_path, sample_rate, waveform.astype(np.int16))

    dataset = load_dataset("audiofolder", data_dir=tmp_path)
    feature_schema = get_output_schema(dataset["train"].features)
    schema = {"split": str} | feature_schema

    generator = HFGenerator(dataset, dict_to_data_model("", schema))
    generator.setup()
    first_row = next(iter(generator.process("train")))

    audio = first_row.audio
    assert audio.path == str(wav_path)
    # Compared against the waveform scaled by the int16 peak, within a small tolerance.
    assert np.allclose(audio.array, waveform / peak, atol=1e-4)
    assert audio.sampling_rate == sample_rate

0 comments on commit d90cae9

Please sign in to comment.