From 07f2736294b738b7b024c8eb9f1eb14f89ca1164 Mon Sep 17 00:00:00 2001 From: lucas-nelson-uiuc Date: Thu, 28 Nov 2024 10:54:40 -0600 Subject: [PATCH] test: update tests with config file --- src/tidy_tools/tidy/_logger.py | 11 ++++++ src/tidy_tools/tidy/_types.py | 5 +++ src/tidy_tools/tidy/dataframe.py | 1 + src/tidy_tools/tidy/workflow.py | 47 +++++++++++++++++++++++--- tests/conftest.py | 58 ++++++++++++++++++++++++++++++++ tests/test_filters.py | 0 tests/test_selector.py | 58 -------------------------------- 7 files changed, 118 insertions(+), 62 deletions(-) create mode 100644 src/tidy_tools/tidy/_logger.py create mode 100644 src/tidy_tools/tidy/_types.py create mode 100644 tests/conftest.py create mode 100644 tests/test_filters.py diff --git a/src/tidy_tools/tidy/_logger.py b/src/tidy_tools/tidy/_logger.py new file mode 100644 index 0000000..c96d7be --- /dev/null +++ b/src/tidy_tools/tidy/_logger.py @@ -0,0 +1,11 @@ +import sys +from loguru import logger + +logger.remove() +logger.add(sys.stderr, format="{time:HH:mm} | {level} | {message}") + + +def _logger(message: str, level: str = "info") -> None: + if not hasattr(logger, level): + raise ValueError(f"Logger does not have {level=}") + getattr(logger, level)(message) diff --git a/src/tidy_tools/tidy/_types.py b/src/tidy_tools/tidy/_types.py new file mode 100644 index 0000000..f55dfc8 --- /dev/null +++ b/src/tidy_tools/tidy/_types.py @@ -0,0 +1,5 @@ +from typing import Iterable, Callable + + +Functions = Callable | Iterable[Callable] +Objects = object | Iterable[object] diff --git a/src/tidy_tools/tidy/dataframe.py b/src/tidy_tools/tidy/dataframe.py index 9649695..7743f7e 100644 --- a/src/tidy_tools/tidy/dataframe.py +++ b/src/tidy_tools/tidy/dataframe.py @@ -61,6 +61,7 @@ def select(self, *selectors: ColumnSelector, strict: bool = True): return self def pipe(self, *funcs: Callable): + """Chain multiple custom transformation functions to be applied iteratively.""" self._data = functools.reduce( lambda init, func: init.transform(func), funcs, self._data ) diff --git a/src/tidy_tools/tidy/workflow.py b/src/tidy_tools/tidy/workflow.py index 263344b..0ecd94f 100644 --- a/src/tidy_tools/tidy/workflow.py +++ b/src/tidy_tools/tidy/workflow.py @@ -1,6 +1,45 @@ -from attrs import define +# from typing import Any +# import inspect +# import functools +# from attrs import define, field -@define -class TidyWorkFlow: - pass +# from tidy_tools.tidy._logger import _logger +# from tidy_tools.tidy._types import Functions, Objects + + +# def identity(obj: Any) -> Any: +# """Return input object as is.""" +# return obj + + +# def metadata_factory() -> dict: +# return dict( +# name="No name provided", +# description="No description provided" +# ) + +# @define +# class TidyWorkFlow: +# input: Objects +# funcs: Functions +# preprocess: Functions = field(default=identity) +# postprocess: Functions = field(default=identity) +# metadata: dict = field(factory=metadata_factory) + +# def run(self): +# input = map(self.preprocess, self.input) +# result = functools.reduce( +# lambda init, func: transform(init, func), +# self.funcs, +# self.input +# ) +# output = self.preprocess(result) +# return output + + +# def metadata(self): +# return { +# func.__name__: inspect.getdoc(func) +# for func in self.funcs +# } diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..d61d1f8 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,58 @@ +import datetime +import pytest + +from pyspark.sql import SparkSession, types as T + +from tidy_tools.tidy import TidyDataFrame + + +@pytest.fixture +def spark_fixture(): + spark = SparkSession.builder.appName("Testing PySpark Example").getOrCreate() + yield spark + + +@pytest.fixture +def sample_data(spark_fixture): + data = spark_fixture.createDataFrame( + [ + { + "name": "Homer", + "birth_date": datetime.date(1956, 5, 12), + "original_air_date": datetime.datetime(1987, 4, 19, 20, 0, 0), + "seasons": 36, + "instrument": None, + }, + { + "name": "Marge", + "birth_date": datetime.date(1956, 10, 1), + "original_air_date": datetime.datetime(1987, 4, 19, 20, 0, 0), + "seasons": 36, + "instrument": None, + }, + { + "name": "Bart", + "birth_date": datetime.date(1979, 4, 1), + "original_air_date": datetime.datetime(1987, 4, 19, 20, 0, 0), + "seasons": 36, + "instrument": None, + }, + { + "name": "Lisa", + "birth_date": datetime.date(1981, 5, 9), + "original_air_date": datetime.datetime(1987, 4, 19, 20, 0, 0), + "seasons": 36, + "instrument": "Saxophone", + }, + ], + schema=T.StructType( + [ + T.StructField("name", T.StringType(), nullable=False), + T.StructField("birth_date", T.DateType(), nullable=False), + T.StructField("original_air_date", T.TimestampType(), nullable=False), + T.StructField("seasons", T.IntegerType(), nullable=False), + T.StructField("instrument", T.StringType(), nullable=True), + ] + ), + ) + yield TidyDataFrame(data) \ No newline at end of file diff --git a/tests/test_filters.py b/tests/test_filters.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_selector.py b/tests/test_selector.py index eacae5e..ec3c3e9 100644 --- a/tests/test_selector.py +++ b/tests/test_selector.py @@ -1,62 +1,4 @@ -import pytest -import datetime - -from pyspark.sql import SparkSession, types as T - from tidy_tools.core import selector as cs -from tidy_tools.tidy import TidyDataFrame - - -@pytest.fixture -def spark_fixture(): - spark = SparkSession.builder.appName("Testing PySpark Example").getOrCreate() - yield spark - - -@pytest.fixture -def sample_data(spark_fixture): - data = spark_fixture.createDataFrame( - [ - { - "name": "Homer", - "birth_date": datetime.date(1956, 5, 12), - "original_air_date": datetime.datetime(1987, 4, 19, 20, 0, 0), - "seasons": 36, - "instrument": None, - }, - { - "name": "Marge", - "birth_date": datetime.date(1956, 10, 1), - "original_air_date": datetime.datetime(1987, 4, 19, 20, 0, 0), - "seasons": 36, - "instrument": None, - }, - { - "name": "Bart", - "birth_date": datetime.date(1979, 4, 1), - "original_air_date": datetime.datetime(1987, 4, 19, 20, 0, 0), - "seasons": 36, - "instrument": None, - }, - { - "name": "Lisa", - "birth_date": datetime.date(1981, 5, 9), - "original_air_date": datetime.datetime(1987, 4, 19, 20, 0, 0), - "seasons": 36, - "instrument": "Saxophone", - }, - ], - schema=T.StructType( - [ - T.StructField("name", T.StringType(), nullable=False), - T.StructField("birth_date", T.DateType(), nullable=False), - T.StructField("original_air_date", T.TimestampType(), nullable=False), - T.StructField("seasons", T.IntegerType(), nullable=False), - T.StructField("instrument", T.StringType(), nullable=True), - ] - ), - ) - yield TidyDataFrame(data) class TestColumnSelector: