-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
build: refactor tidydataframe with contextmanager
- Loading branch information
lucas-nelson-uiuc
committed
Nov 30, 2024
1 parent
a4843e8
commit 9b73b67
Showing
5 changed files
with
244 additions
and
62 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,10 +1,10 @@ | ||
from typing import TYPE_CHECKING, Optional | ||
from typing import TYPE_CHECKING | ||
|
||
if TYPE_CHECKING: | ||
TidyDataFrame = "TidyDataFrame" | ||
|
||
from pyspark.sql import Column, DataFrame | ||
from pyspark.sql import Column, DataFrame, GroupedData | ||
|
||
|
||
ColumnReference = str | Column | ||
DataFrameReference = DataFrame | Optional[DataFrame] | ||
DataFrameReference = DataFrame | GroupedData |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
import datetime | ||
import difflib | ||
from contextlib import contextmanager | ||
|
||
from attrs import define | ||
from attrs import field | ||
from loguru import logger | ||
from pyspark.sql import types as T | ||
|
||
|
||
# Register the custom levels used by tidy_context. loguru can only create a
# level that does not exist yet when a severity number (`no`) is supplied;
# passing just a color for an unknown name raises ValueError. 20 is the
# severity loguru gives its built-in INFO level.
logger.level("ENTER", no=20, color="<green>")
logger.level("EXIT", no=20, color="<red>")
|
||
|
||
@define
class TidySnapshot:
    """State captured for one tidy operation.

    The class is generated by attrs. ``timestamp`` is filled in per instance
    at construction time unless the caller passes one explicitly.
    """

    # Name of the operation this snapshot belongs to.
    operation: str
    # Free-form text stored alongside the operation.
    message: str
    # Spark schema at the time of the snapshot; compare_schemas reads its
    # ``fields``.
    schema: T.StructType
    # (row count, column count), in the order compare_dimensions reads them.
    dimensions: tuple[int, int]
    # A factory is required here: ``default=datetime.datetime.now()`` would be
    # evaluated once when the class is defined, stamping every snapshot with
    # the same import-time value.
    timestamp: datetime.datetime = field(factory=datetime.datetime.now)
|
||
|
||
@contextmanager
def tidy_context():
    """Provide a shared state dictionary for a run of tidy operations.

    Yields a dict holding two initially empty lists under the keys
    ``"operation_log"`` and ``"snapshots"``. An ENTER message is logged
    before the dict is handed to the ``with`` body, and an EXIT message is
    logged only when that body finishes without raising. In either case,
    every entry in ``"operation_log"`` is printed when the context closes.
    """
    state = {"operation_log": [], "snapshots": []}
    try:
        logger.log("ENTER", ">> Converting data to TidyDataFrame")
        yield state
        # Reached only when the with-body did not raise.
        logger.log("EXIT", "<< Returning data as DataFrame")
    finally:
        for entry in state["operation_log"]:
            print(entry)
|
||
|
||
def compute_delta(snapshot1: TidySnapshot, snapshot2: TidySnapshot):
    """Print the schema and dimension differences between two snapshots.

    Each section is a heading followed by the diff lines returned by
    compare_schemas / compare_dimensions, joined with newlines. Nothing is
    returned.
    """
    # Schema section: diff of the two snapshots' column descriptions.
    schema_lines = compare_schemas(snapshot1.schema, snapshot2.schema)
    print("Schema Changes:", "\n".join(schema_lines), sep="\n")

    # Dimension section: diff of the row/column counts.
    dimension_lines = compare_dimensions(
        snapshot1.dimensions, snapshot2.dimensions
    )
    print("Dimension Changes:", "\n".join(dimension_lines), sep="\n")
|
||
|
||
def compare_schemas(schema1, schema2):
    """Return an ndiff-style delta between the columns of two schemas.

    Each schema must expose ``fields``, an iterable of objects with ``name``
    and ``dataType`` attributes. Every field is rendered as
    ``"<name>: <dataType>"`` and the two renderings are compared with
    ``difflib.ndiff``; the resulting lines are returned as a list.
    """

    def describe(schema):
        # One "name: type" line per field, in schema order.
        return [f"{column.name}: {column.dataType}" for column in schema.fields]

    before = describe(schema1)
    after = describe(schema2)
    return [*difflib.ndiff(before, after)]
|
||
|
||
def compare_dimensions(dim1, dim2):
    """Return an ndiff-style delta between two ``(rows, columns)`` pairs.

    ``dim1`` is the earlier state and ``dim2`` the later one; index 0 is the
    row count and index 1 the column count. Unchanged counts come back
    prefixed with two spaces, changed counts as a ``"- "`` line (old value)
    followed by a ``"+ "`` line (new value).

    The previous implementation diffed the "Rows" line against the "Columns"
    line, so it always reported one removal and one addition, even when the
    dimensions were identical.
    """
    before = [f"Rows: {dim1[0]}", f"Columns: {dim1[1]}"]
    after = [f"Rows: {dim2[0]}", f"Columns: {dim2[1]}"]

    # ndiff emits "? " hint lines (with a trailing newline) for similar
    # lines; they add nothing for plain counts, so they are dropped.
    return [
        line
        for line in difflib.ndiff(before, after)
        if not line.startswith("? ")
    ]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,9 +1,12 @@ | ||
from attrs import define | ||
from tidy_tools.frame._types import Functions | ||
from tidy_tools.frame._types import Objects | ||
|
||
|
||
@define
class TidyWorkFlow:
    """attrs-generated container pairing a workflow's input with its functions."""

    # Value(s) the workflow starts from; typed with the project's Objects alias.
    input: Objects
    # Functions associated with the workflow; typed with the project's
    # Functions alias. NOTE(review): presumably applied to `input` — confirm
    # against callers.
    funcs: Functions
class tidyworkflow:
    """Minimal context manager that announces the start and end of its block."""

    def __enter__(self):
        """Print the start message and return this instance as the ``as`` target."""
        print("Starting")
        return self

    def __exit__(self, *exc_info):
        """Print the end message.

        Returning False means any exception raised inside the ``with`` body
        is not suppressed.
        """
        print("Finishing")
        return False
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters