Skip to content

Commit

Permalink
feat: allow fslash in component names to simplify heuristics and serde
Browse files Browse the repository at this point in the history
  • Loading branch information
z3z1ma committed Jun 7, 2024
1 parent a89d8a9 commit eb440d9
Show file tree
Hide file tree
Showing 3 changed files with 131 additions and 19 deletions.
34 changes: 28 additions & 6 deletions src/cdf/core/project.py
Original file line number Diff line number Diff line change
Expand Up @@ -551,17 +551,39 @@ def _hydrate_workspaces(cls, value: t.Any, info: pydantic.ValidationInfo):
If the workspaces value is a path, load the configuration from that path.
"""
if isinstance(value, str):
value = list(map(lambda s: s.strip(), value.split(",")))
# ws1; ws2; ws3
value = list(map(lambda s: s.strip(), value.split(";")))
elif isinstance(value, dict):
value = [str(v["path"]) for v in value.values()]
# ws name : {ws config}
_buf = []
for ws_name, ws_config in value.items():
ws_config.setdefault("name", ws_name)
_buf.append(ws_config)
value = _buf
if isinstance(value, list):
for i, maybe_path in enumerate(value):
if isinstance(maybe_path, str):
path = Path(info.data["path"]) / maybe_path
# [{ws1 config} | ws1 path, {ws2 config}]
for i, obj in enumerate(value):
if isinstance(obj, (str, Path)):
# ws1 path
# ensure ws path is absolute, if not, resolve it
# relative to the project path
obj_path = Path(obj)
if obj_path.is_absolute():
path = obj_path
else:
path = Path(info.data["path"]) / obj
# load the configuration from the path
config = _load_config(path)
config["path"] = path
value[i] = config
return value
elif isinstance(obj, dict):
# {ws1 config}
obj_path = Path(obj["path"])
if not obj_path.is_absolute():
obj["path"] = Path(info.data["path"]) / obj["path"]
if not (hasattr(value, "__iter__") and not isinstance(value, (str, bytes))):
raise ValueError("Invalid workspaces configuration, must be an iterable")
return tuple(value)

@pydantic.model_validator(mode="after")
def _validate_workspaces(self):
Expand Down
23 changes: 10 additions & 13 deletions src/cdf/core/specification/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ class BaseComponent(
pydantic.Field(
...,
default_factory=_gen_anon_name,
pattern=r"^[a-zA-Z0-9_-]+$",
pattern=r"^[a-zA-Z0-9_\-\/]+$",
max_length=64,
),
]
Expand Down Expand Up @@ -220,8 +220,8 @@ def path(self) -> Path:

@pydantic.model_validator(mode="before")
@classmethod
def _infer_leaf_path_validator(cls, values: t.Any) -> t.Any:
"""Infer the leaf path from the name if component_path is not provided.
def _path_from_name_validator(cls, values: t.Any) -> t.Any:
"""Infer the path from the name if component_path is not provided.
Given a name, we apply certain heuristics to infer the path of the component if a
path is not explicitly provided. The heuristics are as follows:
Expand All @@ -242,27 +242,24 @@ def _infer_leaf_path_validator(cls, values: t.Any) -> t.Any:
ext = getattr(cls._extension, "default")
typ = getattr(cls._folder, "default")[:-1]
if name.endswith(f"_{typ}"):
leaf_path = f"{name}.{ext}"
p = f"{name}.{ext}"
else:
leaf_path = f"{name}_{typ}.{ext}"
values.setdefault("path", leaf_path)
p = f"{name}_{typ}.{ext}"
values.setdefault("path", p)
return values

@pydantic.field_validator("name", mode="before")
@classmethod
def _physical_name_validator(cls, name: t.Any) -> t.Any:
"""Canonicalizes names which are pathlike.
So a name like `some/path/to/file.py` would become `some_path_to_file`.
"""
def _component_name_validator(cls, name: t.Any) -> t.Any:
"""Strip the extension from the name."""
if isinstance(name, str):
return name.rsplit(".", 1)[0].replace(os.sep, "_")
return name.rsplit(".", 1)[0]
return name

@pydantic.field_validator("component_path", mode="before")
@classmethod
def _component_path_validator(cls, component_path: t.Any) -> Path:
"""Ensure the component path is a Path and that is a child of the expected folder."""
"""Ensure the component path is a Path and that it is a child of the expected folder."""
path = Path(component_path)
if path.is_absolute():
raise ValueError("Component path must be a relative path.")
Expand Down
93 changes: 93 additions & 0 deletions tests/core/test_project.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
"""Tests for the core.project module."""

from pathlib import Path

import dlt
import pytest

Expand Down Expand Up @@ -83,3 +85,94 @@ def test_round_trip_serialization(project: Project):
roundtrip = Project.model_validate(obj)
assert roundtrip == project
assert roundtrip.is_newer_than(project)
assert (
project["workspaces.alex.scripts.nested/hello"]
== roundtrip["workspaces.alex.scripts.nested/hello"]
)


@pytest.fixture
def python_project():
    """Return a ``Project`` validated from configuration assembled in code.

    The configuration declares a single workspace, ``datateam``, holding two
    pipelines (``cities`` and ``dota``), one sink, one publisher and one
    script. The project and workspace paths are ``examples/sandbox`` and
    ``examples/sandbox/alex``, each resolved to an absolute path, while the
    component paths are given relative (e.g. ``pipelines/dota2_pipeline.py``).
    """
    # Metric definitions registered under the "*" key of the cities pipeline.
    count_metric = {
        "name": "cdf_builtin_metrics_count",
        "description": "Counts the number of items in a dataset",
        "entrypoint": "cdf.builtin.metrics:count",
    }
    max_value_metric = {
        "name": "cdf_builtin_metrics_max_value",
        "description": "Returns the maximum value of a key in a dataset",
        "entrypoint": "cdf.builtin.metrics:max_value",
        "options": {"key": "zip_code"},
    }

    # Pipelines. Note that the cities spec carries no explicit "name".
    cities_pipeline = {
        "path": Path("pipelines/us_cities_pipeline.py"),
        "cron_string": "@daily",
        "description": "Get US city data",
        "metrics": {"*": [count_metric, max_value_metric]},
        "filters": {},
        "dataset_name": "test_city",
        "options": {
            "progress": None,
            "full_refresh": False,
            "loader_file_format": "insert_values",
            "runtime": {"dlthub_telemetry": False},
        },
    }
    dota_pipeline = {
        "cron_string": "@daily",
        "name": "dota2",
        "description": "Dota2 is a Massive Online Battle Arena game based on Warcraft.",
        "path": Path("pipelines/dota2_pipeline.py"),
    }

    # Remaining component kinds: sink, publisher, script.
    local_sink = {
        "name": "local",
        "description": "No description provided.",
        "path": Path("sinks/local_sink.py"),
    }
    httpbin_publisher = {
        "cron_string": "@daily",
        "name": "httpbin",
        "description": "A publisher that pushes data to httpbin.org",
        "path": Path("publishers/httpbin_publisher.py"),
        "depends_on": ["mart.zips"],
    }
    hello_script = {
        "cron_string": "@daily",
        "name": "hello",
        "description": "No description provided.",
        "path": Path("scripts/hello_script.py"),
    }

    # The one workspace of the project, keyed by its name below.
    datateam_workspace = {
        "path": Path("examples/sandbox/alex").resolve(),
        "pipelines": {"cities": cities_pipeline, "dota": dota_pipeline},
        "sinks": {"local": local_sink},
        "publishers": {"httpbin": httpbin_publisher},
        "scripts": {"hello": hello_script},
    }

    project_config = {
        "path": Path("examples/sandbox").resolve(),
        "name": "data-platform",
        "version": "0.2.0",
        "workspaces": {"datateam": datateam_workspace},
        "filesystem": {"uri": "file://_storage", "options": {}},
        "feature_flags": {
            "provider": "filesystem",
            "filename": "@jinja dev_flags_{{ 1 + 1}}.json",
        },
    }
    return Project.model_validate(project_config)


def test_custom_project(python_project: Project):
    """Check that a project assembled in code validates.

    The ``python_project`` fixture builds its configuration in code rather
    than loading it from a file. The components still point at python files;
    only the configuration wrapping them is written in code, which offers
    more flexibility.
    """
    expected_name = "data-platform"
    assert python_project.name == expected_name

0 comments on commit eb440d9

Please sign in to comment.