From 0e91c499780a888876f690f7160918573b3b5ce9 Mon Sep 17 00:00:00 2001
From: z3z1ma
Date: Mon, 19 Aug 2024 18:48:41 -0700
Subject: [PATCH] chore: clean up old code

---
 examples/README.md | 1 -
 examples/sandbox/alex/cdf.yml | 41 -
 examples/sandbox/alex/config.py | 38 -
 examples/sandbox/alex/models/zips.sql | 8 -
 .../alex/notebooks/hello_world_notebook.ipynb | 78 --
 .../sandbox/alex/pipelines/dota2_pipeline.py | 132 --
 .../alex/pipelines/hackernews_pipeline.py | 141 --
 .../sandbox/alex/pipelines/test1/__init__.py | 0
 .../sandbox/alex/pipelines/test1/chore.py | 2 -
 .../alex/pipelines/us_cities_pipeline.py | 35 -
 .../alex/publishers/httpbin_publisher.py | 19 -
 examples/sandbox/alex/schema.yaml | 12 -
 examples/sandbox/alex/scripts/hello_script.py | 1 -
 .../alex/scripts/nested/hello_script.py | 5 -
 examples/sandbox/alex/sinks/fs_sink.py | 13 -
 examples/sandbox/alex/sinks/local_sink.py | 32 -
 examples/sandbox/cdf.yml | 23 -
 src/cdf/cli.py | 1211 -----------------
 src/cdf/integrations/sqlmesh.py | 147 --
 src/cdf/legacy/__init__.py | 0
 src/cdf/legacy/config.py | 151 --
 src/cdf/legacy/constants.py | 13 -
 src/cdf/legacy/context.py | 47 -
 src/cdf/legacy/filesystem.py | 106 --
 src/cdf/legacy/logger.py | 169 ---
 src/cdf/legacy/project.py | 989 --------------
 src/cdf/legacy/runtime/__init__.py | 12 -
 src/cdf/legacy/runtime/common.py | 43 -
 src/cdf/legacy/runtime/notebook.py | 109 --
 src/cdf/legacy/runtime/pipeline.py | 488 -------
 src/cdf/legacy/runtime/publisher.py | 83 --
 src/cdf/legacy/runtime/script.py | 65 -
 src/cdf/legacy/specification/__init__.py | 24 -
 src/cdf/legacy/specification/base.py | 523 -------
 src/cdf/legacy/specification/notebook.py | 45 -
 src/cdf/legacy/specification/pipeline.py | 223 ---
 src/cdf/legacy/specification/publisher.py | 16 -
 src/cdf/legacy/specification/script.py | 11 -
 src/cdf/legacy/specification/sink.py | 70 -
 src/cdf/legacy/state.py | 407 ------
 src/cdf/legacy/utility/__init__.py | 25 -
 src/cdf/legacy/utility/file.py | 75 -
 tests/legacy/specification/test_notebook.py | 0
 tests/legacy/specification/test_pipeline.py | 0
 tests/legacy/specification/test_publisher.py | 0
 tests/legacy/specification/test_script.py | 0
 tests/legacy/specification/test_sink.py | 0
 tests/legacy/test_context.py | 0
 tests/legacy/test_filesystem.py | 0
 tests/legacy/test_packaging.py | 0
 tests/legacy/test_project.py | 268 ----
 tests/legacy/utility/test_file_.py | 0
 tests/test_cli.py | 12 -
 53 files changed, 5913 deletions(-)
 delete mode 100644 examples/README.md
 delete mode 100644 examples/sandbox/alex/cdf.yml
 delete mode 100644 examples/sandbox/alex/config.py
 delete mode 100644 examples/sandbox/alex/models/zips.sql
 delete mode 100644 examples/sandbox/alex/notebooks/hello_world_notebook.ipynb
 delete mode 100644 examples/sandbox/alex/pipelines/dota2_pipeline.py
 delete mode 100644 examples/sandbox/alex/pipelines/hackernews_pipeline.py
 delete mode 100644 examples/sandbox/alex/pipelines/test1/__init__.py
 delete mode 100644 examples/sandbox/alex/pipelines/test1/chore.py
 delete mode 100644 examples/sandbox/alex/pipelines/us_cities_pipeline.py
 delete mode 100644 examples/sandbox/alex/publishers/httpbin_publisher.py
 delete mode 100644 examples/sandbox/alex/schema.yaml
 delete mode 100644 examples/sandbox/alex/scripts/hello_script.py
 delete mode 100644 examples/sandbox/alex/scripts/nested/hello_script.py
 delete mode 100644 examples/sandbox/alex/sinks/fs_sink.py
 delete mode 100644 examples/sandbox/alex/sinks/local_sink.py
 delete mode 100644 examples/sandbox/cdf.yml
 delete mode 100644 src/cdf/cli.py
 delete mode 100644 src/cdf/integrations/sqlmesh.py
 delete mode 100644 src/cdf/legacy/__init__.py
 delete mode 100644 src/cdf/legacy/config.py
 delete mode 100644 src/cdf/legacy/constants.py
 delete mode 100644 src/cdf/legacy/context.py
 delete mode 100644 src/cdf/legacy/filesystem.py
 delete mode 100644 src/cdf/legacy/logger.py
 delete mode 100644 src/cdf/legacy/project.py
 delete mode 100644 src/cdf/legacy/runtime/__init__.py
 delete mode 100644 src/cdf/legacy/runtime/common.py
 delete mode 100644 src/cdf/legacy/runtime/notebook.py
 delete mode 100644 src/cdf/legacy/runtime/pipeline.py
 delete mode 100644 src/cdf/legacy/runtime/publisher.py
 delete mode 100644 src/cdf/legacy/runtime/script.py
 delete mode 100644 src/cdf/legacy/specification/__init__.py
 delete mode 100644 src/cdf/legacy/specification/base.py
 delete mode 100644 src/cdf/legacy/specification/notebook.py
 delete mode 100644 src/cdf/legacy/specification/pipeline.py
 delete mode 100644 src/cdf/legacy/specification/publisher.py
 delete mode 100644 src/cdf/legacy/specification/script.py
 delete mode 100644 src/cdf/legacy/specification/sink.py
 delete mode 100644 src/cdf/legacy/state.py
 delete mode 100644 src/cdf/legacy/utility/__init__.py
 delete mode 100644 src/cdf/legacy/utility/file.py
 delete mode 100644 tests/legacy/specification/test_notebook.py
 delete mode 100644 tests/legacy/specification/test_pipeline.py
 delete mode 100644 tests/legacy/specification/test_publisher.py
 delete mode 100644 tests/legacy/specification/test_script.py
 delete mode 100644 tests/legacy/specification/test_sink.py
 delete mode 100644 tests/legacy/test_context.py
 delete mode 100644 tests/legacy/test_filesystem.py
 delete mode 100644 tests/legacy/test_packaging.py
 delete mode 100644 tests/legacy/test_project.py
 delete mode 100644 tests/legacy/utility/test_file_.py
 delete mode 100644 tests/test_cli.py

diff --git a/examples/README.md b/examples/README.md
deleted file mode 100644
index 72c0fa4..0000000
--- a/examples/README.md
+++ /dev/null
@@ -1 +0,0 @@
-These are example projects that use cdf.
diff --git a/examples/sandbox/alex/cdf.yml b/examples/sandbox/alex/cdf.yml deleted file mode 100644 index a211548..0000000 --- a/examples/sandbox/alex/cdf.yml +++ /dev/null @@ -1,41 +0,0 @@ -default: - name: alex - destination: - replace_strategy: insert-from-staging - pipelines: - # The pipeline name is based on the dict key by default, metadata follows in the body - us_cities: - description: Get US city data - dataset_name: us_cities_v0_{version} - version: 1 - metrics: - "*": - - entrypoint: cdf.builtin.metrics:count - - entrypoint: cdf.builtin.metrics:max_value - options: - key: zip_code - options: - progress: ~ - full_refresh: false - loader_file_format: insert_values - load: - delete_completed_jobs: true - runtime: - dlthub_telemetry: false - # Heuristics can populate enough information such that the below is the minimum definition - dota2: {} - hackernews: {} - sinks: - local: {} - fs_sink.py: {} - publishers: - httpbin: - depends_on: - - mart.zips - scripts: - hello: {} - nested/hello: {} - notebooks: - hello_world: - storage_path: reports/tests1/{name}/{timestamp}{ext} - gc_duration: 0 diff --git a/examples/sandbox/alex/config.py b/examples/sandbox/alex/config.py deleted file mode 100644 index feac8f1..0000000 --- a/examples/sandbox/alex/config.py +++ /dev/null @@ -1,38 +0,0 @@ -import cdf -import sqlmesh -from cdf.integrations.sqlmesh import CDFNotificationTarget - -workspace = cdf.get_workspace(__file__).unwrap() - -config = sqlmesh.Config.model_validate( - dict( - gateways=dict(workspace.get_transform_gateways()), - project=workspace.name, - default_gateway="local", - model_defaults={ - "dialect": "duckdb", - "start": "2020-01-01", - }, - plan={ - "auto_categorize_changes": { - "sql": "full", - "seed": "semi", - "external": "semi", - } - }, - # username=getpass.getuser(), - physical_schema_override={}, - format={ - "normalize": True, - "pad": 4, - "indent": 4, - "normalize_functions": "lower", - "leading_comma": False, - "max_text_width": 120, - "append_newline": True, - }, - ui={"format_on_save": True}, - ) -) - -config.notification_targets = [CDFNotificationTarget(workspace=workspace)] diff --git a/examples/sandbox/alex/models/zips.sql b/examples/sandbox/alex/models/zips.sql deleted file mode 100644 index ea4727f..0000000 --- a/examples/sandbox/alex/models/zips.sql +++ /dev/null @@ -1,8 +0,0 @@ -/* This is a simple model that selects distinct zip codes from the cities table */ -MODEL ( - name mart.zips -); - -SELECT DISTINCT - zip_code -FROM us_cities_v0_1.cities diff --git a/examples/sandbox/alex/notebooks/hello_world_notebook.ipynb b/examples/sandbox/alex/notebooks/hello_world_notebook.ipynb deleted file mode 100644 index 6638e40..0000000 --- a/examples/sandbox/alex/notebooks/hello_world_notebook.ipynb +++ /dev/null @@ -1,78 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "4a6c5cd3-4334-47e7-b512-34dc238626d1", - "metadata": {}, - "outputs": [], - "source": [ - "import sys\n", - "sys.executable" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c3810128-bf25-4e8c-99a6-2034101cd5c9", - "metadata": {}, - "outputs": [], - "source": [ - "sys.path" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8012b40d-1bf1-49b7-88b1-b3a860bebfb2", - "metadata": {}, - "outputs": [], - "source": [ - "import cdf\n", - "import os" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5dc40c68-eaba-49df-8fe0-6370e7d5130e", - "metadata": {}, - "outputs": [], - "source": [ - "os.getcwd()" - ] - }, - 
{ - "cell_type": "code", - "execution_count": null, - "id": "c124f374-cb76-49aa-84aa-0e142e86d7df", - "metadata": {}, - "outputs": [], - "source": [ - "p = cdf.find_nearest().unwrap()\n", - "p.name" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.6" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/examples/sandbox/alex/pipelines/dota2_pipeline.py b/examples/sandbox/alex/pipelines/dota2_pipeline.py deleted file mode 100644 index 5922955..0000000 --- a/examples/sandbox/alex/pipelines/dota2_pipeline.py +++ /dev/null @@ -1,132 +0,0 @@ -"""Dota2 is a Massive Online Battle Arena game based on Warcraft.""" - -import dlt -import dlt.sources.helpers.requests as requests - -import cdf - - -@dlt.resource(write_disposition="merge", primary_key="account_id") -def pro_players(): - """Get list of pro players""" - yield requests.get("https://api.opendota.com/api/proPlayers").json() - - -@dlt.resource(write_disposition="merge", primary_key="match_id") -def pro_matches(): - """Get list of pro matches""" - yield requests.get("https://api.opendota.com/api/proMatches").json() - - -@dlt.resource(write_disposition="replace") -def distribution(): - """Distributions of MMR data by bracket and country""" - yield requests.get("https://api.opendota.com/api/distributions").json() - - -@dlt.resource(write_disposition="replace") -def rankings(): - """Top players by hero""" - yield requests.get("https://api.opendota.com/api/rankings").json() - - -@dlt.resource(write_disposition="replace") -def benchmarks(): - """Benchmarks of average stat values for a hero""" - yield requests.get("https://api.opendota.com/api/benchmarks").json() - - -@dlt.resource(write_disposition="replace") -def heroes(): - """Get hero data""" - yield requests.get("https://api.opendota.com/api/heroes").json() - - -@dlt.resource(write_disposition="replace") -def hero_stats(): - """Get stats about hero performance in recent matches""" - yield requests.get("https://api.opendota.com/api/heroStats").json() - - -@dlt.resource(write_disposition="replace") -def leagues(): - """Get league data""" - yield requests.get("https://api.opendota.com/api/leagues").json() - - -@dlt.resource(write_disposition="replace") -def teams(): - """Get team data""" - yield requests.get("https://api.opendota.com/api/teams").json() - - -@dlt.resource(write_disposition="replace") -def constants(): - """Download all constants from odota/dotaconstants""" - - for table in ( - "game_mode", - "item_colors", - "lobby_type", - "order_types", - "patch", - "permanent_buffs", - "player_colors", - "skillshots", - "xp_level", - ): - raw_data = requests.get( - f"https://raw.githubusercontent.com/odota/dotaconstants/master/json/{table}.json" - ).json() - - if table in ("game_mode", "lobby_type"): - data = list(raw_data.values()) - elif table in ( - "item_colors", - "order_types", - "permanent_buffs", - "player_colors", - "skillshots", - ): - data = [{"id": k, "value": v} for k, v in raw_data.items()] - elif table == "xp_level": - data = [{"level": i, "xp": v} for i, v in enumerate(raw_data)] - else: - data = raw_data - - yield dlt.mark.with_table_name(data, table) - - -@dlt.source -def dota2_stats(): - """This source contains Dota 2 data from OpenDota 
API and repository""" - return ( - pro_players(), - pro_matches(), - distribution(), - rankings(), - benchmarks(), - heroes(), - hero_stats(), - leagues(), - teams(), - constants(), - ) - - -if cdf.is_main(__name__): - # Define a pipeline - pipe = cdf.pipeline() - - # Instantiate the source - source = dota2_stats() - - # Run the pipeline - pipe.run( - source.with_resources( - "pro_players", - "pro_matches", - "teams", - "heroes", - ) - ) diff --git a/examples/sandbox/alex/pipelines/hackernews_pipeline.py b/examples/sandbox/alex/pipelines/hackernews_pipeline.py deleted file mode 100644 index 6e14685..0000000 --- a/examples/sandbox/alex/pipelines/hackernews_pipeline.py +++ /dev/null @@ -1,141 +0,0 @@ -import time -import typing as t -from datetime import datetime - -import dlt -from dlt.sources.helpers import requests - -import cdf - -URL = "https://hn.algolia.com/api/v1/search_by_date" - - -@dlt.source(name="hackernews") -def hn_search( - keywords: t.List[str] = dlt.config.value, - start_date: datetime = dlt.config.value, - end_date: datetime = datetime.today(), - text: str = "any", - daily_load: bool = False, -): - """Source method for the Algolia Hacker News Search API: https://hn.algolia.com/api - - Args: - keywords: list of keywords for which the data needs to be loaded - start_date: start date in datetime or "yyyy-mm-dd" format - end_date: end date in datetime or "yyyy-mm-dd" format - text: possible values: "story","comment". For any other value, everything is loaded. - daily_load: loads data in daily intervals when set to True (default: weekly) - """ - - # Read start date as string or datetime and convert it to UNIX timestamp - if isinstance(start_date, str): - start_timestamp = int( - time.mktime(datetime.strptime(start_date, "%Y-%m-%d").timetuple()) - ) - else: - start_timestamp = int(time.mktime(start_date.timetuple())) # type: ignore - - # Read end date as string or datetime and convert it to UNIX timestamp - if isinstance(end_date, str): - end_timestamp = int( - time.mktime(datetime.strptime(end_date, "%Y-%m-%d").timetuple()) - ) - else: - end_timestamp = int(time.mktime(end_date.timetuple())) - - today = int(time.mktime(datetime.today().timetuple())) - - # Don't load the data for dates after the current date - end_timestamp = min(today, end_timestamp) - - # Ensure that the input start date is smaller than the input end date - if start_timestamp > end_timestamp: - raise ValueError(f"{start_date=} is larger than {end_date=}") - - # Specify text = "comment" or text="story" when calling the function - # to load only comments or stories - if text in ["comment", "story"]: - tags = text - # Pass any other value to load everything (default behaviour) - else: - tags = "(story,comment)" - - return keyword_hits(keywords, start_timestamp, end_timestamp, tags, daily_load) - - -@dlt.resource(name="keyword_hits", write_disposition="append") -def keyword_hits( - keywords, - start_timestamp, - end_timestamp, - tags, - daily_load=False, -): - """This methods makes a call to the Algolia Hacker News and returns all the hits corresponding the the input keywords - - Since the API response is limited to 1000 hits, - a separate call is made for each keyword for each week between the start and end dates - - If daily_load=True, then a single call is made for each keyword for the previous day - - Args: - keywords: list of keywords for which the data needs to be loaded - start_timestamp: UNIX timestamp for the start date - end_timestamp: UNIX timestamp for the end date - tags: parameter for the API call to 
specify "story", "comment" or "(story,comment)" - daily_load: loads data in daily intervals when set to True (default: weekly) - """ - - def _generate_hits(keyword, batch_start_date, batch_end_date, tags): - """This function makes the API call and returns all the hits for the input parameters""" - params = { - "query": f'"{keyword}"', - "tags": f"{tags}", - "numericFilters": f"""created_at_i>={batch_start_date},created_at_i<{batch_end_date}""", - "hitsPerPage": 1000, - } - response = requests.get(URL, params=params) - response.raise_for_status() - - return response.json()["hits"] - - time_delta = ( - 86400 if daily_load else 604800 - ) # The length of a day/week in UNIX timestamp - - # Iterate across all keywords - for keyword in keywords: - batch_start_date = start_timestamp - batch_end_date = batch_start_date + time_delta - - # Iterate across each week between the start and end dates - while batch_end_date < end_timestamp + time_delta: - batch_end_date = min( - batch_end_date, end_timestamp - ) # Prevent loading data ahead of the end date - # The response json - data = _generate_hits(keyword, batch_start_date, batch_end_date, tags) - - for hits in data: - yield { - key: value - for (key, value) in hits.items() - if not key.startswith( - "_" - ) # Filtering down to relevant fields from the response json - } - - batch_start_date = batch_end_date - batch_end_date += time_delta - - -if cdf.is_main(__name__): - # Create a source - source = hn_search(keywords=["rust"]) - - # Create the externally managed pipeline - pipeline = cdf.pipeline() - - # Run the pipeline - pipeline.run(source) diff --git a/examples/sandbox/alex/pipelines/test1/__init__.py b/examples/sandbox/alex/pipelines/test1/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/examples/sandbox/alex/pipelines/test1/chore.py b/examples/sandbox/alex/pipelines/test1/chore.py deleted file mode 100644 index b0e9788..0000000 --- a/examples/sandbox/alex/pipelines/test1/chore.py +++ /dev/null @@ -1,2 +0,0 @@ -def foo(): - _ = 1 diff --git a/examples/sandbox/alex/pipelines/us_cities_pipeline.py b/examples/sandbox/alex/pipelines/us_cities_pipeline.py deleted file mode 100644 index c4c85d4..0000000 --- a/examples/sandbox/alex/pipelines/us_cities_pipeline.py +++ /dev/null @@ -1,35 +0,0 @@ -""" -PIPELINE ( - name us_cities, - description 'Load US cities', - cron '0 0 * * *', -); -""" - -import dlt -import requests - -import cdf - -# A relative import from the workspace -from .test1.chore import foo - - -@dlt.resource(write_disposition="append", standalone=True) -def us_cities(): - """Load US cities""" - foo() # Call a function from a relative import - yield requests.get( - "https://raw.githubusercontent.com/millbj92/US-Zip-Codes-JSON/master/USCities.json" - ).json() - - -if cdf.is_main(__name__): - # Define a pipeline - pipeline = cdf.pipeline() - - # Run the pipeline - load_info = pipeline.run(us_cities(), table_name="cities", destination="duckdb") - - # Print the load information - print(load_info) diff --git a/examples/sandbox/alex/publishers/httpbin_publisher.py b/examples/sandbox/alex/publishers/httpbin_publisher.py deleted file mode 100644 index 8af6e12..0000000 --- a/examples/sandbox/alex/publishers/httpbin_publisher.py +++ /dev/null @@ -1,19 +0,0 @@ -"""A publisher that pushes data to httpbin.org""" - -import requests - -import cdf - -w = cdf.get_workspace(__file__).unwrap() -context = w.get_transform_context("local") - -df = context.fetchdf("SELECT * FROM mart.zips") - -zip_ = df.iloc[0, 0] - -r = requests.post( - 
"https://httpbin.org/post", - data={"zip": zip_}, -) -r.raise_for_status() -print(r.json()) diff --git a/examples/sandbox/alex/schema.yaml b/examples/sandbox/alex/schema.yaml deleted file mode 100644 index 6368fc5..0000000 --- a/examples/sandbox/alex/schema.yaml +++ /dev/null @@ -1,12 +0,0 @@ -- name: '"cdf"."us_cities_v0_1"."cities"' - columns: - zip_code: BIGINT - latitude: DOUBLE - longitude: DOUBLE - city: TEXT - state: TEXT - county: TEXT - _dlt_load_id: TEXT - _dlt_id: TEXT - latitude__v_text: TEXT - longitude__v_text: TEXT diff --git a/examples/sandbox/alex/scripts/hello_script.py b/examples/sandbox/alex/scripts/hello_script.py deleted file mode 100644 index f7cf60e..0000000 --- a/examples/sandbox/alex/scripts/hello_script.py +++ /dev/null @@ -1 +0,0 @@ -print("Hello, world!") diff --git a/examples/sandbox/alex/scripts/nested/hello_script.py b/examples/sandbox/alex/scripts/nested/hello_script.py deleted file mode 100644 index a36964a..0000000 --- a/examples/sandbox/alex/scripts/nested/hello_script.py +++ /dev/null @@ -1,5 +0,0 @@ -import cdf - -w = cdf.get_workspace(__file__).unwrap() - -print(f"Hello, world from {w.name}!") diff --git a/examples/sandbox/alex/sinks/fs_sink.py b/examples/sandbox/alex/sinks/fs_sink.py deleted file mode 100644 index 7dd143a..0000000 --- a/examples/sandbox/alex/sinks/fs_sink.py +++ /dev/null @@ -1,13 +0,0 @@ -import dlt -from sqlmesh.core.config import GatewayConfig, parse_connection_config - -ingest = dlt.destinations.filesystem( - "file://_storage", layout="{table_name}/{load_id}.{file_id}.{ext}.gz" -) - -transform = GatewayConfig( - connection=parse_connection_config( - {"type": "duckdb", "database": "cdf.duckdb", "extensions": ["httpfs"]} - ), - state_schema="_cdf_state", -) diff --git a/examples/sandbox/alex/sinks/local_sink.py b/examples/sandbox/alex/sinks/local_sink.py deleted file mode 100644 index 65c38d2..0000000 --- a/examples/sandbox/alex/sinks/local_sink.py +++ /dev/null @@ -1,32 +0,0 @@ -import dlt -import cdf -import duckdb - - -p = ( - cdf.find_nearest(__file__) - .bind(lambda p: p.get_workspace("alex")) - .map(lambda w: w.path / "cdf.duckdb") - .unwrap() -) - -LOCALDB = str(p) - -conn = duckdb.connect(LOCALDB) -conn.install_extension("httpfs") -conn.load_extension("httpfs") -conn.close() - - -ingest = dlt.destinations.duckdb(LOCALDB) - -stage = dlt.destinations.filesystem( - "file://_storage", - layout="{table_name}/{load_id}.{file_id}.{ext}.gz", -) - -transform = dict( - connection=cdf.transform_connection( - "duckdb", database=LOCALDB, extensions=["httpfs"] - ) -) diff --git a/examples/sandbox/cdf.yml b/examples/sandbox/cdf.yml deleted file mode 100644 index dd6807c..0000000 --- a/examples/sandbox/cdf.yml +++ /dev/null @@ -1,23 +0,0 @@ -default: - name: cdf-example - version: 0.1.0 - workspaces: - - alex - filesystem: - uri: file://_storage - feature_flags: - provider: filesystem - filename: flags.json - state: - connection: - type: duckdb - database: cdf.duckdb - something: ok -prod: - filesystem: - provider: gcs - root: harness_analytics_staging/cdf_test_1 -dev: - feature_flags: - provider: filesystem - filename: "@jinja dev_flags_{{ 1 + 1}}.json" diff --git a/src/cdf/cli.py b/src/cdf/cli.py deleted file mode 100644 index 26cf1ca..0000000 --- a/src/cdf/cli.py +++ /dev/null @@ -1,1211 +0,0 @@ -"""CLI for cdf.""" - -import asyncio -import itertools -import json -import os -import subprocess -import sys -import tempfile -import typing as t -from enum import Enum -from pathlib import Path - -import dlt -import pydantic -import rich 
-import typer -from dlt.common.utils import update_dict_nested -from dlt.common.versioned_state import ( - generate_state_version_hash, - json_decode_state, - json_encode_state, -) - -import cdf.legacy.constants as c -import cdf.legacy.context as context -import cdf.legacy.logger as logger -from cdf.legacy.project import ( - FeatureFlagConfig, - FilesystemConfig, - Workspace, - load_project, -) -from cdf.legacy.runtime import ( - execute_notebook_specification, - execute_pipeline_specification, - execute_publisher_specification, - execute_script_specification, -) -from cdf.legacy.specification import ( - CoreSpecification, - NotebookSpecification, - PipelineSpecification, - PublisherSpecification, - ScriptSpecification, - SinkSpecification, -) -from cdf.proxy import run_mysql_proxy, run_plan_server -from cdf.types import M - -WorkspaceMonad = M.Result[Workspace, Exception] - -app = typer.Typer( - rich_markup_mode="rich", - epilog="Made with [red]♥[/red] by [bold]z3z1ma[/bold].", - add_completion=False, - no_args_is_help=True, -) - -console = rich.console.Console() - - -@app.callback() -def main( - ctx: typer.Context, - workspace: t.Annotated[ - t.Optional[str], - typer.Option( - ..., - "--workspace", - "-w", - help="The workspace to use.", - envvar="CDF_WORKSPACE", - ), - ] = None, - path: t.Annotated[ - Path, - typer.Option( - ..., "--path", "-p", help="Path to the project.", envvar="CDF_ROOT" - ), - ] = Path("."), - debug: t.Annotated[ - bool, typer.Option(..., "--debug", "-d", help="Enable debug mode.") - ] = False, - environment: t.Annotated[ - t.Optional[str], typer.Option(..., "--env", "-e", help="Environment to use.") - ] = None, - log_level: t.Annotated[ - t.Optional[str], - typer.Option( - ..., - "--log-level", - "-l", - help="The log level to use.", - envvar="LOG_LEVEL", # A common environment variable for log level - ), - ] = None, -) -> None: - """CDF (continuous data framework) is a framework for end to end data processing.""" - if environment: - os.environ[c.CDF_ENVIRONMENT] = environment - if log_level: - os.environ[c.CDF_LOG_LEVEL] = log_level.upper() - if debug: - context.debug_mode.set(True) - logger.configure(log_level.upper() if log_level else "INFO") - logger.apply_patches() - logger.warning( - "The CDF CLI command is DEPRECATED and will be removed in a future release. 
A local python file which imports cdf and exposes the Workspace.cli method is the way to interact with CDF" - ) - ctx.obj = load_project(path).bind(lambda p: p.get_workspace(workspace)) - - -@app.command(rich_help_panel="Project Management") -def init(ctx: typer.Context) -> None: - """:art: Initialize a new project.""" - typer.echo(ctx.obj) - - -@app.command(rich_help_panel="Project Management") -def index(ctx: typer.Context, hydrate: bool = False) -> None: - """:page_with_curl: Print an index of [b][blue]Pipelines[/blue], [red]Models[/red], [yellow]Publishers[/yellow][/b], and other components.""" - W = t.cast(WorkspaceMonad, ctx.obj).unwrap() - if not hydrate: - console.print("Pipelines", W.pipelines) - console.print("Sinks", W.sinks) - console.print("Publishers", W.publishers) - console.print("Scripts", W.scripts) - console.print("Notebooks", W.notebooks) - else: - console.print_json(W.model_dump_json()) - - -@app.command(rich_help_panel="Project Management") -def path(ctx: typer.Context) -> None: - """:office: Print the current workspace path.""" - typer.echo(ctx.obj.unwrap().path) - - -def _describe( - *displayables: t.Tuple[str, t.Tuple[CoreSpecification, ...]], diag: str = "" -): - for color, components in displayables: - for component in components: - doc = "\n".join( - map(lambda ln: "> " + ln, component.description.splitlines()) - ) - console.print( - f"[{color}]{type(component).__name__}[/{color}]: [b]{component.name}[/b]" - ) - console.print(f"[dim]{doc}[/dim]\n") - if diag: - console.print(f"[yellow]{diag}[/yellow]\n") - - -@app.command(rich_help_panel="Core") -def pipeline( - ctx: typer.Context, - pipeline_to_sink: t.Annotated[ - t.Optional[str], - typer.Argument(help="The pipeline and sink separated by a colon."), - ] = None, - select: t.List[str] = typer.Option( - ..., - "-s", - "--select", - default_factory=lambda: [], - help="Glob pattern for resources to run. Can be specified multiple times.", - ), - exclude: t.List[str] = typer.Option( - ..., - "-x", - "--exclude", - default_factory=lambda: [], - help="Glob pattern for resources to exclude. Can be specified multiple times.", - ), - force_replace: t.Annotated[ - bool, - typer.Option( - ..., - "-F", - "--force-replace", - help="Force the write disposition to replace ignoring state. Useful to force a reload of incremental resources.", - ), - ] = False, - no_stage: t.Annotated[ - bool, - typer.Option( - ..., - "--no-stage", - help="Do not stage the data in the staging destination of the sink even if defined.", - ), - ] = False, -) -> t.Any: - """:inbox_tray: Ingest data from a [b blue]Pipeline[/b blue] into a data store where it can be [b red]Transformed[/b red]. - - \f - Args: - ctx: The CLI context. - pipeline_to_sink: The pipeline and sink separated by a colon. - select: The resources to ingest as a sequence of glob patterns. - exclude: The resources to exclude as a sequence of glob patterns. - force_replace: Whether to force replace the write disposition. - no_stage: Allows selective disabling of intermediate staging even if configured in sink. 
- """ - W = t.cast(WorkspaceMonad, ctx.obj).unwrap() - if pipeline_to_sink is None: - return _describe( - ("blue", W.pipelines), - ("violet", W.sinks), - diag="To ingest data, use the `pipeline` command with the pipeline:sink combination.", - ) - source, destination = pipeline_to_sink.split(":", 1) - return ( - W.get_pipeline_spec(source) - .bind( - lambda pipe: execute_pipeline_specification( - pipe, - W.get_sink_spec(destination).unwrap_or((destination, None)), - select=select, - exclude=exclude, - force_replace=force_replace, - enable_stage=(not no_stage), - ) - ) - .unwrap() - ) - - -@app.command(rich_help_panel="Develop") -def discover( - ctx: typer.Context, - pipeline: t.Annotated[ - t.Optional[str], - typer.Argument(help="The pipeline in which to discover resources."), - ] = None, - no_quiet: t.Annotated[ - bool, - typer.Option( - help="Pipeline stdout is suppressed by default, this disables that." - ), - ] = False, -) -> None: - """:mag: Dry run a [b blue]Pipeline[/b blue] and enumerates the discovered resources. - - \f - Args: - ctx: The CLI context. - pipeline: The pipeline in which to discover resources. - no_quiet: Whether to suppress the pipeline stdout. - """ - W = t.cast(WorkspaceMonad, ctx.obj).unwrap() - if pipeline is None: - return _describe( - ("blue", W.pipelines), - diag="To discover resources, use the `discover` command with the pipeline name.", - ) - for i, source in enumerate( - M.ok(W) - .bind(lambda w: w.get_pipeline_spec(pipeline)) - .bind( - lambda spec: execute_pipeline_specification( - spec, "dummy", dry_run=True, quiet=not no_quiet - ) - ) - .map(lambda rv: rv.pipeline.tracked_sources) - .unwrap() - ): - console.print(f"{i}: {source.name}") - for j, resource in enumerate(source.resources.values(), 1): - console.print(f"{i}.{j}: {resource.name} (enabled: {resource.selected})") - - -@app.command(rich_help_panel="Develop") -def head( - ctx: typer.Context, - pipeline: t.Annotated[ - t.Optional[str], typer.Argument(help="The pipeline to inspect.") - ] = None, - resource: t.Annotated[ - t.Optional[str], typer.Argument(help="The resource to inspect.") - ] = None, - n: t.Annotated[int, typer.Option("-n", "--rows")] = 5, -) -> None: - """:wrench: Prints the first N rows of a [b green]Resource[/b green] within a [b blue]pipeline[/b blue]. Defaults to [cyan]5[/cyan]. - - This is useful for quickly inspecting data :detective: and verifying that it is coming over the wire correctly. - - \f - Args: - ctx: The CLI context. - pipeline: The pipeline to inspect. - resource: The resource to inspect. - n: The number of rows to print. - - Raises: - typer.BadParameter: If the resource is not found in the pipeline. 
- """ - W = t.cast(WorkspaceMonad, ctx.obj).unwrap() - if pipeline is None: - return _describe( - ("blue", W.pipelines), - diag="To inspect a data pipeline, use the `head` command with the pipeline name.", - ) - resource_iter = filter( - lambda r: r.name == resource or resource is None, - ( - resource - for source in M.ok(W) - .bind(lambda w: w.get_pipeline_spec(pipeline)) - .bind( - lambda spec: execute_pipeline_specification( - spec, "dummy", dry_run=True, quiet=True - ) - ) - .map(lambda rv: rv.pipeline.tracked_sources) - .unwrap() - for resource in source.resources.values() - ), - ) - if resource is None: - console.print("[b green]Resources[/b green]:") - for r in resource_iter: - console.print(f"- {r.name}") - console.print( - f"\n[yellow]To inspect a resource, use `cdf head {pipeline} [cyan][/cyan]`[/yellow].\n" - ) - return - target = next(resource_iter, None) - if target is None: - raise typer.BadParameter( - f"Resource {resource} not found in pipeline {pipeline}.", - param_hint="resource", - ) - list( - map( - lambda row: console.print(row[1]), - itertools.takewhile(lambda row: row[0] < n, enumerate(target)), - ) - ) - - -@app.command(rich_help_panel="Core") -def publish( - ctx: typer.Context, - sink_to_publisher: t.Annotated[ - t.Optional[str], - typer.Argument(help="The sink and publisher separated by a colon."), - ] = None, - skip_verification: t.Annotated[ - bool, - typer.Option( - help="Skip the verification of the publisher dependencies.", - ), - ] = False, -) -> t.Any: - """:outbox_tray: [b yellow]Publish[/b yellow] data from a data store to an [violet]External[/violet] system. - - \f - Args: - ctx: The CLI context. - sink_to_publisher: The sink and publisher separated by a colon. - skip_verification: Whether to skip the verification of the publisher dependencies. - """ - W = t.cast(WorkspaceMonad, ctx.obj).unwrap() - if sink_to_publisher is None: - return _describe( - ("violet", W.sinks), - ("yellow", W.publishers), - diag="To publish data, use the `publish` command with the sink:publisher combination.", - ) - source, publisher = sink_to_publisher.split(":", 1) - return ( - W.get_publisher_spec(publisher) - .bind( - lambda p: execute_publisher_specification( - p, W.get_transform_context(source), skip_verification - ) - ) - .unwrap() - ) - - -@app.command(rich_help_panel="Core") -def script( - ctx: typer.Context, - script: t.Annotated[ - t.Optional[str], typer.Argument(help="The script to execute.") - ] = None, - quiet: t.Annotated[bool, typer.Option(help="Suppress the script stdout.")] = False, -) -> t.Any: - """:hammer: Execute a [b yellow]Script[/b yellow] within the context of the current workspace. - - \f - Args: - ctx: The CLI context. - script: The script to execute. - quiet: Whether to suppress the script stdout. 
- """ - W = t.cast(WorkspaceMonad, ctx.obj).unwrap() - if script is None: - return _describe( - ("yellow", W.scripts), - diag="To execute a script, use the `script` command with the script name.", - ) - return ( - M.ok(W) - .bind(lambda w: w.get_script_spec(script)) - .bind(lambda s: execute_script_specification(s, capture_stdout=quiet)) - .unwrap() - ) - - -@app.command(rich_help_panel="Core") -def notebook( - ctx: typer.Context, - notebook: t.Annotated[ - t.Optional[str], typer.Argument(help="The notebook to execute.") - ] = None, - params: t.Annotated[ - str, - typer.Option( - ..., - help="The parameters to pass to the notebook as a json formatted string.", - ), - ] = "{}", -) -> t.Any: - """:notebook: Execute a [b yellow]Notebook[/b yellow] within the context of the current workspace. - - \f - Args: - ctx: The CLI context. - notebook: The notebook to execute. - params: The parameters to pass to the notebook as a json formatted string. - """ - W = t.cast(WorkspaceMonad, ctx.obj).unwrap() - if notebook is None: - return _describe( - ("yellow", W.notebooks), - diag="To execute a notebook, use the `notebook` command with the notebook name.", - ) - return ( - M.ok(W) - .bind(lambda w: w.get_notebook_spec(notebook)) - .bind(lambda s: execute_notebook_specification(s, **json.loads(params))) - .unwrap() - ) - - -@app.command( - rich_help_panel="Utilities", - context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, -) -def jupyter_lab(ctx: typer.Context) -> None: - """:star2: Start a Jupyter Lab server in the context of a workspace.""" - t.cast(WorkspaceMonad, ctx.obj).map( - lambda w: subprocess.run( - ["jupyter", "lab", *ctx.args], - cwd=w.path, - check=False, - env={ - **os.environ, - "PYTHONPATH": ":".join( - (str(w.path.resolve()), *sys.path, str(w.path.parent.resolve())) - ), - }, - ) - ) - - -class _SpecType(str, Enum): - """An enum of specs which can be described via the `spec` command.""" - - pipeline = "pipeline" - publisher = "publisher" - script = "script" - notebook = "notebook" - sink = "sink" - feature_flags = "feature_flags" - filesystem = "filesystem" - - -@app.command(rich_help_panel="Develop") -def spec(name: _SpecType, json_schema: bool = False) -> None: - """:blue_book: Print the fields for a given spec type. - - \f - Args: - name: The name of the spec to print. - json_schema: Whether to print the JSON schema for the spec. - """ - - def _print_spec(spec: t.Type[pydantic.BaseModel]) -> None: - console.print(f"[bold]{spec.__name__}:[/bold]") - for name, info in spec.model_fields.items(): - typ = getattr(info.annotation, "__name__", info.annotation) - desc = info.description or "No description provided." 
- d = f"- [blue]{name}[/blue] ({typ!s}): {desc}" - if "Undefined" not in str(info.default): - d += f" Defaults to `{info.default}`)" - console.print(d) - console.print() - - def _print(s: t.Type[pydantic.BaseModel]) -> None: - console.print(s.model_json_schema()) if json_schema else _print_spec(s) - - if name == _SpecType.pipeline: - _print(PipelineSpecification) - elif name == _SpecType.publisher: - _print(PublisherSpecification) - elif name == _SpecType.script: - _print(ScriptSpecification) - elif name == _SpecType.notebook: - _print(NotebookSpecification) - elif name == _SpecType.sink: - _print(SinkSpecification) - elif name == _SpecType.feature_flags: - for spec in t.get_args(FeatureFlagConfig): - _print(spec) - elif name == _SpecType.filesystem: - _print(FilesystemConfig) - else: - raise ValueError(f"Invalid spec type {name}.") - - -class _ExportFormat(str, Enum): - """An enum of export formats which can be used with the `export` command.""" - - json = "json" - yaml = "yaml" - yml = "yml" - py = "py" - python = "python" - dict = "dict" - - -app.add_typer( - schema := typer.Typer( - rich_markup_mode="rich", - epilog="Made with [red]♥[/red] by [bold]z3z1ma[/bold].", - add_completion=False, - no_args_is_help=True, - ), - name="schema", - help=":construction: Schema management commands.", - rich_help_panel="Develop", -) - - -@schema.command("dump") -def schema_dump( - ctx: typer.Context, - pipeline_to_sink: t.Annotated[ - str, - typer.Argument( - help="The pipeline:sink combination from which to fetch the schema." - ), - ], - format: t.Annotated[ - _ExportFormat, typer.Option(help="The format to dump the schema in.") - ] = _ExportFormat.json, -) -> None: - """:computer: Dump the schema of a [b blue]pipeline[/b blue]:[violet]sink[/violet] combination. - - \f - Args: - ctx: The CLI context. - pipeline_to_sink: The pipeline:sink combination from which to fetch the schema. - format: The format to dump the schema in. - - Raises: - typer.BadParameter: If the pipeline or sink are not found. - """ - W = t.cast(WorkspaceMonad, ctx.obj).unwrap() - source, destination = pipeline_to_sink.split(":", 1) - spec = W.get_pipeline_spec(source).unwrap() - rv = execute_pipeline_specification( - spec, - W.get_sink_spec(destination).unwrap_or((destination, None)), - dry_run=True, - quiet=True, - ).unwrap() - if format == _ExportFormat.json: - console.print(rv.pipeline.default_schema.to_pretty_json()) - elif format in (_ExportFormat.py, _ExportFormat.python, _ExportFormat.dict): - console.print(rv.pipeline.default_schema.to_dict()) - elif format in (_ExportFormat.yaml, _ExportFormat.yml): - console.print(rv.pipeline.default_schema.to_pretty_yaml()) - else: - raise ValueError( - f"Invalid format {format}. Must be one of {list(_ExportFormat)}" - ) - - -@schema.command("edit") -def schema_edit( - ctx: typer.Context, - pipeline_to_sink: t.Annotated[ - str, - typer.Argument( - help="The pipeline:sink combination from which to fetch the schema." - ), - ], -) -> None: - """:pencil: Edit the schema of a [b blue]pipeline[/b blue]:[violet]sink[/violet] combination using the system editor. - - \f - Args: - ctx: The CLI context. - pipeline_to_sink: The pipeline:sink combination from which to fetch the schema. - - Raises: - typer.BadParameter: If the pipeline or sink are not found. 
- """ - W = t.cast(WorkspaceMonad, ctx.obj).unwrap() - source, destination = pipeline_to_sink.split(":", 1) - sink, _ = ( - W.get_sink_spec(destination) - .map(lambda s: s.get_ingest_config()) - .unwrap_or((destination, None)) - ) - spec = W.get_pipeline_spec(source).unwrap() - logger.info(f"Clearing local schema and state for {source}.") - pipe = spec.create_pipeline(dlt.Pipeline, destination=sink, staging=None) - pipe.drop() - logger.info(f"Syncing schema for {source}:{destination}.") - rv = execute_pipeline_specification(spec, sink, dry_run=True, quiet=True).unwrap() - schema = rv.pipeline.default_schema.clone() - with tempfile.TemporaryDirectory() as tmpdir: - fname = f"{schema.name}.schema.yaml" - with open(os.path.join(tmpdir, fname), "w") as f: - f.write(schema.to_pretty_yaml()) - logger.info(f"Editing schema {schema.name}.") - subprocess.run([os.environ.get("EDITOR", "vi"), f.name], check=True) - pipe_mut = spec.create_pipeline( - dlt.Pipeline, import_schema_path=tmpdir, destination=sink, staging=None - ) - schema_mut = pipe_mut.default_schema - if schema_mut.version > schema.version: - with pipe_mut.destination_client() as client: - logger.info( - f"Updating schema {schema.name} to version {schema_mut.version} in {destination}." - ) - client.update_stored_schema() - logger.info("Schema updated.") - else: - logger.info("Schema not updated.") - - -app.add_typer( - state := typer.Typer( - rich_markup_mode="rich", - epilog="Made with [red]♥[/red] by [bold]z3z1ma[/bold].", - add_completion=False, - no_args_is_help=True, - ), - name="state", - help=":construction: State management commands.", - rich_help_panel="Develop", -) - - -@state.command("dump") -def state_dump( - ctx: typer.Context, - pipeline_to_sink: t.Annotated[ - str, - typer.Argument( - help="The pipeline:sink combination from which to fetch the schema." - ), - ], -) -> None: - """:computer: Dump the state of a [b blue]pipeline[/b blue]:[violet]sink[/violet] combination. - - \f - Args: - ctx: The CLI context. - pipeline_to_sink: The pipeline:sink combination from which to fetch the state. - - Raises: - typer.BadParameter: If the pipeline or sink are not found. - """ - W = t.cast(WorkspaceMonad, ctx.obj).unwrap() - source, destination = pipeline_to_sink.split(":", 1) - W.get_pipeline_spec(source).bind( - lambda spec: execute_pipeline_specification( - spec, - W.get_sink_spec(destination).unwrap_or((destination, None)), - dry_run=True, - quiet=True, - ) - ).map(lambda rv: console.print(rv.pipeline.state)) - - -@state.command("edit") -def state_edit( - ctx: typer.Context, - pipeline_to_sink: t.Annotated[ - str, - typer.Argument( - help="The pipeline:sink combination from which to fetch the state." - ), - ], -) -> None: - """:pencil: Edit the state of a [b blue]pipeline[/b blue]:[violet]sink[/violet] combination using the system editor. - - \f - Args: - ctx: The CLI context. - pipeline_to_sink: The pipeline:sink combination from which to fetch the state. - - Raises: - typer.BadParameter: If the pipeline or sink are not found. 
- """ - W = t.cast(WorkspaceMonad, ctx.obj).unwrap() - source, destination = pipeline_to_sink.split(":", 1) - sink, _ = ( - W.get_sink_spec(destination) - .map(lambda s: s.get_ingest_config()) - .unwrap_or((destination, None)) - ) - spec = W.get_pipeline_spec(source).unwrap() - logger.info(f"Clearing local state and state for {source}.") - pipe = spec.create_pipeline(dlt.Pipeline, destination=sink, staging=None) - pipe.drop() - logger.info(f"Syncing state for {source}:{destination}.") - rv = execute_pipeline_specification(spec, sink, dry_run=True, quiet=True).unwrap() - with ( - tempfile.NamedTemporaryFile(suffix=".json") as tmp, - rv.pipeline.managed_state(extract_state=True) as state, - ): - pre_hash = generate_state_version_hash(state, exclude_attrs=["_local"]) - tmp.write(json.dumps(json.loads(json_encode_state(state)), indent=2).encode()) - tmp.flush() - logger.info(f"Editing state in {destination}.") - subprocess.run([os.environ.get("EDITOR", "vi"), tmp.name], check=True) - with open(tmp.name, "r") as f: - update_dict_nested(t.cast(dict, state), json_decode_state(f.read())) - post_hash = generate_state_version_hash(state, exclude_attrs=["_local"]) - if pre_hash != post_hash: - execute_pipeline_specification( - spec, sink, select=[], exclude=["*"], quiet=True - ).unwrap() - logger.info("State updated.") - else: - logger.info("State not updated.") - - -app.add_typer( - model := typer.Typer( - rich_markup_mode="rich", - epilog="Made with [red]♥[/red] by [bold]z3z1ma[/bold].", - add_completion=False, - no_args_is_help=True, - ), - name="model", - help=":construction: Model management commands.", - rich_help_panel="Core", -) - - -@model.command("evaluate") -def model_evaluate( - ctx: typer.Context, - model: t.Annotated[ - str, - typer.Argument(help="The model to evaluate. Can be prefixed with the gateway."), - ], - start: str = typer.Option( - "1 month ago", - help="The start time to evaluate the model from. Defaults to 1 month ago.", - ), - end: str = typer.Option( - "now", - help="The end time to evaluate the model to. Defaults to now.", - ), - limit: t.Optional[int] = typer.Option( - None, help="The number of rows to limit the evaluation to." - ), -) -> None: - """:bar_chart: Evaluate a [b red]Model[/b red] and print the results. A thin wrapper around `sqlmesh evaluate` - - \f - Args: - ctx: The CLI context. - model: The model to evaluate. Can be prefixed with the gateway. - limit: The number of rows to limit the evaluation to. - """ - if ":" in model: - gateway, model = model.split(":", 1) - else: - gateway = None - t.cast(WorkspaceMonad, ctx.obj).map( - lambda w: console.print( - w.get_transform_context(gateway).evaluate( - model, limit=limit, start=start, end=end, execution_time="now" - ) - ) - ) - - -@model.command("render") -def model_render( - ctx: typer.Context, - model: t.Annotated[ - str, - typer.Argument(help="The model to evaluate. Can be prefixed with the gateway."), - ], - start: str = typer.Option( - "1 month ago", - help="The start time to evaluate the model from. Defaults to 1 month ago.", - ), - end: str = typer.Option( - "now", - help="The end time to evaluate the model to. Defaults to now.", - ), - expand: t.List[str] = typer.Option([], help="The referenced models to expand."), - dialect: t.Optional[str] = typer.Option( - None, help="The SQL dialect to use for rendering." - ), -) -> None: - """:bar_chart: Render a [b red]Model[/b red] and print the query. A thin wrapper around `sqlmesh render` - - \f - Args: - ctx: The CLI context. - model: The model to evaluate. 
Can be prefixed with the gateway. - start: The start time to evaluate the model from. Defaults to 1 month ago. - end: The end time to evaluate the model to. Defaults to now. - expand: The referenced models to expand. - dialect: The SQL dialect to use for rendering. - """ - if ":" in model: - gateway, model = model.split(":", 1) - else: - gateway = None - t.cast(WorkspaceMonad, ctx.obj).map( - lambda w: w.get_transform_context(gateway), - ).map( - lambda sqlmesh_ctx: console.print( - sqlmesh_ctx.render( - model, start=start, end=end, execution_time="now", expand=expand - ).sql(dialect or sqlmesh_ctx.default_dialect, pretty=True) - ) - ) - - -@model.command("name") -def model_name( - ctx: typer.Context, - model: t.Annotated[ - str, - typer.Argument( - help="The model to convert the physical name. Can be prefixed with the gateway." - ), - ], -) -> None: - """:bar_chart: Get a [b red]Model[/b red]'s physical table name. A thin wrapper around `sqlmesh table_name` - - \f - Args: - ctx: The CLI context. - model: The model to evaluate. Can be prefixed with the gateway. - """ - if ":" in model: - gateway, model = model.split(":", 1) - else: - gateway = None - t.cast(WorkspaceMonad, ctx.obj).map( - lambda w: console.print( - w.get_transform_context(gateway).table_name(model, False) - ) - ) - - -@model.command("diff") -def model_diff( - ctx: typer.Context, - model: t.Annotated[ - str, - typer.Argument(help="The model to evaluate. Can be prefixed with the gateway."), - ], - source_target: t.Annotated[ - str, - typer.Argument(help="The source and target environments separated by a colon."), - ], - show_sample: bool = typer.Option( - False, help="Whether to show a sample of the diff." - ), -) -> None: - """:bar_chart: Compute the diff of a [b red]Model[/b red] across 2 environments. A thin wrapper around `sqlmesh table_diff` - - \f - Args: - ctx: The CLI context. - model: The model to evaluate. Can be prefixed with the gateway. - source_target: The source and target environments separated by a colon. - """ - if ":" in model: - gateway, model = model.split(":", 1) - else: - gateway = None - source, target = source_target.split(":", 1) - t.cast(WorkspaceMonad, ctx.obj).map( - lambda w: console.print( - w.get_transform_context(gateway).table_diff( - source, target, model_or_snapshot=model, show_sample=show_sample - ) - ) - ) - - -@model.command("prototype") -def model_prototype( - ctx: typer.Context, - dependencies: t.List[str] = typer.Option( - [], - "-d", - "--dependencies", - help="The dependencies to include in the prototype.", - ), - start: str = typer.Option( - "1 month ago", - help="The start time to evaluate the model from. Defaults to 1 month ago.", - ), - end: str = typer.Option( - "now", - help="The end time to evaluate the model to. Defaults to now.", - ), - limit: int = typer.Option( - 5_000_000, - help="The number of rows to limit the evaluation to.", - ), -): - """:bar_chart: Prototype a model and save the results to disk. - - \f - Args: - ctx: The CLI context. - dependencies: The dependencies to include in the prototype. - start: The start time to evaluate the model from. Defaults to 1 month ago. - end: The end time to evaluate the model to. Defaults to now. - limit: The number of rows to limit the evaluation to. 
- """ - W = t.cast(WorkspaceMonad, ctx.obj).unwrap() - sqlmesh_ctx = W.get_transform_context() - for dep in dependencies: - df = sqlmesh_ctx.evaluate( - dep, - start=start, - end=end, - execution_time="now", - limit=limit, - ) - df.to_parquet(f"{dep}.parquet", index=False) - - -app.add_typer( - proxy := typer.Typer( - rich_markup_mode="rich", - epilog="Made with [red]♥[/red] by [bold]z3z1ma[/bold].", - add_completion=False, - no_args_is_help=True, - ), - name="proxy", - help=":satellite: Proxy management commands.", - rich_help_panel="Core", -) - - -@proxy.command("mysql") -def proxy_mysql( - ctx: typer.Context, - gateway: t.Annotated[ - t.Optional[str], - typer.Argument( - help="The gateway to use for the server. Defaults to the default gateway." - ), - ] = None, -) -> None: - """:satellite: Start a SQLMesh proxy server. - - \f - Args: - ctx: The CLI context. - gateway: The gateway to use for the server. Defaults to the default gateway. - """ - t.cast(WorkspaceMonad, ctx.obj).map( - lambda w: asyncio.run(run_mysql_proxy(w.get_transform_context(gateway))) - ).unwrap() - - -@proxy.command("planner") -def proxy_planner( - ctx: typer.Context, - gateway: t.Annotated[ - t.Optional[str], - typer.Argument( - help="The gateway to use for the server. Defaults to the default gateway." - ), - ] = None, -) -> None: - """:satellite: Start a SQLMesh proxy server. - - \f - Args: - ctx: The CLI context. - gateway: The gateway to use for the server. Defaults to the default gateway. - """ - t.cast(WorkspaceMonad, ctx.obj).map( - lambda w: run_plan_server(8000, w.get_transform_context(gateway)) - ).unwrap() - - -@proxy.command("plan") -def proxy_plan( - ctx: typer.Context, - gateway: t.Annotated[ - t.Optional[str], - typer.Argument( - help="The gateway to use for the server. Defaults to the default gateway." - ), - ] = None, -): - """:satellite: Run a SQLMesh plan delegated to a running planner. - - \f - Args: - ctx: The CLI context. - gateway: The gateway to use for the server. Defaults to the default gateway. - """ - import pickle - - import requests - - W = t.cast(WorkspaceMonad, ctx.obj).unwrap() - plan = W.get_transform_context(gateway).plan("dev", no_prompts=True) - res = requests.post( - "http://localhost:8000", - headers={"Content-Type": "application/octet-stream"}, - data=pickle.dumps(plan), - ) - console.print(res.json()) - - -app.add_typer( - inspect := typer.Typer( - rich_markup_mode="rich", - epilog="Made with [red]♥[/red] by [bold]z3z1ma[/bold].", - add_completion=False, - no_args_is_help=True, - ), - name="inspect", - help=":mag: State store inspection commands.", - rich_help_panel="Utilities", -) - - -@inspect.command("events") -def inspect_events( - ctx: typer.Context, - limit: t.Annotated[ - int, - typer.Option(..., help="The number of audit logs to list. Defaults to 10."), - ] = 10, - failed_only: t.Annotated[ - bool, - typer.Option( - help="List only the audit logs with errors.", - ), - ] = False, -) -> None: - """:mag: List the audit logs for the current workspace.""" - W = t.cast(WorkspaceMonad, ctx.obj).unwrap() - import pandas as pd - - with pd.option_context("display.max_rows", limit): - console.print(W.state.fetch_audits(limit=limit, failed_only=failed_only)) - - -@inspect.command("extracted") -def inspect_extracted( - ctx: typer.Context, - load_ids: t.Annotated[ - str, - typer.Argument( - help="A comma-separated list of load ids to list. Use '*' to list all." - ), - ] = "*", - limit: t.Annotated[ - int, - typer.Option( - ..., help="The number of extracted resources to list. 
Defaults to 10." - ), - ] = 10, -) -> None: - """:mag: List the extracted resources for the current workspace.""" - W = t.cast(WorkspaceMonad, ctx.obj).unwrap() - import pandas as pd - - if load_ids == "*": - requested_ids = [] - else: - requested_ids = load_ids.split(",") - data = W.state.fetch_extracted(*requested_ids, limit=limit) - if data.empty: - console.print("\n[red]No data found.[/red]") - return - if requested_ids: - for load_id in requested_ids: - r = data.loc[data["load_id"] == load_id, "data"] - if r.empty: - console.print( - f"\n[red]No data found for requested load id {load_id}.[/red]" - ) - continue - console.print(f"\n[b]Data for load id {load_id}[/b]:") - console.print_json(r.iloc[0]) - else: - with pd.option_context("display.max_rows", limit): - console.print(data) - - -@inspect.command("normalized") -def inspect_normalized( - ctx: typer.Context, - load_ids: t.Annotated[ - str, - typer.Argument( - help="A comma-separated list of load ids to list. Use '*' to list all." - ), - ] = "*", - limit: t.Annotated[ - int, - typer.Option( - ..., help="The number of normalized resources to list. Defaults to 10." - ), - ] = 10, -) -> None: - """:mag: List the normalized resources for the current workspace.""" - W = t.cast(WorkspaceMonad, ctx.obj).unwrap() - import pandas as pd - - if load_ids == "*": - requested_ids = [] - else: - requested_ids = load_ids.split(",") - data = W.state.fetch_normalized(*requested_ids, limit=limit) - if data.empty: - console.print("\n[red]No data found.[/red]") - return - if requested_ids: - for load_id in requested_ids: - r = data.loc[data["load_id"] == load_id, "data"] - if r.empty: - console.print( - f"\n[red]No data found for requested load id {load_id}.[/red]" - ) - continue - console.print(f"\n[b]Data for load id {load_id}[/b]:") - console.print_json(r.iloc[0]) - else: - with pd.option_context("display.max_rows", limit): - console.print(data) - - -@inspect.command("loaded") -def inspect_loaded( - ctx: typer.Context, - load_ids: t.Annotated[ - str, - typer.Argument( - help="A comma-separated list of load ids to list. Use '*' to list all." - ), - ] = "*", - limit: t.Annotated[ - int, - typer.Option( - ..., help="The number of loaded resources to list. Defaults to 10." 
- ), - ] = 10, -) -> None: - """:mag: List the loaded resources for the current workspace.""" - W = t.cast(WorkspaceMonad, ctx.obj).unwrap() - import pandas as pd - - if load_ids == "*": - requested_ids = [] - else: - requested_ids = load_ids.split(",") - data = W.state.fetch_loaded(*requested_ids, limit=limit) - if data.empty: - console.print("\n[red]No data found.[/red]") - return - if requested_ids: - for load_id in requested_ids: - r = data.loc[data["load_id"] == load_id, "data"] - if r.empty: - console.print( - f"\n[red]No data found for requested load id {load_id}.[/red]" - ) - continue - console.print(f"\n[b]Data for load id {load_id}[/b]:") - console.print_json(r.iloc[0]) - else: - with pd.option_context("display.max_rows", limit): - console.print(data) - - -if __name__ == "__main__": - app() diff --git a/src/cdf/integrations/sqlmesh.py b/src/cdf/integrations/sqlmesh.py deleted file mode 100644 index b5553eb..0000000 --- a/src/cdf/integrations/sqlmesh.py +++ /dev/null @@ -1,147 +0,0 @@ -import logging -import time -import typing as t - -from sqlmesh.core.notification_target import ( - ConsoleNotificationTarget, - NotificationEvent, - NotificationStatus, -) -from sqlmesh.utils.errors import AuditError - -from cdf.legacy.project import Workspace - -logger = logging.getLogger(__name__) - - -class CDFNotificationTarget(ConsoleNotificationTarget): - """A notification target which sends notifications to the state of a CDF workspace.""" - - workspace: Workspace - notify_on: t.FrozenSet[NotificationEvent] = frozenset( - { - NotificationEvent.APPLY_START, - NotificationEvent.APPLY_END, - NotificationEvent.RUN_START, - NotificationEvent.RUN_END, - NotificationEvent.MIGRATION_START, - NotificationEvent.MIGRATION_END, - NotificationEvent.APPLY_FAILURE, - NotificationEvent.RUN_FAILURE, - NotificationEvent.AUDIT_FAILURE, - NotificationEvent.MIGRATION_FAILURE, - } - ) - - _run_start: float = 0.0 - """The time a run started""" - _apply_start: float = 0.0 - """The time an apply started""" - _migrate_start: float = 0.0 - """The time a migration started""" - - def send( - self, notification_status: NotificationStatus, msg: str, **kwargs: t.Any - ) -> None: - msg += "\n(event logged in state store)" - if notification_status.is_failure: - logger.error(msg) - elif notification_status.is_warning: - logger.warning(msg) - else: - logger.info(msg) - - def notify_run_start(self, environment: str) -> None: - """Notify the workspace of a run start""" - self._run_start = time.time() - self.workspace.state.audit( - "sqlmesh_run_start", - success=True, - environment=environment, - ) - - def notify_run_end(self, environment: str) -> None: - """Notify the workspace of a run end""" - self.workspace.state.audit( - "sqlmesh_run_end", - success=True, - environment=environment, - elapsed=time.time() - self._run_start, - ) - - def notify_run_failure(self, exc: str) -> None: - """Notify the workspace of a run failure""" - self.workspace.state.audit( - "sqlmesh_run_failure", - success=False, - error=exc, - elapsed=time.time() - self._run_start, - ) - - def notify_apply_start(self, environment: str, plan_id: str) -> None: - """Notify the workspace of an apply start""" - self._apply_start = time.time() - self.workspace.state.audit( - "sqlmesh_apply_start", - success=True, - environment=environment, - plan_id=plan_id, - ) - - def notify_apply_end(self, environment: str, plan_id: str) -> None: - """Notify the workspace of an apply end""" - self.workspace.state.audit( - "sqlmesh_apply_end", - success=True, - 
environment=environment, - plan_id=plan_id, - elapsed=time.time() - self._apply_start, - ) - - def notify_apply_failure(self, environment: str, plan_id: str, exc: str) -> None: - """Notify the workspace of an apply failure""" - self.workspace.state.audit( - "sqlmesh_apply_failure", - success=False, - environment=environment, - plan_id=plan_id, - error=exc, - elapsed=time.time() - self._apply_start, - ) - - def notify_migration_start(self) -> None: - """Notify the workspace of a migration start""" - self._migrate_start = time.time() - self.workspace.state.audit( - "sqlmesh_migration_start", - success=True, - ) - - def notify_migration_end(self) -> None: - """Notify the workspace of a migration end""" - self.workspace.state.audit( - "sqlmesh_migration_end", - success=True, - elapsed=time.time() - self._migrate_start, - ) - - def notify_migration_failure(self, exc: str) -> None: - """Notify the workspace of a migration failure""" - self.workspace.state.audit( - "sqlmesh_migration_failure", - success=False, - error=exc, - elapsed=time.time() - self._migrate_start, - ) - - def notify_audit_failure(self, audit_error: AuditError) -> None: - """Notify the workspace of an audit failure""" - self.workspace.state.audit( - "sqlmesh_audit_failure", - success=False, - sql=audit_error.sql(), - name=audit_error.audit_name, - model=audit_error.model_name, # type: ignore - err_msg=str(audit_error), - elapsed=1.0, - ) diff --git a/src/cdf/legacy/__init__.py b/src/cdf/legacy/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/cdf/legacy/config.py b/src/cdf/legacy/config.py deleted file mode 100644 index 50263fe..0000000 --- a/src/cdf/legacy/config.py +++ /dev/null @@ -1,151 +0,0 @@ -"""The config module provides a configuration provider for CDF scoped settings. - -This allows for the configuration to be accessed and modified in a consistent manner across -the codebase leveraging dlt's configuration provider interface. It also makes all of dlt's -semantics which depend on the configuration providers seamlessly work with CDF's configuration. -""" - -import typing as t -from collections import ChainMap -from contextlib import contextmanager - -import dynaconf -from dlt.common.configuration.container import Container -from dlt.common.configuration.providers import ConfigProvider as _ConfigProvider -from dlt.common.configuration.providers import EnvironProvider -from dlt.common.configuration.specs.config_providers_context import ( - ConfigProvidersContext, -) -from dlt.common.utils import update_dict_nested - - -class CdfConfigProvider(_ConfigProvider): - """A configuration provider for CDF scoped settings.""" - - def __init__(self, scope: t.ChainMap[str, t.Any], secret: bool = False) -> None: - """Initialize the provider. - - Args: - config: The configuration ChainMap. 
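A hedged sketch of how the `CDFNotificationTarget` removed above was meant to be wired up: SQLMesh notification targets are attached through the project `Config`, and each audited run/apply/migration event lands in the workspace state store. The `build_sqlmesh_config` helper is hypothetical, and this assumes the SQLMesh versions targeted here accept a `notification_targets` list on `Config`:

```python
from sqlmesh.core.config import Config

from cdf.integrations.sqlmesh import CDFNotificationTarget  # removed in this patch


def build_sqlmesh_config(workspace) -> Config:
    # `workspace` is assumed to be a cdf Workspace instance; events audited by
    # the target are written to workspace.state.
    return Config(
        notification_targets=[CDFNotificationTarget(workspace=workspace)],
    )
```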
- """ - if not isinstance(scope, ChainMap): - scope = ChainMap(scope) - self._scope = scope - self._secret = secret - - def get_value( - self, key: str, hint: t.Type[t.Any], pipeline_name: str, *sections: str - ) -> t.Tuple[t.Optional[t.Any], str]: - """Get a value from the configuration.""" - _ = hint - if pipeline_name: - sections = ("pipelines", pipeline_name, "options", *sections) - parts = (*sections, key) - fqn = ".".join(parts) - - try: - return self._scope[fqn], fqn - except KeyError: - return None, fqn - - def set_value( - self, key: str, value: t.Any, pipeline_name: str, *sections: str - ) -> None: - """Set a value in the configuration.""" - if pipeline_name: - sections = ("pipelines", pipeline_name, "options", *sections) - parts = (*sections, key) - fqn = ".".join(parts) - if isinstance(value, dynaconf.Dynaconf): - if key is None: - self._scope.maps[-1] = t.cast(dict, value) - else: - self._scope.maps[-1][fqn].update(value) - return None - else: - if key is None: - if isinstance(value, dict): - self._scope.update(value) - return None - else: - raise ValueError("Cannot set a value without a key") - this = self._scope - for key in parts[:-1]: - if key not in this: - this[key] = {} - this = this[key] - if isinstance(value, dict) and isinstance(this[parts[-1]], dict): - update_dict_nested(this[parts[-1]], value) - else: - this[parts[-1]] = value - - @property - def name(self) -> str: - """The name of the provider""" - return "CDF Configuration Provider" - - @property - def supports_sections(self) -> bool: - """This provider supports sections""" - return True - - @property - def supports_secrets(self) -> bool: - """There is no differentiation between secrets and non-secrets for the cdf provider. - - Nothing is persisted. Data is available in memory and backed by the dynaconf settings object. - """ - return self._secret - - @property - def is_writable(self) -> bool: - """Whether the provider is writable""" - return True - - -@t.overload -def get_config_providers( - scope: t.ChainMap[str, t.Any], /, include_env: bool = False -) -> t.Tuple[CdfConfigProvider, CdfConfigProvider]: ... - - -@t.overload -def get_config_providers( - scope: t.ChainMap[str, t.Any], /, include_env: bool = True -) -> t.Tuple[EnvironProvider, CdfConfigProvider, CdfConfigProvider]: ... - - -def get_config_providers( - scope: t.ChainMap[str, t.Any], /, include_env: bool = True -) -> t.Union[ - t.Tuple[CdfConfigProvider, CdfConfigProvider], - t.Tuple[EnvironProvider, CdfConfigProvider, CdfConfigProvider], -]: - """Get the configuration providers for the given scope.""" - cdf_providers = ( - CdfConfigProvider(scope), - CdfConfigProvider(scope, secret=True), - ) - if include_env: - return (EnvironProvider(), *cdf_providers) - return cdf_providers - - -@contextmanager -def inject_configuration( - scope: t.ChainMap[str, t.Any], /, include_env: bool = True -) -> t.Iterator[t.Mapping[str, t.Any]]: - """Inject the configuration provider into the context - - This allows dlt.config and dlt.secrets to access the scope configuration. Furthermore - it makes the scope configuration available throughout dlt where things such as extract, - normalize, and load settings can be specified. 
- """ - ctx = Container()[ConfigProvidersContext] - prior = ctx.providers.copy() - ctx.providers = list(get_config_providers(scope, include_env=include_env)) - yield scope - ctx.providers = prior - - -__all__ = ["CdfConfigProvider", "get_config_providers", "inject_configuration"] diff --git a/src/cdf/legacy/constants.py b/src/cdf/legacy/constants.py deleted file mode 100644 index 7727056..0000000 --- a/src/cdf/legacy/constants.py +++ /dev/null @@ -1,13 +0,0 @@ -"""Constants used by CDF.""" - -CDF_ENVIRONMENT = "CDF_ENVIRONMENT" -"""Environment variable to set the environment of the project.""" - -DEFAULT_ENVIRONMENT = "dev" -"""Default environment for the project.""" - -CDF_MAIN = "__cdf_main__" -"""A sentinel value that will match the __name__ attribute of a module being executed by CDF.""" - -CDF_LOG_LEVEL = "RUNTIME__LOG_LEVEL" -"""Environment variable to set the log level of the project.""" diff --git a/src/cdf/legacy/context.py b/src/cdf/legacy/context.py deleted file mode 100644 index a7e6daa..0000000 --- a/src/cdf/legacy/context.py +++ /dev/null @@ -1,47 +0,0 @@ -"""The context module provides thread-safe context variables and injection mechanisms. - -It facilitates communication between specifications and runtime modules. -""" - -import typing as t -import uuid -from contextvars import ContextVar - -import dlt - -if t.TYPE_CHECKING: - from cdf.legacy.project import Project - - -active_project: ContextVar["Project"] = ContextVar("active_project") -"""The active workspace context variable. - -The allows the active workspace to be passed to user-defined scripts. The workspace -has a reference to the project configuration and filesystem. -""" - -active_pipeline: ContextVar[dlt.Pipeline] = ContextVar("active_pipeline") -"""Stores the active pipeline. - -This is the primary mechanism to pass a configured pipeline to user-defined scripts. -""" - -debug_mode: ContextVar[bool] = ContextVar("debug_mode", default=False) -"""The debug mode context variable. - -Allows us to mutate certain behaviors in the runtime based on the debug mode. User can -optionally introspect this. -""" - -extract_limit: ContextVar[int] = ContextVar("extract_limit", default=0) -"""The extract limit context variable. - -Lets us set a limit on the number of items to extract from a source. This variable -can be introspected by user-defined scripts to optimize for partial extraction. -""" - -execution_id: ContextVar[str] = ContextVar("execution_id", default=str(uuid.uuid4())) -"""The execution ID context variable.""" - - -__all__ = ["active_project", "active_pipeline", "debug_mode", "extract_limit"] diff --git a/src/cdf/legacy/filesystem.py b/src/cdf/legacy/filesystem.py deleted file mode 100644 index a96e11f..0000000 --- a/src/cdf/legacy/filesystem.py +++ /dev/null @@ -1,106 +0,0 @@ -"""A central interface for filesystems thinly wrapping fsspec.""" - -import posixpath -import typing as t -from pathlib import Path - -import dlt -import fsspec -from dlt.common.configuration import with_config -from fsspec.core import strip_protocol -from fsspec.implementations.dirfs import DirFileSystem -from fsspec.utils import get_protocol - -from cdf.types import PathLike - - -# TODO: Add UPath integration... -class FilesystemAdapter: - """Wraps an fsspec filesystem. - - The filesystem is lazily loaded. Certain methods are intercepted to include cdf-specific logic. Helper - methods are provided for specific operations. 
- """ - - @with_config(sections=("filesystem",)) - def __init__( - self, - uri: PathLike = dlt.config.value, - root: t.Optional[PathLike] = None, - options: t.Optional[t.Dict[str, t.Any]] = None, - ) -> None: - """Load a filesystem from a provider and kwargs. - - Args: - uri: The filesystem URI. - options: The filesystem provider kwargs. - """ - uri = _resolve_local_uri(uri, root) - if isinstance(uri, Path): - uri = uri.resolve().as_uri() - options = options or {} - options.setdefault("auto_mkdir", True) - CdfFs = type("CdfFs", (DirFileSystem,), {"protocol": "cdf"}) - self.wrapped = CdfFs( - path=posixpath.join(strip_protocol(uri), "x")[:-1], - fs=fsspec.filesystem(get_protocol(uri), **options), - auto_mkdir=True, - ) - self.uri = uri - self.mapper = self.wrapped.get_mapper() - - def __repr__(self) -> str: - return f"{type(self).__name__}(uri={self.uri!r})" - - def __str__(self) -> str: - return self.uri - - def __getattr__(self, name: str) -> t.Any: - """Proxy attribute access to the wrapped filesystem.""" - return getattr(self.wrapped, name) - - def __getitem__(self, value: str) -> t.Any: - """Get a path from the filesystem.""" - return self.mapper[value] - - def __setitem__(self, key: str, value: t.Any) -> None: - """Set a path in the filesystem.""" - self.mapper[key] = value - - def open(self, path: PathLike, mode: str = "r", **kwargs: t.Any) -> t.Any: - """Open a file from the filesystem. - - Args: - path: The path to the file. - mode: The file mode. - kwargs: Additional kwargs. - - Returns: - The file handle. - """ - return self.wrapped.open(path, mode, **kwargs) - - -def _resolve_local_uri(uri: PathLike, root: t.Optional[PathLike] = None) -> PathLike: - """Resolve a local URI to an absolute path. If the URI is already absolute, it is returned as-is. - - URIs with protocols other than "file" are returned as-is. - - Args: - uri: The URI to resolve. - root: The root path to use. - - Returns: - The resolved URI. - """ - uri_str = str(uri) - proto = get_protocol(uri_str) - root_proto = "file" - if root and proto == root_proto: - uri_str = uri_str.replace(f"{root_proto}://", "") - if not Path(uri_str).is_absolute(): - uri = Path(root, uri_str).resolve().as_uri() - return uri - - -__all__ = ["FilesystemAdapter"] diff --git a/src/cdf/legacy/logger.py b/src/cdf/legacy/logger.py deleted file mode 100644 index bfdad6a..0000000 --- a/src/cdf/legacy/logger.py +++ /dev/null @@ -1,169 +0,0 @@ -"""Logger for CDF""" - -from __future__ import annotations - -import contextlib -import logging -import typing as t -import warnings - -from rich.logging import RichHandler - -if t.TYPE_CHECKING: - - class Representable(t.Protocol): - def __str__(self) -> str: ... - - class LogMethod(t.Protocol): - """Protocol for logger methods.""" - - def __call__( - self, msg: Representable, *args: t.Any, **kwargs: t.Any - ) -> None: ... - - -__all__ = [ - "configure", - "create", - "set_level", - "LOG_LEVEL", - "LOGGER", -] - - -class CDFLoggerAdapter(logging.LoggerAdapter): - extra: t.Dict[str, t.Any] - logger: logging.Logger - - -LOGGER = CDFLoggerAdapter(logging.getLogger("cdf"), {}) -"""CDF logger instance.""" - -LOG_LEVEL = logging.INFO -"""The active log level for CDF.""" - - -def configure(level: int | str = logging.INFO) -> None: - """Configure logging. - - Args: - level (int, optional): Logging level. Defaults to logging.INFO. 
- """ - if LOGGER.extra.get("configured"): - return - LOGGER.setLevel(LOG_LEVEL := level) - console_handler = RichHandler( - LOG_LEVEL, - markup=True, - rich_tracebacks=True, - omit_repeated_times=False, - ) - LOGGER.logger.addHandler(console_handler) - LOGGER.extra["configured"] = True - - -@t.overload -def create(name: t.Literal["cdf"] | None) -> CDFLoggerAdapter: ... - - -@t.overload -def create(name: str) -> logging.Logger: ... - - -def create(name: str | None = None) -> CDFLoggerAdapter | logging.Logger: - """Get or create a logger. - - Args: - name (str, optional): The name of the logger. If None, the package logger is - returned. Defaults to None. If a name is provided, a child logger is - created. - - Returns: - The logger. - """ - if name is None: - return LOGGER - return LOGGER.logger.getChild(name) - - -def log_level() -> str: - """Returns current log level""" - return logging.getLevelName(LOGGER.logger.level) - - -def set_level(level: int | str) -> None: - """Set the package log level. - - Args: - level (int | str): The new log level. - - Raises: - ValueError: If the log level is not valid. - """ - global LOG_LEVEL - - if not LOGGER.extra.get("configured"): - configure(LOG_LEVEL := level) - else: - LOGGER.setLevel(LOG_LEVEL := level) - - -@contextlib.contextmanager -def suppress_and_warn() -> t.Iterator[None]: - """Suppresses exception and logs it as warning""" - try: - yield - except Exception: - LOGGER.warning("Suppressed exception", exc_info=True) - - -@contextlib.contextmanager -def mute() -> t.Iterator[None]: - """Mute the logger.""" - LOGGER.logger.disabled = True - try: - yield - finally: - LOGGER.logger.disabled = False - - -def __getattr__(name: str) -> "LogMethod": - """Get a logger method from the package logger.""" - if not LOGGER.extra.get("configured"): - configure() - - def wrapper(msg: "Representable", *args: t.Any, **kwargs: t.Any) -> None: - stacklevel = 3 if name == "exception" else 2 - getattr(LOGGER, name)(msg, *args, **kwargs, stacklevel=stacklevel) - - return wrapper - - -def _monkeypatch_dlt() -> None: - """Monkeypatch the dlt logging module.""" - from dlt.common import logger - - patched = create("dlt") - setattr(logger, "_init_logging", lambda *a, **kw: patched) - setattr(logger, "LOGGER", patched) - - -def _monkeypatch_sqlglot() -> None: - """Monkeypatch the sqlglot logging module.""" - logger = logging.getLogger("sqlglot") - patched = create("sqlglot") - logger.handlers = patched.handlers - logger.setLevel(logging.ERROR) - logger.propagate = False - warnings.filterwarnings( - "ignore", - message=r"^Possible nested set .*", - category=FutureWarning, - module="sqlglot", - ) - - -def apply_patches() -> None: - """Apply logger patches.""" - _monkeypatch_dlt() - _monkeypatch_sqlglot() diff --git a/src/cdf/legacy/project.py b/src/cdf/legacy/project.py deleted file mode 100644 index 3abcd88..0000000 --- a/src/cdf/legacy/project.py +++ /dev/null @@ -1,989 +0,0 @@ -"""The project module provides a way to define a project and its workspaces. - -Everything in CDF is described via a simple configuration structure. We parse this configuration -using dynaconf which provides a simple way to load configuration from various sources such as -environment variables, YAML, TOML, JSON, and Python files. It also provides many other features -such as loading .env files, env-specific configuration, templating via @ tokens, and more. The -configuration is then validated with pydantic to ensure it is correct and to give us well defined -types to work with. 
The underlying dynaconf settings object is stored in the `wrapped` attribute -of the Project and Workspace settings objects. This allows us to access the raw configuration -values if needed. ChainMaps are used to provide a scoped view of the configuration. This enables -a powerful layering mechanism where we can override configuration values at different levels. -Finally, we provide a context manager to inject the project configuration into the dlt context -which allows us to access the configuration throughout the dlt codebase and in data pipelines. - -Example: - -```toml -# cdf.toml -[default] -name = "cdf-example" -version = "0.1.0" -workspaces = ["alex"] -filesystem.uri = "file://_storage" -feature_flags.provider = "filesystem" -feature_flags.filename = "feature_flags.json" - -[prod] -filesystem.uri = "gcs://bucket/path" -``` - -```toml -# alex/cdf.toml -[pipelines.us_cities] # alex/pipelines/us_cities_pipeline.py -version = 1 -dataset_name = "us_cities_v0_{version}" -description = "Get US city data" -options.full_refresh = false -options.runtime.dlthub_telemetry = false -``` -""" - -import itertools -import os -import time -import typing as t -from collections import ChainMap -from contextlib import contextmanager, suppress -from enum import Enum -from functools import cached_property, lru_cache -from pathlib import Path - -import duckdb -import dynaconf -import pydantic -from dynaconf.utils.boxing import DynaBox -from dynaconf.vendor.box import Box - -import cdf.legacy.constants as c -import cdf.legacy.specification as spec -from cdf.integrations.feature_flag import ( - AbstractFeatureFlagAdapter, - get_feature_flag_adapter_cls, -) -from cdf.legacy.config import inject_configuration -from cdf.legacy.filesystem import FilesystemAdapter -from cdf.legacy.state import StateStore -from cdf.types import M, PathLike - -if t.TYPE_CHECKING: - from sqlmesh.core.config import GatewayConfig - -T = t.TypeVar("T") - - -class _BaseSettings(pydantic.BaseModel): - """A base model for CDF settings""" - - model_config = pydantic.ConfigDict( - frozen=True, - use_attribute_docstrings=True, - from_attributes=True, - populate_by_name=True, - ) - - _generation: float = pydantic.PrivateAttr(default_factory=time.monotonic) - """A monotonic timestamp of when the model was generated""" - - def __hash__(self) -> int: - return hash(self.model_dump_json()) - - def __eq__(self, other: t.Any) -> bool: - if not isinstance(other, type(self)): - return False - return self.model_dump() == other.model_dump() - - def is_newer_than(self, other: "Project") -> bool: - """Check if the model is newer than another model""" - return self._generation > other._generation - - def is_older_than(self, other: "Project") -> bool: - """Check if the model is older than another model""" - return self._generation < other._generation - - def model_dump(self, **kwargs: t.Any) -> t.Dict[str, t.Any]: - """Dump the model to a dictionary""" - kwargs.setdefault("by_alias", True) - return super().model_dump(**kwargs) - - -class FilesystemConfig(_BaseSettings): - """Configuration for a filesystem provider""" - - uri: str = "_storage" - """The filesystem URI - - This is based on fsspec. See https://filesystem-spec.readthedocs.io/en/latest/index.html - This supports all filesystems supported by fsspec as well as filesystem chaining. - """ - options_: t.Annotated[ - t.Tuple[t.Tuple[str, t.Any], ...], pydantic.Field(alias="options") - ] = () - """The filesystem options - - Options are passed to the filesystem provider as keyword arguments. 
- """ - - _project: t.Optional["Project"] = None - """The project this configuration belongs to""" - - @pydantic.field_validator("options_", mode="before") - @classmethod - def _options_validator(cls, value: t.Any) -> t.Any: - """Convert the options to an immutable tuple of tuples""" - if isinstance(value, dict): - value = tuple(value.items()) - return value - - @property - def options(self) -> t.Dict[str, t.Any]: - """Get the filesystem options as a dictionary""" - return dict(self.options_) - - @property - def project(self) -> "Project": - """Get the project this configuration belongs to""" - if self._project is None: - raise ValueError("Filesystem configuration not associated with a project") - return self._project - - @property - def has_project_association(self) -> bool: - """Check if the configuration is associated with a project""" - return self._project is not None - - def get_adapter(self) -> M.Result[FilesystemAdapter, Exception]: - """Get a filesystem adapter""" - if self.has_project_association: - root = self.project.path - else: - root = None - try: - return M.ok(FilesystemAdapter(self.uri, root, self.options)) - except Exception as e: - return M.error(e) - - -class FeatureFlagProviderType(str, Enum): - """The feature flag provider""" - - FILESYSTEM = "filesystem" - HARNESS = "harness" - LAUNCHDARKLY = "launchdarkly" - SPLIT = "split" - NOOP = "noop" - - -class BaseFeatureFlagConfig(_BaseSettings): - """Base configuration for a feature flags provider""" - - provider: FeatureFlagProviderType - """The feature flags provider""" - - _project: t.Optional["Project"] = None - """The project this configuration belongs to""" - - @property - def project(self) -> "Project": - """Get the project this configuration belongs to""" - if self._project is None: - raise ValueError("Feature flag configuration not associated with a project") - return self._project - - @property - def has_project_association(self) -> bool: - """Check if the configuration is associated with a project""" - return self._project is not None - - def get_adapter( - self, **kwargs: t.Any - ) -> M.Result[AbstractFeatureFlagAdapter, Exception]: - """Get a handle to the feature flag adapter""" - options = self.model_dump() - provider = str(options.pop("provider").value) - options.update(kwargs) - return get_feature_flag_adapter_cls(provider).map( - lambda cls: cls(**options, filesystem=self.project.fs_adapter.wrapped) - ) - - -class FilesystemFeatureFlagConfig(BaseFeatureFlagConfig): - """Configuration for a feature flags provider that uses the configured filesystem""" - - provider: t.Literal[FeatureFlagProviderType.FILESYSTEM] = ( - FeatureFlagProviderType.FILESYSTEM - ) - """The feature flags provider""" - filename: str = "feature_flags.json" - """The feature flags filename. - - This is a format string that can include the following variables: - - `name`: The project name - - `workspace`: The workspace name - - `environment`: The environment name - - `source`: The source name - - `resource`: The resource name - - `version`: The version number of the component - """ - - -class HarnessFeatureFlagConfig(BaseFeatureFlagConfig): - """Configuration for a feature flags provider that uses the Harness API""" - - provider: t.Literal[FeatureFlagProviderType.HARNESS] = ( - FeatureFlagProviderType.HARNESS - ) - """The feature flags provider""" - api_key: str = pydantic.Field( - os.getenv("HARNESS_API_KEY", ...), - pattern=r"^[ps]at\.[a-zA-Z0-9_\-]+\.[a-fA-F0-9]+\.[a-zA-Z0-9_\-]+$", - ) - """The harness API key. 
Get it from your user settings""" - sdk_key: pydantic.UUID4 = pydantic.Field(os.getenv("HARNESS_SDK_KEY", ...)) - """The harness SDK key. Get it from the environment management page of the FF module""" - account: str = pydantic.Field( - os.getenv("HARNESS_ACCOUNT_ID", ...), - min_length=22, - max_length=22, - pattern=r"^[a-zA-Z0-9_\-]+$", - ) - """The harness account ID. We will attempt to read it from the environment if not provided.""" - organization: str = pydantic.Field(os.getenv("HARNESS_ORG_ID", "default")) - """The harness organization ID. We will attempt to read it from the environment if not provided.""" - project_: str = pydantic.Field( - os.getenv("HARNESS_PROJECT_ID", ...), alias="project" - ) - """The harness project ID. We will attempt to read it from the environment if not provided.""" - - -class LaunchDarklyFeatureFlagSettings(BaseFeatureFlagConfig): - """Configuration for a feature flags provider that uses the LaunchDarkly API""" - - provider: t.Literal[FeatureFlagProviderType.LAUNCHDARKLY] = ( - FeatureFlagProviderType.LAUNCHDARKLY - ) - """The feature flags provider""" - api_key: str = pydantic.Field( - os.getenv("LAUNCHDARKLY_API_KEY", ...), - pattern=r"^[a-zA-Z0-9_\-]+$", - ) - """The LaunchDarkly API key. Get it from your user settings""" - - -class SplitFeatureFlagSettings(BaseFeatureFlagConfig): - """Configuration for a feature flags provider that uses the Split API""" - - provider: t.Literal[FeatureFlagProviderType.SPLIT] = FeatureFlagProviderType.SPLIT - """The feature flags provider""" - api_key: str = pydantic.Field( - os.getenv("SPLIT_API_KEY", ...), - pattern=r"^[a-zA-Z0-9_\-]+$", - ) - """The Split API key. Get it from your user settings""" - - -class NoopFeatureFlagSettings(BaseFeatureFlagConfig): - """Configuration for a feature flags provider that does nothing""" - - provider: t.Literal[FeatureFlagProviderType.NOOP] = FeatureFlagProviderType.NOOP - """The feature flags provider""" - - -FeatureFlagConfig = t.Union[ - FilesystemFeatureFlagConfig, - HarnessFeatureFlagConfig, - LaunchDarklyFeatureFlagSettings, - SplitFeatureFlagSettings, - NoopFeatureFlagSettings, -] -"""A union of all feature flag provider configurations""" - - -class Workspace(_BaseSettings): - """A workspace is a collection of pipelines, sinks, publishers, scripts, and notebooks in a subdirectory of the project""" - - workspace_path: t.Annotated[Path, pydantic.Field(alias="path")] = Path(".") - """The path to the workspace within the project path""" - project_path: Path = Path(".") - """The path to the project""" - name: t.Annotated[ - str, pydantic.Field(pattern=r"^[a-zA-Z0-9_\-]+$", min_length=3, max_length=32) - ] = "default" - """The name of the workspace""" - owner: t.Optional[str] = None - """The owner of the workspace""" - pipelines: t.Tuple[spec.PipelineSpecification, ...] = () - """Pipelines move data from sources to sinks""" - sinks: t.Tuple[spec.SinkSpecification, ...] = () - """A sink is a destination for data""" - publishers: t.Tuple[spec.PublisherSpecification, ...] = () - """Publishers send data to external systems""" - scripts: t.Tuple[spec.ScriptSpecification, ...] = () - """Scripts are used to automate tasks""" - notebooks: t.Tuple[spec.NotebookSpecification, ...] = () - """Notebooks are used for data analysis and reporting""" - - _project: t.Optional["Project"] = None - """The project this workspace belongs to. 
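A brief sketch of how the provider-discriminated `FeatureFlagConfig` union removed above turns raw settings into a typed model, using pydantic v2's `TypeAdapter` (the deleted code already relies on pydantic v2). The filename value is illustrative:

```python
import typing as t

import pydantic

from cdf.legacy.project import FeatureFlagConfig  # removed in this patch

adapter = pydantic.TypeAdapter(
    t.Annotated[FeatureFlagConfig, pydantic.Field(discriminator="provider")]
)
ff_settings = adapter.validate_python(
    {"provider": "filesystem", "filename": "flags/{workspace}.json"}
)
print(type(ff_settings).__name__)  # -> FilesystemFeatureFlagConfig
```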
Set by the project model validator.""" - - @pydantic.field_validator( - "pipelines", "sinks", "publishers", "scripts", "notebooks", mode="before" - ) - @classmethod - def _workspace_component_validator( - cls, value: t.Any, info: pydantic.ValidationInfo - ): - """Parse component dictionaries into an array of components inject the workspace path""" - if isinstance(value, dict): - # name : {config} - cmps = [] - for key, cmp in value.items(): - if isinstance(cmp, (str, Path)): - # name : path - cmp = {"path": cmp} - cmp.setdefault("name", key) - cmps.append(cmp) - value = cmps - elif hasattr(value, "__iter__") and not isinstance(value, (str, bytes)): - # [{configA}, ...] - value = list(value) - else: - raise ValueError( - "Invalid workspace component configuration, must be either a dict or a list" - ) - for cmp in value: - # TODO: gut check this, its interesting how the tree-like structure - # of project -> workspace -> component requires us to bubble down - # the accumulated path since each layer is a separate model validator - # and component validator ultimately relies on a fully resolvable path - cmp["root_path"] = Path( - info.data["project_path"], info.data["workspace_path"] - ) - return value - - @pydantic.model_validator(mode="after") - def _associate_components_with_workspace(self): - """Associate the components with the workspace""" - for cmp in ( - self.pipelines - + self.sinks - + self.publishers - + self.scripts - + self.notebooks - ): - cmp._workspace = self - return self - - @pydantic.field_serializer( - "pipelines", "sinks", "publishers", "scripts", "notebooks" - ) - @classmethod - def _workspace_component_serializer(cls, value: t.Any) -> t.Dict[str, t.Any]: - """Serialize component arrays back to dictionaries""" - return {cmp.name: cmp.model_dump() for cmp in value} - - @property - def path(self) -> Path: - """Get the path to the workspace""" - return self.project_path / self.workspace_path - - def __getitem__(self, key: str) -> t.Any: - """Get a component by name""" - try: - if "." 
in key: - parts = key.split(".") - if ( - parts[0] - in ("pipelines", "sinks", "publishers", "scripts", "notebooks") - and len(parts) > 1 - ): - obj = getattr(self, parts[0]) - obj = next(filter(lambda cmp: cmp.name == parts[1], obj)) - parts = parts[2:] - else: - obj = self - for part in parts: - if hasattr(obj, "__getitem__"): - obj = obj[part] - else: - obj = getattr(obj, part) - return obj - else: - return getattr(self, key) - except AttributeError: - pass - raise KeyError(f"Component not found: {key}") - - def __setitem__(self, key: str, value: t.Any) -> None: - """Set a component by name""" - raise NotImplementedError("Cannot set components") - - def __delitem__(self, key: str) -> None: - """Delete a component by name""" - raise NotImplementedError("Cannot delete components") - - def __len__(self) -> int: - """Get the number of components""" - return ( - len(self.pipelines) - + len(self.sinks) - + len(self.publishers) - + len(self.scripts) - + len(self.notebooks) - ) - - def __iter__( - self, - ) -> t.Iterator[spec.CoreSpecification]: - """Iterate over the components""" - return itertools.chain( - self.pipelines, - self.sinks, - self.publishers, - self.scripts, - self.notebooks, - ) - - def __contains__(self, key: str) -> bool: - """Check if a component exists""" - return key in self.get_component_names() - - def get_component_names(self) -> t.List[str]: - """Get the component names""" - return list( - itertools.chain( - (f"pipelines.{cmp.name}" for cmp in self.pipelines), - (f"sinks.{cmp.name}" for cmp in self.sinks), - (f"publishers.{cmp.name}" for cmp in self.publishers), - (f"scripts.{cmp.name}" for cmp in self.scripts), - (f"notebooks.{cmp.name}" for cmp in self.notebooks), - ) - ) - - keys = get_component_names - values = __iter__ - - def items(self) -> t.Iterator[t.Tuple[str, spec.CoreSpecification]]: - """Iterate over the components""" - return ((cmp, self[cmp]) for cmp in self.get_component_names()) - - def _get_spec( - self, name: str, kind: str - ) -> M.Result[spec.CoreSpecification, KeyError]: - """Get a component spec by name""" - for cmp in getattr(self, kind): - if cmp.name == name: - return M.ok(cmp) - return M.error(KeyError(f"{kind[:-1].title()} not found: {name}")) - - def get_pipeline_spec( - self, name: str - ) -> M.Result[spec.PipelineSpecification, Exception]: - """Get a pipeline by name""" - return t.cast( - M.Result[spec.PipelineSpecification, Exception], - self._get_spec(name, "pipelines"), - ) - - def get_sink_spec(self, name: str) -> M.Result[spec.SinkSpecification, Exception]: - """Get a sink by name""" - return t.cast( - M.Result[spec.SinkSpecification, Exception], - self._get_spec(name, "sinks"), - ) - - def get_publisher_spec( - self, name: str - ) -> M.Result[spec.PublisherSpecification, Exception]: - """Get a publisher by name""" - return t.cast( - M.Result[spec.PublisherSpecification, Exception], - self._get_spec(name, "publishers"), - ) - - def get_script_spec( - self, name: str - ) -> M.Result[spec.ScriptSpecification, Exception]: - """Get a script by name""" - return t.cast( - M.Result[spec.ScriptSpecification, Exception], - self._get_spec(name, "scripts"), - ) - - def get_notebook_spec( - self, name: str - ) -> M.Result[spec.NotebookSpecification, Exception]: - """Get a notebook by name""" - return t.cast( - M.Result[spec.NotebookSpecification, Exception], - self._get_spec(name, "notebooks"), - ) - - @property - def project(self) -> "Project": - """Get the project this workspace belongs to""" - if self._project is None: - raise 
ValueError("Workspace not associated with a project") - return self._project - - @property - def has_project_association(self) -> bool: - """Check if the workspace is associated with a project""" - return self._project is not None - - @contextmanager - def inject_configuration(self) -> t.Iterator[None]: - """Inject the workspace configuration into the context""" - with self.project.inject_configuration(self.name): - yield - - @property - def fs_adapter(self) -> FilesystemAdapter: - """Get a handle to the project filesystem adapter""" - return self.project.fs_adapter - - @property - def ff_adapter(self) -> AbstractFeatureFlagAdapter: - """Get a handle to the project feature flag adapter""" - return self.project.ff_adapter - - @property - def state(self) -> StateStore: - """Get a handle to the project state store""" - return self.project.state - - def get_transform_gateways(self) -> t.Iterator[t.Tuple[str, "GatewayConfig"]]: - """Get the SQLMesh gateway configurations""" - for sink in self.sinks: - with suppress(KeyError): - yield sink.name, sink.get_transform_config() - - def get_transform_context(self, name: t.Optional[str] = None): - """Get the SQLMesh context for the workspace - - We expect a config.py file in the workspace directory that uses the - `get_transform_gateways` method to populate the SQLMesh Config.gateways key. - - Args: - name: The name of the gateway to use. - - Returns: - The SQLMesh context. - """ - import sqlmesh - - return sqlmesh.Context(paths=self.path, gateway=name) - - -class Project(_BaseSettings): - """A project is a collection of workspaces and configuration settings""" - - path: Path = Path(".") - """The path to the project""" - name: str = pydantic.Field( - pattern=r"^[a-zA-Z0-9_\-]+$", - min_length=3, - max_length=32, - default_factory=lambda: "CDF-" + os.urandom(4).hex(sep="-", bytes_per_sep=2), - ) - """The name of the project""" - version: str - """The version of the project, this discriminates between project and workspace config""" - owner: t.Optional[str] = None - """The owner of the project""" - documentation: t.Optional[str] = None - """The project documentation""" - workspaces: t.Tuple[Workspace, ...] = (Workspace(),) - """The project workspaces""" - fs: t.Annotated[ - FilesystemConfig, - pydantic.Field(alias="filesystem"), - ] = FilesystemConfig() - """The project filesystem settings""" - ff: t.Annotated[ - FeatureFlagConfig, - pydantic.Field(discriminator="provider", alias="feature_flags"), - ] = FilesystemFeatureFlagConfig() - """The project feature flags provider settings""" - state: StateStore = StateStore() - """The project state connection settings""" - - _wrapped_config: t.Any = {} - """Store a reference to the wrapped configuration""" - - _extra: t.Dict[str, t.Any] = {} - """Stored information set via __setitem__ which is included in scoped dictionaries""" - - @pydantic.field_validator("path", mode="before") - @classmethod - def _path_validator(cls, value: t.Any): - """Resolve the project path - - The project path must be a directory. If it is a string, it will be converted to a Path object. 
- """ - if isinstance(value, str): - value = Path(value) - if not isinstance(value, Path): - raise ValueError("Path must be a string or a Path object") - elif not value.is_dir(): - raise FileNotFoundError(f"Project not found: {value}") - return value.resolve() - - @pydantic.field_validator("workspaces", mode="before") - @classmethod - def _workspaces_validator(cls, value: t.Any, info: pydantic.ValidationInfo): - """Hydrate the workspaces if they are paths. Convert a dict to a list of workspaces. - - If the workspace is a path, load the configuration from the path. - """ - if isinstance(value, str): - # pathA; pathB; pathC - value = list(map(lambda s: s.strip(), value.split(";"))) - elif isinstance(value, dict): - # name : {config} - workspaces = [] - for name, config in value.items(): - config.setdefault("name", name) - workspaces.append(config) - value = workspaces - if isinstance(value, (list, tuple)): - # [{configA} | pathA, {configB}, ...] - workspaces = [] - project_path = Path(info.data["path"]) - for obj in value: - if isinstance(obj, (str, Path)): - # pathA - path = Path(obj) - if path.is_absolute(): - path = path.relative_to(project_path) - config = _load_config(project_path / path) - config["path"] = path - config["project_path"] = project_path - workspaces.append(config) - elif isinstance(obj, dict): - # {configA} - # NOTE: in the component validator, we have heuristics for getting a path - # from a name but we seem to demand a path here, we should be consistent - path = Path(obj.pop("path", None) or obj.pop("workspace_path")) - if path.is_absolute(): - path = path.relative_to(project_path) - obj["path"] = path - obj["project_path"] = project_path - workspaces.append(obj) - else: - raise ValueError("Invalid workspace configuration") - value = workspaces - if not (hasattr(value, "__iter__") and not isinstance(value, (str, bytes))): - raise ValueError("Invalid workspaces configuration, must be an iterable") - return value - - @pydantic.model_validator(mode="after") - def _project_workspaces_validator(self): - """Validate the workspaces - - Workspaces must have unique names and paths. - Workspaces must be subdirectories of the project path. - Workspaces must not be subdirectories of other workspaces. 
- """ - workspace_names = [workspace.name for workspace in self.workspaces] - if len(workspace_names) != len(set(workspace_names)): - raise ValueError("Workspace names must be unique") - workspace_paths = [workspace.path for workspace in self.workspaces] - if len(workspace_paths) != len(set(workspace_paths)): - raise ValueError("Workspace paths must be unique") - if not all(map(lambda path: path.is_relative_to(self.path), workspace_paths)): - raise ValueError( - "Workspace paths must be subdirectories of the project path" - ) - if not all( - map( - lambda path: all( - not other_path.is_relative_to(path) - for other_path in workspace_paths - if other_path != path - ), - workspace_paths, - ) - ): - raise ValueError( - "Workspace paths must not be subdirectories of other workspaces" - ) - return self - - @pydantic.model_validator(mode="after") - def _associate_project_with_children(self): - """Bind the project to the workspaces, filesystem, and feature flags""" - for workspace in self.workspaces: - workspace._project = self - self.ff._project = self - self.fs._project = self - return self - - @pydantic.field_serializer("workspaces") - @classmethod - def _workspace_serializer(cls, value: t.Any) -> t.Dict[str, t.Any]: - """Serialize the workspaces""" - return {workspace.name: workspace.model_dump() for workspace in value} - - def __getitem__(self, key: str) -> t.Any: - """Get an item from the configuration""" - try: - if "." in key: - parts = key.split(".") - if parts[0] == "workspaces" and len(parts) > 1: - obj = self.get_workspace(parts[1]).unwrap() - parts = parts[2:] - else: - obj = self - for i, part in enumerate(parts): - if isinstance(obj, Workspace): - return obj[".".join(parts[i:])] - if hasattr(obj, "__getitem__"): - obj = obj[part] - else: - obj = getattr(obj, part) - return obj - if key == "name": - return self.name - if key in self.model_fields: - return getattr(self, key) - except AttributeError: - pass - return self._wrapped_config[key] - - def __setitem__(self, key: str, value: t.Any) -> None: - """Set an item in the configuration""" - if key in self.model_fields: - raise KeyError( - f"Cannot set `{key}` via string accessor, set the attribute directly instead" - ) - self._extra[key] = value - - def __delitem__(self, key: str) -> None: - """Delete a workspace""" - raise NotImplementedError("Cannot delete workspaces") - - def __len__(self) -> int: - """Get the number of workspaces""" - return len(self.workspaces) - - def __iter__(self) -> t.Iterator[Workspace]: - """Iterate over the workspaces""" - return iter(self.workspaces) - - def __contains__(self, key: str) -> bool: - """Check if a workspace exists""" - return key in self.get_workspace_names() - - def get_workspace_names(self) -> t.List[str]: - """Get the workspace names""" - return [workspace.name for workspace in self.workspaces] - - keys = get_workspace_names - values = __iter__ - - def items(self) -> t.Iterator[t.Tuple[str, Workspace]]: - """Iterate over the workspaces""" - return zip(self.get_workspace_names(), self.workspaces) - - def get_workspace( - self, name: t.Optional[str] = None - ) -> M.Result[Workspace, Exception]: - """Get a workspace by name, if no name is provided, return the default workspace""" - if name is None: - return M.ok(self.workspaces[0]) - for workspace in self.workspaces: - if workspace.name == name: - return M.ok(workspace) - return M.error(KeyError(f"Workspace not found: {name}")) - - def get_workspace_from_path(self, path: PathLike) -> M.Result[Workspace, Exception]: - """Get a workspace by 
path.""" - path = Path(path).resolve() - for workspace in self.workspaces: - if path.is_relative_to(workspace.path): - return M.ok(workspace) - return M.error(ValueError(f"No workspace found at {path}.")) - - def to_scoped_dict(self, workspace: t.Optional[str] = None) -> ChainMap[str, t.Any]: - """Convert the project settings to a scoped dictionary - - Lookups are performed in the following order: - - The extra configuration, holding data set via __setitem__. - - The workspace configuration, if passed. - - The project configuration. - - The wrapped configuration, if available. Typically a dynaconf settings object. - - Boxing allows us to access nested values using dot notation. This is doubly useful - since ChainMaps will move to the next map in the chain if the dotted key is not - fully resolved in the current map. - """ - - def to_box(obj: t.Any) -> Box: - return DynaBox(obj, box_dots=True) - - if workspace: - return ( - self.get_workspace(workspace) - .map( - lambda ws: ChainMap( - to_box(self._extra), - to_box(ws.model_dump()), - to_box(self.model_dump()), - self._wrapped_config, - ) - ) - .unwrap() - ) - return ChainMap( - to_box(self._extra), - to_box(self.model_dump()), - self._wrapped_config, - ) - - @contextmanager - def inject_configuration( - self, workspace: t.Optional[str] = None - ) -> t.Iterator[None]: - """Inject the project configuration into the context""" - with inject_configuration(self.to_scoped_dict(workspace)): - yield - - @cached_property - def fs_adapter(self) -> FilesystemAdapter: - """Get a configured filesystem adapter""" - return self.fs.get_adapter().unwrap() - - @cached_property - def ff_adapter(self) -> AbstractFeatureFlagAdapter: - """Get a handle to the project's configured feature flag adapter""" - return self.ff.get_adapter().unwrap() - - @cached_property - def duckdb(self) -> duckdb.DuckDBPyConnection: - """Get a handle to the project's DuckDB connection""" - conn = duckdb.connect(":memory:") - conn.install_extension("httpfs") - conn.install_extension("json") - conn.register_filesystem(self.fs_adapter.wrapped) - conn.execute("CREATE TABLE workspaces (name TEXT PRIMARY KEY, path TEXT)") - for workspace in self.workspaces: - conn.execute( - "INSERT INTO workspaces (name, path) VALUES (?, ?)", - (workspace.name, workspace.path.as_posix()), - ) - return conn - - def get_workspace_path(self, name: str) -> M.Result[Path, Exception]: - """Get the path to a workspace by name""" - return self.get_workspace(name).map(lambda ws: ws.path) - - @classmethod - def from_path(cls, root: PathLike): - """Load configuration data from a project root path using dynaconf. - - Args: - root: The root path to the project. - - Returns: - A Project object. 
- """ - root_path = Path(root).resolve() - if root_path.is_file(): - root_path = root_path.parent - config = _load_config(root_path) - config["path"] = root_path - project = cls.model_validate(config) - project._wrapped_config = config - return project - - def activate(self) -> t.Callable[[], None]: - """Activate the project and return a deactivation function""" - from cdf.legacy.context import active_project - - token = active_project.set(self) - ctx = self.inject_configuration() - ctx.__enter__() - - def _deactivate() -> None: - """Deactivate the project""" - active_project.reset(token) - ctx.__exit__(None, None, None) - - return _deactivate - - @contextmanager - def activated(self) -> t.Iterator[None]: - """Activate the project for the duration of the context""" - deactivate = self.activate() - yield - deactivate() - - -def _load_config( - path: Path, extensions: t.Optional[t.List[str]] = None -) -> dynaconf.LazySettings: - """Load raw configuration data from a file path using dynaconf. - - Args: - path: The path to the project or workspace directory - - Returns: - A dynaconf.LazySettings object. - """ - extensions = extensions or ["toml", "yaml", "yml", "json", "py"] - if not any(map(lambda ext: path.joinpath(f"cdf.{ext}").is_file(), extensions)): - raise FileNotFoundError(f"No cdf configuration file found: {path}") - - config = dynaconf.LazySettings( - root_path=path, - settings_files=[f"cdf.{ext}" for ext in extensions], - environments=True, - envvar_prefix="CDF", - env_switcher=c.CDF_ENVIRONMENT, - env=c.DEFAULT_ENVIRONMENT, - load_dotenv=True, - ) - - def _eval_lazy(value: t.Any) -> t.Any: - """Evaluate lazy values in the configuration""" - if isinstance(value, dict): - for key, val in value.items(): - value[key] = _eval_lazy(val) - return value - elif isinstance(value, list): - for i, val in enumerate(value): - value[i] = _eval_lazy(val) - return value - if getattr(value, "_dynaconf_lazy_format", None): - value = value(config) - return value - - for key, value in config.items(): - config[key] = _eval_lazy(value) - - return config - - -load_project = M.result(Project.from_path) -"""Load configuration data from a project root path using dynaconf. - -Args: - root: The root path to the project. - -Returns: - A Result monad with a Project object if successful. Otherwise, a Result monad with an error. 
-""" - -if not t.TYPE_CHECKING: - # type checker seems to not like the lru_cache decorator wrapping a monadic lift - # so we can safely hide this from the type checker - load_project = lru_cache(maxsize=25)(load_project) - -__all__ = [ - "load_project", - "Project", - "Workspace", - "FeatureFlagConfig", - "FilesystemConfig", -] diff --git a/src/cdf/legacy/runtime/__init__.py b/src/cdf/legacy/runtime/__init__.py deleted file mode 100644 index 39be86f..0000000 --- a/src/cdf/legacy/runtime/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -from cdf.legacy.runtime.notebook import execute_notebook_specification -from cdf.legacy.runtime.pipeline import execute_pipeline_specification, pipeline -from cdf.legacy.runtime.publisher import execute_publisher_specification -from cdf.legacy.runtime.script import execute_script_specification - -__all__ = [ - "execute_notebook_specification", - "execute_pipeline_specification", - "execute_publisher_specification", - "execute_script_specification", - "pipeline", -] diff --git a/src/cdf/legacy/runtime/common.py b/src/cdf/legacy/runtime/common.py deleted file mode 100644 index 2734111..0000000 --- a/src/cdf/legacy/runtime/common.py +++ /dev/null @@ -1,43 +0,0 @@ -import functools -import typing as t - -import cdf.legacy.logger as logger -from cdf.legacy.project import Project, Workspace -from cdf.legacy.specification.base import BaseComponent -from cdf.types import P - -T = t.TypeVar("T") - - -def _get_project(obj: t.Any) -> Project: - """Get the project associated with the object.""" - if isinstance(obj, Project): - return obj - if isinstance(obj, Workspace): - return obj.project - if isinstance(obj, BaseComponent): - return obj.workspace.project - raise TypeError(f"Expected a Project, Workspace or Component, got {type(obj)}") - - -def with_activate_project(func: t.Callable[P, T]) -> t.Callable[P, T]: - """Attempt to inject the Project associated with the first argument into cdf.context. - - Args: - func: The function to decorate. - - Returns: - The decorated function. - """ - - @functools.wraps(func) - def wrapper(*args: P.args, **kwargs: P.kwargs) -> T: - try: - project = _get_project(args[0]) - except TypeError: - logger.warning(f"Could not get project from {type(args[0])}") - return func(*args, **kwargs) - with project.activated(): - return func(*args, **kwargs) - - return wrapper diff --git a/src/cdf/legacy/runtime/notebook.py b/src/cdf/legacy/runtime/notebook.py deleted file mode 100644 index 2348096..0000000 --- a/src/cdf/legacy/runtime/notebook.py +++ /dev/null @@ -1,109 +0,0 @@ -"""The runtime notebook module is responsible for executing notebooks from notebook specifications. - -It performs the following functions: -- Executes the notebook. -- Writes the output to a designated location in a storage provider. -- Cleans up the rendered notebook if required. 
-""" - -import re -import sys -import time -import typing as t -from contextlib import nullcontext -from datetime import date, datetime -from pathlib import Path - -import papermill - -import cdf.legacy.logger as logger -from cdf.legacy.runtime.common import with_activate_project -from cdf.legacy.specification import NotebookSpecification -from cdf.legacy.state import with_audit -from cdf.types import M - -if t.TYPE_CHECKING: - from nbformat import NotebookNode - - -@with_activate_project -@with_audit( - "execute_notebook", - lambda spec, **params: { - "name": spec.name, - "owner": spec.owner, - "workspace": spec.workspace.name, - "project": spec.project.name, - }, -) -def execute_notebook_specification( - spec: NotebookSpecification, - **params: t.Any, -) -> M.Result["NotebookNode", Exception]: - """Execute a notebook specification. - - Args: - spec: The notebook specification to execute. - storage: The filesystem to use for persisting the output. - **params: The parameters to pass to the notebook. Overrides the notebook spec parameters. - """ - origpath = sys.path[:] - sys.path = [ - str(spec.root_path), - *sys.path, - str(spec.root_path.parent), - ] - try: - merged_params = {**spec.parameters, **params} - output = spec.path.parent.joinpath( - "_rendered", f"{spec.name}.{int(time.time())}.ipynb" - ) - output.parent.mkdir(parents=True, exist_ok=True) - if spec.has_workspace_association: - workspace_context = spec.workspace.inject_configuration() - else: - workspace_context = nullcontext() - with spec._lock, workspace_context: - rv: "NotebookNode" = papermill.execute_notebook( - spec.path, - output, - merged_params, - cwd=spec.root_path, - ) - logger.info( - f"Successfully ran notebook {spec.path} with params {merged_params} rendered into {output}" - ) - storage = spec.workspace.fs_adapter - if storage and spec.storage_path: - storage_path = spec.storage_path.format( - name=spec.name, - date=date.today(), - timestamp=datetime.now().isoformat(timespec="seconds"), - epoch=time.time(), - params=merged_params, - ext=spec.path.suffix, - ) - logger.info(f"Persisting output to {storage_path} with {storage}") - storage.put_file(output, storage_path) - if spec.gc_duration >= 0: - _gc_rendered(output.parent, spec.name, spec.gc_duration) - return M.ok(rv) - except Exception as e: - logger.error(f"Error running notebook {spec.path}: {e}") - return M.error(e) - finally: - sys.path = origpath - - -def _gc_rendered(path: Path, name: str, max_ttl: int) -> None: - """Garbage collect rendered notebooks.""" - now = time.time() - for nb in path.glob(f"{name}.*.ipynb"): - ts_str = re.search(r"\d{10}", nb.stem) - if ts_str: - ts = int(ts_str.group()) - if now - ts > max_ttl: - nb.unlink() - - -__all__ = ["execute_notebook_specification"] diff --git a/src/cdf/legacy/runtime/pipeline.py b/src/cdf/legacy/runtime/pipeline.py deleted file mode 100644 index c40b39f..0000000 --- a/src/cdf/legacy/runtime/pipeline.py +++ /dev/null @@ -1,488 +0,0 @@ -"""The runtime pipeline module is responsible for executing pipelines from pipeline specifications. - -It performs the following functions: -- Injects the runtime context into the pipeline. -- Executes the pipeline. -- Captures metrics during extract. -- Intercepts sources during extract. (if specified, this makes the pipeline a no-op) -- Applies transformations to sources during extract. -- Stages data if a staging location is provided and enabled in the runtime context. -- Forces replace disposition if specified in the runtime context. 
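A sketch of driving the notebook runtime removed above from a workspace. The notebook name (`daily_report`) and `run_date` parameter are hypothetical; execution returns a `Result` wrapping the executed nbformat `NotebookNode`:

```python
from cdf.legacy.project import Project  # removed in this patch
from cdf.legacy.runtime import execute_notebook_specification  # removed in this patch


def run_report_notebook(project: Project) -> None:
    workspace = project.get_workspace("alex").unwrap()
    spec = workspace.get_notebook_spec("daily_report").unwrap()
    node = execute_notebook_specification(spec, run_date="2024-08-19").unwrap()
    print(f"executed {len(node.cells)} cells")
```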
-- Filters resources based on glob patterns. -- Logs a warning if dataset_name is provided in the runtime context. (since we want to manage it) -- Creates a cdf pipeline from a dlt pipeline. -""" - -import fnmatch -import os -import shutil -import typing as t -from contextlib import nullcontext, redirect_stdout, suppress -from pathlib import Path - -import dlt -from dlt.common.destination import TDestinationReferenceArg, TLoaderFileFormat -from dlt.common.pipeline import ExtractInfo, LoadInfo, NormalizeInfo -from dlt.common.schema.typing import ( - TAnySchemaColumns, - TColumnNames, - TSchemaContract, - TWriteDisposition, -) -from dlt.extract.extract import Extract, data_to_sources -from dlt.pipeline.exceptions import SqlClientNotAvailable -from dlt.pipeline.pipeline import Pipeline - -import cdf.legacy.context as context -import cdf.legacy.logger as logger -from cdf.legacy.runtime.common import with_activate_project -from cdf.legacy.specification import PipelineSpecification, SinkSpecification -from cdf.legacy.state import with_audit -from cdf.types import M, P - -T = t.TypeVar("T") - -TPipeline = t.TypeVar("TPipeline", bound=dlt.Pipeline) - - -def _wrap_pipeline(default_factory: t.Callable[P, TPipeline]): - """Wraps dlt.pipeline such that it sources the active pipeline from the context.""" - - def wrapper(*args: P.args, **kwargs: P.kwargs) -> TPipeline: - try: - pipe = context.active_pipeline.get() - pipe.activate() - if kwargs: - logger.warning("CDF runtime detected, ignoring pipeline arguments") - return t.cast(TPipeline, pipe) - except LookupError: - return default_factory(*args, **kwargs) - - return wrapper - - -pipeline = _wrap_pipeline(dlt.pipeline) -"""Gets the active pipeline or creates a new one with the given arguments.""" - - -def _apply_filters( - source: dlt.sources.DltSource, resource_patterns: t.List[str], invert: bool -) -> dlt.sources.DltSource: - """Filters resources in a source based on a list of patterns.""" - return source.with_resources( - *[ - r - for r in source.selected_resources - if any(fnmatch.fnmatch(r, patt) for patt in resource_patterns) ^ invert - ] - ) - - -class RuntimePipeline(Pipeline): - """Overrides certain methods of the dlt pipeline to allow for cdf specific behavior.""" - - specification: PipelineSpecification - - def __init__(self, *args: t.Any, **kwargs: t.Any) -> None: - super().__init__(*args, **kwargs) - - self._force_replace = False - self._dry_run = False - self._metric_accumulator = {} - self._tracked_sources = set() - self._source_hooks = [] - - def configure( - self, - dry_run: bool = False, - force_replace: bool = False, - select: t.Optional[t.List[str]] = None, - exclude: t.Optional[t.List[str]] = None, - ) -> "RuntimePipeline": - """Configures options which affect the behavior of the pipeline at runtime. - - Args: - dry_run: Whether to run the pipeline in dry run mode. - force_replace: Whether to force replace disposition. - select: A list of glob patterns to select resources. - exclude: A list of glob patterns to exclude resources. - - Returns: - RuntimePipeline: The pipeline with source hooks configured. 
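What the `pipeline` wrapper above means for a user pipeline script, sketched under the assumption of a standalone run: inside the cdf runtime the call returns the pre-configured active pipeline (and ignores the arguments with a warning), while outside it falls back to a plain `dlt.pipeline`. The resource and pipeline names are illustrative:

```python
import dlt

from cdf.legacy.runtime import pipeline  # removed in this patch


@dlt.resource
def cities():
    yield from [{"name": "Phoenix"}, {"name": "Denver"}]


# Under the cdf runtime this returns the active RuntimePipeline; standalone it
# behaves exactly like dlt.pipeline(...).
pipe = pipeline(pipeline_name="us_cities", destination="duckdb")
pipe.run(cities())
```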
- """ - S = self.specification - - self._force_replace = force_replace - self._dry_run = dry_run - - def inject_metrics_and_filters( - source: dlt.sources.DltSource, - ) -> dlt.sources.DltSource: - """Injects metrics and filters into the source.""" - return S.inject_metrics_and_filters(source, self._metric_accumulator) - - def apply_selection(source: dlt.sources.DltSource) -> dlt.sources.DltSource: - """Applies selection filters to the source.""" - if not select: - return source - return _apply_filters(source, select, invert=False) - - def apply_exclusion(source: dlt.sources.DltSource) -> dlt.sources.DltSource: - """Applies exclusion filters to the source.""" - if not exclude: - return source - return _apply_filters(source, exclude, invert=True) - - def apply_feature_flags(source: dlt.sources.DltSource) -> dlt.sources.DltSource: - """Applies feature flags to the source. User-defined selection takes precedence.""" - if select: - return source - return S.workspace.ff_adapter.apply_source( - source, - S.workspace.project.name, - S.workspace.name, - ) - - self._source_hooks = [ - inject_metrics_and_filters, - apply_selection, - apply_feature_flags, - apply_exclusion, - ] - return self - - @property - def force_replace(self) -> bool: - """Whether to force replace disposition.""" - return self._force_replace - - @property - def dry_run(self) -> bool: - """Dry run mode.""" - return self._dry_run - - @property - def metric_accumulator(self) -> t.Mapping[str, t.Any]: - """A container for accumulating metrics during extract.""" - return self._metric_accumulator - - @property - def source_hooks( - self, - ) -> t.List[t.Callable[[dlt.sources.DltSource], dlt.sources.DltSource]]: - """The source hooks for the pipeline.""" - return self._source_hooks - - @property - def tracked_sources(self) -> t.Set[dlt.sources.DltSource]: - """The sources tracked by the pipeline.""" - return self._tracked_sources - - def extract( - self, - data: t.Any, - *, - table_name: str = None, # type: ignore[arg-type] - parent_table_name: str = None, # type: ignore[arg-type] - write_disposition: TWriteDisposition = None, # type: ignore[arg-type] - columns: TAnySchemaColumns = None, # type: ignore[arg-type] - primary_key: TColumnNames = None, # type: ignore[arg-type] - schema: dlt.Schema = None, # type: ignore[arg-type] - max_parallel_items: int = None, # type: ignore[arg-type] - workers: int = None, # type: ignore[arg-type] - schema_contract: TSchemaContract = None, # type: ignore[arg-type] - **kwargs: t.Any, - ) -> ExtractInfo: - _ = kwargs - with self._maybe_destination_capabilities(): - sources = data_to_sources( - data, - self, - schema, - table_name, - parent_table_name, - write_disposition, - columns, - primary_key, - schema_contract, - ) - - for i in range(len(sources)): - for hook in self._source_hooks: - sources[i] = hook(sources[i]) - self._tracked_sources.add(sources[i]) - - if self.dry_run: - return self._get_step_info( - step=Extract( - self._schema_storage, - self._normalize_storage_config(), - self.collector, - original_data=data, - ) - ) - - if self.force_replace: - write_disposition = "replace" - - info = self.specification.state_adapter.with_audit( - "extract", - { - "pipeline": self.pipeline_name, - "destination": self.destination.destination_name, - }, - )(super().extract)( - sources, - table_name=table_name, - parent_table_name=parent_table_name, - write_disposition=write_disposition, - columns=columns, - primary_key=primary_key, - schema=schema, - max_parallel_items=max_parallel_items, - workers=workers, - 
schema_contract=schema_contract, - ) - - if self.metric_accumulator: - logger.info( - "Metrics captured during %s extract, sideloading to destination...", - info.pipeline.pipeline_name, - ) - self.specification.state_adapter.with_audit( - "captured_metrics", - { - "load_ids": info.loads_ids, - "pipeline": self.pipeline_name, - "destination": self.destination.destination_name, - }, # type: ignore[arg-type] - )(super().extract)( - dlt.resource( - [ - { - "load_id": load_id, - "metrics": dict(self.metric_accumulator), - } - for load_id in info.loads_ids - ], - name="cdf_runtime_metrics", - write_disposition="append", - columns=[ - {"name": "load_id", "data_type": "text"}, - {"name": "metrics", "data_type": "complex"}, - ], - table_name="_cdf_metrics", - ) - ) - - if self.specification.persist_extract_package: - logger.info( - "Persisting extract package for %s...", info.pipeline.pipeline_name - ) - for package in info.load_packages: - # TODO: move this to a top-level function - root = Path(self.pipelines_dir) - base = Path(package.package_path).relative_to(root) - path = shutil.make_archive( - base_name=package.load_id, - format="gztar", - root_dir=root, - base_dir=base, - logger=logger, - ) - logger.info("Extract package staged at %s", path) - target = f"extracted/{package.load_id}.tar.gz" - self.specification.workspace.fs_adapter.put(path, target) - logger.info("Package uploaded to %s using project fs", target) - Path(path).unlink() - logger.info("Cleaned up staged package") - # TODO: listing and manipulating these should be first-class - # this will enable us to "replay" a pipeline - # logger.info(self.specification.workspace.filesystem.ls("extracted")) - - self.specification.state_adapter.capture_extract_info(info) - return info - - def normalize( - self, - workers: int = 1, - loader_file_format: TLoaderFileFormat = None, # type: ignore[arg-type] - ) -> NormalizeInfo: - info = self.specification.state_adapter.with_audit( - "normalize", - { - "pipeline": self.pipeline_name, - "destination": self.destination.destination_name, - }, - )(super().normalize)(workers, loader_file_format) - self.specification.state_adapter.capture_normalize_info(info) - return info - - def load( - self, - destination: TDestinationReferenceArg = None, # type: ignore[arg-type] - dataset_name: str = None, # type: ignore[arg-type] - credentials: t.Any = None, # type: ignore[arg-type] - *, - workers: int = 20, - raise_on_failed_jobs: bool = False, - ) -> LoadInfo: - info = self.specification.state_adapter.with_audit( - "load", - { - "pipeline": self.pipeline_name, - "destination": self.destination.destination_name, - }, - )(super().load)( - destination, - dataset_name, - credentials, - workers=workers, - raise_on_failed_jobs=raise_on_failed_jobs, - ) - self.specification.state_adapter.capture_load_info(info) - return info - - def run( - self, - data: t.Any = None, - *, - table_name: str = None, # type: ignore[arg-type] - write_disposition: TWriteDisposition = None, # type: ignore[arg-type] - columns: TAnySchemaColumns = None, # type: ignore[arg-type] - primary_key: TColumnNames = None, # type: ignore[arg-type] - schema: dlt.Schema = None, # type: ignore[arg-type] - loader_file_format: TLoaderFileFormat = None, # type: ignore[arg-type] - schema_contract: TSchemaContract = None, # type: ignore[arg-type] - **kwargs: t.Any, - ) -> LoadInfo: - _ = kwargs - if self._force_replace: - write_disposition = "replace" - - return super().run( - data, - table_name=table_name, - write_disposition=write_disposition, - columns=columns, 
- primary_key=primary_key, - schema=schema, - loader_file_format=loader_file_format, - schema_contract=schema_contract, - ) - - -class PipelineResult(t.NamedTuple): - """The result of executing a pipeline specification.""" - - exports: t.Dict[str, t.Any] - pipeline: RuntimePipeline - - -def _audit_props( - pipe_spec: PipelineSpecification, - sink_spec: t.Union[ - TDestinationReferenceArg, - t.Tuple[TDestinationReferenceArg, t.Optional[TDestinationReferenceArg]], - SinkSpecification, - ], - select: t.Optional[t.List[str]] = None, - exclude: t.Optional[t.List[str]] = None, - force_replace: bool = False, - dry_run: bool = False, - enable_stage: bool = True, - quiet: bool = False, - **pipeline_options: t.Any, -) -> t.Dict[str, t.Any]: - """The audit function for executing a pipeline specification.""" - return { - "name": pipe_spec.name, - "owner": pipe_spec.owner, - "sink": getattr(sink_spec, "name", sink_spec), - "select": select, - "exclude": exclude, - "force_replace": force_replace, - "dry_run": dry_run, - "enable_stage": enable_stage, - "quiet": quiet, - "pipeline_options": pipeline_options, - "workspace": pipe_spec.workspace.name, - "project": pipe_spec.project.name, - } - - -@with_activate_project -@with_audit("execute_pipeline", _audit_props) -def execute_pipeline_specification( - pipe_spec: PipelineSpecification, - sink_spec: t.Union[ - TDestinationReferenceArg, - t.Tuple[TDestinationReferenceArg, t.Optional[TDestinationReferenceArg]], - SinkSpecification, - ], - select: t.Optional[t.List[str]] = None, - exclude: t.Optional[t.List[str]] = None, - force_replace: bool = False, - dry_run: bool = False, - enable_stage: bool = True, - quiet: bool = False, - **pipeline_options: t.Any, -) -> M.Result[PipelineResult, Exception]: - """Executes a pipeline specification. - - Args: - pipe_spec: The pipeline specification. - sink_spec: The destination where the pipeline will write data. - select: A list of glob patterns to select resources. - exclude: A list of glob patterns to exclude resources. - force_replace: Whether to force replace disposition. - dry_run: Whether to run the pipeline in dry run mode. - enable_stage: Whether to enable staging. If disabled, staging will be ignored. - quiet: Whether to suppress output. - pipeline_options: Additional dlt.pipeline constructor arguments. - - Returns: - M.Result[PipelineResult, Exception]: The result of executing the pipeline specification. 
- """ - if isinstance(sink_spec, SinkSpecification): - destination, staging = sink_spec.get_ingest_config() - elif isinstance(sink_spec, tuple): - destination, staging = sink_spec - else: - destination, staging = sink_spec, None - - pipeline_options.update( - {"destination": destination, "staging": staging if enable_stage else None} - ) - pipe_reference = pipe_spec.create_pipeline( - RuntimePipeline, **pipeline_options - ).configure(dry_run, force_replace, select, exclude) - token = context.active_pipeline.set(pipe_reference) - - null = open(os.devnull, "w") - maybe_redirect = redirect_stdout(null) if quiet else nullcontext() - try: - with maybe_redirect: - result = PipelineResult(exports=pipe_spec(), pipeline=pipe_reference) - if dry_run: - return M.ok(result) - with ( - suppress(KeyError, SqlClientNotAvailable), - pipe_reference.sql_client() as client, - client.with_staging_dataset(staging=True) as client_staging, - ): - strategy = dlt.config["destination.replace_strategy"] - if strategy in ("insert-from-staging",) and client_staging.has_dataset(): - logger.info( - f"Cleaning up staging dataset {client_staging.dataset_name}" - ) - client_staging.drop_dataset() - return M.ok(result) - except Exception as e: - return M.error(e) - finally: - context.active_pipeline.reset(token) - null.close() - - -__all__ = ["execute_pipeline_specification"] diff --git a/src/cdf/legacy/runtime/publisher.py b/src/cdf/legacy/runtime/publisher.py deleted file mode 100644 index 01810c3..0000000 --- a/src/cdf/legacy/runtime/publisher.py +++ /dev/null @@ -1,83 +0,0 @@ -"""The runtime publisher module is responsible for executing publishers from publisher specifications. - -It performs the following functions: -- Validates the dependencies of the publisher exist. -- Verifies the dependencies are up-to-date. -- Executes the publisher script. -""" - -import datetime -import logging -import typing as t - -import sqlmesh -from sqlmesh.core.dialect import normalize_model_name - -from cdf.legacy.runtime.common import with_activate_project -from cdf.legacy.specification import PublisherSpecification -from cdf.legacy.state import with_audit -from cdf.types import M - -logger = logging.getLogger(__name__) - - -@with_activate_project -@with_audit( - "execute_publisher", - lambda spec, transform_ctx, skip_verification=False: { - "name": spec.name, - "owner": spec.owner, - "depends_on": spec.depends_on, - "skipped_verification": skip_verification, - "gateway": transform_ctx.gateway, - "workspace": spec.workspace.name, - "project": spec.project.name, - }, -) -def execute_publisher_specification( - spec: PublisherSpecification, - transform_ctx: sqlmesh.Context, - skip_verification: bool = False, -) -> M.Result[t.Dict[str, t.Any], Exception]: - """Execute a publisher specification. - - Args: - spec: The publisher specification to execute. - transform_ctx: The SQLMesh context to use for execution. - skip_verification: Whether to skip the verification of the publisher dependencies. - """ - if not skip_verification: - models = transform_ctx.models - for dependency in spec.depends_on: - normalized_name = normalize_model_name( - dependency, transform_ctx.default_catalog, transform_ctx.default_dialect - ) - if normalized_name not in models: - return M.error( - ValueError( - f"Cannot find tracked dependency {dependency} in models." 
- ) - ) - model = models[normalized_name] - snapshot = transform_ctx.get_snapshot(normalized_name) - if not snapshot: - return M.error(ValueError(f"Snapshot not found for {normalized_name}")) - if snapshot.missing_intervals( - datetime.date.today() - datetime.timedelta(days=7), - datetime.date.today() - datetime.timedelta(days=1), - ): - return M.error( - ValueError(f"Model {model} has missing intervals. Cannot publish.") - ) - logger.info(f"Model {model} has no missing intervals.") - logger.info("All tracked dependencies passed interval check.") - else: - logger.warning("Skipping dependency verification.") - try: - return M.ok(spec()) - except Exception as e: - logger.error(f"Error running publisher script {spec.path}: {e}") - return M.error(e) - - -__all__ = ["execute_publisher_specification"] diff --git a/src/cdf/legacy/runtime/script.py b/src/cdf/legacy/runtime/script.py deleted file mode 100644 index 88bff6a..0000000 --- a/src/cdf/legacy/runtime/script.py +++ /dev/null @@ -1,65 +0,0 @@ -"""The runtime script module is responsible for executing scripts from script specifications. - -It performs the following functions: -- Executes the script. -- Optionally captures stdout and returns it as a string. -""" - -import io -import typing as t -from contextlib import nullcontext, redirect_stdout - -import cdf.legacy.logger as logger -from cdf.legacy.runtime.common import with_activate_project -from cdf.legacy.specification import ScriptSpecification -from cdf.legacy.state import with_audit -from cdf.types import M - - -@t.overload -def execute_script_specification( - spec: ScriptSpecification, - capture_stdout: bool = False, -) -> M.Result[t.Dict[str, t.Any], Exception]: ... - - -@t.overload -def execute_script_specification( - spec: ScriptSpecification, - capture_stdout: bool = True, -) -> M.Result[str, Exception]: ... - - -@with_activate_project -@with_audit( - "execute_script", - lambda spec, capture_stdout=False: { - "name": spec.name, - "owner": spec.owner, - "workspace": spec.workspace.name, - "project": spec.project.name, - }, -) -def execute_script_specification( - spec: ScriptSpecification, - capture_stdout: bool = False, -) -> t.Union[M.Result[t.Dict[str, t.Any], Exception], M.Result[str, Exception]]: - """Execute a script specification. - - Args: - spec: The script specification to execute. - capture_stdout: Whether to capture stdout and return it. False returns an empty string. 
- """ - try: - buf = io.StringIO() - maybe_redirect = redirect_stdout(buf) if capture_stdout else nullcontext() - logger.info(f"Running script {spec.path}") - with maybe_redirect: - exports = spec() - return M.ok(buf.getvalue() if capture_stdout else exports) # type: ignore - except Exception as e: - logger.error(f"Error running script {spec.path}: {e}") - return M.error(e) - - -__all__ = ["execute_script_specification"] diff --git a/src/cdf/legacy/specification/__init__.py b/src/cdf/legacy/specification/__init__.py deleted file mode 100644 index d35f5da..0000000 --- a/src/cdf/legacy/specification/__init__.py +++ /dev/null @@ -1,24 +0,0 @@ -import typing as t - -from cdf.legacy.specification.notebook import NotebookSpecification -from cdf.legacy.specification.pipeline import PipelineSpecification -from cdf.legacy.specification.publisher import PublisherSpecification -from cdf.legacy.specification.script import ScriptSpecification -from cdf.legacy.specification.sink import SinkSpecification - -CoreSpecification = t.Union[ - NotebookSpecification, - PipelineSpecification, - PublisherSpecification, - ScriptSpecification, - SinkSpecification, -] - -__all__ = [ - "NotebookSpecification", - "PipelineSpecification", - "PublisherSpecification", - "ScriptSpecification", - "SinkSpecification", - "CoreSpecification", -] diff --git a/src/cdf/legacy/specification/base.py b/src/cdf/legacy/specification/base.py deleted file mode 100644 index ca14604..0000000 --- a/src/cdf/legacy/specification/base.py +++ /dev/null @@ -1,523 +0,0 @@ -"""Base specification classes for continuous data framework components""" - -from __future__ import annotations - -import ast -import importlib -import inspect -import operator -import os -import runpy -import sys -import time -import typing as t -from contextlib import nullcontext, suppress -from pathlib import Path -from threading import Lock - -import dlt -import pydantic -from croniter import croniter - -import cdf.legacy.constants as c -import cdf.legacy.logger as logger - -if t.TYPE_CHECKING: - from cdf.legacy.project import Project, Workspace - -T = t.TypeVar("T") - -_NO_DESCRIPTION = "No description provided." -"""A default description for components if not provided or parsed.""" - - -def _gen_anon_name() -> str: - """Generate an anonymous name for a component.""" - return f"anon_{os.urandom(8).hex()}" - - -def _getmodulename(name: str) -> str: - """Wraps inspect.getmodulename to ensure a module name is returned.""" - rv = inspect.getmodulename(name) - return rv or name - - -class BaseComponent( - pydantic.BaseModel, use_attribute_docstrings=True, from_attributes=True -): - """ - A component specification. - - Components are the building blocks of a data platform. They declaratively describe - the functions within a workspace which extract, load, transform, and publish data. - """ - - name: t.Annotated[ - str, - pydantic.Field( - ..., - default_factory=_gen_anon_name, - pattern=r"^[a-zA-Z0-9_\-\/]+$", - max_length=64, - ), - ] - """The name of the component. Must be unique within the workspace.""" - version: t.Annotated[int, pydantic.Field(1, ge=1, le=999, frozen=True)] = 1 - """The version of the component. - - Used internally to version datasets and serves as an external signal to dependees that something - has changed in a breaking way. All components are versioned. - """ - owner: t.Optional[str] = None - """The owners of the component.""" - description: str = _NO_DESCRIPTION - """The description of the component. 
- - This should help users understand the purpose of the component. For scripts and entrypoints, we - will attempt to extract the relevant docstring. - """ - tags: t.List[str] = [] - """Tags for this component used for component queries and integrations.""" - enabled: bool = True - """Whether this component is enabled. Respected in cdf operations.""" - meta: t.Dict[str, t.Any] = {} - """Arbitrary user-defined metadata for this component. - - Used for user-specific integrations and automation. - """ - - _workspace: t.Optional["Workspace"] = None - """The workspace containing the component. Set by the workspace model validator.""" - - _generation: float = pydantic.PrivateAttr(default_factory=time.monotonic) - """The generation time of the component. Used for ordering components.""" - - def __eq__(self, other: t.Any) -> bool: - """Check if two components are equal.""" - if not isinstance(other, BaseComponent): - return False - same_name_and_version = ( - self.name == other.name and self.version == other.version - ) - if self.has_workspace_association and other.has_workspace_association: - same_workspace = self.workspace.name == other.workspace.name - if ( - self.workspace.has_project_association - and other.workspace.has_project_association - ): - same_project = ( - self.workspace.project.name == other.workspace.project.name - ) - return same_name_and_version and same_workspace and same_project - return same_name_and_version and same_workspace - return same_name_and_version - - def __hash__(self) -> int: - """Hash the component.""" - if not self.has_workspace_association: - if self.workspace.has_project_association: - return hash( - ( - self.workspace.project.name, - self.workspace.name, - self.name, - self.version, - ) - ) - return hash((self.workspace.name, self.name, self.version)) - return hash((self.name, self.version)) - - @property - def workspace(self) -> "Workspace": - """Get the workspace containing the component.""" - if self._workspace is None: - raise ValueError("Component not associated with a workspace.") - return self._workspace - - @property - def has_workspace_association(self) -> bool: - """Check if the component has a workspace association.""" - return self._workspace is not None - - @property - def project(self) -> "Project": - """Get the project containing the component.""" - return self.workspace.project - - @property - def state_adapter(self) -> t.Any: - """Get the state adapter for the component.""" - return self.workspace.state - - @property - def versioned_name(self) -> str: - """Get the versioned name of the component.""" - return f"{self.name}_v{self.version}" - - @property - def owners(self) -> t.List[str]: - """Get the owners.""" - if not self.owner: - return [] - return self.owner.split(",") - - @pydantic.field_validator("tags", mode="before") - @classmethod - def _tags_validator(cls, tags: t.Any) -> t.Sequence[str]: - """Wrap tags in a list.""" - if isinstance(tags, str): - tags = tags.split(",") - return tags - - @pydantic.field_validator("owner", mode="before") - @classmethod - def _owner_validator(cls, owner: t.Any) -> str: - """Ensure owner is a string.""" - if isinstance(owner, (list, tuple)): - owner = ",".join(owner) - return owner - - @pydantic.field_validator("description", mode="after") - @classmethod - def _description_validator(cls, description: str) -> str: - """Ensure the description has no leading whitespace.""" - return inspect.cleandoc(description) - - @pydantic.model_validator(mode="before") # type: ignore - @classmethod - def 
_spec_validator(cls, data: t.Any) -> t.Any: - """Perform validation on the spec ensuring forward compatibility.""" - if isinstance(data, dict): - owners = data.pop("owners", None) - if owners is not None: - data["owner"] = ",".join(owners) - return data - - def __getitem__(self, key: str) -> t.Any: - """Get a field from the component.""" - if key not in self.model_fields: - raise KeyError(f"No attribute {key} found in component {self.name}") - try: - return getattr(self, key) - except AttributeError as e: - raise KeyError(f"Attribute {key} not found in component {self.name}") from e - - -class WorkspaceComponent(BaseComponent): - """A component within a workspace.""" - - component_path: t.Annotated[Path, pydantic.Field(alias="path", frozen=True)] - """The path to the component within the workspace folder.""" - root_path: t.Annotated[Path, pydantic.Field(frozen=True, exclude=True)] = Path(".") - """The base path from which to resolve the component path. - - This is typically the union of the project path and the workspace path but - for standalone components (components created programmatically outside the - context of the cdf taxonomy), it should be set to either the current working - directory (default) or the system root. It is excluded from serialization. - """ - - _folder: str = "." - """The folder within the workspace where components are stored.""" - _extension: str = "py" - """The extension for components of this type.""" - - @property - def path(self) -> Path: - """Get the path to the component.""" - return self.root_path / self.component_path - - @pydantic.model_validator(mode="before") - @classmethod - def _path_from_name_validator(cls, values: t.Any) -> t.Any: - """Infer the path from the name if component_path is not provided. - - Given a name, we apply certain heuristics to infer the path of the component if a - path is not explicitly provided. The heuristics are as follows: - - If the name ends with the component extension (.py), we use the name as the path. - - If the name does NOT end with the component extension, we append the component type - if not present. So a pipeline name like `darksky` would become `darksky_pipeline`. - - We then append the component extension and set the path. So `darksky_pipeline.py` - - The _component_path_validator validator is uniformly responsible for prefixing the - folder name to the path. 
- """ - if isinstance(values, (str, Path)): - values = {"path": values} - elif isinstance(values, dict): - name = values.get("name") - if not name: - return values - if name.endswith((".py", ".ipynb")): - values.setdefault("path", name) - else: - ext = getattr(cls._extension, "default") - typ = getattr(cls._folder, "default")[:-1] - if name.endswith(f"_{typ}"): - p = f"{name}.{ext}" - else: - p = f"{name}_{typ}.{ext}" - values.setdefault("path", p) - return values - - @pydantic.field_validator("name", mode="before") - @classmethod - def _component_name_validator(cls, name: t.Any) -> t.Any: - """Strip the extension from the name.""" - if isinstance(name, str): - return name.rsplit(".", 1)[0] - return name - - @pydantic.field_validator("component_path", mode="before") - @classmethod - def _component_path_validator(cls, component_path: t.Any) -> Path: - """Ensure the component path is a Path and that its a child of the expected folder.""" - path = Path(component_path) - if path.is_absolute(): - raise ValueError("Component path must be a relative path.") - ns = getattr(cls._folder, "default") - if path.parts[0] != ns: - path = Path(ns) / path - return path - - -class Schedulable(pydantic.BaseModel): - """A mixin for schedulable components.""" - - cron_string: t.Annotated[ - str, pydantic.Field(serialization_alias="cron", frozen=True) - ] = "@daily" - """A cron expression for scheduling the primary action associated with the component. - - This is intended to be leveraged by libraries like Airflow. - """ - - @property - def cron(self) -> t.Optional[croniter]: - """Get the croniter instance.""" - if self.cron_string is None: - return None - return croniter(self.cron_string) # TODO: add start time here based on last run - - def next_run(self) -> t.Optional[int]: - """Get the next run time for the component.""" - if self.cron is None: - return None - return self.cron.get_next() - - def is_due(self) -> bool: - """Check if the component is due to run.""" - if self.cron is None: - return False - return self.cron.get_next() <= self.cron.get_current() - - @pydantic.field_validator("cron_string", mode="before") - @classmethod - def _cron_validator(cls, cron_string: t.Any) -> str: - """Ensure the cron expression is valid.""" - if isinstance(cron_string, croniter): - return " ".join(cron_string.expressions) - elif isinstance(cron_string, str): - try: - croniter(cron_string) - except Exception as e: - raise ValueError(f"Invalid cron expression: {cron_string}") from e - else: - return cron_string - raise TypeError( - f"Invalid cron type: {type(cron_string)} is not str or croniter." - ) - - -class InstallableRequirements(pydantic.BaseModel): - """A mixin for components that support installation of requirements.""" - - requirements: t.Annotated[t.List[str], pydantic.Field(frozen=True)] = [] - """The requirements for the component.""" - - @pydantic.field_validator("requirements", mode="before") - @classmethod - def _requirements_validator(cls, requirements: t.Any) -> t.Sequence[str]: - """Wrap requirements in a list.""" - if isinstance(requirements, str): - requirements = requirements.split(",") - return requirements - - def install_requirements(self) -> None: - """Install the component.""" - if not self.requirements: - return - name = getattr(self, "name", self.__class__.__name__) - logger.info(f"Installing requirements for {name}: {self.requirements}") - try: - import pip - except ImportError: - raise ImportError( - "Pip was not found. Please install pip or recreate the virtual environment." 
- ) - pip.main(["install", *self.requirements]) - - -class PythonScript(WorkspaceComponent, InstallableRequirements): - """A python script component.""" - - auto_install: bool = False - """Whether to automatically install the requirements for the script. - - Useful for leaner Docker images which defer certain component dep installs to runtime. - """ - - _lock: Lock = pydantic.PrivateAttr(default_factory=Lock) - """A lock for ensuring thread safety.""" - - @pydantic.model_validator(mode="after") - def _setup_script(self): - """Import the entrypoint and register the component.""" - if self.name.startswith("anon_"): - self.name = self.name.replace("anon_", self.path.stem) - if self.description == _NO_DESCRIPTION: - tree = ast.parse(self.path.read_text()) - with suppress(TypeError): - self.description = ast.get_docstring(tree) or _NO_DESCRIPTION - return self - - def package(self, outputdir: str) -> None: - """Package the component.""" - from pex.bin import pex - - name = getattr(self, "name", self.__class__.__name__) - logger.info(f"Packaging {name}...") - - output = os.path.join(outputdir, f"{name}.pex") - try: - # --inject-env in pex can add the c.CDF_MAIN variable? - # or really any other variable that should be injected - pex.main(["-o", output, ".", *self.requirements]) - except SystemExit as e: - # A failed pex build will exit with a non-zero code - # Successfully built pexes will exit with either 0 or None - if e.code is not None and e.code != 0: - # If the pex fails to build, delete the compromised pex - with suppress(FileNotFoundError): - os.remove(output) - raise - - @property - def main(self) -> t.Callable[[], t.Dict[str, t.Any]]: - """Get the entrypoint function.""" - - def _run() -> t.Any: - """Run the script""" - origpath = sys.path[:] - sys.path = [ - str(self.root_path), - *sys.path, - str(self.root_path.parent), - ] - parts = map( - _getmodulename, - self.path.relative_to(self.root_path).parts, - ) - run_name = ".".join(parts) - if self.has_workspace_association: - workspace_context = self.workspace.inject_configuration() - else: - workspace_context = nullcontext() - try: - with self._lock, workspace_context: - maybe_log_level = dlt.config.get("runtime.log_level", str) - if maybe_log_level: - logger.set_level(maybe_log_level.upper()) - if self.auto_install: - self.install_requirements() - return runpy.run_path( - str(self.path), - run_name=run_name, - init_globals={ - "__file__": str(self.path), - c.CDF_MAIN: run_name, - }, - ) - except SystemExit as e: - if e.code != 0: - raise - return {} - except Exception as e: - logger.exception(f"Error running script {self.name}: {e}") - raise - finally: - sys.path = origpath - - return _run - - def __call__(self) -> t.Dict[str, t.Any]: - """Run the script.""" - return self.main() - - -class PythonEntrypoint(BaseComponent, InstallableRequirements): - """A python entrypoint component.""" - - entrypoint: t.Annotated[ - str, - pydantic.Field( - ..., - frozen=True, - pattern=r"^[a-zA-Z][a-zA-Z0-9_\.]*:[a-zA-Z][a-zA-Z0-9_\.]*$", - ), - ] - """The entrypoint of the component in the format module:func.""" - - @pydantic.model_validator(mode="after") - def _setup_entrypoint(self): - """Import the entrypoint and register the component.""" - if self.name.startswith("anon_"): - mod, func = self.entrypoint.split(":", 1) - self.name = mod.replace(".", "_") + "_" + func.replace(".", "_") - if self.description == _NO_DESCRIPTION: - with logger.suppress_and_warn(): - self.description = self.main(__return_func=1).__doc__ or _NO_DESCRIPTION - return self 
-
-    @property
-    def main(self) -> t.Callable[..., t.Any]:
-        """Get the entrypoint function."""
-        module, func = self.entrypoint.split(":")
-
-        def _run(*args: t.Any, **kwargs: t.Any) -> t.Any:
-            """Execute the entrypoint."""
-            if self.has_workspace_association:
-                workspace_context = self.workspace.inject_configuration()
-            else:
-                workspace_context = nullcontext()
-            with workspace_context:
-                mod = importlib.import_module(module)
-                fn = operator.attrgetter(func)(mod)
-                if kwargs.pop("__return_func", 0):
-                    return fn
-                return fn(*args, **kwargs)
-
-        return _run
-
-    def __call__(self, *args: t.Any, **kwargs: t.Any) -> t.Any:
-        """Run the entrypoint."""
-        return self.main(*args, **kwargs)
-
-
-class CanExecute(t.Protocol):
-    """A protocol specifying the minimum interface executable components satisfy."""
-
-    @property
-    def main(self) -> t.Callable[..., t.Any]: ...
-
-    def __call__(self, *args: t.Any, **kwargs: t.Any) -> t.Any: ...
-
-
-__all__ = [
-    "BaseComponent",
-    "Schedulable",
-    "PythonScript",
-    "PythonEntrypoint",
-    "WorkspaceComponent",
-    "CanExecute",
-]
diff --git a/src/cdf/legacy/specification/notebook.py b/src/cdf/legacy/specification/notebook.py
deleted file mode 100644
index 2d738d9..0000000
--- a/src/cdf/legacy/specification/notebook.py
+++ /dev/null
@@ -1,45 +0,0 @@
-import typing as t
-from threading import Lock
-
-import pydantic
-
-from cdf.legacy.specification.base import InstallableRequirements, WorkspaceComponent
-
-
-class NotebookSpecification(WorkspaceComponent, InstallableRequirements):
-    """A notebook specification."""
-
-    storage_path: t.Optional[str] = None
-    """The path to write the output notebook to for long term storage.
-
-    Uses the configured Project fs provider. This may be gcs, s3, etc.
-
-    This is a format string which will be formatted with the following variables:
-    - name: The name of the notebook.
-    - date: The current date.
-    - timestamp: An ISO formatted timestamp.
-    - epoch: The current epoch time.
-    - params: A dict of the resolved parameters passed to the notebook.
-    """
-
-    parameters: t.Dict[str, t.Any] = {}
-    """Parameters to pass to the notebook when running."""
-    gc_duration: int = 86400 * 3
-    """The duration in seconds to keep the locally rendered notebook in the `_rendered` folder.
-
-    Rendered notebooks are written to the `_rendered` folder of the notebook's parent directory.
-    That folder is not intended to be a permanent storage location. This setting controls how long
-    rendered notebooks are kept before being garbage collected. The default is 3 days. Set to 0 to
-    clean up immediately after execution. Set to -1 to never clean up.
- """ - - _folder: str = "notebooks" - """The folder where notebooks are stored.""" - _extension: str = "ipynb" - """The default extension for notebooks.""" - - _lock: Lock = pydantic.PrivateAttr(default_factory=Lock) - """A lock to ensure the notebook is thread safe.""" - - -__all__ = ["NotebookSpecification"] diff --git a/src/cdf/legacy/specification/pipeline.py b/src/cdf/legacy/specification/pipeline.py deleted file mode 100644 index 5393d99..0000000 --- a/src/cdf/legacy/specification/pipeline.py +++ /dev/null @@ -1,223 +0,0 @@ -"""The spec classes for continuous data framework pipelines.""" - -import atexit -import decimal -import fnmatch -import time -import typing as t - -import dlt -import pydantic -from dlt.common.destination.exceptions import DestinationLoadingViaStagingNotSupported -from dlt.common.typing import TDataItem - -import cdf.legacy.logger as logger -from cdf.legacy.specification.base import PythonEntrypoint, PythonScript, Schedulable - -T = t.TypeVar("T") -TPipeline = t.TypeVar("TPipeline", bound=dlt.Pipeline) - -Metric = t.Union[float, int, decimal.Decimal] -MetricStateContainer = t.MutableMapping[str, t.MutableMapping[str, Metric]] - - -class MetricInterface(t.Protocol): - def __call__( - self, item: TDataItem, metric: t.Optional[t.Any] = None, / - ) -> Metric: ... - - -class PipelineMetricSpecification(PythonEntrypoint): - """Defines metrics which can be captured during pipeline execution""" - - options: t.Dict[str, t.Any] = {} - """Kwargs to pass to the metric function. - - This assumes the metric is a callable which accepts kwargs and returns a metric - interface. If the metric is not parameterized, this should be left empty. - """ - - @property - def func(self) -> MetricInterface: - """A typed property to return the metric function""" - if self.options: - return self.main(**self.options) - return self.main - - def __call__( - self, resource: dlt.sources.DltResource, state: MetricStateContainer - ) -> None: - """Adds a metric aggregator to a resource""" - func = self.func - first = True - resource_name = resource.name - metric_name = self.name - elapsed = 0.0 - - def _aggregator(item: T) -> T: - nonlocal first, elapsed - compstart = time.perf_counter() - if first: - state[resource_name][metric_name] = func(item) - first = False - return item - state[resource_name][metric_name] = func( - item, - state[resource_name][metric_name], - ) - compend = time.perf_counter() - elapsed += compend - compstart - return item - - state.setdefault(resource_name, {}) - resource.add_map(_aggregator) - - def _timing_stats(): - logger.debug( - f"Collecting metric {metric_name} for {resource_name} took {elapsed} seconds" - ) - - atexit.register(_timing_stats) - - -InlineMetricSpecifications = t.Dict[str, t.List[PipelineMetricSpecification]] -"""Mapping of resource name glob patterns to metric specs""" - - -class FilterInterface(t.Protocol): - def __call__(self, item: TDataItem) -> bool: ... - - -class PipelineFilterSpecification(PythonEntrypoint): - """Defines filters which can be applied to pipeline execution""" - - options: t.Dict[str, t.Any] = {} - """Kwargs to pass to the filter function. - - This assumes the filter is a callable which accepts kwargs and returns a filter - interface. If the filter is already a filter interface, this should be left empty. 
- """ - - @property - def func(self) -> FilterInterface: - """A typed property to return the filter function""" - if self.options: - return self.main(**self.options) - return self.main - - def __call__(self, resource: dlt.sources.DltResource) -> None: - """Adds a filter to a resource""" - resource.add_filter(self.func) - - -InlineFilterSpecifications = t.Dict[str, t.List[PipelineFilterSpecification]] -"""Mapping of resource name glob patterns to filter specs""" - - -class PipelineSpecification(PythonScript, Schedulable): - """A pipeline specification.""" - - metrics: InlineMetricSpecifications = {} - """A dict of resource name glob patterns to metric definitions. - - Metrics are captured on a per resource basis during pipeline execution and are - accumulated into the metric_state dict. The metric definitions are callables that - take the current item and the current metric value and return the new metric value. - """ - filters: InlineFilterSpecifications = {} - """A dict of resource name glob patterns to filter definitions. - - Filters are applied on a per resource basis during pipeline execution. The filter - definitions are callables that take the current item and return a boolean indicating - whether the item should be filtered out. - """ - dataset_name: str = "{name}_v{version}" - """The name of the dataset associated with the pipeline. - - Defaults to the versioned name. This string is formatted with the pipeline name, version, meta, and tags. - """ - options: t.Dict[str, t.Any] = {} - """Options available in pipeline scoped dlt config resolution.""" - persist_extract_package: bool = True - """Whether to persist the extract package in the project filesystem.""" - - _folder = "pipelines" - """The folder where pipeline scripts are stored.""" - - @pydantic.model_validator(mode="after") - def _validate_dataset(self: "PipelineSpecification") -> "PipelineSpecification": - """Validate the dataset name and apply formatting.""" - name = self.dataset_name.format( - name=self.name, version=self.version, meta=self.meta, tags=self.tags - ).strip() - self.dataset_name = name or self.versioned_name - return self - - def inject_metrics_and_filters( - self, source: dlt.sources.DltSource, container: MetricStateContainer - ) -> dlt.sources.DltSource: - """Apply metrics and filters defined by the specification to a source. - - For a source to conform to the specification, it must have this method applied to it. You - can manipulate sources without this method, but the metrics and filters will not be applied. - - Args: - source: The source to apply metrics and filters to. - container: The container to store metric state in. This is mutated during execution. - - Returns: - dlt.sources.DltSource: The source with metrics and filters applied. - """ - for resource in source.selected_resources.values(): - for patt, metric in self.metrics.items(): - if fnmatch.fnmatch(resource.name, patt): - for applicator in metric: - applicator(resource, container) - for patt, filter_ in self.filters.items(): - if fnmatch.fnmatch(resource.name, patt): - for applicator in filter_: - applicator(resource) - return source - - def create_pipeline( - self, - klass: t.Type[TPipeline] = dlt.Pipeline, - /, - **kwargs: t.Any, - ) -> TPipeline: - """Convert the pipeline specification to a dlt pipeline object. - - This is a convenience method to create a dlt pipeline object from the specification. The - dlt pipeline is expected to use the name and dataset name from the specification. 
This - is what allows declarative definitions to be associated with runtime artifacts. - - Args: - klass (t.Type[TPipeline], optional): The pipeline class to use. Defaults to dlt.Pipeline. - **kwargs: Additional keyword arguments to pass to the dlt.pipeline constructor. - - Returns: - TPipeline: The dlt pipeline object. - """ - try: - pipe = dlt.pipeline( - pipeline_name=self.name, - dataset_name=self.dataset_name, - **kwargs, - _impl_cls=klass, - ) - except DestinationLoadingViaStagingNotSupported: - logger.warning( - "Destination does not support loading via staging. Disabling staging." - ) - kwargs.pop("staging", None) - pipe = dlt.pipeline( - pipeline_name=self.name, - dataset_name=self.dataset_name, - **kwargs, - _impl_cls=klass, - ) - setattr(pipe, "specification", self) - return pipe - - -__all__ = ["PipelineSpecification"] diff --git a/src/cdf/legacy/specification/publisher.py b/src/cdf/legacy/specification/publisher.py deleted file mode 100644 index 7ab5292..0000000 --- a/src/cdf/legacy/specification/publisher.py +++ /dev/null @@ -1,16 +0,0 @@ -import typing as t - -from cdf.legacy.specification.base import PythonScript, Schedulable - - -class PublisherSpecification(PythonScript, Schedulable): - """A publisher specification.""" - - depends_on: t.List = [] - """The dependencies of the publisher expressed as fully qualified names of SQLMesh tables.""" - - _folder = "publishers" - """The folder where publisher scripts are stored.""" - - -__all__ = ["PublisherSpecification"] diff --git a/src/cdf/legacy/specification/script.py b/src/cdf/legacy/specification/script.py deleted file mode 100644 index 7e3e78b..0000000 --- a/src/cdf/legacy/specification/script.py +++ /dev/null @@ -1,11 +0,0 @@ -from cdf.legacy.specification.base import PythonScript, Schedulable - - -class ScriptSpecification(PythonScript, Schedulable): - """A script specification.""" - - _folder = "scripts" - """The folder where generic scripts are stored.""" - - -__all__ = ["ScriptSpecification"] diff --git a/src/cdf/legacy/specification/sink.py b/src/cdf/legacy/specification/sink.py deleted file mode 100644 index 3c36501..0000000 --- a/src/cdf/legacy/specification/sink.py +++ /dev/null @@ -1,70 +0,0 @@ -import typing as t - -from dlt.common.destination.reference import Destination -from sqlmesh.core.config import GatewayConfig - -from cdf.legacy.specification.base import PythonScript -from cdf.legacy.state import with_audit - - -class SinkSpecification(PythonScript): - """A sink specification.""" - - ingest_config: str = "ingest" - """The variable which holds the ingest configuration (a dlt destination).""" - stage_config: str = "stage" - """The variable which holds the staging configuration (a dlt destination).""" - transform_config: str = "transform" - """The variable which holds the transform configuration (a sqlmesh config).""" - - _exports: t.Optional[t.Dict[str, t.Any]] = None - """Caches the exports from the sink script.""" - - _folder: str = "sinks" - """The folder where sink scripts are stored.""" - - @property - def main(self) -> t.Callable[..., t.Dict[str, t.Any]]: - """Run the sink script.""" - loader = t.cast(t.Callable[..., t.Dict[str, t.Any]], super().main) - return with_audit( - "load_sink", - lambda self=self: { - "name": self.name, - "owner": self.owner, - "workspace": self.workspace.name, - "project": self.project.name, - }, - )(loader) - - def get_ingest_config( - self, - ) -> t.Tuple[Destination, t.Optional[Destination]]: - """Get the ingest configuration.""" - if self._exports is None: - 
-            self._exports = self.main()
-        return self._exports[self.ingest_config], self._exports.get(self.stage_config)
-
-    def get_transform_config(self) -> GatewayConfig:
-        """Get the transform configuration."""
-        if self._exports is None:
-            self._exports = self.main()
-        return GatewayConfig.model_validate(self._exports[self.transform_config])
-
-    @property
-    def ingest(self) -> Destination:
-        """The ingest destination."""
-        return self.get_ingest_config()[0]
-
-    @property
-    def stage(self) -> t.Optional[Destination]:
-        """The stage destination."""
-        return self.get_ingest_config()[1]
-
-    @property
-    def transform(self) -> GatewayConfig:
-        """The transform configuration."""
-        return self.get_transform_config()
-
-
-__all__ = ["SinkSpecification"]
diff --git a/src/cdf/legacy/state.py b/src/cdf/legacy/state.py
deleted file mode 100644
index b54b818..0000000
--- a/src/cdf/legacy/state.py
+++ /dev/null
@@ -1,407 +0,0 @@
-"""The state module is responsible for providing an adapter through which we can persist data"""
-
-import json
-import time
-import typing as t
-from datetime import timedelta, timezone
-
-import pandas as pd
-import pydantic
-from dlt.common.pipeline import ExtractInfo, LoadInfo, NormalizeInfo
-from sqlglot import exp
-from sqlmesh.core.config.connection import (
-    DuckDBConnectionConfig,
-    MySQLConnectionConfig,
-    PostgresConnectionConfig,
-)
-from sqlmesh.core.engine_adapter import EngineAdapter
-
-import cdf.legacy.logger as logger
-from cdf.legacy.context import active_project, execution_id
-from cdf.types import M, P
-
-T = t.TypeVar("T")
-JSON = t.Union[bool, int, float, str, t.List["JSON"], t.Dict[str, "JSON"]]
-
-KV_SCHEMA = {"key": exp.DataType.build("TEXT"), "value": exp.DataType.build("TEXT")}
-"""The schema for the key-value store"""
-
-_PIPELINE_SCHEMA = {
-    "load_id": exp.DataType.build("TEXT"),
-    "timestamp": exp.DataType.build("INT64"),
-    "pipeline": exp.DataType.build("TEXT"),
-    "dataset": exp.DataType.build("TEXT"),
-    "destination_name": exp.DataType.build("TEXT"),
-    "destination_type": exp.DataType.build("TEXT"),
-    "data": exp.DataType.build("TEXT"),
-    "success": exp.DataType.build("BOOLEAN"),
-    "elapsed": exp.DataType.build("FLOAT"),
-    "execution_id": exp.DataType.build("TEXT"),
-}
-
-EXTRACT_SCHEMA = _PIPELINE_SCHEMA.copy()
-"""The schema for the extract store"""
-NORMALIZE_SCHEMA = _PIPELINE_SCHEMA.copy()
-"""The schema for the normalize store"""
-LOAD_SCHEMA = _PIPELINE_SCHEMA.copy()
-"""The schema for the load store"""
-
-AUDIT_SCHEMA = {
-    "event": exp.DataType.build("TEXT"),
-    "timestamp": exp.DataType.build("INT64"),
-    "elapsed": exp.DataType.build("FLOAT"),
-    "success": exp.DataType.build("BOOLEAN"),
-    "properties": exp.DataType.build("TEXT"),
-    "execution_id": exp.DataType.build("TEXT"),
-}
-"""The schema for the audit store"""
-
-
-def _no_props(*args: t.Any, **kwargs: t.Any) -> t.Dict[str, JSON]:
-    """Empty properties"""
-    return {}
-
-
-class StateStore(pydantic.BaseModel):
-    """The state store is responsible for persisting data"""
-
-    model_config = {"frozen": True, "from_attributes": True}
-
-    schema_: t.Annotated[str, pydantic.Field(alias="schema")] = "cdf_state"
-    """The schema in which to store data"""
-    protected: bool = True
-    """Whether the state store is protected, i.e. should never be torn down
-
-    A safety measure to prevent accidental data loss when users are consuming the cdf API
-    directly. This should be set to False when running tests or you know what you're doing.
- """ - - connection: t.Union[ - DuckDBConnectionConfig, - MySQLConnectionConfig, - PostgresConnectionConfig, - ] = DuckDBConnectionConfig(database=".cdf.db") - """The connection configuration to the state store""" - - _adapter: t.Optional[EngineAdapter] = None - """Lazy loaded adapter to the state store""" - - @property - def kv_table(self) -> str: - """The key-value table name""" - return f"{self.schema_}.json_store" - - @property - def extract_table(self) -> str: - """The extract table name""" - return f"{self.schema_}.extract_store" - - @property - def normalize_table(self) -> str: - """The normalize table name""" - return f"{self.schema_}.normalize_store" - - @property - def load_table(self) -> str: - """The load table name""" - return f"{self.schema_}.load_store" - - @property - def audit_table(self) -> str: - """The audit table name""" - return f"{self.schema_}.audit_store" - - @property - def adapter(self) -> EngineAdapter: - """The adapter to the state store""" - if self._adapter is None: - adapter = self.connection.create_engine_adapter() - adapter.create_schema(self.schema_) - adapter.create_state_table(self.kv_table, KV_SCHEMA) - adapter.create_state_table(self.extract_table, EXTRACT_SCHEMA) - adapter.create_state_table(self.normalize_table, NORMALIZE_SCHEMA) - adapter.create_state_table(self.load_table, LOAD_SCHEMA) - adapter.create_state_table(self.audit_table, AUDIT_SCHEMA) - self._adapter = adapter - return self._adapter - - def _execute(self, sql: str) -> None: - """Execute a SQL statement""" - self.adapter.execute(sql) - - def store_json(self, key: str, value: JSON) -> None: - """Store a JSON value""" - with self.adapter.transaction(value is not None), logger.suppress_and_warn(): - self.adapter.delete_from(self.kv_table, f"key = '{key}'") - if value is not None: - self.adapter.insert_append( - self.kv_table, - pd.DataFrame([{"key": key, "value": json.dumps(value)}]), - ) - - def load_json(self, key: str) -> JSON: - """Load a JSON value""" - return json.loads( - self.adapter.fetchone( - exp.select("value").from_(self.kv_table).where(f"key = '{key}'") - )[0] - ) - - __getitem__ = load_json - __setitem__ = store_json - - def __enter__(self, condition: bool = True) -> "StateStore": - """Proxies to the transaction context manager""" - self.__trans = self.adapter.transaction(condition) - return self - - def __exit__(self, exc_type, exc_value, traceback) -> None: - """Proxies to the transaction context manager""" - self.__trans.__exit__(exc_type, exc_value, traceback) - - def __del__(self) -> None: - """Close the connection to the state store""" - if self._adapter is not None: - self.adapter.close() - - def with_audit( - self, - event: str, - input_props: t.Union[t.Callable[P, JSON], t.Dict[str, JSON]] = _no_props, - output_props: t.Callable[[T], t.Dict[str, JSON]] = _no_props, - ) -> t.Callable[[t.Callable[P, T]], t.Callable[P, T]]: - """Decorator to add audit logging to a function - - Args: - event (str): The event name - input_props (Union[Callable[[P], JSON], Dict[str, JSON], optional): A callable that takes the function arguments - and returns a dictionary of properties to log. Alternatively, static props are accepted as a dictionary. - output_props (Callable[[T], Dict[str, JSON], optional): A callable that takes the function return value - and returns a dictionary of properties to log. 
- """ - - def decorator(func: t.Callable[P, T]) -> t.Callable[P, T]: - def wrapper(*args: P.args, **kwargs: P.kwargs) -> T: - audit_event = { - "event": event, - "timestamp": time.time(), - "elapsed": 0, - "success": False, - "properties": ( - input_props(*args, **kwargs) - if callable(input_props) - else input_props - ), - "execution_id": execution_id.get(), - } - start = time.perf_counter() - try: - rv = func(*args, **kwargs) - except Exception as e: - audit_event["elapsed"] = time.perf_counter() - start - with self.adapter.transaction(), logger.suppress_and_warn(): - self.adapter.insert_append( - self.audit_table, - pd.DataFrame([audit_event]), - ) - raise e - audit_event["elapsed"] = time.perf_counter() - start - audit_event["success"] = not isinstance(rv, M.Err) - audit_event["properties"].update(output_props(rv)) - audit_event["properties"] = json.dumps(audit_event["properties"]) - with self.adapter.transaction(), logger.suppress_and_warn(): - self.adapter.insert_append( - self.audit_table, - pd.DataFrame([audit_event]), - ) - return rv - - return wrapper - - return decorator - - def audit( - self, event: str, success: bool = True, elapsed: float = 0.0, **properties: JSON - ) -> None: - """Audit an event""" - payload = { - "event": event, - "timestamp": time.time(), - "elapsed": elapsed, - "success": success, - "properties": json.dumps(properties), - "execution_id": execution_id.get(), - } - with self.adapter.transaction(), logger.suppress_and_warn(): - self.adapter.insert_append( - self.audit_table, - pd.DataFrame([payload]), - ) - - def fetch_audits( - self, *event_names: str, limit: int = 100, failed_only: bool = False - ): - """List all audit events""" - assert limit > 0 and limit < 1000, "Limit must be between 1 and 1000" - q = ( - exp.select("*") - .from_(self.audit_table) - .order_by("timestamp DESC") - .limit(limit) - ) - if failed_only: - q = q.where("success = false") - if event_names: - q = q.where(f"event IN {tuple(event_names)}") - df = self.adapter.fetchdf(q).sort_values("timestamp", ascending=True) - df["timestamp"] = pd.to_datetime(df["timestamp"], unit="s", utc=True) - localtz = timezone(timedelta(seconds=-time.timezone)) - df["timestamp"] = df["timestamp"].dt.tz_convert(localtz) - return df - - def clear_audits(self): - """Clear all audit events""" - self.adapter.delete_from(self.audit_table, "1 = 1") - - def capture_extract_info(self, info: ExtractInfo) -> None: - """Capture extract information""" - d = self._info_to_payload(info) - if not d: - return - with self.adapter.transaction(), logger.suppress_and_warn(): - self.adapter.insert_append(self.extract_table, pd.DataFrame(d)) - - def capture_normalize_info(self, info: NormalizeInfo) -> None: - """Capture normalize information""" - d = self._info_to_payload(info) - if not d: - return - with self.adapter.transaction(), logger.suppress_and_warn(): - self.adapter.insert_append(self.normalize_table, pd.DataFrame(d)) - - def capture_load_info(self, info: LoadInfo) -> None: - """Capture load information""" - d = self._info_to_payload(info) - if not d: - return - with self.adapter.transaction(), logger.suppress_and_warn(): - self.adapter.insert_append(self.load_table, pd.DataFrame(d)) - - @staticmethod - def _info_to_payload( - info: t.Union[ExtractInfo, NormalizeInfo, LoadInfo], - ) -> t.List[t.Dict[str, t.Any]]: - """Convert an info object to a payload""" - payload = [] - for pkg in info.load_packages: - payload.append( - { - "load_id": pkg.load_id, - "timestamp": int(time.time()), - "pipeline": 
info.pipeline.pipeline_name, - "dataset": info.pipeline.dataset_name, - "destination_name": info.pipeline.destination.destination_name, - "destination_type": info.pipeline.destination.destination_type, - "data": json.dumps(pkg.asdict(), default=str), - "success": pkg.state != "aborted", - "elapsed": sum( - [j.elapsed for k in pkg.jobs.keys() for j in pkg.jobs[k]] - ), - "execution_id": execution_id.get(), - } - ) - return payload - - def fetch_extracted( - self, *load_ids: str, limit: int = 100, failed_only: bool = False - ): - """List all extracted data""" - assert limit > 0 and limit < 1000, "Limit must be between 1 and 1000" - q = ( - exp.select("*") - .from_(self.extract_table) - .order_by("timestamp DESC") - .limit(limit) - ) - if failed_only: - q = q.where("success = false") - if load_ids: - q = q.where(f"load_id IN {tuple(load_ids)}") - df = self.adapter.fetchdf(q).sort_values("timestamp", ascending=True) - df["timestamp"] = pd.to_datetime(df["timestamp"], unit="s", utc=True) - localtz = timezone(timedelta(seconds=-time.timezone)) - df["timestamp"] = df["timestamp"].dt.tz_convert(localtz) - return df - - def fetch_normalized( - self, *load_ids: str, limit: int = 100, failed_only: bool = False - ): - """List all normalized data""" - assert limit > 0 and limit < 1000, "Limit must be between 1 and 1000" - q = ( - exp.select("*") - .from_(self.normalize_table) - .order_by("timestamp DESC") - .limit(limit) - ) - if failed_only: - q = q.where("success = false") - if load_ids: - q = q.where(f"load_id IN {tuple(load_ids)}") - df = self.adapter.fetchdf(q).sort_values("timestamp", ascending=True) - df["timestamp"] = pd.to_datetime(df["timestamp"], unit="s", utc=True) - localtz = timezone(timedelta(seconds=-time.timezone)) - df["timestamp"] = df["timestamp"].dt.tz_convert(localtz) - return df - - def fetch_loaded(self, *load_ids: str, limit: int = 100, failed_only: bool = False): - """List all loaded data""" - assert limit > 0 and limit < 1000, "Limit must be between 1 and 1000" - q = ( - exp.select("*") - .from_(self.load_table) - .order_by("timestamp DESC") - .limit(limit) - ) - if failed_only: - q = q.where("success = false") - if load_ids: - q = q.where(f"load_id IN {tuple(load_ids)}") - df = self.adapter.fetchdf(q) - df["timestamp"] = pd.to_datetime(df["timestamp"], unit="s", utc=True) - localtz = timezone(timedelta(seconds=-time.timezone)) - df["timestamp"] = df["timestamp"].dt.tz_convert(localtz) - return df - - -def with_audit( - event: str, - input_props: t.Union[t.Callable[P, JSON], t.Dict[str, JSON]] = _no_props, - output_props: t.Callable[[T], t.Dict[str, JSON]] = _no_props, -) -> t.Callable[[t.Callable[P, T]], t.Callable[P, T]]: - """Decorator to add audit logging to a function given an active project""" - - def decorator(func: t.Callable[P, T]) -> t.Callable[P, T]: - def wrapper(*args: P.args, **kwargs: P.kwargs) -> T: - project = active_project.get(None) - if project is None: - return func(*args, **kwargs) - return project.state.with_audit( - event, - input_props, - output_props, - )(func)(*args, **kwargs) - - return wrapper - - return decorator - - -def audit( - event: str, success: bool = True, elapsed: float = 0.0, **properties: JSON -) -> None: - """Audit an event given an active project""" - properties.setdefault("execution_id", execution_id.get()) - project = active_project.get(None) - if project is not None: - project.state.audit(event, success, elapsed, **properties) diff --git a/src/cdf/legacy/utility/__init__.py b/src/cdf/legacy/utility/__init__.py deleted file mode 
100644 index c279ee7..0000000 --- a/src/cdf/legacy/utility/__init__.py +++ /dev/null @@ -1,25 +0,0 @@ -import typing as t -from operator import itemgetter - -TDict = t.TypeVar("TDict", bound=t.Dict[str, t.Any]) - - -def find_item( - lst: t.List[TDict], key: t.Union[t.Callable[[TDict], t.Any], str], value: t.Any -) -> TDict: - """Find an item in a list by a key-value pair. - - Example: - >>> find_item([{"name": "Alice"}, {"name": "Bob"}], "name", "Bob") - {"name": "Bob"} - - Args: - lst: The list to search. - key: The key function to extract the value from an item or the key name. - value: The value to find. - - Returns: - The item with the matching value. - """ - fn = itemgetter(key) if isinstance(key, str) else key - return next((item for item in lst if fn(item) == value)) diff --git a/src/cdf/legacy/utility/file.py b/src/cdf/legacy/utility/file.py deleted file mode 100644 index 6808c0d..0000000 --- a/src/cdf/legacy/utility/file.py +++ /dev/null @@ -1,75 +0,0 @@ -import json -import typing as t -from pathlib import Path - -import ruamel.yaml as yaml -import tomlkit - -from cdf.types import M - - -def load_file(path: Path) -> M.Result[t.Dict[str, t.Any], Exception]: - """Load a configuration from a file path. - - Args: - path: The file path. - - Returns: - A Result monad with the configuration dictionary if the file format is JSON, YAML or TOML. - Otherwise, a Result monad with an error. - """ - if path.suffix == ".json": - return _load_json(path) - if path.suffix in (".yaml", ".yml"): - return _load_yaml(path) - if path.suffix == ".toml": - return _load_toml(path) - return M.error(ValueError("Invalid file format, must be JSON, YAML or TOML")) - - -def _load_json(path: Path) -> M.Result[t.Dict[str, t.Any], Exception]: - """Load a configuration from a JSON file. - - Args: - path: The file path to a valid JSON document. - - Returns: - A Result monad with the configuration dictionary if the file format is JSON. Otherwise, a - Result monad with an error. - """ - try: - return M.ok(json.loads(path.read_text())) - except Exception as e: - return M.error(e) - - -def _load_yaml(path: Path) -> M.Result[t.Dict[str, t.Any], Exception]: - """Load a configuration from a YAML file. - - Args: - path: The file path to a valid YAML document. - - Returns: - A Result monad with the configuration dictionary if the file format is YAML. Otherwise, a - Result monad with an error. - """ - try: - return M.ok(yaml.round_trip_load(path, preserve_quotes=True)) - except Exception as e: - return M.error(e) - - -def _load_toml(path: Path) -> M.Result[t.Dict[str, t.Any], Exception]: - """Load a configuration from a TOML file. - - Args: - path: The file path to a valid TOML document. - - Returns: - A Result monad with the configuration dictionary if the file format is TOML. Otherwise, a - Result monad with an error. 
- """ - try: - return M.ok(tomlkit.loads(path.read_text()).unwrap()) - except Exception as e: - return M.error(e) diff --git a/tests/legacy/specification/test_notebook.py b/tests/legacy/specification/test_notebook.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/legacy/specification/test_pipeline.py b/tests/legacy/specification/test_pipeline.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/legacy/specification/test_publisher.py b/tests/legacy/specification/test_publisher.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/legacy/specification/test_script.py b/tests/legacy/specification/test_script.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/legacy/specification/test_sink.py b/tests/legacy/specification/test_sink.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/legacy/test_context.py b/tests/legacy/test_context.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/legacy/test_filesystem.py b/tests/legacy/test_filesystem.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/legacy/test_packaging.py b/tests/legacy/test_packaging.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/legacy/test_project.py b/tests/legacy/test_project.py deleted file mode 100644 index 86e00f1..0000000 --- a/tests/legacy/test_project.py +++ /dev/null @@ -1,268 +0,0 @@ -"""Tests for the core.project module.""" - -from pathlib import Path - -import dlt -import pytest - -from cdf.legacy.project import Project, load_project - - -def test_load_project(): - """Test the load_project function.""" - project = load_project("examples/sandbox") - assert project.is_ok() - - err_monad = load_project("examples/idontexist") - with pytest.raises(FileNotFoundError): - err_monad.unwrap() - - project = project.unwrap() - assert isinstance(project, Project) - - -@pytest.fixture -def project(): - """Load the project for testing.""" - return load_project("examples/sandbox").unwrap() - - -def test_project_indexing(project: Project): - """Ensure the project can be indexed. - - The project is a dictionary-like object. It exposes its own configuration, - and it exposes workspaces through the `workspaces` key. Dot notation is also - supported. Dunder methods like `__contains__` and `__len__` apply to - the workspace collection. The project is read-only. Indexing into a Workspace - object will invoke the workspace's __getitem__ method which also supports dot - notation. Hence the project is a tree-like structure. 
- """ - assert project["name"] == "cdf-example" - assert project["version"] == "0.1.0" - assert project["feature_flags.provider"] == "filesystem" - assert project["workspaces.alex"] is project.get_workspace("alex").unwrap() - assert len(project) == 1 - assert len(project["workspaces.alex.pipelines"]) == 3 - assert "alex" in project - assert "jane" not in project - with pytest.raises(KeyError): - project["workspaces.jane"] - with pytest.raises(NotImplementedError): - del project["name"] - assert list(project)[0] is project["workspaces.alex"] - assert project["workspaces.alex.pipelines.us_cities.version"] == 1 - - -def test_project_get_spec(project: Project): - """Ensure the project can get a spec and that we get the same spec each time.""" - spec = ( - project.get_workspace("alex") - .bind(lambda workspace: workspace.get_pipeline_spec("us_cities")) - .unwrap() - ) - assert spec["name"] == "us_cities" - assert callable(spec) - assert spec is ( - project.get_workspace("alex") - .bind(lambda workspace: workspace.get_pipeline_spec("us_cities")) - .unwrap() - ) - - -def test_inject_configuration(project: Project): - """Ensure keys are persisted while injecting configuration.""" - with project.inject_configuration(): - assert dlt.config["something"] == "ok" - dlt.config["other"] = "cool" - assert dlt.config["other"] == "cool" - dlt.secrets["ok.nice.cool"] = "wow" - assert dlt.secrets["ok.nice.cool"] == "wow" - - -def test_round_trip_serialization(project: Project): - """Test that the project can be serialized and deserialized.""" - obj = project.model_dump() - roundtrip = Project.model_validate(obj) - assert roundtrip == project - assert roundtrip.is_newer_than(project) - assert ( - project["workspaces.alex.scripts.nested/hello"] - == roundtrip["workspaces.alex.scripts.nested/hello"] - ) - - -def test_init_ff(project: Project): - """Test that the feature flag adapter is initialized.""" - assert project.ff_adapter is not None - assert project.ff.provider == "filesystem" - # The example project _storage is not committed to git currently - # assert project.ff_adapter["cdf-example.alex.us_cities.us_cities"].to_bool() is True - - -def test_init_fs(project: Project): - """Test that the filesystem adapter is initialized.""" - assert project.fs_adapter is not None - assert project.fs_adapter.protocol == "cdf" - - -def test_init_state(project: Project): - """Test that the state adapter is initialized.""" - # from sqlglot import exp - - adapter = project.state.adapter - assert adapter is not None - # adapter.create_schema("test") - # adapter.create_table("test1", {"name": exp.DataType.build("text")}) - # assert adapter.table_exists("test1") - # project.state.store_json("test", {"name": "alex"}) - # adapter.close() - - -@pytest.fixture -def python_project(): - city_spec = { - "path": Path("pipelines/us_cities_pipeline.py"), - "cron_string": "@daily", - "description": "Get US city data", - "metrics": { - "*": [ - { - "name": "cdf_builtin_metrics_count", - "description": "Counts the number of items in a dataset", - "entrypoint": "cdf.builtin.metrics:count", - }, - { - "name": "cdf_builtin_metrics_max_value", - "description": "Returns the maximum value of a key in a dataset", - "entrypoint": "cdf.builtin.metrics:max_value", - "options": {"key": "zip_code"}, - }, - ] - }, - "filters": {}, - "dataset_name": "test_city", - "options": { - "progress": None, - "full_refresh": False, - "loader_file_format": "insert_values", - "runtime": {"dlthub_telemetry": False}, - }, - } - dota_spec = { - "cron_string": "@daily", 
- "name": "dota2", - "description": "Dota2 is a Massive Online Battle Arena game based on Warcraft.", - "path": Path("pipelines/dota2_pipeline.py"), - } - local_spec = { - "name": "local", - "description": "No description provided.", - "path": Path("sinks/local_sink.py"), - } - httpbin_spec = { - "cron_string": "@daily", - "name": "httpbin", - "description": "A publisher that pushes data to httpbin.org", - "path": Path("publishers/httpbin_publisher.py"), - "depends_on": ["mart.zips"], - } - hello_spec = { - "cron_string": "@daily", - "name": "hello", - "description": "No description provided.", - "path": Path("scripts/hello_script.py"), - } - return Project.model_validate( - { - "path": Path("examples/sandbox").resolve(), - "name": "data-platform", - "version": "0.2.0", - "workspaces": { - "datateam": { - "path": "alex", - "pipelines": {"cities": city_spec, "dota": dota_spec}, - "sinks": {"local": local_spec}, - "publishers": {"httpbin": httpbin_spec}, - "scripts": {"hello": hello_spec}, - } - }, - "filesystem": {"uri": "file://_storage", "options": {}}, - "feature_flags": { - "provider": "filesystem", - "filename": "@jinja dev_flags_{{ 1 + 1}}.json", - }, - } - ) - - -def test_custom_project(python_project: Project): - """Test creating a project programmatically. - - This project has a custom structure and is not loaded from a file. Components - are still ultimately based on python files, however the configuration wrapping - these components is done in code which offers more flexibility. - """ - assert python_project.name == "data-platform" - - -@pytest.fixture -def barebones_project(): - return Project.model_validate( - { - "path": "examples/sandbox", - "name": "data-platform", - "version": "0.2.0", - "workspaces": { - "datateam": { - "path": "alex", - "pipelines": { - "cities": "pipelines/us_cities_pipeline.py", - "dota": {"path": "pipelines/dota2_pipeline.py"}, - }, - "sinks": {"local": "sinks/local_sink.py"}, - "publishers": { - "httpbin": { - "path": "publishers/httpbin_publisher.py", - "depends_on": ["mart.zips"], - } - }, - "scripts": {"hello": "scripts/hello_script.py"}, - } - }, - } - ) - - -def test_barebones_project(barebones_project: Project): - """Test creating a project programmatically with minimal configuration. - - This asserts that certain heuristics are applied to the configuration to - make it more user-friendly. 
- """ - assert barebones_project.name == "data-platform" - assert barebones_project["workspaces.datateam.pipelines.cities"] is not None - assert barebones_project["workspaces.datateam.publishers.httpbin.depends_on"] == [ - "mart.zips" - ] - assert barebones_project["workspaces.datateam.sinks.local.component_path"] == Path( - "sinks/local_sink.py" - ) - assert barebones_project[ - "workspaces.datateam.scripts.hello.component_path" - ] == Path("scripts/hello_script.py") - assert barebones_project[ - "workspaces.datateam.pipelines.cities.component_path" - ] == Path("pipelines/us_cities_pipeline.py") - assert len(barebones_project["workspaces.datateam.pipelines"]) == 2 - assert len(barebones_project["workspaces.datateam.sinks"]) == 1 - assert len(barebones_project["workspaces.datateam.publishers"]) == 1 - assert len(barebones_project["workspaces.datateam.scripts"]) == 1 - assert len(barebones_project["workspaces.datateam"]) == 5 - assert len(barebones_project) == 1 - assert "datateam" in barebones_project - assert "jane" not in barebones_project - with pytest.raises(KeyError): - barebones_project["workspaces.jane"] - with pytest.raises(NotImplementedError): - del barebones_project["name"] - assert list(barebones_project)[0] is barebones_project["workspaces.datateam"] diff --git a/tests/legacy/utility/test_file_.py b/tests/legacy/utility/test_file_.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/test_cli.py b/tests/test_cli.py deleted file mode 100644 index c9a5130..0000000 --- a/tests/test_cli.py +++ /dev/null @@ -1,12 +0,0 @@ -from typer.testing import CliRunner - -from cdf.cli import app - -runner = CliRunner() - - -def test_index(): - result = runner.invoke(app, ["-p", "examples/sandbox", "-w", "alex", "index"]) - assert result.exit_code == 0 - assert "Pipelines" in result.stdout - assert "Sinks" in result.stdout