dbt-labs · MichelleArk · Sep 24, 2024 · Sep 19, 2024 · Sep 20, 2024 · Sep 20, 2024
@@ -0,0 +1,6 @@
+kind: Features
+body: Write microbatch compiled/run targets to separate files, one per batch
+time: 2024-09-20T17:24:19.219556+01:00
+custom:
+  Author: michelleark
+  Issue: "10714"
@@ -521,7 +521,9 @@ def write_graph_file(self, linker: Linker, manifest: Manifest):
             linker.write_graph(graph_path, manifest)
 
     # writes the "compiled_code" into the target/compiled directory
-    def _write_node(self, node: ManifestSQLNode) -> ManifestSQLNode:
+    def _write_node(
+        self, node: ManifestSQLNode, split_suffix: Optional[str] = None
+    ) -> ManifestSQLNode:
         if not node.extra_ctes_injected or node.resource_type in (
             NodeType.Snapshot,
             NodeType.Seed,
@@ -530,7 +532,9 @@ def _write_node(self, node: ManifestSQLNode) -> ManifestSQLNode:
         fire_event(WritingInjectedSQLForNode(node_info=get_node_info()))
 
         if node.compiled_code:
-            node.compiled_path = node.get_target_write_path(self.config.target_path, "compiled")
+            node.compiled_path = node.get_target_write_path(
+                self.config.target_path, "compiled", split_suffix
+            )
             node.write_node(self.config.project_root, node.compiled_path, node.compiled_code)
         return node
 
@@ -540,6 +544,7 @@ def compile_node(
         manifest: Manifest,
         extra_context: Optional[Dict[str, Any]] = None,
         write: bool = True,
+        split_suffix: Optional[str] = None,
     ) -> ManifestSQLNode:
         """This is the main entry point into this code. It's called by
         CompileRunner.compile, GenericRPCRunner.compile, and
@@ -562,7 +567,7 @@ def compile_node(
 
         node, _ = self._recursively_prepend_ctes(node, manifest, extra_context)
         if write:
-            self._write_node(node)
+            self._write_node(node, split_suffix=split_suffix)
         return node
 
 

@@ -51,6 +51,7 @@
     Exposure,
     Macro,
     ManifestNode,
+    ModelNode,
     Resource,
     SeedNode,
     SemanticModel,
@@ -972,7 +973,19 @@
         # macros/source defs aren't 'writeable'.
         if isinstance(self.model, (Macro, SourceDefinition)):
             raise MacrosSourcesUnWriteableError(node=self.model)
-        self.model.build_path = self.model.get_target_write_path(self.config.target_path, "run")
+
+        split_suffix = None
+        if (
+            isinstance(self.model, ModelNode)
+            and self.model.config.get("incremental_strategy") == "microbatch"
+        ):
+            split_suffix = self.model.format_batch_start(
+                self.model.config.get("__dbt_internal_microbatch_event_time_start")
+            )
+
+        self.model.build_path = self.model.get_target_write_path(
+            self.config.target_path, "run", split_suffix=split_suffix
+        )
         self.model.write_node(self.config.project_root, self.model.build_path, payload)
         return ""
 

@@ -2,6 +2,7 @@
 import os
 from dataclasses import dataclass, field
 from datetime import datetime
+from pathlib import Path
 from typing import (
     Any,
     Dict,
@@ -59,6 +60,7 @@
 from dbt.artifacts.resources import SqlOperation as SqlOperationResource
 from dbt.artifacts.resources import TimeSpine
 from dbt.artifacts.resources import UnitTestDefinition as UnitTestDefinitionResource
+from dbt.artifacts.resources.types import BatchSize
 from dbt.contracts.graph.model_config import UnitTestNodeConfig
 from dbt.contracts.graph.node_args import ModelNodeArgs
 from dbt.contracts.graph.unparsed import (
@@ -243,14 +245,25 @@ def clear_event_status(self):
 
 @dataclass
 class ParsedNode(ParsedResource, NodeInfoMixin, ParsedNodeMandatory, SerializableType):
-    def get_target_write_path(self, target_path: str, subdirectory: str):
+    def get_target_write_path(
+        self, target_path: str, subdirectory: str, split_suffix: Optional[str] = None
+    ):
+        """Build the path under ``target_path`` where this node's output is written.
+
+        The result is ``<target_path>/<subdirectory>/<package_name>/<path>``. When
+        ``split_suffix`` is given, the file is instead placed in a directory named
+        after the file's stem, as ``<stem>/<stem>_<suffix><ext>``.
+
+        Characters in ``split_suffix`` other than letters, digits, ``-``, ``_``
+        and ``.`` are replaced with ``-``, so that a suffix such as
+        ``2020-01-01 01:00:00`` neither yields a file name that Windows rejects
+        (``:``) nor introduces extra path separators.
+        """
         # This is called for both the "compiled" subdirectory of "target" and the "run" subdirectory
         if os.path.basename(self.path) == os.path.basename(self.original_file_path):
             # One-to-one relationship of nodes to files.
             path = self.original_file_path
         else:
             #  Many-to-one relationship of nodes to files.
             path = os.path.join(self.original_file_path, self.path)
+
+        if split_suffix:
+            # Keep the suffix within a portable file-name alphabet.
+            safe_suffix = "".join(
+                char if (char.isalnum() or char in "-_.") else "-" for char in split_suffix
+            )
+            pathlib_path = Path(path)
+            split_name = f"{pathlib_path.stem}_{safe_suffix}{pathlib_path.suffix}"
+            path = str(pathlib_path.parent / pathlib_path.stem / split_name)
+
         target_write_path = os.path.join(target_path, subdirectory, self.package_name, path)
         return target_write_path
 
@@ -559,6 +572,16 @@ def infer_primary_key(self, data_tests: List["GenericTestNode"]) -> List[str]:
 
         return []
 
+    def format_batch_start(self, batch_start: Optional[datetime]) -> Optional[str]:
+        """Return batch_start as a string (date only unless batch_size is hour), or None."""
+        if batch_start is None:
+            return None
+
+        # Hourly batches keep the full timestamp; other batch sizes show the date only.
+        if self.config.batch_size == BatchSize.hour:
+            return str(batch_start)
+        return str(batch_start.date())
+
     def same_contents(self, old, adapter_type) -> bool:
         return super().same_contents(old, adapter_type) and self.same_ref_representation(old)
 

@@ -14,7 +14,6 @@
 )
 from dbt.adapters.exceptions import MissingMaterializationError
 from dbt.artifacts.resources import Hook
-from dbt.artifacts.resources.types import BatchSize
 from dbt.artifacts.schemas.results import (
     BaseResult,
     NodeStatus,
@@ -197,11 +196,8 @@
 
     def describe_batch(self, batch_start: Optional[datetime]) -> str:
+        """Return a "batch <start> of <node representation>" description for this node."""
         # Only visualize date if batch_start year/month/day
-        formatted_batch_start = (
-            batch_start.date()
-            if (batch_start and self.node.config.batch_size != BatchSize.hour)
-            else batch_start
-        )
+        formatted_batch_start = self.node.format_batch_start(batch_start)
+
         return f"batch {formatted_batch_start} of {self.get_node_representation()}"
 
     def print_start_line(self):
@@ -463,7 +459,9 @@
                 model.config["__dbt_internal_microbatch_event_time_end"] = batch[1]
 
                 # Recompile node to re-resolve refs with event time filters rendered, update context
-                self.compiler.compile_node(model, manifest, {})
+                self.compiler.compile_node(
+                    model, manifest, {}, split_suffix=model.format_batch_start(batch[0])
+                )
                 context["model"] = model
                 context["sql"] = model.compiled_code
                 context["compiled_code"] = model.compiled_code

@@ -5,6 +5,7 @@
 
 from dbt.tests.util import (
     patch_microbatch_end_time,
+    read_file,
     relation_from_name,
     run_dbt,
     run_dbt_and_capture,
@@ -442,3 +443,78 @@ def test_run_with_event_time(self, project):
         with patch_microbatch_end_time("2020-01-03 13:57:00"):
             run_dbt(["run", "--event-time-start", "2020-01-01"])
         self.assert_row_count(project, "microbatch_model", 2)
+
+
+class TestMicrobatchCompiledRunPaths(BaseMicrobatchTest):
+    @mock.patch.dict(os.environ, {"DBT_EXPERIMENTAL_MICROBATCH": "True"})
+    def test_run_with_event_time(self, project):
+        """After a run, the expected compiled/run files can be read and are non-empty."""
+        # Three daily batches are expected: 2020-01-01, 2020-01-02 and 2020-01-03.
+        with patch_microbatch_end_time("2020-01-03 13:57:00"):
+            run_dbt(["run", "--event-time-start", "2020-01-01"])
+
+        batch_file_names = (
+            "microbatch_model_2020-01-01.sql",
+            "microbatch_model_2020-01-02.sql",
+            "microbatch_model_2020-01-03.sql",
+        )
+
+        # Compiled paths - compiled model without filter only
+        assert read_file(
+            project.project_root, "target", "compiled", "test", "models", "microbatch_model.sql"
+        )
+
+        # Per-batch files sit in a directory named after the model, first under
+        # target/compiled and then under target/run.
+        for target_subdirectory in ("compiled", "run"):
+            for batch_file_name in batch_file_names:
+                assert read_file(
+                    project.project_root,
+                    "target",
+                    target_subdirectory,
+                    "test",
+                    "models",
+                    "microbatch_model",
+                    batch_file_name,
+                )
@@ -13,9 +13,10 @@
     Measure,
     TestMetadata,
 )
+from dbt.artifacts.resources.types import BatchSize
 from dbt.artifacts.resources.v1.semantic_model import NodeRelation
 from dbt.contracts.graph.model_config import TestConfig
-from dbt.contracts.graph.nodes import ColumnInfo, ModelNode, SemanticModel
+from dbt.contracts.graph.nodes import ColumnInfo, ModelNode, ParsedNode, SemanticModel
 from dbt.node_types import NodeType
 from dbt_common.contracts.constraints import (
     ColumnLevelConstraint,
@@ -110,6 +111,22 @@ def test_all_constraints(
 
         assert default_model_node.all_constraints == expected_all_constraints
 
+    @pytest.mark.parametrize(
+        "batch_size,batch_start,expected_formatted_batch_start",
+        [
+            (None, None, None),
+            (BatchSize.year, datetime(2020, 1, 1, 1), "2020-01-01"),
+            (BatchSize.month, datetime(2020, 1, 1, 1), "2020-01-01"),
+            (BatchSize.day, datetime(2020, 1, 1, 1), "2020-01-01"),
+            (BatchSize.hour, datetime(2020, 1, 1, 1), "2020-01-01 01:00:00"),
+        ],
+    )
+    def test_format_batch_start(
+        self, default_model_node, batch_size, batch_start, expected_formatted_batch_start
+    ):
+        """Only the hour batch size keeps the time component; a None start stays None."""
+        default_model_node.config.batch_size = batch_size
+        formatted = default_model_node.format_batch_start(batch_start)
+        assert formatted == expected_formatted_batch_start
+
 
 class TestSemanticModel:
     @pytest.fixture(scope="function")
@@ -391,3 +408,35 @@ def test_disabled_unique_combo_multiple():
 
 def assertSameContents(list1, list2):
+    """Assert that both lists contain the same elements, ignoring order."""
     assert sorted(list1) == sorted(list2)
+
+
+class TestParsedNode:
+    """Tests for ``ParsedNode.get_target_write_path``."""
+
+    @pytest.fixture(scope="class")
+    def parsed_node(self) -> ParsedNode:
+        """A ParsedNode whose ``path`` basename differs from its ``original_file_path``."""
+        return ParsedNode(
+            resource_type=NodeType.Model,
+            unique_id="model.test_package.test_name",
+            name="test_name",
+            package_name="test_package",
+            schema="test_schema",
+            alias="test_alias",
+            fqn=["models", "test_name"],
+            original_file_path="test_original_file_path",
+            checksum=FileHash.from_contents("checksum"),
+            path="test_path.sql",
+            database=None,
+        )
+
+    def test_get_target_write_path(self, parsed_node):
+        """Without a split suffix, ``path`` is joined under ``original_file_path``."""
+        expected_path = (
+            "target_path/subdirectory/test_package/test_original_file_path/test_path.sql"
+        )
+        assert parsed_node.get_target_write_path("target_path", "subdirectory") == expected_path
+
+    def test_get_target_write_path_split(self, parsed_node):
+        """With a split suffix, the file goes into a directory named after its stem."""
+        expected_path = "target_path/subdirectory/test_package/test_original_file_path/test_path/test_path_split.sql"
+        actual_path = parsed_node.get_target_write_path(
+            "target_path", "subdirectory", split_suffix="split"
+        )
+        assert actual_path == expected_path