feat: spark etl application #8

Merged: 40 commits, Sep 18, 2024
Commits
8b85d6e
feat: add base interface for spark app
kkiani Aug 6, 2024
7d97a11
chore(spark): pyright incompatibility with classproperty decorator
kkiani Aug 7, 2024
76bf772
feat(sparkle): structurizing the inputs field in pipeline decorator
kkiani Aug 7, 2024
b2078bf
feat: implementing the ARC software architecture pattern
kkiani Aug 8, 2024
694cc60
fix: circular import for type annotations
kkiani Aug 8, 2024
e3854d0
fix(object_storage): ARC new interface were not supporting
kkiani Aug 8, 2024
3c7a3c3
feat: spark added
kkiani Aug 8, 2024
01e7302
chore(test): add test for environment class
kkiani Aug 8, 2024
dcdd1ad
chore(spark): remove unused method
kkiani Aug 8, 2024
4006700
chore(spark): spark controller tests added
kkiani Aug 8, 2024
8b8632d
chore(spark): add tests for aws spark
kkiani Aug 8, 2024
28df5b0
feat: restructure to match the ARC Pattern
kkiani Aug 12, 2024
1a4ed38
fix(spark): creating glue job per pipeline
kkiani Aug 13, 2024
1ea18ce
chore: restructure the test folder to match the ARC Pattern
kkiani Aug 13, 2024
ce8b28c
chore(spark): code formatting
kkiani Aug 14, 2024
80be07b
fix: cloud connection creating fully connected dependency graph
kkiani Aug 20, 2024
4bbf8f1
chore(glue): add pulumi unit tests
kkiani Aug 20, 2024
ecd6beb
feat(spark): add azure implementations
kkiani Sep 2, 2024
b1e1f17
fix(factory): tag property not getting initialized properly
kkiani Sep 2, 2024
8d741bc
chore(sparkle): add examples
kkiani Sep 6, 2024
ce58c14
feat(pyspark): integrating with sparkle
kkiani Sep 6, 2024
7b25e27
fix(spark): make sparkle an optional dependency
kkiani Sep 6, 2024
23146a8
chore: update package description
kkiani Sep 6, 2024
7e6429c
fix(spark): azure controller not passing applications to parent
kkiani Sep 6, 2024
e9ccc0f
chore(nix): add spark group dependencies
kkiani Sep 6, 2024
8d00a87
Revert "fix(spark): make sparkle an optional dependency"
kkiani Sep 10, 2024
c81e06a
fix(sparkle): applications cant be added during controllers init
kkiani Sep 10, 2024
8d5c91c
chore(sparkle): update example to factory controller
kkiani Sep 10, 2024
8510201
feat(sparkle): revert to app_id to prevent confusions
kkiani Sep 11, 2024
e4af94e
chore(spark): fix typo in the docs
kkiani Sep 11, 2024
869041e
Merge pull request #19 from DataChefHQ/feature/refactor-sparkle-example
callarelli93 Sep 11, 2024
614ac83
fix(sparkle): pin sparkle to v0.3.1 version
kkiani Sep 12, 2024
b639fae
fix(sparkle): depricated application paramter for spark controller
kkiani Sep 12, 2024
e65279a
fix(factory): not passing region to aws controllers
kkiani Sep 12, 2024
3b4df37
fix(controller): auto provision causing resource creation before befo…
kkiani Sep 12, 2024
b4385e9
fix(core): deprecated id_ parameter leftover
kkiani Sep 12, 2024
8a06b65
chore(tests): fix depricated sparkle interface
kkiani Sep 12, 2024
966e5da
chore(sparkle): add tests
kkiani Sep 13, 2024
7081982
chore(sparkle): fix the example with the latest changes
kkiani Sep 13, 2024
1504d60
Merge pull request #21 from DataChefHQ/bugfix/core-api-controllers
kkiani Sep 13, 2024
11 changes: 11 additions & 0 deletions examples/sparkle/Pulumi.yaml
@@ -0,0 +1,11 @@
name: object_storage
runtime:
  name: python
  options:
    toolchain: pip
    virtualenv: venv
description: A minimal Azure Native Python Pulumi program
config:
  pulumi:tags:
    value:
      pulumi:template: azure-python
31 changes: 31 additions & 0 deletions examples/sparkle/__main__.py
@@ -0,0 +1,31 @@
from damavand.cloud.provider import AwsProvider
from damavand.factories import SparkControllerFactory

from applications.orders import CustomerOrders
from applications.products import Products


def main() -> None:
    spark_factory = SparkControllerFactory(
        provider=AwsProvider(
            app_name="my-app",
            region="us-west-2",
        ),
        tags={"env": "dev"},
    )

    spark_controller = spark_factory.new(
        name="my-spark",
    )

    spark_controller.applications = [
        Products(spark_controller.default_session()),
        CustomerOrders(spark_controller.default_session()),
    ]

    spark_controller.run_application("products")
    spark_controller.provision()


if __name__ == "__main__":
    main()
File renamed without changes.
31 changes: 31 additions & 0 deletions examples/sparkle/applications/orders.py
@@ -0,0 +1,31 @@
from sparkle.config import Config
from sparkle.writer.iceberg_writer import IcebergWriter
from sparkle.application import Sparkle

from pyspark.sql import DataFrame
from pyspark.sql import SparkSession


class CustomerOrders(Sparkle):
    def __init__(self, spark_session: SparkSession):
        super().__init__(
            spark_session,
            config=Config(
                app_name="orders",
                app_id="orders-app",
                version="0.0.1",
                database_bucket="s3://test-bucket",
                checkpoints_bucket="s3://test-checkpoints",
            ),
            writers=[
                IcebergWriter(
                    database_name="default",
                    database_path="s3://bucket-name/warehouse",
                    table_name="products",
                    spark_session=spark_session,
                )
            ],
        )

    def process(self) -> DataFrame:
        return self.input["orders"].read()
31 changes: 31 additions & 0 deletions examples/sparkle/applications/products.py
@@ -0,0 +1,31 @@
from sparkle.application import Sparkle
from sparkle.config import Config
from sparkle.writer.iceberg_writer import IcebergWriter

from pyspark.sql import DataFrame
from pyspark.sql import SparkSession


class Products(Sparkle):
    def __init__(self, spark_session: SparkSession):
        super().__init__(
            spark_session,
            config=Config(
                app_name="products",
                app_id="products-app",
                version="0.0.1",
                database_bucket="s3://test-bucket",
                checkpoints_bucket="s3://test-checkpoints",
            ),
            writers=[
                IcebergWriter(
                    database_name="default",
                    database_path="s3://bucket-name/warehouse",
                    table_name="products",
                    spark_session=spark_session,
                )
            ],
        )

    def process(self) -> DataFrame:
        return self.input["products"].read()
3 changes: 3 additions & 0 deletions examples/sparkle/requirements.txt
@@ -0,0 +1,3 @@
-e ../../../damavand
pyspark==3.3.2
pulumi
430 changes: 279 additions & 151 deletions pdm.lock

Large diffs are not rendered by default.

5 changes: 4 additions & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "damavand"
-description = "Default template for PDM package"
+description = "Damavand is an opinionated cloud-agnostic pythonic implementation of ARC design pattern for developing cloud-native applications."
 authors = [
     {name = "Kiarash Kiani", email = "[email protected]"},
 ]
@@ -12,6 +12,8 @@ dependencies = [
     "pulumi>=3.127.0",
     "pulumi-aws>=6.47.0",
     "pulumi-azure-native>=2.51.0",
+    "pulumi-random>=4.16.3",
+    "sparkle @ git+https://github.com/DataChefHQ/[email protected]",
 ]
 requires-python = ">=3.11.0"
 readme = "README.md"
@@ -36,6 +38,7 @@ dev = [
     "pytest-coverage>=0.0",
     "pyright>=1.1.374",
     "moto>=5.0.11",
+    "pip>=24.2",
 ]
 [tool.commitizen]
 version = "1.0.0"
Empty file added src/damavand/base/__init__.py
11 changes: 11 additions & 0 deletions src/damavand/base/controllers/__init__.py
@@ -0,0 +1,11 @@
from .base_controller import ApplicationController, runtime, buildtime
from .object_storage import ObjectStorageController
from .spark import SparkController

__all__ = [
    "ApplicationController",
    "ObjectStorageController",
    "SparkController",
    "runtime",
    "buildtime",
]
66 changes: 66 additions & 0 deletions src/damavand/base/controllers/base_controller.py
@@ -0,0 +1,66 @@
import logging
from functools import cache
from pulumi import Resource as PulumiResource
import pulumi

from damavand import utils


logger = logging.getLogger(__name__)


def buildtime(func):
    def wrapper(self, *args, **kwargs):
        if not utils.is_building():
            logger.warning(
                f"Calling buildtime method `{func.__name__}` during runtime."
            )
            return None

        return func(self, *args, **kwargs)

    return wrapper


def runtime(func):
    def wrapper(self, *args, **kwargs):
        if utils.is_building():
            logger.warning(
                f"Calling runtime method `{func.__name__}` during buildtime."
            )
            return None

        return func(self, *args, **kwargs)

    return wrapper


class ApplicationController(object):
    def __init__(
        self,
        name: str,
        tags: dict[str, str] = {},
        **kwargs,
    ) -> None:
        self.name = name
        self.tags = tags
        self.extra_args = kwargs
        self._pulumi_object = None

    @property
    @buildtime
    @cache
    def build_config(self) -> pulumi.Config:
        return pulumi.Config()

    @buildtime
    @cache
    def resource(self) -> PulumiResource:
        """A lazy method that provisions the resource if it is not provisioned yet and returns the Pulumi object."""

        raise NotImplementedError()

    def provision(self) -> None:
        """Provision the resource if not provisioned yet."""

        _ = self.resource()
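The `buildtime` and `runtime` decorators above gate controller methods on the execution phase: calling a build-time method at runtime (or the reverse) only logs a warning and returns `None` instead of raising. A minimal sketch of that behavior, assuming the import paths from this PR; the subclass and method names below are illustrative, not part of the changeset:

```python
from damavand.base.controllers.base_controller import (
    ApplicationController,
    buildtime,
    runtime,
)


class IllustrativeController(ApplicationController):
    """Hypothetical subclass used only to illustrate the decorators."""

    @buildtime
    def describe_resource(self) -> str:
        # Only meaningful while Pulumi is building the stack.
        return f"resource for {self.name}"

    @runtime
    def handle_request(self) -> str:
        # Only meaningful inside the deployed application.
        return f"handling request for {self.name}"


controller = IllustrativeController(name="demo", tags={"env": "dev"})

# During `pulumi up` (build time), describe_resource() returns a value while
# handle_request() logs a warning and returns None; at runtime the roles flip,
# as decided by utils.is_building().
print(controller.describe_resource())
print(controller.handle_request())
```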
src/damavand/base/controllers/object_storage.py
@@ -1,20 +1,16 @@
-from typing import Iterable, Optional
+from typing import Iterable

-from damavand.resource import BaseResource
+from damavand.base.controllers import ApplicationController


-class BaseObjectStorage(BaseResource):
+class ObjectStorageController(ApplicationController):
     def __init__(
         self,
         name,
-        id_: Optional[str] = None,
         tags: dict[str, str] = {},
         **kwargs,
     ) -> None:
-        super().__init__(name, id_, tags, **kwargs)
-
-    def provision(self):
-        raise NotImplementedError
+        super().__init__(name, tags, **kwargs)

     def read(self, path: str) -> bytes:
         """Read an object from the storage."""
158 changes: 158 additions & 0 deletions src/damavand/base/controllers/spark.py
@@ -0,0 +1,158 @@
import os
import logging
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

from damavand.environment import Environment
from damavand.base.controllers import ApplicationController
from damavand.base.controllers.base_controller import runtime

from sparkle.application import Sparkle


logger = logging.getLogger(__name__)


class SparkController(ApplicationController):
    """
    The SparkController class is the base class for all Spark controller
    implementations for each cloud provider.

    ...

    Attributes
    ----------
    name : str
        the name of the controller.
    applications : list[Sparkle]
        the list of Spark applications.
    tags : dict[str, str]
        the tags of the controller.
    kwargs : dict
        the extra arguments.

    Methods
    -------
    default_local_session()
        Return the default local Spark session.
    default_cloud_session()
        Return the default cloud Spark session.
    default_session()
        Return the currently active Spark session.
    application_with_id(app_id)
        Return the Spark application with the given ID.
    run_application(app_id)
        Run the Spark application with the given ID.
    """

    def __init__(
        self,
        name,
        tags: dict[str, str] = {},
        **kwargs,
    ) -> None:
        ApplicationController.__init__(self, name, tags, **kwargs)
        self.applications: list[Sparkle] = []

    @property
    def _spark_extensions(self) -> list[str]:
        return [
            "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
        ]

    @property
    def _spark_packages(self) -> list[str]:
        return [
            "org.apache.iceberg:iceberg-spark-runtime-3.2_2.12:1.3.1",
            "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0",
            "org.apache.spark:spark-avro_2.12:3.3.0",
        ]

    def default_local_session(self) -> SparkSession:
        """Return the default local Spark session.

        Returns:
            SparkSession: The Spark session.
        """

        ivy_settings_path = os.environ.get("IVY_SETTINGS_PATH", None)
        LOCAL_CONFIG = {
            "spark.sql.extensions": ",".join(self._spark_extensions),
            "spark.jars.packages": ",".join(self._spark_packages),
            "spark.sql.jsonGenerator.ignoreNullFields": False,
            "spark.sql.session.timeZone": "UTC",
            "spark.sql.catalog.spark_catalog": "org.apache.iceberg.spark.SparkSessionCatalog",
            "spark.sql.catalog.spark_catalog.type": "hive",
            "spark.sql.catalog.local": "org.apache.iceberg.spark.SparkCatalog",
            "spark.sql.catalog.local.type": "hadoop",
            "spark.sql.catalog.local.warehouse": "/tmp/warehouse",
            "spark.sql.defaultCatalog": "local",
        }

        spark_conf = SparkConf()

        for key, value in LOCAL_CONFIG.items():
            spark_conf.set(key, str(value))

        spark_session = (
            # NOTE: Pyright does not work with the `@classproperty` decorator used in `SparkSession`. This should, however, be fixed in pyspark v4.
            SparkSession.builder.master("local[*]")  # type: ignore
            .appName("LocalDataProductApp")
            .config(conf=spark_conf)
        )

        if ivy_settings_path:
            spark_session.config("spark.jars.ivySettings", ivy_settings_path)

        return spark_session.getOrCreate()

    def default_cloud_session(self) -> SparkSession:
        """Return the default Spark session provided by the cloud Spark machine.

        Returns:
            SparkSession: The Spark session.
        """

        raise NotImplementedError

    def default_session(self) -> SparkSession:
        """Return the currently active Spark session. If the environment is local, it
        returns the local session. Otherwise, it returns the cloud session.

        Returns:
            SparkSession: The Spark session.
        """
        env = Environment.from_system_env()
        match env:
            case Environment.LOCAL:
                return self.default_local_session()
            case _:
                return self.default_cloud_session()

    def application_with_id(self, app_id: str) -> Sparkle:
        """Return the Spark application with the given ID.

        Args:
            app_id (str): The application ID.

        Returns:
            Sparkle: The Spark application.
        """

        for app in self.applications:
            if app.config.app_id == app_id:
                return app

        raise ValueError(f"Application with ID {app_id} not found.")

    @runtime
    def run_application(self, app_id: str) -> None:
        """Run the Spark application with the given ID.

        Args:
            app_id (str): The application ID.
        """

        app = self.application_with_id(app_id)
        df = app.process()
        app.write(df)
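`default_cloud_session` is deliberately left unimplemented here and overridden by the provider-specific controllers this PR adds (AWS and Azure). A hypothetical, minimal sketch of such an override; the class name and the bare `getOrCreate()` call are illustrative assumptions, not the actual Glue or Azure implementations:

```python
from pyspark.sql import SparkSession

from damavand.base.controllers.spark import SparkController


class IllustrativeCloudSparkController(SparkController):
    """Hypothetical provider-specific controller, for illustration only."""

    def default_cloud_session(self) -> SparkSession:
        # On a managed Spark runtime (e.g. AWS Glue), a session is typically
        # already configured by the platform, so reusing it is enough here.
        return SparkSession.builder.getOrCreate()  # type: ignore


# With this override in place, default_session() picks the local Iceberg-enabled
# session when Environment.from_system_env() reports LOCAL, and the
# platform-provided session otherwise.
controller = IllustrativeCloudSparkController(name="demo-spark", tags={"env": "dev"})
session = controller.default_session()
```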