Fix/brickflow examples (#36)
* update brickflow example with brickflow 0.10.0

* update examples readme file

* include create notebooks dir command

---------

Co-authored-by: pariksheet <[email protected]>
pariksheet authored Sep 9, 2023
1 parent 273fe7e commit 9d960df
Showing 8 changed files with 285 additions and 19 deletions.
140 changes: 139 additions & 1 deletion README.md
@@ -21,12 +21,150 @@ process through a command-line interface (CLI) tool.

Brickflow documentation can be found [here](https://engineering.nike.com/brickflow/).

### Getting Started

#### Prerequisites
1. Install brickflows

```shell
pip install brickflows
```
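
You can sanity-check the install by invoking the CLI (a sketch, assuming the `brickflow` entrypoint exposes the usual `--help` flag):

```shell
brickflow --help
```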

2. Install [Databricks CLI](https://docs.databricks.com/en/dev-tools/cli/databricks-cli.html)

```shell
curl -fsSL https://raw.githubusercontent.com/databricks/setup-cli/main/install.sh | sudo sh
```
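
To confirm the CLI landed on your `PATH`, a quick check (assuming the standard `--version` flag):

```shell
databricks --version
```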

3. Configure the Databricks CLI with a workspace token. This writes to your `~/.databrickscfg` file.

```shell
databricks configure --token
```
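
After this step, `~/.databrickscfg` should contain a profile along these lines (a sketch; the host and token values are placeholders for your own):

```ini
[DEFAULT]
host  = https://<your-workspace>.cloud.databricks.com
token = <your-personal-access-token>
```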

#### Hello World workflow
1. Create your first workflow using brickflow
```shell
mkdir hello-world-brickflow
cd hello-world-brickflow
brickflow projects add
```

2. Provide the following inputs
```shell
Project name: hello-world-brickflow
Path from repo root to project root (optional) [.]: .
Path from project root to workflows dir: workflows
Git https url: https://github.com/Nike-Inc/brickflow.git
Brickflow version [auto]:<hit enter>
Spark expectations version [0.5.0]: 0.8.0
Skip entrypoint [y/N]: N
```
_Note: You can provide your own GitHub repo URL._
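
The scaffold created by `brickflow projects add` should look roughly like this (inferred from the files this commit adds for the examples project; exact names depend on your inputs):

```
hello-world-brickflow/
├── .brickflow-project-root.yml
├── brickflow-multi-project.yml
└── workflows/
    └── entrypoint.py
```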

3. Create a new file `hello_world_wf.py` in the `workflows` directory
```shell
touch workflows/hello_world_wf.py
```

4. Copy the following code into the `hello_world_wf.py` file
```python
from brickflow import (
    Cluster,
    Workflow,
    NotebookTask,
)
from brickflow.context import ctx
from airflow.operators.bash import BashOperator


cluster = Cluster(
    name="job_cluster",
    node_type_id="m6gd.xlarge",
    spark_version="13.3.x-scala2.12",
    min_workers=1,
    max_workers=2,
)

wf = Workflow(
    "hello_world_workflow",
    default_cluster=cluster,
    tags={
        "product_id": "brickflow_demo",
    },
    common_task_parameters={
        "catalog": "<uc-catalog-name>",
        "database": "<uc-schema-name>",
    },
)


@wf.task
# this task does nothing but demonstrates the use of the context object
def start():
    print(f"Environment: {ctx.env}")


@wf.notebook_task
# this task runs a Databricks notebook
def example_notebook():
    return NotebookTask(
        notebook_path="notebooks/example_notebook.py",
        base_parameters={
            "some_parameter": "some_value",  # in the notebook, access this via dbutils.widgets.get("some_parameter")
        },
    )


@wf.task(depends_on=[start, example_notebook])
# this task runs a bash command
def list_lending_club_data_files():
    return BashOperator(
        task_id=list_lending_club_data_files.__name__,
        bash_command="ls -lrt /dbfs/databricks-datasets/samples/lending_club/parquet/",
    )


@wf.task(depends_on=list_lending_club_data_files)
# this task runs PySpark code
def lending_data_ingest():
    ctx.spark.sql(
        f"""
        CREATE TABLE IF NOT EXISTS
        {ctx.dbutils_widget_get_or_else(key="catalog", debug="development")}.\
{ctx.dbutils_widget_get_or_else(key="database", debug="dummy_database")}.\
{ctx.dbutils_widget_get_or_else(key="brickflow_env", debug="local")}_lending_data_ingest
        USING DELTA -- DELTA is the default format; stated here for clarity
        SELECT * FROM parquet.`dbfs:/databricks-datasets/samples/lending_club/parquet/`
        """
    )
```
_Note: Modify the values of catalog/database in `common_task_parameters`._


5. Create a new file `example_notebook.py` in the `notebooks` directory
```shell
mkdir notebooks
touch notebooks/example_notebook.py
```
6. Copy the following code into the `example_notebook.py` file
```python
# Databricks notebook source

print("hello world")
```
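
If you later want the notebook to consume the `some_parameter` value passed via `base_parameters`, a minimal sketch using Databricks widgets (the parameter name is taken from the workflow above) would be:

```python
# Databricks notebook source

# read the task parameter passed in through the NotebookTask's base_parameters
some_parameter = dbutils.widgets.get("some_parameter")
print(f"hello world, some_parameter={some_parameter}")
```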

#### Deploy the workflow to Databricks
```shell
brickflow projects deploy --project hello-world-brickflow -e local
```
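
The `-e` flag selects the target environment. Deploying the same project to another environment, assuming you have one named `dev` configured, would follow the same pattern:

```shell
brickflow projects deploy --project hello-world-brickflow -e dev
```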

#### Run the workflow
1. Log in to the Databricks workspace
2. Go to Workflows and select the workflow
![img.png](docs/img/workflow.png)
3. Click the Run button

### Examples
Refer to [brickflow_examples](https://github.com/Nike-Inc/brickflow/tree/main/examples/brickflow_examples) for more examples.

### Contributors

Thanks to all the [contributors](https://github.com/Nike-Inc/brickflow/blob/main/CONTRIBUTORS.md) who have helped ideate, develop, and bring Brickflow to its current state.
Binary file added docs/img/workflow.png
10 changes: 10 additions & 0 deletions examples/brickflow_examples/.brickflow-project-root.yml
@@ -0,0 +1,10 @@
# DO NOT MODIFY THIS FILE - IT IS AUTO GENERATED BY BRICKFLOW AND RESERVED FOR FUTURE USAGE
projects:
brickflow-demo:
brickflow_version: auto
deployment_mode: bundle
enable_plugins: true
name: brickflow-demo
path_from_repo_root_to_project_root: .
path_project_root_to_workflows_dir: workflows
version: v1
122 changes: 122 additions & 0 deletions examples/brickflow_examples/README.md
@@ -1,2 +1,124 @@
# brickflow-examples
This repository contains examples for Brickflow.

## Getting Started

### Prerequisites
1. Install brickflows

```shell
pip install brickflows
```

2. Install [Databricks CLI](https://docs.databricks.com/en/dev-tools/cli/databricks-cli.html)

```shell
curl -fsSL https://raw.githubusercontent.com/databricks/setup-cli/main/install.sh | sudo sh
```

3. Configure the Databricks CLI with a workspace token. This writes to your `~/.databrickscfg` file.

```shell
databricks configure --token
```

### Clone the repository

```shell
git clone https://github.com/Nike-Inc/brickflow.git
cd brickflow/examples/brickflow_examples
```

### Hello World workflow
- Create your first workflow using brickflow
- Create a new file `hello_world_workflow.py` in the `workflows` directory
- Add the following code to the file
```python
from brickflow import (
    Cluster,
    Workflow,
    NotebookTask,
)
from brickflow.context import ctx
from airflow.operators.bash import BashOperator


cluster = Cluster(
    name="job_cluster",
    node_type_id="m6gd.xlarge",
    spark_version="13.3.x-scala2.12",
    min_workers=1,
    max_workers=2,
)

wf = Workflow(
    "hello_world_workflow",
    default_cluster=cluster,
    tags={
        "product_id": "brickflow_demo",
    },
    common_task_parameters={
        "catalog": "<uc-catalog-name>",
        "database": "<uc-schema-name>",
    },
)


@wf.task
# this task does nothing but demonstrates the use of the context object
def start():
    print(f"Environment: {ctx.env}")


@wf.notebook_task
# this task runs a Databricks notebook
def example_notebook():
    return NotebookTask(
        notebook_path="notebooks/example_notebook.py",
        base_parameters={
            "some_parameter": "some_value",  # in the notebook, access this via dbutils.widgets.get("some_parameter")
        },
    )


@wf.task(depends_on=[start, example_notebook])
# this task runs a bash command
def list_lending_club_data_files():
    return BashOperator(
        task_id=list_lending_club_data_files.__name__,
        bash_command="ls -lrt /dbfs/databricks-datasets/samples/lending_club/parquet/",
    )


@wf.task(depends_on=list_lending_club_data_files)
# this task runs PySpark code
def lending_data_ingest():
    ctx.spark.sql(
        f"""
        CREATE TABLE IF NOT EXISTS
        {ctx.dbutils_widget_get_or_else(key="catalog", debug="development")}.\
{ctx.dbutils_widget_get_or_else(key="database", debug="dummy_database")}.\
{ctx.dbutils_widget_get_or_else(key="brickflow_env", debug="local")}_lending_data_ingest
        USING DELTA -- DELTA is the default format; stated here for clarity
        SELECT * FROM parquet.`dbfs:/databricks-datasets/samples/lending_club/parquet/`
        """
    )
```
_Note: Modify the values of catalog/database in `common_task_parameters`._

### Update demo_wf.py
- `demo_wf.py` demonstrates the various task types and the options available for them
- You can remove `demo_wf.py` if you just want to run `hello_world_workflow.py`
- If you want to run `demo_wf.py`, update the parameters below with your own values (see the sketch after this list)
    - `default_cluster`
    - `common_task_parameters`
    - `permissions`
    - `default_task_settings`
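
For instance, pointing `default_cluster` at an existing all-purpose cluster might look like this sketch (`<all-purpose-cluster-id>` is a placeholder for the cluster id from your Databricks UI):

```python
from brickflow import Cluster, Workflow

wf = Workflow(
    "brickflow-demo",
    # replace the placeholder with your own all-purpose cluster id
    default_cluster=Cluster.from_existing_cluster("<all-purpose-cluster-id>"),
)
```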

### Deploy the workflow to Databricks
```shell
brickflow projects deploy --project brickflow-demo -e local
```

### Run the demo workflow
- Log in to the Databricks workspace
- Go to Workflows and select the workflow
![img.png](../../docs/img/workflow.png)
- Click the Run button
4 changes: 4 additions & 0 deletions examples/brickflow_examples/brickflow-multi-project.yml
@@ -0,0 +1,4 @@
project_roots:
brickflow-demo:
root_yaml_rel_path: .
version: v1
10 changes: 6 additions & 4 deletions examples/brickflow_examples/workflows/demo_wf.py
@@ -1,4 +1,3 @@
-import resolver
 from datetime import timedelta
 
 from airflow.operators.bash import BashOperator
@@ -17,22 +16,25 @@
 
 wf = Workflow(
     "brickflow-demo",
-    default_cluster=Cluster.from_existing_cluster("YOUR_CLUSTER_ID"),
+    # replace <all-purpose-cluster-id> with your cluster id
+    default_cluster=Cluster.from_existing_cluster("<all-purpose-cluster-id>"),
     # Optional parameters below
     schedule_quartz_expression="0 0/20 0 ? * * *",
     tags={
         "product_id": "brickflow_demo",
         "slack_channel": "YOUR_SLACK_CHANNEL",
     },
     common_task_parameters={
-        "catalog": "development",
-        "database": "your_database",
+        "catalog": "<unity-catalog-name>",
+        "database": "<unity-schema-name>",
     },
+    # replace <emails> with existing users' email on databricks
     permissions=WorkflowPermissions(
         can_manage_run=[User("[email protected]"), User("[email protected]")],
         can_view=[User("[email protected]")],
         can_manage=[User("[email protected]")],
     ),
+    # replace <emails> with existing users' email on databricks
     default_task_settings=TaskSettings(
         email_notifications=EmailNotifications(
             on_start=["[email protected]"],
13 changes: 4 additions & 9 deletions examples/brickflow_examples/workflows/entrypoint.py
@@ -1,24 +1,19 @@
 # Databricks notebook source
 
-import resolver
+import brickflow
+from brickflow import Project, PypiTaskLibrary
+import workflows
 
-from brickflow import Project, PypiTaskLibrary, MavenTaskLibrary
-
-ARTIFACTORY = ""
 
 
 def main() -> None:
     """Project entrypoint"""
     with Project(
         "brickflow-demo",
         git_repo="https://github.com/Nike-Inc/brickflow",
         provider="github",
         libraries=[
             PypiTaskLibrary(
-                package="brickflow==1.0.0 --extra-index-url " + ARTIFACTORY
-            ),
-            MavenTaskLibrary(coordinates="com.cronutils:cron-utils:9.2.0"),
+                package="spark-expectations==0.8.0"
+            ),  # comment if spark-expectations is not needed
         ],
     ) as f:
         f.add_pkg(workflows)
5 changes: 0 additions & 5 deletions examples/brickflow_examples/workflows/resolver.py

This file was deleted.
