From 9888c01868a45fd068ca8f6a920e4924970692ec Mon Sep 17 00:00:00 2001
From: xuwenyihust
Date: Fri, 26 Apr 2024 22:18:17 +0800
Subject: [PATCH] Refactor GCP.md and README.md files, and update Docker
 configuration and startup script

---
 GCP.md                             | 39 +++++++++++++++
 README.md                          | 76 +-----------------------------
 docker/jupyter-notebook/startup.py |  2 +-
 3 files changed, 42 insertions(+), 75 deletions(-)
 create mode 100644 GCP.md

diff --git a/GCP.md b/GCP.md
new file mode 100644
index 0000000..123adbf
--- /dev/null
+++ b/GCP.md
@@ -0,0 +1,39 @@
+## Quickstart
+### Notebook
+#### Step 1: Set Up Configuration
+```bash
+cp bin/env_template.yaml bin/env.yaml
+```
+Fill in `env.yaml` with your own configuration values (a hypothetical sketch of the layout appears after the patch).
+
+#### Step 2: Create a Kubernetes Cluster on GCP
+```bash
+source bin/setup.sh
+```
+
+#### Step 3: Create a Jupyter Notebook
+The setup creates a `notebook` service on the Kubernetes cluster (verification commands are sketched after the patch).
+
+#### Step 4: Check Spark Integration
+Check the Spark session information by running the following code in a notebook cell:
+```python
+start()
+```
+
+#### Step 5: Check the Spark UI
+Open the Spark UI by clicking the link in the notebook cell output.
+
+## Docker Images
+- [all-spark-notebook](https://hub.docker.com/repository/docker/wenyixu101/all-spark-notebook/general)
+  - Based on jupyter/all-spark-notebook:spark-3.5.0
+  - Includes the Google Cloud SDK and the GCS connector
+  - Includes a PySpark startup script
+  - Includes a notebook save hook that uploads saved notebooks to GCS (a hypothetical sketch follows the patch)
+
+- [spark-history-server](https://hub.docker.com/repository/docker/wenyixu101/spark-history-server)
+  - Based on apache/spark:3.5.0
+  - Includes the GCS connector
\ No newline at end of file
diff --git a/README.md b/README.md
index 8fa432f..9a05e38 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,5 @@
 <div align="center">

DataPulse: Platform For Big Data & AI

-  [badge: GitHub Workflow Status (with event)]
+  [badge: GitHub Actions Workflow Status]
   [badge: GitHub Release]
@@ -16,76 +10,10 @@
   [badge: GitHub License]

-  [link: Documentation]

-
-
-## Features
-- Spark Application Deployment
-  - Jar Application Submission
-  - PySpark Application Submission
-  - Jupyter Notebook
-  - Customized Integration with PySpark
-- Monitoring
-  - Spark UI
-  - History Server
-
-## Supported Versions
-- Apache Spark: 3.5.0
-- Scala: 2.12
-- Python: 3.11
-- GCS Connector: hadoop3-2.2.0
-
-## Prerequisites
-- GCP account
-  - Kubernetes Engine
-  - Cloud Storage
-- gcloud SDK
-- kubectl
-- helm
-- docker
-- python3
-
-## Quickstart
-### Notebook
-#### Step1: Setup Configuration
-```bash
-cp bin/env_template.yaml bin/env.yaml
-```
-Fill in the `env.yaml` file with your own configurations.
-
-#### Step2: Create a Kubernetes cluster on GCP
-```bash
-source bin/setup.sh
-```
-
-#### Step3: Create a Jupyter Notebook
-A service `notebook` will be created on the Kubernetes cluster.
-
-#### Step4: Check Spark Integration
-![Alt text]()
-
-Check Spark information by running the following code in a notebook cell:
-```python
-start()
-```
-
-#### Step5: Check Spark UI
-![Alt text]()
-Check Spark UI by clicking the link in the notebook cell output.
+## Summary
+DataPulse is a platform for big data and AI, built on Apache Spark and Kubernetes. It is designed to be scalable and easy to use, and it provides tools for data processing, machine learning, and data visualization.

-## Docker Image
-- [all-spark-notebook](https://hub.docker.com/repository/docker/wenyixu101/all-spark-notebook/general)
-  - Based on jupyter/all-spark-notebook:spark-3.5.0
-  - Include Google Cloud SDK and GCS connector
-  - Include pyspark startup script
-  - Include notebook save hook function to save notebook to GCS
-
-- [spark-history-server](https://hub.docker.com/repository/docker/wenyixu101/spark-history-server)
-  - Based on apache/spark:3.5.0
-  - Include GCS connector
 ## License
 This project is licensed under the terms of the MIT license.

diff --git a/docker/jupyter-notebook/startup.py b/docker/jupyter-notebook/startup.py
index 70a87f0..7cd6ab2 100644
--- a/docker/jupyter-notebook/startup.py
+++ b/docker/jupyter-notebook/startup.py
@@ -88,7 +88,7 @@ def display_msg():
 
 def create_spark_dev():
     spark = SparkSession.builder \
         .appName("PySpark Example") \
-        .master("local[*]") \
+        .master("spark://spark-master:7077") \
         .getOrCreate()
     return spark
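---
## Appendix: Illustrative Sketches
The sketches below are editorial illustrations, not part of the commit. Quickstart Step 1 copies `bin/env_template.yaml` to `bin/env.yaml`; the template's real schema is not shown in the patch, so every field name in this hypothetical layout is an assumption:

```yaml
# Hypothetical sketch of bin/env.yaml -- every key below is an assumed
# placeholder; copy bin/env_template.yaml for the real schema.
project:
  id: my-gcp-project           # GCP project hosting the resources
  region: us-central1          # region for the GKE cluster and GCS bucket
cluster:
  name: datapulse-cluster      # Kubernetes cluster created by bin/setup.sh
  node_count: 3                # size of the default node pool
storage:
  bucket: my-datapulse-bucket  # GCS bucket for notebooks and Spark event logs
```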
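For Step 3, the service name `notebook` comes from the patch, but the namespace, label selector, and port below are assumptions; a generic verification flow with kubectl might look like this:

```bash
# Confirm the notebook service exists (default namespace is assumed).
kubectl get svc notebook

# Watch its pods come up; app=notebook is an assumed label selector.
kubectl get pods -l app=notebook

# Tunnel to Jupyter locally; 8888 is Jupyter's conventional port.
kubectl port-forward svc/notebook 8888:8888
```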
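Step 4 calls a `start()` helper provided by the image's PySpark startup script. Its real body is not shown in this patch, so the following is only a minimal sketch modeled on the `create_spark_dev()` function visible in the startup.py hunk, with two info lines added for the notebook output:

```python
from pyspark.sql import SparkSession

def start():
    """Minimal sketch of the startup helper; modeled on create_spark_dev()."""
    spark = SparkSession.builder \
        .appName("PySpark Example") \
        .master("spark://spark-master:7077") \
        .getOrCreate()
    # Surface session details in the cell output, including the UI link
    # that Step 5 asks the reader to click.
    print(f"Spark version: {spark.version}")
    print(f"Spark UI:      {spark.sparkContext.uiWebUrl}")
    return spark
```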
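The all-spark-notebook image is described as shipping a save hook that uploads notebooks to GCS. The hook itself is not in this patch, so this sketch only shows the shape such a hook usually takes, using Jupyter's documented `post_save_hook` extension point and the `google-cloud-storage` client; the bucket name is a placeholder:

```python
import os
from google.cloud import storage  # pip install google-cloud-storage

def post_save_hook(model, os_path, contents_manager):
    """Upload every saved notebook to GCS (hypothetical sketch)."""
    if model["type"] != "notebook":
        return  # ignore plain files and directories
    bucket = storage.Client().bucket("my-datapulse-bucket")  # assumed name
    bucket.blob(os.path.basename(os_path)).upload_from_filename(os_path)

# Enabled in the Jupyter config, e.g. in jupyter_server_config.py:
# c.FileContentsManager.post_save_hook = post_save_hook
```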
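Finally, the startup.py change swaps the embedded `local[*]` mode for the in-cluster standalone master, which assumes a `spark-master` service is resolvable at port 7077. A generic smoke test for that connection (not part of the repo) could be:

```python
from pyspark.sql import SparkSession

# Build a session against the standalone master named in the patch.
spark = SparkSession.builder \
    .appName("Connectivity Check") \
    .master("spark://spark-master:7077") \
    .getOrCreate()

# Running a trivial job proves the master actually granted executors.
assert spark.sparkContext.parallelize(range(100)).sum() == 4950
spark.stop()
```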