From 9888c01868a45fd068ca8f6a920e4924970692ec Mon Sep 17 00:00:00 2001
From: xuwenyihust
Date: Fri, 26 Apr 2024 22:18:17 +0800
Subject: [PATCH] Refactor GCP.md and README.md files, and update Docker
 configuration and startup script

---
 GCP.md                             | 39 +++++++++++++++
 README.md                          | 76 +-----------------------------
 docker/jupyter-notebook/startup.py |  2 +-
 3 files changed, 42 insertions(+), 75 deletions(-)
 create mode 100644 GCP.md

diff --git a/GCP.md b/GCP.md
new file mode 100644
index 0000000..123adbf
--- /dev/null
+++ b/GCP.md
@@ -0,0 +1,39 @@
+## Quickstart
+### Notebook
+#### Step 1: Set Up Configuration
+```bash
+cp bin/env_template.yaml bin/env.yaml
+```
+Fill in `env.yaml` with your own configuration values (a hypothetical sketch of the layout appears after the patch).
+
+#### Step 2: Create a Kubernetes Cluster on GCP
+```bash
+source bin/setup.sh
+```
+
+#### Step 3: Create a Jupyter Notebook
+The setup creates a `notebook` service on the Kubernetes cluster (verification commands are sketched after the patch).
+
+#### Step 4: Check Spark Integration
+Check the Spark session information by running the following code in a notebook cell:
+```python
+start()
+```
+
+#### Step 5: Check the Spark UI
+Open the Spark UI by clicking the link in the notebook cell output.
+
+## Docker Images
+- [all-spark-notebook](https://hub.docker.com/repository/docker/wenyixu101/all-spark-notebook/general)
+  - Based on jupyter/all-spark-notebook:spark-3.5.0
+  - Includes the Google Cloud SDK and the GCS connector
+  - Includes a PySpark startup script
+  - Includes a notebook save hook that uploads saved notebooks to GCS (a hypothetical sketch follows the patch)
+
+- [spark-history-server](https://hub.docker.com/repository/docker/wenyixu101/spark-history-server)
+  - Based on apache/spark:3.5.0
+  - Includes the GCS connector
\ No newline at end of file
diff --git a/README.md b/README.md
index 8fa432f..9a05e38 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,5 @@
 <div align="center">

DataPulse: Platform For Big Data & AI

-  [badge: GitHub Workflow Status (with event)]
+  [badge: GitHub Actions Workflow Status]
   [badge: GitHub Release]
@@ -16,76 +10,10 @@
   [badge: GitHub License]

-  [link: Documentation]

-
-
-## Features
-- Spark Application Deployment
-  - Jar Application Submission
-  - PySpark Application Submission
-  - Jupyter Notebook
-  - Customized Integration with PySpark
-- Monitoring
-  - Spark UI
-  - History Server
-
-## Supported Versions
-- Apache Spark: 3.5.0
-- Scala: 2.12
-- Python: 3.11
-- GCS Connector: hadoop3-2.2.0
-
-## Prerequisites
-- GCP account
-  - Kubernetes Engine
-  - Cloud Storage
-- gcloud SDK
-- kubectl
-- helm
-- docker
-- python3
-
-## Quickstart
-### Notebook
-#### Step1: Setup Configuration
-```bash
-cp bin/env_template.yaml bin/env.yaml
-```
-Fill in the `env.yaml` file with your own configurations.
-
-#### Step2: Create a Kubernetes cluster on GCP
-```bash
-source bin/setup.sh
-```
-
-#### Step3: Create a Jupyter Notebook
-A service `notebook` will be created on the Kubernetes cluster.
-
-#### Step4: Check Spark Integration
-![Alt text]()
-
-Check Spark information by running the following code in a notebook cell:
-```python
-start()
-```
-
-#### Step5: Check Spark UI
-![Alt text]()
-Check Spark UI by clicking the link in the notebook cell output.
+## Summary
+DataPulse is a platform for big data and AI, built on Apache Spark and Kubernetes. It is designed to be scalable and easy to use, and it provides tools for data processing, machine learning, and data visualization.

-## Docker Image
-- [all-spark-notebook](https://hub.docker.com/repository/docker/wenyixu101/all-spark-notebook/general)
-  - Based on jupyter/all-spark-notebook:spark-3.5.0
-  - Include Google Cloud SDK and GCS connector
-  - Include pyspark startup script
-  - Include notebook save hook function to save notebook to GCS
-
-- [spark-history-server](https://hub.docker.com/repository/docker/wenyixu101/spark-history-server)
-  - Based on apache/spark:3.5.0
-  - Include GCS connector
 ## License
 This project is licensed under the terms of the MIT license.

diff --git a/docker/jupyter-notebook/startup.py b/docker/jupyter-notebook/startup.py
index 70a87f0..7cd6ab2 100644
--- a/docker/jupyter-notebook/startup.py
+++ b/docker/jupyter-notebook/startup.py
@@ -88,7 +88,7 @@ def display_msg():
 
 def create_spark_dev():
     spark = SparkSession.builder \
         .appName("PySpark Example") \
-        .master("local[*]") \
+        .master("spark://spark-master:7077") \
         .getOrCreate()
     return spark
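---
## Appendix: Illustrative Sketches
The sketches below are editorial illustrations, not part of the commit. Quickstart Step 1 copies `bin/env_template.yaml` to `bin/env.yaml`; the template's real schema is not shown in the patch, so every field name in this hypothetical layout is an assumption:

```yaml
# Hypothetical sketch of bin/env.yaml -- every key below is an assumed
# placeholder; copy bin/env_template.yaml for the real schema.
project:
  id: my-gcp-project           # GCP project hosting the resources
  region: us-central1          # region for the GKE cluster and GCS bucket
cluster:
  name: datapulse-cluster      # Kubernetes cluster created by bin/setup.sh
  node_count: 3                # size of the default node pool
storage:
  bucket: my-datapulse-bucket  # GCS bucket for notebooks and Spark event logs
```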
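For Step 3, the service name `notebook` comes from the patch, but the namespace, label selector, and port below are assumptions; a generic verification flow with kubectl might look like this:

```bash
# Confirm the notebook service exists (default namespace is assumed).
kubectl get svc notebook

# Watch its pods come up; app=notebook is an assumed label selector.
kubectl get pods -l app=notebook

# Tunnel to Jupyter locally; 8888 is Jupyter's conventional port.
kubectl port-forward svc/notebook 8888:8888
```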
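Step 4 calls a `start()` helper provided by the image's PySpark startup script. Its real body is not shown in this patch, so the following is only a minimal sketch modeled on the `create_spark_dev()` function visible in the startup.py hunk, with two info lines added for the notebook output:

```python
from pyspark.sql import SparkSession

def start():
    """Minimal sketch of the startup helper; modeled on create_spark_dev()."""
    spark = SparkSession.builder \
        .appName("PySpark Example") \
        .master("spark://spark-master:7077") \
        .getOrCreate()
    # Surface session details in the cell output, including the UI link
    # that Step 5 asks the reader to click.
    print(f"Spark version: {spark.version}")
    print(f"Spark UI:      {spark.sparkContext.uiWebUrl}")
    return spark
```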
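The all-spark-notebook image is described as shipping a save hook that uploads notebooks to GCS. The hook itself is not in this patch, so this sketch only shows the shape such a hook usually takes, using Jupyter's documented `post_save_hook` extension point and the `google-cloud-storage` client; the bucket name is a placeholder:

```python
import os
from google.cloud import storage  # pip install google-cloud-storage

def post_save_hook(model, os_path, contents_manager):
    """Upload every saved notebook to GCS (hypothetical sketch)."""
    if model["type"] != "notebook":
        return  # ignore plain files and directories
    bucket = storage.Client().bucket("my-datapulse-bucket")  # assumed name
    bucket.blob(os.path.basename(os_path)).upload_from_filename(os_path)

# Enabled in the Jupyter config, e.g. in jupyter_server_config.py:
# c.FileContentsManager.post_save_hook = post_save_hook
```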
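Finally, the startup.py change swaps the embedded `local[*]` mode for the in-cluster standalone master, which assumes a `spark-master` service is resolvable at port 7077. A generic smoke test for that connection (not part of the repo) could be:

```python
from pyspark.sql import SparkSession

# Build a session against the standalone master named in the patch.
spark = SparkSession.builder \
    .appName("Connectivity Check") \
    .master("spark://spark-master:7077") \
    .getOrCreate()

# Running a trivial job proves the master actually granted executors.
assert spark.sparkContext.parallelize(range(100)).sum() == 4950
spark.stop()
```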