-
Notifications
You must be signed in to change notification settings - Fork 24
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Create colab notebook demonstrating loading data into a custom DC. (#287
- Loading branch information
Showing
1 changed file
with
324 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,324 @@ | ||
{ | ||
"nbformat": 4, | ||
"nbformat_minor": 0, | ||
"metadata": { | ||
"colab": { | ||
"provenance": [], | ||
"mount_file_id": "1-mn6074ewnVaAekb0_WXtns6IdNK3HGu", | ||
"authorship_tag": "ABX9TyOVQ2i3HxmNED6HbFpIulxM", | ||
"include_colab_link": true | ||
}, | ||
"kernelspec": { | ||
"name": "python3", | ||
"display_name": "Python 3" | ||
}, | ||
"language_info": { | ||
"name": "python" | ||
} | ||
}, | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": { | ||
"id": "view-in-github", | ||
"colab_type": "text" | ||
}, | ||
"source": [ | ||
"<a href=\"https://colab.research.google.com/github/datacommonsorg/tools/blob/master/notebooks/Your_Data_Commons_Load_Data_Workflow.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"Copyright 2024 Google LLC.\n", | ||
"SPDX-License-Identifier: Apache-2.0" | ||
], | ||
"metadata": { | ||
"id": "j0scfBZnWZTL" | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"id": "Y8Tm9pY0iYiZ" | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"# Install dependencies\n", | ||
"\n", | ||
"%%capture\n", | ||
"!pip install google-cloud-storage\n", | ||
"!pip install google-cloud-run" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"# User parameters\n", | ||
"\n", | ||
"# Replace the values in this section with yours.\n", | ||
"# Running without replacement can lead to errors since you won't have permissions to those resources.\n", | ||
"\n", | ||
"# Path to your local folder with files to be uploaded to GCS.\n", | ||
"# Note that colab works best with Google drive, so upload your data to a drive and use that path here.\n", | ||
"# This is only needed when working within colab. With standalone scripts, you can obviously use local folder paths.\n", | ||
"google_drive_folder_path = '/content/drive/MyDrive/demo-data'\n", | ||
"\n", | ||
"# Your GCP project ID.\n", | ||
"# This notebook uses this project ID for all GCP resources - GCS, Cloud SQL, Cloud Run.\n", | ||
"# If your resources are spread across different projects, you will need to specify different project IDs accordingly.\n", | ||
"gcp_project_id = 'datcom-website-dev'\n", | ||
"\n", | ||
"# The GCS bucket where your data will be uploaded.\n", | ||
"gcs_bucket_name = 'customdc-data'\n", | ||
"# The folder under the GCP bucket where your data will be uploaded.\n", | ||
"gcs_folder_path = 'load-data-workflow-demo'\n", | ||
"\n", | ||
"# The name of your Cloud SQL DB where your data will be imported.\n", | ||
"# Note that DB names can only have alpha-numeric characters.\n", | ||
"# Otherwise it fails with obscure messages.\n", | ||
"cloud_sql_db_name = 'demodb'\n", | ||
"\n", | ||
"# The region where your cloud run jobs and services are deployed.\n", | ||
"cloud_run_region = 'us-central1'\n", | ||
"\n", | ||
"# Cloud Run resources.\n", | ||
"# The script does not create a new job or service but runs and deploys existing ones respectively.\n", | ||
"# You can programmatically creates them as well but will need to use different APIs to do so.\n", | ||
"\n", | ||
"# Name of the cloud run job used for loading data.\n", | ||
"load_data_cloud_run_job_name = 'demo-job'\n", | ||
"# Name of the cloud run service for your datacommons instance.\n", | ||
"datacommons_cloud_run_service_name = 'demo-service'\n", | ||
"# The name of the docker image that will be deployed to the cloud run service.\n", | ||
"datacommons_service_image = 'gcr.io/datcom-ci/datacommons-services:latest'\n", | ||
"\n", | ||
"full_gcs_path = f'gs://{gcs_bucket_name}/{gcs_folder_path}'\n", | ||
"full_gcs_path" | ||
], | ||
"metadata": { | ||
"colab": { | ||
"base_uri": "https://localhost:8080/", | ||
"height": 35 | ||
}, | ||
"id": "34H15wBAi5G3", | ||
"outputId": "3a87c029-2351-459a-d60d-fbfc0d644e97" | ||
}, | ||
"execution_count": null, | ||
"outputs": [ | ||
{ | ||
"output_type": "execute_result", | ||
"data": { | ||
"text/plain": [ | ||
"'gs://customdc-data/load-data-workflow-demo'" | ||
], | ||
"application/vnd.google.colaboratory.intrinsic+json": { | ||
"type": "string" | ||
} | ||
}, | ||
"metadata": {}, | ||
"execution_count": 21 | ||
} | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"# Imports\n", | ||
"\n", | ||
"import os\n", | ||
"import glob\n", | ||
"from datetime import datetime\n", | ||
"from google.colab import auth\n", | ||
"from google.cloud import storage\n", | ||
"from google.cloud import run_v2\n", | ||
"from google.colab import drive" | ||
], | ||
"metadata": { | ||
"id": "Hcz3Nhg1lKcn" | ||
}, | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"# Authenticate user, mount google drive and instantiate GCP objects\n", | ||
"auth.authenticate_user()\n", | ||
"\n", | ||
"drive.mount('/content/drive')\n", | ||
"\n", | ||
"gcs_client = storage.Client(project=gcp_project_id)\n", | ||
"gcs_bucket = gcs_client.bucket(gcs_bucket_name)" | ||
], | ||
"metadata": { | ||
"id": "-FvqSgDdlOTo", | ||
"colab": { | ||
"base_uri": "https://localhost:8080/" | ||
}, | ||
"outputId": "4b5d2b0c-d116-48f1-b809-966d08c6d5d8" | ||
}, | ||
"execution_count": null, | ||
"outputs": [ | ||
{ | ||
"output_type": "stream", | ||
"name": "stdout", | ||
"text": [ | ||
"Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n" | ||
] | ||
} | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"# Workflow functions\n", | ||
"\n", | ||
"def upload_local_directory_to_gcs():\n", | ||
" assert os.path.isdir(google_drive_folder_path), f'Not a directory: {google_drive_folder_path}'\n", | ||
" for local_file in glob.glob(google_drive_folder_path + '/**'):\n", | ||
" if os.path.isfile(local_file):\n", | ||
" remote_path = os.path.join(gcs_folder_path, local_file[1 + len(google_drive_folder_path):])\n", | ||
" blob = gcs_bucket.blob(remote_path)\n", | ||
" blob.upload_from_filename(local_file)\n", | ||
" print(f'Uploaded {local_file} to gs://{gcs_bucket_name}/{remote_path}')\n", | ||
"\n", | ||
"def run_load_data_job():\n", | ||
" client = run_v2.JobsClient()\n", | ||
"\n", | ||
" job_path = f'projects/{gcp_project_id}/locations/{cloud_run_region}/jobs/{load_data_cloud_run_job_name}'\n", | ||
"\n", | ||
" env_vars = [\n", | ||
" run_v2.EnvVar(name='RUN_TIMESTAMP', value=str(datetime.now())),\n", | ||
" run_v2.EnvVar(name='INPUT_DIR', value=full_gcs_path),\n", | ||
" run_v2.EnvVar(name='OUTPUT_DIR', value=full_gcs_path),\n", | ||
" run_v2.EnvVar(name='DB_NAME', value=cloud_sql_db_name)\n", | ||
" ]\n", | ||
"\n", | ||
" request = run_v2.RunJobRequest(\n", | ||
" name=job_path,\n", | ||
" etag='*', # Use * for latest revision\n", | ||
" validate_only=False,\n", | ||
" overrides=run_v2.RunJobRequest.Overrides(\n", | ||
" task_count=1,\n", | ||
" container_overrides=[\n", | ||
" run_v2.RunJobRequest.Overrides.ContainerOverride(env=env_vars)\n", | ||
" ],\n", | ||
" ),\n", | ||
" )\n", | ||
"\n", | ||
" operation = client.run_job(request=request)\n", | ||
"\n", | ||
" logs_url = f'https://console.cloud.google.com/run/jobs/details/{cloud_run_region}/{load_data_cloud_run_job_name}/logs?project={gcp_project_id}'\n", | ||
"\n", | ||
" print('Waiting for load data job to complete. This will take a few minutes...')\n", | ||
" print(f'You can monitor the logs at: {logs_url}')\n", | ||
" response = operation.result()\n", | ||
" print(f'Load data job completed: {response.name}')\n", | ||
"\n", | ||
"def deploy_datacommons_service():\n", | ||
" client = run_v2.ServicesClient()\n", | ||
"\n", | ||
" service_path = f'projects/{gcp_project_id}/locations/{cloud_run_region}/services/{datacommons_cloud_run_service_name}'\n", | ||
"\n", | ||
" env = {\n", | ||
" 'DEPLOY_TIMESTAMP': str(datetime.now()),\n", | ||
" 'INPUT_DIR': full_gcs_path,\n", | ||
" 'OUTPUT_DIR': full_gcs_path,\n", | ||
" 'DB_NAME': cloud_sql_db_name\n", | ||
" }\n", | ||
"\n", | ||
" service = client.get_service(name=service_path)\n", | ||
" container = service.template.containers[0]\n", | ||
" env_vars = []\n", | ||
"\n", | ||
" for var_name, var_value in env.items():\n", | ||
" env_vars.append(run_v2.EnvVar(name=var_name, value=var_value))\n", | ||
"\n", | ||
" for env_var in container.env:\n", | ||
" var_name = env_var.name\n", | ||
" if var_name not in env:\n", | ||
" env_vars.append(env_var)\n", | ||
"\n", | ||
" container.env = env_vars\n", | ||
" service.template.containers = [container]\n", | ||
"\n", | ||
" request = run_v2.UpdateServiceRequest(\n", | ||
" service=service,\n", | ||
" validate_only=False,\n", | ||
" allow_missing=False,\n", | ||
" )\n", | ||
"\n", | ||
" operation = client.update_service(request=request)\n", | ||
"\n", | ||
" logs_url = f'https://console.cloud.google.com/run/detail/{cloud_run_region}/{datacommons_cloud_run_service_name}/logs?project={gcp_project_id}'\n", | ||
"\n", | ||
" print('Waiting for service to get deployed. This will take a few minutes...')\n", | ||
" print(f'You can monitor the logs at: {logs_url}')\n", | ||
" response = operation.result()\n", | ||
" print(f'Service deployed: {response.name}')\n", | ||
" print(f'Service URL: {response.urls[-1]}')" | ||
], | ||
"metadata": { | ||
"id": "GDUiNaWLmGEf" | ||
}, | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"# Execute workflow\n", | ||
"\n", | ||
"print(\"\\n==== STEP 1 of 3: Uploading data to GCS ====\\n\")\n", | ||
"upload_local_directory_to_gcs()\n", | ||
"\n", | ||
"print(\"\\n==== STEP 2 of 3: Loading data in your datacommons store ====\\n\")\n", | ||
"run_load_data_job()\n", | ||
"\n", | ||
"print(\"\\n==== STEP 3 of 3: Deploying your datacommons service ====\\n\")\n", | ||
"deploy_datacommons_service()\n", | ||
"\n", | ||
"print(\"\\n==== DONE ====\")" | ||
], | ||
"metadata": { | ||
"colab": { | ||
"base_uri": "https://localhost:8080/" | ||
}, | ||
"id": "agD4W8TfnHh5", | ||
"outputId": "3ad21d6b-e489-4533-8fc6-fd50006bdaf1" | ||
}, | ||
"execution_count": null, | ||
"outputs": [ | ||
{ | ||
"output_type": "stream", | ||
"name": "stdout", | ||
"text": [ | ||
"\n", | ||
"==== STEP 1 of 3: Uploading data to GCS ====\n", | ||
"\n", | ||
"Uploaded /content/drive/MyDrive/demo-data/gender_wage_gap.csv to gs://customdc-data/load-data-workflow-demo/gender_wage_gap.csv\n", | ||
"Uploaded /content/drive/MyDrive/demo-data/config.json to gs://customdc-data/load-data-workflow-demo/config.json\n", | ||
"Uploaded /content/drive/MyDrive/demo-data/average_annual_wage.csv to gs://customdc-data/load-data-workflow-demo/average_annual_wage.csv\n", | ||
"\n", | ||
"==== STEP 2 of 3: Loading data in your datacommons store ====\n", | ||
"\n", | ||
"Waiting for load data job to complete. This will take a few minutes...\n", | ||
"You can monitor the logs at: https://console.cloud.google.com/run/jobs/details/us-central1/demo-job/logs?project=datcom-website-dev\n", | ||
"Load data job completed: projects/datcom-website-dev/locations/us-central1/jobs/demo-job/executions/demo-job-2fhrs\n", | ||
"\n", | ||
"==== STEP 3 of 3: Deploying your datacommons service ====\n", | ||
"\n", | ||
"Waiting for service to get deployed. This will take a few minutes...\n", | ||
"You can monitor the logs at: https://console.cloud.google.com/run/detail/us-central1/demo-service/logs?project=datcom-website-dev\n", | ||
"Service deployed: projects/datcom-website-dev/locations/us-central1/services/demo-service\n", | ||
"Service URL: https://demo-service-kqb7thiuka-uc.a.run.app\n", | ||
"\n", | ||
"==== DONE ====\n" | ||
] | ||
} | ||
] | ||
} | ||
] | ||
} |