From c3aeaabee34a161133e5c599356b24c771dc346b Mon Sep 17 00:00:00 2001 From: Nilo Resende Date: Fri, 4 Oct 2024 18:09:49 +0000 Subject: [PATCH 01/13] update mysqltospanner run notebook script --- .../MySqlToSpanner_notebook.ipynb | 18 +++++++----------- .../MySqlToSpanner_parameterize_script.py | 10 ++++++++++ .../util/notebook_constants.py | 2 ++ notebooks/requirements.txt | 1 + 4 files changed, 20 insertions(+), 11 deletions(-) diff --git a/notebooks/mysql2spanner/MySqlToSpanner_notebook.ipynb b/notebooks/mysql2spanner/MySqlToSpanner_notebook.ipynb index 66d397801..aa2f9ba65 100644 --- a/notebooks/mysql2spanner/MySqlToSpanner_notebook.ipynb +++ b/notebooks/mysql2spanner/MySqlToSpanner_notebook.ipynb @@ -100,10 +100,7 @@ }, "outputs": [], "source": [ - "!pip3 install pymysql SQLAlchemy\n", - "!pip3 install --upgrade google-cloud-pipeline-components kfp --user -q\n", - "!pip3 install google-cloud-spanner\n", - "!pip3 install --upgrade google-cloud-storage" + "!pip3 install --upgrade google-cloud-storage google-cloud-aiplatform kfp google-cloud-pipeline-components pandas google-cloud-spanner pymysql SQLAlchemy" ] }, { @@ -115,9 +112,9 @@ }, "outputs": [], "source": [ - "!sudo apt-get update -y\n", - "!sudo apt-get install default-jdk -y\n", - "!sudo apt-get install maven -y" + "#!sudo apt-get update -y\n", + "#!sudo apt-get install default-jdk -y\n", + "#!sudo apt-get install maven -y" ] }, { @@ -399,7 +396,7 @@ " username=MYSQL_USERNAME,\n", " password=MYSQL_PASSWORD,\n", " database=MYSQL_DATABASE,\n", - " host=MYSQL_HOST,\n", + " host=\"127.0.0.1\",\n", " port=MYSQL_PORT\n", " )\n", ")\n", @@ -541,7 +538,7 @@ "metadata": {}, "outputs": [], "source": [ - "!wget https://downloads.mysql.com/archives/get/p/3/file/mysql-connector-java-8.0.29.tar.gz\n", + "!wget --backups=1 https://downloads.mysql.com/archives/get/p/3/file/mysql-connector-java-8.0.29.tar.gz\n", "!tar -xf mysql-connector-java-8.0.29.tar.gz\n", "!mvn clean spotless:apply install -DskipTests " ] @@ -658,9 +655,8 @@ " file_uris=FILE_URIS,\n", " subnetwork_uri=SUBNETWORK_URI,\n", " runtime_config_version=\"1.1\", # issue 665\n", - " service_account=DATAPROC_SERVICE_ACCOUNT,\n", " args=TEMPLATE_SPARK_ARGS\n", - " )\n", + " ) # add DATAPROC_SERVICE_ACCOUNT if needed\n", " time.sleep(1)\n", "\n", " compiler.Compiler().compile(pipeline_func=pipeline, package_path=\"pipeline.json\")\n", diff --git a/notebooks/mysql2spanner/MySqlToSpanner_parameterize_script.py b/notebooks/mysql2spanner/MySqlToSpanner_parameterize_script.py index b09fe7d37..a10aae9a0 100644 --- a/notebooks/mysql2spanner/MySqlToSpanner_parameterize_script.py +++ b/notebooks/mysql2spanner/MySqlToSpanner_parameterize_script.py @@ -112,6 +112,13 @@ def parse_args(args: Optional[Sequence[str]] = None) -> Dict[str, Any]: help='Provide table & PK column which do not have PK in MySQL table {"table_name":"primary_key"}', ) + parser.add_argument( + f"--{constants.MYSQL_READ_PARTITION_COLUMNS_ARG}", + dest=constants.MYSQL_READ_PARTITION_COLUMNS, + required=True, + help='Dictionary of custom read partition columns, e.g.: {"table2": "secondary_id"}', + ) + parser.add_argument( f"--{constants.MAX_PARALLELISM_ARG}", dest=constants.MAX_PARALLELISM, @@ -141,6 +148,9 @@ def run(self, args: Dict[str, Any]) -> None: args[constants.SPANNER_TABLE_PRIMARY_KEYS] = json.loads( args[constants.SPANNER_TABLE_PRIMARY_KEYS] ) + args[constants.MYSQL_READ_PARTITION_COLUMNS] = json.loads( + args[constants.MYSQL_READ_PARTITION_COLUMNS] + ) # Exclude arguments that are not needed to be passed to the notebook ignore_keys = {constants.LOG_LEVEL_ARG, constants.OUTPUT_NOTEBOOK_ARG} diff --git a/notebooks/parameterize_script/util/notebook_constants.py b/notebooks/parameterize_script/util/notebook_constants.py index b3aa8010f..4c7080b8e 100644 --- a/notebooks/parameterize_script/util/notebook_constants.py +++ b/notebooks/parameterize_script/util/notebook_constants.py @@ -51,6 +51,7 @@ MYSQL_PASSWORD_ARG = "mysql.password" MYSQL_DATABASE_ARG = "mysql.database" MYSQL_TABLE_LIST_ARG = "mysql.table.list" +MYSQL_READ_PARTITION_COLUMNS_ARG = "mysql.read.partition.columns" MYSQL_OUTPUT_SPANNER_MODE_ARG = "mysql.output.spanner.mode" SPANNER_INSTANCE_ARG = "spanner.instance" SPANNER_DATABASE_ARG = "spanner.database" @@ -64,6 +65,7 @@ MYSQL_PASSWORD = "MYSQL_PASSWORD" MYSQL_DATABASE = "MYSQL_DATABASE" MYSQL_TABLE_LIST = "MYSQL_TABLE_LIST" +MYSQL_READ_PARTITION_COLUMNS = "MYSQL_READ_PARTITION_COLUMNS" MYSQL_OUTPUT_SPANNER_MODE = "MYSQL_OUTPUT_SPANNER_MODE" SPANNER_INSTANCE = "SPANNER_INSTANCE" SPANNER_DATABASE = "SPANNER_DATABASE" diff --git a/notebooks/requirements.txt b/notebooks/requirements.txt index 15034118c..beeda79dc 100644 --- a/notebooks/requirements.txt +++ b/notebooks/requirements.txt @@ -4,3 +4,4 @@ google-cloud-spanner>=3.35.1 google-cloud-storage>=2.9.0 pandas>=1.3.5 papermill>=2.4.0 +ipykernel>=6.29.5 \ No newline at end of file From f5bdfa0d7e3075b0fa854e7dbb688b96ed2b3fb4 Mon Sep 17 00:00:00 2001 From: Nilo Resende Date: Fri, 4 Oct 2024 18:10:07 +0000 Subject: [PATCH 02/13] add mysqltospanner integration test --- notebooks/.ci/Jenkinsfile | 102 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) create mode 100644 notebooks/.ci/Jenkinsfile diff --git a/notebooks/.ci/Jenkinsfile b/notebooks/.ci/Jenkinsfile new file mode 100644 index 000000000..24ac32319 --- /dev/null +++ b/notebooks/.ci/Jenkinsfile @@ -0,0 +1,102 @@ +def stageRetryCount = 3 + +pipeline { + + agent any + + environment { + DATAPROC_TELEPORT_WEBHOOK_URL = credentials('dataproc-teleport-webhook-url') + + TEST_JDBC_URL = credentials('env-test-jdbc-url') + + # Extract host + TEST_JDBC_host=$(echo "$TEST_JDBC_URL" | sed -E 's/jdbc:mysql:\/\/([^:]+):.*/\1/') + # Extract database + TEST_JDBC_database=$(echo "$TEST_JDBC_URL" | sed -E 's/.*\/([^?]+)\?.*/\1/') + # Extract user + TEST_JDBC_user=$(echo "$TEST_JDBC_URL" | sed -E 's/.*\?user=([^&]+)&.*/\1/') + # Extract password + TEST_JDBC_password=$(echo "$TEST_JDBC_URL" | sed -E 's/.*&password=([^&]+).*/\1/') + + GIT_BRANCH_LOCAL = sh ( + script: "echo $branchName | sed -e 's|origin/||g' | sed -e 's|^null\$|main|'", // Remove "origin/" and set the default branch to main + returnStdout: true + ).trim() + } + + stages { + stage('Checkout') { + steps{ + git branch: "${GIT_BRANCH_LOCAL}", changelog: false, poll: false, url: 'https://github.com/GoogleCloudPlatform/dataproc-templates/' + } + } + stage('Build'){ + steps { + catchError { + sh ''' + python3.8 -m pip install --user virtualenv + + python3.8 -m venv env + source env/bin/activate + + export PACKAGE_EGG_FILE=dist/dataproc_templates_distribution.egg + + cd python + python setup.py bdist_egg --output=$PACKAGE_EGG_FILE + + cd ../notebooks + pip3 install -r requirements.txt + ''' + } + } + } + stage('Parallel Execution'){ + parallel{ + stage('MYSQL TO SPANNER') { + steps{ + retry(count: stageRetryCount) { + sh ''' + + source env/bin/activate + + export GCS_STAGING_LOCATION=gs://dataproc-templates/integration-testing + export JARS="gs://datproc_template_nk/jars/mysql-connector-java-8.0.29.jar,gs://datproc_template_nk/jars/postgresql-42.2.6.jar,gs://datproc_template_nk/jars/mssql-jdbc-6.4.0.jre8.jar" + export SKIP_BUILD=true + + cd notebooks + + python3 run_notebook.py --script=MYSQLTOSPANNER \ + --mysql.host="$TEST_JDBC_host" \ + --mysql.port="3306" \ + --mysql.username="$TEST_JDBC_user" \ + --mysql.password="$TEST_JDBC_password" \ + --mysql.database="$TEST_JDBC_database" \ + --mysql.table.list="employee" \ + --mysql.read.partition.columns="{}" \ + --spanner.instance="dataproc-spark-test" \ + --spanner.database="spark-ci-db" \ + --spanner.table.primary.keys="{\"employee\":\"empno\"}" + + ''' + } + } + } + } + } + } + post { + always{ + script { + if( env.GIT_BRANCH_LOCAL == 'main' ){ + googlechatnotification url: DATAPROC_TELEPORT_WEBHOOK_URL, + message: 'Jenkins: ${JOB_NAME}\nBuild status is ${BUILD_STATUS}\nSee ${BUILD_URL}\n', + notifyFailure: 'true', + notifyAborted: 'true', + notifyUnstable: 'true', + notifyNotBuilt: 'true', + notifyBackToNormal: 'true' + } + } + } + } +} From 5f83fa771787686690aae7c80f3a784ebc4609e6 Mon Sep 17 00:00:00 2001 From: Nilo Resende Date: Fri, 4 Oct 2024 18:17:15 +0000 Subject: [PATCH 03/13] fix Jenkinsfile --- notebooks/.ci/Jenkinsfile | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/notebooks/.ci/Jenkinsfile b/notebooks/.ci/Jenkinsfile index 24ac32319..8019316eb 100644 --- a/notebooks/.ci/Jenkinsfile +++ b/notebooks/.ci/Jenkinsfile @@ -9,15 +9,6 @@ pipeline { TEST_JDBC_URL = credentials('env-test-jdbc-url') - # Extract host - TEST_JDBC_host=$(echo "$TEST_JDBC_URL" | sed -E 's/jdbc:mysql:\/\/([^:]+):.*/\1/') - # Extract database - TEST_JDBC_database=$(echo "$TEST_JDBC_URL" | sed -E 's/.*\/([^?]+)\?.*/\1/') - # Extract user - TEST_JDBC_user=$(echo "$TEST_JDBC_URL" | sed -E 's/.*\?user=([^&]+)&.*/\1/') - # Extract password - TEST_JDBC_password=$(echo "$TEST_JDBC_URL" | sed -E 's/.*&password=([^&]+).*/\1/') - GIT_BRANCH_LOCAL = sh ( script: "echo $branchName | sed -e 's|origin/||g' | sed -e 's|^null\$|main|'", // Remove "origin/" and set the default branch to main returnStdout: true @@ -63,6 +54,15 @@ pipeline { export JARS="gs://datproc_template_nk/jars/mysql-connector-java-8.0.29.jar,gs://datproc_template_nk/jars/postgresql-42.2.6.jar,gs://datproc_template_nk/jars/mssql-jdbc-6.4.0.jre8.jar" export SKIP_BUILD=true + # Extract host + TEST_JDBC_host=$(echo "$TEST_JDBC_URL" | sed -E 's/jdbc:mysql:\/\/([^:]+):.*/\1/') + # Extract database + TEST_JDBC_database=$(echo "$TEST_JDBC_URL" | sed -E 's/.*\/([^?]+)\?.*/\1/') + # Extract user + TEST_JDBC_user=$(echo "$TEST_JDBC_URL" | sed -E 's/.*\?user=([^&]+)&.*/\1/') + # Extract password + TEST_JDBC_password=$(echo "$TEST_JDBC_URL" | sed -E 's/.*&password=([^&]+).*/\1/') + cd notebooks python3 run_notebook.py --script=MYSQLTOSPANNER \ From 9fd1b1259409430a079434c4b70e6ca536a8801a Mon Sep 17 00:00:00 2001 From: Nilo Resende Date: Fri, 4 Oct 2024 19:11:04 +0000 Subject: [PATCH 04/13] fix integration test dependencies --- notebooks/.ci/Jenkinsfile | 48 ++++++++++++++++++++++++--------------- 1 file changed, 30 insertions(+), 18 deletions(-) diff --git a/notebooks/.ci/Jenkinsfile b/notebooks/.ci/Jenkinsfile index 8019316eb..d76494907 100644 --- a/notebooks/.ci/Jenkinsfile +++ b/notebooks/.ci/Jenkinsfile @@ -9,16 +9,33 @@ pipeline { TEST_JDBC_URL = credentials('env-test-jdbc-url') + TEST_JDBC_host = sh ( + script: "echo $TEST_JDBC_URL | sed -E 's/jdbc:mysql:\\/\\/([^:]+):.*/\\1/'", + returnStdout: true + ).trim() + TEST_JDBC_database = sh ( + script: "echo $TEST_JDBC_URL | sed -E 's/.*\\/([^?]+)\\?.*/\\1/'", + returnStdout: true + ).trim() + TEST_JDBC_user = sh ( + script: "echo $TEST_JDBC_URL | sed -E 's/.*\\?user=([^&]+)&.*/\\1/'", + returnStdout: true + ).trim() + TEST_JDBC_password = sh ( + script: "echo $TEST_JDBC_URL | sed -E 's/.*&password=([^&]+).*/\\1/'", + returnStdout: true + ).trim() + GIT_BRANCH_LOCAL = sh ( script: "echo $branchName | sed -e 's|origin/||g' | sed -e 's|^null\$|main|'", // Remove "origin/" and set the default branch to main returnStdout: true ).trim() } - + stages { stage('Checkout') { steps{ - git branch: "${GIT_BRANCH_LOCAL}", changelog: false, poll: false, url: 'https://github.com/GoogleCloudPlatform/dataproc-templates/' + git branch: "${GIT_BRANCH_LOCAL}", changelog: false, poll: false, url: 'https://github.com/GoogleCloudPlatform/dataproc-templates/' } } stage('Build'){ @@ -29,14 +46,17 @@ pipeline { python3.8 -m venv env source env/bin/activate - + export PACKAGE_EGG_FILE=dist/dataproc_templates_distribution.egg cd python python setup.py bdist_egg --output=$PACKAGE_EGG_FILE cd ../notebooks - pip3 install -r requirements.txt + pip install --upgrade pip ipython ipykernel + ipython kernel install --name "python3" --user + pip install -r requirements.txt + ''' } } @@ -47,25 +67,17 @@ pipeline { steps{ retry(count: stageRetryCount) { sh ''' - source env/bin/activate export GCS_STAGING_LOCATION=gs://dataproc-templates/integration-testing export JARS="gs://datproc_template_nk/jars/mysql-connector-java-8.0.29.jar,gs://datproc_template_nk/jars/postgresql-42.2.6.jar,gs://datproc_template_nk/jars/mssql-jdbc-6.4.0.jre8.jar" export SKIP_BUILD=true - # Extract host - TEST_JDBC_host=$(echo "$TEST_JDBC_URL" | sed -E 's/jdbc:mysql:\/\/([^:]+):.*/\1/') - # Extract database - TEST_JDBC_database=$(echo "$TEST_JDBC_URL" | sed -E 's/.*\/([^?]+)\?.*/\1/') - # Extract user - TEST_JDBC_user=$(echo "$TEST_JDBC_URL" | sed -E 's/.*\?user=([^&]+)&.*/\1/') - # Extract password - TEST_JDBC_password=$(echo "$TEST_JDBC_URL" | sed -E 's/.*&password=([^&]+).*/\1/') - - cd notebooks + cd notebooks/mysql2spanner + pip install -r requirements.txt + cd .. - python3 run_notebook.py --script=MYSQLTOSPANNER \ + python run_notebook.py --script=MYSQLTOSPANNER \ --mysql.host="$TEST_JDBC_host" \ --mysql.port="3306" \ --mysql.username="$TEST_JDBC_user" \ @@ -75,7 +87,7 @@ pipeline { --mysql.read.partition.columns="{}" \ --spanner.instance="dataproc-spark-test" \ --spanner.database="spark-ci-db" \ - --spanner.table.primary.keys="{\"employee\":\"empno\"}" + --spanner.table.primary.keys="{\\"employee\\":\\"empno\\"}" ''' } @@ -99,4 +111,4 @@ pipeline { } } } -} +} \ No newline at end of file From e1808d97ae7606b544e9fc02f0b2447bb48f2e0e Mon Sep 17 00:00:00 2001 From: Nilo Resende Date: Fri, 4 Oct 2024 19:12:27 +0000 Subject: [PATCH 05/13] add mysql2spanner requeriments file --- notebooks/mysql2spanner/requirements.txt | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 notebooks/mysql2spanner/requirements.txt diff --git a/notebooks/mysql2spanner/requirements.txt b/notebooks/mysql2spanner/requirements.txt new file mode 100644 index 000000000..a028b5c81 --- /dev/null +++ b/notebooks/mysql2spanner/requirements.txt @@ -0,0 +1,8 @@ +google-cloud-storage +google-cloud-aiplatform +kfp +google-cloud-pipeline-components +pandas +google-cloud-spanner +pymysql +SQLAlchemy \ No newline at end of file From cfac4660cc0e1bcd22dd3f32c987a62382239025 Mon Sep 17 00:00:00 2001 From: Nilo Resende Date: Fri, 4 Oct 2024 19:19:13 +0000 Subject: [PATCH 06/13] fix variable value --- notebooks/mysql2spanner/MySqlToSpanner_notebook.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/mysql2spanner/MySqlToSpanner_notebook.ipynb b/notebooks/mysql2spanner/MySqlToSpanner_notebook.ipynb index aa2f9ba65..50e415328 100644 --- a/notebooks/mysql2spanner/MySqlToSpanner_notebook.ipynb +++ b/notebooks/mysql2spanner/MySqlToSpanner_notebook.ipynb @@ -396,7 +396,7 @@ " username=MYSQL_USERNAME,\n", " password=MYSQL_PASSWORD,\n", " database=MYSQL_DATABASE,\n", - " host=\"127.0.0.1\",\n", + " host=MYSQL_HOST,\n", " port=MYSQL_PORT\n", " )\n", ")\n", From c3589ebc633a974b921910ccfc0b88ee62ca9341 Mon Sep 17 00:00:00 2001 From: Nilo Resende Date: Fri, 4 Oct 2024 19:22:35 +0000 Subject: [PATCH 07/13] change mysql host variable --- notebooks/mysql2spanner/MySqlToSpanner_notebook.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/mysql2spanner/MySqlToSpanner_notebook.ipynb b/notebooks/mysql2spanner/MySqlToSpanner_notebook.ipynb index 50e415328..b78612f6b 100644 --- a/notebooks/mysql2spanner/MySqlToSpanner_notebook.ipynb +++ b/notebooks/mysql2spanner/MySqlToSpanner_notebook.ipynb @@ -396,7 +396,7 @@ " username=MYSQL_USERNAME,\n", " password=MYSQL_PASSWORD,\n", " database=MYSQL_DATABASE,\n", - " host=MYSQL_HOST,\n", + " host=MYSQL_HOST if not IS_PARAMETERIZED else \"127.0.0.1\", # use cloud sql proxy\n", " port=MYSQL_PORT\n", " )\n", ")\n", From b602dc249288e77672a6d4c3b33e618a03dc3c90 Mon Sep 17 00:00:00 2001 From: Nilo Resende Date: Fri, 4 Oct 2024 20:08:10 +0000 Subject: [PATCH 08/13] update string manipulation Jenkinsfile --- notebooks/.ci/Jenkinsfile | 46 +++++++++++++++++++++------------------ 1 file changed, 25 insertions(+), 21 deletions(-) diff --git a/notebooks/.ci/Jenkinsfile b/notebooks/.ci/Jenkinsfile index d76494907..8ba4caa84 100644 --- a/notebooks/.ci/Jenkinsfile +++ b/notebooks/.ci/Jenkinsfile @@ -9,25 +9,9 @@ pipeline { TEST_JDBC_URL = credentials('env-test-jdbc-url') - TEST_JDBC_host = sh ( - script: "echo $TEST_JDBC_URL | sed -E 's/jdbc:mysql:\\/\\/([^:]+):.*/\\1/'", - returnStdout: true - ).trim() - TEST_JDBC_database = sh ( - script: "echo $TEST_JDBC_URL | sed -E 's/.*\\/([^?]+)\\?.*/\\1/'", - returnStdout: true - ).trim() - TEST_JDBC_user = sh ( - script: "echo $TEST_JDBC_URL | sed -E 's/.*\\?user=([^&]+)&.*/\\1/'", - returnStdout: true - ).trim() - TEST_JDBC_password = sh ( - script: "echo $TEST_JDBC_URL | sed -E 's/.*&password=([^&]+).*/\\1/'", - returnStdout: true - ).trim() GIT_BRANCH_LOCAL = sh ( - script: "echo $branchName | sed -e 's|origin/||g' | sed -e 's|^null\$|main|'", // Remove "origin/" and set the default branch to main + script: "echo $branchName | sed -e 's|origin/||g' | sed -e 's|^null\$|860-integration-tests-notebook|'", // Remove "origin/" and set the default branch to main returnStdout: true ).trim() } @@ -61,6 +45,20 @@ pipeline { } } } + stage('Extract JDBC URL Parameters') { + steps { + script { + def host = TEST_JDBC_URL.substring(TEST_JDBC_URL.indexOf("//") + 2, TEST_JDBC_URL.indexOf(":", TEST_JDBC_URL.indexOf("//") + 2)) + def database = TEST_JDBC_URL.substring(TEST_JDBC_URL.indexOf("/", TEST_JDBC_URL.indexOf("//") + 2) + 1, TEST_JDBC_URL.indexOf("?")) + def user = TEST_JDBC_URL.substring(TEST_JDBC_URL.indexOf("user=") + 5, TEST_JDBC_URL.indexOf("&")) + def password = TEST_JDBC_URL.substring(TEST_JDBC_URL.indexOf("password=") + 9) + env.DB_HOST = host + env.DB_NAME = database + env.DB_USER = user + env.DB_PASSWORD = password + } + } + } stage('Parallel Execution'){ parallel{ stage('MYSQL TO SPANNER') { @@ -77,18 +75,24 @@ pipeline { pip install -r requirements.txt cd .. + wget https://dl.google.com/cloudsql/cloud_sql_proxy.linux.amd64 -O ./cloud_sql_proxy + chmod +x cloud_sql_proxy + nohup ./cloud_sql_proxy -instances=$MYSQL_INSTANCE_CONNECTION_NAME_GCP=tcp:3306 & + python run_notebook.py --script=MYSQLTOSPANNER \ - --mysql.host="$TEST_JDBC_host" \ + --mysql.host="${DB_HOST}" \ --mysql.port="3306" \ - --mysql.username="$TEST_JDBC_user" \ - --mysql.password="$TEST_JDBC_password" \ - --mysql.database="$TEST_JDBC_database" \ + --mysql.username="${DB_USER}" \ + --mysql.password="${DB_PASSWORD}" \ + --mysql.database="${DB_NAME}" \ --mysql.table.list="employee" \ --mysql.read.partition.columns="{}" \ --spanner.instance="dataproc-spark-test" \ --spanner.database="spark-ci-db" \ --spanner.table.primary.keys="{\\"employee\\":\\"empno\\"}" + kill $(pgrep cloud_sql_proxy) + ''' } } From 982b75edb10f3fe48f996603d29b3114f718ad83 Mon Sep 17 00:00:00 2001 From: Nilo Resende Date: Fri, 4 Oct 2024 20:14:12 +0000 Subject: [PATCH 09/13] update readme with notebooks int test --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 79a0b9c7f..5c976fbae 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,7 @@ ![Python Dataproc Serverless Integration Test Status](https://dataproctemplatesci.com/buildStatus/icon?job=dataproc-templates-build%2Fintegration-tests-python&&subject=python-serverless-integration-tests) ![Python Dataproc Cluster Integration Tests Status](https://dataproctemplatesci.com/buildStatus/icon?job=dataproc-templates-build%2Fcluster-integration-tests-python&&subject=python-cluster-integration-tests) +![Notebooks Integration Test Status](https://dataproctemplatesci.com/buildStatus/icon?job=dataproc-templates-build%2Fintegration-tests-notebooks&&subject=integration-tests-notebooks) # Dataproc Templates Dataproc templates are designed to address various in-cloud data tasks, including data import/export/backup/restore and bulk API operations. These templates leverage the power of [Google Cloud's Dataproc](https://cloud.google.com/dataproc/), supporting both Dataproc Serverless and Dataproc clusters. From aeeb34d12e90c6f4f91765b7e11b8682211d02a7 Mon Sep 17 00:00:00 2001 From: Nilo Resende Date: Fri, 18 Oct 2024 14:22:28 +0000 Subject: [PATCH 10/13] change mysql host parameter --- notebooks/mysql2spanner/MySqlToSpanner_notebook.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/mysql2spanner/MySqlToSpanner_notebook.ipynb b/notebooks/mysql2spanner/MySqlToSpanner_notebook.ipynb index b78612f6b..50e415328 100644 --- a/notebooks/mysql2spanner/MySqlToSpanner_notebook.ipynb +++ b/notebooks/mysql2spanner/MySqlToSpanner_notebook.ipynb @@ -396,7 +396,7 @@ " username=MYSQL_USERNAME,\n", " password=MYSQL_PASSWORD,\n", " database=MYSQL_DATABASE,\n", - " host=MYSQL_HOST if not IS_PARAMETERIZED else \"127.0.0.1\", # use cloud sql proxy\n", + " host=MYSQL_HOST,\n", " port=MYSQL_PORT\n", " )\n", ")\n", From d0cd39b000ef645e6f3cf648b7ddc413cb9748f3 Mon Sep 17 00:00:00 2001 From: Nilo Resende Date: Fri, 18 Oct 2024 18:43:15 +0000 Subject: [PATCH 11/13] use cloud sql proxy config --- notebooks/.ci/Jenkinsfile | 3 ++- .../mysql2spanner/MySqlToSpanner_notebook.ipynb | 16 ++++++---------- .../MySqlToSpanner_parameterize_script.py | 9 ++++++++- .../util/notebook_constants.py | 2 ++ 4 files changed, 18 insertions(+), 12 deletions(-) diff --git a/notebooks/.ci/Jenkinsfile b/notebooks/.ci/Jenkinsfile index 8ba4caa84..e53fe4c5e 100644 --- a/notebooks/.ci/Jenkinsfile +++ b/notebooks/.ci/Jenkinsfile @@ -11,7 +11,7 @@ pipeline { GIT_BRANCH_LOCAL = sh ( - script: "echo $branchName | sed -e 's|origin/||g' | sed -e 's|^null\$|860-integration-tests-notebook|'", // Remove "origin/" and set the default branch to main + script: "echo $branchName | sed -e 's|origin/||g' | sed -e 's|^null\$|main|'", // Remove "origin/" and set the default branch to main returnStdout: true ).trim() } @@ -87,6 +87,7 @@ pipeline { --mysql.database="${DB_NAME}" \ --mysql.table.list="employee" \ --mysql.read.partition.columns="{}" \ + --use.cloud.sql.proxy="true" \ --spanner.instance="dataproc-spark-test" \ --spanner.database="spark-ci-db" \ --spanner.table.primary.keys="{\\"employee\\":\\"empno\\"}" diff --git a/notebooks/mysql2spanner/MySqlToSpanner_notebook.ipynb b/notebooks/mysql2spanner/MySqlToSpanner_notebook.ipynb index 50e415328..e27fe93dc 100644 --- a/notebooks/mysql2spanner/MySqlToSpanner_notebook.ipynb +++ b/notebooks/mysql2spanner/MySqlToSpanner_notebook.ipynb @@ -176,7 +176,10 @@ "execution_count": null, "id": "2703b502-1b41-44f1-bf21-41069255bc32", "metadata": { - "tags": [] + "tags": [], + "pycharm": { + "is_executing": true + } }, "outputs": [], "source": [ @@ -309,6 +312,7 @@ " MYSQL_PASSWORD = \"\"\n", " MYSQL_DATABASE = \"\"\n", " MYSQL_TABLE_LIST = [] # Leave list empty for migrating complete database else provide tables as ['table1','table2']\n", + " USE_CLOUD_SQL_PROXY = \"false\"\n", " MYSQL_READ_PARTITION_COLUMNS = {} # Leave empty for default read partition columns\n", " MYSQL_OUTPUT_SPANNER_MODE = \"overwrite\" # one of overwrite|append (Use append when schema already exists in Spanner)\n", "\n", @@ -396,7 +400,7 @@ " username=MYSQL_USERNAME,\n", " password=MYSQL_PASSWORD,\n", " database=MYSQL_DATABASE,\n", - " host=MYSQL_HOST,\n", + " host=MYSQL_HOST if USE_CLOUD_SQL_PROXY==\"false\" else \"127.0.0.1\",\n", " port=MYSQL_PORT\n", " )\n", ")\n", @@ -854,14 +858,6 @@ "- You may create relationships (FKs), constraints and indexes (as needed).\n", "- You may configure countinuous replication with [DataStream](https://cloud.google.com/datastream/docs/configure-your-source-mysql-database) or any other 3rd party tools." ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "89ec62c1-0b95-4536-9339-03a4a8de035e", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/notebooks/mysql2spanner/MySqlToSpanner_parameterize_script.py b/notebooks/mysql2spanner/MySqlToSpanner_parameterize_script.py index a10aae9a0..ad58312c0 100644 --- a/notebooks/mysql2spanner/MySqlToSpanner_parameterize_script.py +++ b/notebooks/mysql2spanner/MySqlToSpanner_parameterize_script.py @@ -71,7 +71,6 @@ def parse_args(args: Optional[Sequence[str]] = None) -> Dict[str, Any]: required=True, help="MySQL database name", ) - parser.add_argument( f"--{constants.MYSQL_TABLE_LIST_ARG}", dest=constants.MYSQL_TABLE_LIST, @@ -91,6 +90,14 @@ def parse_args(args: Optional[Sequence[str]] = None) -> Dict[str, Any]: choices=[constants.OUTPUT_MODE_OVERWRITE, constants.OUTPUT_MODE_APPEND], ) + parser.add_argument( + f"--{constants.USE_CLOUD_SQL_PROXY_ARG}", + dest=constants.USE_CLOUD_SQL_PROXY, + default="false", + required=False, + help="Flag to reach mysql instance using cloud sql proxy" + ) + parser.add_argument( f"--{constants.SPANNER_INSTANCE_ARG}", dest=constants.SPANNER_INSTANCE, diff --git a/notebooks/parameterize_script/util/notebook_constants.py b/notebooks/parameterize_script/util/notebook_constants.py index 4c7080b8e..8090d3aba 100644 --- a/notebooks/parameterize_script/util/notebook_constants.py +++ b/notebooks/parameterize_script/util/notebook_constants.py @@ -53,6 +53,7 @@ MYSQL_TABLE_LIST_ARG = "mysql.table.list" MYSQL_READ_PARTITION_COLUMNS_ARG = "mysql.read.partition.columns" MYSQL_OUTPUT_SPANNER_MODE_ARG = "mysql.output.spanner.mode" +USE_CLOUD_SQL_PROXY_ARG = "use.cloud.sql.proxy" SPANNER_INSTANCE_ARG = "spanner.instance" SPANNER_DATABASE_ARG = "spanner.database" # provide table & pk column which do not have PK in MYSQL "{"table_name":"primary_key"}" @@ -67,6 +68,7 @@ MYSQL_TABLE_LIST = "MYSQL_TABLE_LIST" MYSQL_READ_PARTITION_COLUMNS = "MYSQL_READ_PARTITION_COLUMNS" MYSQL_OUTPUT_SPANNER_MODE = "MYSQL_OUTPUT_SPANNER_MODE" +USE_CLOUD_SQL_PROXY = "USE_CLOUD_SQL_PROXY" SPANNER_INSTANCE = "SPANNER_INSTANCE" SPANNER_DATABASE = "SPANNER_DATABASE" SPANNER_TABLE_PRIMARY_KEYS = "SPANNER_TABLE_PRIMARY_KEYS" From 5832dc67413e82ab809e1c532300480af0259e65 Mon Sep 17 00:00:00 2001 From: Nilo Resende Date: Fri, 18 Oct 2024 20:04:46 +0000 Subject: [PATCH 12/13] change variable name --- notebooks/.ci/Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/.ci/Jenkinsfile b/notebooks/.ci/Jenkinsfile index e53fe4c5e..bcbad5792 100644 --- a/notebooks/.ci/Jenkinsfile +++ b/notebooks/.ci/Jenkinsfile @@ -77,7 +77,7 @@ pipeline { wget https://dl.google.com/cloudsql/cloud_sql_proxy.linux.amd64 -O ./cloud_sql_proxy chmod +x cloud_sql_proxy - nohup ./cloud_sql_proxy -instances=$MYSQL_INSTANCE_CONNECTION_NAME_GCP=tcp:3306 & + nohup ./cloud_sql_proxy -instances=$ENV_TEST_MYSQL_INSTANCE_CONNECTION_NAME=tcp:3306 & python run_notebook.py --script=MYSQLTOSPANNER \ --mysql.host="${DB_HOST}" \ From 8152f313cb44111d6a9e6b295aaeea9092ba6db4 Mon Sep 17 00:00:00 2001 From: Nilo Resende Date: Thu, 24 Oct 2024 12:12:36 +0000 Subject: [PATCH 13/13] ci: Implemented the Jenkins automation for notebooks Mysql to Spanner --- notebooks/.ci/Jenkinsfile | 2 -- 1 file changed, 2 deletions(-) diff --git a/notebooks/.ci/Jenkinsfile b/notebooks/.ci/Jenkinsfile index bcbad5792..5bfc5eb8f 100644 --- a/notebooks/.ci/Jenkinsfile +++ b/notebooks/.ci/Jenkinsfile @@ -9,7 +9,6 @@ pipeline { TEST_JDBC_URL = credentials('env-test-jdbc-url') - GIT_BRANCH_LOCAL = sh ( script: "echo $branchName | sed -e 's|origin/||g' | sed -e 's|^null\$|main|'", // Remove "origin/" and set the default branch to main returnStdout: true @@ -27,7 +26,6 @@ pipeline { catchError { sh ''' python3.8 -m pip install --user virtualenv - python3.8 -m venv env source env/bin/activate