From f8f7439a1e3722296c8fbc6f309bd544b4a2f162 Mon Sep 17 00:00:00 2001 From: Atul Rajmane Date: Tue, 12 Nov 2024 11:20:00 +0000 Subject: [PATCH 1/5] Enable Access to GCE Public IPs for the GKE Integration Test Clusters --- .../blueprints/gke-a3-highgpu.yaml | 92 +++++++++ .../blueprints/gke-a3-megagpu.yaml | 92 +++++++++ .../blueprints/gke-storage-parallelstore.yaml | 122 ++++++++++++ .../daily-tests/blueprints/hpc-gke.yaml | 71 +++++++ .../daily-tests/blueprints/ml-gke-e2e.yaml | 1 + .../daily-tests/blueprints/ml-gke.yaml | 82 ++++++++ .../daily-tests/blueprints/storage-gke.yaml | 182 ++++++++++++++++++ .../daily-tests/tests/gke-a3-highgpu.yml | 3 +- .../daily-tests/tests/gke-a3-megagpu.yml | 3 +- .../tests/gke-storage-parallelstore.yml | 3 +- .../daily-tests/tests/gke-storage.yml | 3 +- tools/cloud-build/daily-tests/tests/gke.yml | 4 +- .../daily-tests/tests/ml-gke-e2e.yml | 1 + .../cloud-build/daily-tests/tests/ml-gke.yml | 3 +- 14 files changed, 656 insertions(+), 6 deletions(-) create mode 100644 tools/cloud-build/daily-tests/blueprints/gke-a3-highgpu.yaml create mode 100644 tools/cloud-build/daily-tests/blueprints/gke-a3-megagpu.yaml create mode 100644 tools/cloud-build/daily-tests/blueprints/gke-storage-parallelstore.yaml create mode 100644 tools/cloud-build/daily-tests/blueprints/hpc-gke.yaml create mode 100644 tools/cloud-build/daily-tests/blueprints/ml-gke.yaml create mode 100644 tools/cloud-build/daily-tests/blueprints/storage-gke.yaml diff --git a/tools/cloud-build/daily-tests/blueprints/gke-a3-highgpu.yaml b/tools/cloud-build/daily-tests/blueprints/gke-a3-highgpu.yaml new file mode 100644 index 0000000000..e2c6cede29 --- /dev/null +++ b/tools/cloud-build/daily-tests/blueprints/gke-a3-highgpu.yaml @@ -0,0 +1,92 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- + +blueprint_name: gke-a3-highgpu + +vars: + project_id: ## Set GCP Project ID Here ## + deployment_name: gke-a3-highgpu + region: us-central1 + zone: us-central1-c + + # Cidr block containing the IP of the machine calling terraform. + # The following line must be updated for this example to work. + authorized_cidr: /32 + +deployment_groups: +- group: primary + modules: + - id: network1 + source: modules/network/vpc + settings: + subnetwork_name: gke-subnet-a3-highgpu + mtu: 8244 + secondary_ranges: + gke-subnet-a3-highgpu: + - range_name: pods + ip_cidr_range: 10.4.0.0/14 + - range_name: services + ip_cidr_range: 10.0.32.0/20 + + - id: gke_service_account + source: community/modules/project/service-account + settings: + name: gke-sa + project_roles: + - logging.logWriter + - monitoring.metricWriter + - monitoring.viewer + - stackdriver.resourceMetadata.writer + - storage.objectViewer + - artifactregistry.reader + + - id: gpunets + source: modules/network/multivpc + settings: + network_name_prefix: $(vars.deployment_name)-gpunet + global_ip_address_range: 192.169.0.0/16 + network_count: 4 + subnetwork_cidr_suffix: 24 + mtu: 8244 + + - id: gke_cluster + source: modules/scheduler/gke-cluster + use: [network1, gpunets, gke_service_account] + settings: + enable_private_endpoint: false # Allows for access from authorized public IPs + gcp_public_cidrs_access_enabled: $(vars.gcp_public_cidrs_access_enabled) + master_authorized_networks: + - cidr_block: $(vars.authorized_cidr) # Allows your machine run kubectl command. It's required for the multi-network setup. + display_name: "kubectl-access-network" + outputs: [instructions] + + - id: a3_highgpu_pool + source: modules/compute/gke-node-pool + use: [gke_cluster, gpunets, gke_service_account] + settings: + machine_type: a3-highgpu-8g + autoscaling_total_min_nodes: 2 + zones: [$(vars.zone)] + outputs: [instructions] + + - id: workload_component_install + source: modules/management/kubectl-apply + use: [gke_cluster] + settings: + kueue: + install: true + jobset: + install: true diff --git a/tools/cloud-build/daily-tests/blueprints/gke-a3-megagpu.yaml b/tools/cloud-build/daily-tests/blueprints/gke-a3-megagpu.yaml new file mode 100644 index 0000000000..79e905e343 --- /dev/null +++ b/tools/cloud-build/daily-tests/blueprints/gke-a3-megagpu.yaml @@ -0,0 +1,92 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- + +blueprint_name: gke-a3-mega + +vars: + project_id: ## Set GCP Project ID Here ## + deployment_name: gke-a3-mega + region: us-central1 + zone: us-central1-c + + # Cidr block containing the IP of the machine calling terraform. + # The following line must be updated for this example to work. + authorized_cidr: /32 + +deployment_groups: +- group: primary + modules: + - id: network1 + source: modules/network/vpc + settings: + subnetwork_name: gke-subnet-a3-mega + mtu: 8244 + secondary_ranges: + gke-subnet-a3-mega: + - range_name: pods + ip_cidr_range: 10.4.0.0/14 + - range_name: services + ip_cidr_range: 10.0.32.0/20 + + - id: gke_service_account + source: community/modules/project/service-account + settings: + name: gke-sa + project_roles: + - logging.logWriter + - monitoring.metricWriter + - monitoring.viewer + - stackdriver.resourceMetadata.writer + - storage.objectViewer + - artifactregistry.reader + + - id: gpunets + source: modules/network/multivpc + settings: + network_name_prefix: $(vars.deployment_name)-gpunet + global_ip_address_range: 192.169.0.0/16 + network_count: 8 + subnetwork_cidr_suffix: 24 + mtu: 8244 + + - id: gke_cluster + source: modules/scheduler/gke-cluster + use: [network1, gpunets, gke_service_account] + settings: + enable_private_endpoint: false # Allows for access from authorized public IPs + gcp_public_cidrs_access_enabled: $(vars.gcp_public_cidrs_access_enabled) + master_authorized_networks: + - cidr_block: $(vars.authorized_cidr) # Allows your machine run kubectl command. It's required for the multi-network setup. + display_name: "kubectl-access-network" + outputs: [instructions] + + - id: a3_megagpu_pool + source: modules/compute/gke-node-pool + use: [gke_cluster, gpunets, gke_service_account] + settings: + machine_type: a3-megagpu-8g + autoscaling_total_min_nodes: 2 + zones: [$(vars.zone)] + outputs: [instructions] + + - id: workload_manager_install + source: modules/management/kubectl-apply + use: [gke_cluster] + settings: + kueue: + install: true + jobset: + install: true diff --git a/tools/cloud-build/daily-tests/blueprints/gke-storage-parallelstore.yaml b/tools/cloud-build/daily-tests/blueprints/gke-storage-parallelstore.yaml new file mode 100644 index 0000000000..1b6d26b862 --- /dev/null +++ b/tools/cloud-build/daily-tests/blueprints/gke-storage-parallelstore.yaml @@ -0,0 +1,122 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +blueprint_name: gke-storage-parallelstore +vars: + project_id: ## Set GCP Project ID Here ## + deployment_name: gke-storage-parallelstore + region: us-central1 + zone: us-central1-c + + # Cidr block containing the IP of the machine calling terraform. + # The following line must be updated for this example to work. + authorized_cidr: /32 + +deployment_groups: +- group: setup + modules: + - id: network + source: modules/network/vpc + settings: + subnetwork_name: gke-subnet-parallelstore + secondary_ranges: + gke-subnet-parallelstore: + - range_name: pods + ip_cidr_range: 10.4.0.0/14 + - range_name: services + ip_cidr_range: 10.0.32.0/20 + + - id: private_service_access # required for parallelstore + source: community/modules/network/private-service-access + use: [network] + settings: + prefix_length: 24 + +- group: primary + modules: + - id: gke_cluster + source: modules/scheduler/gke-cluster + use: [network] + settings: + enable_parallelstore_csi: true # enable Parallelstore for the cluster + configure_workload_identity_sa: true + enable_private_endpoint: false # Allows for access from authorized public IPs + gcp_public_cidrs_access_enabled: $(vars.gcp_public_cidrs_access_enabled) + master_authorized_networks: + - display_name: deployment-machine + cidr_block: $(vars.authorized_cidr) + outputs: [instructions] + + ### Set up storage class and persistent volume claim for Parallelstore ### + - id: parallelstore-setup + source: modules/file-system/gke-storage + use: [gke_cluster, private_service_access] + settings: + storage_type: Parallelstore + access_mode: ReadWriteMany + sc_volume_binding_mode: Immediate + sc_reclaim_policy: Delete # Use Retain if you want to volume and parallelstore resource will remain after + sc_topology_zones: [$(vars.zone)] + pvc_count: 1 + capacity_gb: 12000 # from 12,000 GiB to 100,000 GiB, in multiples of 4,000 GiB + + - id: sample-pool + source: modules/compute/gke-node-pool + use: [gke_cluster] + settings: + name: sample-pool + zones: [$(vars.zone)] + machine_type: n2-standard-16 + + # Train a TensorFlow model with Keras and Parallelstore on GKE + # Tutorial: https://cloud.google.com/parallelstore/docs/tensorflow-sample + + - id: parallelstore-job + source: modules/compute/gke-job-template + use: + - gke_cluster + - parallelstore-setup + settings: + name: tensorflow + image: jupyter/tensorflow-notebook@sha256:173f124f638efe870bb2b535e01a76a80a95217e66ed00751058c51c09d6d85d + security_context: # to make sure the job have enough access to execute the jobs and r/w from parallelstore + - key: runAsUser + value: 1000 + - key: runAsGroup + value: 100 + - key: fsGroup + value: 100 + command: + - bash + - -c + - | + pip install transformers datasets + python - </32 + +deployment_groups: +- group: primary + modules: + - id: network1 + source: modules/network/vpc + settings: + subnetwork_name: gke-subnet + secondary_ranges: + gke-subnet: + - range_name: pods + ip_cidr_range: 10.4.0.0/14 + - range_name: services + ip_cidr_range: 10.0.32.0/20 + + - id: gke_service_account + source: community/modules/project/service-account + settings: + name: gke-sa + project_roles: + - logging.logWriter + - monitoring.metricWriter + - monitoring.viewer + - stackdriver.resourceMetadata.writer + - storage.objectViewer + - artifactregistry.reader + + - id: gke_cluster + source: modules/scheduler/gke-cluster + use: [network1, gke_service_account] + settings: + enable_private_endpoint: false # Allows for access from authorized public IPs + master_authorized_networks: + - display_name: deployment-machine + cidr_block: $(vars.authorized_cidr) + configure_workload_identity_sa: true + gcp_public_cidrs_access_enabled: $(vars.gcp_public_cidrs_access_enabled) + outputs: [instructions] + + - id: g2_pool + source: modules/compute/gke-node-pool + use: [gke_cluster, gke_service_account] + settings: + disk_type: pd-balanced + machine_type: g2-standard-4 + + - id: job_template + source: modules/compute/gke-job-template + use: [g2_pool] + settings: + image: nvidia/cuda:11.0.3-runtime-ubuntu20.04 + command: + - nvidia-smi + node_count: 1 + outputs: [instructions] diff --git a/tools/cloud-build/daily-tests/blueprints/storage-gke.yaml b/tools/cloud-build/daily-tests/blueprints/storage-gke.yaml new file mode 100644 index 0000000000..39f265797e --- /dev/null +++ b/tools/cloud-build/daily-tests/blueprints/storage-gke.yaml @@ -0,0 +1,182 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +blueprint_name: storage-gke + +vars: + project_id: ## Set GCP Project ID Here ## + deployment_name: storage-gke-01 + region: us-central1 + zone: us-central1-c + + # Cidr block containing the IP of the machine calling terraform. + # The following line must be updated for this example to work. + authorized_cidr: /32 + +deployment_groups: +- group: primary + modules: + - id: network1 + source: modules/network/vpc + settings: + subnetwork_name: gke-subnet + secondary_ranges: + gke-subnet: + - range_name: pods + ip_cidr_range: 10.4.0.0/14 + - range_name: services + ip_cidr_range: 10.0.32.0/20 + + - id: gke_service_account + source: community/modules/project/service-account + settings: + name: gke-sa + project_roles: + - logging.logWriter + - monitoring.metricWriter + - monitoring.viewer + - stackdriver.resourceMetadata.writer + - storage.objectViewer + - artifactregistry.reader + + - id: gke_cluster + source: modules/scheduler/gke-cluster + use: [network1, gke_service_account] + settings: + enable_filestore_csi: true + enable_gcsfuse_csi: true + configure_workload_identity_sa: true # needed when using GCS + enable_private_endpoint: false # Allows for access from authorized public IPs + gcp_public_cidrs_access_enabled: $(vars.gcp_public_cidrs_access_enabled) + master_authorized_networks: + - display_name: deployment-machine + cidr_block: $(vars.authorized_cidr) + outputs: [instructions] + + - id: debug_pool + source: modules/compute/gke-node-pool + use: [gke_cluster, gke_service_account] + settings: + name: debug + zones: [$(vars.zone)] + machine_type: n2d-standard-2 + + ### Google Cloud Storage ### + + - id: data-bucket + source: community/modules/file-system/cloud-storage-bucket + settings: + local_mount: /data + random_suffix: true + force_destroy: true + + - id: data-bucket-pv + source: modules/file-system/gke-persistent-volume + use: [gke_cluster, data-bucket] + settings: {capacity_gb: 5000} + + ### Filestore ### + + - id: filestore + source: modules/file-system/filestore + use: [network1] + settings: {local_mount: /shared} + + - id: shared-filestore-pv + source: modules/file-system/gke-persistent-volume + use: [gke_cluster, filestore] + + ### Shared Storage Job ### + + - id: shared-fs-job + source: modules/compute/gke-job-template + use: + - gke_cluster + - debug_pool + - shared-filestore-pv + - data-bucket-pv + settings: + image: bash + command: + - bash + - -c + - | + echo "Set up job folders" + shopt -s extglob; JOB=${HOSTNAME%%-+([[:digit:]])} + mkdir /data/${JOB}/ -p; mkdir /shared/${JOB}/ -p; + + echo "Writing seed file to GCS" + dd if=/dev/urandom of=/data/${JOB}/${JOB_COMPLETION_INDEX}.dat bs=1K count=1000 + + echo "Copy seed data from GCS to Filestore" + cp /data/${JOB}/${JOB_COMPLETION_INDEX}.dat /shared/${JOB}/ + + echo "Hash file from Filestore and save to GCS" + md5sum /shared/${JOB}/${JOB_COMPLETION_INDEX}.dat > /data/${JOB}/${JOB_COMPLETION_INDEX}.md5 + node_count: 5 + outputs: [instructions] + + ### Ephemeral Storage ### + + - id: local-ssd-pool + source: modules/compute/gke-node-pool + use: [gke_cluster, gke_service_account] + settings: + name: local-ssd + machine_type: n2d-standard-2 + local_ssd_count_ephemeral_storage: 1 + + - id: ephemeral-storage-job + source: modules/compute/gke-job-template + use: [local-ssd-pool] + settings: + name: ephemeral-storage-job + ephemeral_volumes: # below shows all options, usually only 1 is needed + + - type: memory # backed by node memory + mount_path: /scratch-mem + size_gb: 5 + + - type: local-ssd # node pool must specify local_ssd_count_ephemeral_storage + mount_path: /scratch-local-ssd + size_gb: 280 # System holds back some of 375 GiB + + - type: pd-ssd + mount_path: /pd-ssd + size_gb: 100 + + - type: pd-balanced + mount_path: /pd-balanced + size_gb: 100 + + image: ljishen/fio + command: # https://cloud.google.com/compute/docs/disks/benchmarking-pd-performance + - fio + - --name=write_throughput + - --directory=/scratch-local-ssd + - --numjobs=16 + - --size=5G + - --time_based + - --runtime=30s + - --ramp_time=2s + - --ioengine=libaio + - --direct=1 + - --verify=0 + - --bs=1M + - --iodepth=64 + - --rw=write + - --group_reporting=1 + - --iodepth_batch_submit=64 + - --iodepth_batch_complete_max=64 + outputs: [instructions] diff --git a/tools/cloud-build/daily-tests/tests/gke-a3-highgpu.yml b/tools/cloud-build/daily-tests/tests/gke-a3-highgpu.yml index 26b894a6fe..a7cf3c5d05 100644 --- a/tools/cloud-build/daily-tests/tests/gke-a3-highgpu.yml +++ b/tools/cloud-build/daily-tests/tests/gke-a3-highgpu.yml @@ -19,7 +19,7 @@ test_name: gke-a3high deployment_name: gke-a3high-{{ build }} workspace: /workspace -blueprint_yaml: "{{ workspace }}/examples/gke-a3-highgpu.yaml" +blueprint_yaml: "{{ workspace }}/tools/cloud-build/daily-tests/blueprints/gke-a3-highgpu.yaml" network: "gke-a3high-net-{{ build }}" region: us-west1 zone: us-west1-a @@ -37,6 +37,7 @@ cli_deployment_vars: authorized_cidr: "{{ build_ip.stdout }}/32" network_name: "{{ network }}" local_ssd_count_nvme_block: 16 + gcp_public_cidrs_access_enabled: true custom_vars: project: "{{ project }}" post_deploy_tests: diff --git a/tools/cloud-build/daily-tests/tests/gke-a3-megagpu.yml b/tools/cloud-build/daily-tests/tests/gke-a3-megagpu.yml index 6b305c3410..d7165f44c5 100644 --- a/tools/cloud-build/daily-tests/tests/gke-a3-megagpu.yml +++ b/tools/cloud-build/daily-tests/tests/gke-a3-megagpu.yml @@ -19,7 +19,7 @@ test_name: gke-a3mega deployment_name: gke-a3mega-{{ build }} workspace: /workspace -blueprint_yaml: "{{ workspace }}/examples/gke-a3-megagpu.yaml" +blueprint_yaml: "{{ workspace }}/tools/cloud-build/daily-tests/blueprints/gke-a3-megagpu.yaml" network: "gke-a3mega-net-{{ build }}" region: us-west4 zone: us-west4-a @@ -37,6 +37,7 @@ cli_deployment_vars: authorized_cidr: "{{ build_ip.stdout }}/32" network_name: "{{ network }}" local_ssd_count_nvme_block: 16 + gcp_public_cidrs_access_enabled: true custom_vars: project: "{{ project }}" post_deploy_tests: diff --git a/tools/cloud-build/daily-tests/tests/gke-storage-parallelstore.yml b/tools/cloud-build/daily-tests/tests/gke-storage-parallelstore.yml index 6a43c01ab3..bd76b26844 100644 --- a/tools/cloud-build/daily-tests/tests/gke-storage-parallelstore.yml +++ b/tools/cloud-build/daily-tests/tests/gke-storage-parallelstore.yml @@ -17,7 +17,7 @@ deployment_name: gke-storage-parallelstore-{{ build }} zone: us-central1-a # for remote node region: us-central1 workspace: /workspace -blueprint_yaml: "{{ workspace }}/examples/gke-storage-parallelstore.yaml" +blueprint_yaml: "{{ workspace }}/tools/cloud-build/daily-tests/blueprints/gke-storage-parallelstore.yaml" network: "{{ deployment_name }}-net" remote_node: "{{ deployment_name }}-0" post_deploy_tests: @@ -26,3 +26,4 @@ custom_vars: project: "{{ project }}" cli_deployment_vars: region: "{{ region }}" + gcp_public_cidrs_access_enabled: true diff --git a/tools/cloud-build/daily-tests/tests/gke-storage.yml b/tools/cloud-build/daily-tests/tests/gke-storage.yml index 9beb5eba4c..b34d2a7aa2 100644 --- a/tools/cloud-build/daily-tests/tests/gke-storage.yml +++ b/tools/cloud-build/daily-tests/tests/gke-storage.yml @@ -16,10 +16,11 @@ test_name: storage-gke deployment_name: gke-storage-{{ build }} zone: us-central1-a # for remote node workspace: /workspace -blueprint_yaml: "{{ workspace }}/examples/storage-gke.yaml" +blueprint_yaml: "{{ workspace }}/tools/cloud-build/daily-tests/blueprints/storage-gke.yaml" network: "{{ test_name }}-net" remote_node: "{{ deployment_name }}-0" post_deploy_tests: [] cli_deployment_vars: network_name: "{{ network }}" authorized_cidr: "{{ build_ip.stdout }}/32" + gcp_public_cidrs_access_enabled: true diff --git a/tools/cloud-build/daily-tests/tests/gke.yml b/tools/cloud-build/daily-tests/tests/gke.yml index 7f4e97bd52..c97b716bd0 100644 --- a/tools/cloud-build/daily-tests/tests/gke.yml +++ b/tools/cloud-build/daily-tests/tests/gke.yml @@ -16,7 +16,9 @@ test_name: hpc-gke deployment_name: gke-{{ build }} zone: us-central1-a # for remote node workspace: /workspace -blueprint_yaml: "{{ workspace }}/examples/hpc-gke.yaml" +blueprint_yaml: "{{ workspace }}/tools/cloud-build/daily-tests/blueprints/hpc-gke.yaml" network: "{{ deployment_name }}-net" remote_node: "{{ deployment_name }}-0" +cli_deployment_vars: + gcp_public_cidrs_access_enabled: true post_deploy_tests: [] diff --git a/tools/cloud-build/daily-tests/tests/ml-gke-e2e.yml b/tools/cloud-build/daily-tests/tests/ml-gke-e2e.yml index 6c4a4e0b37..009a93d069 100644 --- a/tools/cloud-build/daily-tests/tests/ml-gke-e2e.yml +++ b/tools/cloud-build/daily-tests/tests/ml-gke-e2e.yml @@ -22,6 +22,7 @@ network: "{{ deployment_name }}-net" remote_node: "{{ deployment_name }}-0" cli_deployment_vars: region: "{{ region }}" + gcp_public_cidrs_access_enabled: true custom_vars: project: "{{ project }}" post_deploy_tests: diff --git a/tools/cloud-build/daily-tests/tests/ml-gke.yml b/tools/cloud-build/daily-tests/tests/ml-gke.yml index d26cab3869..fec60209dd 100644 --- a/tools/cloud-build/daily-tests/tests/ml-gke.yml +++ b/tools/cloud-build/daily-tests/tests/ml-gke.yml @@ -17,11 +17,12 @@ deployment_name: ml-gke-{{ build }} region: asia-southeast1 zone: asia-southeast1-b # for remote node workspace: /workspace -blueprint_yaml: "{{ workspace }}/examples/ml-gke.yaml" +blueprint_yaml: "{{ workspace }}/tools/cloud-build/daily-tests/blueprints/ml-gke.yaml" network: "{{ deployment_name }}-net" remote_node: "{{ deployment_name }}-0" cli_deployment_vars: region: "{{ region }}" + gcp_public_cidrs_access_enabled: true custom_vars: project: "{{ project }}" post_deploy_tests: From 88d22e5a31375b7d35e1f6e87e401ee3b5fdade1 Mon Sep 17 00:00:00 2001 From: Atul Rajmane Date: Wed, 13 Nov 2024 06:32:18 +0000 Subject: [PATCH 2/5] Correct references --- tools/cloud-build/daily-tests/builds/gke-a3-highgpu.yaml | 2 +- tools/cloud-build/daily-tests/builds/gke-a3-megagpu.yaml | 2 +- .../daily-tests/builds/gke-storage-parallelstore.yaml | 2 +- tools/cloud-build/daily-tests/builds/gke-storage.yaml | 2 +- tools/cloud-build/daily-tests/builds/gke.yaml | 2 +- tools/cloud-build/daily-tests/builds/ml-gke.yaml | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/cloud-build/daily-tests/builds/gke-a3-highgpu.yaml b/tools/cloud-build/daily-tests/builds/gke-a3-highgpu.yaml index 2ad20f6b8d..a62fbae307 100644 --- a/tools/cloud-build/daily-tests/builds/gke-a3-highgpu.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-a3-highgpu.yaml @@ -37,7 +37,7 @@ steps: cd /workspace && make BUILD_ID_FULL=$BUILD_ID BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} - EXAMPLE_BP=examples/gke-a3-highgpu.yaml + EXAMPLE_BP=tools/cloud-build/daily-tests/blueprints/gke-a3-highgpu.yaml # Replacing the static subnet name to prevent collisions sed -i "s/gke-subnet-a3-highgpu/gke-subnet-a3-highgpu-$${BUILD_ID_SHORT}/" $${EXAMPLE_BP} diff --git a/tools/cloud-build/daily-tests/builds/gke-a3-megagpu.yaml b/tools/cloud-build/daily-tests/builds/gke-a3-megagpu.yaml index 05c5ce1097..502f48e9a0 100644 --- a/tools/cloud-build/daily-tests/builds/gke-a3-megagpu.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-a3-megagpu.yaml @@ -37,7 +37,7 @@ steps: cd /workspace && make BUILD_ID_FULL=$BUILD_ID BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} - EXAMPLE_BP=examples/gke-a3-megagpu.yaml + EXAMPLE_BP=tools/cloud-build/daily-tests/blueprints/gke-a3-megagpu.yaml # Replacing the static subnet name to prevent collisions sed -i "s/gke-subnet-a3-mega/gke-subnet-a3-mega-$${BUILD_ID_SHORT}/" $${EXAMPLE_BP} diff --git a/tools/cloud-build/daily-tests/builds/gke-storage-parallelstore.yaml b/tools/cloud-build/daily-tests/builds/gke-storage-parallelstore.yaml index 1a6a5873cf..b188a89ae5 100644 --- a/tools/cloud-build/daily-tests/builds/gke-storage-parallelstore.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-storage-parallelstore.yaml @@ -39,7 +39,7 @@ steps: cd /workspace && make BUILD_ID_FULL=$BUILD_ID BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} - SG_EXAMPLE=examples/gke-storage-parallelstore.yaml + SG_EXAMPLE=tools/cloud-build/daily-tests/blueprints/gke-storage-parallelstore.yaml # adding vm to act as remote node echo ' - id: remote-node' >> $${SG_EXAMPLE} diff --git a/tools/cloud-build/daily-tests/builds/gke-storage.yaml b/tools/cloud-build/daily-tests/builds/gke-storage.yaml index 1e4a11998a..15d751ee38 100644 --- a/tools/cloud-build/daily-tests/builds/gke-storage.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-storage.yaml @@ -41,7 +41,7 @@ steps: cd /workspace && make BUILD_ID_FULL=$BUILD_ID BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} - SG_EXAMPLE=examples/storage-gke.yaml + SG_EXAMPLE=tools/cloud-build/daily-tests/blueprints/storage-gke.yaml/storage-gke.yaml # adding vm to act as remote node echo ' - id: remote-node' >> $${SG_EXAMPLE} diff --git a/tools/cloud-build/daily-tests/builds/gke.yaml b/tools/cloud-build/daily-tests/builds/gke.yaml index b73409a94f..da8f1bc05f 100644 --- a/tools/cloud-build/daily-tests/builds/gke.yaml +++ b/tools/cloud-build/daily-tests/builds/gke.yaml @@ -37,7 +37,7 @@ steps: cd /workspace && make BUILD_ID_FULL=$BUILD_ID BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} - SG_EXAMPLE=examples/hpc-gke.yaml + SG_EXAMPLE=tools/cloud-build/daily-tests/blueprints/hpc-gke.yaml # adding vm to act as remote node echo ' - id: remote-node' >> $${SG_EXAMPLE} diff --git a/tools/cloud-build/daily-tests/builds/ml-gke.yaml b/tools/cloud-build/daily-tests/builds/ml-gke.yaml index a3b83c6fa8..d278f42f0d 100644 --- a/tools/cloud-build/daily-tests/builds/ml-gke.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-gke.yaml @@ -37,7 +37,7 @@ steps: cd /workspace && make BUILD_ID_FULL=$BUILD_ID BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} - SG_EXAMPLE=examples/ml-gke.yaml + SG_EXAMPLE=tools/cloud-build/daily-tests/blueprints/ml-gke.yaml # adding vm to act as remote node echo ' - id: remote-node' >> $${SG_EXAMPLE} From ac10c3c5ad3fc033e8dab87897fcf3b098adbdc2 Mon Sep 17 00:00:00 2001 From: Atul Rajmane Date: Wed, 13 Nov 2024 12:40:53 +0000 Subject: [PATCH 3/5] Correct references --- tools/cloud-build/daily-tests/builds/gke-storage.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cloud-build/daily-tests/builds/gke-storage.yaml b/tools/cloud-build/daily-tests/builds/gke-storage.yaml index 15d751ee38..d5692583fb 100644 --- a/tools/cloud-build/daily-tests/builds/gke-storage.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-storage.yaml @@ -41,7 +41,7 @@ steps: cd /workspace && make BUILD_ID_FULL=$BUILD_ID BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} - SG_EXAMPLE=tools/cloud-build/daily-tests/blueprints/storage-gke.yaml/storage-gke.yaml + SG_EXAMPLE=tools/cloud-build/daily-tests/blueprints/storage-gke.yaml # adding vm to act as remote node echo ' - id: remote-node' >> $${SG_EXAMPLE} From 36c4fbbcaf1c0cea956e69185ed036eba05242bd Mon Sep 17 00:00:00 2001 From: Atul Rajmane Date: Thu, 14 Nov 2024 07:30:04 +0000 Subject: [PATCH 4/5] Use the example blueprints --- examples/gke-a3-highgpu.yaml | 1 + examples/gke-a3-megagpu.yaml | 1 + examples/gke-storage-parallelstore.yaml | 1 + examples/hpc-gke.yaml | 1 + examples/ml-gke.yaml | 1 + examples/storage-gke.yaml | 1 + .../blueprints/gke-a3-highgpu.yaml | 92 --------- .../blueprints/gke-a3-megagpu.yaml | 92 --------- .../blueprints/gke-storage-parallelstore.yaml | 122 ------------ .../daily-tests/blueprints/hpc-gke.yaml | 71 ------- .../daily-tests/blueprints/ml-gke.yaml | 82 -------- .../daily-tests/blueprints/storage-gke.yaml | 182 ------------------ .../daily-tests/builds/gke-a3-highgpu.yaml | 2 +- .../daily-tests/builds/gke-a3-megagpu.yaml | 2 +- .../builds/gke-storage-parallelstore.yaml | 2 +- .../daily-tests/builds/gke-storage.yaml | 2 +- tools/cloud-build/daily-tests/builds/gke.yaml | 2 +- .../daily-tests/builds/ml-gke.yaml | 2 +- .../daily-tests/tests/gke-a3-highgpu.yml | 2 +- .../daily-tests/tests/gke-a3-megagpu.yml | 2 +- .../tests/gke-storage-parallelstore.yml | 2 +- .../daily-tests/tests/gke-storage.yml | 2 +- tools/cloud-build/daily-tests/tests/gke.yml | 2 +- .../cloud-build/daily-tests/tests/ml-gke.yml | 2 +- 24 files changed, 18 insertions(+), 653 deletions(-) delete mode 100644 tools/cloud-build/daily-tests/blueprints/gke-a3-highgpu.yaml delete mode 100644 tools/cloud-build/daily-tests/blueprints/gke-a3-megagpu.yaml delete mode 100644 tools/cloud-build/daily-tests/blueprints/gke-storage-parallelstore.yaml delete mode 100644 tools/cloud-build/daily-tests/blueprints/hpc-gke.yaml delete mode 100644 tools/cloud-build/daily-tests/blueprints/ml-gke.yaml delete mode 100644 tools/cloud-build/daily-tests/blueprints/storage-gke.yaml diff --git a/examples/gke-a3-highgpu.yaml b/examples/gke-a3-highgpu.yaml index 1c19dcd2e6..e2c6cede29 100644 --- a/examples/gke-a3-highgpu.yaml +++ b/examples/gke-a3-highgpu.yaml @@ -67,6 +67,7 @@ deployment_groups: use: [network1, gpunets, gke_service_account] settings: enable_private_endpoint: false # Allows for access from authorized public IPs + gcp_public_cidrs_access_enabled: $(vars.gcp_public_cidrs_access_enabled) master_authorized_networks: - cidr_block: $(vars.authorized_cidr) # Allows your machine run kubectl command. It's required for the multi-network setup. display_name: "kubectl-access-network" diff --git a/examples/gke-a3-megagpu.yaml b/examples/gke-a3-megagpu.yaml index 1198b520c0..79e905e343 100644 --- a/examples/gke-a3-megagpu.yaml +++ b/examples/gke-a3-megagpu.yaml @@ -67,6 +67,7 @@ deployment_groups: use: [network1, gpunets, gke_service_account] settings: enable_private_endpoint: false # Allows for access from authorized public IPs + gcp_public_cidrs_access_enabled: $(vars.gcp_public_cidrs_access_enabled) master_authorized_networks: - cidr_block: $(vars.authorized_cidr) # Allows your machine run kubectl command. It's required for the multi-network setup. display_name: "kubectl-access-network" diff --git a/examples/gke-storage-parallelstore.yaml b/examples/gke-storage-parallelstore.yaml index 9ffe737e83..1b6d26b862 100644 --- a/examples/gke-storage-parallelstore.yaml +++ b/examples/gke-storage-parallelstore.yaml @@ -52,6 +52,7 @@ deployment_groups: enable_parallelstore_csi: true # enable Parallelstore for the cluster configure_workload_identity_sa: true enable_private_endpoint: false # Allows for access from authorized public IPs + gcp_public_cidrs_access_enabled: $(vars.gcp_public_cidrs_access_enabled) master_authorized_networks: - display_name: deployment-machine cidr_block: $(vars.authorized_cidr) diff --git a/examples/hpc-gke.yaml b/examples/hpc-gke.yaml index f927fd8169..0b3cf679d1 100644 --- a/examples/hpc-gke.yaml +++ b/examples/hpc-gke.yaml @@ -52,6 +52,7 @@ deployment_groups: use: [network1, gke_service_account] settings: enable_private_endpoint: false # Allows for access from authorized public IPs + gcp_public_cidrs_access_enabled: $(vars.gcp_public_cidrs_access_enabled) outputs: [instructions] - id: compute_pool diff --git a/examples/ml-gke.yaml b/examples/ml-gke.yaml index cbce0a6c1a..f70af7f8d2 100644 --- a/examples/ml-gke.yaml +++ b/examples/ml-gke.yaml @@ -57,6 +57,7 @@ deployment_groups: use: [network1, gke_service_account] settings: enable_private_endpoint: false # Allows for access from authorized public IPs + gcp_public_cidrs_access_enabled: $(vars.gcp_public_cidrs_access_enabled) master_authorized_networks: - display_name: deployment-machine cidr_block: $(vars.authorized_cidr) diff --git a/examples/storage-gke.yaml b/examples/storage-gke.yaml index a257f97c49..39f265797e 100644 --- a/examples/storage-gke.yaml +++ b/examples/storage-gke.yaml @@ -58,6 +58,7 @@ deployment_groups: enable_gcsfuse_csi: true configure_workload_identity_sa: true # needed when using GCS enable_private_endpoint: false # Allows for access from authorized public IPs + gcp_public_cidrs_access_enabled: $(vars.gcp_public_cidrs_access_enabled) master_authorized_networks: - display_name: deployment-machine cidr_block: $(vars.authorized_cidr) diff --git a/tools/cloud-build/daily-tests/blueprints/gke-a3-highgpu.yaml b/tools/cloud-build/daily-tests/blueprints/gke-a3-highgpu.yaml deleted file mode 100644 index e2c6cede29..0000000000 --- a/tools/cloud-build/daily-tests/blueprints/gke-a3-highgpu.yaml +++ /dev/null @@ -1,92 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- - -blueprint_name: gke-a3-highgpu - -vars: - project_id: ## Set GCP Project ID Here ## - deployment_name: gke-a3-highgpu - region: us-central1 - zone: us-central1-c - - # Cidr block containing the IP of the machine calling terraform. - # The following line must be updated for this example to work. - authorized_cidr: /32 - -deployment_groups: -- group: primary - modules: - - id: network1 - source: modules/network/vpc - settings: - subnetwork_name: gke-subnet-a3-highgpu - mtu: 8244 - secondary_ranges: - gke-subnet-a3-highgpu: - - range_name: pods - ip_cidr_range: 10.4.0.0/14 - - range_name: services - ip_cidr_range: 10.0.32.0/20 - - - id: gke_service_account - source: community/modules/project/service-account - settings: - name: gke-sa - project_roles: - - logging.logWriter - - monitoring.metricWriter - - monitoring.viewer - - stackdriver.resourceMetadata.writer - - storage.objectViewer - - artifactregistry.reader - - - id: gpunets - source: modules/network/multivpc - settings: - network_name_prefix: $(vars.deployment_name)-gpunet - global_ip_address_range: 192.169.0.0/16 - network_count: 4 - subnetwork_cidr_suffix: 24 - mtu: 8244 - - - id: gke_cluster - source: modules/scheduler/gke-cluster - use: [network1, gpunets, gke_service_account] - settings: - enable_private_endpoint: false # Allows for access from authorized public IPs - gcp_public_cidrs_access_enabled: $(vars.gcp_public_cidrs_access_enabled) - master_authorized_networks: - - cidr_block: $(vars.authorized_cidr) # Allows your machine run kubectl command. It's required for the multi-network setup. - display_name: "kubectl-access-network" - outputs: [instructions] - - - id: a3_highgpu_pool - source: modules/compute/gke-node-pool - use: [gke_cluster, gpunets, gke_service_account] - settings: - machine_type: a3-highgpu-8g - autoscaling_total_min_nodes: 2 - zones: [$(vars.zone)] - outputs: [instructions] - - - id: workload_component_install - source: modules/management/kubectl-apply - use: [gke_cluster] - settings: - kueue: - install: true - jobset: - install: true diff --git a/tools/cloud-build/daily-tests/blueprints/gke-a3-megagpu.yaml b/tools/cloud-build/daily-tests/blueprints/gke-a3-megagpu.yaml deleted file mode 100644 index 79e905e343..0000000000 --- a/tools/cloud-build/daily-tests/blueprints/gke-a3-megagpu.yaml +++ /dev/null @@ -1,92 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- - -blueprint_name: gke-a3-mega - -vars: - project_id: ## Set GCP Project ID Here ## - deployment_name: gke-a3-mega - region: us-central1 - zone: us-central1-c - - # Cidr block containing the IP of the machine calling terraform. - # The following line must be updated for this example to work. - authorized_cidr: /32 - -deployment_groups: -- group: primary - modules: - - id: network1 - source: modules/network/vpc - settings: - subnetwork_name: gke-subnet-a3-mega - mtu: 8244 - secondary_ranges: - gke-subnet-a3-mega: - - range_name: pods - ip_cidr_range: 10.4.0.0/14 - - range_name: services - ip_cidr_range: 10.0.32.0/20 - - - id: gke_service_account - source: community/modules/project/service-account - settings: - name: gke-sa - project_roles: - - logging.logWriter - - monitoring.metricWriter - - monitoring.viewer - - stackdriver.resourceMetadata.writer - - storage.objectViewer - - artifactregistry.reader - - - id: gpunets - source: modules/network/multivpc - settings: - network_name_prefix: $(vars.deployment_name)-gpunet - global_ip_address_range: 192.169.0.0/16 - network_count: 8 - subnetwork_cidr_suffix: 24 - mtu: 8244 - - - id: gke_cluster - source: modules/scheduler/gke-cluster - use: [network1, gpunets, gke_service_account] - settings: - enable_private_endpoint: false # Allows for access from authorized public IPs - gcp_public_cidrs_access_enabled: $(vars.gcp_public_cidrs_access_enabled) - master_authorized_networks: - - cidr_block: $(vars.authorized_cidr) # Allows your machine run kubectl command. It's required for the multi-network setup. - display_name: "kubectl-access-network" - outputs: [instructions] - - - id: a3_megagpu_pool - source: modules/compute/gke-node-pool - use: [gke_cluster, gpunets, gke_service_account] - settings: - machine_type: a3-megagpu-8g - autoscaling_total_min_nodes: 2 - zones: [$(vars.zone)] - outputs: [instructions] - - - id: workload_manager_install - source: modules/management/kubectl-apply - use: [gke_cluster] - settings: - kueue: - install: true - jobset: - install: true diff --git a/tools/cloud-build/daily-tests/blueprints/gke-storage-parallelstore.yaml b/tools/cloud-build/daily-tests/blueprints/gke-storage-parallelstore.yaml deleted file mode 100644 index 1b6d26b862..0000000000 --- a/tools/cloud-build/daily-tests/blueprints/gke-storage-parallelstore.yaml +++ /dev/null @@ -1,122 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- -blueprint_name: gke-storage-parallelstore -vars: - project_id: ## Set GCP Project ID Here ## - deployment_name: gke-storage-parallelstore - region: us-central1 - zone: us-central1-c - - # Cidr block containing the IP of the machine calling terraform. - # The following line must be updated for this example to work. - authorized_cidr: /32 - -deployment_groups: -- group: setup - modules: - - id: network - source: modules/network/vpc - settings: - subnetwork_name: gke-subnet-parallelstore - secondary_ranges: - gke-subnet-parallelstore: - - range_name: pods - ip_cidr_range: 10.4.0.0/14 - - range_name: services - ip_cidr_range: 10.0.32.0/20 - - - id: private_service_access # required for parallelstore - source: community/modules/network/private-service-access - use: [network] - settings: - prefix_length: 24 - -- group: primary - modules: - - id: gke_cluster - source: modules/scheduler/gke-cluster - use: [network] - settings: - enable_parallelstore_csi: true # enable Parallelstore for the cluster - configure_workload_identity_sa: true - enable_private_endpoint: false # Allows for access from authorized public IPs - gcp_public_cidrs_access_enabled: $(vars.gcp_public_cidrs_access_enabled) - master_authorized_networks: - - display_name: deployment-machine - cidr_block: $(vars.authorized_cidr) - outputs: [instructions] - - ### Set up storage class and persistent volume claim for Parallelstore ### - - id: parallelstore-setup - source: modules/file-system/gke-storage - use: [gke_cluster, private_service_access] - settings: - storage_type: Parallelstore - access_mode: ReadWriteMany - sc_volume_binding_mode: Immediate - sc_reclaim_policy: Delete # Use Retain if you want to volume and parallelstore resource will remain after - sc_topology_zones: [$(vars.zone)] - pvc_count: 1 - capacity_gb: 12000 # from 12,000 GiB to 100,000 GiB, in multiples of 4,000 GiB - - - id: sample-pool - source: modules/compute/gke-node-pool - use: [gke_cluster] - settings: - name: sample-pool - zones: [$(vars.zone)] - machine_type: n2-standard-16 - - # Train a TensorFlow model with Keras and Parallelstore on GKE - # Tutorial: https://cloud.google.com/parallelstore/docs/tensorflow-sample - - - id: parallelstore-job - source: modules/compute/gke-job-template - use: - - gke_cluster - - parallelstore-setup - settings: - name: tensorflow - image: jupyter/tensorflow-notebook@sha256:173f124f638efe870bb2b535e01a76a80a95217e66ed00751058c51c09d6d85d - security_context: # to make sure the job have enough access to execute the jobs and r/w from parallelstore - - key: runAsUser - value: 1000 - - key: runAsGroup - value: 100 - - key: fsGroup - value: 100 - command: - - bash - - -c - - | - pip install transformers datasets - python - </32 - -deployment_groups: -- group: primary - modules: - - id: network1 - source: modules/network/vpc - settings: - subnetwork_name: gke-subnet - secondary_ranges: - gke-subnet: - - range_name: pods - ip_cidr_range: 10.4.0.0/14 - - range_name: services - ip_cidr_range: 10.0.32.0/20 - - - id: gke_service_account - source: community/modules/project/service-account - settings: - name: gke-sa - project_roles: - - logging.logWriter - - monitoring.metricWriter - - monitoring.viewer - - stackdriver.resourceMetadata.writer - - storage.objectViewer - - artifactregistry.reader - - - id: gke_cluster - source: modules/scheduler/gke-cluster - use: [network1, gke_service_account] - settings: - enable_private_endpoint: false # Allows for access from authorized public IPs - master_authorized_networks: - - display_name: deployment-machine - cidr_block: $(vars.authorized_cidr) - configure_workload_identity_sa: true - gcp_public_cidrs_access_enabled: $(vars.gcp_public_cidrs_access_enabled) - outputs: [instructions] - - - id: g2_pool - source: modules/compute/gke-node-pool - use: [gke_cluster, gke_service_account] - settings: - disk_type: pd-balanced - machine_type: g2-standard-4 - - - id: job_template - source: modules/compute/gke-job-template - use: [g2_pool] - settings: - image: nvidia/cuda:11.0.3-runtime-ubuntu20.04 - command: - - nvidia-smi - node_count: 1 - outputs: [instructions] diff --git a/tools/cloud-build/daily-tests/blueprints/storage-gke.yaml b/tools/cloud-build/daily-tests/blueprints/storage-gke.yaml deleted file mode 100644 index 39f265797e..0000000000 --- a/tools/cloud-build/daily-tests/blueprints/storage-gke.yaml +++ /dev/null @@ -1,182 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- -blueprint_name: storage-gke - -vars: - project_id: ## Set GCP Project ID Here ## - deployment_name: storage-gke-01 - region: us-central1 - zone: us-central1-c - - # Cidr block containing the IP of the machine calling terraform. - # The following line must be updated for this example to work. - authorized_cidr: /32 - -deployment_groups: -- group: primary - modules: - - id: network1 - source: modules/network/vpc - settings: - subnetwork_name: gke-subnet - secondary_ranges: - gke-subnet: - - range_name: pods - ip_cidr_range: 10.4.0.0/14 - - range_name: services - ip_cidr_range: 10.0.32.0/20 - - - id: gke_service_account - source: community/modules/project/service-account - settings: - name: gke-sa - project_roles: - - logging.logWriter - - monitoring.metricWriter - - monitoring.viewer - - stackdriver.resourceMetadata.writer - - storage.objectViewer - - artifactregistry.reader - - - id: gke_cluster - source: modules/scheduler/gke-cluster - use: [network1, gke_service_account] - settings: - enable_filestore_csi: true - enable_gcsfuse_csi: true - configure_workload_identity_sa: true # needed when using GCS - enable_private_endpoint: false # Allows for access from authorized public IPs - gcp_public_cidrs_access_enabled: $(vars.gcp_public_cidrs_access_enabled) - master_authorized_networks: - - display_name: deployment-machine - cidr_block: $(vars.authorized_cidr) - outputs: [instructions] - - - id: debug_pool - source: modules/compute/gke-node-pool - use: [gke_cluster, gke_service_account] - settings: - name: debug - zones: [$(vars.zone)] - machine_type: n2d-standard-2 - - ### Google Cloud Storage ### - - - id: data-bucket - source: community/modules/file-system/cloud-storage-bucket - settings: - local_mount: /data - random_suffix: true - force_destroy: true - - - id: data-bucket-pv - source: modules/file-system/gke-persistent-volume - use: [gke_cluster, data-bucket] - settings: {capacity_gb: 5000} - - ### Filestore ### - - - id: filestore - source: modules/file-system/filestore - use: [network1] - settings: {local_mount: /shared} - - - id: shared-filestore-pv - source: modules/file-system/gke-persistent-volume - use: [gke_cluster, filestore] - - ### Shared Storage Job ### - - - id: shared-fs-job - source: modules/compute/gke-job-template - use: - - gke_cluster - - debug_pool - - shared-filestore-pv - - data-bucket-pv - settings: - image: bash - command: - - bash - - -c - - | - echo "Set up job folders" - shopt -s extglob; JOB=${HOSTNAME%%-+([[:digit:]])} - mkdir /data/${JOB}/ -p; mkdir /shared/${JOB}/ -p; - - echo "Writing seed file to GCS" - dd if=/dev/urandom of=/data/${JOB}/${JOB_COMPLETION_INDEX}.dat bs=1K count=1000 - - echo "Copy seed data from GCS to Filestore" - cp /data/${JOB}/${JOB_COMPLETION_INDEX}.dat /shared/${JOB}/ - - echo "Hash file from Filestore and save to GCS" - md5sum /shared/${JOB}/${JOB_COMPLETION_INDEX}.dat > /data/${JOB}/${JOB_COMPLETION_INDEX}.md5 - node_count: 5 - outputs: [instructions] - - ### Ephemeral Storage ### - - - id: local-ssd-pool - source: modules/compute/gke-node-pool - use: [gke_cluster, gke_service_account] - settings: - name: local-ssd - machine_type: n2d-standard-2 - local_ssd_count_ephemeral_storage: 1 - - - id: ephemeral-storage-job - source: modules/compute/gke-job-template - use: [local-ssd-pool] - settings: - name: ephemeral-storage-job - ephemeral_volumes: # below shows all options, usually only 1 is needed - - - type: memory # backed by node memory - mount_path: /scratch-mem - size_gb: 5 - - - type: local-ssd # node pool must specify local_ssd_count_ephemeral_storage - mount_path: /scratch-local-ssd - size_gb: 280 # System holds back some of 375 GiB - - - type: pd-ssd - mount_path: /pd-ssd - size_gb: 100 - - - type: pd-balanced - mount_path: /pd-balanced - size_gb: 100 - - image: ljishen/fio - command: # https://cloud.google.com/compute/docs/disks/benchmarking-pd-performance - - fio - - --name=write_throughput - - --directory=/scratch-local-ssd - - --numjobs=16 - - --size=5G - - --time_based - - --runtime=30s - - --ramp_time=2s - - --ioengine=libaio - - --direct=1 - - --verify=0 - - --bs=1M - - --iodepth=64 - - --rw=write - - --group_reporting=1 - - --iodepth_batch_submit=64 - - --iodepth_batch_complete_max=64 - outputs: [instructions] diff --git a/tools/cloud-build/daily-tests/builds/gke-a3-highgpu.yaml b/tools/cloud-build/daily-tests/builds/gke-a3-highgpu.yaml index a62fbae307..2ad20f6b8d 100644 --- a/tools/cloud-build/daily-tests/builds/gke-a3-highgpu.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-a3-highgpu.yaml @@ -37,7 +37,7 @@ steps: cd /workspace && make BUILD_ID_FULL=$BUILD_ID BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} - EXAMPLE_BP=tools/cloud-build/daily-tests/blueprints/gke-a3-highgpu.yaml + EXAMPLE_BP=examples/gke-a3-highgpu.yaml # Replacing the static subnet name to prevent collisions sed -i "s/gke-subnet-a3-highgpu/gke-subnet-a3-highgpu-$${BUILD_ID_SHORT}/" $${EXAMPLE_BP} diff --git a/tools/cloud-build/daily-tests/builds/gke-a3-megagpu.yaml b/tools/cloud-build/daily-tests/builds/gke-a3-megagpu.yaml index 502f48e9a0..05c5ce1097 100644 --- a/tools/cloud-build/daily-tests/builds/gke-a3-megagpu.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-a3-megagpu.yaml @@ -37,7 +37,7 @@ steps: cd /workspace && make BUILD_ID_FULL=$BUILD_ID BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} - EXAMPLE_BP=tools/cloud-build/daily-tests/blueprints/gke-a3-megagpu.yaml + EXAMPLE_BP=examples/gke-a3-megagpu.yaml # Replacing the static subnet name to prevent collisions sed -i "s/gke-subnet-a3-mega/gke-subnet-a3-mega-$${BUILD_ID_SHORT}/" $${EXAMPLE_BP} diff --git a/tools/cloud-build/daily-tests/builds/gke-storage-parallelstore.yaml b/tools/cloud-build/daily-tests/builds/gke-storage-parallelstore.yaml index b188a89ae5..1a6a5873cf 100644 --- a/tools/cloud-build/daily-tests/builds/gke-storage-parallelstore.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-storage-parallelstore.yaml @@ -39,7 +39,7 @@ steps: cd /workspace && make BUILD_ID_FULL=$BUILD_ID BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} - SG_EXAMPLE=tools/cloud-build/daily-tests/blueprints/gke-storage-parallelstore.yaml + SG_EXAMPLE=examples/gke-storage-parallelstore.yaml # adding vm to act as remote node echo ' - id: remote-node' >> $${SG_EXAMPLE} diff --git a/tools/cloud-build/daily-tests/builds/gke-storage.yaml b/tools/cloud-build/daily-tests/builds/gke-storage.yaml index d5692583fb..1e4a11998a 100644 --- a/tools/cloud-build/daily-tests/builds/gke-storage.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-storage.yaml @@ -41,7 +41,7 @@ steps: cd /workspace && make BUILD_ID_FULL=$BUILD_ID BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} - SG_EXAMPLE=tools/cloud-build/daily-tests/blueprints/storage-gke.yaml + SG_EXAMPLE=examples/storage-gke.yaml # adding vm to act as remote node echo ' - id: remote-node' >> $${SG_EXAMPLE} diff --git a/tools/cloud-build/daily-tests/builds/gke.yaml b/tools/cloud-build/daily-tests/builds/gke.yaml index da8f1bc05f..b73409a94f 100644 --- a/tools/cloud-build/daily-tests/builds/gke.yaml +++ b/tools/cloud-build/daily-tests/builds/gke.yaml @@ -37,7 +37,7 @@ steps: cd /workspace && make BUILD_ID_FULL=$BUILD_ID BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} - SG_EXAMPLE=tools/cloud-build/daily-tests/blueprints/hpc-gke.yaml + SG_EXAMPLE=examples/hpc-gke.yaml # adding vm to act as remote node echo ' - id: remote-node' >> $${SG_EXAMPLE} diff --git a/tools/cloud-build/daily-tests/builds/ml-gke.yaml b/tools/cloud-build/daily-tests/builds/ml-gke.yaml index d278f42f0d..a3b83c6fa8 100644 --- a/tools/cloud-build/daily-tests/builds/ml-gke.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-gke.yaml @@ -37,7 +37,7 @@ steps: cd /workspace && make BUILD_ID_FULL=$BUILD_ID BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} - SG_EXAMPLE=tools/cloud-build/daily-tests/blueprints/ml-gke.yaml + SG_EXAMPLE=examples/ml-gke.yaml # adding vm to act as remote node echo ' - id: remote-node' >> $${SG_EXAMPLE} diff --git a/tools/cloud-build/daily-tests/tests/gke-a3-highgpu.yml b/tools/cloud-build/daily-tests/tests/gke-a3-highgpu.yml index a7cf3c5d05..ea7e105141 100644 --- a/tools/cloud-build/daily-tests/tests/gke-a3-highgpu.yml +++ b/tools/cloud-build/daily-tests/tests/gke-a3-highgpu.yml @@ -19,7 +19,7 @@ test_name: gke-a3high deployment_name: gke-a3high-{{ build }} workspace: /workspace -blueprint_yaml: "{{ workspace }}/tools/cloud-build/daily-tests/blueprints/gke-a3-highgpu.yaml" +blueprint_yaml: "{{ workspace }}/examples/gke-a3-highgpu.yaml" network: "gke-a3high-net-{{ build }}" region: us-west1 zone: us-west1-a diff --git a/tools/cloud-build/daily-tests/tests/gke-a3-megagpu.yml b/tools/cloud-build/daily-tests/tests/gke-a3-megagpu.yml index d7165f44c5..f24facfe68 100644 --- a/tools/cloud-build/daily-tests/tests/gke-a3-megagpu.yml +++ b/tools/cloud-build/daily-tests/tests/gke-a3-megagpu.yml @@ -19,7 +19,7 @@ test_name: gke-a3mega deployment_name: gke-a3mega-{{ build }} workspace: /workspace -blueprint_yaml: "{{ workspace }}/tools/cloud-build/daily-tests/blueprints/gke-a3-megagpu.yaml" +blueprint_yaml: "{{ workspace }}/examples/gke-a3-megagpu.yaml" network: "gke-a3mega-net-{{ build }}" region: us-west4 zone: us-west4-a diff --git a/tools/cloud-build/daily-tests/tests/gke-storage-parallelstore.yml b/tools/cloud-build/daily-tests/tests/gke-storage-parallelstore.yml index bd76b26844..a6de4bf239 100644 --- a/tools/cloud-build/daily-tests/tests/gke-storage-parallelstore.yml +++ b/tools/cloud-build/daily-tests/tests/gke-storage-parallelstore.yml @@ -17,7 +17,7 @@ deployment_name: gke-storage-parallelstore-{{ build }} zone: us-central1-a # for remote node region: us-central1 workspace: /workspace -blueprint_yaml: "{{ workspace }}/tools/cloud-build/daily-tests/blueprints/gke-storage-parallelstore.yaml" +blueprint_yaml: "{{ workspace }}/examples/gke-storage-parallelstore.yaml" network: "{{ deployment_name }}-net" remote_node: "{{ deployment_name }}-0" post_deploy_tests: diff --git a/tools/cloud-build/daily-tests/tests/gke-storage.yml b/tools/cloud-build/daily-tests/tests/gke-storage.yml index b34d2a7aa2..f2addf9432 100644 --- a/tools/cloud-build/daily-tests/tests/gke-storage.yml +++ b/tools/cloud-build/daily-tests/tests/gke-storage.yml @@ -16,7 +16,7 @@ test_name: storage-gke deployment_name: gke-storage-{{ build }} zone: us-central1-a # for remote node workspace: /workspace -blueprint_yaml: "{{ workspace }}/tools/cloud-build/daily-tests/blueprints/storage-gke.yaml" +blueprint_yaml: "{{ workspace }}/examples/storage-gke.yaml" network: "{{ test_name }}-net" remote_node: "{{ deployment_name }}-0" post_deploy_tests: [] diff --git a/tools/cloud-build/daily-tests/tests/gke.yml b/tools/cloud-build/daily-tests/tests/gke.yml index c97b716bd0..bed7e02d59 100644 --- a/tools/cloud-build/daily-tests/tests/gke.yml +++ b/tools/cloud-build/daily-tests/tests/gke.yml @@ -16,7 +16,7 @@ test_name: hpc-gke deployment_name: gke-{{ build }} zone: us-central1-a # for remote node workspace: /workspace -blueprint_yaml: "{{ workspace }}/tools/cloud-build/daily-tests/blueprints/hpc-gke.yaml" +blueprint_yaml: "{{ workspace }}/examples/hpc-gke.yaml" network: "{{ deployment_name }}-net" remote_node: "{{ deployment_name }}-0" cli_deployment_vars: diff --git a/tools/cloud-build/daily-tests/tests/ml-gke.yml b/tools/cloud-build/daily-tests/tests/ml-gke.yml index fec60209dd..c0f40270be 100644 --- a/tools/cloud-build/daily-tests/tests/ml-gke.yml +++ b/tools/cloud-build/daily-tests/tests/ml-gke.yml @@ -17,7 +17,7 @@ deployment_name: ml-gke-{{ build }} region: asia-southeast1 zone: asia-southeast1-b # for remote node workspace: /workspace -blueprint_yaml: "{{ workspace }}/tools/cloud-build/daily-tests/blueprints/ml-gke.yaml" +blueprint_yaml: "{{ workspace }}/examples/ml-gke.yaml" network: "{{ deployment_name }}-net" remote_node: "{{ deployment_name }}-0" cli_deployment_vars: From 675f6e09ed0415898284c99685ff0075bfcff3db Mon Sep 17 00:00:00 2001 From: Atul Rajmane Date: Thu, 14 Nov 2024 07:56:10 +0000 Subject: [PATCH 5/5] Ensure the variable is defined --- examples/gke-a3-highgpu.yaml | 2 ++ examples/gke-a3-megagpu.yaml | 2 ++ examples/gke-storage-parallelstore.yaml | 2 ++ examples/hpc-gke.yaml | 1 + examples/ml-gke.yaml | 2 ++ examples/storage-gke.yaml | 2 ++ tools/cloud-build/daily-tests/blueprints/ml-gke-e2e.yaml | 2 ++ 7 files changed, 13 insertions(+) diff --git a/examples/gke-a3-highgpu.yaml b/examples/gke-a3-highgpu.yaml index e2c6cede29..e3387aa121 100644 --- a/examples/gke-a3-highgpu.yaml +++ b/examples/gke-a3-highgpu.yaml @@ -26,6 +26,8 @@ vars: # The following line must be updated for this example to work. authorized_cidr: /32 + gcp_public_cidrs_access_enabled: false + deployment_groups: - group: primary modules: diff --git a/examples/gke-a3-megagpu.yaml b/examples/gke-a3-megagpu.yaml index 79e905e343..bfee018df6 100644 --- a/examples/gke-a3-megagpu.yaml +++ b/examples/gke-a3-megagpu.yaml @@ -26,6 +26,8 @@ vars: # The following line must be updated for this example to work. authorized_cidr: /32 + gcp_public_cidrs_access_enabled: false + deployment_groups: - group: primary modules: diff --git a/examples/gke-storage-parallelstore.yaml b/examples/gke-storage-parallelstore.yaml index 1b6d26b862..f025903b37 100644 --- a/examples/gke-storage-parallelstore.yaml +++ b/examples/gke-storage-parallelstore.yaml @@ -23,6 +23,8 @@ vars: # The following line must be updated for this example to work. authorized_cidr: /32 + gcp_public_cidrs_access_enabled: false + deployment_groups: - group: setup modules: diff --git a/examples/hpc-gke.yaml b/examples/hpc-gke.yaml index 0b3cf679d1..60f4f57ce8 100644 --- a/examples/hpc-gke.yaml +++ b/examples/hpc-gke.yaml @@ -20,6 +20,7 @@ vars: project_id: ## Set GCP Project ID Here ## deployment_name: cluster-01 region: us-central1 + gcp_public_cidrs_access_enabled: false deployment_groups: - group: primary diff --git a/examples/ml-gke.yaml b/examples/ml-gke.yaml index f70af7f8d2..053a5dcedc 100644 --- a/examples/ml-gke.yaml +++ b/examples/ml-gke.yaml @@ -26,6 +26,8 @@ vars: # The following line must be updated for this example to work. authorized_cidr: /32 + gcp_public_cidrs_access_enabled: false + deployment_groups: - group: primary modules: diff --git a/examples/storage-gke.yaml b/examples/storage-gke.yaml index 39f265797e..108392cd18 100644 --- a/examples/storage-gke.yaml +++ b/examples/storage-gke.yaml @@ -24,6 +24,8 @@ vars: # The following line must be updated for this example to work. authorized_cidr: /32 + gcp_public_cidrs_access_enabled: false + deployment_groups: - group: primary modules: diff --git a/tools/cloud-build/daily-tests/blueprints/ml-gke-e2e.yaml b/tools/cloud-build/daily-tests/blueprints/ml-gke-e2e.yaml index 234c895e3f..6400189e85 100644 --- a/tools/cloud-build/daily-tests/blueprints/ml-gke-e2e.yaml +++ b/tools/cloud-build/daily-tests/blueprints/ml-gke-e2e.yaml @@ -26,6 +26,8 @@ vars: # The following line must be updated for this example to work. authorized_cidr: /32 + gcp_public_cidrs_access_enabled: false + deployment_groups: - group: primary modules: