From 33a0fd24f1b66c063319ff0f888a56dd6941a8ef Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Tue, 15 Mar 2022 19:19:06 -0700 Subject: [PATCH] Switch to using pd-balanced for all user & dask nodes We were using standard disks to save costs, but that brings with it much slower node startup time, as images being pulled take time. pd-balanced is a newer alternative to pure SSD disks that is not as expensive, but provides much better performance than pd-standard. I think the extra cost is worth the performance in all these cases. --- terraform/gcp/cluster.tf | 17 ++++++++++++---- terraform/gcp/projects/cloudbank.tfvars | 4 ++-- terraform/gcp/projects/meom-ige.tfvars | 24 +++++++++++++---------- terraform/gcp/projects/pangeo-hubs.tfvars | 12 ++++++------ terraform/gcp/projects/pilot-hubs.tfvars | 4 ++-- terraform/gcp/variables.tf | 16 +++++++-------- 6 files changed, 45 insertions(+), 32 deletions(-) diff --git a/terraform/gcp/cluster.tf b/terraform/gcp/cluster.tf index 52ed778ee..8d21e2209 100644 --- a/terraform/gcp/cluster.tf +++ b/terraform/gcp/cluster.tf @@ -194,6 +194,13 @@ resource "google_container_node_pool" "notebook" { node_config { + + # Balanced disks are much faster than standard disks, and much cheaper + # than SSD disks. It contributes heavily to how fast new nodes spin up, + # as images being pulled takes up a lot of new node spin up time. + # Faster disks provide faster image pulls! + disk_type = "pd-balanced" + workload_metadata_config { # Config Connector requires workload identity to be enabled (via GKE_METADATA_SERVER). # If config connector is not necessary, we use simple metadata concealment @@ -257,10 +264,12 @@ resource "google_container_node_pool" "dask_worker" { node_config { preemptible = true - # SSD Disks for dask workers make image pulls much faster - # Since we might have many dask workers spinning up at the - # same time, the extra cost of using this is probably worth it. 
- disk_type = "pd-ssd" + + # Balanced disks are much faster than standard disks, and much cheaper + # than SSD disks. It contributes heavily to how fast new nodes spin up, + # as images being pulled takes up a lot of new node spin up time. + # Faster disks provide faster image pulls! + disk_type = "pd-balanced" workload_metadata_config { # Config Connector requires workload identity to be enabled (via GKE_METADATA_SERVER). diff --git a/terraform/gcp/projects/cloudbank.tfvars b/terraform/gcp/projects/cloudbank.tfvars index 7588416e5..b963734d8 100644 --- a/terraform/gcp/projects/cloudbank.tfvars +++ b/terraform/gcp/projects/cloudbank.tfvars @@ -15,7 +15,7 @@ notebook_nodes = { min : 0, max : 20, machine_type : "n1-highmem-4", - labels: {} + labels: {}, }, } @@ -24,7 +24,7 @@ dask_nodes = { min : 0, max : 100, machine_type : "n1-highmem-4", - labels: {} + labels: {}, }, } diff --git a/terraform/gcp/projects/meom-ige.tfvars b/terraform/gcp/projects/meom-ige.tfvars index b5a386734..7579f381a 100644 --- a/terraform/gcp/projects/meom-ige.tfvars +++ b/terraform/gcp/projects/meom-ige.tfvars @@ -24,31 +24,31 @@ notebook_nodes = { min : 0, max : 20, machine_type : "n1-standard-2", - labels: {} + labels: {}, }, "medium" : { min : 0, max : 20, machine_type : "n1-standard-8", - labels: {} + labels: {}, }, "large" : { min : 0, max : 20, machine_type : "n1-standard-16", - labels: {} + labels: {}, }, "very-large" : { min : 0, max : 20, machine_type : "n1-standard-32", - labels: {} + labels: {}, }, "huge" : { min : 0, max : 20, machine_type : "n1-standard-64", - labels: {} + labels: {}, }, } @@ -58,31 +58,35 @@ dask_nodes = { min : 0, max : 20, machine_type : "n1-standard-2", - labels: {} + labels: {}, }, "medium" : { min : 0, max : 20, machine_type : "n1-standard-8", - labels: {} + labels: {}, + disk_type: "pd-ssd" }, "large" : { min : 0, max : 20, machine_type : "n1-standard-16", - labels: {} + labels: {}, + disk_type: "pd-ssd" }, "very-large" : { min : 0, max : 20, machine_type 
: "n1-standard-32", - labels: {} + labels: {}, + disk_type: "pd-ssd" }, "huge" : { min : 0, max : 20, machine_type : "n1-standard-64", - labels: {} + labels: {}, + disk_type: "pd-ssd" }, } diff --git a/terraform/gcp/projects/pangeo-hubs.tfvars b/terraform/gcp/projects/pangeo-hubs.tfvars index ea95097a8..656bf43da 100644 --- a/terraform/gcp/projects/pangeo-hubs.tfvars +++ b/terraform/gcp/projects/pangeo-hubs.tfvars @@ -23,19 +23,19 @@ notebook_nodes = { min : 0, max : 100, machine_type : "n1-standard-4", - labels: {} + labels: {}, }, "medium" : { min : 0, max : 100, machine_type : "n1-standard-8", - labels: {} + labels: {}, }, "large" : { min : 0, max : 100, machine_type : "n1-standard-16", - labels: {} + labels: {}, }, } @@ -44,18 +44,18 @@ dask_nodes = { min : 0, max : 100, machine_type : "n1-standard-4", - labels: {} + labels: {}, }, "medium" : { min : 0, max : 100, machine_type : "n1-standard-8", - labels: {} + labels: {}, }, "large" : { min : 0, max : 100, machine_type : "n1-standard-16", - labels: {} + labels: {}, }, } diff --git a/terraform/gcp/projects/pilot-hubs.tfvars b/terraform/gcp/projects/pilot-hubs.tfvars index 74e669bb9..200122c19 100644 --- a/terraform/gcp/projects/pilot-hubs.tfvars +++ b/terraform/gcp/projects/pilot-hubs.tfvars @@ -14,8 +14,8 @@ notebook_nodes = { min : 0, max : 20, machine_type : "n1-highmem-4", - labels: { } - } + labels: { }, + }, } dask_nodes = { diff --git a/terraform/gcp/variables.tf b/terraform/gcp/variables.tf index 2d80d8966..a3d9008d6 100644 --- a/terraform/gcp/variables.tf +++ b/terraform/gcp/variables.tf @@ -189,8 +189,8 @@ variable "enable_private_cluster" { } variable "enable_filestore" { - type = bool - default = false + type = bool + default = false description = <<-EOT Deploy a Google FileStore for home directories @@ -201,8 +201,8 @@ variable "enable_filestore" { } variable "filestore_capacity_gb" { - type = number - default = 1024 + type = number + default = 1024 description = <<-EOT Minimum size (in GB) of 
Google FileStore. @@ -211,8 +211,8 @@ variable "filestore_capacity_gb" { } variable "filestore_tier" { - type = string - default = "BASIC_HDD" + type = string + default = "BASIC_HDD" description = <<-EOT Google FileStore service tier to use. @@ -222,8 +222,8 @@ variable "filestore_tier" { } variable "enable_node_autoprovisioning" { - type = bool - default = false + type = bool + default = false description = <<-EOT Enable auto-provisioning of nodes based on workload EOT