updates during annual 2024 Cloud Disaster Recovery test
filiphaftek committed Jan 30, 2024
1 parent 6155bcb commit 58bda8d
Showing 2 changed files with 18 additions and 12 deletions.
@@ -13,8 +13,9 @@ Report from [failover test on 28th of November 2022](https://docs.google.com/doc
```sh
export ENVIRONMENT=[dev|prod]
export SLUG=<SLUG>
export GKE_NAME=$(mi2 instance get -e $ENVIRONMENT --slug $SLUG | jq -r '.status.gkeClusters[0].name')
export GKE_REGION=$(mi2 instance get -e $ENVIRONMENT --slug $SLUG | jq -r '.spec.gcpRegion')
export GKE_NAME=$(mi2 instance get -e $ENVIRONMENT --slug $SLUG | jq -r '.status.gcp.gkeClusters[0].name')
export GKE_REGION=$(mi2 instance get -e $ENVIRONMENT --slug $SLUG | jq -r '.status.gcp.region')
export GCP_PROJECT=$(mi2 instance get -e $ENVIRONMENT --slug $SLUG | jq -r '.status.gcp.projectId')
```
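If `kubectl` is not already pointed at the instance cluster, credentials can be fetched with the variables above; a minimal sketch, not part of the original runbook:

```sh
# Fetch kubeconfig credentials so the kubectl commands below target the right GKE cluster.
gcloud container clusters get-credentials $GKE_NAME --region $GKE_REGION --project $GCP_PROJECT
```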

- extract the instance from Control Plane if `cloud.sourcegraph.com/control-plane-mode=true` is in `config.yaml`
@@ -44,9 +45,9 @@ kubectl describe node <NODE_FROM_CLUSTER> | grep zone
- perform zone failover (remove node zone from GKE node locations)

```sh
gcloud container node-pools list --cluster $GKE_NAME --region $GKE_REGION
gcloud container node-pools describe primary --cluster $GKE_NAME --region $GKE_REGION --format json | jq '.locations'
gcloud container node-pools update primary --cluster $GKE_NAME --region $GKE_REGION --node-locations <DIFFERENT_ZONE_THAN_EXISTING_NODE> --async
gcloud container node-pools list --cluster $GKE_NAME --region $GKE_REGION --project $GCP_PROJECT
gcloud container node-pools describe primary --cluster $GKE_NAME --region $GKE_REGION --project $GCP_PROJECT --format json | jq '.locations'
gcloud container node-pools update primary --cluster $GKE_NAME --region $GKE_REGION --project $GCP_PROJECT --node-locations <DIFFERENT_ZONE_THAN_EXISTING_NODE> --async
```
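Since the update runs with `--async`, node replacement takes a while; a quick way to watch nodes leave the old zone (a sketch using standard kubectl, not from the original doc):

```sh
# Show each node with its zone label; nodes in the removed zone should drain and
# be replaced by nodes in the remaining location(s).
kubectl get nodes -L topology.kubernetes.io/zone
```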

- verify pods were terminated
@@ -86,8 +87,8 @@ Follow the `Backfill instance into control plane` section from the Ops Dashboard
```sh
export ENVIRONMENT=[dev|prod]
export SLUG=<SLUG>
export CLOUDSQL_INSTANCE_NAME=$(mi2 instance get -e $ENVIRONMENT --slug $SLUG | jq -r '.status.cloudSQL[0].name')
export GCP_PROJECT=$(mi2 instance get -e $ENVIRONMENT --slug $SLUG | jq -r '.status.gcpProjectId')
export CLOUDSQL_INSTANCE_NAME=$(mi2 instance get -e $ENVIRONMENT --slug $SLUG | jq -r '.status.gcp.cloudSQL[0].name')
export GCP_PROJECT=$(mi2 instance get -e $ENVIRONMENT --slug $SLUG | jq -r '.status.gcp.projectId')
export INSTANCE_ID=$(mi2 instance get -e $ENVIRONMENT --slug $SLUG | jq -r '.metadata.name')
```
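To record the zone the Cloud SQL instance is running in before the failover, a small sketch (not part of the original doc; `gceZone` is assumed to be the zone field in the describe output):

```sh
# Current zone of the Cloud SQL instance, useful to compare against <FAILOVER_ZONE> later.
gcloud sql instances describe $CLOUDSQL_INSTANCE_NAME --project $GCP_PROJECT --format json | jq -r '.gceZone'
```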

@@ -110,7 +111,7 @@ gcloud sql instances describe $CLOUDSQL_INSTANCE_NAME --project $GCP_PROJECT | g
mi2 instance edit --jq '.spec.infrastructure.gcp.zone = "'$FAILOVER_ZONE'"' --slug $SLUG -e $ENVIRONMENT
mi2 generate cdktf -e $ENVIRONMENT --slug $SLUG
cd environments/$ENVIRONMENT/deployments/$INSTANCE_ID/terraform/stacks/sql
terraform init && terraform apply
terraform init && terraform apply -auto-approve

gcloud sql instances describe $CLOUDSQL_INSTANCE_NAME --project $GCP_PROJECT | grep zone
# should return <FAILOVER_ZONE>
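# Optional check, not part of the original runbook: list recent Cloud SQL operations
# to confirm the zone move triggered by the terraform apply above has finished.
gcloud sql operations list --instance $CLOUDSQL_INSTANCE_NAME --project $GCP_PROJECT --limit 5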
@@ -132,6 +133,7 @@ curl -sSL --fail https://$SLUG.sourcegraph.com/sign-in -i
mi2 instance sql-backup list --slug $SLUG -e $ENVIRONMENT
mi2 instance sql-restore create --backup-id $SQL_BACKUP_ID --slug $SLUG -e $ENVIRONMENT
# wait until ready
# can check status with command: mi2 instance sql-restore list --slug $SLUG -e $ENVIRONMENT
gcloud sql instances describe $CLOUDSQL_INSTANCE_NAME --project $GCP_PROJECT
```
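Instead of re-running the status command from the comment above by hand, it can be polled; a sketch assuming `watch` is available locally:

```sh
# Re-run the restore status listing every 30s until the restore reports ready.
watch -n 30 "mi2 instance sql-restore list --slug $SLUG -e $ENVIRONMENT"
```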

12 changes: 8 additions & 4 deletions content/departments/cloud/technical-docs/v2.0/restore_process.md
@@ -76,11 +76,9 @@ note the backup name, you will need it later.
```sh
cd sourcegraph/cloud
cd environments/$ENVIRONMENT/deployments/$INSTANCE_ID/terraform/infra
terraform init
terraform apply
cd environments/$ENVIRONMENT/deployments/$INSTANCE_ID
mi2 instance tfc deploy -auto-approve -e $ENVIRONMENT --slug $SLUG
mi2 instance workon -e $ENVIRONMENT --slug $SLUG
cd sourcegraph/cloud
mi2 instance restore create --backup-name $BACKUP_NAME --restore-type full-replace --slug $SLUG -e $ENVIRONMENT
```

@@ -93,6 +91,12 @@ cd sourcegraph/cloud
mi2 instance restore create --backup-name <BACKUP_NAME> --restore-type full-replace --slug $SLUG -e $ENVIRONMENT
```

Note: if a pod hangs with its PVCs stuck in `Pending`, use the command below:

```sh
# Recreate the `gce-pd-gkebackup-de` storage class as a copy of the `sourcegraph`
# storage class (keeping the same name), so pending PVCs that reference it can bind.
kubectl delete sc gce-pd-gkebackup-de && kubectl get sc sourcegraph -o json | jq '.metadata.name = "gce-pd-gkebackup-de"' | kubectl apply -f -
```
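To see whether the workaround is needed, and whether it took effect, the pending claims and available storage classes can be inspected (generic kubectl, not specific to this runbook):

```sh
# PVCs stuck in Pending across all namespaces, plus the storage classes currently defined.
kubectl get pvc --all-namespaces | grep Pending
kubectl get sc
```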

### Restore stateless application

e.g. `sourcegraph-frontend`
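The concrete steps live in the part of the diff collapsed below; purely as a generic illustration (deployment and namespace names are assumptions, not the documented procedure), a stateless workload such as the frontend can usually be re-rolled with:

```sh
# Restart the deployment and wait for the new pods to become ready.
kubectl -n <INSTANCE_NAMESPACE> rollout restart deployment sourcegraph-frontend
kubectl -n <INSTANCE_NAMESPACE> rollout status deployment sourcegraph-frontend
```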
