Merge #139443

139443: drtprod: YAML for 300 node and YCSB scripts r=vidit-bhat a=nameisbhaskar The PR includes the following changes: 1. YAML for the 300 node sniff testing 2. YCSB init and run scripts 3. Max rate for tpcc changed to 500 in drt-large cluster YAML 4. Change in the tpcc_run_multiregion.sh to take the PGURL at runtime. Epic: None Release: None Co-authored-by: Bhaskarjyoti Bora <[email protected]>
cockroachdb · Jan 21, 2025 · 4cb25fc · 4cb25fc
2 parents dda6410 + f27a48e
commit 4cb25fc
Show file tree

Hide file tree

Showing 5 changed files with 321 additions and 3 deletions.
diff --git a/pkg/cmd/drtprod/configs/drt_large.yaml b/pkg/cmd/drtprod/configs/drt_large.yaml
@@ -19,7 +19,7 @@ environment:
   RUN_DURATION: 12h
   NUM_CONNECTIONS: 500
   NUM_WORKERS: 500
-  MAX_RATE: 1000
+  MAX_RATE: 500
 
 targets:
   - target_name: $CLUSTER
@@ -120,6 +120,18 @@ targets:
         args:
           - $WORKLOAD_CLUSTER
           - workload
+  - target_name: rp
+    steps:
+      - command: put
+        args:
+          - $WORKLOAD_CLUSTER
+          - artifacts/roachprod
+          - roachprod
+      - command: put
+        args:
+          - $WORKLOAD_CLUSTER:1
+          - artifacts/roachtest
+          - roachtest-operations
       - script: "pkg/cmd/drtprod/scripts/setup_datadog_workload"
   - target_name: post_tasks
     dependent_targets:

diff --git a/pkg/cmd/drtprod/configs/drt_scale_300.yaml b/pkg/cmd/drtprod/configs/drt_scale_300.yaml
@@ -0,0 +1,150 @@
+# YAML for creating and configuring the 300-node drt-scale-300 cluster and its workload cluster. This also configures Datadog.
+# Build the roachprod and roachtest binaries (using --cross) into artifacts/ before running this configuration.
+environment:
+  ROACHPROD_GCE_DEFAULT_SERVICE_ACCOUNT: [email protected]
+  ROACHPROD_DNS: drt.crdb.io
+  ROACHPROD_GCE_DNS_DOMAIN: drt.crdb.io
+  ROACHPROD_GCE_DNS_ZONE: drt
+  ROACHPROD_GCE_DEFAULT_PROJECT: cockroach-drt
+  CLUSTER: drt-scale-300
+  WORKLOAD_CLUSTER: workload-scale-300
+  CLUSTER_NODES: 300  # matches the 150 + 150 split in gce-zones of the $CLUSTER target
+  WORKLOAD_NODES: 20
+
+targets:
+  # CockroachDB cluster: create the GCE VMs, stage the cockroach binary, set up Datadog and start the nodes.
+  - target_name: $CLUSTER
+    steps:
+      - command: create
+        args:
+          - $CLUSTER
+        flags:
+          clouds: gce
+          gce-managed: true
+          gce-enable-multiple-stores: true
+          gce-zones: "us-central1-a:150,us-central1-b:150"
+          nodes: $CLUSTER_NODES
+          gce-machine-type: n2-standard-16
+          local-ssd: false
+          gce-pd-volume-size: 375
+          gce-pd-volume-type: pd-ssd
+          gce-pd-volume-count: 4
+          os-volume-size: 100
+          username: drt
+          lifetime: 8760h
+          gce-image: "ubuntu-2204-jammy-v20250112"
+      - command: sync
+        flags:
+          clouds: gce
+      - command: stage
+        args:
+          - $CLUSTER
+          - cockroach
+      - script: "pkg/cmd/drtprod/scripts/setup_datadog_cluster"
+      - command: start
+        args:
+          - $CLUSTER
+          - "--binary"
+          - "./cockroach"
+        flags:
+          # TODO(review): add a flag to set the provisioned throughput on each store according to the cloud provider's limits (none is set below).
+          enable-fluent-sink: true
+          store-count: 4
+          args: --wal-failover=among-stores
+          restart: false
+          sql-port: 26257
+      - command: run  # unmask/enable cron and add an @reboot crontab entry that runs ~/cockroach.sh after a 100s sleep
+        args:
+          - $CLUSTER
+          - --
+          - "sudo systemctl unmask cron.service ; sudo systemctl enable cron.service ; echo \"crontab -l ; echo '@reboot sleep 100 && ~/cockroach.sh' | crontab -\" > t.sh ; sh t.sh ; rm t.sh"
+  # Workload cluster: create the GCE VMs (destroyed on rollback), stage the cockroach and workload binaries, and set up Datadog.
+  - target_name: $WORKLOAD_CLUSTER
+    steps:
+      - command: create
+        args:
+          - $WORKLOAD_CLUSTER
+        flags:
+          clouds: gce
+          gce-zones: "us-central1-a,us-central1-b"
+          nodes: $WORKLOAD_NODES
+          gce-machine-type: n2-standard-8
+          os-volume-size: 100
+          username: workload
+          lifetime: 8760h
+          gce-image: "ubuntu-2204-jammy-v20250112"
+        on_rollback:
+          - command: destroy
+            args:
+              - $WORKLOAD_CLUSTER
+      - command: sync
+        flags:
+          clouds: gce
+      - command: stage
+        args:
+          - $WORKLOAD_CLUSTER
+          - cockroach
+      - command: stage
+        args:
+          - $WORKLOAD_CLUSTER
+          - workload
+      - script: "pkg/cmd/drtprod/scripts/setup_datadog_workload"
+  - target_name: post_tasks  # depends on both cluster targets: copies certs and binaries to the workload cluster, then runs ycsb_init.sh
+    dependent_targets:
+      - $CLUSTER
+      - $WORKLOAD_CLUSTER
+    steps:
+      - script: rm
+        args:
+          - -rf
+          - certs-$CLUSTER
+      - command: get
+        args:
+          - $CLUSTER:1
+          - certs
+          - certs-$CLUSTER
+      - command: put
+        args:
+          - $WORKLOAD_CLUSTER
+          - certs-$CLUSTER
+          - certs
+      - command: ssh
+        args:
+          - $WORKLOAD_CLUSTER
+          - --
+          - chmod
+          - 600
+          - './certs/*'
+      - command: put
+        args:
+          - $WORKLOAD_CLUSTER
+          - artifacts/roachprod
+          - roachprod
+      - command: put
+        args:
+          - $WORKLOAD_CLUSTER
+          - artifacts/roachtest
+          - roachtest-operations
+      - script: "pkg/cmd/drtprod/scripts/ycsb_init.sh"  # args: script suffix, then whether to execute the generated init script
+        args:
+          - 20M
+          - true
+        flags:
+          splits: 200
+          insert-count: 20000000
+  - target_name: ycsb_run  # generates and uploads the per-node ycsb run scripts; the "false" arg below means they are not started
+    dependent_targets:
+      - post_tasks
+    steps:
+      - script: "pkg/cmd/drtprod/scripts/generate_ycsb_run.sh"
+        args:
+          - 20M
+          - false
+        flags:
+          max-rate: 66666
+          read-freq: 0.8
+          insert-freq: 0.1
+          update-freq: 0.05
+          delete-freq: 0.05
+          duration: 0
+          ramp: 5s
diff --git a/pkg/cmd/drtprod/scripts/generate_ycsb_run.sh b/pkg/cmd/drtprod/scripts/generate_ycsb_run.sh
@@ -0,0 +1,96 @@
+#!/bin/bash
+
+# Copyright 2024 The Cockroach Authors.
+#
+# Use of this software is governed by the CockroachDB Software License
+# included in the /LICENSE file.
+
+# This script generates the ycsb run workload script and uploads it to each of the workload nodes.
+# The flags for "workload run ycsb" (e.g. max-rate, read-freq) are passed as arguments to this script.
+# NOTE - This uses the CLUSTER, WORKLOAD_CLUSTER, CLUSTER_NODES and WORKLOAD_NODES environment variables; if any is not set the script fails.
+if [ "$#" -lt 7 ]; then
+  echo "Usage: $0 <script_suffix> <execute:true|false> <flags to run: max-rate, read-freq, insert-freq, update-freq, delete-freq>"
+  exit 1
+fi
+suffix=$1
+shift
+# The second argument represents whether the generated run script should be started in the workload cluster
+# The value is true or false
+if [ "$1" != "true" ] && [ "$1" != "false" ]; then
+  # $1 is used again because of the shift
+  echo "Error: The second argument must be 'true' or 'false' which implies whether the script should be started in background or not."
+  exit 1
+fi
+execute_script=$1
+shift
+
+if [ -z "${CLUSTER}" ]; then
+  echo "environment CLUSTER is not set"
+  exit 1
+fi
+
+if [ -z "${WORKLOAD_CLUSTER}" ]; then
+  echo "environment WORKLOAD_CLUSTER is not set"
+  exit 1
+fi
+
+if [ -z "${WORKLOAD_NODES}" ]; then
+  echo "environment WORKLOAD_NODES is not set"
+  exit 1
+fi
+
+if [ -z "${CLUSTER_NODES}" ]; then
+  echo "environment CLUSTER_NODES is not set"
+  exit 1
+fi
+
+absolute_path=$(drtprod run "${WORKLOAD_CLUSTER}":1 -- "realpath ./cockroach")
+pwd=$(drtprod run "${WORKLOAD_CLUSTER}":1 -- "dirname ${absolute_path}")
+
+# Calculate the number of PGURLS each workload node should get
+PGURL_PER_NODE=$((CLUSTER_NODES / WORKLOAD_NODES))
+REMAINDER_NODE=$((CLUSTER_NODES % WORKLOAD_NODES))
+
+# Give each workload node a contiguous, 1-based range of cluster nodes; the first REMAINDER_NODE workload nodes get one extra
+for ((NODE=0; NODE<WORKLOAD_NODES; NODE++)); do
+  START_OFFSET=$((NODE * PGURL_PER_NODE + (NODE < REMAINDER_NODE ? NODE : REMAINDER_NODE) + 1))
+  END_OFFSET=$((START_OFFSET + PGURL_PER_NODE + (NODE < REMAINDER_NODE ? 1 : 0) - 1))
+
+  # Print the cluster node range whose PGURLS the current workload node will use
+  echo "pgurl for Nodes ${START_OFFSET}:${END_OFFSET}"
+
+  # Create the workload script: unescaped variables expand now, \$-escaped ones at run time. Workers share the log files, hence the appending redirects.
+  cat <<EOF >/tmp/ycsb_run_${suffix}.sh
+#!/usr/bin/env bash
+
+export ROACHPROD_GCE_DEFAULT_PROJECT=$ROACHPROD_GCE_DEFAULT_PROJECT
+INSERT_START=10000000000000000
+NUM_WORKERS_PER_NODE=5
+OUTPUT_FILE_A="ycsb-a-\$(date '+%Y-%m-%d-%H:%M:%S').log"
+OUTPUT_ERROR_FILE_A="ycsb-a-\$(date '+%Y-%m-%d-%H:%M:%S').error.log"
+CLIENTS_PER_WORKLOAD=4000
+
+./roachprod sync
+PGURLS=\$(./roachprod pgurl $CLUSTER:$START_OFFSET-$END_OFFSET | sed s/\'//g)
+read -r -a PGURLS_ARR <<< "\$PGURLS"
+
+for ((j=1;j<=\$NUM_WORKERS_PER_NODE;j++)); do
+    echo ">> Starting ycsb workload"
+    nohup ./cockroach workload run ycsb --tolerate-errors --workload='custom' \
+       --min-conns=\$((CLIENTS_PER_WORKLOAD/NUM_WORKERS_PER_NODE)) $@ \
+       --insert-start=\$((INSERT_START*$NODE+(INSERT_START/j))) \
+        --families=false --request-distribution='uniform' --scan-length-distribution='uniform' \
+         --concurrency=\$((CLIENTS_PER_WORKLOAD/NUM_WORKERS_PER_NODE)) \
+          "\${PGURLS_ARR[@]}" >> "\$OUTPUT_FILE_A" 2>> "\$OUTPUT_ERROR_FILE_A" &
+done
+EOF
+
+  # Upload the script to the workload node and make it executable
+  drtprod put "${WORKLOAD_CLUSTER}:$((NODE + 1))" "/tmp/ycsb_run_${suffix}.sh"
+  drtprod ssh "${WORKLOAD_CLUSTER}:$((NODE + 1))" -- "chmod +x ycsb_run_${suffix}.sh"
+done
+if [ "$execute_script" = "true" ]; then
+    drtprod run "${WORKLOAD_CLUSTER}" -- "${pwd}/ycsb_run_${suffix}.sh"
+else
+  echo "Run --> drtprod run ${WORKLOAD_CLUSTER} -- \"${pwd}/ycsb_run_${suffix}.sh\""
+fi
diff --git a/pkg/cmd/drtprod/scripts/tpcc_run_multiregion.sh b/pkg/cmd/drtprod/scripts/tpcc_run_multiregion.sh
@@ -28,9 +28,14 @@ do
   #  us to reach the specified region, and then add the actual number of workers
   #  we want to run.
   EFFECTIVE_NUM_WORKERS=$(($(($TPCC_WAREHOUSES/$NUM_REGIONS))*$(($NODE-1))+$NUM_WORKERS))
-  PGURLS_REGION=$(./bin/drtprod pgurl $CLUSTER:$NODE_OFFSET-$LAST_NODE_IN_REGION | sed "s/'//g; s/^/'/; s/$/'/")
   cat <<EOF >/tmp/tpcc_run.sh
 #!/usr/bin/env bash
+
+export ROACHPROD_GCE_DEFAULT_PROJECT=$ROACHPROD_GCE_DEFAULT_PROJECT
+./roachprod sync
+PGURLS=\$(./roachprod pgurl $CLUSTER:$NODE_OFFSET-$LAST_NODE_IN_REGION | sed s/\'//g)
+read -r -a PGURLS_REGION <<< "\$PGURLS"
+
 j=0
 while true; do
   echo ">> Starting tpcc workload"
@@ -47,7 +52,7 @@ while true; do
       --partitions=$NUM_REGIONS \
       --partition-affinity=$(($NODE-1)) \
       --tolerate-errors \
-      $PGURLS_REGION \
+      \${PGURLS_REGION[@]} \
       --survival-goal region \
       --regions=$REGIONS
 done

diff --git a/pkg/cmd/drtprod/scripts/ycsb_init.sh b/pkg/cmd/drtprod/scripts/ycsb_init.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+
+# Copyright 2024 The Cockroach Authors.
+#
+# Use of this software is governed by the CockroachDB Software License
+# included in the /LICENSE file.
+
+# This script sets up the ycsb init workload script in the first workload node and optionally starts it with systemd-run
+# The flags for "workload init ycsb" (e.g. --splits, --insert-count) are passed as arguments to this script
+# NOTE - This uses CLUSTER and WORKLOAD_CLUSTER environment variable, if not set the script fails
+
+# The first argument is the name suffix that is added to the script as ycsb_init_<suffix>.sh
+if [ "$#" -lt 4 ]; then
+  echo "Usage: $0 <script_suffix> <execute:true|false> <flags to init:--splits, --insert-count>"
+  exit 1
+fi
+suffix=$1
+shift
+# The second argument represents whether the init process should be started in the workload cluster
+# The value is true or false
+if [ "$1" != "true" ] && [ "$1" != "false" ]; then
+  # $1 is used again because of the shift
+  echo "Error: The second argument must be 'true' or 'false' which implies whether the script should be started in background or not."
+  exit 1
+fi
+execute_script=$1
+shift
+
+if [ -z "${CLUSTER}" ]; then
+  echo "environment CLUSTER is not set"
+  exit 1
+fi
+
+if [ -z "${WORKLOAD_CLUSTER}" ]; then
+  echo "environment WORKLOAD_CLUSTER is not set"
+  exit 1
+fi
+
+absolute_path=$(drtprod run "${WORKLOAD_CLUSTER}":1 -- "realpath ./cockroach")
+pwd=$(drtprod run "${WORKLOAD_CLUSTER}":1 -- "dirname ${absolute_path}")
+PGURLS=$(drtprod pgurl "${CLUSTER}":1)
+
+# Write the init script on workload node 1; variables expand locally, and $* (not $@) keeps the remote command a single argument
+drtprod ssh "${WORKLOAD_CLUSTER}":1 -- "tee ycsb_init_${suffix}.sh > /dev/null << 'EOF'
+#!/bin/bash
+
+${pwd}/cockroach workload init ycsb $PGURLS --drop --families=false $*
+EOF"
+drtprod ssh "${WORKLOAD_CLUSTER}":1 -- "chmod +x ycsb_init_${suffix}.sh"
+
+if [ "$execute_script" = "true" ]; then
+  drtprod run "${WORKLOAD_CLUSTER}":1 -- "sudo systemd-run --unit ycsb_init_${suffix} --same-dir --uid \$(id -u) --gid \$(id -g) bash ${pwd}/ycsb_init_${suffix}.sh"
+else
+  echo "Run --> drtprod run ${WORKLOAD_CLUSTER}:1 -- \"sudo systemd-run --unit ycsb_init_${suffix} --same-dir --uid \\\$(id -u) --gid \\\$(id -g) bash ${pwd}/ycsb_init_${suffix}.sh\""
+fi