Merge pull request #201 from julienrf/align-ansible-setup
Align the scaling strategy of the Ansible-based setup with the other setups
julienrf authored Aug 27, 2024
2 parents a77ea75 + 84f8e9b commit 04aa85c
Showing 12 changed files with 51 additions and 88 deletions.
1 change: 0 additions & 1 deletion ansible/files/config.dynamodb.yml
@@ -53,7 +53,6 @@ savepoints:
# Interval in which savepoints will be created
intervalSeconds: 300
renames: []
skipTokenRanges: []
validation:
# Should WRITETIMEs and TTLs be compared?
compareTimestamps: false
2 changes: 1 addition & 1 deletion ansible/files/start-slave.sh
@@ -1,2 +1,2 @@
source spark-env
$SPARK_HOME/sbin/start-slave.sh spark://$SPARK_MASTER_HOST:7077 $SLAVESIZE
$SPARK_HOME/sbin/start-worker.sh spark://$SPARK_MASTER_HOST:7077 $SLAVESIZE
4 changes: 1 addition & 3 deletions ansible/files/start-spark.sh
@@ -13,6 +13,4 @@ cd $SPARK_HOME/sbin

./start-history-server.sh

./start-slave.sh spark://$SPARK_MASTER_HOST:7077 $SLAVESIZE

./start-shuffle-service.sh
./start-mesos-shuffle-service.sh
2 changes: 1 addition & 1 deletion ansible/files/stop-slave.sh
@@ -1,2 +1,2 @@
source spark-env
$SPARK_HOME/sbin/stop-slave.sh
$SPARK_HOME/sbin/stop-worker.sh
3 changes: 1 addition & 2 deletions ansible/files/stop-spark.sh
@@ -7,8 +7,7 @@ source spark-env

cd $SPARK_HOME/sbin

./stop-shuffle-service.sh
./stop-slave.sh spark://$SPARK_MASTER_HOST:7077 $SLAVESIZE
./stop-mesos-shuffle-service.sh

./stop-history-server.sh

6 changes: 3 additions & 3 deletions ansible/scylla-migrator.yml
@@ -43,7 +43,7 @@

- name: Unarchive the installer.
unarchive:
src: awscliv2.zip
src: "{{ home_dir }}/awscliv2.zip"
dest: "{{ home_dir }}"
remote_src: yes

@@ -71,7 +71,7 @@

- name: Extract spark
ansible.builtin.unarchive:
src: "spark-3.5.1-bin-hadoop3-scala2.13.tgz"
src: "{{ home_dir }}/spark-3.5.1-bin-hadoop3-scala2.13.tgz"
dest: "{{ home_dir }}"
remote_src: yes

@@ -83,7 +83,7 @@

- name: Move spark to opt/spark
become: true
command: "sudo mv spark-3.5.1-bin-hadoop3-scala2.13 {{ spark_home }}"
command: "sudo mv {{ home_dir }}/spark-3.5.1-bin-hadoop3-scala2.13 {{ spark_home }}"

- name: Set JAVA_HOME
ansible.builtin.lineinfile:
45 changes: 19 additions & 26 deletions ansible/templates/spark-env-master-sample
@@ -2,38 +2,31 @@
# Master node requires a lot of memory. We allocate 64G for the driver to avoid OOM when
# there are a lot of tasks.
#
# For this example, we're not using any workers on master.
# so CORES = 0 and SPARK_WORKER_INSTANCES = 0.
# EXECUTOR_MEMORY and EXECUTOR_CORES are used in the spark-submit job and controls the memory
# and CPUs per executor. Here are our recommendations to set the number of cores per executor:
# - it should be between 5 and 10
# - the total number of cores per worker node should be a multiple of the number of cores
# per executor
# - it can not be higher than the number of cores on a worker node
# For example, if there are 16 cores on each worker node, we could set EXECUTOR_CORES=8,
# or if there are 2 cores on each worker node, we could set EXECUTOR_CORES=2.
#
# MEMORY is used in the spark-submit job and allocates the memory per executor.
# You can have one or more executors per worker.
#
# By using multiple workers on an instance, we can control the velocity of the migration.
# By using multiple worker nodes, we can control the velocity of the migration.
#
# Eg.
# Target system is 3 x i4i.4xlarge (16 vCPU, 128G)
# We can provision 3 x i4i.2xlarge (8vCPU, 64G)
# We can provision 2 x m7g.2xlarge (8vCPU, 32G)
#
# SPARK_WORKER_INSTANCES = 1
# CORES = 4
# MEMORY = 8G (4 cores per worker * 2G)
# EXECUTOR_CORES=8
# EXECUTOR_MEMORY=16G # (8 cores per worker * 2G)
#
# Start 1 worker per node, or one at a time.
# Start the Spark master node and the Spark worker node, and run the migration.
# - Monitor the pressure on the source system
# - Monitor the velocity of the migration in the spark jobs monito
#
# You can increase the velocity by updating spark-env on worker nodes, increase the
# SPARK_WORKER_INSTANCES = 2, run ./start-worker.sh.
# So long as there are tasks available, the new workers would pick up tasks and increase
# the velocity of the migration.
#
# You should be mindful of over-provisioning the number of workers on an instance.
# Eg. if the node has 8 CPUs, then number of workers on an instance * cores <= 8...
# - Monitor the velocity of the migration in the spark jobs monitoring UI
# - If necessary, provision more worker nodes
##################################################################################################################

export CORES=0 # 0 cores for slaves on master node
export MEMORY=8G # cores per worker x 2G
export SPARK_WORKER_INSTANCES=0 # no workers on master node
export SPARK_MASTER_HOST={{ hostvars.spark_master.ansible_facts.default_ipv4.address }}

export SLAVESIZE="-c $CORES "
export SPARK_MASTER_HOST={{ hostvars.spark_master.ansible_default_ipv4.address }}
export EXECUTOR_CORES=4
# By default, allocate 2GB of memory per core
export EXECUTOR_MEMORY="$((EXECUTOR_CORES * 2))G"
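
As a worked example of the sizing comments above (a sketch only; the 8-vCPU worker and the 2G-per-core default come from the template's own example, everything else is illustrative):

export EXECUTOR_CORES=8                            # one executor uses all 8 cores of a worker node
export EXECUTOR_MEMORY="$((EXECUTOR_CORES * 2))G"  # 2G per core, i.e. 16G per executor
echo $EXECUTOR_MEMORY                              # prints 16G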
45 changes: 10 additions & 35 deletions ansible/templates/spark-env-worker-sample
@@ -1,38 +1,13 @@
##################################################################################################################
# Master node requires a lot of memory. We allocate 64G for the driver to avoid OOM when
# there are a lot of tasks.
#
# For this example, we're not using any workers on master.
# so CORES = 0 and SPARK_WORKER_INSTANCES = 0.
#
# MEMORY is used in the spark-submit job and allocates the memory per executor.
# You can have one or more executors per worker.
#
# By using multiple workers on an instance, we can control the velocity of the migration.
#
# Eg.
# Target system is 3 x i4i.4xlarge (16 vCPU, 128G)
# We can provision 3 x i4i.2xlarge (8vCPU, 64G)
#
# SPARK_WORKER_INSTANCES = 1
# CORES = 4
# MEMORY = 8G (4 cores per worker * 2G)
#
# Start 1 worker per node, or one at a time.
# - Monitor the pressure on the source system
# - Monitor the velocity of the migration in the spark jobs monito
#
# You can increase the velocity by updating spark-env on worker nodes, increase the
# SPARK_WORKER_INSTANCES = 2, run ./start-worker.sh.
# So long as there are tasks available, the new workers would pick up tasks and increase
# the velocity of the migration.
#
# You should be mindful of over-provisioning the number of workers on an instance.
# Eg. if the node has 8 CPUs, then number of workers on an instance * cores <= 8...
# SLAVESIZE is used by the start-slave.sh script. By default, a Spark worker uses all the
# resources of the machine it is started on. If you want to decrease the migration velocity,
# you can set the maximum number of cores and memory that can be used by the worker.
##################################################################################################################

export SPARK_MASTER_HOST={{ hostvars.spark_master.ansible_facts.default_ipv4.address }}
export CORES=4 # number of cores per worker
export SPARK_WORKER_INSTANCES=4 # this is how many workers will be
# started/stopped on the node.
export SLAVESIZE="-c $CORES"
export SPARK_MASTER_HOST={{ hostvars.spark_master.ansible_default_ipv4.address }}

export SLAVESIZE=""
# Optionally, limit the resources available to the Spark worker by uncommenting the following lines
#export MAX_CORES=2 # max number of cores to use on the machine
#export MAX_MEMORY=4G # max amount of memory to use on the machine
#export SLAVESIZE="--cores $MAX_CORES --memory $MAX_MEMORY"
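
For reference, a minimal sketch of how SLAVESIZE reaches Spark, based on the start-slave.sh script changed earlier in this commit (the 2-core / 4G cap is only an example):

export MAX_CORES=2
export MAX_MEMORY=4G
export SLAVESIZE="--cores $MAX_CORES --memory $MAX_MEMORY"
$SPARK_HOME/sbin/start-worker.sh spark://$SPARK_MASTER_HOST:7077 $SLAVESIZE
# With SLAVESIZE left empty, the worker uses all cores and memory of the machine.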
9 changes: 4 additions & 5 deletions ansible/templates/submit-alternator-job.sh
@@ -6,12 +6,11 @@ source spark-env

mkdir /tmp/savepoints

#

time spark-submit --class com.scylladb.migrator.Migrator \
--master spark://{{ hostvars.spark_master.ansible_facts.default_ipv4.address }}:7077 \
--master spark://{{ hostvars.spark_master.ansible_default_ipv4.address }}:7077 \
--conf spark.eventLog.enabled=true \
--conf spark.scylla.config=/home/ubuntu/scylla-migrator/config.dynamodb.yml \
--conf spark.executor.memory=$MEMORY \
--conf spark.driver.memory=64G \
--executor-memory $EXECUTOR_MEMORY \
--executor-cores $EXECUTOR_CORES \
--driver-memory 4G \
/home/ubuntu/scylla-migrator/scylla-migrator-assembly.jar
6 changes: 3 additions & 3 deletions ansible/templates/submit-cql-job-validator.sh
@@ -7,12 +7,12 @@ source spark-env
mkdir /tmp/savepoints

time spark-submit --class com.scylladb.migrator.Validator \
--master spark://{{ hostvars.spark_master.ansible_facts.default_ipv4.address }}:7077 \
--master spark://{{ hostvars.spark_master.ansible_default_ipv4.address }}:7077 \
--conf spark.eventLog.enabled=true \
--conf spark.scylla.config=config.yaml \
--conf spark.cassandra.input.consistency.level=LOCAL_QUORUM \
--conf spark.cassandra.output.consistency.level=LOCAL_QUORUM \
--num-executors $SPARK_WORKER_INSTANCES \
--executor-memory $MEMORY \
--executor-memory $EXECUTOR_MEMORY \
--executor-cores $EXECUTOR_CORES \
--conf spark.cassandra.connection.localConnectionsPerExecutor=4 \
/home/ubuntu/scylla-migrator/scylla-migrator-assembly.jar
14 changes: 7 additions & 7 deletions ansible/templates/submit-cql-job.sh
@@ -7,29 +7,29 @@ source spark-env
mkdir /tmp/savepoints

time spark-submit --class com.scylladb.migrator.Migrator \
--master spark://{{ hostvars.spark_master.ansible_facts.default_ipv4.address }}:7077 \
--master spark://{{ hostvars.spark_master.ansible_default_ipv4.address }}:7077 \
--conf spark.eventLog.enabled=true \
--conf spark.scylla.config=config.yaml \
--conf spark.cassandra.input.consistency.level=LOCAL_QUORUM \
--conf spark.cassandra.output.consistency.level=LOCAL_QUORUM \
--num-executors $SPARK_WORKER_INSTANCES \
--executor-memory $MEMORY \
--executor-memory $EXECUTOR_MEMORY \
--executor-cores $EXECUTOR_CORES \
--conf spark.cassandra.connection.localConnectionsPerExecutor=4 \
/home/ubuntu/scylla-migrator/scylla-migrator-assembly.jar

#sometimes you will need a tuning for driver memory size
#add this config to above to tune it:
# --conf spark.driver.memory=4G \
# --driver-memory 4G \

# debug example
#$SPARK_HOME/spark-submit --class com.scylladb.migrator.Migrator \
# --driver-java-options -agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=65005 \
# --master spark://{{ hostvars.spark_master.ansible_facts.default_ipv4.address }}:7077 \
# --master spark://{{ hostvars.spark_master.ansible_default_ipv4.address }}:7077 \
# --conf spark.scylla.config=config.yaml \
# --conf spark.cassandra.input.consistency.level=LOCAL_QUORUM \
# --conf spark.cassandra.output.consistency.level=LOCAL_QUORUM \
# --num-executors 1 \
# --executor-memory $MEMORY \
# --executor-memory $EXECUTOR_MEMORY \
# --executor-cores $EXECUTOR_CORES \
# --conf "spark.executor.extraJavaOptions=-agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=64000 -XX:+HeapDumpOnOutOfMemoryError" \
# --conf spark.cassandra.connection.localConnectionsPerExecutor=4 \
# /home/ubuntu/scylla-migrator/scylla-migrator-assembly.jar
2 changes: 1 addition & 1 deletion docs/source/getting-started/ansible.rst
@@ -4,7 +4,7 @@ Set Up a Spark Cluster with Ansible

An `Ansible <https://www.ansible.com/>`_ playbook is provided in the `ansible folder <https://github.com/scylladb/scylla-migrator/tree/master/ansible>`_ of our Git repository. The Ansible playbook will install the pre-requisites, Spark, on the master and workers added to the ``ansible/inventory/hosts`` file. Scylla-migrator will be installed on the spark master node.

The Ansible playbook expects to be run in an Ubuntu environment where the directory ``/home/ubuntu`` already exists.
The Ansible playbook expects to be run in an Ubuntu environment, by a user named ``ubuntu`` (like you get in AWS EC2 Ubuntu-based images).
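
Once the inventory is filled in, the playbook would typically be launched along these lines (a hedged sketch; adjust the inventory path or add options such as --private-key to match your environment):

cd ansible
ansible-playbook -i inventory/hosts scylla-migrator.yml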

1. Clone the Migrator Git repository:

