diff --git a/ansible/files/config.dynamodb.yml b/ansible/files/config.dynamodb.yml
index 8298c171..61339d12 100644
--- a/ansible/files/config.dynamodb.yml
+++ b/ansible/files/config.dynamodb.yml
@@ -53,7 +53,6 @@ savepoints:
   # Interval in which savepoints will be created
   intervalSeconds: 300
 renames: []
-skipTokenRanges: []
 validation:
   # Should WRITETIMEs and TTLs be compared?
   compareTimestamps: false
diff --git a/ansible/files/start-slave.sh b/ansible/files/start-slave.sh
index 5845c6ff..ebd9ac3a 100644
--- a/ansible/files/start-slave.sh
+++ b/ansible/files/start-slave.sh
@@ -1,2 +1,2 @@
 source spark-env
-$SPARK_HOME/sbin/start-slave.sh spark://$SPARK_MASTER_HOST:7077 $SLAVESIZE
\ No newline at end of file
+$SPARK_HOME/sbin/start-worker.sh spark://$SPARK_MASTER_HOST:7077 $SLAVESIZE
\ No newline at end of file
diff --git a/ansible/files/start-spark.sh b/ansible/files/start-spark.sh
index dd1cc7f9..f7f1b1de 100644
--- a/ansible/files/start-spark.sh
+++ b/ansible/files/start-spark.sh
@@ -13,6 +13,4 @@
 cd $SPARK_HOME/sbin
 
 ./start-history-server.sh
-./start-slave.sh spark://$SPARK_MASTER_HOST:7077 $SLAVESIZE
-
-./start-shuffle-service.sh
\ No newline at end of file
+./start-mesos-shuffle-service.sh
diff --git a/ansible/files/stop-slave.sh b/ansible/files/stop-slave.sh
index bb15a8e9..5f2d0ad2 100644
--- a/ansible/files/stop-slave.sh
+++ b/ansible/files/stop-slave.sh
@@ -1,2 +1,2 @@
 source spark-env
-$SPARK_HOME/sbin/stop-slave.sh
\ No newline at end of file
+$SPARK_HOME/sbin/stop-worker.sh
\ No newline at end of file
diff --git a/ansible/files/stop-spark.sh b/ansible/files/stop-spark.sh
index 99b4b9f6..2d7436df 100644
--- a/ansible/files/stop-spark.sh
+++ b/ansible/files/stop-spark.sh
@@ -7,8 +7,7 @@
 source spark-env
 
 cd $SPARK_HOME/sbin
 
-./stop-shuffle-service.sh
-./stop-slave.sh spark://$SPARK_MASTER_HOST:7077 $SLAVESIZE
+./stop-mesos-shuffle-service.sh
 
 ./stop-history-server.sh
diff --git a/ansible/scylla-migrator.yml b/ansible/scylla-migrator.yml
index c4bc287e..a5493a02 100644
--- a/ansible/scylla-migrator.yml
+++ b/ansible/scylla-migrator.yml
@@ -43,7 +43,7 @@
 
     - name: Unarchive the installer.
       unarchive:
-        src: awscliv2.zip
+        src: "{{ home_dir }}/awscliv2.zip"
        dest: "{{ home_dir }}"
         remote_src: yes
 
@@ -71,7 +71,7 @@
 
     - name: Extract spark
       ansible.builtin.unarchive:
-        src: "spark-3.5.1-bin-hadoop3-scala2.13.tgz"
+        src: "{{ home_dir }}/spark-3.5.1-bin-hadoop3-scala2.13.tgz"
         dest: "{{ home_dir }}"
         remote_src: yes
 
@@ -83,7 +83,7 @@
 
     - name: Move spark to opt/spark
       become: true
-      command: "sudo mv spark-3.5.1-bin-hadoop3-scala2.13 {{ spark_home }}"
+      command: "sudo mv {{ home_dir }}/spark-3.5.1-bin-hadoop3-scala2.13 {{ spark_home }}"
 
     - name: Set JAVA_HOME
       ansible.builtin.lineinfile:
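For reference, the three patched playbook tasks above are equivalent to the following shell steps. This is a minimal sketch, assuming home_dir is /home/ubuntu and spark_home is /opt/spark (values implied elsewhere in this patch); it is not part of the patch itself:

    # The archive sources now use absolute paths, so the tasks no longer
    # depend on the working directory of the Ansible connection:
    unzip /home/ubuntu/awscliv2.zip -d /home/ubuntu
    tar xzf /home/ubuntu/spark-3.5.1-bin-hadoop3-scala2.13.tgz -C /home/ubuntu
    sudo mv /home/ubuntu/spark-3.5.1-bin-hadoop3-scala2.13 /opt/spark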
diff --git a/ansible/templates/spark-env-master-sample b/ansible/templates/spark-env-master-sample
index 60767491..ff31235e 100644
--- a/ansible/templates/spark-env-master-sample
+++ b/ansible/templates/spark-env-master-sample
@@ -2,38 +2,31 @@
 # Master node requires a lot of memory. We allocate 64G for the driver to avoid OOM when
 # there are a lot of tasks.
 #
-# For this example, we're not using any workers on master.
-# so CORES = 0 and SPARK_WORKER_INSTANCES = 0.
+# EXECUTOR_MEMORY and EXECUTOR_CORES are used in the spark-submit job and control the memory
+# and CPUs per executor. Here are our recommendations to set the number of cores per executor:
+# - it should be between 5 and 10
+# - the total number of cores per worker node should be a multiple of the number of cores
+#   per executor
+# - it cannot be higher than the number of cores on a worker node
+# For example, if there are 16 cores on each worker node, we could set EXECUTOR_CORES=8,
+# or if there are 2 cores on each worker node, we could set EXECUTOR_CORES=2.
 #
-# MEMORY is used in the spark-submit job and allocates the memory per executor.
-# You can have one or more executors per worker.
-#
-# By using multiple workers on an instance, we can control the velocity of the migration.
+# By using multiple worker nodes, we can control the velocity of the migration.
 #
 # Eg.
 # Target system is 3 x i4i.4xlarge (16 vCPU, 128G)
-# We can provision 3 x i4i.2xlarge (8vCPU, 64G)
+# We can provision 2 x m7g.2xlarge (8vCPU, 32G)
 #
-# SPARK_WORKER_INSTANCES = 1
-# CORES = 4
-# MEMORY = 8G (4 cores per worker * 2G)
+# EXECUTOR_CORES=8
+# EXECUTOR_MEMORY=16G # (8 cores per worker * 2G)
 #
-# Start 1 worker per node, or one at a time.
+# Start the Spark master node and the Spark worker nodes, and run the migration.
 # - Monitor the pressure on the source system
-# - Monitor the velocity of the migration in the spark jobs monito
-#
-# You can increase the velocity by updating spark-env on worker nodes, increase the
-# SPARK_WORKER_INSTANCES = 2, run ./start-worker.sh.
-# So long as there are tasks available, the new workers would pick up tasks and increase
-# the velocity of the migration.
-#
-# You should be mindful of over-provisioning the number of workers on an instance.
-# Eg. if the node has 8 CPUs, then number of workers on an instance * cores <= 8...
+# - Monitor the velocity of the migration in the spark jobs monitoring UI
+# - If necessary, provision more worker nodes
 ##################################################################################################################
 
-export CORES=0 # 0 cores for slaves on master node
-export MEMORY=8G # cores per worker x 2G
-export SPARK_WORKER_INSTANCES=0 # no workers on master node
-export SPARK_MASTER_HOST={{ hostvars.spark_master.ansible_facts.default_ipv4.address }}
-
-export SLAVESIZE="-c $CORES "
\ No newline at end of file
+export SPARK_MASTER_HOST={{ hostvars.spark_master.ansible_default_ipv4.address }}
+export EXECUTOR_CORES=4
+# By default, allocate 2GB of memory per core
+export EXECUTOR_MEMORY="$((EXECUTOR_CORES * 2))G"
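The memory arithmetic in the new master template can be sanity-checked on its own. A minimal sketch of the default sizing rule (2G per executor core, which yields 8G for the template's default EXECUTOR_CORES=4); not part of the patch:

    # Reproduces the EXECUTOR_MEMORY computation from spark-env-master-sample:
    EXECUTOR_CORES=4
    EXECUTOR_MEMORY="$((EXECUTOR_CORES * 2))G"
    echo "$EXECUTOR_MEMORY"   # prints: 8G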
diff --git a/ansible/templates/spark-env-worker-sample b/ansible/templates/spark-env-worker-sample
index 72826684..896308b4 100644
--- a/ansible/templates/spark-env-worker-sample
+++ b/ansible/templates/spark-env-worker-sample
@@ -1,38 +1,13 @@
 ##################################################################################################################
-# Master node requires a lot of memory. We allocate 64G for the driver to avoid OOM when
-# there are a lot of tasks.
-#
-# For this example, we're not using any workers on master.
-# so CORES = 0 and SPARK_WORKER_INSTANCES = 0.
-#
-# MEMORY is used in the spark-submit job and allocates the memory per executor.
-# You can have one or more executors per worker.
-#
-# By using multiple workers on an instance, we can control the velocity of the migration.
-#
-# Eg.
-# Target system is 3 x i4i.4xlarge (16 vCPU, 128G)
-# We can provision 3 x i4i.2xlarge (8vCPU, 64G)
-#
-# SPARK_WORKER_INSTANCES = 1
-# CORES = 4
-# MEMORY = 8G (4 cores per worker * 2G)
-#
-# Start 1 worker per node, or one at a time.
-# - Monitor the pressure on the source system
-# - Monitor the velocity of the migration in the spark jobs monito
-#
-# You can increase the velocity by updating spark-env on worker nodes, increase the
-# SPARK_WORKER_INSTANCES = 2, run ./start-worker.sh.
-# So long as there are tasks available, the new workers would pick up tasks and increase
-# the velocity of the migration.
-#
-# You should be mindful of over-provisioning the number of workers on an instance.
-# Eg. if the node has 8 CPUs, then number of workers on an instance * cores <= 8...
+# SLAVESIZE is used by the start-slave.sh script. By default, a Spark worker uses all the
+# resources of the machine it is started on. If you want to decrease the migration velocity,
+# you can set the maximum number of cores and memory that can be used by the worker.
 ##################################################################################################################
 
-export SPARK_MASTER_HOST={{ hostvars.spark_master.ansible_facts.default_ipv4.address }}
-export CORES=4 # number of cores per worker
-export SPARK_WORKER_INSTANCES=4 # this is how many workers will be
-                                # started/stopped on the node.
-export SLAVESIZE="-c $CORES"
\ No newline at end of file
+export SPARK_MASTER_HOST={{ hostvars.spark_master.ansible_default_ipv4.address }}
+
+export SLAVESIZE=""
+# Optionally, limit the resources available to the Spark worker by uncommenting the following lines
+#export MAX_CORES=2 # max number of cores to use on the machine
+#export MAX_MEMORY=4G # max amount of memory to use on the machine
+#export SLAVESIZE="--cores $MAX_CORES --memory $MAX_MEMORY"
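When the optional limits in the worker template are uncommented, SLAVESIZE expands to the extra arguments that start-slave.sh forwards to Spark's standalone worker script. A sketch of the effective invocation under the sample values from the template (MAX_CORES=2, MAX_MEMORY=4G); not part of the patch:

    # start-slave.sh appends $SLAVESIZE, so the worker starts capped at 2 cores and 4G:
    $SPARK_HOME/sbin/start-worker.sh spark://$SPARK_MASTER_HOST:7077 --cores 2 --memory 4G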
diff --git a/ansible/templates/submit-alternator-job.sh b/ansible/templates/submit-alternator-job.sh
index 342ee5e9..f9d554bc 100644
--- a/ansible/templates/submit-alternator-job.sh
+++ b/ansible/templates/submit-alternator-job.sh
@@ -6,12 +6,11 @@ source spark-env
 
 mkdir /tmp/savepoints
 
-#
-
 time spark-submit --class com.scylladb.migrator.Migrator \
---master spark://{{ hostvars.spark_master.ansible_facts.default_ipv4.address }}:7077 \
+--master spark://{{ hostvars.spark_master.ansible_default_ipv4.address }}:7077 \
 --conf spark.eventLog.enabled=true \
 --conf spark.scylla.config=/home/ubuntu/scylla-migrator/config.dynamodb.yml \
---conf spark.executor.memory=$MEMORY \
---conf spark.driver.memory=64G \
+--executor-memory $EXECUTOR_MEMORY \
+--executor-cores $EXECUTOR_CORES \
+--driver-memory 4G \
 /home/ubuntu/scylla-migrator/scylla-migrator-assembly.jar
\ No newline at end of file
diff --git a/ansible/templates/submit-cql-job-validator.sh b/ansible/templates/submit-cql-job-validator.sh
index cc3f9b52..048a9409 100644
--- a/ansible/templates/submit-cql-job-validator.sh
+++ b/ansible/templates/submit-cql-job-validator.sh
@@ -7,12 +7,12 @@ source spark-env
 mkdir /tmp/savepoints
 
 time spark-submit --class com.scylladb.migrator.Validator \
-  --master spark://{{ hostvars.spark_master.ansible_facts.default_ipv4.address }}:7077 \
+  --master spark://{{ hostvars.spark_master.ansible_default_ipv4.address }}:7077 \
   --conf spark.eventLog.enabled=true \
   --conf spark.scylla.config=config.yaml \
   --conf spark.cassandra.input.consistency.level=LOCAL_QUORUM \
   --conf spark.cassandra.output.consistency.level=LOCAL_QUORUM \
-  --num-executors $SPARK_WORKER_INSTANCES \
-  --executor-memory $MEMORY \
+  --executor-memory $EXECUTOR_MEMORY \
+  --executor-cores $EXECUTOR_CORES \
   --conf spark.cassandra.connection.localConnectionsPerExecutor=4 \
 /home/ubuntu/scylla-migrator/scylla-migrator-assembly.jar
\ No newline at end of file
diff --git a/ansible/templates/submit-cql-job.sh b/ansible/templates/submit-cql-job.sh
index 7d2e42a7..f167df79 100644
--- a/ansible/templates/submit-cql-job.sh
+++ b/ansible/templates/submit-cql-job.sh
@@ -7,29 +7,29 @@ source spark-env
 mkdir /tmp/savepoints
 
 time spark-submit --class com.scylladb.migrator.Migrator \
-  --master spark://{{ hostvars.spark_master.ansible_facts.default_ipv4.address }}:7077 \
+  --master spark://{{ hostvars.spark_master.ansible_default_ipv4.address }}:7077 \
   --conf spark.eventLog.enabled=true \
   --conf spark.scylla.config=config.yaml \
   --conf spark.cassandra.input.consistency.level=LOCAL_QUORUM \
   --conf spark.cassandra.output.consistency.level=LOCAL_QUORUM \
-  --num-executors $SPARK_WORKER_INSTANCES \
-  --executor-memory $MEMORY \
+  --executor-memory $EXECUTOR_MEMORY \
+  --executor-cores $EXECUTOR_CORES \
   --conf spark.cassandra.connection.localConnectionsPerExecutor=4 \
 /home/ubuntu/scylla-migrator/scylla-migrator-assembly.jar
 
 #sometimes you will need a tuning for driver memory size
 #add this config to above to tune it:
-# --conf spark.driver.memory=4G \
+# --driver-memory 4G \
 
 # debug example
 #$SPARK_HOME/spark-submit --class com.scylladb.migrator.Migrator \
 #  --driver-java-options -agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=65005 \
-#  --master spark://{{ hostvars.spark_master.ansible_facts.default_ipv4.address }}:7077 \
+#  --master spark://{{ hostvars.spark_master.ansible_default_ipv4.address }}:7077 \
 #  --conf spark.scylla.config=config.yaml \
 #  --conf spark.cassandra.input.consistency.level=LOCAL_QUORUM \
 #  --conf spark.cassandra.output.consistency.level=LOCAL_QUORUM \
-#  --num-executors 1 \
-#  --executor-memory $MEMORY \
+#  --executor-memory $EXECUTOR_MEMORY \
+#  --executor-cores $EXECUTOR_CORES \
 #  --conf "spark.executor.extraJavaOptions=-agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=64000 -XX:+HeapDumpOnOutOfMemoryError" \
 #  --conf spark.cassandra.connection.localConnectionsPerExecutor=4 \
 #  /home/ubuntu/scylla-migrator/scylla-migrator-assembly.jar
diff --git a/docs/source/getting-started/ansible.rst b/docs/source/getting-started/ansible.rst
index 7638a772..c3d47aed 100644
--- a/docs/source/getting-started/ansible.rst
+++ b/docs/source/getting-started/ansible.rst
@@ -4,7 +4,7 @@ Set Up a Spark Cluster with Ansible
 
 An `Ansible `_ playbook is provided in the `ansible folder `_ of our Git repository. The Ansible playbook will install the pre-requisites, Spark, on the master and workers added to the ``ansible/inventory/hosts`` file. Scylla-migrator will be installed on the spark master node.
 
-The Ansible playbook expects to be run in an Ubuntu environment where the directory ``/home/ubuntu`` already exists.
+The Ansible playbook expects to be run in an Ubuntu environment, by a user named ``ubuntu`` (such as the default user on AWS EC2 Ubuntu-based images).
 
 1. Clone the Migrator Git repository:
 
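Assuming the inventory described in the docs above, the playbook would typically be launched from the ansible directory of the cloned repository. This invocation is illustrative only (the exact working directory and checkout path are assumptions, not part of the patch):

    # Run as the ubuntu user against the hosts listed in ansible/inventory/hosts:
    cd scylla-migrator/ansible
    ansible-playbook -i inventory/hosts scylla-migrator.yml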