From aa000e587b7838f026989e3136e24a3763680386 Mon Sep 17 00:00:00 2001 From: Julien Richard-Foy Date: Fri, 23 Aug 2024 17:31:47 +0200 Subject: [PATCH 1/9] Apply the AlternatorEndpointProvider only on custom endpoints The previous implementation was failing when used on AWS DynamoDB because `conf.get(DynamoDBConstants.ENDPOINT)` was `null`. This was not caught by our tests because our tests always use a custom endpoint (see #113) --- .../scala/com/scylladb/migrator/DynamoUtils.scala | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/migrator/src/main/scala/com/scylladb/migrator/DynamoUtils.scala b/migrator/src/main/scala/com/scylladb/migrator/DynamoUtils.scala index 50b9dc6..26f6cda 100644 --- a/migrator/src/main/scala/com/scylladb/migrator/DynamoUtils.scala +++ b/migrator/src/main/scala/com/scylladb/migrator/DynamoUtils.scala @@ -293,10 +293,14 @@ object DynamoUtils { class AlternatorLoadBalancingEnabler extends DynamoDbClientBuilderTransformer with Configurable { private var conf: Configuration = null - override def apply(builder: DynamoDbClientBuilder): DynamoDbClientBuilder = - builder.endpointProvider( - new AlternatorEndpointProvider(URI.create(conf.get(DynamoDBConstants.ENDPOINT))) - ) + override def apply(builder: DynamoDbClientBuilder): DynamoDbClientBuilder = { + for (customEndpoint <- Option(conf.get(DynamoDBConstants.ENDPOINT))) { + builder.endpointProvider( + new AlternatorEndpointProvider(URI.create(customEndpoint)) + ) + } + builder + } override def setConf(configuration: Configuration): Unit = conf = configuration From f4a40594953b84042c02710f07640f280036ee09 Mon Sep 17 00:00:00 2001 From: Julien Richard-Foy Date: Sun, 25 Aug 2024 16:49:38 +0200 Subject: [PATCH 2/9] Set 1.0.x release series as stable in the documentation --- CONTRIBUTING.md | 9 ++++++++- docs/source/conf.py | 5 +++-- docs/source/index.rst | 1 + 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 69cbbc8..9dd7ffb 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -79,7 +79,14 @@ Follow the procedure documented [here](https://stackoverflow.com/a/15505308/5617 ## Publishing -Create a new [GitHub release](https://github.com/scylladb/scylla-migrator/releases), give it a tag name (please see below), a title, and a description, and then click Publish. A workflow will be triggered and will build the application fat-jar and upload it as a release asset. +Create a new [GitHub release](https://github.com/scylladb/scylla-migrator/releases), give it a tag name (please see below), a title, and a description, and then click Publish. A workflow will be automatically triggered and will build the application fat-jar and upload it as a release asset. Last, _fast-forward-merge_ the branch `master` into the current stable feature-branch: + +~~~ sh +git checkout 1.0.x +git pull origin 1.0.x +git merge --ff-only master +git push origin 1.0.x +~~~ Rules for the release tag name: - Make sure to use tag names like `v1.2.3`, starting with `v` and followed by a [semantic version number](https://semver.org/). diff --git a/docs/source/conf.py b/docs/source/conf.py index 915c537..44f9b95 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -14,12 +14,13 @@ TAGS = [] BRANCHES = [ "master", + "1.0.x" ] # Sets the latest version. -LATEST_VERSION = "master" +LATEST_VERSION = "1.0.x" # Set which versions are not released yet. # Todo: List master when there is more than one version. 
-UNSTABLE_VERSIONS = [] +UNSTABLE_VERSIONS = ["master"] # Set which versions are deprecated DEPRECATED_VERSIONS = [""] # Sets custom build. diff --git a/docs/source/index.rst b/docs/source/index.rst index eabac5e..623bbf7 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -21,6 +21,7 @@ The following table summarizes the required version of Spark and Scala for each Migrator Spark Scala ======== ===== ====== 0.9.x 3.5.x 2.13.x +1.x 3.5.x 2.13.x ======== ===== ====== .. toctree:: From c6f46806d2fc08ca92a0f2e4cb2646da223bfd69 Mon Sep 17 00:00:00 2001 From: Julien Richard-Foy Date: Mon, 26 Aug 2024 14:24:39 +0200 Subject: [PATCH 3/9] Use `branch-` prefix --- CONTRIBUTING.md | 6 +++--- docs/source/conf.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 9dd7ffb..7628828 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -82,10 +82,10 @@ Follow the procedure documented [here](https://stackoverflow.com/a/15505308/5617 Create a new [GitHub release](https://github.com/scylladb/scylla-migrator/releases), give it a tag name (please see below), a title, and a description, and then click Publish. A workflow will be automatically triggered and will build the application fat-jar and upload it as a release asset. Last, _fast-forward-merge_ the branch `master` into the current stable feature-branch: ~~~ sh -git checkout 1.0.x -git pull origin 1.0.x +git checkout branch-1.0.x +git pull origin branch-1.0.x git merge --ff-only master -git push origin 1.0.x +git push origin branch-1.0.x ~~~ Rules for the release tag name: diff --git a/docs/source/conf.py b/docs/source/conf.py index 44f9b95..c93f314 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -14,10 +14,10 @@ TAGS = [] BRANCHES = [ "master", - "1.0.x" + "branch-1.0.x" ] # Sets the latest version. -LATEST_VERSION = "1.0.x" +LATEST_VERSION = "branch-1.0.x" # Set which versions are not released yet. # Todo: List master when there is more than one version. 
UNSTABLE_VERSIONS = ["master"] From 0a5d6cc60199126c1120d5da86557b53069aace7 Mon Sep 17 00:00:00 2001 From: Julien Richard-Foy Date: Thu, 15 Aug 2024 16:27:03 +0200 Subject: [PATCH 4/9] Document better the resources to allocate to the Spark executors - Create a new page dedicated to the arguments that should be supplied to `spark-submit` - Link to this page from every page that explains how to set up Spark - Insist on the importance of setting `--executor-cores` and `--executor-memory` Relates to #191 --- docker-compose-tests.yml | 4 +- docker-compose.yaml | 4 +- docs/source/getting-started/aws-emr.rst | 5 +- docs/source/getting-started/docker.rst | 3 +- docs/source/getting-started/index.rst | 18 +----- .../getting-started/spark-standalone.rst | 3 +- docs/source/index.rst | 1 + docs/source/run-the-migration.rst | 61 +++++++++++++++++++ docs/source/tutorials/index.rst | 1 + 9 files changed, 76 insertions(+), 24 deletions(-) create mode 100644 docs/source/run-the-migration.rst diff --git a/docker-compose-tests.yml b/docker-compose-tests.yml index 8482f6d..8428a4c 100644 --- a/docker-compose-tests.yml +++ b/docker-compose-tests.yml @@ -69,8 +69,8 @@ services: build: dockerfiles/spark command: worker environment: - SPARK_WORKER_CORES: 3 - SPARK_WORKER_MEMORY: 1024m + SPARK_WORKER_CORES: 2 + SPARK_WORKER_MEMORY: 4G SPARK_WORKER_WEBUI_PORT: 8081 SPARK_PUBLIC_DNS: localhost expose: diff --git a/docker-compose.yaml b/docker-compose.yaml index 61f45cf..657b073 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -17,8 +17,8 @@ services: build: dockerfiles/spark command: worker environment: - SPARK_WORKER_CORES: 3 - SPARK_WORKER_MEMORY: 1024m + SPARK_WORKER_CORES: 2 + SPARK_WORKER_MEMORY: 4G SPARK_WORKER_WEBUI_PORT: 8081 SPARK_PUBLIC_DNS: localhost ports: diff --git a/docs/source/getting-started/aws-emr.rst b/docs/source/getting-started/aws-emr.rst index 61b9383..b0cedc3 100644 --- a/docs/source/getting-started/aws-emr.rst +++ b/docs/source/getting-started/aws-emr.rst @@ -65,9 +65,10 @@ This page describes how to use the Migrator in `Amazon EMR /mnt1/scylla-migrator-assembly.jar + + See a complete description of the expected arguments to ``spark-submit`` in page :doc:`Run the Migration `, and replace “<... other arguments>” above with the appropriate arguments. - See also our `general recommendations to tune the Spark job <./#run-the-migration>`_. - Add a Bootstrap action to download the Migrator and the migration configuration: diff --git a/docs/source/getting-started/docker.rst b/docs/source/getting-started/docker.rst index 071c06f..a4d49d9 100644 --- a/docs/source/getting-started/docker.rst +++ b/docs/source/getting-started/docker.rst @@ -43,11 +43,12 @@ This page describes how to set up a Spark cluster locally on your machine by usi --master spark://spark-master:7077 \ --conf spark.driver.host=spark-master \ --conf spark.scylla.config=/app/config.yaml \ + <... other arguments> \ /jars/scylla-migrator-assembly.jar The ``spark-master`` container mounts the ``./migrator/target/scala-2.13`` dir on ``/jars`` and the repository root on ``/app``. - See also our `general recommendations to tune the Spark job <./#run-the-migration>`_. + See a complete description of the expected arguments to ``spark-submit`` in page :doc:`Run the Migration `, and replace “<... other arguments>” above with the appropriate arguments. 7. You can monitor progress by observing the Spark web console you opened in step 4. Additionally, after the job has started, you can track progress via ``http://localhost:4040``. 
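For illustration, here is a sketch of how the “<... other arguments>” placeholder introduced above could be filled in for the Docker-based setup. It assumes ``spark-submit`` is invoked inside the ``spark-master`` container, and reuses the 2-core / 4G executor sizing configured for the worker in ``docker-compose.yaml`` and in the test suite:

~~~ sh
# Sketch of a complete submission for the Docker-based setup (assumed invocation
# context: a shell inside the spark-master container). Executor sizing mirrors the
# worker resources declared in docker-compose.yaml (SPARK_WORKER_CORES: 2,
# SPARK_WORKER_MEMORY: 4G).
spark-submit --class com.scylladb.migrator.Migrator \
  --master spark://spark-master:7077 \
  --conf spark.driver.host=spark-master \
  --conf spark.scylla.config=/app/config.yaml \
  --executor-cores 2 \
  --executor-memory 4G \
  /jars/scylla-migrator-assembly.jar
~~~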
diff --git a/docs/source/getting-started/index.rst b/docs/source/getting-started/index.rst index 3a7f6aa..4adf6a7 100644 --- a/docs/source/getting-started/index.rst +++ b/docs/source/getting-started/index.rst @@ -8,7 +8,7 @@ Since the Migrator is packaged as a Spark application, you first have to set up Set Up a Spark Cluster ---------------------- -A Spark cluster is made of several *nodes*, which can contain several *workers* (although there is usually just one worker per node). When you start the Migrator, the Spark *driver* looks at the job content and splits it into tasks. It then spawns *executors* on the cluster workers and feed them with the tasks to compute. +A Spark cluster is made of several *nodes*, which can contain several *workers* (although there is usually just one worker per node). When you start the Migrator, the Spark *driver* looks at the job content and splits it into tasks. It then spawns *executors* on the cluster workers and feeds them with the tasks to compute. We recommend provisioning at least 2 GB of memory per CPU on each node. For instance, a cluster node with 4 CPUs should have at least 8 GB of memory. @@ -36,21 +36,7 @@ Once you have a Spark cluster ready to run the ``scylla-migrator-assembly.jar``, Run the Migration ----------------- -The way to start the Migrator depends on how the Spark cluster was installed. Please refer to the page that describes your Spark cluster setup to see how to invoke the ``spark-submit`` command. The remainder of this section describes general options you can use to fine-tune the Migration job. - -We recommend using between 5 to 10 CPUs per Spark executor. For instance, if your Spark worker node has 16 CPUs, you could use 8 CPUs per executor (the Spark driver would then allocate two executors on the worker to fully utilize its resources). You can control the number of CPUs per executors with the argument ``--executor-cores`` passed to the ``spark-submit`` command: - -.. code-block:: bash - - --executor-cores 8 - -We also recommend using 2 GB of memory per CPU. So, if you provide 8 CPU per executor, you should require 16 GB of memory on the executor. You can control the amount of memory per executor with the argument ``--executor-memory`` passed to the ``spark-submit`` command: - -.. code-block:: bash - - --executor-memory 16G - -As long as your source and target databases are not saturated during the migration, you can increase the migration throughput by adding more worker nodes to your Spark cluster. +Start the migration by invoking the ``spark-submit`` command with the appropriate arguments, as explained in the page :doc:`/run-the-migration`. -------------- Extra Features diff --git a/docs/source/getting-started/spark-standalone.rst b/docs/source/getting-started/spark-standalone.rst index 26bba07..9a64e9a 100644 --- a/docs/source/getting-started/spark-standalone.rst +++ b/docs/source/getting-started/spark-standalone.rst @@ -30,8 +30,9 @@ This page describes how to set up a Spark cluster on your infrastructure and to spark-submit --class com.scylladb.migrator.Migrator \ --master spark://:7077 \ --conf spark.scylla.config= \ + <... other arguments> \ - See also our `general recommendations to tune the Spark job <./#run-the-migration>`_. + See a complete description of the expected arguments to ``spark-submit`` in page :doc:`Run the Migration `, and replace “”, “<... other arguments>”, and “” above with appropriate values. 6. You can monitor progress from the `Spark web UI `_. 
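As a concrete instance of the placeholders used in the standalone setup above, here is a sketch of a full submission; the hostname and file paths are hypothetical examples and must be replaced with the values from your own cluster. The executor sizing follows the 2 GB-per-core guideline described in the new “Run the Migration” page:

~~~ sh
# Hypothetical values: the hostname spark-master.example.internal and the
# /home/ubuntu/scylla-migrator/... paths are illustrative only.
# Executor sizing follows the 2 GB-per-core guideline (8 cores -> 16G).
spark-submit --class com.scylladb.migrator.Migrator \
  --master spark://spark-master.example.internal:7077 \
  --conf spark.scylla.config=/home/ubuntu/scylla-migrator/config.yaml \
  --executor-cores 8 \
  --executor-memory 16G \
  /home/ubuntu/scylla-migrator/scylla-migrator-assembly.jar
~~~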
diff --git a/docs/source/index.rst b/docs/source/index.rst index 623bbf7..bfea795 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -30,6 +30,7 @@ Migrator Spark Scala getting-started/index migrate-from-cassandra-or-parquet migrate-from-dynamodb + run-the-migration stream-changes rename-columns validate diff --git a/docs/source/run-the-migration.rst b/docs/source/run-the-migration.rst new file mode 100644 index 0000000..7307e1b --- /dev/null +++ b/docs/source/run-the-migration.rst @@ -0,0 +1,61 @@ +================= +Run the Migration +================= + +After you have `set up a Spark cluster <./getting-started#set-up-a-spark-cluster>`_ and `configured the migration <./getting-started#configure-the-migration>`_ you can start the migration by submitting a job to your Spark cluster. The command to use to submit the job depends on how the Spark cluster was installed. Please refer to the page that describes your Spark cluster setup to see how to invoke the ``spark-submit`` command. This page describes the arguments you need to pass to the ``spark-submit`` command to control the resources allocated to the migration job. + +----------------------- +Invoke ``spark-submit`` +----------------------- + +The ``spark-submit`` command submits a job to the Spark cluster. You can run it from the Spark master node. You should supply the following arguments: + +.. code-block:: bash + + spark-submit \ + --class com.scylladb.migrator.Migrator \ + --master spark://:7077 \ + --conf spark.scylla.config= \ + --executor-cores 2 \ + --executor-memory 4G \ + + +Here is an explanation of the arguments shown above: + +- ``--class com.scylladb.migrator.Migrator`` sets the entry point of the Migrator. +- ``--master spark://:7077`` indicates the URI of the Spark master node. Replace ```` with the actual hostname of your master node. +- ``--conf spark.scylla.config=`` indicates the location of the migration :doc:`configuration file `. It must be a path on the Spark master node. +- ``--executor-cores 2`` and ``--executor-memory 4G`` set the CPU and memory requirements for the Spark executors. See the section `below <#executor-resources>`_ for an explanation of how to set these values. +- Finally, ```` indicates the location of the program binaries. It must be a path on the Spark master node. + +------------------ +Executor Resources +------------------ + +When the Spark master node starts the application, it breaks down the work into multiple tasks, and spawns *executors* on the worker nodes to compute these tasks. + +.. caution:: You should explicitly indicate the CPU and memory requirements of the Spark executors, otherwise by default Spark will create a single executor using all the cores but only 1 GB of memory, which may not be enough and would lead to run-time errors such as ``OutOfMemoryError``. + +The number of CPUs and the amount of memory to allocate to the Spark executors depends on the number of CPUs and amount of memory of the Spark worker nodes. + +We recommend using between 5 to 10 CPUs per Spark executor. For instance, if your Spark worker node has 16 CPUs, you could use 8 CPUs per executor (the Spark driver would then allocate two executors on the worker to fully utilize its resources). You can control the number of CPUs per executors with the argument ``--executor-cores`` passed to the ``spark-submit`` command: + +.. code-block:: bash + + --executor-cores 8 + +We also recommend using 2 GB of memory per CPU. 
So, if you provide 8 CPU per executor, you should require 16 GB of memory on the executor. You can control the amount of memory per executor with the argument ``--executor-memory`` passed to the ``spark-submit`` command: + +.. code-block:: bash + + --executor-memory 16G + +As long as your source and target databases are not saturated during the migration, you can increase the migration throughput by adding more worker nodes to your Spark cluster. + +.. caution:: + + To decrease the migration throughput, do not decrease the number of executor cores. Indeed, if you do that, Spark will simply allocate several executors to fully utilize the resources of the cluster. If you want to decrease the migration throughput, you can: + + - use a “smaller” Spark cluster (ie, with fewer worker nodes, each having fewer cores), + - limit the number of total cores allocated to the application by passing the argument ``--conf spark.cores.max=2``, + - in the case of a DynamoDB migration, decrease the value of the configuration properties ``throughputReadPercent`` and ``throughputWritePercent``. diff --git a/docs/source/tutorials/index.rst b/docs/source/tutorials/index.rst index 27c3277..fec5a30 100644 --- a/docs/source/tutorials/index.rst +++ b/docs/source/tutorials/index.rst @@ -4,5 +4,6 @@ Tutorials .. toctree:: + :maxdepth: 1 dynamodb-to-scylladb-alternator/index From 1b074374a31f5916b3a98bd948d616f72bd1a891 Mon Sep 17 00:00:00 2001 From: Julien Richard-Foy Date: Mon, 19 Aug 2024 09:28:39 +0200 Subject: [PATCH 5/9] Add a comment about the cluster size --- docs/source/getting-started/index.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/getting-started/index.rst b/docs/source/getting-started/index.rst index 4adf6a7..de1e2d7 100644 --- a/docs/source/getting-started/index.rst +++ b/docs/source/getting-started/index.rst @@ -8,9 +8,9 @@ Since the Migrator is packaged as a Spark application, you first have to set up Set Up a Spark Cluster ---------------------- -A Spark cluster is made of several *nodes*, which can contain several *workers* (although there is usually just one worker per node). When you start the Migrator, the Spark *driver* looks at the job content and splits it into tasks. It then spawns *executors* on the cluster workers and feeds them with the tasks to compute. +A Spark cluster is made of several *nodes*, which can contain several *workers* (although there is usually just one worker per node). When you start the Migrator, the Spark *driver* looks at the job content and splits it into tasks. It then spawns *executors* on the cluster workers and feeds them with the tasks to compute. Since the tasks are processed in parallel, you can increase the possible throughput of the migration by increasing the number of worker nodes. Note that the migration throughput is also limited by the read throughput of the source database and the write throughput of the target database. -We recommend provisioning at least 2 GB of memory per CPU on each node. For instance, a cluster node with 4 CPUs should have at least 8 GB of memory. +We suggest starting with a small cluster containing a single worker node with 5 to 10 CPUs, and increasing the number of worker nodes (or the number of CPUs per node) if necessary, as long as the source and target database are not saturated. We recommend provisioning at least 2 GB of memory per CPU on each node. For instance, a cluster node with 8 CPUs should have at least 16 GB of memory. .. 
caution:: From a77ea755231995f92ac217b4b87fb8f8ae3af050 Mon Sep 17 00:00:00 2001 From: Julien Richard-Foy Date: Mon, 19 Aug 2024 09:59:36 +0200 Subject: [PATCH 6/9] dogfooding --- tests/src/test/scala/com/scylladb/migrator/SparkUtils.scala | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/src/test/scala/com/scylladb/migrator/SparkUtils.scala b/tests/src/test/scala/com/scylladb/migrator/SparkUtils.scala index e1444ea..adb76c2 100644 --- a/tests/src/test/scala/com/scylladb/migrator/SparkUtils.scala +++ b/tests/src/test/scala/com/scylladb/migrator/SparkUtils.scala @@ -44,6 +44,8 @@ object SparkUtils { "spark.driver.host=spark-master", "--conf", s"spark.scylla.config=/app/configurations/${migratorConfigFile}", + "--executor-cores", "2", + "--executor-memory", "4G", // Uncomment one of the following lines to plug a remote debugger on the Spark master or worker. // "--conf", "spark.driver.extraJavaOptions=-agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=*:5005", // "--conf", "spark.executor.extraJavaOptions=-agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=*:5006", From 1874e1c2a56ca3dbf020a0aa1b81f59f4f7aa193 Mon Sep 17 00:00:00 2001 From: Julien Richard-Foy Date: Thu, 22 Aug 2024 16:34:18 +0200 Subject: [PATCH 7/9] Fix issues in the Ansible playbook and Spark 3.5 - Use `{start,stop}-worker.sh` instead of the deprecated `{start,stop}-slave.sh` - Use `{start,stop}-mesos-shuffle-service.sh` instead of `{start,stop}-shuffle-service.sh` --- ansible/files/start-slave.sh | 2 +- ansible/files/start-spark.sh | 2 +- ansible/files/stop-slave.sh | 2 +- ansible/files/stop-spark.sh | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ansible/files/start-slave.sh b/ansible/files/start-slave.sh index 5845c6f..ebd9ac3 100644 --- a/ansible/files/start-slave.sh +++ b/ansible/files/start-slave.sh @@ -1,2 +1,2 @@ source spark-env -$SPARK_HOME/sbin/start-slave.sh spark://$SPARK_MASTER_HOST:7077 $SLAVESIZE \ No newline at end of file +$SPARK_HOME/sbin/start-worker.sh spark://$SPARK_MASTER_HOST:7077 $SLAVESIZE \ No newline at end of file diff --git a/ansible/files/start-spark.sh b/ansible/files/start-spark.sh index dd1cc7f..23df43a 100644 --- a/ansible/files/start-spark.sh +++ b/ansible/files/start-spark.sh @@ -15,4 +15,4 @@ cd $SPARK_HOME/sbin ./start-slave.sh spark://$SPARK_MASTER_HOST:7077 $SLAVESIZE -./start-shuffle-service.sh \ No newline at end of file +./start-mesos-shuffle-service.sh diff --git a/ansible/files/stop-slave.sh b/ansible/files/stop-slave.sh index bb15a8e..5f2d0ad 100644 --- a/ansible/files/stop-slave.sh +++ b/ansible/files/stop-slave.sh @@ -1,2 +1,2 @@ source spark-env -$SPARK_HOME/sbin/stop-slave.sh \ No newline at end of file +$SPARK_HOME/sbin/stop-worker.sh \ No newline at end of file diff --git a/ansible/files/stop-spark.sh b/ansible/files/stop-spark.sh index 99b4b9f..3d3d3a0 100644 --- a/ansible/files/stop-spark.sh +++ b/ansible/files/stop-spark.sh @@ -7,7 +7,7 @@ source spark-env cd $SPARK_HOME/sbin -./stop-shuffle-service.sh +./stop-mesos-shuffle-service.sh ./stop-slave.sh spark://$SPARK_MASTER_HOST:7077 $SLAVESIZE ./stop-history-server.sh From e01c75bc3ba1960e29cb4245b115ffee462e8694 Mon Sep 17 00:00:00 2001 From: Julien Richard-Foy Date: Thu, 22 Aug 2024 16:39:48 +0200 Subject: [PATCH 8/9] Align the scaling strategy of the Ansible-based solution with the other solutions Fixes #192 --- ansible/files/config.dynamodb.yml | 1 - ansible/files/start-spark.sh | 2 - ansible/files/stop-spark.sh | 1 - ansible/scylla-migrator.yml | 6 +-- 
ansible/templates/spark-env-master-sample | 45 ++++++++----------- ansible/templates/spark-env-worker-sample | 45 +++++-------------- ansible/templates/submit-alternator-job.sh | 7 ++- ansible/templates/submit-cql-job-validator.sh | 6 +-- ansible/templates/submit-cql-job.sh | 12 ++--- docs/source/getting-started/ansible.rst | 2 +- 10 files changed, 45 insertions(+), 82 deletions(-) diff --git a/ansible/files/config.dynamodb.yml b/ansible/files/config.dynamodb.yml index 8298c17..61339d1 100644 --- a/ansible/files/config.dynamodb.yml +++ b/ansible/files/config.dynamodb.yml @@ -53,7 +53,6 @@ savepoints: # Interval in which savepoints will be created intervalSeconds: 300 renames: [] -skipTokenRanges: [] validation: # Should WRITETIMEs and TTLs be compared? compareTimestamps: false diff --git a/ansible/files/start-spark.sh b/ansible/files/start-spark.sh index 23df43a..f7f1b1d 100644 --- a/ansible/files/start-spark.sh +++ b/ansible/files/start-spark.sh @@ -13,6 +13,4 @@ cd $SPARK_HOME/sbin ./start-history-server.sh -./start-slave.sh spark://$SPARK_MASTER_HOST:7077 $SLAVESIZE - ./start-mesos-shuffle-service.sh diff --git a/ansible/files/stop-spark.sh b/ansible/files/stop-spark.sh index 3d3d3a0..2d7436d 100644 --- a/ansible/files/stop-spark.sh +++ b/ansible/files/stop-spark.sh @@ -8,7 +8,6 @@ source spark-env cd $SPARK_HOME/sbin ./stop-mesos-shuffle-service.sh -./stop-slave.sh spark://$SPARK_MASTER_HOST:7077 $SLAVESIZE ./stop-history-server.sh diff --git a/ansible/scylla-migrator.yml b/ansible/scylla-migrator.yml index c4bc287..a5493a0 100644 --- a/ansible/scylla-migrator.yml +++ b/ansible/scylla-migrator.yml @@ -43,7 +43,7 @@ - name: Unarchive the installer. unarchive: - src: awscliv2.zip + src: "{{ home_dir }}/awscliv2.zip" dest: "{{ home_dir }}" remote_src: yes @@ -71,7 +71,7 @@ - name: Extract spark ansible.builtin.unarchive: - src: "spark-3.5.1-bin-hadoop3-scala2.13.tgz" + src: "{{ home_dir }}/spark-3.5.1-bin-hadoop3-scala2.13.tgz" dest: "{{ home_dir }}" remote_src: yes @@ -83,7 +83,7 @@ - name: Move spark to opt/spark become: true - command: "sudo mv spark-3.5.1-bin-hadoop3-scala2.13 {{ spark_home }}" + command: "sudo mv {{ home_dir }}/spark-3.5.1-bin-hadoop3-scala2.13 {{ spark_home }}" - name: Set JAVA_HOME ansible.builtin.lineinfile: diff --git a/ansible/templates/spark-env-master-sample b/ansible/templates/spark-env-master-sample index 6076749..ff31235 100644 --- a/ansible/templates/spark-env-master-sample +++ b/ansible/templates/spark-env-master-sample @@ -2,38 +2,31 @@ # Master node requires a lot of memory. We allocate 64G for the driver to avoid OOM when # there are a lot of tasks. # -# For this example, we're not using any workers on master. -# so CORES = 0 and SPARK_WORKER_INSTANCES = 0. +# EXECUTOR_MEMORY and EXECUTOR_CORES are used in the spark-submit job and controls the memory +# and CPUs per executor. Here are our recommendations to set the number of cores per executor: +# - it should be between 5 and 10 +# - the total number of cores per worker node should be a multiple of the number of cores +# per executor +# - it can not be higher than the number of cores on a worker node +# For example, if there are 16 cores on each worker node, we could set EXECUTOR_CORES=8, +# or if there are 2 cores on each worker node, we could set EXECUTOR_CORES=2. # -# MEMORY is used in the spark-submit job and allocates the memory per executor. -# You can have one or more executors per worker. -# -# By using multiple workers on an instance, we can control the velocity of the migration. 
+# By using multiple worker nodes, we can control the velocity of the migration. # # Eg. # Target system is 3 x i4i.4xlarge (16 vCPU, 128G) -# We can provision 3 x i4i.2xlarge (8vCPU, 64G) +# We can provision 2 x m7g.2xlarge (8vCPU, 32G) # -# SPARK_WORKER_INSTANCES = 1 -# CORES = 4 -# MEMORY = 8G (4 cores per worker * 2G) +# EXECUTOR_CORES=8 +# EXECUTOR_MEMORY=16G # (8 cores per worker * 2G) # -# Start 1 worker per node, or one at a time. +# Start the Spark master node and the Spark worker node, and run the migration. # - Monitor the pressure on the source system -# - Monitor the velocity of the migration in the spark jobs monito -# -# You can increase the velocity by updating spark-env on worker nodes, increase the -# SPARK_WORKER_INSTANCES = 2, run ./start-worker.sh. -# So long as there are tasks available, the new workers would pick up tasks and increase -# the velocity of the migration. -# -# You should be mindful of over-provisioning the number of workers on an instance. -# Eg. if the node has 8 CPUs, then number of workers on an instance * cores <= 8... +# - Monitor the velocity of the migration in the spark jobs monitoring UI +# - If necessary, provision more worker nodes ################################################################################################################## -export CORES=0 # 0 cores for slaves on master node -export MEMORY=8G # cores per worker x 2G -export SPARK_WORKER_INSTANCES=0 # no workers on master node -export SPARK_MASTER_HOST={{ hostvars.spark_master.ansible_facts.default_ipv4.address }} - -export SLAVESIZE="-c $CORES " \ No newline at end of file +export SPARK_MASTER_HOST={{ hostvars.spark_master.ansible_default_ipv4.address }} +export EXECUTOR_CORES=4 +# By default, allocate 2GB of memory per core +export EXECUTOR_MEMORY="$((EXECUTOR_CORES * 2))G" diff --git a/ansible/templates/spark-env-worker-sample b/ansible/templates/spark-env-worker-sample index 7282668..896308b 100644 --- a/ansible/templates/spark-env-worker-sample +++ b/ansible/templates/spark-env-worker-sample @@ -1,38 +1,13 @@ ################################################################################################################## -# Master node requires a lot of memory. We allocate 64G for the driver to avoid OOM when -# there are a lot of tasks. -# -# For this example, we're not using any workers on master. -# so CORES = 0 and SPARK_WORKER_INSTANCES = 0. -# -# MEMORY is used in the spark-submit job and allocates the memory per executor. -# You can have one or more executors per worker. -# -# By using multiple workers on an instance, we can control the velocity of the migration. -# -# Eg. -# Target system is 3 x i4i.4xlarge (16 vCPU, 128G) -# We can provision 3 x i4i.2xlarge (8vCPU, 64G) -# -# SPARK_WORKER_INSTANCES = 1 -# CORES = 4 -# MEMORY = 8G (4 cores per worker * 2G) -# -# Start 1 worker per node, or one at a time. -# - Monitor the pressure on the source system -# - Monitor the velocity of the migration in the spark jobs monito -# -# You can increase the velocity by updating spark-env on worker nodes, increase the -# SPARK_WORKER_INSTANCES = 2, run ./start-worker.sh. -# So long as there are tasks available, the new workers would pick up tasks and increase -# the velocity of the migration. -# -# You should be mindful of over-provisioning the number of workers on an instance. -# Eg. if the node has 8 CPUs, then number of workers on an instance * cores <= 8... +# SLAVESIZE is used by the start-slave.sh script. 
By default, a Spark worker uses all the +# resources of the machine it is started on. If you want to decrease the migration velocity, +# you can set the maximum number of cores and memory that can be used by the worker. ################################################################################################################## -export SPARK_MASTER_HOST={{ hostvars.spark_master.ansible_facts.default_ipv4.address }} -export CORES=4 # number of cores per worker -export SPARK_WORKER_INSTANCES=4 # this is how many workers will be - # started/stopped on the node. -export SLAVESIZE="-c $CORES" \ No newline at end of file +export SPARK_MASTER_HOST={{ hostvars.spark_master.ansible_default_ipv4.address }} + +export SLAVESIZE="" +# Optionally, limit the resources available to the Spark worker by uncommenting the following lines +#export MAX_CORES=2 # max number of cores to use on the machine +#export MAX_MEMORY=4G # max amount of memory to use on the machine +#export SLAVESIZE="--cores $MAX_CORES --memory $MAX_MEMORY" diff --git a/ansible/templates/submit-alternator-job.sh b/ansible/templates/submit-alternator-job.sh index 342ee5e..7acbdb0 100644 --- a/ansible/templates/submit-alternator-job.sh +++ b/ansible/templates/submit-alternator-job.sh @@ -6,12 +6,11 @@ source spark-env mkdir /tmp/savepoints -# - time spark-submit --class com.scylladb.migrator.Migrator \ ---master spark://{{ hostvars.spark_master.ansible_facts.default_ipv4.address }}:7077 \ +--master spark://{{ hostvars.spark_master.ansible_default_ipv4.address }}:7077 \ --conf spark.eventLog.enabled=true \ --conf spark.scylla.config=/home/ubuntu/scylla-migrator/config.dynamodb.yml \ ---conf spark.executor.memory=$MEMORY \ +--executor-memory $EXECUTOR_MEMORY \ +--executor-cores $EXECUTOR_CORES \ --conf spark.driver.memory=64G \ /home/ubuntu/scylla-migrator/scylla-migrator-assembly.jar \ No newline at end of file diff --git a/ansible/templates/submit-cql-job-validator.sh b/ansible/templates/submit-cql-job-validator.sh index cc3f9b5..048a940 100644 --- a/ansible/templates/submit-cql-job-validator.sh +++ b/ansible/templates/submit-cql-job-validator.sh @@ -7,12 +7,12 @@ source spark-env mkdir /tmp/savepoints time spark-submit --class com.scylladb.migrator.Validator \ - --master spark://{{ hostvars.spark_master.ansible_facts.default_ipv4.address }}:7077 \ + --master spark://{{ hostvars.spark_master.ansible_default_ipv4.address }}:7077 \ --conf spark.eventLog.enabled=true \ --conf spark.scylla.config=config.yaml \ --conf spark.cassandra.input.consistency.level=LOCAL_QUORUM \ --conf spark.cassandra.output.consistency.level=LOCAL_QUORUM \ - --num-executors $SPARK_WORKER_INSTANCES \ - --executor-memory $MEMORY \ + --executor-memory $EXECUTOR_MEMORY \ + --executor-cores $EXECUTOR_CORES \ --conf spark.cassandra.connection.localConnectionsPerExecutor=4 \ /home/ubuntu/scylla-migrator/scylla-migrator-assembly.jar \ No newline at end of file diff --git a/ansible/templates/submit-cql-job.sh b/ansible/templates/submit-cql-job.sh index 7d2e42a..e9f6143 100644 --- a/ansible/templates/submit-cql-job.sh +++ b/ansible/templates/submit-cql-job.sh @@ -7,13 +7,13 @@ source spark-env mkdir /tmp/savepoints time spark-submit --class com.scylladb.migrator.Migrator \ - --master spark://{{ hostvars.spark_master.ansible_facts.default_ipv4.address }}:7077 \ + --master spark://{{ hostvars.spark_master.ansible_default_ipv4.address }}:7077 \ --conf spark.eventLog.enabled=true \ --conf spark.scylla.config=config.yaml \ --conf 
spark.cassandra.input.consistency.level=LOCAL_QUORUM \ --conf spark.cassandra.output.consistency.level=LOCAL_QUORUM \ - --num-executors $SPARK_WORKER_INSTANCES \ - --executor-memory $MEMORY \ + --executor-memory $EXECUTOR_MEMORY \ + --executor-cores $EXECUTOR_CORES \ --conf spark.cassandra.connection.localConnectionsPerExecutor=4 \ /home/ubuntu/scylla-migrator/scylla-migrator-assembly.jar @@ -24,12 +24,12 @@ time spark-submit --class com.scylladb.migrator.Migrator \ # debug example #$SPARK_HOME/spark-submit --class com.scylladb.migrator.Migrator \ # --driver-java-options -agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=65005 \ -# --master spark://{{ hostvars.spark_master.ansible_facts.default_ipv4.address }}:7077 \ +# --master spark://{{ hostvars.spark_master.ansible_default_ipv4.address }}:7077 \ # --conf spark.scylla.config=config.yaml \ # --conf spark.cassandra.input.consistency.level=LOCAL_QUORUM \ # --conf spark.cassandra.output.consistency.level=LOCAL_QUORUM \ -# --num-executors 1 \ -# --executor-memory $MEMORY \ +# --executor-memory $EXECUTOR_MEMORY \ +# --executor-cores $EXECUTOR_CORES \ # --conf "spark.executor.extraJavaOptions=-agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=64000 -XX:+HeapDumpOnOutOfMemoryError" \ # --conf spark.cassandra.connection.localConnectionsPerExecutor=4 \ # /home/ubuntu/scylla-migrator/scylla-migrator-assembly.jar diff --git a/docs/source/getting-started/ansible.rst b/docs/source/getting-started/ansible.rst index 7638a77..c3d47ae 100644 --- a/docs/source/getting-started/ansible.rst +++ b/docs/source/getting-started/ansible.rst @@ -4,7 +4,7 @@ Set Up a Spark Cluster with Ansible An `Ansible `_ playbook is provided in the `ansible folder `_ of our Git repository. The Ansible playbook will install the pre-requisites, Spark, on the master and workers added to the ``ansible/inventory/hosts`` file. Scylla-migrator will be installed on the spark master node. -The Ansible playbook expects to be run in an Ubuntu environment where the directory ``/home/ubuntu`` already exists. +The Ansible playbook expects to be run in an Ubuntu environment, by a user named ``ubuntu`` (like you get in AWS EC2 Ubuntu-based images). 1. Clone the Migrator Git repository: From 84f8e9b75a511eb394e9559428d1872997da58e5 Mon Sep 17 00:00:00 2001 From: Julien Richard-Foy Date: Fri, 23 Aug 2024 09:20:20 +0200 Subject: [PATCH 9/9] Use --driver-memory instead of --conf spark.driver.memory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit According to Spark documentation, the configuration property `spark.driver.memory` has no effect in our case (we use the “client” deploy-mode): > In client mode, this config must not be set through the SparkConf directly in your application, because the driver JVM has already started at that point. Instead, please set this through the --driver-memory command line option or in your default properties file. 
https://spark.apache.org/docs/latest/configuration.html --- ansible/templates/submit-alternator-job.sh | 2 +- ansible/templates/submit-cql-job.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ansible/templates/submit-alternator-job.sh b/ansible/templates/submit-alternator-job.sh index 7acbdb0..f9d554b 100644 --- a/ansible/templates/submit-alternator-job.sh +++ b/ansible/templates/submit-alternator-job.sh @@ -12,5 +12,5 @@ time spark-submit --class com.scylladb.migrator.Migrator \ --conf spark.scylla.config=/home/ubuntu/scylla-migrator/config.dynamodb.yml \ --executor-memory $EXECUTOR_MEMORY \ --executor-cores $EXECUTOR_CORES \ ---conf spark.driver.memory=64G \ +--driver-memory 4G \ /home/ubuntu/scylla-migrator/scylla-migrator-assembly.jar \ No newline at end of file diff --git a/ansible/templates/submit-cql-job.sh b/ansible/templates/submit-cql-job.sh index e9f6143..f167df7 100644 --- a/ansible/templates/submit-cql-job.sh +++ b/ansible/templates/submit-cql-job.sh @@ -19,7 +19,7 @@ time spark-submit --class com.scylladb.migrator.Migrator \ #sometimes you will need a tuning for driver memory size #add this config to above to tune it: -# --conf spark.driver.memory=4G \ +# --driver-memory 4G \ # debug example #$SPARK_HOME/spark-submit --class com.scylladb.migrator.Migrator \