From aa000e587b7838f026989e3136e24a3763680386 Mon Sep 17 00:00:00 2001 From: Julien Richard-Foy Date: Fri, 23 Aug 2024 17:31:47 +0200 Subject: [PATCH 1/9] Apply the AlternatorEndpointProvider only on custom endpoints The previous implementation was failing when used on AWS DynamoDB because `conf.get(DynamoDBConstants.ENDPOINT)` was `null`. This was not caught by our tests because our tests always use a custom endpoint (see #113) --- .../scala/com/scylladb/migrator/DynamoUtils.scala | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/migrator/src/main/scala/com/scylladb/migrator/DynamoUtils.scala b/migrator/src/main/scala/com/scylladb/migrator/DynamoUtils.scala index 50b9dc6..26f6cda 100644 --- a/migrator/src/main/scala/com/scylladb/migrator/DynamoUtils.scala +++ b/migrator/src/main/scala/com/scylladb/migrator/DynamoUtils.scala @@ -293,10 +293,14 @@ object DynamoUtils { class AlternatorLoadBalancingEnabler extends DynamoDbClientBuilderTransformer with Configurable { private var conf: Configuration = null - override def apply(builder: DynamoDbClientBuilder): DynamoDbClientBuilder = - builder.endpointProvider( - new AlternatorEndpointProvider(URI.create(conf.get(DynamoDBConstants.ENDPOINT))) - ) + override def apply(builder: DynamoDbClientBuilder): DynamoDbClientBuilder = { + for (customEndpoint <- Option(conf.get(DynamoDBConstants.ENDPOINT))) { + builder.endpointProvider( + new AlternatorEndpointProvider(URI.create(customEndpoint)) + ) + } + builder + } override def setConf(configuration: Configuration): Unit = conf = configuration From f4a40594953b84042c02710f07640f280036ee09 Mon Sep 17 00:00:00 2001 From: Julien Richard-Foy Date: Sun, 25 Aug 2024 16:49:38 +0200 Subject: [PATCH 2/9] Set 1.0.x release series as stable in the documentation --- CONTRIBUTING.md | 9 ++++++++- docs/source/conf.py | 5 +++-- docs/source/index.rst | 1 + 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 69cbbc8..9dd7ffb 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -79,7 +79,14 @@ Follow the procedure documented [here](https://stackoverflow.com/a/15505308/5617 ## Publishing -Create a new [GitHub release](https://github.com/scylladb/scylla-migrator/releases), give it a tag name (please see below), a title, and a description, and then click Publish. A workflow will be triggered and will build the application fat-jar and upload it as a release asset. +Create a new [GitHub release](https://github.com/scylladb/scylla-migrator/releases), give it a tag name (please see below), a title, and a description, and then click Publish. A workflow will be automatically triggered and will build the application fat-jar and upload it as a release asset. Last, _fast-forward-merge_ the branch `master` into the current stable feature-branch: + +~~~ sh +git checkout 1.0.x +git pull origin 1.0.x +git merge --ff-only master +git push origin 1.0.x +~~~ Rules for the release tag name: - Make sure to use tag names like `v1.2.3`, starting with `v` and followed by a [semantic version number](https://semver.org/). diff --git a/docs/source/conf.py b/docs/source/conf.py index 915c537..44f9b95 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -14,12 +14,13 @@ TAGS = [] BRANCHES = [ "master", + "1.0.x" ] # Sets the latest version. -LATEST_VERSION = "master" +LATEST_VERSION = "1.0.x" # Set which versions are not released yet. # Todo: List master when there is more than one version. 
-UNSTABLE_VERSIONS = [] +UNSTABLE_VERSIONS = ["master"] # Set which versions are deprecated DEPRECATED_VERSIONS = [""] # Sets custom build. diff --git a/docs/source/index.rst b/docs/source/index.rst index eabac5e..623bbf7 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -21,6 +21,7 @@ The following table summarizes the required version of Spark and Scala for each Migrator Spark Scala ======== ===== ====== 0.9.x 3.5.x 2.13.x +1.x 3.5.x 2.13.x ======== ===== ====== .. toctree:: From c6f46806d2fc08ca92a0f2e4cb2646da223bfd69 Mon Sep 17 00:00:00 2001 From: Julien Richard-Foy Date: Mon, 26 Aug 2024 14:24:39 +0200 Subject: [PATCH 3/9] Use `branch-` prefix --- CONTRIBUTING.md | 6 +++--- docs/source/conf.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 9dd7ffb..7628828 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -82,10 +82,10 @@ Follow the procedure documented [here](https://stackoverflow.com/a/15505308/5617 Create a new [GitHub release](https://github.com/scylladb/scylla-migrator/releases), give it a tag name (please see below), a title, and a description, and then click Publish. A workflow will be automatically triggered and will build the application fat-jar and upload it as a release asset. Last, _fast-forward-merge_ the branch `master` into the current stable feature-branch: ~~~ sh -git checkout 1.0.x -git pull origin 1.0.x +git checkout branch-1.0.x +git pull origin branch-1.0.x git merge --ff-only master -git push origin 1.0.x +git push origin branch-1.0.x ~~~ Rules for the release tag name: diff --git a/docs/source/conf.py b/docs/source/conf.py index 44f9b95..c93f314 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -14,10 +14,10 @@ TAGS = [] BRANCHES = [ "master", - "1.0.x" + "branch-1.0.x" ] # Sets the latest version. -LATEST_VERSION = "1.0.x" +LATEST_VERSION = "branch-1.0.x" # Set which versions are not released yet. # Todo: List master when there is more than one version. 
UNSTABLE_VERSIONS = ["master"] From 0a5d6cc60199126c1120d5da86557b53069aace7 Mon Sep 17 00:00:00 2001 From: Julien Richard-Foy Date: Thu, 15 Aug 2024 16:27:03 +0200 Subject: [PATCH 4/9] Document better the resources to allocate to the Spark executors - Create a new page dedicated to the arguments that should be supplied to `spark-submit` - Link to this page from every page that explains how to set up Spark - Insist on the importance of setting `--executor-cores` and `--executor-memory` Relates to #191 --- docker-compose-tests.yml | 4 +- docker-compose.yaml | 4 +- docs/source/getting-started/aws-emr.rst | 5 +- docs/source/getting-started/docker.rst | 3 +- docs/source/getting-started/index.rst | 18 +----- .../getting-started/spark-standalone.rst | 3 +- docs/source/index.rst | 1 + docs/source/run-the-migration.rst | 61 +++++++++++++++++++ docs/source/tutorials/index.rst | 1 + 9 files changed, 76 insertions(+), 24 deletions(-) create mode 100644 docs/source/run-the-migration.rst diff --git a/docker-compose-tests.yml b/docker-compose-tests.yml index 8482f6d..8428a4c 100644 --- a/docker-compose-tests.yml +++ b/docker-compose-tests.yml @@ -69,8 +69,8 @@ services: build: dockerfiles/spark command: worker environment: - SPARK_WORKER_CORES: 3 - SPARK_WORKER_MEMORY: 1024m + SPARK_WORKER_CORES: 2 + SPARK_WORKER_MEMORY: 4G SPARK_WORKER_WEBUI_PORT: 8081 SPARK_PUBLIC_DNS: localhost expose: diff --git a/docker-compose.yaml b/docker-compose.yaml index 61f45cf..657b073 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -17,8 +17,8 @@ services: build: dockerfiles/spark command: worker environment: - SPARK_WORKER_CORES: 3 - SPARK_WORKER_MEMORY: 1024m + SPARK_WORKER_CORES: 2 + SPARK_WORKER_MEMORY: 4G SPARK_WORKER_WEBUI_PORT: 8081 SPARK_PUBLIC_DNS: localhost ports: diff --git a/docs/source/getting-started/aws-emr.rst b/docs/source/getting-started/aws-emr.rst index 61b9383..b0cedc3 100644 --- a/docs/source/getting-started/aws-emr.rst +++ b/docs/source/getting-started/aws-emr.rst @@ -65,9 +65,10 @@ This page describes how to use the Migrator in `Amazon EMR /mnt1/scylla-migrator-assembly.jar + + See a complete description of the expected arguments to ``spark-submit`` in page :doc:`Run the Migration `, and replace “<... other arguments>” above with the appropriate arguments. - See also our `general recommendations to tune the Spark job <./#run-the-migration>`_. - Add a Bootstrap action to download the Migrator and the migration configuration: diff --git a/docs/source/getting-started/docker.rst b/docs/source/getting-started/docker.rst index 071c06f..a4d49d9 100644 --- a/docs/source/getting-started/docker.rst +++ b/docs/source/getting-started/docker.rst @@ -43,11 +43,12 @@ This page describes how to set up a Spark cluster locally on your machine by usi --master spark://spark-master:7077 \ --conf spark.driver.host=spark-master \ --conf spark.scylla.config=/app/config.yaml \ + <... other arguments> \ /jars/scylla-migrator-assembly.jar The ``spark-master`` container mounts the ``./migrator/target/scala-2.13`` dir on ``/jars`` and the repository root on ``/app``. - See also our `general recommendations to tune the Spark job <./#run-the-migration>`_. + See a complete description of the expected arguments to ``spark-submit`` in page :doc:`Run the Migration `, and replace “<... other arguments>” above with the appropriate arguments. 7. You can monitor progress by observing the Spark web console you opened in step 4. Additionally, after the job has started, you can track progress via ``http://localhost:4040``. 
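For illustration, here is a sketch of how the “<... other arguments>” placeholder introduced above could be filled in for the Docker-based setup. It assumes ``spark-submit`` is invoked inside the ``spark-master`` container, and reuses the 2-core / 4G executor sizing configured for the worker in ``docker-compose.yaml`` and in the test suite:

~~~ sh
# Sketch of a complete submission for the Docker-based setup (assumed invocation
# context: a shell inside the spark-master container). Executor sizing mirrors the
# worker resources declared in docker-compose.yaml (SPARK_WORKER_CORES: 2,
# SPARK_WORKER_MEMORY: 4G).
spark-submit --class com.scylladb.migrator.Migrator \
  --master spark://spark-master:7077 \
  --conf spark.driver.host=spark-master \
  --conf spark.scylla.config=/app/config.yaml \
  --executor-cores 2 \
  --executor-memory 4G \
  /jars/scylla-migrator-assembly.jar
~~~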
diff --git a/docs/source/getting-started/index.rst b/docs/source/getting-started/index.rst index 3a7f6aa..4adf6a7 100644 --- a/docs/source/getting-started/index.rst +++ b/docs/source/getting-started/index.rst @@ -8,7 +8,7 @@ Since the Migrator is packaged as a Spark application, you first have to set up Set Up a Spark Cluster ---------------------- -A Spark cluster is made of several *nodes*, which can contain several *workers* (although there is usually just one worker per node). When you start the Migrator, the Spark *driver* looks at the job content and splits it into tasks. It then spawns *executors* on the cluster workers and feed them with the tasks to compute. +A Spark cluster is made of several *nodes*, which can contain several *workers* (although there is usually just one worker per node). When you start the Migrator, the Spark *driver* looks at the job content and splits it into tasks. It then spawns *executors* on the cluster workers and feeds them with the tasks to compute. We recommend provisioning at least 2 GB of memory per CPU on each node. For instance, a cluster node with 4 CPUs should have at least 8 GB of memory. @@ -36,21 +36,7 @@ Once you have a Spark cluster ready to run the ``scylla-migrator-assembly.jar``, Run the Migration ----------------- -The way to start the Migrator depends on how the Spark cluster was installed. Please refer to the page that describes your Spark cluster setup to see how to invoke the ``spark-submit`` command. The remainder of this section describes general options you can use to fine-tune the Migration job. - -We recommend using between 5 to 10 CPUs per Spark executor. For instance, if your Spark worker node has 16 CPUs, you could use 8 CPUs per executor (the Spark driver would then allocate two executors on the worker to fully utilize its resources). You can control the number of CPUs per executors with the argument ``--executor-cores`` passed to the ``spark-submit`` command: - -.. code-block:: bash - - --executor-cores 8 - -We also recommend using 2 GB of memory per CPU. So, if you provide 8 CPU per executor, you should require 16 GB of memory on the executor. You can control the amount of memory per executor with the argument ``--executor-memory`` passed to the ``spark-submit`` command: - -.. code-block:: bash - - --executor-memory 16G - -As long as your source and target databases are not saturated during the migration, you can increase the migration throughput by adding more worker nodes to your Spark cluster. +Start the migration by invoking the ``spark-submit`` command with the appropriate arguments, as explained in the page :doc:`/run-the-migration`. -------------- Extra Features diff --git a/docs/source/getting-started/spark-standalone.rst b/docs/source/getting-started/spark-standalone.rst index 26bba07..9a64e9a 100644 --- a/docs/source/getting-started/spark-standalone.rst +++ b/docs/source/getting-started/spark-standalone.rst @@ -30,8 +30,9 @@ This page describes how to set up a Spark cluster on your infrastructure and to spark-submit --class com.scylladb.migrator.Migrator \ --master spark://:7077 \ --conf spark.scylla.config= \ + <... other arguments> \ - See also our `general recommendations to tune the Spark job <./#run-the-migration>`_. + See a complete description of the expected arguments to ``spark-submit`` in page :doc:`Run the Migration `, and replace “”, “<... other arguments>”, and “” above with appropriate values. 6. You can monitor progress from the `Spark web UI `_. 
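As a concrete instance of the placeholders used in the standalone setup above, here is a sketch of a full submission; the hostname and file paths are hypothetical examples and must be replaced with the values from your own cluster. The executor sizing follows the 2 GB-per-core guideline described in the new “Run the Migration” page:

~~~ sh
# Hypothetical values: the hostname spark-master.example.internal and the
# /home/ubuntu/scylla-migrator/... paths are illustrative only.
# Executor sizing follows the 2 GB-per-core guideline (8 cores -> 16G).
spark-submit --class com.scylladb.migrator.Migrator \
  --master spark://spark-master.example.internal:7077 \
  --conf spark.scylla.config=/home/ubuntu/scylla-migrator/config.yaml \
  --executor-cores 8 \
  --executor-memory 16G \
  /home/ubuntu/scylla-migrator/scylla-migrator-assembly.jar
~~~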
diff --git a/docs/source/index.rst b/docs/source/index.rst index 623bbf7..bfea795 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -30,6 +30,7 @@ Migrator Spark Scala getting-started/index migrate-from-cassandra-or-parquet migrate-from-dynamodb + run-the-migration stream-changes rename-columns validate diff --git a/docs/source/run-the-migration.rst b/docs/source/run-the-migration.rst new file mode 100644 index 0000000..7307e1b --- /dev/null +++ b/docs/source/run-the-migration.rst @@ -0,0 +1,61 @@ +================= +Run the Migration +================= + +After you have `set up a Spark cluster <./getting-started#set-up-a-spark-cluster>`_ and `configured the migration <./getting-started#configure-the-migration>`_ you can start the migration by submitting a job to your Spark cluster. The command to use to submit the job depends on how the Spark cluster was installed. Please refer to the page that describes your Spark cluster setup to see how to invoke the ``spark-submit`` command. This page describes the arguments you need to pass to the ``spark-submit`` command to control the resources allocated to the migration job. + +----------------------- +Invoke ``spark-submit`` +----------------------- + +The ``spark-submit`` command submits a job to the Spark cluster. You can run it from the Spark master node. You should supply the following arguments: + +.. code-block:: bash + + spark-submit \ + --class com.scylladb.migrator.Migrator \ + --master spark://:7077 \ + --conf spark.scylla.config= \ + --executor-cores 2 \ + --executor-memory 4G \ + + +Here is an explanation of the arguments shown above: + +- ``--class com.scylladb.migrator.Migrator`` sets the entry point of the Migrator. +- ``--master spark://:7077`` indicates the URI of the Spark master node. Replace ```` with the actual hostname of your master node. +- ``--conf spark.scylla.config=`` indicates the location of the migration :doc:`configuration file `. It must be a path on the Spark master node. +- ``--executor-cores 2`` and ``--executor-memory 4G`` set the CPU and memory requirements for the Spark executors. See the section `below <#executor-resources>`_ for an explanation of how to set these values. +- Finally, ```` indicates the location of the program binaries. It must be a path on the Spark master node. + +------------------ +Executor Resources +------------------ + +When the Spark master node starts the application, it breaks down the work into multiple tasks, and spawns *executors* on the worker nodes to compute these tasks. + +.. caution:: You should explicitly indicate the CPU and memory requirements of the Spark executors, otherwise by default Spark will create a single executor using all the cores but only 1 GB of memory, which may not be enough and would lead to run-time errors such as ``OutOfMemoryError``. + +The number of CPUs and the amount of memory to allocate to the Spark executors depends on the number of CPUs and amount of memory of the Spark worker nodes. + +We recommend using between 5 to 10 CPUs per Spark executor. For instance, if your Spark worker node has 16 CPUs, you could use 8 CPUs per executor (the Spark driver would then allocate two executors on the worker to fully utilize its resources). You can control the number of CPUs per executors with the argument ``--executor-cores`` passed to the ``spark-submit`` command: + +.. code-block:: bash + + --executor-cores 8 + +We also recommend using 2 GB of memory per CPU. 
So, if you provide 8 CPU per executor, you should require 16 GB of memory on the executor. You can control the amount of memory per executor with the argument ``--executor-memory`` passed to the ``spark-submit`` command: + +.. code-block:: bash + + --executor-memory 16G + +As long as your source and target databases are not saturated during the migration, you can increase the migration throughput by adding more worker nodes to your Spark cluster. + +.. caution:: + + To decrease the migration throughput, do not decrease the number of executor cores. Indeed, if you do that, Spark will simply allocate several executors to fully utilize the resources of the cluster. If you want to decrease the migration throughput, you can: + + - use a “smaller” Spark cluster (ie, with fewer worker nodes, each having fewer cores), + - limit the number of total cores allocated to the application by passing the argument ``--conf spark.cores.max=2``, + - in the case of a DynamoDB migration, decrease the value of the configuration properties ``throughputReadPercent`` and ``throughputWritePercent``. diff --git a/docs/source/tutorials/index.rst b/docs/source/tutorials/index.rst index 27c3277..fec5a30 100644 --- a/docs/source/tutorials/index.rst +++ b/docs/source/tutorials/index.rst @@ -4,5 +4,6 @@ Tutorials .. toctree:: + :maxdepth: 1 dynamodb-to-scylladb-alternator/index From 1b074374a31f5916b3a98bd948d616f72bd1a891 Mon Sep 17 00:00:00 2001 From: Julien Richard-Foy Date: Mon, 19 Aug 2024 09:28:39 +0200 Subject: [PATCH 5/9] Add a comment about the cluster size --- docs/source/getting-started/index.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/getting-started/index.rst b/docs/source/getting-started/index.rst index 4adf6a7..de1e2d7 100644 --- a/docs/source/getting-started/index.rst +++ b/docs/source/getting-started/index.rst @@ -8,9 +8,9 @@ Since the Migrator is packaged as a Spark application, you first have to set up Set Up a Spark Cluster ---------------------- -A Spark cluster is made of several *nodes*, which can contain several *workers* (although there is usually just one worker per node). When you start the Migrator, the Spark *driver* looks at the job content and splits it into tasks. It then spawns *executors* on the cluster workers and feeds them with the tasks to compute. +A Spark cluster is made of several *nodes*, which can contain several *workers* (although there is usually just one worker per node). When you start the Migrator, the Spark *driver* looks at the job content and splits it into tasks. It then spawns *executors* on the cluster workers and feeds them with the tasks to compute. Since the tasks are processed in parallel, you can increase the possible throughput of the migration by increasing the number of worker nodes. Note that the migration throughput is also limited by the read throughput of the source database and the write throughput of the target database. -We recommend provisioning at least 2 GB of memory per CPU on each node. For instance, a cluster node with 4 CPUs should have at least 8 GB of memory. +We suggest starting with a small cluster containing a single worker node with 5 to 10 CPUs, and increasing the number of worker nodes (or the number of CPUs per node) if necessary, as long as the source and target database are not saturated. We recommend provisioning at least 2 GB of memory per CPU on each node. For instance, a cluster node with 8 CPUs should have at least 16 GB of memory. .. 
caution:: From a77ea755231995f92ac217b4b87fb8f8ae3af050 Mon Sep 17 00:00:00 2001 From: Julien Richard-Foy Date: Mon, 19 Aug 2024 09:59:36 +0200 Subject: [PATCH 6/9] dogfooding --- tests/src/test/scala/com/scylladb/migrator/SparkUtils.scala | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/src/test/scala/com/scylladb/migrator/SparkUtils.scala b/tests/src/test/scala/com/scylladb/migrator/SparkUtils.scala index e1444ea..adb76c2 100644 --- a/tests/src/test/scala/com/scylladb/migrator/SparkUtils.scala +++ b/tests/src/test/scala/com/scylladb/migrator/SparkUtils.scala @@ -44,6 +44,8 @@ object SparkUtils { "spark.driver.host=spark-master", "--conf", s"spark.scylla.config=/app/configurations/${migratorConfigFile}", + "--executor-cores", "2", + "--executor-memory", "4G", // Uncomment one of the following lines to plug a remote debugger on the Spark master or worker. // "--conf", "spark.driver.extraJavaOptions=-agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=*:5005", // "--conf", "spark.executor.extraJavaOptions=-agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=*:5006", From 1874e1c2a56ca3dbf020a0aa1b81f59f4f7aa193 Mon Sep 17 00:00:00 2001 From: Julien Richard-Foy Date: Thu, 22 Aug 2024 16:34:18 +0200 Subject: [PATCH 7/9] Fix issues in the Ansible playbook and Spark 3.5 - Use `{start,stop}-worker.sh` instead of the deprecated `{start,stop}-slave.sh` - Use `{start,stop}-mesos-shuffle-service.sh` instead of `{start,stop}-shuffle-service.sh` --- ansible/files/start-slave.sh | 2 +- ansible/files/start-spark.sh | 2 +- ansible/files/stop-slave.sh | 2 +- ansible/files/stop-spark.sh | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ansible/files/start-slave.sh b/ansible/files/start-slave.sh index 5845c6f..ebd9ac3 100644 --- a/ansible/files/start-slave.sh +++ b/ansible/files/start-slave.sh @@ -1,2 +1,2 @@ source spark-env -$SPARK_HOME/sbin/start-slave.sh spark://$SPARK_MASTER_HOST:7077 $SLAVESIZE \ No newline at end of file +$SPARK_HOME/sbin/start-worker.sh spark://$SPARK_MASTER_HOST:7077 $SLAVESIZE \ No newline at end of file diff --git a/ansible/files/start-spark.sh b/ansible/files/start-spark.sh index dd1cc7f..23df43a 100644 --- a/ansible/files/start-spark.sh +++ b/ansible/files/start-spark.sh @@ -15,4 +15,4 @@ cd $SPARK_HOME/sbin ./start-slave.sh spark://$SPARK_MASTER_HOST:7077 $SLAVESIZE -./start-shuffle-service.sh \ No newline at end of file +./start-mesos-shuffle-service.sh diff --git a/ansible/files/stop-slave.sh b/ansible/files/stop-slave.sh index bb15a8e..5f2d0ad 100644 --- a/ansible/files/stop-slave.sh +++ b/ansible/files/stop-slave.sh @@ -1,2 +1,2 @@ source spark-env -$SPARK_HOME/sbin/stop-slave.sh \ No newline at end of file +$SPARK_HOME/sbin/stop-worker.sh \ No newline at end of file diff --git a/ansible/files/stop-spark.sh b/ansible/files/stop-spark.sh index 99b4b9f..3d3d3a0 100644 --- a/ansible/files/stop-spark.sh +++ b/ansible/files/stop-spark.sh @@ -7,7 +7,7 @@ source spark-env cd $SPARK_HOME/sbin -./stop-shuffle-service.sh +./stop-mesos-shuffle-service.sh ./stop-slave.sh spark://$SPARK_MASTER_HOST:7077 $SLAVESIZE ./stop-history-server.sh From e01c75bc3ba1960e29cb4245b115ffee462e8694 Mon Sep 17 00:00:00 2001 From: Julien Richard-Foy Date: Thu, 22 Aug 2024 16:39:48 +0200 Subject: [PATCH 8/9] Align the scaling strategy of the Ansible-based solution with the other solutions Fixes #192 --- ansible/files/config.dynamodb.yml | 1 - ansible/files/start-spark.sh | 2 - ansible/files/stop-spark.sh | 1 - ansible/scylla-migrator.yml | 6 +-- 
ansible/templates/spark-env-master-sample | 45 ++++++++----------- ansible/templates/spark-env-worker-sample | 45 +++++-------------- ansible/templates/submit-alternator-job.sh | 7 ++- ansible/templates/submit-cql-job-validator.sh | 6 +-- ansible/templates/submit-cql-job.sh | 12 ++--- docs/source/getting-started/ansible.rst | 2 +- 10 files changed, 45 insertions(+), 82 deletions(-) diff --git a/ansible/files/config.dynamodb.yml b/ansible/files/config.dynamodb.yml index 8298c17..61339d1 100644 --- a/ansible/files/config.dynamodb.yml +++ b/ansible/files/config.dynamodb.yml @@ -53,7 +53,6 @@ savepoints: # Interval in which savepoints will be created intervalSeconds: 300 renames: [] -skipTokenRanges: [] validation: # Should WRITETIMEs and TTLs be compared? compareTimestamps: false diff --git a/ansible/files/start-spark.sh b/ansible/files/start-spark.sh index 23df43a..f7f1b1d 100644 --- a/ansible/files/start-spark.sh +++ b/ansible/files/start-spark.sh @@ -13,6 +13,4 @@ cd $SPARK_HOME/sbin ./start-history-server.sh -./start-slave.sh spark://$SPARK_MASTER_HOST:7077 $SLAVESIZE - ./start-mesos-shuffle-service.sh diff --git a/ansible/files/stop-spark.sh b/ansible/files/stop-spark.sh index 3d3d3a0..2d7436d 100644 --- a/ansible/files/stop-spark.sh +++ b/ansible/files/stop-spark.sh @@ -8,7 +8,6 @@ source spark-env cd $SPARK_HOME/sbin ./stop-mesos-shuffle-service.sh -./stop-slave.sh spark://$SPARK_MASTER_HOST:7077 $SLAVESIZE ./stop-history-server.sh diff --git a/ansible/scylla-migrator.yml b/ansible/scylla-migrator.yml index c4bc287..a5493a0 100644 --- a/ansible/scylla-migrator.yml +++ b/ansible/scylla-migrator.yml @@ -43,7 +43,7 @@ - name: Unarchive the installer. unarchive: - src: awscliv2.zip + src: "{{ home_dir }}/awscliv2.zip" dest: "{{ home_dir }}" remote_src: yes @@ -71,7 +71,7 @@ - name: Extract spark ansible.builtin.unarchive: - src: "spark-3.5.1-bin-hadoop3-scala2.13.tgz" + src: "{{ home_dir }}/spark-3.5.1-bin-hadoop3-scala2.13.tgz" dest: "{{ home_dir }}" remote_src: yes @@ -83,7 +83,7 @@ - name: Move spark to opt/spark become: true - command: "sudo mv spark-3.5.1-bin-hadoop3-scala2.13 {{ spark_home }}" + command: "sudo mv {{ home_dir }}/spark-3.5.1-bin-hadoop3-scala2.13 {{ spark_home }}" - name: Set JAVA_HOME ansible.builtin.lineinfile: diff --git a/ansible/templates/spark-env-master-sample b/ansible/templates/spark-env-master-sample index 6076749..ff31235 100644 --- a/ansible/templates/spark-env-master-sample +++ b/ansible/templates/spark-env-master-sample @@ -2,38 +2,31 @@ # Master node requires a lot of memory. We allocate 64G for the driver to avoid OOM when # there are a lot of tasks. # -# For this example, we're not using any workers on master. -# so CORES = 0 and SPARK_WORKER_INSTANCES = 0. +# EXECUTOR_MEMORY and EXECUTOR_CORES are used in the spark-submit job and controls the memory +# and CPUs per executor. Here are our recommendations to set the number of cores per executor: +# - it should be between 5 and 10 +# - the total number of cores per worker node should be a multiple of the number of cores +# per executor +# - it can not be higher than the number of cores on a worker node +# For example, if there are 16 cores on each worker node, we could set EXECUTOR_CORES=8, +# or if there are 2 cores on each worker node, we could set EXECUTOR_CORES=2. # -# MEMORY is used in the spark-submit job and allocates the memory per executor. -# You can have one or more executors per worker. -# -# By using multiple workers on an instance, we can control the velocity of the migration. 
+# By using multiple worker nodes, we can control the velocity of the migration. # # Eg. # Target system is 3 x i4i.4xlarge (16 vCPU, 128G) -# We can provision 3 x i4i.2xlarge (8vCPU, 64G) +# We can provision 2 x m7g.2xlarge (8vCPU, 32G) # -# SPARK_WORKER_INSTANCES = 1 -# CORES = 4 -# MEMORY = 8G (4 cores per worker * 2G) +# EXECUTOR_CORES=8 +# EXECUTOR_MEMORY=16G # (8 cores per worker * 2G) # -# Start 1 worker per node, or one at a time. +# Start the Spark master node and the Spark worker node, and run the migration. # - Monitor the pressure on the source system -# - Monitor the velocity of the migration in the spark jobs monito -# -# You can increase the velocity by updating spark-env on worker nodes, increase the -# SPARK_WORKER_INSTANCES = 2, run ./start-worker.sh. -# So long as there are tasks available, the new workers would pick up tasks and increase -# the velocity of the migration. -# -# You should be mindful of over-provisioning the number of workers on an instance. -# Eg. if the node has 8 CPUs, then number of workers on an instance * cores <= 8... +# - Monitor the velocity of the migration in the spark jobs monitoring UI +# - If necessary, provision more worker nodes ################################################################################################################## -export CORES=0 # 0 cores for slaves on master node -export MEMORY=8G # cores per worker x 2G -export SPARK_WORKER_INSTANCES=0 # no workers on master node -export SPARK_MASTER_HOST={{ hostvars.spark_master.ansible_facts.default_ipv4.address }} - -export SLAVESIZE="-c $CORES " \ No newline at end of file +export SPARK_MASTER_HOST={{ hostvars.spark_master.ansible_default_ipv4.address }} +export EXECUTOR_CORES=4 +# By default, allocate 2GB of memory per core +export EXECUTOR_MEMORY="$((EXECUTOR_CORES * 2))G" diff --git a/ansible/templates/spark-env-worker-sample b/ansible/templates/spark-env-worker-sample index 7282668..896308b 100644 --- a/ansible/templates/spark-env-worker-sample +++ b/ansible/templates/spark-env-worker-sample @@ -1,38 +1,13 @@ ################################################################################################################## -# Master node requires a lot of memory. We allocate 64G for the driver to avoid OOM when -# there are a lot of tasks. -# -# For this example, we're not using any workers on master. -# so CORES = 0 and SPARK_WORKER_INSTANCES = 0. -# -# MEMORY is used in the spark-submit job and allocates the memory per executor. -# You can have one or more executors per worker. -# -# By using multiple workers on an instance, we can control the velocity of the migration. -# -# Eg. -# Target system is 3 x i4i.4xlarge (16 vCPU, 128G) -# We can provision 3 x i4i.2xlarge (8vCPU, 64G) -# -# SPARK_WORKER_INSTANCES = 1 -# CORES = 4 -# MEMORY = 8G (4 cores per worker * 2G) -# -# Start 1 worker per node, or one at a time. -# - Monitor the pressure on the source system -# - Monitor the velocity of the migration in the spark jobs monito -# -# You can increase the velocity by updating spark-env on worker nodes, increase the -# SPARK_WORKER_INSTANCES = 2, run ./start-worker.sh. -# So long as there are tasks available, the new workers would pick up tasks and increase -# the velocity of the migration. -# -# You should be mindful of over-provisioning the number of workers on an instance. -# Eg. if the node has 8 CPUs, then number of workers on an instance * cores <= 8... +# SLAVESIZE is used by the start-slave.sh script. 
By default, a Spark worker uses all the +# resources of the machine it is started on. If you want to decrease the migration velocity, +# you can set the maximum number of cores and memory that can be used by the worker. ################################################################################################################## -export SPARK_MASTER_HOST={{ hostvars.spark_master.ansible_facts.default_ipv4.address }} -export CORES=4 # number of cores per worker -export SPARK_WORKER_INSTANCES=4 # this is how many workers will be - # started/stopped on the node. -export SLAVESIZE="-c $CORES" \ No newline at end of file +export SPARK_MASTER_HOST={{ hostvars.spark_master.ansible_default_ipv4.address }} + +export SLAVESIZE="" +# Optionally, limit the resources available to the Spark worker by uncommenting the following lines +#export MAX_CORES=2 # max number of cores to use on the machine +#export MAX_MEMORY=4G # max amount of memory to use on the machine +#export SLAVESIZE="--cores $MAX_CORES --memory $MAX_MEMORY" diff --git a/ansible/templates/submit-alternator-job.sh b/ansible/templates/submit-alternator-job.sh index 342ee5e..7acbdb0 100644 --- a/ansible/templates/submit-alternator-job.sh +++ b/ansible/templates/submit-alternator-job.sh @@ -6,12 +6,11 @@ source spark-env mkdir /tmp/savepoints -# - time spark-submit --class com.scylladb.migrator.Migrator \ ---master spark://{{ hostvars.spark_master.ansible_facts.default_ipv4.address }}:7077 \ +--master spark://{{ hostvars.spark_master.ansible_default_ipv4.address }}:7077 \ --conf spark.eventLog.enabled=true \ --conf spark.scylla.config=/home/ubuntu/scylla-migrator/config.dynamodb.yml \ ---conf spark.executor.memory=$MEMORY \ +--executor-memory $EXECUTOR_MEMORY \ +--executor-cores $EXECUTOR_CORES \ --conf spark.driver.memory=64G \ /home/ubuntu/scylla-migrator/scylla-migrator-assembly.jar \ No newline at end of file diff --git a/ansible/templates/submit-cql-job-validator.sh b/ansible/templates/submit-cql-job-validator.sh index cc3f9b5..048a940 100644 --- a/ansible/templates/submit-cql-job-validator.sh +++ b/ansible/templates/submit-cql-job-validator.sh @@ -7,12 +7,12 @@ source spark-env mkdir /tmp/savepoints time spark-submit --class com.scylladb.migrator.Validator \ - --master spark://{{ hostvars.spark_master.ansible_facts.default_ipv4.address }}:7077 \ + --master spark://{{ hostvars.spark_master.ansible_default_ipv4.address }}:7077 \ --conf spark.eventLog.enabled=true \ --conf spark.scylla.config=config.yaml \ --conf spark.cassandra.input.consistency.level=LOCAL_QUORUM \ --conf spark.cassandra.output.consistency.level=LOCAL_QUORUM \ - --num-executors $SPARK_WORKER_INSTANCES \ - --executor-memory $MEMORY \ + --executor-memory $EXECUTOR_MEMORY \ + --executor-cores $EXECUTOR_CORES \ --conf spark.cassandra.connection.localConnectionsPerExecutor=4 \ /home/ubuntu/scylla-migrator/scylla-migrator-assembly.jar \ No newline at end of file diff --git a/ansible/templates/submit-cql-job.sh b/ansible/templates/submit-cql-job.sh index 7d2e42a..e9f6143 100644 --- a/ansible/templates/submit-cql-job.sh +++ b/ansible/templates/submit-cql-job.sh @@ -7,13 +7,13 @@ source spark-env mkdir /tmp/savepoints time spark-submit --class com.scylladb.migrator.Migrator \ - --master spark://{{ hostvars.spark_master.ansible_facts.default_ipv4.address }}:7077 \ + --master spark://{{ hostvars.spark_master.ansible_default_ipv4.address }}:7077 \ --conf spark.eventLog.enabled=true \ --conf spark.scylla.config=config.yaml \ --conf 
spark.cassandra.input.consistency.level=LOCAL_QUORUM \ --conf spark.cassandra.output.consistency.level=LOCAL_QUORUM \ - --num-executors $SPARK_WORKER_INSTANCES \ - --executor-memory $MEMORY \ + --executor-memory $EXECUTOR_MEMORY \ + --executor-cores $EXECUTOR_CORES \ --conf spark.cassandra.connection.localConnectionsPerExecutor=4 \ /home/ubuntu/scylla-migrator/scylla-migrator-assembly.jar @@ -24,12 +24,12 @@ time spark-submit --class com.scylladb.migrator.Migrator \ # debug example #$SPARK_HOME/spark-submit --class com.scylladb.migrator.Migrator \ # --driver-java-options -agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=65005 \ -# --master spark://{{ hostvars.spark_master.ansible_facts.default_ipv4.address }}:7077 \ +# --master spark://{{ hostvars.spark_master.ansible_default_ipv4.address }}:7077 \ # --conf spark.scylla.config=config.yaml \ # --conf spark.cassandra.input.consistency.level=LOCAL_QUORUM \ # --conf spark.cassandra.output.consistency.level=LOCAL_QUORUM \ -# --num-executors 1 \ -# --executor-memory $MEMORY \ +# --executor-memory $EXECUTOR_MEMORY \ +# --executor-cores $EXECUTOR_CORES \ # --conf "spark.executor.extraJavaOptions=-agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=64000 -XX:+HeapDumpOnOutOfMemoryError" \ # --conf spark.cassandra.connection.localConnectionsPerExecutor=4 \ # /home/ubuntu/scylla-migrator/scylla-migrator-assembly.jar diff --git a/docs/source/getting-started/ansible.rst b/docs/source/getting-started/ansible.rst index 7638a77..c3d47ae 100644 --- a/docs/source/getting-started/ansible.rst +++ b/docs/source/getting-started/ansible.rst @@ -4,7 +4,7 @@ Set Up a Spark Cluster with Ansible An `Ansible `_ playbook is provided in the `ansible folder `_ of our Git repository. The Ansible playbook will install the pre-requisites, Spark, on the master and workers added to the ``ansible/inventory/hosts`` file. Scylla-migrator will be installed on the spark master node. -The Ansible playbook expects to be run in an Ubuntu environment where the directory ``/home/ubuntu`` already exists. +The Ansible playbook expects to be run in an Ubuntu environment, by a user named ``ubuntu`` (like you get in AWS EC2 Ubuntu-based images). 1. Clone the Migrator Git repository: From 84f8e9b75a511eb394e9559428d1872997da58e5 Mon Sep 17 00:00:00 2001 From: Julien Richard-Foy Date: Fri, 23 Aug 2024 09:20:20 +0200 Subject: [PATCH 9/9] Use --driver-memory instead of --conf spark.driver.memory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit According to Spark documentation, the configuration property `spark.driver.memory` has no effect in our case (we use the “client” deploy-mode): > In client mode, this config must not be set through the SparkConf directly in your application, because the driver JVM has already started at that point. Instead, please set this through the --driver-memory command line option or in your default properties file. 
https://spark.apache.org/docs/latest/configuration.html --- ansible/templates/submit-alternator-job.sh | 2 +- ansible/templates/submit-cql-job.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ansible/templates/submit-alternator-job.sh b/ansible/templates/submit-alternator-job.sh index 7acbdb0..f9d554b 100644 --- a/ansible/templates/submit-alternator-job.sh +++ b/ansible/templates/submit-alternator-job.sh @@ -12,5 +12,5 @@ time spark-submit --class com.scylladb.migrator.Migrator \ --conf spark.scylla.config=/home/ubuntu/scylla-migrator/config.dynamodb.yml \ --executor-memory $EXECUTOR_MEMORY \ --executor-cores $EXECUTOR_CORES \ ---conf spark.driver.memory=64G \ +--driver-memory 4G \ /home/ubuntu/scylla-migrator/scylla-migrator-assembly.jar \ No newline at end of file diff --git a/ansible/templates/submit-cql-job.sh b/ansible/templates/submit-cql-job.sh index e9f6143..f167df7 100644 --- a/ansible/templates/submit-cql-job.sh +++ b/ansible/templates/submit-cql-job.sh @@ -19,7 +19,7 @@ time spark-submit --class com.scylladb.migrator.Migrator \ #sometimes you will need a tuning for driver memory size #add this config to above to tune it: -# --conf spark.driver.memory=4G \ +# --driver-memory 4G \ # debug example #$SPARK_HOME/spark-submit --class com.scylladb.migrator.Migrator \