diff --git a/.github/workflows/feature_test.yml b/.github/workflows/feature_test.yml
index 9433041f85..001f4b9faf 100644
--- a/.github/workflows/feature_test.yml
+++ b/.github/workflows/feature_test.yml
@@ -63,6 +63,30 @@ jobs:
             - name: Run Pytest
               run: uv run pytest --cov-report= --cov=cumulusci
 
+    unit_tests_opt_deps:
+        name: "Unit tests with optional dependencies: ${{ matrix.os }}-${{ matrix.python-version }}"
+        runs-on: ${{ matrix.os }}
+        strategy:
+            fail-fast: false  # let every os/python combination finish even if one fails
+            matrix:
+                os: [macos-latest, SFDO-Tooling-Ubuntu, SFDO-Tooling-Windows]
+                python-version: ["3.11", "3.12", "3.13"]
+        steps:
+            - uses: actions/checkout@v4
+            - name: Set up Python
+              uses: actions/setup-python@v5  # v4 targets the deprecated Node 16 action runtime
+              with:
+                  python-version: "${{ matrix.python-version }}"
+            - name: Set up uv
+              uses: SFDO-Tooling/setup-uv@main  # NOTE(review): floating ref; consider pinning a tag or SHA
+              with:
+                  version: "0.5.0"
+                  enable-cache: true
+            - name: Install dependencies  # --all-extras installs every optional extra
+              run: uv sync --all-extras -p ${{ matrix.python-version }}
+            - name: Run Pytest
+              run: uv run pytest --cov-report= --cov=cumulusci
+
     robot_api:
         name: "Robot: No browser"
         runs-on: SFDO-Tooling-Ubuntu
diff --git a/.readthedocs.yml b/.readthedocs.yml
index 4f0f038758..fa07e9b997 100644
--- a/.readthedocs.yml
+++ b/.readthedocs.yml
@@ -14,7 +14,9 @@ build:
         - asdf plugin add uv
         - asdf install uv latest
         - asdf global uv latest
-        - uv sync --only-group docs --frozen
+        - uv sync --group docs --frozen
+        - uv run cci task doc --write
+        - uv run cci flow doc > docs/flows.rst
         - uv run -m sphinx -T -b html -d docs/_build/doctrees -D language=en docs $READTHEDOCS_OUTPUT/html
 
 # Build documentation in the docs/ directory with Sphinx
diff --git a/AUTHORS.rst b/AUTHORS.rst
index 6801fd7b6d..848008c153 100644
--- a/AUTHORS.rst
+++ b/AUTHORS.rst
@@ -38,3 +38,4 @@ For example:
 * Gustavo Tandeciarz (dcinzona)
 * Chandler Anderson (zenibako)
 * Ben French (BenjaminFrench)
+* Rupert Barrow (rupertbarrow)
diff --git a/cumulusci/__about__.py b/cumulusci/__about__.py
index 76ad18b89a..0fd7811c0d 100644
--- a/cumulusci/__about__.py
+++ b/cumulusci/__about__.py
@@ -1 +1 @@
-__version__ = "4.0.1"
+__version__ = "4.2.0"
diff --git a/cumulusci/tasks/bulkdata/mapping_parser.py b/cumulusci/tasks/bulkdata/mapping_parser.py
index 59c7d630a2..28b71b1f34 100644
--- a/cumulusci/tasks/bulkdata/mapping_parser.py
+++ b/cumulusci/tasks/bulkdata/mapping_parser.py
@@ -338,7 +338,10 @@ def _get_required_permission_types(
         self, operation: DataOperationType
     ) -> T.Tuple[str]:
         """Return a tuple of the permission types required to execute an operation"""
-        if operation is DataOperationType.QUERY:
+        if (
+            operation is DataOperationType.QUERY
+            or self.action is DataOperationType.SELECT
+        ):
             return ("queryable",)
         if (
             operation is DataOperationType.INSERT
diff --git a/cumulusci/tasks/bulkdata/select_utils.py b/cumulusci/tasks/bulkdata/select_utils.py
index 2d2728dadb..7835d8dea8 100644
--- a/cumulusci/tasks/bulkdata/select_utils.py
+++ b/cumulusci/tasks/bulkdata/select_utils.py
@@ -1,22 +1,34 @@
+import logging
 import random
 import re
 import typing as T
 from enum import Enum
 
-import numpy as np
-import pandas as pd
-from annoy import AnnoyIndex
 from pydantic import Field, root_validator, validator
-from sklearn.feature_extraction.text import HashingVectorizer
-from sklearn.preprocessing import StandardScaler
 
 from cumulusci.core.enums import StrEnum
-from cumulusci.tasks.bulkdata.extract_dataset_utils.hardcoded_default_declarations import (
-    DEFAULT_DECLARATIONS,
-)
 from cumulusci.tasks.bulkdata.utils import CaseInsensitiveDict
+from cumulusci.utils import get_cci_upgrade_command
 from cumulusci.utils.yaml.model_parser import CCIDictModel
 
+logger = logging.getLogger(__name__)
+try:
+    import numpy as np
+    import pandas as pd
+    from annoy import AnnoyIndex
+    from sklearn.feature_extraction.text import HashingVectorizer
+    from sklearn.preprocessing import StandardScaler
+
+    OPTIONAL_DEPENDENCIES_AVAILABLE = True
+except ImportError:
+    logger.warning(
+        f"Optional dependencies are missing. "
+        "Handling high volumes of records for the 'select' functionality will be significantly slower, "
+        "as optimizations for this feature are currently disabled. "
+        f"To enable optimized performance, install all required dependencies using: {get_cci_upgrade_command()}[select]\n"
+    )
+    OPTIONAL_DEPENDENCIES_AVAILABLE = False
+
 
 class SelectStrategy(StrEnum):
     """Enum defining the different selection strategies requested."""
@@ -173,10 +185,6 @@ def standard_generate_query(
             filter_clause=user_filter, limit_clause=limit, offset_clause=offset
         )
     else:
-        # Get the WHERE clause from DEFAULT_DECLARATIONS if available
-        declaration = DEFAULT_DECLARATIONS.get(sobject)
-        if declaration:
-            query += f" WHERE {declaration.where}"
         query += f" LIMIT {limit}" if limit else ""
         query += f" OFFSET {offset}" if offset else ""
     return query, ["Id"]
@@ -266,10 +274,6 @@ def similarity_generate_query(
             filter_clause=user_filter, limit_clause=limit, offset_clause=offset
         )
     else:
-        # Get the WHERE clause from DEFAULT_DECLARATIONS if available
-        declaration = DEFAULT_DECLARATIONS.get(sobject)
-        if declaration:
-            query += f" WHERE {declaration.where}"
         query += f" LIMIT {limit}" if limit else ""
         query += f" OFFSET {offset}" if offset else ""
 
@@ -292,7 +296,7 @@ def similarity_post_process(
 ]:
     """Processes the query results for the similarity selection strategy"""
     # Handle case where query returns 0 records
-    if not query_records and not threshold:
+    if not query_records and threshold is None:
         error_message = f"No records found for {sobject} in the target org."
         return [], [], error_message
 
@@ -308,7 +312,7 @@ def similarity_post_process(
     select_records = []
     insert_records = []
 
-    if complexity_constant < 1000:
+    if complexity_constant < 1000 or not OPTIONAL_DEPENDENCIES_AVAILABLE:
         select_records, insert_records = levenshtein_post_process(
             load_records, query_records, fields, weights, threshold
         )
@@ -328,6 +332,12 @@ def annoy_post_process(
     threshold: T.Union[float, None],
 ) -> T.Tuple[T.List[dict], list]:
     """Processes the query results for the similarity selection strategy using Annoy algorithm for large number of records"""
+    # Add warning when threshold is 0
+    if threshold is not None and threshold == 0:
+        logger.warning(
+            "Warning: A threshold of 0 may miss exact matches in high volumes. Use a small value like 0.1 for better accuracy."
+        )
+
     selected_records = []
     insertion_candidates = []
 
@@ -397,7 +407,7 @@ def annoy_post_process(
             # Retrieve the corresponding record from the database
             record = query_record_data[neighbor_index]
             closest_record_id = record_to_id_map[tuple(record)]
-            if threshold and (neighbor_distances[idx] >= threshold):
+            if threshold is not None and (neighbor_distances[idx] >= threshold):
                 selected_records.append(None)
                 insertion_candidates.append(load_shaped_records[i])
             else:
@@ -445,7 +455,7 @@ def levenshtein_post_process(
             select_record, target_records, similarity_weights
         )
 
-        if distance_threshold and match_distance > distance_threshold:
+        if distance_threshold is not None and match_distance > distance_threshold:
             # Append load record for insertion if distance exceeds threshold
             insertion_candidates.append(load_record)
             selected_records.append(None)
diff --git a/cumulusci/tasks/bulkdata/snowfakery.py b/cumulusci/tasks/bulkdata/snowfakery.py
index 1125714ca0..bef4e888cf 100644
--- a/cumulusci/tasks/bulkdata/snowfakery.py
+++ b/cumulusci/tasks/bulkdata/snowfakery.py
@@ -583,8 +583,10 @@ def _generate_and_load_initial_batch(self, working_directory: Path):
             self.sets_finished_while_generating_template = num_records
 
         new_template_dir = data_loader_new_directory_name(template_dir, self.run_until)
-        shutil.move(template_dir, new_template_dir)
-        template_dir = new_template_dir
+        # rename only if new_template_dir does not match template_dir
+        if template_dir.resolve() != new_template_dir.resolve():
+            shutil.move(template_dir, new_template_dir)
+            template_dir = new_template_dir
 
         # don't send data tables to child processes. All they
         # care about are ID->OID mappings
diff --git a/cumulusci/tasks/bulkdata/step.py b/cumulusci/tasks/bulkdata/step.py
index 9dbbe40cd7..4ae6c50cca 100644
--- a/cumulusci/tasks/bulkdata/step.py
+++ b/cumulusci/tasks/bulkdata/step.py
@@ -9,6 +9,7 @@
 from contextlib import contextmanager
 from itertools import tee
 from typing import Any, Dict, List, NamedTuple, Optional, Union
+from urllib.parse import quote
 
 import requests
 import salesforce_bulk
@@ -955,9 +956,7 @@ def _determine_limit_clause(self, total_num_records):
     def _execute_soql_query(self, select_query, query_fields):
         """Executes the SOQL query and returns the flattened records."""
         query_records = []
-        response = self.sf.restful(
-            requests.utils.requote_uri(f"query/?q={select_query}"), method="GET"
-        )
+        response = self.sf.restful(f"query/?q={quote(select_query)}", method="GET")
         query_records.extend(self._flatten_response_records(response, query_fields))
 
         while not response["done"]:
diff --git a/cumulusci/tasks/bulkdata/tests/cassettes/TestSelect.test_select_random_strategy.yaml b/cumulusci/tasks/bulkdata/tests/cassettes/TestSelect.test_select_random_strategy.yaml
index 508be49cb4..1f49e2cec0 100644
--- a/cumulusci/tasks/bulkdata/tests/cassettes/TestSelect.test_select_random_strategy.yaml
+++ b/cumulusci/tasks/bulkdata/tests/cassettes/TestSelect.test_select_random_strategy.yaml
@@ -48,7 +48,7 @@ interactions:
              
     - request:
           method: GET
-          uri: https://orgname.my.salesforce.com/services/data/v62.0/query/?q=SELECT%20Id,%20Name,%20Description,%20Phone,%20AccountNumber%20FROM%20Account%20WHERE%20Name%20!=%20'Sample%20Account%20for%20Entitlements'
+          uri: https://orgname.my.salesforce.com/services/data/v62.0/query/?q=SELECT%20Id,%20Name,%20Description,%20Phone,%20AccountNumber%20FROM%20Account
           body: null
           headers: *id004
       response:
@@ -125,7 +125,7 @@ interactions:
 
     - request:
           method: GET
-          uri: https://orgname.my.salesforce.com/services/data/v62.0/query/?q=SELECT%20Id%20FROM%20Account%20WHERE%20Name%20!=%20'Sample%20Account%20for%20Entitlements'%20LIMIT%205
+          uri: https://orgname.my.salesforce.com/services/data/v62.0/query/?q=SELECT%20Id%20FROM%20Account%20LIMIT%205
           body: null
           headers: *id004
       response:
diff --git a/cumulusci/tasks/bulkdata/tests/cassettes/TestSelect.test_select_similarity_select_and_insert_strategy.yaml b/cumulusci/tasks/bulkdata/tests/cassettes/TestSelect.test_select_similarity_select_and_insert_strategy.yaml
index 4bebf958e1..8d25b5bdaf 100644
--- a/cumulusci/tasks/bulkdata/tests/cassettes/TestSelect.test_select_similarity_select_and_insert_strategy.yaml
+++ b/cumulusci/tasks/bulkdata/tests/cassettes/TestSelect.test_select_similarity_select_and_insert_strategy.yaml
@@ -225,7 +225,7 @@ interactions:
     
     - request:
           method: GET
-          uri: https://orgname.my.salesforce.com/services/data/v62.0/query/?q=SELECT%20Id,%20TYPEOF%20Who%20WHEN%20Contact%20THEN%20LastName,%20Email%20WHEN%20Lead%20THEN%20LastName,%20Company%20ELSE%20Id%20END,%20TYPEOF%20What%20WHEN%20Account%20THEN%20Name,%20Description,%20Phone,%20AccountNumber%20ELSE%20Id%20END,%20Subject,%20DurationInMinutes,%20ActivityDateTime%20FROM%20Event
+          uri: https://orgname.my.salesforce.com/services/data/v62.0/query/?q=SELECT%20Id%2C%20TYPEOF%20Who%20WHEN%20Contact%20THEN%20LastName%2C%20Email%20WHEN%20Lead%20THEN%20LastName%2C%20Company%20ELSE%20Id%20END%2C%20TYPEOF%20What%20WHEN%20Account%20THEN%20Name%2C%20Description%2C%20Phone%2C%20AccountNumber%20ELSE%20Id%20END%2C%20Subject%2C%20DurationInMinutes%2C%20ActivityDateTime%20FROM%20Event
           body: null
           headers: *id004
       response:
diff --git a/cumulusci/tasks/bulkdata/tests/cassettes/TestSelect.test_select_similarity_select_and_insert_strategy_bulk.yaml b/cumulusci/tasks/bulkdata/tests/cassettes/TestSelect.test_select_similarity_select_and_insert_strategy_bulk.yaml
index 92ff0a2061..6e12dfa2b7 100644
--- a/cumulusci/tasks/bulkdata/tests/cassettes/TestSelect.test_select_similarity_select_and_insert_strategy_bulk.yaml
+++ b/cumulusci/tasks/bulkdata/tests/cassettes/TestSelect.test_select_similarity_select_and_insert_strategy_bulk.yaml
@@ -225,7 +225,7 @@ interactions:
     
     - request:
           method: GET
-          uri: https://orgname.my.salesforce.com/services/data/v62.0/query/?q=SELECT%20Id,%20TYPEOF%20Who%20WHEN%20Contact%20THEN%20LastName,%20Email%20WHEN%20Lead%20THEN%20LastName,%20Company%20ELSE%20Id%20END,%20TYPEOF%20What%20WHEN%20Account%20THEN%20Name,%20Description,%20Phone,%20AccountNumber%20ELSE%20Id%20END,%20Subject,%20DurationInMinutes,%20ActivityDateTime%20FROM%20Event
+          uri: https://orgname.my.salesforce.com/services/data/v62.0/query/?q=SELECT%20Id%2C%20TYPEOF%20Who%20WHEN%20Contact%20THEN%20LastName%2C%20Email%20WHEN%20Lead%20THEN%20LastName%2C%20Company%20ELSE%20Id%20END%2C%20TYPEOF%20What%20WHEN%20Account%20THEN%20Name%2C%20Description%2C%20Phone%2C%20AccountNumber%20ELSE%20Id%20END%2C%20Subject%2C%20DurationInMinutes%2C%20ActivityDateTime%20FROM%20Event
           body: null
           headers: *id004
       response:
diff --git a/cumulusci/tasks/bulkdata/tests/cassettes/TestSelect.test_select_similarity_strategy.yaml b/cumulusci/tasks/bulkdata/tests/cassettes/TestSelect.test_select_similarity_strategy.yaml
index 31897e7650..16844b707e 100644
--- a/cumulusci/tasks/bulkdata/tests/cassettes/TestSelect.test_select_similarity_strategy.yaml
+++ b/cumulusci/tasks/bulkdata/tests/cassettes/TestSelect.test_select_similarity_strategy.yaml
@@ -48,7 +48,7 @@ interactions:
              
     - request:
           method: GET
-          uri: https://orgname.my.salesforce.com/services/data/v62.0/query/?q=SELECT%20Id,%20Name,%20Description,%20Phone,%20AccountNumber%20FROM%20Account%20WHERE%20Name%20!=%20'Sample%20Account%20for%20Entitlements'
+          uri: https://orgname.my.salesforce.com/services/data/v62.0/query/?q=SELECT%20Id%2C%20Name%2C%20Description%2C%20Phone%2C%20AccountNumber%20FROM%20Account
           body: null
           headers: *id004
       response:
diff --git a/cumulusci/tasks/bulkdata/tests/cassettes/TestSelect.test_select_standard_strategy.yaml b/cumulusci/tasks/bulkdata/tests/cassettes/TestSelect.test_select_standard_strategy.yaml
index 508be49cb4..1f49e2cec0 100644
--- a/cumulusci/tasks/bulkdata/tests/cassettes/TestSelect.test_select_standard_strategy.yaml
+++ b/cumulusci/tasks/bulkdata/tests/cassettes/TestSelect.test_select_standard_strategy.yaml
@@ -48,7 +48,7 @@ interactions:
              
     - request:
           method: GET
-          uri: https://orgname.my.salesforce.com/services/data/v62.0/query/?q=SELECT%20Id,%20Name,%20Description,%20Phone,%20AccountNumber%20FROM%20Account%20WHERE%20Name%20!=%20'Sample%20Account%20for%20Entitlements'
+          uri: https://orgname.my.salesforce.com/services/data/v62.0/query/?q=SELECT%20Id,%20Name,%20Description,%20Phone,%20AccountNumber%20FROM%20Account
           body: null
           headers: *id004
       response:
@@ -125,7 +125,7 @@ interactions:
 
     - request:
           method: GET
-          uri: https://orgname.my.salesforce.com/services/data/v62.0/query/?q=SELECT%20Id%20FROM%20Account%20WHERE%20Name%20!=%20'Sample%20Account%20for%20Entitlements'%20LIMIT%205
+          uri: https://orgname.my.salesforce.com/services/data/v62.0/query/?q=SELECT%20Id%20FROM%20Account%20LIMIT%205
           body: null
           headers: *id004
       response:
diff --git a/cumulusci/tasks/bulkdata/tests/test_select_utils.py b/cumulusci/tasks/bulkdata/tests/test_select_utils.py
index fb77abcf9b..dbd2a993ca 100644
--- a/cumulusci/tasks/bulkdata/tests/test_select_utils.py
+++ b/cumulusci/tasks/bulkdata/tests/test_select_utils.py
@@ -1,7 +1,7 @@
-import pandas as pd
 import pytest
 
 from cumulusci.tasks.bulkdata.select_utils import (
+    OPTIONAL_DEPENDENCIES_AVAILABLE,
     SelectOperationExecutor,
     SelectStrategy,
     add_limit_offset_to_user_filter,
@@ -15,24 +15,16 @@
     vectorize_records,
 )
 
+# Check for pandas availability
+try:
+    import pandas as pd
 
-# Test Cases for standard_generate_query
-def test_standard_generate_query_with_default_record_declaration():
-    select_operator = SelectOperationExecutor(SelectStrategy.STANDARD)
-    sobject = "Account"  # Assuming Account has a declaration in DEFAULT_DECLARATIONS
-    limit = 5
-    offset = 2
-    query, fields = select_operator.select_generate_query(
-        sobject=sobject, fields=[], user_filter="", limit=limit, offset=offset
-    )
-
-    assert "WHERE" in query  # Ensure WHERE clause is included
-    assert f"LIMIT {limit}" in query
-    assert f"OFFSET {offset}" in query
-    assert fields == ["Id"]
+    PANDAS_AVAILABLE = True
+except ImportError:
+    PANDAS_AVAILABLE = False
 
 
-def test_standard_generate_query_without_default_record_declaration():
+def test_standard_generate_query_without_filter():
     select_operator = SelectOperationExecutor(SelectStrategy.STANDARD)
     sobject = "Contact"  # Assuming no declaration for this object
     limit = 3
@@ -41,7 +33,6 @@ def test_standard_generate_query_without_default_record_declaration():
         sobject=sobject, fields=[], user_filter="", limit=limit, offset=offset
     )
 
-    assert "WHERE" not in query  # No WHERE clause should be present
     assert f"LIMIT {limit}" in query
     assert "OFFSET" not in query
     assert fields == ["Id"]
@@ -64,23 +55,7 @@ def test_standard_generate_query_with_user_filter():
     assert fields == ["Id"]
 
 
-# Test Cases for random generate query
-def test_random_generate_query_with_default_record_declaration():
-    select_operator = SelectOperationExecutor(SelectStrategy.RANDOM)
-    sobject = "Account"  # Assuming Account has a declaration in DEFAULT_DECLARATIONS
-    limit = 5
-    offset = 2
-    query, fields = select_operator.select_generate_query(
-        sobject=sobject, fields=[], user_filter="", limit=limit, offset=offset
-    )
-
-    assert "WHERE" in query  # Ensure WHERE clause is included
-    assert f"LIMIT {limit}" in query
-    assert f"OFFSET {offset}" in query
-    assert fields == ["Id"]
-
-
-def test_random_generate_query_without_default_record_declaration():
+def test_random_generate_query():
     select_operator = SelectOperationExecutor(SelectStrategy.RANDOM)
     sobject = "Contact"  # Assuming no declaration for this object
     limit = 3
@@ -89,7 +64,6 @@ def test_random_generate_query_without_default_record_declaration():
         sobject=sobject, fields=[], user_filter="", limit=limit, offset=offset
     )
 
-    assert "WHERE" not in query  # No WHERE clause should be present
     assert f"LIMIT {limit}" in query
     assert "OFFSET" not in query
     assert fields == ["Id"]
@@ -201,23 +175,7 @@ def test_random_post_process_with_no_records():
     assert error_message == f"No records found for {sobject} in the target org."
 
 
-# Test Cases for Similarity Generate Query
-def test_similarity_generate_query_with_default_record_declaration():
-    select_operator = SelectOperationExecutor(SelectStrategy.SIMILARITY)
-    sobject = "Account"  # Assuming Account has a declaration in DEFAULT_DECLARATIONS
-    limit = 5
-    offset = 2
-    query, fields = select_operator.select_generate_query(
-        sobject, ["Name"], [], limit, offset
-    )
-
-    assert "WHERE" in query  # Ensure WHERE clause is included
-    assert fields == ["Id", "Name"]
-    assert f"LIMIT {limit}" in query
-    assert f"OFFSET {offset}" in query
-
-
-def test_similarity_generate_query_without_default_record_declaration():
+def test_similarity_generate_query_no_nesting():
     select_operator = SelectOperationExecutor(SelectStrategy.SIMILARITY)
     sobject = "Contact"  # Assuming no declaration for this object
     limit = 3
@@ -226,7 +184,6 @@ def test_similarity_generate_query_without_default_record_declaration():
         sobject, ["Name"], [], limit, offset
     )
 
-    assert "WHERE" not in query  # No WHERE clause should be present
     assert fields == ["Id", "Name"]
     assert f"LIMIT {limit}" in query
     assert "OFFSET" not in query
@@ -403,6 +360,33 @@ def test_similarity_post_process_with_no_records():
     assert error_message == f"No records found for {sobject} in the target org."
 
 
+def test_similarity_post_process_with_no_records__zero_threshold():
+    select_operator = SelectOperationExecutor(SelectStrategy.SIMILARITY)
+    load_records = [["Aditya", "Salesforce"], ["Jawad", "Salesforce"]]
+    query_records = []
+    num_records = 2
+    sobject = "Lead"
+    (
+        selected_records,
+        insert_records,
+        error_message,
+    ) = select_operator.select_post_process(
+        load_records=load_records,
+        query_records=query_records,
+        num_records=num_records,
+        sobject=sobject,
+        weights=[1, 1, 1],
+        fields=["LastName", "Company"],
+        threshold=0,
+    )
+
+    # Assert that it inserts everything
+    assert selected_records == [None, None]
+    assert insert_records[0] == ["Aditya", "Salesforce"]
+    assert insert_records[1] == ["Jawad", "Salesforce"]
+    assert error_message is None
+
+
 def test_calculate_levenshtein_distance_basic():
     record1 = ["hello", "world"]
     record2 = ["hullo", "word"]
@@ -484,9 +468,13 @@ def test_calculate_levenshtein_distance_weights_length_doesnt_match():
     assert "Records must be same size as fields (weights)." in str(e.value)
 
 
+@pytest.mark.skipif(
+    not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
+    reason="requires optional dependencies for annoy",
+)
 def test_all_numeric_columns():
-    df_db = pd.DataFrame({"A": [1, 2, 3], "B": [4.5, 5.5, 6.5]})
-    df_query = pd.DataFrame({"A": [4, 5, ""], "B": [4.5, 5.5, 6.5]})
+    df_db = pd.DataFrame({"A": ["1", "2", "3"], "B": ["4.5", " 5.5", "6.5"]})
+    df_query = pd.DataFrame({"A": ["4", "5", ""], "B": ["4.5", "5.5", "6.5"]})
     weights = [0.1, 0.2]
     expected_output = (
         ["A", "B"],  # numerical_features
@@ -499,21 +487,29 @@ def test_all_numeric_columns():
     assert determine_field_types(df_db, df_query, weights) == expected_output
 
 
+@pytest.mark.skipif(
+    not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
+    reason="requires optional dependencies for annoy",
+)
 def test_numeric_columns__one_non_numeric():
-    df_db = pd.DataFrame({"A": [1, 2, 3], "B": [4.5, 5.5, 6.5]})
-    df_query = pd.DataFrame({"A": [4, 5, 6], "B": ["abcd", 5.5, 6.5]})
+    df_db = pd.DataFrame({"A": ["1", "2", "3"], "B": ["4.5", "5.5", "6.5"]})
+    df_query = pd.DataFrame({"A": ["4", "5", "6"], "B": ["abcd", "5.5", "6.5"]})
     weights = [0.1, 0.2]
     expected_output = (
         ["A"],  # numerical_features
         [],  # boolean_features
-        [],  # categorical_features
+        ["B"],  # categorical_features
         [0.1],  # numerical_weights
         [],  # boolean_weights
-        [],  # categorical_weights
+        [0.2],  # categorical_weights
     )
     assert determine_field_types(df_db, df_query, weights) == expected_output
 
 
+@pytest.mark.skipif(
+    not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
+    reason="requires optional dependencies for annoy",
+)
 def test_all_boolean_columns():
     df_db = pd.DataFrame(
         {"A": ["true", "false", "true"], "B": ["false", "true", "false"]}
@@ -533,6 +529,10 @@ def test_all_boolean_columns():
     assert determine_field_types(df_db, df_query, weights) == expected_output
 
 
+@pytest.mark.skipif(
+    not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
+    reason="requires optional dependencies for annoy",
+)
 def test_all_categorical_columns():
     df_db = pd.DataFrame(
         {"A": ["apple", "banana", "cherry"], "B": ["dog", "cat", "mouse"]}
@@ -552,19 +552,23 @@ def test_all_categorical_columns():
     assert determine_field_types(df_db, df_query, weights) == expected_output
 
 
+@pytest.mark.skipif(
+    not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
+    reason="requires optional dependencies for annoy",
+)
 def test_mixed_types():
     df_db = pd.DataFrame(
         {
-            "A": [1, 2, 3],
+            "A": ["1", "2", "3"],
             "B": ["true", "false", "true"],
             "C": ["apple", "banana", "cherry"],
         }
     )
     df_query = pd.DataFrame(
         {
-            "A": [1, 3, ""],
+            "A": ["1", "3", ""],
             "B": ["true", "true", "true"],
-            "C": ["apple", "", 3],
+            "C": ["apple", "", "3"],
         }
     )
     weights = [0.7, 0.8, 0.9]
@@ -579,6 +583,10 @@ def test_mixed_types():
     assert determine_field_types(df_db, df_query, weights) == expected_output
 
 
+@pytest.mark.skipif(
+    not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
+    reason="requires optional dependencies for annoy",
+)
 def test_vectorize_records_mixed_numerical_boolean_categorical():
     # Test data with mixed types: numerical and categorical only
     db_records = [["1.0", "true", "apple"], ["2.0", "false", "banana"]]
@@ -606,6 +614,10 @@ def test_vectorize_records_mixed_numerical_boolean_categorical():
     ), "Query vectors column count mismatch"
 
 
+@pytest.mark.skipif(
+    not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
+    reason="requires optional dependencies for annoy",
+)
 def test_annoy_post_process():
     # Test data
     load_records = [["Alice", "Engineer"], ["Bob", "Doctor"]]
@@ -632,6 +644,10 @@ def test_annoy_post_process():
     assert not insert_records
 
 
+@pytest.mark.skipif(
+    not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
+    reason="requires optional dependencies for annoy",
+)
 def test_annoy_post_process__insert_records():
     # Test data
     load_records = [["Alice", "Engineer"], ["Bob", "Doctor"]]
@@ -687,6 +703,10 @@ def test_annoy_post_process__no_query_records():
     ]  # The first insert record should match the second load record
 
 
+@pytest.mark.skipif(
+    not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
+    reason="requires optional dependencies for annoy",
+)
 def test_annoy_post_process__insert_records_with_polymorphic_fields():
     # Test data
     load_records = [
@@ -722,6 +742,10 @@ def test_annoy_post_process__insert_records_with_polymorphic_fields():
     ]  # The first insert record should match the second load record
 
 
+@pytest.mark.skipif(
+    not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
+    reason="requires optional dependencies for annoy",
+)
 def test_single_record_match_annoy_post_process():
     # Mock data where only the first query record matches the first load record
     load_records = [["Alice", "Engineer"], ["Bob", "Doctor"]]
diff --git a/cumulusci/tasks/bulkdata/tests/test_step.py b/cumulusci/tasks/bulkdata/tests/test_step.py
index e94e91f226..3887b270f3 100644
--- a/cumulusci/tasks/bulkdata/tests/test_step.py
+++ b/cumulusci/tasks/bulkdata/tests/test_step.py
@@ -1232,7 +1232,9 @@ def test_process_insert_records_failure(self, download_mock):
                 )
 
     @mock.patch("cumulusci.tasks.bulkdata.step.download_file")
-    def test_select_records_similarity_strategy__insert_records(self, download_mock):
+    def test_select_records_similarity_strategy__insert_records__non_zero_threshold(
+        self, download_mock
+    ):
         # Set up mock context and BulkApiDmlOperation
         context = mock.Mock()
         # Add step with threshold
@@ -1325,6 +1327,102 @@ def test_select_records_similarity_strategy__insert_records(self, download_mock)
             == 1
         )
 
+    @mock.patch("cumulusci.tasks.bulkdata.step.download_file")
+    def test_select_records_similarity_strategy__insert_records__zero_threshold(
+        self, download_mock
+    ):
+        # threshold=0: the identical record is selected, the two other records are inserted
+        context = mock.Mock()
+        # Build the select step with an explicit similarity threshold of 0
+        step = BulkApiDmlOperation(
+            sobject="Contact",
+            operation=DataOperationType.QUERY,
+            api_options={"batch_size": 10, "update_key": "LastName"},
+            context=context,
+            fields=["Name", "Email"],
+            selection_strategy=SelectStrategy.SIMILARITY,
+            threshold=0,
+        )
+
+        # Mock Bulk API responses
+        step.bulk.endpoint = "https://test"
+        step.bulk.create_query_job.return_value = "JOB"
+        step.bulk.query.return_value = "BATCH"
+        step.bulk.get_query_batch_result_ids.return_value = ["RESULT"]
+
+        # Downloads are served in order: select result (one record, JSON text), then insert results (CSV)
+        select_results = io.StringIO(
+            """[{"Id":"003000000000001", "Name":"Jawad", "Email":"mjawadtp@example.com"}]"""
+        )
+        insert_results = io.StringIO(
+            "Id,Success,Created\n003000000000002,true,true\n003000000000003,true,true\n"
+        )
+        download_mock.side_effect = [select_results, insert_results]
+
+        # Mock the _wait_for_job method to simulate a successful job
+        step._wait_for_job = mock.Mock()
+        step._wait_for_job.return_value = DataOperationJobResult(
+            DataOperationStatus.SUCCESS, [], 0, 0
+        )
+
+        # Three load records; only the first is identical to the record in select_results
+        records = iter(
+            [
+                ["Jawad", "mjawadtp@example.com"],
+                ["Aditya", "aditya@example.com"],
+                ["Tom", "cruise@example.com"],
+            ]
+        )
+
+        # Stand-in returned by the patched BulkApiDmlOperation; presumably the insert sub-step (confirm in step.py)
+        insert_step = mock.Mock(spec=BulkApiDmlOperation)
+        insert_step.start = mock.Mock()
+        insert_step.load_records = mock.Mock()
+        insert_step.end = mock.Mock()
+        insert_step.batch_ids = ["BATCH1"]
+        insert_step.bulk = mock.Mock()
+        insert_step.bulk.endpoint = "https://test"
+        insert_step.job_id = "JOB"
+
+        with mock.patch(
+            "cumulusci.tasks.bulkdata.step.BulkApiDmlOperation",
+            return_value=insert_step,
+        ):
+            # Run the select while the class is patched, so new BulkApiDmlOperation instances are insert_step
+            step.start()
+            step.select_records(records)
+            step.end()
+
+        # Get the results and assert their properties
+        results = list(step.get_results())
+
+        assert len(results) == 3  # Expect 3 results (matching the input records count)
+        # Expect one selected record (created=False) and two inserted records (created=True)
+        assert (
+            results.count(
+                DataOperationResult(
+                    id="003000000000001", success=True, error="", created=False
+                )
+            )
+            == 1
+        )
+        assert (
+            results.count(
+                DataOperationResult(
+                    id="003000000000002", success=True, error="", created=True
+                )
+            )
+            == 1
+        )
+        assert (
+            results.count(
+                DataOperationResult(
+                    id="003000000000003", success=True, error="", created=True
+                )
+            )
+            == 1
+        )
+
     @mock.patch("cumulusci.tasks.bulkdata.step.download_file")
     def test_select_records_similarity_strategy__insert_records__no_select_records(
         self, download_mock
@@ -2807,7 +2905,9 @@ def test_process_insert_records_failure(self):
                 mock_rest_api_dml_operation.end.assert_not_called()
 
     @responses.activate
-    def test_select_records_similarity_strategy__insert_records(self):
+    def test_select_records_similarity_strategy__insert_records__non_zero_threshold(
+        self,
+    ):
         mock_describe_calls()
         task = _make_task(
             LoadData,
@@ -2891,6 +2991,91 @@ def test_select_records_similarity_strategy__insert_records(self):
             == 1
         )
 
+    @responses.activate
+    def test_select_records_similarity_strategy__insert_records__zero_threshold(self):
+        mock_describe_calls()
+        task = _make_task(
+            LoadData,
+            {
+                "options": {
+                    "database_url": "sqlite:///test.db",
+                    "mapping": "mapping.yml",
+                }
+            },
+        )
+        task.project_config.project__package__api_version = CURRENT_SF_API_VERSION
+        task._init_task()
+
+        # Create an UPSERT step using the SIMILARITY strategy and a threshold of 0
+        step = RestApiDmlOperation(
+            sobject="Contact",
+            operation=DataOperationType.UPSERT,
+            api_options={"batch_size": 10},
+            context=task,
+            fields=["Name", "Email"],
+            selection_strategy=SelectStrategy.SIMILARITY,
+            threshold=0,
+        )
+
+        results_select_call = {
+            "records": [
+                {
+                    "Id": "003000000000001",
+                    "Name": "Jawad",
+                    "Email": "mjawadtp@example.com",
+                },
+            ],
+            "done": True,
+        }
+
+        results_insert_call = [
+            {"id": "003000000000002", "success": True, "created": True},
+            {"id": "003000000000003", "success": True, "created": True},
+        ]
+
+        step.sf.restful = mock.Mock(
+            side_effect=[results_select_call, results_insert_call]
+        )
+        records = iter(
+            [
+                ["Jawad", "mjawadtp@example.com"],
+                ["Aditya", "aditya@example.com"],
+                ["Tom Cruise", "tom@example.com"],
+            ]
+        )
+        step.start()
+        step.select_records(records)
+        step.end()
+
+        # Collect every result produced by the step before asserting on them
+        results = list(step.get_results())
+        assert len(results) == 3  # Expect 3 results (matching the input records count)
+        # Each expected result must appear exactly once: one with created=False, two with created=True
+        assert (
+            results.count(
+                DataOperationResult(
+                    id="003000000000001", success=True, error="", created=False
+                )
+            )
+            == 1
+        )
+        assert (
+            results.count(
+                DataOperationResult(
+                    id="003000000000002", success=True, error="", created=True
+                )
+            )
+            == 1
+        )
+        assert (
+            results.count(
+                DataOperationResult(
+                    id="003000000000003", success=True, error="", created=True
+                )
+            )
+            == 1
+        )
+
     @responses.activate
     def test_insert_dml_operation__boolean_conversion(self):
         mock_describe_calls()
diff --git a/cumulusci/tasks/vlocity/vlocity.py b/cumulusci/tasks/vlocity/vlocity.py
index c1d0f68085..7c772b97d1 100644
--- a/cumulusci/tasks/vlocity/vlocity.py
+++ b/cumulusci/tasks/vlocity/vlocity.py
@@ -27,7 +27,7 @@
 LWC_RSS_NAME = "OmniStudioLightning"
 OMNI_NAMESPACE = "omnistudio"
 VBT_SF_ALIAS = "cci-vbt-target"
-SF_TOKEN_ENV = "SFDX_ACCESS_TOKEN"
+SF_TOKEN_ENV = "SF_ACCESS_TOKEN"
 VBT_TOKEN_ENV = "OMNIOUT_TOKEN"
 
 
@@ -106,7 +106,9 @@ def _add_token_to_sfdx(self, access_token: str, instance_url: str) -> str:
         # TODO: Use the sf v2 form of this command instead (when we migrate)
         token_store_cmd = [
             "sf",
-            "org login access-token",
+            "org",
+            "login",
+            "access-token",
             "--no-prompt",
             "--alias",
             f"{VBT_SF_ALIAS}",
diff --git a/docs/data.md b/docs/data.md
index fe9396a4ae..ba61076315 100644
--- a/docs/data.md
+++ b/docs/data.md
@@ -352,6 +352,9 @@ This parameter is **optional**; if not specified, no threshold will be applied a
 
 This feature is particularly useful during version upgrades, where records that closely match can be selected, while those that do not match sufficiently can be inserted into the target org.
 
+**Important Note:**  
+For high volumes of records, an approximation algorithm is applied to improve performance. In such cases, setting a threshold of `0` may not guarantee the selection of exact matches, as the algorithm can assign a small non-zero similarity score to exact matches. To ensure accurate selection, it is recommended to set the threshold to a small value slightly greater than `0`, such as `0.1`. This ensures both precision and efficiency in the selection process.
+
 ---
 
 #### Example
diff --git a/docs/env-var-reference.md b/docs/env-var-reference.md
index cc01c8a798..4d6e3fa738 100644
--- a/docs/env-var-reference.md
+++ b/docs/env-var-reference.md
@@ -70,3 +70,7 @@ org, e.g. a Dev Hub. Set with SFDX_CLIENT_ID.
 ## `SFDX_ORG_CREATE_ARGS`
 
 Extra arguments passed to `sf org create scratch`.
+
+To provide additional arguments, set this variable to the options exactly as you would type them on the command line. For instance, to set the release to "preview", the value of the environment variable would be: "--release=preview"
+
+To specify multiple options, you can include them together, like: "--edition=developer --release=preview"
diff --git a/docs/history.md b/docs/history.md
index 5ff75547d1..a1f1e7fdbc 100644
--- a/docs/history.md
+++ b/docs/history.md
@@ -2,6 +2,77 @@
 
 <!-- latest-start -->
 
+## v4.2.0 (2025-01-20)
+
+<!-- Release notes generated using configuration in .github/release.yml at main -->
+
+## What's Changed
+
+### Changes 🎉
+
+-   fix: Prevent 'directory not empty' error in `snowfakery` on Linux by [@schenkkp](https://github.com/schenkkp) in [#3864](https://github.com/SFDO-Tooling/CumulusCI/pull/3864)
+-   @W-17621812: Update \_get_required_permission_types to handle SELECT operations by [@aditya-balachander](https://github.com/aditya-balachander) in [#3870](https://github.com/SFDO-Tooling/CumulusCI/pull/3870)
+
+## New Contributors
+
+-   @RupertBarrow made their first contribution in [#3490](https://github.com/SFDO-Tooling/CumulusCI/pull/3490)
+-   @schenkkp made their first contribution in [#3864](https://github.com/SFDO-Tooling/CumulusCI/pull/3864)
+
+**Full Changelog**: https://github.com/SFDO-Tooling/CumulusCI/compare/v4.1.0...v4.2.0
+
+<!-- latest-stop -->
+
+## v4.1.0 (2025-01-09)
+
+<!-- Release notes generated using configuration in .github/release.yml at main -->
+
+## What's Changed
+
+### Changes 🎉
+
+-   Remove default declaration for select rows query by [@mjawadtp](https://github.com/mjawadtp) in
+    [#3867](https://github.com/SFDO-Tooling/CumulusCI/pull/3867)
+-   @W-17427085: Set ANNOY related dependencies to be optional by [@aditya-balachander](https://github.com/aditya-balachander) in [#3858](https://github.com/SFDO-Tooling/CumulusCI/pull/3858)
+-   Update on the documentation for SFDX_ORG_CREATE_ARGS environment variable by [@lakshmi2506](https://github.com/lakshmi2506) in [#3861](https://github.com/SFDO-Tooling/CumulusCI/pull/3861)
+-   @W-16485311: Core Logic for Selecting Records from Target Org by [@aditya-balachander](https://github.com/aditya-balachander) in [#3818](https://github.com/SFDO-Tooling/CumulusCI/pull/3818)
+-   Add integration tests for all selection strategies by [@mjawadtp](https://github.com/mjawadtp) in [#3851](https://github.com/SFDO-Tooling/CumulusCI/pull/3851)
+-   @W-17357226: Fix for issue where zero threshold defaulted to select by [@aditya-balachander](https://github.com/aditya-balachander) in [#3853](https://github.com/SFDO-Tooling/CumulusCI/pull/3853)
+-   @W-17366392: Fix Omnistudio issues for the sf command format by [@lakshmi2506](https://github.com/lakshmi2506) in [#3855](https://github.com/SFDO-Tooling/CumulusCI/pull/3855)
+-   fix: restore task and flow reference docs by [@jstvz](https://github.com/jstvz) in [#3856](https://github.com/SFDO-Tooling/CumulusCI/pull/3856)
+-   @W-17412267: Fix for records not being inserted when threshold 0 by [@aditya-balachander](https://github.com/aditya-balachander) in [#3857](https://github.com/SFDO-Tooling/CumulusCI/pull/3857)
+
+**Full Changelog**: https://github.com/SFDO-Tooling/CumulusCI/compare/v4.0.1.dev1...v4.1.0
+
+## v4.1.0.dev1 (2025-01-01)
+
+<!-- Release notes generated using configuration in .github/release.yml at main -->
+
+## What's Changed
+
+### Changes 🎉
+
+-   @W-17427085: Set ANNOY related dependencies to be optional by [@aditya-balachander](https://github.com/aditya-balachander) in [#3858](https://github.com/SFDO-Tooling/CumulusCI/pull/3858)
+-   Update on the documentation for SFDX_ORG_CREATE_ARGS environment variable by [@lakshmi2506](https://github.com/lakshmi2506) in [#3861](https://github.com/SFDO-Tooling/CumulusCI/pull/3861)
+
+**Full Changelog**: https://github.com/SFDO-Tooling/CumulusCI/compare/v4.0.1.dev0...v4.0.1.dev1
+
+## v4.0.1.dev0 (2024-12-16)
+
+<!-- Release notes generated using configuration in .github/release.yml at main -->
+
+## What's Changed
+
+### Changes 🎉
+
+-   @W-16485311: Core Logic for Selecting Records from Target Org by [@aditya-balachander](https://github.com/aditya-balachander) in [#3818](https://github.com/SFDO-Tooling/CumulusCI/pull/3818)
+-   Add integration tests for all selection strategies by [@mjawadtp](https://github.com/mjawadtp) in [#3851](https://github.com/SFDO-Tooling/CumulusCI/pull/3851)
+-   @W-17357226: Fix for issue where zero threshold defaulted to select by [@aditya-balachander](https://github.com/aditya-balachander) in [#3853](https://github.com/SFDO-Tooling/CumulusCI/pull/3853)
+-   @W-17366392: Fix Omnistudio issues for the sf command format by [@lakshmi2506](https://github.com/lakshmi2506) in [#3855](https://github.com/SFDO-Tooling/CumulusCI/pull/3855)
+-   fix: restore task and flow reference docs by [@jstvz](https://github.com/jstvz) in [#3856](https://github.com/SFDO-Tooling/CumulusCI/pull/3856)
+-   @W-17412267: Fix for records not being inserted when threshold 0 by [@aditya-balachander](https://github.com/aditya-balachander) in [#3857](https://github.com/SFDO-Tooling/CumulusCI/pull/3857)
+
+**Full Changelog**: https://github.com/SFDO-Tooling/CumulusCI/compare/v4.0.1...v4.1.0
+
 ## v4.0.1 (2024-11-18)
 
 ### Issues Fixed 🩴
@@ -10,8 +81,6 @@
 
 **Full Changelog**: https://github.com/SFDO-Tooling/CumulusCI/compare/v4.0.0...v4.0.1
 
-<!-- latest-stop -->
-
 ## v4.0.0 (2024-11-12)
 
 ## What's Changed
@@ -1694,9 +1763,9 @@ Critical Changes:
         subfolders will see a change in resolution behavior. Previously,
         a dependency specified like this:
 
-            dependencies:
-                - github: https://github.com/SalesforceFoundation/NPSP
-                  subfolder: unpackaged/config/trial
+                      dependencies:
+                          - github: https://github.com/SalesforceFoundation/NPSP
+                            subfolder: unpackaged/config/trial
 
         would always deploy from the latest commit on the default
         branch. Now, this dependency will be resolved to a GitHub commit
@@ -1707,12 +1776,12 @@ Critical Changes:
     -   The `project__dependencies` section in `cumulusci.yml` no longer
         supports nested dependencies specified like this:
 
-            dependencies:
-                - namespace: "test"
-                  version: "1.0"
-                  dependencies:
-                    - namespace: "parent"
-                      version: "2.2"
+                      dependencies:
+                          - namespace: "test"
+                            version: "1.0"
+                            dependencies:
+                              - namespace: "parent"
+                                version: "2.2"
 
         All dependencies should be listed in install order.
 
@@ -1881,12 +1950,12 @@ Critical changes:
     -   The `project__dependencies` section in `cumulusci.yml` will no
         longer support nested dependencies specified like this :
 
-            dependencies:
-              - namespace: "test"
-                version: "1.0"
-                dependencies:
-                  - namespace: "parent"
-                    version: "2.2"
+                      dependencies:
+                        - namespace: "test"
+                          version: "1.0"
+                          dependencies:
+                            - namespace: "parent"
+                              version: "2.2"
 
     All dependencies should be listed in install order.
 
@@ -3493,33 +3562,33 @@ New features:
 
     : -
 
-            Added keywords for generating a collection of sObjects according to a template:
+                      Added keywords for generating a collection of sObjects according to a template:
 
-            :   -   `Generate Test Data`
-                -   `Salesforce Collection Insert`
-                -   `Salesforce Collection Update`
+                      :   -   `Generate Test Data`
+                          -   `Salesforce Collection Insert`
+                          -   `Salesforce Collection Update`
 
-        -
+                  -
 
-            Changes to Page Objects:
+                      Changes to Page Objects:
 
-            :   -   More than one page object can be loaded at once.
-                    Once loaded, the keywords of a page object remain
-                    visible in the suite. Robot will give priority to
-                    keywords in the reverse order in which they were
-                    imported.
-                -   There is a new keyword, `Log Current Page Object`,
-                    which can be useful to see information about the
-                    most recently loaded page object.
-                -   There is a new keyword, `Get Page Object`, which
-                    will return the robot library for a given page
-                    object. This can be used in other keywords to access
-                    keywords from another page object if necessary.
-                -   The `Go To Page` keyword will now automatically load
-                    the page object for the given page.
+                      :   -   More than one page object can be loaded at once.
+                              Once loaded, the keywords of a page object remain
+                              visible in the suite. Robot will give priority to
+                              keywords in the reverse order in which they were
+                              imported.
+                          -   There is a new keyword, `Log Current Page Object`,
+                              which can be useful to see information about the
+                              most recently loaded page object.
+                          -   There is a new keyword, `Get Page Object`, which
+                              will return the robot library for a given page
+                              object. This can be used in other keywords to access
+                              keywords from another page object if necessary.
+                          -   The `Go To Page` keyword will now automatically load
+                              the page object for the given page.
 
-        -   Added a basic debugger for Robot tests. It can be enabled
-            using the `-o debug True` option to the robot task.
+                  -   Added a basic debugger for Robot tests. It can be enabled
+                      using the `-o debug True` option to the robot task.
 
 -   Added support for deploying new metadata types
     `ProfilePasswordPolicy` and `ProfileSessionSetting`.
@@ -3594,8 +3663,8 @@ New features:
     permanently set this option, add this in
     `~/.cumulusci/cumulusci.yml`:
 
-        cli:
-            plain_output: True
+                  cli:
+                      plain_output: True
 
 -   Added additional info to the `cci version` command, including the
     Python version, an upgrade check, and a warning on Python 2.
@@ -4876,12 +4945,12 @@ Resolving a few issues from beta77:
     below. In flows that need to inject the actual namespace prefix,
     override the [unmanaged]{.title-ref} option .. :
 
-        custom_deploy_task:
-            class_path: cumulusci.tasks.salesforce.Deploy
-            options:
-                path: your/custom/metadata
-                namespace_inject: $project_config.project__package__namespace
-                unmanaged: False
+                  custom_deploy_task:
+                      class_path: cumulusci.tasks.salesforce.Deploy
+                      options:
+                          path: your/custom/metadata
+                          namespace_inject: $project_config.project__package__namespace
+                          unmanaged: False
 
 ### Enhancements
 
@@ -5596,13 +5665,13 @@ Resolving a few issues from beta77:
 -   **IMPORANT** This release changes the yaml structure for flows. The
     new structure now looks like this:
 
-        flows:
-            flow_name:
-                tasks:
-                    1:
-                        task: deploy
-                    2:
-                        task: run_tests
+                  flows:
+                      flow_name:
+                          tasks:
+                              1:
+                                  task: deploy
+                              2:
+                                  task: run_tests
 
 -   See the new flow customization examples in the cookbook for examples
     of why this change was made and how to use it:
diff --git a/docs/org_config-reference.md b/docs/org_config-reference.md
new file mode 100644
index 0000000000..aa2f76b18c
--- /dev/null
+++ b/docs/org_config-reference.md
@@ -0,0 +1,59 @@
+# org_config Object Reference
+
+The `org_config` object can be used in the `cumulusci.yml` file to read a large number of attributes of the Salesforce org currently in use. For example, in a [custom flow step](https://cumulusci.readthedocs.io/en/stable/config.html#add-a-flow-step), you can use a `when` clause to adapt the behavior of the new step to the type of org (scratch org or not) by referencing the `org_config.scratch` attribute.
+
+The following information is taken from the application's source code:
+https://github.com/SFDO-Tooling/CumulusCI/blob/main/cumulusci/core/config/org_config.py
+
+## org_config Object Attributes
+
+-   `access_token` : access token currently used to authenticate with Salesforce
+-   `installed_packages` : comma-separated list of package names; a `dict` mapping a namespace or metadata package ID (starts with `033`) to the installed package version(s) matching that identifier. All values are lists, because multiple second-generation packages may be installed with the same namespace. The beta version of a package is represented as "1.2.3b5", where 5 is the build number.
+-   `instance_url` : eg `https://crazy-demo.scratch.my.salesforce.com`
+-   `instance_name` : eg `crazy-demo`
+-   `is_advanced_currency_management_enabled` : `true` or `false`
+-   `is_multiple_currencies_enabled`: `true` or `false`
+-   `is_person_accounts_enabled` : `true` or `false`
+-   `is_sandbox` : `true` if the org is a sandbox
+-   `is_survey_advanced_features_enabled`: `true` or `false`
+-   `lightning_base_url` : base url ending with `.lightning.force.com`
+-   `namespace` : namespace of the org
+-   `namespaced` : `true` if the org has a namespace
+-   `org_id` : Organization ID of the Salesforce org
+-   `org_type` : eg "Enterprise Edition" or "Developer Edition"
+-   `organization_sobject` : The Organization object (see the [Salesforce documentation for the Organization SObject](https://developer.salesforce.com/docs/atlas.en-us.object_reference.meta/object_reference/sforce_api_objects_organization.htm)) for the org.
+-   `scratch` : `True` when the org is a scratch org.
+-   `start_url`: the frontdoor URL that results in an instant login, like `https://mydomain.my.salesforce.com/secur/frontdoor.jsp?sid=...`
+-   `user_id` : user ID of the current Salesforce user
+-   `userinfo`: user OAuth2 information (see https://help.salesforce.com/s/articleView?id=sf.remoteaccess_using_userinfo_endpoint.htm)
+-   `username` : username of the current Salesforce user
+
+## Other org_config Object Attributes
+
+-   `config_file`
+-   `config_name`
+-   `latest_api_version`
+-   `name`
+-   `salesforce_client`
+-   `sfdx_alias`
+
+## org_config Object Methods
+
+-   `has_minimum_package_version(package_identifier, version_identifier)`: `true` if the org has a version of the specified package that is equal to or newer than the supplied version identifier.
+    The package identifier may be either a namespace or a `033` MetadataPackage ID.
+    The version identifier should be in "1.2.3" or "1.2.3b4" format.
+
+    `when` expressions can use the `has_minimum_package_version` method to check if a package is installed with a sufficient version.
+
+    For example:
+    `when: org_config.has_minimum_package_version("namespace", "1.0")`
+
+    See a real-life example here: https://trailhead.salesforce.com/fr/trailblazer-community/feed/0D54V00007erukZSAQ
+
+-   `get_community_info(community_name, force_refresh=False)`: Returns the community information for the given community (see https://developer.salesforce.com/docs/atlas.en-us.chatterapi.meta/chatterapi/connect_responses_community.htm)
+
+    An API call will be made the first time this method is used,
+    and the return values will be cached. Subsequent calls will
+    not call the API unless the requested community name is not in
+    the cached results, or unless the force_refresh parameter is
+    set to `True`.
diff --git a/docs/reference.md b/docs/reference.md
index be688a4590..a15b88f497 100644
--- a/docs/reference.md
+++ b/docs/reference.md
@@ -6,7 +6,8 @@ maxdepth: 1
 ---
 
 cheat-sheet
-tasks
+cumulusci_tasks
 flows
-env_var_reference
+env-var-reference
+org_config-reference
 ```
diff --git a/pyproject.toml b/pyproject.toml
index 7dec9eedab..d840b1eb9e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -23,7 +23,6 @@ classifiers = [
     "Programming Language :: Python :: 3.13",
 ]
 dependencies = [
-    "annoy",
     "click>=8.1",
     "cryptography",
     "python-dateutil",
@@ -35,8 +34,6 @@ dependencies = [
     "defusedxml",
     "lxml",
     "MarkupSafe",
-    "numpy",
-    "pandas",
     "psutil",
     "pydantic<2",
     "PyJWT",
@@ -53,7 +50,6 @@ dependencies = [
     "rst2ansi>=0.1.5",
     "salesforce-bulk",
     "sarge",
-    "scikit-learn",
     "selenium<4",
     "simple-salesforce==1.11.4",
     "snowfakery>=4.0.0",
@@ -88,6 +84,14 @@ lint = [
     "pre-commit>=3.5.0",
 ]
 
+[project.optional-dependencies]
+select = [
+    "annoy",
+    "numpy",
+    "pandas",
+    "scikit-learn",
+]
+
 [project.scripts]
 cci = "cumulusci.cli.cci:main"
 snowfakery = "snowfakery.cli:main"