Using loky (thin wrapper around multiprocessing) instead of dask (#947)

This change has a lot of benefits:
- Internal: dropping a large dependency and lots of complex code
- External: users can now define `map` functions in various places (notebooks, top-level scripts, etc.) without needing to protect the surrounding code with `if __name__ == '__main__'` blocks, and without hitting various pickle serialization errors (a minimal sketch follows this list).
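
To illustrate the pattern this unlocks, here is a minimal sketch using loky's `get_reusable_executor` entry point; the worker function and data are illustrative, not from this repo. Because loky serializes tasks with cloudpickle, a function defined interactively (e.g. in a notebook cell) can be shipped to worker processes without a `__main__` guard:

```python
from loky import get_reusable_executor

def tokenize(text: str) -> list[str]:
  """A worker function defined at the top level of a notebook cell or script.

  With stdlib multiprocessing this can fail to pickle, or requires an
  `if __name__ == '__main__':` guard; loky ships it via cloudpickle.
  """
  return text.lower().split()

# Reuses a pool of workers across calls instead of spawning a new one each time.
executor = get_reusable_executor(max_workers=4)
results = list(executor.map(tokenize, ['Hello World', 'loky over dask']))
print(results)  # [['hello', 'world'], ['loky', 'over', 'dask']]
```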
dsmilkov authored Dec 13, 2023
1 parent 08353dd commit b80710e
Showing 37 changed files with 472 additions and 1,105 deletions.
3 changes: 0 additions & 3 deletions .env
@@ -4,9 +4,6 @@
# When not defined, define the project directory with `lilac start ./data`.
# LILAC_PROJECT_DIR=./data

-# Set to 1 for duckdb to use views instead of materialized tables (lower memory usage, but slower).
-DUCKDB_USE_VIEWS=0
-
# Set to true to enable read-only mode, disabling the ability to add datasets & compute dataset
# signals.
# LILAC_AUTH_ENABLED=true
6 changes: 3 additions & 3 deletions .vscode/settings.json
@@ -48,14 +48,14 @@
"editor.defaultFormatter": "charliermarsh.ruff",
"editor.formatOnSave": true,
"editor.codeActionsOnSave": {
"source.fixAll": false,
"source.fixAll.ruff": true
"source.fixAll": "never",
"source.fixAll.ruff": "explicit"
}
},
"[svelte]": {
"editor.defaultFormatter": "esbenp.prettier-vscode",
"editor.codeActionsOnSave": {
"source.organizeImports": true
"source.organizeImports": "explicit"
}
},
"eslint.workingDirectories": ["auto"],
3 changes: 0 additions & 3 deletions docs/deployment/huggingface_spaces.md
@@ -120,9 +120,6 @@ The modal also contains options for environment variables.
- `LILAC_AUTH_ENABLED`: Whether to enable Google authentication on your duplicated server. Set this
to `false`, or delete it, to disable Google authentication. If your HuggingFace space is private,
you can set this to `false` and rely on HuggingFace space authentication.
-- `DUCKDB_USE_VIEWS`: Whether DuckDB uses views (1), or DuckDB tables (0). Views allow for much less
-  RAM consumption, with a runtime query penalty. When using DuckDB tables (0), demos will take more
-  RAM but be much faster.
- `HF_HOME`: This should be kept `/data/.huggingface` if you plan on using Persistent Storage. This
allows the HuggingFace cache to be persistent. If you are not, you should remove this variable
entirely.
6 changes: 3 additions & 3 deletions lilac/conftest.py
@@ -8,7 +8,7 @@

from .data.dataset import Dataset
from .data.dataset_duckdb import DatasetDuckDB
-from .data.dataset_test_utils import TestDaskLogger, make_dataset
+from .data.dataset_test_utils import TestProcessLogger, make_dataset
from .db_manager import set_default_dataset_cls
from .schema import Item, Schema

@@ -30,7 +30,7 @@ def _make_test_data(items: list[Item], schema: Optional[Schema] = None) -> Datas


@pytest.fixture(scope='function')
-def test_dask_logger(tmp_path: pathlib.Path) -> Generator:
+def test_process_logger(tmp_path: pathlib.Path) -> Generator:
"""A pytest fixture for creating a logger that can be read between workers and the scheduler."""
# Return the factory for datasets that test methods can use.
-yield TestDaskLogger(tmp_path)
+yield TestProcessLogger(tmp_path)
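
For context, a hypothetical test using the renamed fixture; pytest injects it by parameter name, and the body is illustrative since `TestProcessLogger`'s methods are not shown in this diff:

```python
from .data.dataset_test_utils import TestProcessLogger

def test_logs_cross_process(test_process_logger: TestProcessLogger) -> None:
  # The fixture yields a TestProcessLogger rooted at a tmp_path, so log
  # entries written by worker processes are readable from the scheduler.
  assert test_process_logger is not None
```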
25 changes: 14 additions & 11 deletions lilac/data/dataset.py
@@ -58,7 +58,7 @@
)
from ..signals.concept_scorer import ConceptSignal
from ..source import Source, resolve_source
-from ..tasks import TaskExecutionType, TaskStepId
+from ..tasks import TaskExecutionType, TaskShardId
from .dataset_format import DatasetFormat

# Threshold for rejecting certain queries (e.g. group by) for columns with large cardinality.
@@ -436,7 +436,7 @@ def compute_signal(
limit: Optional[int] = None,
include_deleted: bool = False,
overwrite: bool = False,
-task_step_id: Optional[TaskStepId] = None,
+task_shard_id: Optional[TaskShardId] = None,
) -> None:
"""Compute a signal for a column.
@@ -447,8 +447,8 @@ def compute_signal(
limit: Limit the number of rows to compute the signal on.
include_deleted: Whether to include deleted rows in the computation.
overwrite: Whether to overwrite an existing signal computed at this path.
-task_step_id: The TaskManager `task_step_id` for this process run. This is used to update the
-  progress of the task.
+task_shard_id: The TaskManager `task_shard_id` for this process run. This is used to update
+  the progress of the task.
"""
pass

@@ -460,11 +460,11 @@ def compute_embedding(
limit: Optional[int] = None,
include_deleted: bool = False,
overwrite: bool = False,
-task_step_id: Optional[TaskStepId] = None,
+task_shard_id: Optional[TaskShardId] = None,
) -> None:
"""Compute an embedding for a given field path."""
signal = get_signal_by_type(embedding, TextEmbeddingSignal)()
-self.compute_signal(signal, path, filters, limit, include_deleted, overwrite, task_step_id)
+self.compute_signal(signal, path, filters, limit, include_deleted, overwrite, task_shard_id)

def compute_concept(
self,
@@ -476,12 +476,18 @@ def compute_concept(
limit: Optional[int] = None,
include_deleted: bool = False,
overwrite: bool = False,
-task_step_id: Optional[TaskStepId] = None,
+task_shard_id: Optional[TaskShardId] = None,
) -> None:
"""Compute concept scores for a given field path."""
signal = ConceptSignal(namespace=namespace, concept_name=concept_name, embedding=embedding)
self.compute_signal(
-signal, path, filters, limit, include_deleted, overwrite=overwrite, task_step_id=task_step_id
+signal,
+path,
+filters,
+limit,
+include_deleted,
+overwrite=overwrite,
+task_shard_id=task_shard_id,
)

@abc.abstractmethod
@@ -529,7 +535,6 @@ def select_rows(
sort_order: Optional[SortOrder] = SortOrder.DESC,
limit: Optional[int] = 100,
offset: Optional[int] = 0,
-task_step_id: Optional[TaskStepId] = None,
resolve_span: bool = False,
combine_columns: bool = False,
include_deleted: bool = False,
@@ -553,8 +558,6 @@
sort_order: The sort order.
limit: The maximum number of rows to return.
offset: The offset to start returning rows from.
-task_step_id: The TaskManager `task_step_id` for this process run. This is used to update the
-  progress.
resolve_span: Whether to resolve the span of the row.
combine_columns: Whether to combine columns into a single object. The object will be pruned
to only include sub-fields that correspond to the requested columns.
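
For orientation, here is a hedged sketch of the renamed keyword as a caller might use it, based only on the signatures in the diff above; the embedding name and field path are hypothetical, and in practice the TaskManager supplies `task_shard_id` rather than the caller constructing one:

```python
# Illustrative only: names below are not from this diff.
# `dataset` is assumed to be a lilac Dataset instance.
dataset.compute_embedding(
  embedding='my-embedding',  # hypothetical embedding registered with Lilac
  path='text',               # hypothetical field path in the dataset
  overwrite=True,
  task_shard_id=None,        # renamed from `task_step_id`; normally a TaskShardId from the TaskManager
)
```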