Using loky (thin wrapper around multiprocessing) instead of dask (#947)

This change has a lot of benefits:
- Internal: dropping a large dependency and lots of complex code
- External: users can now define `map` functions in various places (notebooks, top-level scripts, etc.) without needing to protect the surrounding code with `if __name__ == '__main__'` blocks, and without hitting various pickle serialization errors (a minimal sketch follows this list).
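
To illustrate the pattern this unlocks, here is a minimal sketch using loky's `get_reusable_executor` entry point; the worker function and data are illustrative, not from this repo. Because loky serializes tasks with cloudpickle, a function defined interactively (e.g. in a notebook cell) can be shipped to worker processes without a `__main__` guard:

```python
from loky import get_reusable_executor

def tokenize(text: str) -> list[str]:
  """A worker function defined at the top level of a notebook cell or script.

  With stdlib multiprocessing this can fail to pickle, or requires an
  `if __name__ == '__main__':` guard; loky ships it via cloudpickle.
  """
  return text.lower().split()

# Reuses a pool of workers across calls instead of spawning a new one each time.
executor = get_reusable_executor(max_workers=4)
results = list(executor.map(tokenize, ['Hello World', 'loky over dask']))
print(results)  # [['hello', 'world'], ['loky', 'over', 'dask']]
```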
dsmilkov authored Dec 13, 2023
1 parent 08353dd commit b80710e
Showing 37 changed files with 472 additions and 1,105 deletions.
3 changes: 0 additions & 3 deletions .env
@@ -4,9 +4,6 @@
# When not defined, define the project directory with `lilac start ./data`.
# LILAC_PROJECT_DIR=./data

-# Set to 1 for duckdb to use views instead of materialized tables (lower memory usage, but slower).
-DUCKDB_USE_VIEWS=0
-
# Set to true to enable read-only mode, disabling the ability to add datasets & compute dataset
# signals.
# LILAC_AUTH_ENABLED=true
6 changes: 3 additions & 3 deletions .vscode/settings.json
@@ -48,14 +48,14 @@
"editor.defaultFormatter": "charliermarsh.ruff",
"editor.formatOnSave": true,
"editor.codeActionsOnSave": {
"source.fixAll": false,
"source.fixAll.ruff": true
"source.fixAll": "never",
"source.fixAll.ruff": "explicit"
}
},
"[svelte]": {
"editor.defaultFormatter": "esbenp.prettier-vscode",
"editor.codeActionsOnSave": {
"source.organizeImports": true
"source.organizeImports": "explicit"
}
},
"eslint.workingDirectories": ["auto"],
3 changes: 0 additions & 3 deletions docs/deployment/huggingface_spaces.md
@@ -120,9 +120,6 @@ The modal also contains options for environment variables.
- `LILAC_AUTH_ENABLED`: Whether to enable Google authentication on your duplicated server. Set this
to `false`, or delete it, to disable Google authentication. If your HuggingFace space is private,
you can set this to `false` and rely on HuggingFace space authentication.
-- `DUCKDB_USE_VIEWS`: Whether DuckDB uses views (1), or DuckDB tables (0). Views allow for much less
-  RAM consumption, with a runtime query penalty. When using DuckDB tables (0), demos will take more
-  RAM but be much faster.
- `HF_HOME`: This should be kept `/data/.huggingface` if you plan on using Persistent Storage. This
allows the HuggingFace cache to be persistent. If you are not, you should remove this variable
entirely.
6 changes: 3 additions & 3 deletions lilac/conftest.py
@@ -8,7 +8,7 @@

from .data.dataset import Dataset
from .data.dataset_duckdb import DatasetDuckDB
-from .data.dataset_test_utils import TestDaskLogger, make_dataset
+from .data.dataset_test_utils import TestProcessLogger, make_dataset
from .db_manager import set_default_dataset_cls
from .schema import Item, Schema

@@ -30,7 +30,7 @@ def _make_test_data(items: list[Item], schema: Optional[Schema] = None) -> Datas


@pytest.fixture(scope='function')
-def test_dask_logger(tmp_path: pathlib.Path) -> Generator:
+def test_process_logger(tmp_path: pathlib.Path) -> Generator:
"""A pytest fixture for creating a logger that can be read between workers and the scheduler."""
# Return the factory for datasets that test methods can use.
-yield TestDaskLogger(tmp_path)
+yield TestProcessLogger(tmp_path)
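
For context, a hypothetical test using the renamed fixture; pytest injects it by parameter name, and the body is illustrative since `TestProcessLogger`'s methods are not shown in this diff:

```python
from .data.dataset_test_utils import TestProcessLogger

def test_logs_cross_process(test_process_logger: TestProcessLogger) -> None:
  # The fixture yields a TestProcessLogger rooted at a tmp_path, so log
  # entries written by worker processes are readable from the scheduler.
  assert test_process_logger is not None
```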
25 changes: 14 additions & 11 deletions lilac/data/dataset.py
@@ -58,7 +58,7 @@
)
from ..signals.concept_scorer import ConceptSignal
from ..source import Source, resolve_source
-from ..tasks import TaskExecutionType, TaskStepId
+from ..tasks import TaskExecutionType, TaskShardId
from .dataset_format import DatasetFormat

# Threshold for rejecting certain queries (e.g. group by) for columns with large cardinality.
@@ -436,7 +436,7 @@ def compute_signal(
limit: Optional[int] = None,
include_deleted: bool = False,
overwrite: bool = False,
-task_step_id: Optional[TaskStepId] = None,
+task_shard_id: Optional[TaskShardId] = None,
) -> None:
"""Compute a signal for a column.
@@ -447,8 +447,8 @@ def compute_signal(
limit: Limit the number of rows to compute the signal on.
include_deleted: Whether to include deleted rows in the computation.
overwrite: Whether to overwrite an existing signal computed at this path.
-task_step_id: The TaskManager `task_step_id` for this process run. This is used to update the
-  progress of the task.
+task_shard_id: The TaskManager `task_shard_id` for this process run. This is used to update
+  the progress of the task.
"""
pass

@@ -460,11 +460,11 @@ def compute_embedding(
limit: Optional[int] = None,
include_deleted: bool = False,
overwrite: bool = False,
-task_step_id: Optional[TaskStepId] = None,
+task_shard_id: Optional[TaskShardId] = None,
) -> None:
"""Compute an embedding for a given field path."""
signal = get_signal_by_type(embedding, TextEmbeddingSignal)()
-self.compute_signal(signal, path, filters, limit, include_deleted, overwrite, task_step_id)
+self.compute_signal(signal, path, filters, limit, include_deleted, overwrite, task_shard_id)

def compute_concept(
self,
@@ -476,12 +476,18 @@ def compute_concept(
limit: Optional[int] = None,
include_deleted: bool = False,
overwrite: bool = False,
-task_step_id: Optional[TaskStepId] = None,
+task_shard_id: Optional[TaskShardId] = None,
) -> None:
"""Compute concept scores for a given field path."""
signal = ConceptSignal(namespace=namespace, concept_name=concept_name, embedding=embedding)
self.compute_signal(
-signal, path, filters, limit, include_deleted, overwrite=overwrite, task_step_id=task_step_id
+signal,
+path,
+filters,
+limit,
+include_deleted,
+overwrite=overwrite,
+task_shard_id=task_shard_id,
)

@abc.abstractmethod
@@ -529,7 +535,6 @@ def select_rows(
sort_order: Optional[SortOrder] = SortOrder.DESC,
limit: Optional[int] = 100,
offset: Optional[int] = 0,
-task_step_id: Optional[TaskStepId] = None,
resolve_span: bool = False,
combine_columns: bool = False,
include_deleted: bool = False,
@@ -553,8 +558,6 @@
sort_order: The sort order.
limit: The maximum number of rows to return.
offset: The offset to start returning rows from.
-task_step_id: The TaskManager `task_step_id` for this process run. This is used to update the
-  progress.
resolve_span: Whether to resolve the span of the row.
combine_columns: Whether to combine columns into a single object. The object will be pruned
to only include sub-fields that correspond to the requested columns.
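
For orientation, here is a hedged sketch of the renamed keyword as a caller might use it, based only on the signatures in the diff above; the embedding name and field path are hypothetical, and in practice the TaskManager supplies `task_shard_id` rather than the caller constructing one:

```python
# Illustrative only: names below are not from this diff.
# `dataset` is assumed to be a lilac Dataset instance.
dataset.compute_embedding(
  embedding='my-embedding',  # hypothetical embedding registered with Lilac
  path='text',               # hypothetical field path in the dataset
  overwrite=True,
  task_shard_id=None,        # renamed from `task_step_id`; normally a TaskShardId from the TaskManager
)
```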