Export to HuggingFace. Support glaive-function-calling-v2 in the demo, clusters, and via sharegpt. (#1113)

Add `Dataset.to_huggingface`, which returns a HuggingFace dataset. Users
can then publish it explicitly.

To get the glaive-function-calling-v2 dataset to work, we had to:
- Write a notebook to `map()` the data in Lilac over to the sharegpt
format.

Here is the output dataset:
https://huggingface.co/datasets/lilacai/glaive-function-calling-v2-sharegpt
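The glaive source rows pack a whole conversation into flat `system`/`chat` strings with `USER:`/`ASSISTANT:`/`FUNCTION RESPONSE:` markers, so the notebook's job is essentially a string-to-turns parse. A rough, self-contained sketch of that mapping (field names and role markers are assumptions about the source data, not Lilac APIs):

```python
import re

# sharegpt roles keyed by the glaive transcript markers (assumed).
_ROLES = {'USER': 'human', 'ASSISTANT': 'gpt', 'FUNCTION RESPONSE': 'tool'}


def glaive_to_sharegpt(row: dict) -> dict:
  """Map one glaive-function-calling-v2 row to the sharegpt schema."""
  conversations = []
  system = (row.get('system') or '').removeprefix('SYSTEM:').strip()
  if system:
    conversations.append({'from': 'system', 'value': system})
  # Split the flat transcript on the role markers; the capturing group keeps
  # the marker, so roles and values alternate in the result.
  parts = re.split(r'(USER|ASSISTANT|FUNCTION RESPONSE): ', row['chat'])
  for role, value in zip(parts[1::2], parts[2::2]):
    conversations.append({'from': _ROLES[role], 'value': value.strip()})
  return {'conversations': conversations}
```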


Demos:
- https://lilacai-lilac.hf.space/datasets#lilac/glaive-function-calling-v2
- https://lilacai-lilac.hf.space/datasets#lilac/glaive-function-calling-v2&viewPivot=true&pivot=%7B%22outerPath%22%3A%5B%22conversation_clusters%22%2C%22category_title%22%5D%2C%22innerPath%22%3A%5B%22conversation_clusters%22%2C%22cluster_title%22%5D%7D

Other:
- Allow passing a project directory to the dev server: `./run_server_dev.sh ./demo_data`

Input:

![image](https://github.com/lilacai/lilac/assets/1100749/e682b9ba-3279-471d-997f-12c86c26eae7)

Output:

![image](https://github.com/lilacai/lilac/assets/1100749/cd5a8ea3-c90b-4bd7-8b90-c3ea625eb2e0)
nsthorat authored Jan 23, 2024
1 parent d55a2f1 commit 3e8f292
Showing 14 changed files with 352 additions and 38 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -121,7 +121,7 @@ If you prefer, you can load datasets directly from the UI without writing any Py

### 🔎 Explore

> [🔗 Try OpenOrca-100K before installing!](https://lilacai-lilac.hf.space/datasets#lilac/OpenOrca-100k)
> [🔗 Try OpenOrca before installing!](https://lilacai-lilac.hf.space/datasets#lilac/OpenOrca)
Once we've loaded a dataset, we can explore it from the UI and get a sense for what's in the data.
More documentation [here](https://docs.lilacml.com/datasets/dataset_explore.html).
6 changes: 3 additions & 3 deletions docs/welcome.md
@@ -14,9 +14,9 @@
Lilac is an open-source tool that enables data and AI practitioners to improve their products by
improving their data.

[Try Lilac on HuggingFace Spaces](https://lilacai-lilac.hf.space/datasets#lilac/OpenOrca-100k),
where we've preloaded popular datasets like OpenOrca. Try a semantic search for "As a language
model" on the OpenOrca dataset!
[Try Lilac on HuggingFace Spaces](https://lilacai-lilac.hf.space/datasets#lilac/OpenOrca), where
we've preloaded popular datasets like OpenOrca. Try a semantic search for "As a language model" on
the OpenOrca dataset!

## Why use Lilac?

3 changes: 2 additions & 1 deletion lilac/__init__.py
@@ -12,7 +12,7 @@
from .data import * # noqa: F403
from .data.dataset_duckdb import DatasetDuckDB
from .data.dataset_storage_utils import download, upload
from .db_manager import get_dataset, list_datasets, set_default_dataset_cls
from .db_manager import get_dataset, has_dataset, list_datasets, set_default_dataset_cls
from .deploy import deploy_config, deploy_project
from .embeddings import * # noqa: F403
from .env import * # noqa: F403
@@ -53,6 +53,7 @@
'from_dicts',
'from_huggingface',
'get_dataset',
'has_dataset',
'list_datasets',
'init',
'span',
21 changes: 21 additions & 0 deletions lilac/data/dataset.py
@@ -10,6 +10,7 @@

import numpy as np
import pandas as pd
from datasets import Dataset as HuggingFaceDataset
from pydantic import (
BaseModel,
ConfigDict,
@@ -977,6 +978,26 @@ def transform(
schema=schema,
)

@abc.abstractmethod
def to_huggingface(
self,
columns: Optional[Sequence[ColumnId]] = None,
filters: Optional[Sequence[FilterLike]] = None,
include_labels: Optional[Sequence[str]] = None,
exclude_labels: Optional[Sequence[str]] = None,
include_deleted: bool = False,
) -> HuggingFaceDataset:
"""Export the dataset to a huggingface dataset.
Args:
columns: The columns to export.
filters: The filters to apply to the query.
include_labels: The labels to include in the export.
exclude_labels: The labels to exclude in the export.
include_deleted: Whether to include deleted rows in the export.
"""
pass

@abc.abstractmethod
def to_json(
self,
24 changes: 24 additions & 0 deletions lilac/data/dataset_duckdb.py
@@ -26,6 +26,7 @@
import orjson
import pandas as pd
import yaml
from datasets import Dataset as HuggingFaceDataset
from pandas.api.types import is_object_dtype
from pydantic import BaseModel, SerializeAsAny, field_validator
from typing_extensions import override
@@ -3177,6 +3178,29 @@ def cluster(
self, input, output_path, min_cluster_size, topic_fn, overwrite, use_garden, task_id=task_id
)

@override
def to_huggingface(
self,
columns: Optional[Sequence[ColumnId]] = None,
filters: Optional[Sequence[FilterLike]] = None,
include_labels: Optional[Sequence[str]] = None,
exclude_labels: Optional[Sequence[str]] = None,
include_deleted: bool = False,
) -> HuggingFaceDataset:
filters, _ = self._normalize_filters(
filter_likes=filters, col_aliases={}, udf_aliases={}, manifest=self.manifest()
)
filters.extend(self._compile_include_exclude_filters(include_labels, exclude_labels))
rows = self.select_rows(
columns, filters=filters, combine_columns=True, include_deleted=include_deleted
)

def _gen() -> Iterator[Item]:
for row in rows:
yield row

return cast(HuggingFaceDataset, HuggingFaceDataset.from_generator(_gen))

@override
def to_json(
self,
35 changes: 35 additions & 0 deletions lilac/data/dataset_export_test.py
@@ -40,6 +40,41 @@ def setup_teardown() -> Iterable[None]:
clear_signal_registry() # Teardown.


def test_export_to_huggingface(make_test_data: TestDataMaker, tmp_path: pathlib.Path) -> None:
dataset = make_test_data([{'text': 'hello'}, {'text': 'everybody'}])
dataset.compute_signal(TestSignal(), 'text')

hf_dataset = dataset.to_huggingface()

assert list(hf_dataset) == [
{'text': {VALUE_KEY: 'hello', 'test_signal': {'flen': 5.0, 'len': 5}}},
{'text': {VALUE_KEY: 'everybody', 'test_signal': {'flen': 9.0, 'len': 9}}},
]


def test_export_to_huggingface_filters(
make_test_data: TestDataMaker, tmp_path: pathlib.Path
) -> None:
dataset = make_test_data([{'text': 'hello'}, {'text': 'everybody'}])
dataset.compute_signal(TestSignal(), 'text')

# Download a subset of columns with filter.
hf_dataset = dataset.to_huggingface(
columns=['text', 'text.test_signal.flen'],
filters=[('text.test_signal.len', 'greater', 6)],
)

assert list(hf_dataset) == [
{'text': {VALUE_KEY: 'everybody', 'test_signal': {'flen': 9.0, 'len': 9}}}
]

hf_dataset = dataset.to_huggingface(filters=[('text.test_signal.flen', 'less_equal', '5')])

assert list(hf_dataset) == [
{'text': {VALUE_KEY: 'hello', 'test_signal': {'flen': 5.0, 'len': 5}}}
]


def test_export_to_json(make_test_data: TestDataMaker, tmp_path: pathlib.Path) -> None:
dataset = make_test_data([{'text': 'hello'}, {'text': 'everybody'}])
dataset.compute_signal(TestSignal(), 'text')
4 changes: 2 additions & 2 deletions lilac/data/dataset_storage_utils.py
@@ -34,8 +34,8 @@ def download(
Args:
url_or_repo: A remote URL to a Lilac-processed dataset. Currently only supports HuggingFace
dataset URLs. Can be a full URL: https://huggingface.co/datasets/lilacai/lilac-OpenOrca-100k
or a repo_id: lilacai/lilac-OpenOrca-100k.
dataset URLs. Can be a full URL: https://huggingface.co/datasets/lilacai/lilac-OpenOrca
or a repo_id: lilacai/lilac-OpenOrca.
project_dir: The project directory to use for the demo. Defaults to `env.LILAC_PROJECT_DIR`
which can be set with `ll.set_project_dir()`.
dataset_namespace: The local namespace to use. Defaults to 'local'.
18 changes: 18 additions & 0 deletions lilac/db_manager.py
@@ -47,6 +47,24 @@ def get_dataset(
return _CACHED_DATASETS[cache_key]


def has_dataset(
namespace: str, dataset_name: str, project_dir: Optional[Union[str, pathlib.Path]] = None
) -> bool:
"""Return True if the dataset exists."""
if not _DEFAULT_DATASET_CLS:
raise ValueError('Default dataset class not set.')

project_dir = project_dir or get_project_dir()
try:
# Try to load the dataset: when it is not in the cache, get_dataset calls the
# constructor, which raises a ValueError if the dataset does not exist.
get_dataset(namespace, dataset_name, project_dir)
return True
except ValueError:
return False
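
The `has_dataset` helper above probes for existence by attempting the load and treating `ValueError` as "missing". The same probe-the-constructor pattern in isolation (names are placeholders, not Lilac APIs):

```python
_CACHE: dict[str, object] = {}


def get_thing(name: str) -> object:
  """Return a cached instance, constructing it on first access."""
  if name not in _CACHE:
    if name != 'exists':  # stand-in for the constructor's on-disk check
      raise ValueError(f'Thing "{name}" does not exist.')
    _CACHE[name] = object()
  return _CACHE[name]


def has_thing(name: str) -> bool:
  """Existence check: let the constructor's error signal a missing thing."""
  try:
    get_thing(name)
    return True
  except ValueError:
    return False
```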


def remove_dataset_from_cache(
namespace: str, dataset_name: str, project_dir: Optional[Union[str, pathlib.Path]] = None
) -> None:
8 changes: 7 additions & 1 deletion lilac/formats/sharegpt.py
@@ -24,6 +24,10 @@ def _sharegpt_selector(item: Item, conv_from: str) -> str:
name='human',
selector=lambda item: _sharegpt_selector(item, 'human'),
)
_TOOL_SELECTOR = DatasetFormatInputSelector(
name='tool',
selector=lambda item: _sharegpt_selector(item, 'tool'),
)
_GPT_SELECTOR = DatasetFormatInputSelector(
name='gpt',
selector=lambda item: _sharegpt_selector(item, 'gpt'),
@@ -50,8 +54,10 @@ class ShareGPT(DatasetFormat):

system: ClassVar[DatasetFormatInputSelector] = _SYSTEM_SELECTOR
human: ClassVar[DatasetFormatInputSelector] = _HUMAN_SELECTOR
tool: ClassVar[DatasetFormatInputSelector] = _TOOL_SELECTOR
gpt: ClassVar[DatasetFormatInputSelector] = _GPT_SELECTOR

input_selectors: ClassVar[dict[str, DatasetFormatInputSelector]] = {
selector.name: selector for selector in [_SYSTEM_SELECTOR, _HUMAN_SELECTOR, _GPT_SELECTOR]
selector.name: selector
for selector in [_SYSTEM_SELECTOR, _HUMAN_SELECTOR, _GPT_SELECTOR, _TOOL_SELECTOR]
}
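
The new `tool` selector follows the same pattern as `system`, `human`, and `gpt`: pick out the `value` of a conversation turn whose `from` field matches. A standalone sketch of that selector logic (the real `_sharegpt_selector` may differ in details such as missing-role handling):

```python
def sharegpt_selector(item: dict, conv_from: str) -> str:
  # Return the text of the first conversation turn from the given role.
  return next(
    conv['value'] for conv in item['conversations'] if conv['from'] == conv_from
  )


item = {
  'conversations': [
    {'from': 'human', 'value': 'What is 2 + 2?'},
    {'from': 'tool', 'value': '4'},
    {'from': 'gpt', 'value': 'The answer is 4.'},
  ]
}
print(sharegpt_selector(item, 'tool'))  # 4
```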
54 changes: 28 additions & 26 deletions lilac_hf_space.yml
@@ -54,6 +54,26 @@ datasets:
path:
- answer

- name: glaive-function-calling-v2
namespace: lilac
source:
dataset_name: lilacai/glaive-function-calling-v2-sharegpt
source_name: huggingface
settings:
tags: [datasets]
ui:
media_paths:
- - conversations
- '*'
- value
markdown_paths: []
embeddings:
- embedding: gte-small
path:
- conversations
- '*'
- value

- name: open-assistant-conversations-2
namespace: lilac
settings:
Expand All @@ -80,7 +100,7 @@ datasets:
- content
source:
source_name: huggingface
dataset_name: OpenAssistant/oasst2
dataset_name: lmsys/lmsys-chat-1m
embeddings:
- embedding: gte-small
path:
@@ -138,27 +158,6 @@ datasets:
path:
- question

- name: 'OpenOrca-100k'
namespace: lilac
settings:
tags: [machine-learning]
ui:
media_paths:
- question
- response
preferred_embedding: 'gte-small'
source:
source_name: huggingface
dataset_name: Open-Orca/OpenOrca
sample_size: 100000
embeddings:
- embedding: gte-small
path:
- question
- embedding: gte-small
path:
- response

- namespace: lilac
name: dolphin
tags: [datasets]
@@ -196,6 +195,13 @@ clusters:
dataset_name: glaive-code-assistant
input_path:
- question
- dataset_namespace: lilac
dataset_name: glaive-function-calling-v2
input_selector:
format: sharegpt
selector: human
output_path:
- conversation_clusters
- dataset_namespace: lilac
dataset_name: open-assistant-conversations-2
input_path:
@@ -215,10 +221,6 @@ clusters:
dataset_name: databricks-dolly-15k-curated-en
input_path:
- original-instruction
- dataset_namespace: lilac
dataset_name: OpenOrca-100k
input_path:
- question
- dataset_namespace: lilac
dataset_name: dolphin
input_path: