Export to HuggingFace. Support glaive-function-calling-v2 in the demo, clusters, and via sharegpt. #1113

Merged 6 commits on Jan 23, 2024
Changes from 2 commits
3 changes: 2 additions & 1 deletion lilac/__init__.py
@@ -12,7 +12,7 @@
from .data import * # noqa: F403
from .data.dataset_duckdb import DatasetDuckDB
from .data.dataset_storage_utils import download, upload
from .db_manager import get_dataset, list_datasets, set_default_dataset_cls
from .db_manager import get_dataset, has_dataset, list_datasets, set_default_dataset_cls
from .deploy import deploy_config, deploy_project
from .embeddings import * # noqa: F403
from .env import * # noqa: F403
@@ -53,6 +53,7 @@
'from_dicts',
'from_huggingface',
'get_dataset',
'has_dataset',
'list_datasets',
'init',
'span',
21 changes: 21 additions & 0 deletions lilac/data/dataset.py
@@ -10,6 +10,7 @@

import numpy as np
import pandas as pd
from datasets import Dataset as HuggingFaceDataset
from pydantic import (
BaseModel,
ConfigDict,
@@ -977,6 +978,26 @@ def transform(
schema=schema,
)

@abc.abstractmethod
def to_huggingface(
self,
columns: Optional[Sequence[ColumnId]] = None,
filters: Optional[Sequence[FilterLike]] = None,
include_labels: Optional[Sequence[str]] = None,
exclude_labels: Optional[Sequence[str]] = None,
include_deleted: bool = False,
) -> HuggingFaceDataset:
"""Export the dataset to a huggingface dataset.

Args:
columns: The columns to export.
filters: The filters to apply to the query.
include_labels: The labels to include in the export.
exclude_labels: The labels to exclude in the export.
include_deleted: Whether to include deleted rows in the export.
"""
pass

@abc.abstractmethod
def to_json(
self,
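For orientation, the new export API can be driven from a loaded dataset roughly as follows. This is a minimal sketch: the `local/my-dataset` name and the `text.test_signal.len` field are illustrative (they mirror the tests later in this PR) and are not defined by this change.

```python
import lilac as ll

ll.set_project_dir('./data')

# Assumes 'local/my-dataset' already exists in the project directory.
ds = ll.get_dataset('local', 'my-dataset')

# Export a filtered subset of columns to a HuggingFace dataset.
hf_ds = ds.to_huggingface(
  columns=['text'],
  filters=[('text.test_signal.len', 'greater', 6)],
)
print(hf_ds)
```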
25 changes: 25 additions & 0 deletions lilac/data/dataset_duckdb.py
@@ -26,6 +26,7 @@
import orjson
import pandas as pd
import yaml
from datasets import Dataset as HuggingFaceDataset
from pandas.api.types import is_object_dtype
from pydantic import BaseModel, SerializeAsAny, field_validator
from typing_extensions import override
@@ -3174,6 +3175,30 @@ def cluster(
self, input, output_path, min_cluster_size, topic_fn, overwrite, use_garden, task_id=task_id
)

@override
def to_huggingface(
self,
columns: Optional[Sequence[ColumnId]] = None,
filters: Optional[Sequence[FilterLike]] = None,
include_labels: Optional[Sequence[str]] = None,
exclude_labels: Optional[Sequence[str]] = None,
include_deleted: bool = False,
) -> HuggingFaceDataset:
filters, _ = self._normalize_filters(
filter_likes=filters, col_aliases={}, udf_aliases={}, manifest=self.manifest()
)
filters.extend(self._compile_include_exclude_filters(include_labels, exclude_labels))
print('filters=', filters)
rows = self.select_rows(
columns, filters=filters, combine_columns=True, include_deleted=include_deleted
)

def _gen() -> Iterator[Item]:
for row in rows:
yield row

return cast(HuggingFaceDataset, HuggingFaceDataset.from_generator(_gen))

@override
def to_json(
self,
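The DuckDB implementation streams `select_rows` results through a generator into `datasets.Dataset.from_generator`, so rows are written out incrementally rather than first collected into one in-memory list. The same pattern in isolation looks roughly like this (a standalone sketch with toy data, not code from this PR):

```python
from typing import Iterator

from datasets import Dataset as HuggingFaceDataset


def _gen() -> Iterator[dict]:
  # Yield one row at a time; from_generator consumes the rows lazily.
  yield {'text': 'hello'}
  yield {'text': 'everybody'}


hf_ds = HuggingFaceDataset.from_generator(_gen)
print(hf_ds.num_rows)  # 2
```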
35 changes: 35 additions & 0 deletions lilac/data/dataset_export_test.py
@@ -40,6 +40,41 @@ def setup_teardown() -> Iterable[None]:
clear_signal_registry() # Teardown.


def test_export_to_huggingface(make_test_data: TestDataMaker, tmp_path: pathlib.Path) -> None:
dataset = make_test_data([{'text': 'hello'}, {'text': 'everybody'}])
dataset.compute_signal(TestSignal(), 'text')

hf_dataset = dataset.to_huggingface()

assert list(hf_dataset) == [
{'text': {VALUE_KEY: 'hello', 'test_signal': {'flen': 5.0, 'len': 5}}},
{'text': {VALUE_KEY: 'everybody', 'test_signal': {'flen': 9.0, 'len': 9}}},
]


def test_export_to_huggingface_filters(
make_test_data: TestDataMaker, tmp_path: pathlib.Path
) -> None:
dataset = make_test_data([{'text': 'hello'}, {'text': 'everybody'}])
dataset.compute_signal(TestSignal(), 'text')

# Download a subset of columns with filter.
hf_dataset = dataset.to_huggingface(
columns=['text', 'text.test_signal.flen'],
filters=[('text.test_signal.len', 'greater', 6)],
)

assert list(hf_dataset) == [
{'text': {VALUE_KEY: 'everybody', 'test_signal': {'flen': 9.0, 'len': 9}}}
]

hf_dataset = dataset.to_huggingface(filters=[('text.test_signal.flen', 'less_equal', '5')])

assert list(hf_dataset) == [
{'text': {VALUE_KEY: 'hello', 'test_signal': {'flen': 5.0, 'len': 5}}}
]


def test_export_to_json(make_test_data: TestDataMaker, tmp_path: pathlib.Path) -> None:
dataset = make_test_data([{'text': 'hello'}, {'text': 'everybody'}])
dataset.compute_signal(TestSignal(), 'text')
15 changes: 15 additions & 0 deletions lilac/db_manager.py
@@ -47,6 +47,21 @@ def get_dataset(
return _CACHED_DATASETS[cache_key]


def has_dataset(
namespace: str, dataset_name: str, project_dir: Optional[Union[str, pathlib.Path]] = None
) -> bool:
"""Get the dataset instance."""
if not _DEFAULT_DATASET_CLS:
raise ValueError('Default dataset class not set.')

project_dir = project_dir or get_project_dir()
try:
get_dataset(namespace, dataset_name, project_dir)
return True
except ValueError:
return False


def remove_dataset_from_cache(
namespace: str, dataset_name: str, project_dir: Optional[Union[str, pathlib.Path]] = None
) -> None:
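`has_dataset` exists mainly to support an idempotent load-if-missing pattern, which the notebook later in this PR uses; this condenses that cell:

```python
import lilac as ll

ll.set_project_dir('./data')

# Only import from HuggingFace if the dataset is not already in the project.
if not ll.has_dataset('local', 'glaive-function-calling-v2'):
  ll.from_huggingface(
    'glaiveai/glaive-function-calling-v2', 'local', 'glaive-function-calling-v2'
  )

ds = ll.get_dataset('local', 'glaive-function-calling-v2')
```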
27 changes: 27 additions & 0 deletions lilac_hf_space.yml
@@ -54,6 +54,26 @@ datasets:
path:
- answer

- name: glaive-function-calling-v2
namespace: lilac
source:
dataset_name: lilacai/glaive-function-calling-v2-sharegpt
source_name: huggingface
settings:
tags: [datasets]
ui:
media_paths:
- - conversations
- '*'
- value
markdown_paths: []
embeddings:
- embedding: gte-small
path:
- conversations
- '*'
- value

- name: open-assistant-conversations-2
namespace: lilac
settings:
@@ -196,6 +216,13 @@ clusters:
dataset_name: glaive-code-assistant
input_path:
- question
- dataset_namespace: lilac
dataset_name: glaive-function-calling-v2
input_selector:
format: sharegpt
selector: user
output_path:
- conversation_clusters
- dataset_namespace: lilac
dataset_name: open-assistant-conversations-2
input_path:
162 changes: 162 additions & 0 deletions notebooks/GlaiveToShareGPT.ipynb
@@ -0,0 +1,162 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Convert the unformatted Glaive dataset to ShareGPT\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The autoreload extension is already loaded. To reload it, use:\n",
" %reload_ext autoreload\n"
]
}
],
"source": [
"%load_ext autoreload\n",
"%autoreload 2\n",
"import lilac as ll\n",
"\n",
"ll.set_project_dir('./data')\n",
"\n",
"if not ll.has_dataset('local', 'glaive-function-calling-v2'):\n",
" ll.from_huggingface(\n",
" 'glaiveai/glaive-function-calling-v2',\n",
" 'local',\n",
" 'glaive-function-calling-v2',\n",
" )\n",
"\n",
"ds = ll.get_dataset('local', 'glaive-function-calling-v2')"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[local/glaive-function-calling-v2][1 shards] map \"_parse_chat\" to \"('conversations',)\": 0%| | 0/112960 [00:00<?, ?it/s]"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[local/glaive-function-calling-v2][1 shards] map \"_parse_chat\" to \"('conversations',)\": 100%|██████████| 112960/112960 [00:07<00:00, 15708.45it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Wrote map output to conversations-00000-of-00001.parquet\n"
]
}
],
"source": [
"import re\n",
"\n",
"ROLES = ['USER', 'ASSISTANT', 'FUNCTION RESPONSE']\n",
"\n",
"# The split regex is a role, plus semicolon and space. For example\n",
"# \"USER: \" or \"FUNCTION RESPONSE: \".\n",
"split_re = re.compile(r'({}): '.format('|'.join(ROLES)))\n",
"\n",
"\n",
"def _parse_chat(row: dict):\n",
" system_prompt = row.get('system')\n",
"\n",
" chat = row['chat']\n",
" # Split chat by split_res, and remove empty strings.\n",
" chats = [s.strip() for s in split_re.split(chat) if s]\n",
"\n",
" # results look like:\n",
" # ['USER', 'Can you book a flight for me from New York to London?', 'ASSISTANT', '...']\n",
" # We now want it to be a dictionary of {'from': 'user', 'value': 'Can you book a flight...'}\n",
" chats = [{'from': role.lower(), 'value': value} for role, value in zip(chats[::2], chats[1::2])]\n",
"\n",
" if system_prompt:\n",
" chats = [{'from': 'system', 'value': system_prompt}] + chats\n",
"\n",
" return chats\n",
"\n",
"\n",
"res = ds.map(_parse_chat, output_path='conversations', overwrite=True)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'datasets.arrow_dataset.Dataset'>\n",
"Dataset({\n",
" features: ['chat', 'system', '__hfsplit__', 'chat__cluster', 'conversations'],\n",
" num_rows: 112960\n",
"})\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Creating parquet from Arrow format: 100%|██████████| 57/57 [00:00<00:00, 90.73ba/s]\n",
"Creating parquet from Arrow format: 100%|██████████| 57/57 [00:00<00:00, 91.67ba/s]s/it]\n",
"Pushing dataset shards to the dataset hub: 100%|██████████| 2/2 [00:21<00:00, 10.92s/it]\n"
]
}
],
"source": [
"hf_ds = ds.to_huggingface()\n",
"\n",
"print(hf_ds)\n",
"\n",
"hf_ds.push_to_hub('lilacai/glaive-function-calling-v2-sharegpt')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
6 changes: 5 additions & 1 deletion run_server_dev.sh
Original file line number Diff line number Diff line change
@@ -11,7 +11,7 @@ npm run dev --workspace web/blueprint -- --open &
pid[2]=$!

# Run the FastAPI server.
export LILAC_PROJECT_DIR='./data'
if [ "$1" ]; then
export LILAC_PROJECT_DIR="$1"
else
export LILAC_PROJECT_DIR="./data"
fi
poetry run uvicorn lilac.server:app --reload --port 5432 --host 0.0.0.0 \
--reload-dir lilac &
pid[1]=$!
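With this change the dev server script accepts an optional project directory argument, e.g. `./run_server_dev.sh ./my_project` (path illustrative), and falls back to `./data` when no argument is given.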