From a518a8e0c18f1bb2b809667ff72f8367452ed442 Mon Sep 17 00:00:00 2001 From: Nikhil Thorat Date: Mon, 22 Jan 2024 17:47:01 -0500 Subject: [PATCH 1/6] Support glaive-function-calling-v2 in the demo, clusters, and via sharegpt. --- lilac/__init__.py | 3 +- lilac/data/dataset.py | 20 ++++ lilac/data/dataset_duckdb.py | 24 +++++ lilac/db_manager.py | 15 +++ lilac_hf_space.yml | 27 ++++++ notebooks/GlaiveToShareGPT.ipynb | 162 +++++++++++++++++++++++++++++++ run_server_dev.sh | 6 +- 7 files changed, 255 insertions(+), 2 deletions(-) create mode 100644 notebooks/GlaiveToShareGPT.ipynb diff --git a/lilac/__init__.py b/lilac/__init__.py index 2c7604994..776126d86 100644 --- a/lilac/__init__.py +++ b/lilac/__init__.py @@ -12,7 +12,7 @@ from .data import * # noqa: F403 from .data.dataset_duckdb import DatasetDuckDB from .data.dataset_storage_utils import download, upload -from .db_manager import get_dataset, list_datasets, set_default_dataset_cls +from .db_manager import get_dataset, has_dataset, list_datasets, set_default_dataset_cls from .deploy import deploy_config, deploy_project from .embeddings import * # noqa: F403 from .env import * # noqa: F403 @@ -53,6 +53,7 @@ 'from_dicts', 'from_huggingface', 'get_dataset', + 'has_dataset', 'list_datasets', 'init', 'span', diff --git a/lilac/data/dataset.py b/lilac/data/dataset.py index a4fff42ac..2e774c879 100644 --- a/lilac/data/dataset.py +++ b/lilac/data/dataset.py @@ -977,6 +977,26 @@ def transform( schema=schema, ) + @abc.abstractmethod + def to_huggingface( + self, + columns: Optional[Sequence[ColumnId]] = None, + filters: Optional[Sequence[FilterLike]] = None, + include_labels: Optional[Sequence[str]] = None, + exclude_labels: Optional[Sequence[str]] = None, + include_deleted: bool = False, + ) -> HuggingFaceDataset: + """Export the dataset to a huggingface dataset. + + Args: + columns: The columns to export. + filters: The filters to apply to the query. + include_labels: The labels to include in the export. + exclude_labels: The labels to exclude in the export. + include_deleted: Whether to include deleted rows in the export. 
+ """ + pass + @abc.abstractmethod def to_json( self, diff --git a/lilac/data/dataset_duckdb.py b/lilac/data/dataset_duckdb.py index 50c492f17..1326890c2 100644 --- a/lilac/data/dataset_duckdb.py +++ b/lilac/data/dataset_duckdb.py @@ -26,6 +26,7 @@ import orjson import pandas as pd import yaml +from datasets import Dataset as HuggingFaceDataset from pandas.api.types import is_object_dtype from pydantic import BaseModel, SerializeAsAny, field_validator from typing_extensions import override @@ -3174,6 +3175,29 @@ def cluster( self, input, output_path, min_cluster_size, topic_fn, overwrite, use_garden, task_id=task_id ) + @override + def to_huggingface( + self, + columns: Optional[Sequence[ColumnId]] = None, + filters: Optional[Sequence[FilterLike]] = None, + include_labels: Optional[Sequence[str]] = None, + exclude_labels: Optional[Sequence[str]] = None, + include_deleted: bool = False, + ) -> HuggingFaceDataset: + filters, _ = self._normalize_filters( + filter_likes=filters, col_aliases={}, udf_aliases={}, manifest=self.manifest() + ) + filters.extend(self._compile_include_exclude_filters(include_labels, exclude_labels)) + rows = self.select_rows( + columns, filters=filters, combine_columns=True, include_deleted=include_deleted + ) + + def _gen() -> Iterator[Item]: + for row in rows: + yield row + + return cast(HuggingFaceDataset, HuggingFaceDataset.from_generator(_gen)) + @override def to_json( self, diff --git a/lilac/db_manager.py b/lilac/db_manager.py index e1dbbdd69..e020cd11e 100644 --- a/lilac/db_manager.py +++ b/lilac/db_manager.py @@ -47,6 +47,21 @@ def get_dataset( return _CACHED_DATASETS[cache_key] +def has_dataset( + namespace: str, dataset_name: str, project_dir: Optional[Union[str, pathlib.Path]] = None +) -> bool: + """Get the dataset instance.""" + if not _DEFAULT_DATASET_CLS: + raise ValueError('Default dataset class not set.') + + project_dir = project_dir or get_project_dir() + try: + get_dataset(namespace, dataset_name, project_dir) + return True + except ValueError: + return False + + def remove_dataset_from_cache( namespace: str, dataset_name: str, project_dir: Optional[Union[str, pathlib.Path]] = None ) -> None: diff --git a/lilac_hf_space.yml b/lilac_hf_space.yml index 78a3c1ce9..a66702088 100644 --- a/lilac_hf_space.yml +++ b/lilac_hf_space.yml @@ -54,6 +54,26 @@ datasets: path: - answer + - name: glaive-function-calling-v2 + namespace: lilac + source: + dataset_name: lilacai/glaive-function-calling-v2-sharegpt + source_name: huggingface + settings: + tags: [datasets] + ui: + media_paths: + - - conversations + - '*' + - value + markdown_paths: [] + embeddings: + - embedding: gte-small + path: + - conversations + - '*' + - value + - name: open-assistant-conversations-2 namespace: lilac settings: @@ -196,6 +216,13 @@ clusters: dataset_name: glaive-code-assistant input_path: - question + - dataset_namespace: lilac + dataset_name: glaive-function-calling-v2 + input_selector: + format: sharegpt + selector: user + output_path: + - conversation_clusters - dataset_namespace: lilac dataset_name: open-assistant-conversations-2 input_path: diff --git a/notebooks/GlaiveToShareGPT.ipynb b/notebooks/GlaiveToShareGPT.ipynb new file mode 100644 index 000000000..887248340 --- /dev/null +++ b/notebooks/GlaiveToShareGPT.ipynb @@ -0,0 +1,162 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Convert the unformatted Glaive dataset to ShareGPT\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": 
"stdout", + "output_type": "stream", + "text": [ + "The autoreload extension is already loaded. To reload it, use:\n", + " %reload_ext autoreload\n" + ] + } + ], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "import lilac as ll\n", + "\n", + "ll.set_project_dir('./data')\n", + "\n", + "if not ll.has_dataset('local', 'glaive-function-calling-v2'):\n", + " ll.from_huggingface(\n", + " 'glaiveai/glaive-function-calling-v2',\n", + " 'local',\n", + " 'glaive-function-calling-v2',\n", + " )\n", + "\n", + "ds = ll.get_dataset('local', 'glaive-function-calling-v2')" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[local/glaive-function-calling-v2][1 shards] map \"_parse_chat\" to \"('conversations',)\": 0%| | 0/112960 [00:00\n", + "Dataset({\n", + " features: ['chat', 'system', '__hfsplit__', 'chat__cluster', 'conversations'],\n", + " num_rows: 112960\n", + "})\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Creating parquet from Arrow format: 100%|██████████| 57/57 [00:00<00:00, 90.73ba/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 57/57 [00:00<00:00, 91.67ba/s]s/it]\n", + "Pushing dataset shards to the dataset hub: 100%|██████████| 2/2 [00:21<00:00, 10.92s/it]\n" + ] + } + ], + "source": [ + "hf_ds = ds.to_huggingface()\n", + "\n", + "print(hf_ds)\n", + "\n", + "hf_ds.push_to_hub('lilacai/glaive-function-calling-v2-sharegpt')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/run_server_dev.sh b/run_server_dev.sh index ed99841ae..b6e724def 100755 --- a/run_server_dev.sh +++ b/run_server_dev.sh @@ -11,7 +11,11 @@ npm run dev --workspace web/blueprint -- --open & pid[2]=$! # Run the FastAPI server. -export LILAC_PROJECT_DIR='./data' +if [ "$1" ]; then + export LILAC_PROJECT_DIR="$1" +else + export LILAC_PROJECT_DIR="./data" +fi poetry run uvicorn lilac.server:app --reload --port 5432 --host 0.0.0.0 \ --reload-dir lilac & pid[1]=$! 
From 91d8916c1dc216a2f21fa4cdb56514c9bdf229f9 Mon Sep 17 00:00:00 2001
From: Nikhil Thorat
Date: Mon, 22 Jan 2024 18:09:30 -0500
Subject: [PATCH 2/6] save

---
 lilac/data/dataset.py             |  1 +
 lilac/data/dataset_duckdb.py      |  1 +
 lilac/data/dataset_export_test.py | 35 +++++++++++++++++++++++++++++++
 3 files changed, 37 insertions(+)

diff --git a/lilac/data/dataset.py b/lilac/data/dataset.py
index 2e774c879..55fb2da24 100644
--- a/lilac/data/dataset.py
+++ b/lilac/data/dataset.py
@@ -10,6 +10,7 @@
 import numpy as np
 import pandas as pd
+from datasets import Dataset as HuggingFaceDataset
 from pydantic import (
   BaseModel,
   ConfigDict,
diff --git a/lilac/data/dataset_duckdb.py b/lilac/data/dataset_duckdb.py
index 1326890c2..b52b5813a 100644
--- a/lilac/data/dataset_duckdb.py
+++ b/lilac/data/dataset_duckdb.py
@@ -3188,6 +3188,7 @@ def to_huggingface(
       filter_likes=filters, col_aliases={}, udf_aliases={}, manifest=self.manifest()
     )
     filters.extend(self._compile_include_exclude_filters(include_labels, exclude_labels))
+    print('filters=', filters)
     rows = self.select_rows(
       columns, filters=filters, combine_columns=True, include_deleted=include_deleted
     )
diff --git a/lilac/data/dataset_export_test.py b/lilac/data/dataset_export_test.py
index 9f0cda6d7..aeb9c77f5 100644
--- a/lilac/data/dataset_export_test.py
+++ b/lilac/data/dataset_export_test.py
@@ -40,6 +40,41 @@ def setup_teardown() -> Iterable[None]:
   clear_signal_registry()  # Teardown.
 
 
+def test_export_to_huggingface(make_test_data: TestDataMaker, tmp_path: pathlib.Path) -> None:
+  dataset = make_test_data([{'text': 'hello'}, {'text': 'everybody'}])
+  dataset.compute_signal(TestSignal(), 'text')
+
+  hf_dataset = dataset.to_huggingface()
+
+  assert list(hf_dataset) == [
+    {'text': {VALUE_KEY: 'hello', 'test_signal': {'flen': 5.0, 'len': 5}}},
+    {'text': {VALUE_KEY: 'everybody', 'test_signal': {'flen': 9.0, 'len': 9}}},
+  ]
+
+
+def test_export_to_huggingface_filters(
+  make_test_data: TestDataMaker, tmp_path: pathlib.Path
+) -> None:
+  dataset = make_test_data([{'text': 'hello'}, {'text': 'everybody'}])
+  dataset.compute_signal(TestSignal(), 'text')
+
+  # Export a subset of columns with a filter.
+ hf_dataset = dataset.to_huggingface( + columns=['text', 'text.test_signal.flen'], + filters=[('text.test_signal.len', 'greater', 6)], + ) + + assert list(hf_dataset) == [ + {'text': {VALUE_KEY: 'everybody', 'test_signal': {'flen': 9.0, 'len': 9}}} + ] + + hf_dataset = dataset.to_huggingface(filters=[('text.test_signal.flen', 'less_equal', '5')]) + + assert list(hf_dataset) == [ + {'text': {VALUE_KEY: 'hello', 'test_signal': {'flen': 5.0, 'len': 5}}} + ] + + def test_export_to_json(make_test_data: TestDataMaker, tmp_path: pathlib.Path) -> None: dataset = make_test_data([{'text': 'hello'}, {'text': 'everybody'}]) dataset.compute_signal(TestSignal(), 'text') From db9fd195d06a3602830e32ff7487b432391e2fe2 Mon Sep 17 00:00:00 2001 From: Nikhil Thorat Date: Tue, 23 Jan 2024 10:22:20 -0500 Subject: [PATCH 3/6] save --- README.md | 2 +- docs/welcome.md | 6 +- lilac/data/dataset_duckdb.py | 1 - lilac/data/dataset_storage_utils.py | 4 +- lilac/db_manager.py | 3 + lilac/formats/sharegpt.py | 8 +- lilac_hf_space.yml | 25 ---- notebooks/GlaiveToShareGPT.ipynb | 109 +++++++++++++----- .../components/HuggingFaceSpaceWelcome.svelte | 2 +- web/blueprint/src/lib/view_utils.ts | 3 +- 10 files changed, 97 insertions(+), 66 deletions(-) diff --git a/README.md b/README.md index f23ea4372..52103dd9e 100644 --- a/README.md +++ b/README.md @@ -121,7 +121,7 @@ If you prefer, you can load datasets directly from the UI without writing any Py ### 🔎 Explore -> [🔗 Try OpenOrca-100K before installing!](https://lilacai-lilac.hf.space/datasets#lilac/OpenOrca-100k) +> [🔗 Try OpenOrca before installing!](https://lilacai-lilac.hf.space/datasets#lilac/OpenOrca) Once we've loaded a dataset, we can explore it from the UI and get a sense for what's in the data. More documentation [here](https://docs.lilacml.com/datasets/dataset_explore.html). diff --git a/docs/welcome.md b/docs/welcome.md index e97cead74..e2c0a902e 100644 --- a/docs/welcome.md +++ b/docs/welcome.md @@ -14,9 +14,9 @@ Lilac is an open-source tool that enables data and AI practitioners improve their products by improving their data. -[Try Lilac on HuggingFace Spaces](https://lilacai-lilac.hf.space/datasets#lilac/OpenOrca-100k), -where we've preloaded popular datasets like OpenOrca. Try a semantic search for "As a language -model" on the OpenOrca dataset! +[Try Lilac on HuggingFace Spaces](https://lilacai-lilac.hf.space/datasets#lilac/OpenOrca), where +we've preloaded popular datasets like OpenOrca. Try a semantic search for "As a language model" on +the OpenOrca dataset! ## Why use Lilac? diff --git a/lilac/data/dataset_duckdb.py b/lilac/data/dataset_duckdb.py index b52b5813a..1326890c2 100644 --- a/lilac/data/dataset_duckdb.py +++ b/lilac/data/dataset_duckdb.py @@ -3188,7 +3188,6 @@ def to_huggingface( filter_likes=filters, col_aliases={}, udf_aliases={}, manifest=self.manifest() ) filters.extend(self._compile_include_exclude_filters(include_labels, exclude_labels)) - print('filters=', filters) rows = self.select_rows( columns, filters=filters, combine_columns=True, include_deleted=include_deleted ) diff --git a/lilac/data/dataset_storage_utils.py b/lilac/data/dataset_storage_utils.py index 21de0a101..28e00c573 100644 --- a/lilac/data/dataset_storage_utils.py +++ b/lilac/data/dataset_storage_utils.py @@ -34,8 +34,8 @@ def download( Args: url_or_repo: A remote URL to a Lilac-processed dataset. Currently only supports HuggingFace - dataset URLs. 
Can be a full URL: https://huggingface.co/datasets/lilacai/lilac-OpenOrca-100k
-      or a repo_id: lilacai/lilac-OpenOrca-100k.
+      dataset URLs. Can be a full URL: https://huggingface.co/datasets/lilacai/lilac-OpenOrca
+      or a repo_id: lilacai/lilac-OpenOrca.
     project_dir: The project directory to use for the demo. Defaults to `env.LILAC_PROJECT_DIR`
       which can be set with `ll.set_project_dir()`.
     dataset_namespace: The local namespace to use. Defaults to 'local'.
diff --git a/lilac/db_manager.py b/lilac/db_manager.py
index e020cd11e..6af8f1159 100644
--- a/lilac/db_manager.py
+++ b/lilac/db_manager.py
@@ -56,6 +56,9 @@ def has_dataset(
 
   project_dir = project_dir or get_project_dir()
   try:
+    # Loading the dataset is the existence check: if the dataset is not already cached,
+    # get_dataset calls the dataset constructor, which raises a ValueError when the dataset
+    # does not exist on disk.
     get_dataset(namespace, dataset_name, project_dir)
     return True
   except ValueError:
diff --git a/lilac/formats/sharegpt.py b/lilac/formats/sharegpt.py
index ec545da98..75c9f2e91 100644
--- a/lilac/formats/sharegpt.py
+++ b/lilac/formats/sharegpt.py
@@ -24,6 +24,10 @@ def _sharegpt_selector(item: Item, conv_from: str) -> str:
   name='human',
   selector=lambda item: _sharegpt_selector(item, 'human'),
 )
+_TOOL_SELECTOR = DatasetFormatInputSelector(
+  name='tool',
+  selector=lambda item: _sharegpt_selector(item, 'tool'),
+)
 _GPT_SELECTOR = DatasetFormatInputSelector(
   name='gpt',
   selector=lambda item: _sharegpt_selector(item, 'gpt'),
@@ -50,8 +54,10 @@ class ShareGPT(DatasetFormat):
 
   system: ClassVar[DatasetFormatInputSelector] = _SYSTEM_SELECTOR
   human: ClassVar[DatasetFormatInputSelector] = _HUMAN_SELECTOR
+  tool: ClassVar[DatasetFormatInputSelector] = _TOOL_SELECTOR
   gpt: ClassVar[DatasetFormatInputSelector] = _GPT_SELECTOR
 
   input_selectors: ClassVar[dict[str, DatasetFormatInputSelector]] = {
-    selector.name: selector for selector in [_SYSTEM_SELECTOR, _HUMAN_SELECTOR, _GPT_SELECTOR]
+    selector.name: selector
+    for selector in [_SYSTEM_SELECTOR, _HUMAN_SELECTOR, _GPT_SELECTOR, _TOOL_SELECTOR]
   }
diff --git a/lilac_hf_space.yml b/lilac_hf_space.yml
index a66702088..22c3da0ff 100644
--- a/lilac_hf_space.yml
+++ b/lilac_hf_space.yml
@@ -158,27 +158,6 @@ datasets:
         path:
           - question
 
-  - name: 'OpenOrca-100k'
-    namespace: lilac
-    settings:
-      tags: [machine-learning]
-      ui:
-        media_paths:
-          - question
-          - response
-      preferred_embedding: 'gte-small'
-    source:
-      source_name: huggingface
-      dataset_name: Open-Orca/OpenOrca
-      sample_size: 100000
-    embeddings:
-      - embedding: gte-small
-        path:
-          - question
-      - embedding: gte-small
-        path:
-          - response
-
   - namespace: lilac
     name: dolphin
     tags: [datasets]
@@ -242,10 +221,6 @@ clusters:
     dataset_name: databricks-dolly-15k-curated-en
     input_path:
       - original-instruction
-  - dataset_namespace: lilac
-    dataset_name: OpenOrca-100k
-    input_path:
-      - question
   - dataset_namespace: lilac
     dataset_name: dolphin
     input_path:
diff --git a/notebooks/GlaiveToShareGPT.ipynb b/notebooks/GlaiveToShareGPT.ipynb
index 887248340..6298e532d 100644
--- a/notebooks/GlaiveToShareGPT.ipynb
+++ b/notebooks/GlaiveToShareGPT.ipynb
@@ -9,15 +9,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [
     {
-     "name": "stdout",
+     "name": "stderr",
      "output_type": "stream",
      "text": [
-      "The autoreload extension is already loaded. 
To reload it, use:\n", - " %reload_ext autoreload\n" + "/Users/nikhil/Code/lilac/.venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" ] } ], @@ -40,21 +40,14 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "[local/glaive-function-calling-v2][1 shards] map \"_parse_chat\" to \"('conversations',)\": 0%| | 0/112960 [00:00\n", - "Dataset({\n", - " features: ['chat', 'system', '__hfsplit__', 'chat__cluster', 'conversations'],\n", - " num_rows: 112960\n", - "})\n" + "{'conversations': [{'from': 'system',\n", + " 'value': 'You are a helpful assistant with access to the '\n", + " 'following functions. Use them if required -\\n'\n", + " '{\\n'\n", + " ' \"name\": \"calculate_median\",\\n'\n", + " ' \"description\": \"Calculate the median of a '\n", + " 'list of numbers\",\\n'\n", + " ' \"parameters\": {\\n'\n", + " ' \"type\": \"object\",\\n'\n", + " ' \"properties\": {\\n'\n", + " ' \"numbers\": {\\n'\n", + " ' \"type\": \"array\",\\n'\n", + " ' \"items\": {\\n'\n", + " ' \"type\": \"number\"\\n'\n", + " ' },\\n'\n", + " ' \"description\": \"The list of '\n", + " 'numbers\"\\n'\n", + " ' }\\n'\n", + " ' },\\n'\n", + " ' \"required\": [\\n'\n", + " ' \"numbers\"\\n'\n", + " ' ]\\n'\n", + " ' }\\n'\n", + " '}\\n'},\n", + " {'from': 'human',\n", + " 'value': 'Hi, I have a list of numbers and I need to find '\n", + " 'the median. The numbers are 5, 2, 9, 1, 7, 4, 6, '\n", + " '3, 8.'},\n", + " {'from': 'gpt',\n", + " 'value': ' {\"name\": \"calculate_median\", '\n", + " '\"arguments\": \\'{\"numbers\": [5, 2, 9, 1, 7, 4, 6, '\n", + " \"3, 8]}'} <|endoftext|>\"},\n", + " {'from': 'tool', 'value': '{\"median\": 5}'},\n", + " {'from': 'gpt',\n", + " 'value': 'The median of your list of numbers is 5. 
'\n", + " '<|endoftext|>'}]}\n" ] - }, + } + ], + "source": [ + "import pprint\n", + "\n", + "pprint.pprint(next(ds.select_rows(['conversations'], limit=1)))" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "Creating parquet from Arrow format: 100%|██████████| 57/57 [00:00<00:00, 90.73ba/s]\n", - "Creating parquet from Arrow format: 100%|██████████| 57/57 [00:00<00:00, 91.67ba/s]s/it]\n", - "Pushing dataset shards to the dataset hub: 100%|██████████| 2/2 [00:21<00:00, 10.92s/it]\n" + "Generating train split: 112960 examples [00:03, 36913.50 examples/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 57/57 [00:00<00:00, 89.98ba/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 57/57 [00:00<00:00, 85.01ba/s]s/it]\n", + "Pushing dataset shards to the dataset hub: 100%|██████████| 2/2 [00:10<00:00, 5.15s/it]\n", + "Deleting unused files from dataset repository: 100%|██████████| 2/2 [00:01<00:00, 1.33it/s]\n", + "Downloading metadata: 100%|██████████| 2.83k/2.83k [00:00<00:00, 13.0MB/s]\n" ] } ], "source": [ "hf_ds = ds.to_huggingface()\n", - "\n", - "print(hf_ds)\n", - "\n", "hf_ds.push_to_hub('lilacai/glaive-function-calling-v2-sharegpt')" ] }, diff --git a/web/blueprint/src/lib/components/HuggingFaceSpaceWelcome.svelte b/web/blueprint/src/lib/components/HuggingFaceSpaceWelcome.svelte index 403b595b9..764423ab2 100644 --- a/web/blueprint/src/lib/components/HuggingFaceSpaceWelcome.svelte +++ b/web/blueprint/src/lib/components/HuggingFaceSpaceWelcome.svelte @@ -9,7 +9,7 @@ const tryDataset = { namespace: 'lilac', - name: 'OpenOrca-100k', + name: 'OpenOrca', displayName: 'OpenOrca', originalLink: 'https://huggingface.co/datasets/Open-Orca/OpenOrca' }; diff --git a/web/blueprint/src/lib/view_utils.ts b/web/blueprint/src/lib/view_utils.ts index de742eeef..66f1b2355 100644 --- a/web/blueprint/src/lib/view_utils.ts +++ b/web/blueprint/src/lib/view_utils.ts @@ -217,8 +217,7 @@ export function getTaggedDatasets( }); const namespaceSortPriorities = ['lilac']; - // TODO(nsthorat): Don't hard-code this. Let's make this a config. - const pinnedDatasets = ['OpenOrca-100k']; + const pinnedDatasets: string[] = []; // Sort each tag by namespace and then dataset name. 
const taggedDatasetGroups: NavigationTagGroup[] = []; From f1861d9e706c401c6a822a028c9c0e59b5b60594 Mon Sep 17 00:00:00 2001 From: Nikhil Thorat Date: Tue, 23 Jan 2024 10:36:21 -0500 Subject: [PATCH 4/6] save --- lilac_hf_space.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lilac_hf_space.yml b/lilac_hf_space.yml index 22c3da0ff..f86e557ed 100644 --- a/lilac_hf_space.yml +++ b/lilac_hf_space.yml @@ -199,7 +199,7 @@ clusters: dataset_name: glaive-function-calling-v2 input_selector: format: sharegpt - selector: user + selector: human output_path: - conversation_clusters - dataset_namespace: lilac From a3378f35390d5e253780356afc3ff70dbc17f620 Mon Sep 17 00:00:00 2001 From: Nikhil Thorat Date: Tue, 23 Jan 2024 11:27:03 -0500 Subject: [PATCH 5/6] save --- lilac_hf_space.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lilac_hf_space.yml b/lilac_hf_space.yml index f86e557ed..b0a9acada 100644 --- a/lilac_hf_space.yml +++ b/lilac_hf_space.yml @@ -100,7 +100,7 @@ datasets: - content source: source_name: huggingface - dataset_name: OpenAssistant/oasst2 + dataset_name: lmsys/lmsys-chat-1m embeddings: - embedding: gte-small path: From 6894032644ff5cc394fd850213f0ef8aa73621db Mon Sep 17 00:00:00 2001 From: Nikhil Thorat Date: Tue, 23 Jan 2024 11:28:15 -0500 Subject: [PATCH 6/6] save --- notebooks/GlaiveToShareGPT.ipynb | 7 ------- 1 file changed, 7 deletions(-) diff --git a/notebooks/GlaiveToShareGPT.ipynb b/notebooks/GlaiveToShareGPT.ipynb index 6298e532d..d01bc85ae 100644 --- a/notebooks/GlaiveToShareGPT.ipynb +++ b/notebooks/GlaiveToShareGPT.ipynb @@ -178,13 +178,6 @@ "hf_ds = ds.to_huggingface()\n", "hf_ds.push_to_hub('lilacai/glaive-function-calling-v2-sharegpt')" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": {