diff --git a/README.md b/README.md
index f23ea4372..52103dd9e 100644
--- a/README.md
+++ b/README.md
@@ -121,7 +121,7 @@ If you prefer, you can load datasets directly from the UI without writing any Py
 
 ### 🔎 Explore
 
-> [🔗 Try OpenOrca-100K before installing!](https://lilacai-lilac.hf.space/datasets#lilac/OpenOrca-100k)
+> [🔗 Try OpenOrca before installing!](https://lilacai-lilac.hf.space/datasets#lilac/OpenOrca)
 
 Once we've loaded a dataset, we can explore it from the UI and get a sense for what's in the data.
 More documentation [here](https://docs.lilacml.com/datasets/dataset_explore.html).
diff --git a/docs/welcome.md b/docs/welcome.md
index e97cead74..e2c0a902e 100644
--- a/docs/welcome.md
+++ b/docs/welcome.md
@@ -14,9 +14,9 @@
 
 Lilac is an open-source tool that enables data and AI practitioners to improve their products by
 improving their data.
 
-[Try Lilac on HuggingFace Spaces](https://lilacai-lilac.hf.space/datasets#lilac/OpenOrca-100k),
-where we've preloaded popular datasets like OpenOrca. Try a semantic search for "As a language
-model" on the OpenOrca dataset!
+[Try Lilac on HuggingFace Spaces](https://lilacai-lilac.hf.space/datasets#lilac/OpenOrca), where
+we've preloaded popular datasets like OpenOrca. Try a semantic search for "As a language model" on
+the OpenOrca dataset!
 
 ## Why use Lilac?
diff --git a/lilac/__init__.py b/lilac/__init__.py
index 2c7604994..776126d86 100644
--- a/lilac/__init__.py
+++ b/lilac/__init__.py
@@ -12,7 +12,7 @@
 from .data import *  # noqa: F403
 from .data.dataset_duckdb import DatasetDuckDB
 from .data.dataset_storage_utils import download, upload
-from .db_manager import get_dataset, list_datasets, set_default_dataset_cls
+from .db_manager import get_dataset, has_dataset, list_datasets, set_default_dataset_cls
 from .deploy import deploy_config, deploy_project
 from .embeddings import *  # noqa: F403
 from .env import *  # noqa: F403
@@ -53,6 +53,7 @@
   'from_dicts',
   'from_huggingface',
   'get_dataset',
+  'has_dataset',
   'list_datasets',
   'init',
   'span',
diff --git a/lilac/data/dataset.py b/lilac/data/dataset.py
index a4fff42ac..55fb2da24 100644
--- a/lilac/data/dataset.py
+++ b/lilac/data/dataset.py
@@ -10,6 +10,7 @@
 
 import numpy as np
 import pandas as pd
+from datasets import Dataset as HuggingFaceDataset
 from pydantic import (
   BaseModel,
   ConfigDict,
@@ -977,6 +978,26 @@ def transform(
       schema=schema,
     )
 
+  @abc.abstractmethod
+  def to_huggingface(
+    self,
+    columns: Optional[Sequence[ColumnId]] = None,
+    filters: Optional[Sequence[FilterLike]] = None,
+    include_labels: Optional[Sequence[str]] = None,
+    exclude_labels: Optional[Sequence[str]] = None,
+    include_deleted: bool = False,
+  ) -> HuggingFaceDataset:
+    """Export the dataset to a huggingface dataset.
+
+    Args:
+      columns: The columns to export.
+      filters: The filters to apply to the query.
+      include_labels: The labels to include in the export.
+      exclude_labels: The labels to exclude from the export.
+      include_deleted: Whether to include deleted rows in the export.
+    """
+    pass
+
   @abc.abstractmethod
   def to_json(
     self,
diff --git a/lilac/data/dataset_duckdb.py b/lilac/data/dataset_duckdb.py
index 50c492f17..1326890c2 100644
--- a/lilac/data/dataset_duckdb.py
+++ b/lilac/data/dataset_duckdb.py
@@ -26,6 +26,7 @@
 import orjson
 import pandas as pd
 import yaml
+from datasets import Dataset as HuggingFaceDataset
 from pandas.api.types import is_object_dtype
 from pydantic import BaseModel, SerializeAsAny, field_validator
 from typing_extensions import override
@@ -3174,6 +3175,29 @@ def cluster(
       self, input, output_path, min_cluster_size, topic_fn, overwrite, use_garden, task_id=task_id
     )
 
+  @override
+  def to_huggingface(
+    self,
+    columns: Optional[Sequence[ColumnId]] = None,
+    filters: Optional[Sequence[FilterLike]] = None,
+    include_labels: Optional[Sequence[str]] = None,
+    exclude_labels: Optional[Sequence[str]] = None,
+    include_deleted: bool = False,
+  ) -> HuggingFaceDataset:
+    filters, _ = self._normalize_filters(
+      filter_likes=filters, col_aliases={}, udf_aliases={}, manifest=self.manifest()
+    )
+    filters.extend(self._compile_include_exclude_filters(include_labels, exclude_labels))
+    rows = self.select_rows(
+      columns, filters=filters, combine_columns=True, include_deleted=include_deleted
+    )
+
+    def _gen() -> Iterator[Item]:
+      for row in rows:
+        yield row
+
+    return cast(HuggingFaceDataset, HuggingFaceDataset.from_generator(_gen))
+
   @override
   def to_json(
     self,
diff --git a/lilac/data/dataset_export_test.py b/lilac/data/dataset_export_test.py
index 9f0cda6d7..aeb9c77f5 100644
--- a/lilac/data/dataset_export_test.py
+++ b/lilac/data/dataset_export_test.py
@@ -40,6 +40,41 @@ def setup_teardown() -> Iterable[None]:
   clear_signal_registry()  # Teardown.
 
 
+def test_export_to_huggingface(make_test_data: TestDataMaker, tmp_path: pathlib.Path) -> None:
+  dataset = make_test_data([{'text': 'hello'}, {'text': 'everybody'}])
+  dataset.compute_signal(TestSignal(), 'text')
+
+  hf_dataset = dataset.to_huggingface()
+
+  assert list(hf_dataset) == [
+    {'text': {VALUE_KEY: 'hello', 'test_signal': {'flen': 5.0, 'len': 5}}},
+    {'text': {VALUE_KEY: 'everybody', 'test_signal': {'flen': 9.0, 'len': 9}}},
+  ]
+
+
+def test_export_to_huggingface_filters(
+  make_test_data: TestDataMaker, tmp_path: pathlib.Path
+) -> None:
+  dataset = make_test_data([{'text': 'hello'}, {'text': 'everybody'}])
+  dataset.compute_signal(TestSignal(), 'text')
+
+  # Export a subset of columns, with a filter applied.
+  hf_dataset = dataset.to_huggingface(
+    columns=['text', 'text.test_signal.flen'],
+    filters=[('text.test_signal.len', 'greater', 6)],
+  )
+
+  assert list(hf_dataset) == [
+    {'text': {VALUE_KEY: 'everybody', 'test_signal': {'flen': 9.0, 'len': 9}}}
+  ]
+
+  hf_dataset = dataset.to_huggingface(filters=[('text.test_signal.flen', 'less_equal', '5')])
+
+  assert list(hf_dataset) == [
+    {'text': {VALUE_KEY: 'hello', 'test_signal': {'flen': 5.0, 'len': 5}}}
+  ]
+
+
 def test_export_to_json(make_test_data: TestDataMaker, tmp_path: pathlib.Path) -> None:
   dataset = make_test_data([{'text': 'hello'}, {'text': 'everybody'}])
   dataset.compute_signal(TestSignal(), 'text')
diff --git a/lilac/data/dataset_storage_utils.py b/lilac/data/dataset_storage_utils.py
index 21de0a101..28e00c573 100644
--- a/lilac/data/dataset_storage_utils.py
+++ b/lilac/data/dataset_storage_utils.py
@@ -34,8 +34,8 @@ def download(
 
   Args:
     url_or_repo: A remote URL to a Lilac-processed dataset. Currently only supports HuggingFace
-      dataset URLs. Can be a full URL: https://huggingface.co/datasets/lilacai/lilac-OpenOrca-100k
-      or a repo_id: lilacai/lilac-OpenOrca-100k.
+      dataset URLs. Can be a full URL: https://huggingface.co/datasets/lilacai/lilac-OpenOrca
+      or a repo_id: lilacai/lilac-OpenOrca.
     project_dir: The project directory to use for the demo. Defaults to `env.LILAC_PROJECT_DIR`
       which can be set with `ll.set_project_dir()`.
     dataset_namespace: The local namespace to use. Defaults to 'local'.
diff --git a/lilac/db_manager.py b/lilac/db_manager.py
index e1dbbdd69..6af8f1159 100644
--- a/lilac/db_manager.py
+++ b/lilac/db_manager.py
@@ -47,6 +47,24 @@ def get_dataset(
   return _CACHED_DATASETS[cache_key]
 
 
+def has_dataset(
+  namespace: str, dataset_name: str, project_dir: Optional[Union[str, pathlib.Path]] = None
+) -> bool:
+  """Check whether a dataset exists in the project."""
+  if not _DEFAULT_DATASET_CLS:
+    raise ValueError('Default dataset class not set.')
+
+  project_dir = project_dir or get_project_dir()
+  try:
+    # This will try to load the dataset and throw an error if it doesn't exist: when the
+    # dataset is not in the cache, get_dataset calls the dataset constructor, which errors
+    # if the dataset does not exist on disk.
+    get_dataset(namespace, dataset_name, project_dir)
+    return True
+  except ValueError:
+    return False
+
+
 def remove_dataset_from_cache(
   namespace: str, dataset_name: str, project_dir: Optional[Union[str, pathlib.Path]] = None
 ) -> None:
diff --git a/lilac/formats/sharegpt.py b/lilac/formats/sharegpt.py
index ec545da98..75c9f2e91 100644
--- a/lilac/formats/sharegpt.py
+++ b/lilac/formats/sharegpt.py
@@ -24,6 +24,10 @@ def _sharegpt_selector(item: Item, conv_from: str) -> str:
   name='human',
   selector=lambda item: _sharegpt_selector(item, 'human'),
 )
+_TOOL_SELECTOR = DatasetFormatInputSelector(
+  name='tool',
+  selector=lambda item: _sharegpt_selector(item, 'tool'),
+)
 _GPT_SELECTOR = DatasetFormatInputSelector(
   name='gpt',
   selector=lambda item: _sharegpt_selector(item, 'gpt'),
@@ -50,8 +54,10 @@ class ShareGPT(DatasetFormat):
 
   system: ClassVar[DatasetFormatInputSelector] = _SYSTEM_SELECTOR
   human: ClassVar[DatasetFormatInputSelector] = _HUMAN_SELECTOR
+  tool: ClassVar[DatasetFormatInputSelector] = _TOOL_SELECTOR
   gpt: ClassVar[DatasetFormatInputSelector] = _GPT_SELECTOR
 
   input_selectors: ClassVar[dict[str, DatasetFormatInputSelector]] = {
-    selector.name: selector for selector in [_SYSTEM_SELECTOR, _HUMAN_SELECTOR, _GPT_SELECTOR]
+    selector.name: selector
+    for selector in [_SYSTEM_SELECTOR, _HUMAN_SELECTOR, _GPT_SELECTOR, _TOOL_SELECTOR]
   }
diff --git a/lilac_hf_space.yml b/lilac_hf_space.yml
index 78a3c1ce9..b0a9acada 100644
--- a/lilac_hf_space.yml
+++ b/lilac_hf_space.yml
@@ -54,6 +54,26 @@
 datasets:
 
+  - name: glaive-function-calling-v2
+    namespace: lilac
+    source:
+      dataset_name: lilacai/glaive-function-calling-v2-sharegpt
+      source_name: huggingface
+    settings:
+      tags: [datasets]
+      ui:
+        media_paths:
+          - - conversations
+            - '*'
+            - value
+        markdown_paths: []
+    embeddings:
+      - embedding: gte-small
+        path:
+          - conversations
+          - '*'
+          - value
+
   - name: open-assistant-conversations-2
     namespace: lilac
     settings:
@@ -80,7 +100,7 @@
             - content
     source:
       source_name: huggingface
-      dataset_name: OpenAssistant/oasst2
+      dataset_name: lmsys/lmsys-chat-1m
     embeddings:
       - embedding: gte-small
        path:
@@ -138,27 +158,6 @@
         path:
           - question
 
-  - name: 'OpenOrca-100k'
-    namespace: lilac
-    settings:
-      tags: [machine-learning]
-      ui:
-        media_paths:
-          - question
-          - response
-      preferred_embedding: 'gte-small'
-    source:
-      source_name: huggingface
-      dataset_name: Open-Orca/OpenOrca
-      sample_size: 100000
-    embeddings:
-      - embedding: gte-small
-        path:
-          - question
-      - embedding: gte-small
-        path:
-          - response
-
   - namespace: lilac
     name: dolphin
     tags: [datasets]
@@ -196,6 +195,13 @@
     dataset_name: glaive-code-assistant
     input_path:
       - question
+  - dataset_namespace: lilac
+    dataset_name: glaive-function-calling-v2
+    input_selector:
+      format: sharegpt
+      selector: human
+    output_path:
+      - conversation_clusters
   - dataset_namespace: lilac
     dataset_name: open-assistant-conversations-2
     input_path:
@@ -215,10 +221,6 @@
     dataset_name: databricks-dolly-15k-curated-en
     input_path:
       - original-instruction
-  - dataset_namespace: lilac
-    dataset_name: OpenOrca-100k
-    input_path:
-      - question
   - dataset_namespace: lilac
     dataset_name: dolphin
     input_path:
diff --git a/notebooks/GlaiveToShareGPT.ipynb b/notebooks/GlaiveToShareGPT.ipynb
new file mode 100644
index 000000000..d01bc85ae
--- /dev/null
+++ b/notebooks/GlaiveToShareGPT.ipynb
@@ -0,0 +1,204 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Convert the unformatted Glaive dataset to ShareGPT\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/nikhil/Code/lilac/.venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n"
+     ]
+    }
+   ],
+   "source": [
+    "%load_ext autoreload\n",
+    "%autoreload 2\n",
+    "import lilac as ll\n",
+    "\n",
+    "ll.set_project_dir('./data')\n",
+    "\n",
+    "if not ll.has_dataset('local', 'glaive-function-calling-v2'):\n",
+    "  ll.from_huggingface(\n",
+    "    'glaiveai/glaive-function-calling-v2',\n",
+    "    'local',\n",
+    "    'glaive-function-calling-v2',\n",
+    "  )\n",
+    "\n",
+    "ds = ll.get_dataset('local', 'glaive-function-calling-v2')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[local/glaive-function-calling-v2][1 shards] map \"_parse_chat\" to \"('conversations',)\": 100%|██████████| 112960/112960 [00:06<00:00, 16609.62it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Wrote map output to conversations-00000-of-00001.parquet\n"
+     ]
+    }
+   ],
+   "source": [
+    "import re\n",
+    "\n",
+    "GLAIVE_ROLES = ['USER', 'ASSISTANT', 'FUNCTION RESPONSE']\n",
+    "GLAIVE_TO_SHAREGPT_ROLE = {\n",
+    "  'SYSTEM': 'system',\n",
+    "  'USER': 'human',\n",
+    "  'ASSISTANT': 'gpt',\n",
+    "  'FUNCTION RESPONSE': 'tool',\n",
+    "}\n",
+    "\n",
+    "\n",
+    "# The split regex is a role, plus a colon and space. For example\n",
+    "# \"USER: \" or \"FUNCTION RESPONSE: \".\n",
+    "split_re = re.compile(r'({}): '.format('|'.join(GLAIVE_ROLES)))\n",
+    "\n",
+    "\n",
+    "def _parse_chat(row: dict):\n",
+    "  system_prompt = row.get('system')\n",
+    "  # Remove \"SYSTEM: \" from the beginning of the prompt.\n",
+    "  if system_prompt:\n",
+    "    system_prompt = system_prompt.removeprefix('SYSTEM: ')\n",
+    "\n",
+    "  chat = row['chat']\n",
+    "  # Split chat by split_re, and remove empty strings.\n",
+    "  chats = [s.strip() for s in split_re.split(chat) if s]\n",
+    "\n",
+    "  # results look like:\n",
+    "  # ['USER', 'Can you book a flight for me from New York to London?', 'ASSISTANT', '...']\n",
+    "  # We now want a list of dicts like {'from': 'human', 'value': 'Can you book a flight...'}\n",
+    "  chats = [\n",
+    "    {'from': GLAIVE_TO_SHAREGPT_ROLE[role], 'value': value}\n",
+    "    for role, value in zip(chats[::2], chats[1::2])\n",
+    "  ]\n",
+    "\n",
+    "  if system_prompt:\n",
+    "    chats = [{'from': GLAIVE_TO_SHAREGPT_ROLE['SYSTEM'], 'value': system_prompt}] + chats\n",
+    "\n",
+    "  return chats\n",
+    "\n",
+    "\n",
+    "res = ds.map(_parse_chat, output_path='conversations', overwrite=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'conversations': [{'from': 'system',\n",
+      "                    'value': 'You are a helpful assistant with access to the '\n",
+      "                             'following functions. Use them if required -\\n'\n",
+      "                             '{\\n'\n",
+      "                             '    \"name\": \"calculate_median\",\\n'\n",
+      "                             '    \"description\": \"Calculate the median of a '\n",
+      "                             'list of numbers\",\\n'\n",
+      "                             '    \"parameters\": {\\n'\n",
+      "                             '        \"type\": \"object\",\\n'\n",
+      "                             '        \"properties\": {\\n'\n",
+      "                             '            \"numbers\": {\\n'\n",
+      "                             '                \"type\": \"array\",\\n'\n",
+      "                             '                \"items\": {\\n'\n",
+      "                             '                    \"type\": \"number\"\\n'\n",
+      "                             '                },\\n'\n",
+      "                             '                \"description\": \"The list of '\n",
+      "                             'numbers\"\\n'\n",
+      "                             '            }\\n'\n",
+      "                             '        },\\n'\n",
+      "                             '        \"required\": [\\n'\n",
+      "                             '            \"numbers\"\\n'\n",
+      "                             '        ]\\n'\n",
+      "                             '    }\\n'\n",
+      "                             '}\\n'},\n",
+      "                   {'from': 'human',\n",
+      "                    'value': 'Hi, I have a list of numbers and I need to find '\n",
+      "                             'the median. The numbers are 5, 2, 9, 1, 7, 4, 6, '\n",
+      "                             '3, 8.'},\n",
+      "                   {'from': 'gpt',\n",
+      "                    'value': '<functioncall> {\"name\": \"calculate_median\", '\n",
+      "                             '\"arguments\": \\'{\"numbers\": [5, 2, 9, 1, 7, 4, 6, '\n",
+      "                             \"3, 8]}'} <|endoftext|>\"},\n",
+      "                   {'from': 'tool', 'value': '{\"median\": 5}'},\n",
+      "                   {'from': 'gpt',\n",
+      "                    'value': 'The median of your list of numbers is 5. '\n",
+      "                             '<|endoftext|>'}]}\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pprint\n",
+    "\n",
+    "pprint.pprint(next(ds.select_rows(['conversations'], limit=1)))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Generating train split: 112960 examples [00:03, 36913.50 examples/s]\n",
+      "Creating parquet from Arrow format: 100%|██████████| 57/57 [00:00<00:00, 89.98ba/s]\n",
+      "Creating parquet from Arrow format: 100%|██████████| 57/57 [00:00<00:00, 85.01ba/s]s/it]\n",
+      "Pushing dataset shards to the dataset hub: 100%|██████████| 2/2 [00:10<00:00, 5.15s/it]\n",
+      "Deleting unused files from dataset repository: 100%|██████████| 2/2 [00:01<00:00, 1.33it/s]\n",
+      "Downloading metadata: 100%|██████████| 2.83k/2.83k [00:00<00:00, 13.0MB/s]\n"
+     ]
+    }
+   ],
+   "source": [
+    "hf_ds = ds.to_huggingface()\n",
+    "hf_ds.push_to_hub('lilacai/glaive-function-calling-v2-sharegpt')"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/run_server_dev.sh b/run_server_dev.sh
index ed99841ae..b6e724def 100755
--- a/run_server_dev.sh
+++ b/run_server_dev.sh
@@ -11,7 +11,11 @@ npm run dev --workspace web/blueprint -- --open &
 pid[2]=$!
 
 # Run the FastAPI server.
-export LILAC_PROJECT_DIR='./data'
+if [ "$1" ]; then
+  export LILAC_PROJECT_DIR="$1"
+else
+  export LILAC_PROJECT_DIR="./data"
+fi
 poetry run uvicorn lilac.server:app --reload --port 5432 --host 0.0.0.0 \
   --reload-dir lilac &
 pid[1]=$!
diff --git a/web/blueprint/src/lib/components/HuggingFaceSpaceWelcome.svelte b/web/blueprint/src/lib/components/HuggingFaceSpaceWelcome.svelte
index 403b595b9..764423ab2 100644
--- a/web/blueprint/src/lib/components/HuggingFaceSpaceWelcome.svelte
+++ b/web/blueprint/src/lib/components/HuggingFaceSpaceWelcome.svelte
@@ -9,7 +9,7 @@
 
   const tryDataset = {
     namespace: 'lilac',
-    name: 'OpenOrca-100k',
+    name: 'OpenOrca',
     displayName: 'OpenOrca',
     originalLink: 'https://huggingface.co/datasets/Open-Orca/OpenOrca'
   };
diff --git a/web/blueprint/src/lib/view_utils.ts b/web/blueprint/src/lib/view_utils.ts
index de742eeef..66f1b2355 100644
--- a/web/blueprint/src/lib/view_utils.ts
+++ b/web/blueprint/src/lib/view_utils.ts
@@ -217,8 +217,7 @@ export function getTaggedDatasets(
   });
 
   const namespaceSortPriorities = ['lilac'];
-  // TODO(nsthorat): Don't hard-code this. Let's make this a config.
-  const pinnedDatasets = ['OpenOrca-100k'];
+  const pinnedDatasets: string[] = [];
 
   // Sort each tag by namespace and then dataset name.
   const taggedDatasetGroups: NavigationTagGroup[] = [];
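As a usage sketch of the two APIs this diff introduces (`has_dataset` and `Dataset.to_huggingface`), here is a minimal end-to-end script, assuming a local Lilac project at `./data` as in the notebook above; the repo id passed to `push_to_hub` is a hypothetical placeholder:

```python
import lilac as ll

ll.set_project_dir('./data')

# has_dataset (added in lilac/db_manager.py) lets scripts skip re-ingestion on repeat runs.
if not ll.has_dataset('local', 'glaive-function-calling-v2'):
  ll.from_huggingface('glaiveai/glaive-function-calling-v2', 'local', 'glaive-function-calling-v2')
ds = ll.get_dataset('local', 'glaive-function-calling-v2')

# to_huggingface (added in dataset.py / dataset_duckdb.py) returns a datasets.Dataset.
# Its columns= and filters= arguments accept the same values as select_rows, as
# exercised in dataset_export_test.py above.
hf_ds = ds.to_huggingface()
hf_ds.push_to_hub('your-org/your-sharegpt-dataset')  # hypothetical repo id
```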