From a518a8e0c18f1bb2b809667ff72f8367452ed442 Mon Sep 17 00:00:00 2001 From: Nikhil Thorat Date: Mon, 22 Jan 2024 17:47:01 -0500 Subject: [PATCH 1/6] Support glaive-function-calling-v2 in the demo, clusters, and via sharegpt. --- lilac/__init__.py | 3 +- lilac/data/dataset.py | 20 ++++ lilac/data/dataset_duckdb.py | 24 +++++ lilac/db_manager.py | 15 +++ lilac_hf_space.yml | 27 ++++++ notebooks/GlaiveToShareGPT.ipynb | 162 +++++++++++++++++++++++++++++++ run_server_dev.sh | 6 +- 7 files changed, 255 insertions(+), 2 deletions(-) create mode 100644 notebooks/GlaiveToShareGPT.ipynb diff --git a/lilac/__init__.py b/lilac/__init__.py index 2c7604994..776126d86 100644 --- a/lilac/__init__.py +++ b/lilac/__init__.py @@ -12,7 +12,7 @@ from .data import * # noqa: F403 from .data.dataset_duckdb import DatasetDuckDB from .data.dataset_storage_utils import download, upload -from .db_manager import get_dataset, list_datasets, set_default_dataset_cls +from .db_manager import get_dataset, has_dataset, list_datasets, set_default_dataset_cls from .deploy import deploy_config, deploy_project from .embeddings import * # noqa: F403 from .env import * # noqa: F403 @@ -53,6 +53,7 @@ 'from_dicts', 'from_huggingface', 'get_dataset', + 'has_dataset', 'list_datasets', 'init', 'span', diff --git a/lilac/data/dataset.py b/lilac/data/dataset.py index a4fff42ac..2e774c879 100644 --- a/lilac/data/dataset.py +++ b/lilac/data/dataset.py @@ -977,6 +977,26 @@ def transform( schema=schema, ) + @abc.abstractmethod + def to_huggingface( + self, + columns: Optional[Sequence[ColumnId]] = None, + filters: Optional[Sequence[FilterLike]] = None, + include_labels: Optional[Sequence[str]] = None, + exclude_labels: Optional[Sequence[str]] = None, + include_deleted: bool = False, + ) -> HuggingFaceDataset: + """Export the dataset to a huggingface dataset. + + Args: + columns: The columns to export. + filters: The filters to apply to the query. + include_labels: The labels to include in the export. + exclude_labels: The labels to exclude in the export. + include_deleted: Whether to include deleted rows in the export. 
+ """ + pass + @abc.abstractmethod def to_json( self, diff --git a/lilac/data/dataset_duckdb.py b/lilac/data/dataset_duckdb.py index 50c492f17..1326890c2 100644 --- a/lilac/data/dataset_duckdb.py +++ b/lilac/data/dataset_duckdb.py @@ -26,6 +26,7 @@ import orjson import pandas as pd import yaml +from datasets import Dataset as HuggingFaceDataset from pandas.api.types import is_object_dtype from pydantic import BaseModel, SerializeAsAny, field_validator from typing_extensions import override @@ -3174,6 +3175,29 @@ def cluster( self, input, output_path, min_cluster_size, topic_fn, overwrite, use_garden, task_id=task_id ) + @override + def to_huggingface( + self, + columns: Optional[Sequence[ColumnId]] = None, + filters: Optional[Sequence[FilterLike]] = None, + include_labels: Optional[Sequence[str]] = None, + exclude_labels: Optional[Sequence[str]] = None, + include_deleted: bool = False, + ) -> HuggingFaceDataset: + filters, _ = self._normalize_filters( + filter_likes=filters, col_aliases={}, udf_aliases={}, manifest=self.manifest() + ) + filters.extend(self._compile_include_exclude_filters(include_labels, exclude_labels)) + rows = self.select_rows( + columns, filters=filters, combine_columns=True, include_deleted=include_deleted + ) + + def _gen() -> Iterator[Item]: + for row in rows: + yield row + + return cast(HuggingFaceDataset, HuggingFaceDataset.from_generator(_gen)) + @override def to_json( self, diff --git a/lilac/db_manager.py b/lilac/db_manager.py index e1dbbdd69..e020cd11e 100644 --- a/lilac/db_manager.py +++ b/lilac/db_manager.py @@ -47,6 +47,21 @@ def get_dataset( return _CACHED_DATASETS[cache_key] +def has_dataset( + namespace: str, dataset_name: str, project_dir: Optional[Union[str, pathlib.Path]] = None +) -> bool: + """Get the dataset instance.""" + if not _DEFAULT_DATASET_CLS: + raise ValueError('Default dataset class not set.') + + project_dir = project_dir or get_project_dir() + try: + get_dataset(namespace, dataset_name, project_dir) + return True + except ValueError: + return False + + def remove_dataset_from_cache( namespace: str, dataset_name: str, project_dir: Optional[Union[str, pathlib.Path]] = None ) -> None: diff --git a/lilac_hf_space.yml b/lilac_hf_space.yml index 78a3c1ce9..a66702088 100644 --- a/lilac_hf_space.yml +++ b/lilac_hf_space.yml @@ -54,6 +54,26 @@ datasets: path: - answer + - name: glaive-function-calling-v2 + namespace: lilac + source: + dataset_name: lilacai/glaive-function-calling-v2-sharegpt + source_name: huggingface + settings: + tags: [datasets] + ui: + media_paths: + - - conversations + - '*' + - value + markdown_paths: [] + embeddings: + - embedding: gte-small + path: + - conversations + - '*' + - value + - name: open-assistant-conversations-2 namespace: lilac settings: @@ -196,6 +216,13 @@ clusters: dataset_name: glaive-code-assistant input_path: - question + - dataset_namespace: lilac + dataset_name: glaive-function-calling-v2 + input_selector: + format: sharegpt + selector: user + output_path: + - conversation_clusters - dataset_namespace: lilac dataset_name: open-assistant-conversations-2 input_path: diff --git a/notebooks/GlaiveToShareGPT.ipynb b/notebooks/GlaiveToShareGPT.ipynb new file mode 100644 index 000000000..887248340 --- /dev/null +++ b/notebooks/GlaiveToShareGPT.ipynb @@ -0,0 +1,162 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Convert the unformatted Glaive dataset to ShareGPT\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": 
"stdout", + "output_type": "stream", + "text": [ + "The autoreload extension is already loaded. To reload it, use:\n", + " %reload_ext autoreload\n" + ] + } + ], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "import lilac as ll\n", + "\n", + "ll.set_project_dir('./data')\n", + "\n", + "if not ll.has_dataset('local', 'glaive-function-calling-v2'):\n", + " ll.from_huggingface(\n", + " 'glaiveai/glaive-function-calling-v2',\n", + " 'local',\n", + " 'glaive-function-calling-v2',\n", + " )\n", + "\n", + "ds = ll.get_dataset('local', 'glaive-function-calling-v2')" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[local/glaive-function-calling-v2][1 shards] map \"_parse_chat\" to \"('conversations',)\": 0%| | 0/112960 [00:00\n", + "Dataset({\n", + " features: ['chat', 'system', '__hfsplit__', 'chat__cluster', 'conversations'],\n", + " num_rows: 112960\n", + "})\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Creating parquet from Arrow format: 100%|██████████| 57/57 [00:00<00:00, 90.73ba/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 57/57 [00:00<00:00, 91.67ba/s]s/it]\n", + "Pushing dataset shards to the dataset hub: 100%|██████████| 2/2 [00:21<00:00, 10.92s/it]\n" + ] + } + ], + "source": [ + "hf_ds = ds.to_huggingface()\n", + "\n", + "print(hf_ds)\n", + "\n", + "hf_ds.push_to_hub('lilacai/glaive-function-calling-v2-sharegpt')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/run_server_dev.sh b/run_server_dev.sh index ed99841ae..b6e724def 100755 --- a/run_server_dev.sh +++ b/run_server_dev.sh @@ -11,7 +11,11 @@ npm run dev --workspace web/blueprint -- --open & pid[2]=$! # Run the FastAPI server. -export LILAC_PROJECT_DIR='./data' +if [ "$1" ]; then + export LILAC_PROJECT_DIR="$1" +else + export LILAC_PROJECT_DIR="./data" +fi poetry run uvicorn lilac.server:app --reload --port 5432 --host 0.0.0.0 \ --reload-dir lilac & pid[1]=$! 
From 91d8916c1dc216a2f21fa4cdb56514c9bdf229f9 Mon Sep 17 00:00:00 2001
From: Nikhil Thorat
Date: Mon, 22 Jan 2024 18:09:30 -0500
Subject: [PATCH 2/6] save

---
 lilac/data/dataset.py             |  1 +
 lilac/data/dataset_duckdb.py      |  1 +
 lilac/data/dataset_export_test.py | 35 +++++++++++++++++++++++++++++++
 3 files changed, 37 insertions(+)

diff --git a/lilac/data/dataset.py b/lilac/data/dataset.py
index 2e774c879..55fb2da24 100644
--- a/lilac/data/dataset.py
+++ b/lilac/data/dataset.py
@@ -10,6 +10,7 @@
 import numpy as np
 import pandas as pd
+from datasets import Dataset as HuggingFaceDataset
 from pydantic import (
   BaseModel,
   ConfigDict,
diff --git a/lilac/data/dataset_duckdb.py b/lilac/data/dataset_duckdb.py
index 1326890c2..b52b5813a 100644
--- a/lilac/data/dataset_duckdb.py
+++ b/lilac/data/dataset_duckdb.py
@@ -3188,6 +3188,7 @@ def to_huggingface(
       filter_likes=filters, col_aliases={}, udf_aliases={}, manifest=self.manifest()
     )
     filters.extend(self._compile_include_exclude_filters(include_labels, exclude_labels))
+    print('filters=', filters)
     rows = self.select_rows(
       columns, filters=filters, combine_columns=True, include_deleted=include_deleted
     )
diff --git a/lilac/data/dataset_export_test.py b/lilac/data/dataset_export_test.py
index 9f0cda6d7..aeb9c77f5 100644
--- a/lilac/data/dataset_export_test.py
+++ b/lilac/data/dataset_export_test.py
@@ -40,6 +40,41 @@ def setup_teardown() -> Iterable[None]:
   clear_signal_registry()  # Teardown.
 
 
+def test_export_to_huggingface(make_test_data: TestDataMaker, tmp_path: pathlib.Path) -> None:
+  dataset = make_test_data([{'text': 'hello'}, {'text': 'everybody'}])
+  dataset.compute_signal(TestSignal(), 'text')
+
+  hf_dataset = dataset.to_huggingface()
+
+  assert list(hf_dataset) == [
+    {'text': {VALUE_KEY: 'hello', 'test_signal': {'flen': 5.0, 'len': 5}}},
+    {'text': {VALUE_KEY: 'everybody', 'test_signal': {'flen': 9.0, 'len': 9}}},
+  ]
+
+
+def test_export_to_huggingface_filters(
+  make_test_data: TestDataMaker, tmp_path: pathlib.Path
+) -> None:
+  dataset = make_test_data([{'text': 'hello'}, {'text': 'everybody'}])
+  dataset.compute_signal(TestSignal(), 'text')
+
+  # Export a subset of columns with a filter.
+ hf_dataset = dataset.to_huggingface( + columns=['text', 'text.test_signal.flen'], + filters=[('text.test_signal.len', 'greater', 6)], + ) + + assert list(hf_dataset) == [ + {'text': {VALUE_KEY: 'everybody', 'test_signal': {'flen': 9.0, 'len': 9}}} + ] + + hf_dataset = dataset.to_huggingface(filters=[('text.test_signal.flen', 'less_equal', '5')]) + + assert list(hf_dataset) == [ + {'text': {VALUE_KEY: 'hello', 'test_signal': {'flen': 5.0, 'len': 5}}} + ] + + def test_export_to_json(make_test_data: TestDataMaker, tmp_path: pathlib.Path) -> None: dataset = make_test_data([{'text': 'hello'}, {'text': 'everybody'}]) dataset.compute_signal(TestSignal(), 'text') From db9fd195d06a3602830e32ff7487b432391e2fe2 Mon Sep 17 00:00:00 2001 From: Nikhil Thorat Date: Tue, 23 Jan 2024 10:22:20 -0500 Subject: [PATCH 3/6] save --- README.md | 2 +- docs/welcome.md | 6 +- lilac/data/dataset_duckdb.py | 1 - lilac/data/dataset_storage_utils.py | 4 +- lilac/db_manager.py | 3 + lilac/formats/sharegpt.py | 8 +- lilac_hf_space.yml | 25 ---- notebooks/GlaiveToShareGPT.ipynb | 109 +++++++++++++----- .../components/HuggingFaceSpaceWelcome.svelte | 2 +- web/blueprint/src/lib/view_utils.ts | 3 +- 10 files changed, 97 insertions(+), 66 deletions(-) diff --git a/README.md b/README.md index f23ea4372..52103dd9e 100644 --- a/README.md +++ b/README.md @@ -121,7 +121,7 @@ If you prefer, you can load datasets directly from the UI without writing any Py ### 🔎 Explore -> [🔗 Try OpenOrca-100K before installing!](https://lilacai-lilac.hf.space/datasets#lilac/OpenOrca-100k) +> [🔗 Try OpenOrca before installing!](https://lilacai-lilac.hf.space/datasets#lilac/OpenOrca) Once we've loaded a dataset, we can explore it from the UI and get a sense for what's in the data. More documentation [here](https://docs.lilacml.com/datasets/dataset_explore.html). diff --git a/docs/welcome.md b/docs/welcome.md index e97cead74..e2c0a902e 100644 --- a/docs/welcome.md +++ b/docs/welcome.md @@ -14,9 +14,9 @@ Lilac is an open-source tool that enables data and AI practitioners improve their products by improving their data. -[Try Lilac on HuggingFace Spaces](https://lilacai-lilac.hf.space/datasets#lilac/OpenOrca-100k), -where we've preloaded popular datasets like OpenOrca. Try a semantic search for "As a language -model" on the OpenOrca dataset! +[Try Lilac on HuggingFace Spaces](https://lilacai-lilac.hf.space/datasets#lilac/OpenOrca), where +we've preloaded popular datasets like OpenOrca. Try a semantic search for "As a language model" on +the OpenOrca dataset! ## Why use Lilac? diff --git a/lilac/data/dataset_duckdb.py b/lilac/data/dataset_duckdb.py index b52b5813a..1326890c2 100644 --- a/lilac/data/dataset_duckdb.py +++ b/lilac/data/dataset_duckdb.py @@ -3188,7 +3188,6 @@ def to_huggingface( filter_likes=filters, col_aliases={}, udf_aliases={}, manifest=self.manifest() ) filters.extend(self._compile_include_exclude_filters(include_labels, exclude_labels)) - print('filters=', filters) rows = self.select_rows( columns, filters=filters, combine_columns=True, include_deleted=include_deleted ) diff --git a/lilac/data/dataset_storage_utils.py b/lilac/data/dataset_storage_utils.py index 21de0a101..28e00c573 100644 --- a/lilac/data/dataset_storage_utils.py +++ b/lilac/data/dataset_storage_utils.py @@ -34,8 +34,8 @@ def download( Args: url_or_repo: A remote URL to a Lilac-processed dataset. Currently only supports HuggingFace - dataset URLs. 
Can be a full URL: https://huggingface.co/datasets/lilacai/lilac-OpenOrca-100k
-      or a repo_id: lilacai/lilac-OpenOrca-100k.
+      dataset URLs. Can be a full URL: https://huggingface.co/datasets/lilacai/lilac-OpenOrca
+      or a repo_id: lilacai/lilac-OpenOrca.
     project_dir: The project directory to use for the demo. Defaults to `env.LILAC_PROJECT_DIR`
       which can be set with `ll.set_project_dir()`.
     dataset_namespace: The local namespace to use. Defaults to 'local'.
diff --git a/lilac/db_manager.py b/lilac/db_manager.py
index e020cd11e..6af8f1159 100644
--- a/lilac/db_manager.py
+++ b/lilac/db_manager.py
@@ -56,6 +56,9 @@ def has_dataset(
 
   project_dir = project_dir or get_project_dir()
   try:
+    # Loading the dataset is the existence check: if the dataset is not already cached,
+    # get_dataset calls the dataset constructor, which raises a ValueError when the dataset
+    # does not exist on disk.
     get_dataset(namespace, dataset_name, project_dir)
     return True
   except ValueError:
diff --git a/lilac/formats/sharegpt.py b/lilac/formats/sharegpt.py
index ec545da98..75c9f2e91 100644
--- a/lilac/formats/sharegpt.py
+++ b/lilac/formats/sharegpt.py
@@ -24,6 +24,10 @@ def _sharegpt_selector(item: Item, conv_from: str) -> str:
   name='human',
   selector=lambda item: _sharegpt_selector(item, 'human'),
 )
+_TOOL_SELECTOR = DatasetFormatInputSelector(
+  name='tool',
+  selector=lambda item: _sharegpt_selector(item, 'tool'),
+)
 _GPT_SELECTOR = DatasetFormatInputSelector(
   name='gpt',
   selector=lambda item: _sharegpt_selector(item, 'gpt'),
@@ -50,8 +54,10 @@ class ShareGPT(DatasetFormat):
 
   system: ClassVar[DatasetFormatInputSelector] = _SYSTEM_SELECTOR
   human: ClassVar[DatasetFormatInputSelector] = _HUMAN_SELECTOR
+  tool: ClassVar[DatasetFormatInputSelector] = _TOOL_SELECTOR
   gpt: ClassVar[DatasetFormatInputSelector] = _GPT_SELECTOR
 
   input_selectors: ClassVar[dict[str, DatasetFormatInputSelector]] = {
-    selector.name: selector for selector in [_SYSTEM_SELECTOR, _HUMAN_SELECTOR, _GPT_SELECTOR]
+    selector.name: selector
+    for selector in [_SYSTEM_SELECTOR, _HUMAN_SELECTOR, _GPT_SELECTOR, _TOOL_SELECTOR]
   }
diff --git a/lilac_hf_space.yml b/lilac_hf_space.yml
index a66702088..22c3da0ff 100644
--- a/lilac_hf_space.yml
+++ b/lilac_hf_space.yml
@@ -158,27 +158,6 @@ datasets:
         path:
           - question
 
-  - name: 'OpenOrca-100k'
-    namespace: lilac
-    settings:
-      tags: [machine-learning]
-      ui:
-        media_paths:
-          - question
-          - response
-      preferred_embedding: 'gte-small'
-    source:
-      source_name: huggingface
-      dataset_name: Open-Orca/OpenOrca
-      sample_size: 100000
-    embeddings:
-      - embedding: gte-small
-        path:
-          - question
-      - embedding: gte-small
-        path:
-          - response
-
   - namespace: lilac
     name: dolphin
     tags: [datasets]
@@ -242,10 +221,6 @@ clusters:
     dataset_name: databricks-dolly-15k-curated-en
     input_path:
       - original-instruction
-  - dataset_namespace: lilac
-    dataset_name: OpenOrca-100k
-    input_path:
-      - question
   - dataset_namespace: lilac
     dataset_name: dolphin
     input_path:
diff --git a/notebooks/GlaiveToShareGPT.ipynb b/notebooks/GlaiveToShareGPT.ipynb
index 887248340..6298e532d 100644
--- a/notebooks/GlaiveToShareGPT.ipynb
+++ b/notebooks/GlaiveToShareGPT.ipynb
@@ -9,15 +9,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [
     {
-     "name": "stdout",
+     "name": "stderr",
      "output_type": "stream",
      "text": [
-      "The autoreload extension is already loaded. 
To reload it, use:\n", - " %reload_ext autoreload\n" + "/Users/nikhil/Code/lilac/.venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" ] } ], @@ -40,21 +40,14 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "[local/glaive-function-calling-v2][1 shards] map \"_parse_chat\" to \"('conversations',)\": 0%| | 0/112960 [00:00\n", - "Dataset({\n", - " features: ['chat', 'system', '__hfsplit__', 'chat__cluster', 'conversations'],\n", - " num_rows: 112960\n", - "})\n" + "{'conversations': [{'from': 'system',\n", + " 'value': 'You are a helpful assistant with access to the '\n", + " 'following functions. Use them if required -\\n'\n", + " '{\\n'\n", + " ' \"name\": \"calculate_median\",\\n'\n", + " ' \"description\": \"Calculate the median of a '\n", + " 'list of numbers\",\\n'\n", + " ' \"parameters\": {\\n'\n", + " ' \"type\": \"object\",\\n'\n", + " ' \"properties\": {\\n'\n", + " ' \"numbers\": {\\n'\n", + " ' \"type\": \"array\",\\n'\n", + " ' \"items\": {\\n'\n", + " ' \"type\": \"number\"\\n'\n", + " ' },\\n'\n", + " ' \"description\": \"The list of '\n", + " 'numbers\"\\n'\n", + " ' }\\n'\n", + " ' },\\n'\n", + " ' \"required\": [\\n'\n", + " ' \"numbers\"\\n'\n", + " ' ]\\n'\n", + " ' }\\n'\n", + " '}\\n'},\n", + " {'from': 'human',\n", + " 'value': 'Hi, I have a list of numbers and I need to find '\n", + " 'the median. The numbers are 5, 2, 9, 1, 7, 4, 6, '\n", + " '3, 8.'},\n", + " {'from': 'gpt',\n", + " 'value': ' {\"name\": \"calculate_median\", '\n", + " '\"arguments\": \\'{\"numbers\": [5, 2, 9, 1, 7, 4, 6, '\n", + " \"3, 8]}'} <|endoftext|>\"},\n", + " {'from': 'tool', 'value': '{\"median\": 5}'},\n", + " {'from': 'gpt',\n", + " 'value': 'The median of your list of numbers is 5. 
'\n", + " '<|endoftext|>'}]}\n" ] - }, + } + ], + "source": [ + "import pprint\n", + "\n", + "pprint.pprint(next(ds.select_rows(['conversations'], limit=1)))" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "Creating parquet from Arrow format: 100%|██████████| 57/57 [00:00<00:00, 90.73ba/s]\n", - "Creating parquet from Arrow format: 100%|██████████| 57/57 [00:00<00:00, 91.67ba/s]s/it]\n", - "Pushing dataset shards to the dataset hub: 100%|██████████| 2/2 [00:21<00:00, 10.92s/it]\n" + "Generating train split: 112960 examples [00:03, 36913.50 examples/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 57/57 [00:00<00:00, 89.98ba/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 57/57 [00:00<00:00, 85.01ba/s]s/it]\n", + "Pushing dataset shards to the dataset hub: 100%|██████████| 2/2 [00:10<00:00, 5.15s/it]\n", + "Deleting unused files from dataset repository: 100%|██████████| 2/2 [00:01<00:00, 1.33it/s]\n", + "Downloading metadata: 100%|██████████| 2.83k/2.83k [00:00<00:00, 13.0MB/s]\n" ] } ], "source": [ "hf_ds = ds.to_huggingface()\n", - "\n", - "print(hf_ds)\n", - "\n", "hf_ds.push_to_hub('lilacai/glaive-function-calling-v2-sharegpt')" ] }, diff --git a/web/blueprint/src/lib/components/HuggingFaceSpaceWelcome.svelte b/web/blueprint/src/lib/components/HuggingFaceSpaceWelcome.svelte index 403b595b9..764423ab2 100644 --- a/web/blueprint/src/lib/components/HuggingFaceSpaceWelcome.svelte +++ b/web/blueprint/src/lib/components/HuggingFaceSpaceWelcome.svelte @@ -9,7 +9,7 @@ const tryDataset = { namespace: 'lilac', - name: 'OpenOrca-100k', + name: 'OpenOrca', displayName: 'OpenOrca', originalLink: 'https://huggingface.co/datasets/Open-Orca/OpenOrca' }; diff --git a/web/blueprint/src/lib/view_utils.ts b/web/blueprint/src/lib/view_utils.ts index de742eeef..66f1b2355 100644 --- a/web/blueprint/src/lib/view_utils.ts +++ b/web/blueprint/src/lib/view_utils.ts @@ -217,8 +217,7 @@ export function getTaggedDatasets( }); const namespaceSortPriorities = ['lilac']; - // TODO(nsthorat): Don't hard-code this. Let's make this a config. - const pinnedDatasets = ['OpenOrca-100k']; + const pinnedDatasets: string[] = []; // Sort each tag by namespace and then dataset name. 
const taggedDatasetGroups: NavigationTagGroup[] = []; From f1861d9e706c401c6a822a028c9c0e59b5b60594 Mon Sep 17 00:00:00 2001 From: Nikhil Thorat Date: Tue, 23 Jan 2024 10:36:21 -0500 Subject: [PATCH 4/6] save --- lilac_hf_space.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lilac_hf_space.yml b/lilac_hf_space.yml index 22c3da0ff..f86e557ed 100644 --- a/lilac_hf_space.yml +++ b/lilac_hf_space.yml @@ -199,7 +199,7 @@ clusters: dataset_name: glaive-function-calling-v2 input_selector: format: sharegpt - selector: user + selector: human output_path: - conversation_clusters - dataset_namespace: lilac From a3378f35390d5e253780356afc3ff70dbc17f620 Mon Sep 17 00:00:00 2001 From: Nikhil Thorat Date: Tue, 23 Jan 2024 11:27:03 -0500 Subject: [PATCH 5/6] save --- lilac_hf_space.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lilac_hf_space.yml b/lilac_hf_space.yml index f86e557ed..b0a9acada 100644 --- a/lilac_hf_space.yml +++ b/lilac_hf_space.yml @@ -100,7 +100,7 @@ datasets: - content source: source_name: huggingface - dataset_name: OpenAssistant/oasst2 + dataset_name: lmsys/lmsys-chat-1m embeddings: - embedding: gte-small path: From 6894032644ff5cc394fd850213f0ef8aa73621db Mon Sep 17 00:00:00 2001 From: Nikhil Thorat Date: Tue, 23 Jan 2024 11:28:15 -0500 Subject: [PATCH 6/6] save --- notebooks/GlaiveToShareGPT.ipynb | 7 ------- 1 file changed, 7 deletions(-) diff --git a/notebooks/GlaiveToShareGPT.ipynb b/notebooks/GlaiveToShareGPT.ipynb index 6298e532d..d01bc85ae 100644 --- a/notebooks/GlaiveToShareGPT.ipynb +++ b/notebooks/GlaiveToShareGPT.ipynb @@ -178,13 +178,6 @@ "hf_ds = ds.to_huggingface()\n", "hf_ds.push_to_hub('lilacai/glaive-function-calling-v2-sharegpt')" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": {