Export to HuggingFace. Support glaive-function-calling-v2 in the demo, clusters, and via sharegpt. #1113

Merged 6 commits on Jan 23, 2024
Changes from 2 commits
3 changes: 2 additions & 1 deletion lilac/__init__.py
@@ -12,7 +12,7 @@
from .data import * # noqa: F403
from .data.dataset_duckdb import DatasetDuckDB
from .data.dataset_storage_utils import download, upload
from .db_manager import get_dataset, list_datasets, set_default_dataset_cls
from .db_manager import get_dataset, has_dataset, list_datasets, set_default_dataset_cls
from .deploy import deploy_config, deploy_project
from .embeddings import * # noqa: F403
from .env import * # noqa: F403
@@ -53,6 +53,7 @@
'from_dicts',
'from_huggingface',
'get_dataset',
'has_dataset',
'list_datasets',
'init',
'span',
21 changes: 21 additions & 0 deletions lilac/data/dataset.py
@@ -10,6 +10,7 @@

import numpy as np
import pandas as pd
from datasets import Dataset as HuggingFaceDataset
from pydantic import (
BaseModel,
ConfigDict,
@@ -977,6 +978,26 @@ def transform(
schema=schema,
)

@abc.abstractmethod
def to_huggingface(
self,
columns: Optional[Sequence[ColumnId]] = None,
filters: Optional[Sequence[FilterLike]] = None,
include_labels: Optional[Sequence[str]] = None,
exclude_labels: Optional[Sequence[str]] = None,
include_deleted: bool = False,
) -> HuggingFaceDataset:
"""Export the dataset to a huggingface dataset.

Args:
columns: The columns to export.
filters: The filters to apply to the query.
include_labels: The labels to include in the export.
exclude_labels: The labels to exclude in the export.
include_deleted: Whether to include deleted rows in the export.
"""
pass

@abc.abstractmethod
def to_json(
self,
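For orientation, the new export API can be driven from a loaded dataset roughly as follows. This is a minimal sketch: the `local/my-dataset` name and the `text.test_signal.len` field are illustrative (they mirror the tests later in this PR) and are not defined by this change.

```python
import lilac as ll

ll.set_project_dir('./data')

# Assumes 'local/my-dataset' already exists in the project directory.
ds = ll.get_dataset('local', 'my-dataset')

# Export a filtered subset of columns to a HuggingFace dataset.
hf_ds = ds.to_huggingface(
  columns=['text'],
  filters=[('text.test_signal.len', 'greater', 6)],
)
print(hf_ds)
```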
25 changes: 25 additions & 0 deletions lilac/data/dataset_duckdb.py
@@ -26,6 +26,7 @@
import orjson
import pandas as pd
import yaml
from datasets import Dataset as HuggingFaceDataset
from pandas.api.types import is_object_dtype
from pydantic import BaseModel, SerializeAsAny, field_validator
from typing_extensions import override
@@ -3174,6 +3175,30 @@ def cluster(
self, input, output_path, min_cluster_size, topic_fn, overwrite, use_garden, task_id=task_id
)

@override
def to_huggingface(
self,
columns: Optional[Sequence[ColumnId]] = None,
filters: Optional[Sequence[FilterLike]] = None,
include_labels: Optional[Sequence[str]] = None,
exclude_labels: Optional[Sequence[str]] = None,
include_deleted: bool = False,
) -> HuggingFaceDataset:
filters, _ = self._normalize_filters(
filter_likes=filters, col_aliases={}, udf_aliases={}, manifest=self.manifest()
)
filters.extend(self._compile_include_exclude_filters(include_labels, exclude_labels))
print('filters=', filters)
rows = self.select_rows(
columns, filters=filters, combine_columns=True, include_deleted=include_deleted
)

def _gen() -> Iterator[Item]:
for row in rows:
yield row

return cast(HuggingFaceDataset, HuggingFaceDataset.from_generator(_gen))

@override
def to_json(
self,
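The DuckDB implementation streams `select_rows` results through a generator into `datasets.Dataset.from_generator`, so rows are written out incrementally rather than first collected into one in-memory list. The same pattern in isolation looks roughly like this (a standalone sketch with toy data, not code from this PR):

```python
from typing import Iterator

from datasets import Dataset as HuggingFaceDataset


def _gen() -> Iterator[dict]:
  # Yield one row at a time; from_generator consumes the rows lazily.
  yield {'text': 'hello'}
  yield {'text': 'everybody'}


hf_ds = HuggingFaceDataset.from_generator(_gen)
print(hf_ds.num_rows)  # 2
```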
35 changes: 35 additions & 0 deletions lilac/data/dataset_export_test.py
@@ -40,6 +40,41 @@ def setup_teardown() -> Iterable[None]:
clear_signal_registry() # Teardown.


def test_export_to_huggingface(make_test_data: TestDataMaker, tmp_path: pathlib.Path) -> None:
dataset = make_test_data([{'text': 'hello'}, {'text': 'everybody'}])
dataset.compute_signal(TestSignal(), 'text')

hf_dataset = dataset.to_huggingface()

assert list(hf_dataset) == [
{'text': {VALUE_KEY: 'hello', 'test_signal': {'flen': 5.0, 'len': 5}}},
{'text': {VALUE_KEY: 'everybody', 'test_signal': {'flen': 9.0, 'len': 9}}},
]


def test_export_to_huggingface_filters(
make_test_data: TestDataMaker, tmp_path: pathlib.Path
) -> None:
dataset = make_test_data([{'text': 'hello'}, {'text': 'everybody'}])
dataset.compute_signal(TestSignal(), 'text')

# Download a subset of columns with filter.
hf_dataset = dataset.to_huggingface(
columns=['text', 'text.test_signal.flen'],
filters=[('text.test_signal.len', 'greater', 6)],
)

assert list(hf_dataset) == [
{'text': {VALUE_KEY: 'everybody', 'test_signal': {'flen': 9.0, 'len': 9}}}
]

hf_dataset = dataset.to_huggingface(filters=[('text.test_signal.flen', 'less_equal', '5')])

assert list(hf_dataset) == [
{'text': {VALUE_KEY: 'hello', 'test_signal': {'flen': 5.0, 'len': 5}}}
]


def test_export_to_json(make_test_data: TestDataMaker, tmp_path: pathlib.Path) -> None:
dataset = make_test_data([{'text': 'hello'}, {'text': 'everybody'}])
dataset.compute_signal(TestSignal(), 'text')
15 changes: 15 additions & 0 deletions lilac/db_manager.py
@@ -47,6 +47,21 @@ def get_dataset(
return _CACHED_DATASETS[cache_key]


def has_dataset(
namespace: str, dataset_name: str, project_dir: Optional[Union[str, pathlib.Path]] = None
) -> bool:
"""Get the dataset instance."""
if not _DEFAULT_DATASET_CLS:
raise ValueError('Default dataset class not set.')

project_dir = project_dir or get_project_dir()
try:
get_dataset(namespace, dataset_name, project_dir)
return True
except ValueError:
return False


def remove_dataset_from_cache(
namespace: str, dataset_name: str, project_dir: Optional[Union[str, pathlib.Path]] = None
) -> None:
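`has_dataset` exists mainly to support an idempotent load-if-missing pattern, which the notebook later in this PR uses; this condenses that cell:

```python
import lilac as ll

ll.set_project_dir('./data')

# Only import from HuggingFace if the dataset is not already in the project.
if not ll.has_dataset('local', 'glaive-function-calling-v2'):
  ll.from_huggingface(
    'glaiveai/glaive-function-calling-v2', 'local', 'glaive-function-calling-v2'
  )

ds = ll.get_dataset('local', 'glaive-function-calling-v2')
```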
27 changes: 27 additions & 0 deletions lilac_hf_space.yml
@@ -54,6 +54,26 @@ datasets:
path:
- answer

- name: glaive-function-calling-v2
namespace: lilac
source:
dataset_name: lilacai/glaive-function-calling-v2-sharegpt
source_name: huggingface
settings:
tags: [datasets]
ui:
media_paths:
- - conversations
- '*'
- value
markdown_paths: []
embeddings:
- embedding: gte-small
path:
- conversations
- '*'
- value

- name: open-assistant-conversations-2
namespace: lilac
settings:
@@ -196,6 +216,13 @@ clusters:
dataset_name: glaive-code-assistant
input_path:
- question
- dataset_namespace: lilac
dataset_name: glaive-function-calling-v2
input_selector:
format: sharegpt
selector: user
output_path:
- conversation_clusters
- dataset_namespace: lilac
dataset_name: open-assistant-conversations-2
input_path:
162 changes: 162 additions & 0 deletions notebooks/GlaiveToShareGPT.ipynb
@@ -0,0 +1,162 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Convert the unformatted Glaive dataset to ShareGPT\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The autoreload extension is already loaded. To reload it, use:\n",
" %reload_ext autoreload\n"
]
}
],
"source": [
"%load_ext autoreload\n",
"%autoreload 2\n",
"import lilac as ll\n",
"\n",
"ll.set_project_dir('./data')\n",
"\n",
"if not ll.has_dataset('local', 'glaive-function-calling-v2'):\n",
" ll.from_huggingface(\n",
" 'glaiveai/glaive-function-calling-v2',\n",
" 'local',\n",
" 'glaive-function-calling-v2',\n",
" )\n",
"\n",
"ds = ll.get_dataset('local', 'glaive-function-calling-v2')"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[local/glaive-function-calling-v2][1 shards] map \"_parse_chat\" to \"('conversations',)\": 0%| | 0/112960 [00:00<?, ?it/s]"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[local/glaive-function-calling-v2][1 shards] map \"_parse_chat\" to \"('conversations',)\": 100%|██████████| 112960/112960 [00:07<00:00, 15708.45it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Wrote map output to conversations-00000-of-00001.parquet\n"
]
}
],
"source": [
"import re\n",
"\n",
"ROLES = ['USER', 'ASSISTANT', 'FUNCTION RESPONSE']\n",
"\n",
"# The split regex is a role, plus semicolon and space. For example\n",
"# \"USER: \" or \"FUNCTION RESPONSE: \".\n",
"split_re = re.compile(r'({}): '.format('|'.join(ROLES)))\n",
"\n",
"\n",
"def _parse_chat(row: dict):\n",
" system_prompt = row.get('system')\n",
"\n",
" chat = row['chat']\n",
" # Split chat by split_res, and remove empty strings.\n",
" chats = [s.strip() for s in split_re.split(chat) if s]\n",
"\n",
" # results look like:\n",
" # ['USER', 'Can you book a flight for me from New York to London?', 'ASSISTANT', '...']\n",
" # We now want it to be a dictionary of {'from': 'user', 'value': 'Can you book a flight...'}\n",
" chats = [{'from': role.lower(), 'value': value} for role, value in zip(chats[::2], chats[1::2])]\n",
"\n",
" if system_prompt:\n",
" chats = [{'from': 'system', 'value': system_prompt}] + chats\n",
"\n",
" return chats\n",
"\n",
"\n",
"res = ds.map(_parse_chat, output_path='conversations', overwrite=True)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'datasets.arrow_dataset.Dataset'>\n",
"Dataset({\n",
" features: ['chat', 'system', '__hfsplit__', 'chat__cluster', 'conversations'],\n",
" num_rows: 112960\n",
"})\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Creating parquet from Arrow format: 100%|██████████| 57/57 [00:00<00:00, 90.73ba/s]\n",
"Creating parquet from Arrow format: 100%|██████████| 57/57 [00:00<00:00, 91.67ba/s]s/it]\n",
"Pushing dataset shards to the dataset hub: 100%|██████████| 2/2 [00:21<00:00, 10.92s/it]\n"
]
}
],
"source": [
"hf_ds = ds.to_huggingface()\n",
"\n",
"print(hf_ds)\n",
"\n",
"hf_ds.push_to_hub('lilacai/glaive-function-calling-v2-sharegpt')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
6 changes: 5 additions & 1 deletion run_server_dev.sh
Original file line number Diff line number Diff line change
@@ -11,7 +11,7 @@ npm run dev --workspace web/blueprint -- --open &
pid[2]=$!

# Run the FastAPI server.
export LILAC_PROJECT_DIR='./data'
if [ "$1" ]; then
export LILAC_PROJECT_DIR="$1"
else
export LILAC_PROJECT_DIR="./data"
fi
poetry run uvicorn lilac.server:app --reload --port 5432 --host 0.0.0.0 \
--reload-dir lilac &
pid[1]=$!
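With this change the dev server script accepts an optional project directory argument, e.g. `./run_server_dev.sh ./my_project` (path illustrative), and falls back to `./data` when no argument is given.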