Small fixes for the demo. (#1106)
Add datasets:
- lmsys
- openorca
- oasst2

Other:
- Reduce spacy spam
- Add OpenAI JSON as a dataset format so we can cluster the role=user turns of
  lmsys (a minimal sketch of the format follows below).
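For reference, a minimal record in the shape this format expects (an illustrative sketch based on the schema added in lilac/formats/openai_json.py below; note the top-level key is 'conversation' rather than OpenAI's 'messages'):

# A minimal item in the OpenAI JSON format (illustrative sketch).
example_item = {
  'conversation': [
    {'role': 'user', 'content': 'Hello'},
    {'role': 'assistant', 'content': 'Hi there!'},
  ]
}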
nsthorat authored Jan 22, 2024
1 parent 9aac3dc commit 55adffc
Showing 11 changed files with 201 additions and 40 deletions.
5 changes: 4 additions & 1 deletion lilac/data/clustering.py
@@ -287,7 +287,10 @@ def cluster_impl(
       raise ValueError(f'Path {path} must be a string field.')

   elif not output_path:
-    raise ValueError('output_path must be provided if input is a function.')
+    raise ValueError(
+      '`output_path` must be provided to `Dataset.cluster()` when `input` is a user-provided '
+      'method.'
+    )

   # Output the cluster enrichment to a sibling path, unless an output path is provided by the user.
   if output_path:
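For context, this is roughly how the sharpened error surfaces (an illustrative sketch, assuming `dataset` is a loaded lilac Dataset; the field name, output column, and topic function are hypothetical):

# Clustering over a user-provided method requires an explicit output_path.
dataset.cluster(
  input=lambda row: '\n'.join(row['texts']),  # user-provided method, not a field path
  output_path='texts__cluster',  # hypothetical output column; omitting it raises the ValueError above
  min_cluster_size=2,
  topic_fn=lambda docs: 'some topic',  # hypothetical topic function
)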
9 changes: 8 additions & 1 deletion lilac/data/clustering_test.py
@@ -1,4 +1,5 @@
"""Unit tests for dataset.cluster()."""
import re
from typing import ClassVar, Iterable, Iterator

import pytest
@@ -341,7 +342,13 @@ def topic_fn(docs: list[tuple[str, float]]) -> str:
       return 'simplification'
     return 'other'

-  with pytest.raises(ValueError, match='output_path must be provided if input is a function'):
+  with pytest.raises(
+    ValueError,
+    match=re.escape(
+      '`output_path` must be provided to `Dataset.cluster()` when `input` is a '
+      'user-provided method.'
+    ),
+  ):
     dataset.cluster(lambda row: '\n'.join(row['texts']), min_cluster_size=2, topic_fn=topic_fn)

   dataset.cluster(
2 changes: 2 additions & 0 deletions lilac/formats/default_formats.py
@@ -1,6 +1,7 @@
"""Registers all available dataset formats."""

from ..dataset_format import register_dataset_format
from .openai_json import OpenAIJSON
from .openchat import OpenChat
from .sharegpt import ShareGPT

@@ -9,3 +10,4 @@ def register_default_formats() -> None:
   """Register all the default dataset formats."""
   register_dataset_format(ShareGPT)
   register_dataset_format(OpenChat)
+  register_dataset_format(OpenAIJSON)
58 changes: 58 additions & 0 deletions lilac/formats/openai_json.py
@@ -0,0 +1,58 @@
"""ShareGPT format."""

from typing import ClassVar

from ..dataset_format import DatasetFormat, DatasetFormatInputSelector
from ..schema import PATH_WILDCARD, VALUE_KEY, Item, PathTuple, Schema, schema


def _openai_selector(item: Item, conv_role: str) -> str:
"""Selector for OpenAI JSON Formatted conversations."""
# TODO(nsthorat): Make this return an array, and not pre-join with newlines.
values = [conv['content'] for conv in item['conversation'] if conv['role'] == conv_role]
# Get the __value__ key version of text if it's enriched.
values = [value if isinstance(value, str) else value.get(VALUE_KEY) for value in values]
return '\n'.join(values)


_USER_SELECTOR = DatasetFormatInputSelector(
name='user',
selector=lambda item: _openai_selector(item, 'user'),
)

_ASSISTANT_SELECTOR = DatasetFormatInputSelector(
name='assistant',
selector=lambda item: _openai_selector(item, 'assistant'),
)


class OpenAIJSON(DatasetFormat):
"""OpenAI JSON format.
Taken from: https://platform.openai.com/docs/api-reference/chat
Note that here "messages" is "conversation" for support with common datasets.
"""

name: ClassVar[str] = 'openai_json'
data_schema: Schema = schema(
{
'conversation': [
{
'role': 'string',
'content': 'string',
}
],
},
)

title_slots: list[tuple[PathTuple, PathTuple]] = [
(('conversation', PATH_WILDCARD, 'content'), ('conversation', PATH_WILDCARD, 'role'))
]

user: ClassVar[DatasetFormatInputSelector] = _USER_SELECTOR
assistant: ClassVar[DatasetFormatInputSelector] = _ASSISTANT_SELECTOR

input_selectors: ClassVar[dict[str, DatasetFormatInputSelector]] = {
selector.name: selector for selector in [_USER_SELECTOR, _ASSISTANT_SELECTOR]
}
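As a quick illustration of the selectors above (hypothetical item, not from the commit):

item = {
  'conversation': [
    {'role': 'user', 'content': 'Hello'},
    {'role': 'assistant', 'content': 'Hi'},
    {'role': 'user', 'content': 'How are you today?'},
  ]
}
# The user selector keeps only role=user turns and pre-joins them with newlines.
assert OpenAIJSON.user.selector(item) == 'Hello\nHow are you today?'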
28 changes: 28 additions & 0 deletions lilac/formats/openai_json_test.py
@@ -0,0 +1,28 @@
"""Tests for the openai_json format."""


from ..data.dataset_test_utils import TestDataMaker
from .openai_json import OpenAIJSON


def test_infer_open_chat(make_test_data: TestDataMaker) -> None:
dataset = make_test_data(
[
{
'conversation': [
{'role': 'user', 'content': 'Hello'},
{'role': 'assistant', 'content': 'Hi'},
{'role': 'user', 'content': 'How are you today?'},
{'role': 'assistant', 'content': "I'm fine."},
],
},
{
'conversation': [
{'role': 'user', 'content': 'Who are you?'},
{'role': 'assistant', 'content': "I'm OpenChat."},
],
},
]
)

assert dataset.manifest().dataset_format == OpenAIJSON()
9 changes: 7 additions & 2 deletions lilac/formats/sharegpt.py
@@ -2,13 +2,18 @@
 from typing import ClassVar

 from ..dataset_format import DatasetFormat, DatasetFormatInputSelector
-from ..schema import PATH_WILDCARD, Item, PathTuple, Schema, schema
+from ..schema import PATH_WILDCARD, VALUE_KEY, Item, PathTuple, Schema, schema


 def _sharegpt_selector(item: Item, conv_from: str) -> str:
   """Selector for ShareGPT."""
   # TODO(nsthorat): Make this return an array, and not pre-join with newlines.
-  return '\n'.join(conv['value'] for conv in item['conversations'] if conv['from'] == conv_from)
+  values = [conv['value'] for conv in item['conversations'] if conv['from'] == conv_from]
+
+  # Get the __value__ key version of text if it's enriched.
+  values = [value if isinstance(value, str) else value.get(VALUE_KEY) for value in values]
+
+  return '\n'.join(values)


 _SYSTEM_SELECTOR = DatasetFormatInputSelector(
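To see why both selectors need the VALUE_KEY fallback: once a field has been enriched, its leaf text is wrapped in a dict keyed by '__value__'. A sketch of the assumed shapes (the extra 'pii' key is hypothetical):

VALUE_KEY = '__value__'
plain = 'Hello'  # un-enriched: a plain string
enriched = {VALUE_KEY: 'Hello', 'pii': []}  # enriched: the text moves under __value__

for value in (plain, enriched):
  text = value if isinstance(value, str) else value.get(VALUE_KEY)
  assert text == 'Hello'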
7 changes: 6 additions & 1 deletion lilac/signals/text_statistics.py
@@ -1,4 +1,5 @@
"""Compute text statistics for a document."""
import logging
from typing import TYPE_CHECKING, ClassVar, Iterable, Optional, cast

from typing_extensions import override
@@ -8,7 +9,7 @@

 SPACY_LANG_MODEL = 'en_core_web_sm'
 SPACY_BATCH_SIZE = 128
-SPACY_MAX_LENGTH = 2_000_000
+SPACY_MAX_LENGTH = 4_000_000

 NUM_CHARS = 'num_characters'
 READABILITY = 'readability'
@@ -54,6 +55,10 @@ def setup(self) -> None:
         'Please install it with `pip install spacy`.'
       )

+    # Silence textacy warnings that spam the console.
+    logger = logging.getLogger('textacy')
+    logger.setLevel(logging.ERROR)
+
     if not spacy.util.is_package(SPACY_LANG_MODEL):
       spacy.cli.download(SPACY_LANG_MODEL)
     self._lang = spacy.load(
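The suppression above is plain standard-library logging, so the same pattern works anywhere textacy (or any chatty dependency) spams the console:

import logging

# Only ERROR and above from textacy will reach the console.
logging.getLogger('textacy').setLevel(logging.ERROR)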
108 changes: 78 additions & 30 deletions lilac_hf_space.yml
@@ -2,12 +2,13 @@
 use_garden: true

 datasets:
-  - namespace: lilac
-    name: Capybara
+  - name: Capybara
+    namespace: lilac
     source:
       dataset_name: LDJnr/Capybara
       source_name: huggingface
     settings:
+      tags: [datasets]
       ui:
         media_paths:
           - - conversation
@@ -17,7 +18,6 @@ datasets:
             - '*'
             - output
         markdown_paths: []
-      tags: [datasets]
     embeddings:
       - embedding: gte-small
         path:
@@ -30,21 +30,21 @@
             - '*'
             - output

-  - namespace: lilac
-    name: glaive
+  - name: glaive-code-assistant
+    namespace: lilac
     source:
       dataset_name: glaiveai/glaive-code-assistant
       source_name: huggingface
     settings:
-      tags: [machine-learning]
-      ui:
-        view_type: 'single_item'
+      tags: [datasets]
+      ui:
         media_paths:
           - question
           - answer
-          - - answer_formatted
-            - answer
+          # TODO(nsthorat): Run the map for answer_formatted here and reupload. This is from the
+          # blog post about curating a coding dataset.
+          # - - answer_formatted
+          #   - answer
         markdown_paths: []
     embeddings:
       - embedding: gte-small
@@ -54,22 +54,41 @@
         path:
           - answer

-  - name: open-asssistant-conversations
+  - name: open-assistant-conversations-2
     namespace: lilac
     settings:
-      tags: [machine-learning]
+      tags: [datasets]
       ui:
         media_paths:
           - text
         preferred_embedding: 'gte-small'
     source:
       source_name: huggingface
-      dataset_name: OpenAssistant/oasst1
+      dataset_name: OpenAssistant/oasst2
     embeddings:
       - embedding: gte-small
        path:
          - text

+  - name: lmsys-chat-1m
+    namespace: lilac
+    settings:
+      tags: [logs]
+      ui:
+        media_paths:
+          - - conversation
+            - '*'
+            - content
+    source:
+      source_name: huggingface
+      dataset_name: lmsys/lmsys-chat-1m
+    embeddings:
+      - embedding: gte-small
+        path:
+          - conversation
+          - '*'
+          - content
+
+## Old datasets to be cleaned up.
   - name: databricks-dolly-15k-curated-en
     namespace: lilac
     settings:
@@ -102,6 +121,23 @@ datasets:
         path:
           - original-response

+  - name: 'OpenOrca'
+    namespace: lilac
+    settings:
+      tags: [datasets]
+      ui:
+        media_paths:
+          - question
+          - response
+        preferred_embedding: 'gte-small'
+    source:
+      source_name: huggingface
+      dataset_name: Open-Orca/OpenOrca
+    embeddings:
+      - embedding: gte-small
+        path:
+          - question
+
   - name: 'OpenOrca-100k'
     namespace: lilac
     settings:
@@ -125,6 +161,7 @@

   - namespace: lilac
     name: dolphin
+    tags: [datasets]
     source:
       dataset_name: cognitivecomputations/dolphin
       config_name: flan1m-alpaca-uncensored
@@ -140,22 +177,13 @@
         markdown_paths: []

 signals:
   - signal_name: pii
   - signal_name: text_statistics
   - signal_name: lang_detection
-  - signal_name: concept_score
-    namespace: lilac
-    concept_name: profanity
-    embedding: gte-small
-
-concept_model_cache_embeddings:
-  - gte-small
-  - gte-base
-  - sbert
-  - openai
-  - cohere
-  # PALM is currently timing-out.
-  # - palm
+  # NOTE: This is currently buggy.
+  # - signal_name: concept_score
+  #   namespace: lilac
+  #   concept_name: profanity
+  #   embedding: gte-small

 clusters:
   - dataset_namespace: lilac
@@ -165,13 +193,24 @@
       - '*'
       - input
   - dataset_namespace: lilac
-    dataset_name: glaive
+    dataset_name: glaive-code-assistant
     input_path:
       - question
   - dataset_namespace: lilac
-    dataset_name: open-asssistant-conversations
+    dataset_name: open-assistant-conversations-2
     input_path:
       - text
+  - dataset_namespace: lilac
+    dataset_name: lmsys-chat-1m
+    input_selector:
+      format: openai_json
+      selector: user
+    output_path:
+      - conversation__clusters
+  - dataset_namespace: lilac
+    dataset_name: OpenOrca
+    input_path:
+      - question
   - dataset_namespace: lilac
     dataset_name: databricks-dolly-15k-curated-en
     input_path:
@@ -184,3 +223,12 @@
     dataset_name: dolphin
     input_path:
       - input
+
+concept_model_cache_embeddings:
+  - gte-small
+  - gte-base
+  - sbert
+  - openai
+  - cohere
+  # PALM is currently timing-out.
+  # - palm
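A rough sketch of how the lmsys cluster entry's input_selector presumably resolves, based on the input_selectors registry in lilac/formats/openai_json.py (the resolution code itself is not part of this diff):

from lilac.formats.openai_json import OpenAIJSON

cluster_config = {'format': 'openai_json', 'selector': 'user'}
# The named selector concatenates the role=user turns of each conversation;
# the resulting clusters land in the configured output_path.
selector = OpenAIJSON.input_selectors[cluster_config['selector']]
assert selector.name == 'user'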
11 changes: 8 additions & 3 deletions scripts/deploy_demo.py
@@ -124,11 +124,16 @@ def deploy_demo(
   # If a dataset is specified, we only sync/load/upload that dataset.
   if dataset is not None:
     config_to_load = original_parsed_config.model_copy()
-    config_to_load.datasets = [
-      d for d in config_to_load.datasets if f'{d.namespace}/{d.name}' in dataset
-    ]
+    datasets_to_load = [d for d in config_to_load.datasets if f'{d.namespace}/{d.name}' in dataset]
+    if not datasets_to_load:
+      available = [f'{d.namespace}/{d.name}' for d in config_to_load.datasets]
+      raise ValueError(f'No datasets found with name {dataset}. Available datasets: {available}')
+    config_to_load.datasets = datasets_to_load
   else:
     config_to_load = original_parsed_config

   hf_space_org, hf_space_name = hf_space.split('/')

   if not skip_sync: