Small fixes for the demo. #1106

Merged · 6 commits · Jan 22, 2024
5 changes: 4 additions & 1 deletion lilac/data/clustering.py
@@ -287,7 +287,10 @@ def cluster_impl(
raise ValueError(f'Path {path} must be a string field.')

elif not output_path:
raise ValueError('output_path must be provided if input is a function.')
raise ValueError(
'`output_path` must be provided to `Dataset.cluster()` when `input` is a user-provided '
'method.'
)

# Output the cluster enrichment to a sibling path, unless an output path is provided by the user.
if output_path:
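For context, the two call shapes this guard distinguishes look roughly like the following — a minimal sketch, assuming a lilac Dataset named `ds` with a `texts` field (the names are illustrative, not from the PR):

# Sketch only: `ds` is an assumed lilac Dataset with a `texts` field.
# Clustering a field path: the output path is derived from the input path.
ds.cluster('texts', min_cluster_size=2)

# Clustering a user-provided function: `output_path` is now required;
# omitting it raises the ValueError introduced above.
ds.cluster(
    lambda row: '\n'.join(row['texts']),
    output_path='texts__cluster',
    min_cluster_size=2,
)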
9 changes: 8 additions & 1 deletion lilac/data/clustering_test.py
@@ -1,4 +1,5 @@
"""Unit tests for dataset.cluster()."""
import re
from typing import ClassVar, Iterable, Iterator

import pytest
@@ -341,7 +342,13 @@ def topic_fn(docs: list[tuple[str, float]]) -> str:
return 'simplification'
return 'other'

with pytest.raises(ValueError, match='output_path must be provided if input is a function'):
with pytest.raises(
ValueError,
match=re.escape(
'`output_path` must be provided to `Dataset.cluster()` when `input` is a '
'user-provided method.'
),
):
dataset.cluster(lambda row: '\n'.join(row['texts']), min_cluster_size=2, topic_fn=topic_fn)

dataset.cluster(
2 changes: 2 additions & 0 deletions lilac/formats/default_formats.py
@@ -1,6 +1,7 @@
"""Registers all available dataset formats."""

from ..dataset_format import register_dataset_format
from .openai_json import OpenAIJSON
from .openchat import OpenChat
from .sharegpt import ShareGPT

@@ -9,3 +10,4 @@ def register_default_formats() -> None:
"""Register all the default dataset formats."""
register_dataset_format(ShareGPT)
register_dataset_format(OpenChat)
register_dataset_format(OpenAIJSON)
58 changes: 58 additions & 0 deletions lilac/formats/openai_json.py
@@ -0,0 +1,58 @@
"""ShareGPT format."""

from typing import ClassVar

from ..dataset_format import DatasetFormat, DatasetFormatInputSelector
from ..schema import PATH_WILDCARD, VALUE_KEY, Item, PathTuple, Schema, schema


def _openai_selector(item: Item, conv_role: str) -> str:
"""Selector for OpenAI JSON Formatted conversations."""
# TODO(nsthorat): Make this return an array, and not pre-join with newlines.
values = [conv['content'] for conv in item['conversation'] if conv['role'] == conv_role]
# Get the __value__ key version of text if it's enriched.
values = [value if isinstance(value, str) else value.get(VALUE_KEY) for value in values]
return '\n'.join(values)


_USER_SELECTOR = DatasetFormatInputSelector(
name='user',
selector=lambda item: _openai_selector(item, 'user'),
)

_ASSISTANT_SELECTOR = DatasetFormatInputSelector(
name='assistant',
selector=lambda item: _openai_selector(item, 'assistant'),
)


class OpenAIJSON(DatasetFormat):
"""OpenAI JSON format.

Taken from: https://platform.openai.com/docs/api-reference/chat

Note that "messages" is named "conversation" here for compatibility with common datasets.
"""

name: ClassVar[str] = 'openai_json'
data_schema: Schema = schema(
{
'conversation': [
{
'role': 'string',
'content': 'string',
}
],
},
)

title_slots: list[tuple[PathTuple, PathTuple]] = [
(('conversation', PATH_WILDCARD, 'content'), ('conversation', PATH_WILDCARD, 'role'))
]

user: ClassVar[DatasetFormatInputSelector] = _USER_SELECTOR
assistant: ClassVar[DatasetFormatInputSelector] = _ASSISTANT_SELECTOR

input_selectors: ClassVar[dict[str, DatasetFormatInputSelector]] = {
selector.name: selector for selector in [_USER_SELECTOR, _ASSISTANT_SELECTOR]
}
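A quick sketch of what the two selectors compute, using a hand-built item matching the `conversation` schema above (the item itself is invented for illustration):

# Illustrative only: a hand-built item matching the OpenAIJSON schema above.
item = {
    'conversation': [
        {'role': 'user', 'content': 'Hello'},
        {'role': 'assistant', 'content': 'Hi'},
        {'role': 'user', 'content': 'How are you today?'},
    ]
}

# Each selector joins all messages of one role with newlines.
assert OpenAIJSON.user.selector(item) == 'Hello\nHow are you today?'
assert OpenAIJSON.assistant.selector(item) == 'Hi'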
28 changes: 28 additions & 0 deletions lilac/formats/openai_json_test.py
@@ -0,0 +1,28 @@
"""Tests for the openai_json format."""


from ..data.dataset_test_utils import TestDataMaker
from .openai_json import OpenAIJSON


def test_infer_openai_json(make_test_data: TestDataMaker) -> None:
dataset = make_test_data(
[
{
'conversation': [
{'role': 'user', 'content': 'Hello'},
{'role': 'assistant', 'content': 'Hi'},
{'role': 'user', 'content': 'How are you today?'},
{'role': 'assistant', 'content': "I'm fine."},
],
},
{
'conversation': [
{'role': 'user', 'content': 'Who are you?'},
{'role': 'assistant', 'content': "I'm OpenChat."},
],
},
]
)

assert dataset.manifest().dataset_format == OpenAIJSON()
9 changes: 7 additions & 2 deletions lilac/formats/sharegpt.py
@@ -2,13 +2,18 @@
from typing import ClassVar

from ..dataset_format import DatasetFormat, DatasetFormatInputSelector
from ..schema import PATH_WILDCARD, Item, PathTuple, Schema, schema
from ..schema import PATH_WILDCARD, VALUE_KEY, Item, PathTuple, Schema, schema


def _sharegpt_selector(item: Item, conv_from: str) -> str:
"""Selector for ShareGPT."""
# TODO(nsthorat): Make this return an array, and not pre-join with newlines.
return '\n'.join(conv['value'] for conv in item['conversations'] if conv['from'] == conv_from)
values = [conv['value'] for conv in item['conversations'] if conv['from'] == conv_from]

# Get the __value__ key version of text if it's enriched.
values = [value if isinstance(value, str) else value.get(VALUE_KEY) for value in values]

return '\n'.join(values)


_SYSTEM_SELECTOR = DatasetFormatInputSelector(
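The `VALUE_KEY` branch matters once a field has been enriched: an enriched leaf is a dict carrying the raw text under `__value__` rather than a plain string. A sketch of both shapes flowing through the selector, assuming hand-built items and access to the module-private `_sharegpt_selector`:

# Illustrative only: the same conversation in plain and enriched form.
plain = {'conversations': [{'from': 'human', 'value': 'Hi there'}]}
enriched = {'conversations': [{'from': 'human', 'value': {VALUE_KEY: 'Hi there'}}]}

# Both resolve to the same raw text.
assert _sharegpt_selector(plain, 'human') == 'Hi there'
assert _sharegpt_selector(enriched, 'human') == 'Hi there'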
7 changes: 6 additions & 1 deletion lilac/signals/text_statistics.py
@@ -1,4 +1,5 @@
"""Compute text statistics for a document."""
import logging
from typing import TYPE_CHECKING, ClassVar, Iterable, Optional, cast

from typing_extensions import override
@@ -8,7 +9,7 @@

SPACY_LANG_MODEL = 'en_core_web_sm'
SPACY_BATCH_SIZE = 128
SPACY_MAX_LENGTH = 2_000_000
SPACY_MAX_LENGTH = 4_000_000

NUM_CHARS = 'num_characters'
READABILITY = 'readability'
@@ -54,6 +55,10 @@ def setup(self) -> None:
'Please install it with `pip install spacy`.'
)

# Ignore textacy warnings that spam the console.
logger = logging.getLogger('textacy')
logger.setLevel(logging.ERROR)

if not spacy.util.is_package(SPACY_LANG_MODEL):
spacy.cli.download(SPACY_LANG_MODEL)
self._lang = spacy.load(
108 changes: 78 additions & 30 deletions lilac_hf_space.yml
@@ -2,12 +2,13 @@
use_garden: true

datasets:
- namespace: lilac
name: Capybara
- name: Capybara
namespace: lilac
source:
dataset_name: LDJnr/Capybara
source_name: huggingface
settings:
tags: [datasets]
ui:
media_paths:
- - conversation
@@ -17,7 +18,6 @@ datasets:
- '*'
- output
markdown_paths: []
tags: [datasets]
embeddings:
- embedding: gte-small
path:
@@ -30,21 +30,21 @@
- '*'
- output

- namespace: lilac
name: glaive
- name: glaive-code-assistant
namespace: lilac
source:
dataset_name: glaiveai/glaive-code-assistant
source_name: huggingface
settings:
tags: [machine-learning]
ui:
view_type: 'single_item'
tags: [datasets]
ui:
media_paths:
- question
- answer
- - answer_formatted
- answer
# TODO(nsthorat): Run the map for answer_formatted here and reupload. This is from the
# blog post about curating a coding dataset.
# - - answer_formatted
# - answer
markdown_paths: []
embeddings:
- embedding: gte-small
@@ -54,22 +54,41 @@
path:
- answer

- name: open-asssistant-conversations
- name: open-assistant-conversations-2
namespace: lilac
settings:
tags: [machine-learning]
tags: [datasets]
ui:
media_paths:
- text
preferred_embedding: 'gte-small'
source:
source_name: huggingface
dataset_name: OpenAssistant/oasst1
dataset_name: OpenAssistant/oasst2
embeddings:
- embedding: gte-small
path:
- text

- name: lmsys-chat-1m
namespace: lilac
settings:
tags: [logs]
ui:
media_paths:
- - conversation
- '*'
- content
source:
source_name: huggingface
dataset_name: lmsys/lmsys-chat-1m
embeddings:
- embedding: gte-small
path:
- conversation
- '*'
- content

## Old datasets to be cleaned up.
- name: databricks-dolly-15k-curated-en
namespace: lilac
settings:
@@ -102,6 +121,23 @@ datasets:
path:
- original-response

- name: 'OpenOrca'
namespace: lilac
settings:
tags: [datasets]
ui:
media_paths:
- question
- response
preferred_embedding: 'gte-small'
source:
source_name: huggingface
dataset_name: Open-Orca/OpenOrca
embeddings:
- embedding: gte-small
path:
- question

- name: 'OpenOrca-100k'
namespace: lilac
settings:
@@ -125,6 +161,7 @@

- namespace: lilac
name: dolphin
tags: [datasets]
source:
dataset_name: cognitivecomputations/dolphin
config_name: flan1m-alpaca-uncensored
@@ -140,22 +177,13 @@
markdown_paths: []

signals:
- signal_name: pii
- signal_name: text_statistics
- signal_name: lang_detection
- signal_name: concept_score
namespace: lilac
concept_name: profanity
embedding: gte-small

concept_model_cache_embeddings:
- gte-small
- gte-base
- sbert
- openai
- cohere
# PALM is currently timing-out.
# - palm
# NOTE: This is currently bugging.
# - signal_name: concept_score
# namespace: lilac
# concept_name: profanity
# embedding: gte-small

clusters:
- dataset_namespace: lilac
@@ -165,13 +193,24 @@
- '*'
- input
- dataset_namespace: lilac
dataset_name: glaive
dataset_name: glaive-code-assistant
input_path:
- question
- dataset_namespace: lilac
dataset_name: open-asssistant-conversations
dataset_name: open-assistant-conversations-2
input_path:
- text
- dataset_namespace: lilac
dataset_name: lmsys-chat-1m
input_selector:
format: openai_json
selector: user
output_path:
- conversation__clusters
- dataset_namespace: lilac
dataset_name: OpenOrca
input_path:
- question
- dataset_namespace: lilac
dataset_name: databricks-dolly-15k-curated-en
input_path:
@@ -184,3 +223,12 @@
dataset_name: dolphin
input_path:
- input

concept_model_cache_embeddings:
- gte-small
- gte-base
- sbert
- openai
- cohere
# PALM is currently timing-out.
# - palm
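The new `input_selector` entries under `clusters` refer to the format selectors by name. A sketch of how such an entry could resolve, based on the `input_selectors` registry that `OpenAIJSON` defines above (the lookup shown is illustrative, not the actual cluster runner):

# Illustrative resolution of `format: openai_json` + `selector: user`.
from lilac.formats.openai_json import OpenAIJSON

selector = OpenAIJSON.input_selectors['user']

item = {
    'conversation': [
        {'role': 'user', 'content': 'Who are you?'},
        {'role': 'assistant', 'content': 'A chatbot.'},
    ]
}
print(selector.selector(item))  # -> 'Who are you?'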
11 changes: 8 additions & 3 deletions scripts/deploy_demo.py
Original file line number Diff line number Diff line change
@@ -124,11 +124,16 @@ def deploy_demo(
# If a dataset is specified, we only sync/load/upload that dataset.
if dataset is not None:
config_to_load = original_parsed_config.model_copy()
config_to_load.datasets = [
d for d in config_to_load.datasets if f'{d.namespace}/{d.name}' in dataset
]
datasets_to_load = [d for d in config_to_load.datasets if f'{d.namespace}/{d.name}' in dataset]
if not datasets_to_load:
raise ValueError(
f'No datasets found with name {dataset}. Available datasets: '
+ ', '.join(f'{d.namespace}/{d.name}' for d in config_to_load.datasets)
)
config_to_load.datasets = datasets_to_load
else:
config_to_load = original_parsed_config

hf_space_org, hf_space_name = hf_space.split('/')

if not skip_sync:
Expand Down
Loading