Small fixes for the demo. (#1106)
Add datasets:
- lmsys
- openorca
- oasst2

Other:
- Reduce spacy spam
- Add OpenAI JSON as a dataset format so we can cluster the role=user turns of
  lmsys (a minimal sketch of the format follows below).
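For reference, a minimal record in the shape this format expects (an illustrative sketch based on the schema added in lilac/formats/openai_json.py below; note the top-level key is 'conversation' rather than OpenAI's 'messages'):

# A minimal item in the OpenAI JSON format (illustrative sketch).
example_item = {
  'conversation': [
    {'role': 'user', 'content': 'Hello'},
    {'role': 'assistant', 'content': 'Hi there!'},
  ]
}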
nsthorat authored Jan 22, 2024
1 parent 9aac3dc commit 55adffc
Showing 11 changed files with 201 additions and 40 deletions.
5 changes: 4 additions & 1 deletion lilac/data/clustering.py
@@ -287,7 +287,10 @@ def cluster_impl(
       raise ValueError(f'Path {path} must be a string field.')

   elif not output_path:
-    raise ValueError('output_path must be provided if input is a function.')
+    raise ValueError(
+      '`output_path` must be provided to `Dataset.cluster()` when `input` is a user-provided '
+      'method.'
+    )

   # Output the cluster enrichment to a sibling path, unless an output path is provided by the user.
   if output_path:
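For context, this is roughly how the sharpened error surfaces (an illustrative sketch, assuming `dataset` is a loaded lilac Dataset; the field name, output column, and topic function are hypothetical):

# Clustering over a user-provided method requires an explicit output_path.
dataset.cluster(
  input=lambda row: '\n'.join(row['texts']),  # user-provided method, not a field path
  output_path='texts__cluster',  # hypothetical output column; omitting it raises the ValueError above
  min_cluster_size=2,
  topic_fn=lambda docs: 'some topic',  # hypothetical topic function
)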
9 changes: 8 additions & 1 deletion lilac/data/clustering_test.py
@@ -1,4 +1,5 @@
"""Unit tests for dataset.cluster()."""
import re
from typing import ClassVar, Iterable, Iterator

import pytest
@@ -341,7 +342,13 @@ def topic_fn(docs: list[tuple[str, float]]) -> str:
       return 'simplification'
     return 'other'

-  with pytest.raises(ValueError, match='output_path must be provided if input is a function'):
+  with pytest.raises(
+    ValueError,
+    match=re.escape(
+      '`output_path` must be provided to `Dataset.cluster()` when `input` is a '
+      'user-provided method.'
+    ),
+  ):
     dataset.cluster(lambda row: '\n'.join(row['texts']), min_cluster_size=2, topic_fn=topic_fn)

   dataset.cluster(
2 changes: 2 additions & 0 deletions lilac/formats/default_formats.py
@@ -1,6 +1,7 @@
"""Registers all available dataset formats."""

from ..dataset_format import register_dataset_format
from .openai_json import OpenAIJSON
from .openchat import OpenChat
from .sharegpt import ShareGPT

@@ -9,3 +10,4 @@ def register_default_formats() -> None:
   """Register all the default dataset formats."""
   register_dataset_format(ShareGPT)
   register_dataset_format(OpenChat)
+  register_dataset_format(OpenAIJSON)
58 changes: 58 additions & 0 deletions lilac/formats/openai_json.py
@@ -0,0 +1,58 @@
"""ShareGPT format."""

from typing import ClassVar

from ..dataset_format import DatasetFormat, DatasetFormatInputSelector
from ..schema import PATH_WILDCARD, VALUE_KEY, Item, PathTuple, Schema, schema


def _openai_selector(item: Item, conv_role: str) -> str:
"""Selector for OpenAI JSON Formatted conversations."""
# TODO(nsthorat): Make this return an array, and not pre-join with newlines.
values = [conv['content'] for conv in item['conversation'] if conv['role'] == conv_role]
# Get the __value__ key version of text if it's enriched.
values = [value if isinstance(value, str) else value.get(VALUE_KEY) for value in values]
return '\n'.join(values)


_USER_SELECTOR = DatasetFormatInputSelector(
name='user',
selector=lambda item: _openai_selector(item, 'user'),
)

_ASSISTANT_SELECTOR = DatasetFormatInputSelector(
name='assistant',
selector=lambda item: _openai_selector(item, 'assistant'),
)


class OpenAIJSON(DatasetFormat):
"""OpenAI JSON format.
Taken from: https://platform.openai.com/docs/api-reference/chat
Note that here "messages" is "conversation" for support with common datasets.
"""

name: ClassVar[str] = 'openai_json'
data_schema: Schema = schema(
{
'conversation': [
{
'role': 'string',
'content': 'string',
}
],
},
)

title_slots: list[tuple[PathTuple, PathTuple]] = [
(('conversation', PATH_WILDCARD, 'content'), ('conversation', PATH_WILDCARD, 'role'))
]

user: ClassVar[DatasetFormatInputSelector] = _USER_SELECTOR
assistant: ClassVar[DatasetFormatInputSelector] = _ASSISTANT_SELECTOR

input_selectors: ClassVar[dict[str, DatasetFormatInputSelector]] = {
selector.name: selector for selector in [_USER_SELECTOR, _ASSISTANT_SELECTOR]
}
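As a quick illustration of the selectors above (hypothetical item, not from the commit):

item = {
  'conversation': [
    {'role': 'user', 'content': 'Hello'},
    {'role': 'assistant', 'content': 'Hi'},
    {'role': 'user', 'content': 'How are you today?'},
  ]
}
# The user selector keeps only role=user turns and pre-joins them with newlines.
assert OpenAIJSON.user.selector(item) == 'Hello\nHow are you today?'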
28 changes: 28 additions & 0 deletions lilac/formats/openai_json_test.py
@@ -0,0 +1,28 @@
"""Tests for the openai_json format."""


from ..data.dataset_test_utils import TestDataMaker
from .openai_json import OpenAIJSON


def test_infer_open_chat(make_test_data: TestDataMaker) -> None:
dataset = make_test_data(
[
{
'conversation': [
{'role': 'user', 'content': 'Hello'},
{'role': 'assistant', 'content': 'Hi'},
{'role': 'user', 'content': 'How are you today?'},
{'role': 'assistant', 'content': "I'm fine."},
],
},
{
'conversation': [
{'role': 'user', 'content': 'Who are you?'},
{'role': 'assistant', 'content': "I'm OpenChat."},
],
},
]
)

assert dataset.manifest().dataset_format == OpenAIJSON()
9 changes: 7 additions & 2 deletions lilac/formats/sharegpt.py
@@ -2,13 +2,18 @@
 from typing import ClassVar

 from ..dataset_format import DatasetFormat, DatasetFormatInputSelector
-from ..schema import PATH_WILDCARD, Item, PathTuple, Schema, schema
+from ..schema import PATH_WILDCARD, VALUE_KEY, Item, PathTuple, Schema, schema


 def _sharegpt_selector(item: Item, conv_from: str) -> str:
   """Selector for ShareGPT."""
   # TODO(nsthorat): Make this return an array, and not pre-join with newlines.
-  return '\n'.join(conv['value'] for conv in item['conversations'] if conv['from'] == conv_from)
+  values = [conv['value'] for conv in item['conversations'] if conv['from'] == conv_from]
+
+  # Get the __value__ key version of text if it's enriched.
+  values = [value if isinstance(value, str) else value.get(VALUE_KEY) for value in values]
+
+  return '\n'.join(values)


 _SYSTEM_SELECTOR = DatasetFormatInputSelector(
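To see why both selectors need the VALUE_KEY fallback: once a field has been enriched, its leaf text is wrapped in a dict keyed by '__value__'. A sketch of the assumed shapes (the extra 'pii' key is hypothetical):

VALUE_KEY = '__value__'
plain = 'Hello'  # un-enriched: a plain string
enriched = {VALUE_KEY: 'Hello', 'pii': []}  # enriched: the text moves under __value__

for value in (plain, enriched):
  text = value if isinstance(value, str) else value.get(VALUE_KEY)
  assert text == 'Hello'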
7 changes: 6 additions & 1 deletion lilac/signals/text_statistics.py
@@ -1,4 +1,5 @@
"""Compute text statistics for a document."""
import logging
from typing import TYPE_CHECKING, ClassVar, Iterable, Optional, cast

from typing_extensions import override
@@ -8,7 +9,7 @@

 SPACY_LANG_MODEL = 'en_core_web_sm'
 SPACY_BATCH_SIZE = 128
-SPACY_MAX_LENGTH = 2_000_000
+SPACY_MAX_LENGTH = 4_000_000

 NUM_CHARS = 'num_characters'
 READABILITY = 'readability'
@@ -54,6 +55,10 @@ def setup(self) -> None:
         'Please install it with `pip install spacy`.'
       )

+    # Silence textacy warnings that spam the console.
+    logger = logging.getLogger('textacy')
+    logger.setLevel(logging.ERROR)
+
     if not spacy.util.is_package(SPACY_LANG_MODEL):
       spacy.cli.download(SPACY_LANG_MODEL)
     self._lang = spacy.load(
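The suppression above is plain standard-library logging, so the same pattern works anywhere textacy (or any chatty dependency) spams the console:

import logging

# Only ERROR and above from textacy will reach the console.
logging.getLogger('textacy').setLevel(logging.ERROR)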
108 changes: 78 additions & 30 deletions lilac_hf_space.yml
@@ -2,12 +2,13 @@
 use_garden: true

 datasets:
-  - namespace: lilac
-    name: Capybara
+  - name: Capybara
+    namespace: lilac
     source:
       dataset_name: LDJnr/Capybara
       source_name: huggingface
     settings:
+      tags: [datasets]
       ui:
         media_paths:
           - - conversation
@@ -17,7 +18,6 @@ datasets:
             - '*'
             - output
         markdown_paths: []
-      tags: [datasets]
     embeddings:
       - embedding: gte-small
         path:
@@ -30,21 +30,21 @@
             - '*'
             - output

-  - namespace: lilac
-    name: glaive
+  - name: glaive-code-assistant
+    namespace: lilac
     source:
       dataset_name: glaiveai/glaive-code-assistant
       source_name: huggingface
     settings:
-      tags: [machine-learning]
-      ui:
-        view_type: 'single_item'
+      tags: [datasets]
+      ui:
         media_paths:
           - question
           - answer
-          - - answer_formatted
-            - answer
+          # TODO(nsthorat): Run the map for answer_formatted here and reupload. This is from the
+          # blog post about curating a coding dataset.
+          # - - answer_formatted
+          #   - answer
         markdown_paths: []
     embeddings:
       - embedding: gte-small
@@ -54,22 +54,41 @@
         path:
           - answer

-  - name: open-asssistant-conversations
+  - name: open-assistant-conversations-2
     namespace: lilac
     settings:
-      tags: [machine-learning]
+      tags: [datasets]
       ui:
         media_paths:
           - text
         preferred_embedding: 'gte-small'
     source:
       source_name: huggingface
-      dataset_name: OpenAssistant/oasst1
+      dataset_name: OpenAssistant/oasst2
     embeddings:
       - embedding: gte-small
        path:
          - text

+  - name: lmsys-chat-1m
+    namespace: lilac
+    settings:
+      tags: [logs]
+      ui:
+        media_paths:
+          - - conversation
+            - '*'
+            - content
+    source:
+      source_name: huggingface
+      dataset_name: lmsys/lmsys-chat-1m
+    embeddings:
+      - embedding: gte-small
+        path:
+          - conversation
+          - '*'
+          - content
+
+## Old datasets to be cleaned up.
   - name: databricks-dolly-15k-curated-en
     namespace: lilac
     settings:
@@ -102,6 +121,23 @@ datasets:
         path:
           - original-response

+  - name: 'OpenOrca'
+    namespace: lilac
+    settings:
+      tags: [datasets]
+      ui:
+        media_paths:
+          - question
+          - response
+        preferred_embedding: 'gte-small'
+    source:
+      source_name: huggingface
+      dataset_name: Open-Orca/OpenOrca
+    embeddings:
+      - embedding: gte-small
+        path:
+          - question
+
   - name: 'OpenOrca-100k'
     namespace: lilac
     settings:
@@ -125,6 +161,7 @@

   - namespace: lilac
     name: dolphin
+    tags: [datasets]
     source:
       dataset_name: cognitivecomputations/dolphin
       config_name: flan1m-alpaca-uncensored
@@ -140,22 +177,13 @@
         markdown_paths: []

 signals:
   - signal_name: pii
   - signal_name: text_statistics
   - signal_name: lang_detection
-  - signal_name: concept_score
-    namespace: lilac
-    concept_name: profanity
-    embedding: gte-small
-
-concept_model_cache_embeddings:
-  - gte-small
-  - gte-base
-  - sbert
-  - openai
-  - cohere
-  # PALM is currently timing-out.
-  # - palm
+  # NOTE: This is currently buggy.
+  # - signal_name: concept_score
+  #   namespace: lilac
+  #   concept_name: profanity
+  #   embedding: gte-small

 clusters:
   - dataset_namespace: lilac
@@ -165,13 +193,24 @@
       - '*'
       - input
   - dataset_namespace: lilac
-    dataset_name: glaive
+    dataset_name: glaive-code-assistant
     input_path:
       - question
   - dataset_namespace: lilac
-    dataset_name: open-asssistant-conversations
+    dataset_name: open-assistant-conversations-2
     input_path:
       - text
+  - dataset_namespace: lilac
+    dataset_name: lmsys-chat-1m
+    input_selector:
+      format: openai_json
+      selector: user
+    output_path:
+      - conversation__clusters
+  - dataset_namespace: lilac
+    dataset_name: OpenOrca
+    input_path:
+      - question
   - dataset_namespace: lilac
     dataset_name: databricks-dolly-15k-curated-en
     input_path:
@@ -184,3 +223,12 @@
     dataset_name: dolphin
     input_path:
       - input
+
+concept_model_cache_embeddings:
+  - gte-small
+  - gte-base
+  - sbert
+  - openai
+  - cohere
+  # PALM is currently timing-out.
+  # - palm
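A rough sketch of how the lmsys cluster entry's input_selector presumably resolves, based on the input_selectors registry in lilac/formats/openai_json.py (the resolution code itself is not part of this diff):

from lilac.formats.openai_json import OpenAIJSON

cluster_config = {'format': 'openai_json', 'selector': 'user'}
# The named selector concatenates the role=user turns of each conversation;
# the resulting clusters land in the configured output_path.
selector = OpenAIJSON.input_selectors[cluster_config['selector']]
assert selector.name == 'user'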
11 changes: 8 additions & 3 deletions scripts/deploy_demo.py
@@ -124,11 +124,16 @@ def deploy_demo(
   # If a dataset is specified, we only sync/load/upload that dataset.
   if dataset is not None:
     config_to_load = original_parsed_config.model_copy()
-    config_to_load.datasets = [
-      d for d in config_to_load.datasets if f'{d.namespace}/{d.name}' in dataset
-    ]
+    datasets_to_load = [d for d in config_to_load.datasets if f'{d.namespace}/{d.name}' in dataset]
+    if not datasets_to_load:
+      available = [f'{d.namespace}/{d.name}' for d in config_to_load.datasets]
+      raise ValueError(f'No datasets found with name {dataset}. Available datasets: {available}')
+    config_to_load.datasets = datasets_to_load
   else:
     config_to_load = original_parsed_config

   hf_space_org, hf_space_name = hf_space.split('/')

   if not skip_sync: