Skip to content

Commit

Permalink
Fix a few bugs related to concepts and clustering (#1121)
Browse files Browse the repository at this point in the history
https://huggingface.co/spaces/lilacai/daniel_staging (open capybara,
conversations.*.input has embedding index)

- Fix a bug where deleting a concept creates 404s
- Fix a bug with old instructor code that generates positive examples
- Fix a render bug with concepts with 0 positive examples
- Fix a bug with clustering where the text contains the "__value__"
substring, literally
- Fix a bug with lifecycle fastapi when server started in blocking mode
- Remove PALM which is timing out
- Add `EmbeddingInfo` to the field, so we can more easily find
embeddings
  • Loading branch information
dsmilkov authored Jan 24, 2024
1 parent 1a99b95 commit e8cc67b
Show file tree
Hide file tree
Showing 32 changed files with 105 additions and 497 deletions.
1 change: 0 additions & 1 deletion .env
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
# Get key from https://platform.openai.com/account/api-keys
# OPENAI_API_KEY=
# Get key from https://makersuite.google.com/app/apikey
# PALM_API_KEY=

# HuggingFace demos: machine that uploads to HuggingFace.

Expand Down
3 changes: 1 addition & 2 deletions docs/blog/introducing-lilac.md
Original file line number Diff line number Diff line change
Expand Up @@ -84,8 +84,7 @@ and usable with data staying on-premise. Lilac Concepts utilize powerful on-devi
[GTE](https://huggingface.co/thenlper/gte-small). However if your application is not sensitive to
data privacy (e.g. using open-source datasets), you may choose to use more powerful embeddings like
[OpenAI](https://platform.openai.com/docs/guides/embeddings),
[Cohere](https://docs.cohere.com/docs/embeddings), [PaLM](https://developers.generativeai.google/),
or your own! For more information on embeddings,
[Cohere](https://docs.cohere.com/docs/embeddings), or your own! For more information on embeddings,
[see our documentation](https://docs.lilacml.com/embeddings/embeddings.html).

### HuggingFace demo
Expand Down
2 changes: 1 addition & 1 deletion docs/concepts/concept_use.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ Applying a concept to text can be done in one of two ways:

A concept is simply a collection of positive and negative examples. To use a concept, we also need
to choose an _Embedding_. Some embeddings, like `gte` and `sbert` will run entirely on device. Other
embeddings like `openai`, `cohere`, or `palm` require an API key and will make a request to retrieve
embeddings like `openai` or `cohere` require an API key and will make a request to retrieve
embeddings. See [Embeddings](../embeddings/embeddings.md) for details on embeddings.

The quality of a concept depends on the chosen embedding. See [Concept Metrics](concept_metrics.md)
Expand Down
2 changes: 0 additions & 2 deletions docs/embeddings/embeddings.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,6 @@ Lilac has built-in **remote** embeddings. Using these will _send data to an exte
need to define `OPENAI_API_KEY` in your environment variables.
- [`cohere`](https://docs.cohere.com/docs/embeddings): Cohere embeddings. You will need to define
`COHERE_API_KEY` in your environment variables.
- [`palm`](https://cloud.google.com/vertex-ai/docs/generative-ai/embeddings/get-text-embeddings)
PaLM embeddings. You will need to define `PALM_API_KEY` in your environment variables.

## Register your own embedding

Expand Down
1 change: 0 additions & 1 deletion docs/getting_started/installation.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,4 +42,3 @@ To use hosted services for computing embeddings, add the following environment v
- `OPENAI_API_KEY`: OpenAI API key. You can get one
[here](https://platform.openai.com/account/api-keys).
- `COHERE_API_KEY`: Cohere API key. You can get one [here](https://dashboard.cohere.ai/api-keys).
- `PALM_API_KEY`: PaLM API key. You can get one [here](https://makersuite.google.com/app/apikey).
7 changes: 3 additions & 4 deletions docs/poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion lilac/data/clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,7 +324,7 @@ def _flatten_input(item: Item, input_path: PathTuple) -> str:
# Filter out Nones
texts = (t for t in texts if t)
# Deal with enriched items.
texts = (t[VALUE_KEY] if VALUE_KEY in t else t for t in texts)
texts = (t[VALUE_KEY] if (isinstance(t, dict) and VALUE_KEY in t) else t for t in texts)
return '\n'.join(texts)

def extract_text(item: Item) -> Item:
Expand Down
2 changes: 2 additions & 0 deletions lilac/data/dataset_compute_signal_chain_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

from ..schema import (
EMBEDDING_KEY,
EmbeddingInfo,
Field,
Item,
RichData,
Expand Down Expand Up @@ -144,6 +145,7 @@ def test_manual_embedding_signal(make_test_data: TestDataMaker, mocker: MockerFi
'test_embedding': field(
signal=TestEmbedding().model_dump(exclude_none=True),
fields=[field('string_span', fields={EMBEDDING_KEY: 'embedding'})],
embedding=EmbeddingInfo(input_path=('text',), embedding='test_embedding'),
),
},
)
Expand Down
4 changes: 4 additions & 0 deletions lilac/data/dataset_compute_signal_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from ..schema import (
EMBEDDING_KEY,
PATH_WILDCARD,
EmbeddingInfo,
Field,
Item,
RichData,
Expand Down Expand Up @@ -687,6 +688,7 @@ def test_embedding_signal(make_test_data: TestDataMaker, mocker: MockerFixture)
'test_embedding': field(
signal=embedding_signal.model_dump(exclude_none=True),
fields=[field('string_span', fields={EMBEDDING_KEY: 'embedding'})],
embedding=EmbeddingInfo(input_path=('text',), embedding='test_embedding'),
)
},
)
Expand Down Expand Up @@ -720,6 +722,7 @@ def test_embedding_signal_overwrite(make_test_data: TestDataMaker, mocker: Mocke
'test_embedding': field(
signal=embedding_signal.model_dump(exclude_none=True),
fields=[field('string_span', fields={EMBEDDING_KEY: 'embedding'})],
embedding=EmbeddingInfo(input_path=('text',), embedding='test_embedding'),
)
},
)
Expand Down Expand Up @@ -782,6 +785,7 @@ def test_delete_embedding(make_test_data: TestDataMaker, mocker: MockerFixture)
'test_embedding': field(
signal=embedding_signal.model_dump(exclude_none=True),
fields=[field('string_span', fields={EMBEDDING_KEY: 'embedding'})],
embedding=EmbeddingInfo(input_path=('text',), embedding='test_embedding'),
)
},
)
Expand Down
4 changes: 3 additions & 1 deletion lilac/data/dataset_duckdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@
TIMESTAMP,
VALUE_KEY,
Bin,
EmbeddingInfo,
Field,
Item,
MapFn,
Expand Down Expand Up @@ -1168,7 +1169,8 @@ def compute_embedding(

output_path = _col_destination_path(signal_col, is_computed_signal=True)
output_dir = os.path.join(self.dataset_path, _signal_dir(output_path))
signal_schema = create_signal_schema(signal, input_path, manifest.data_schema)
embedding_info = EmbeddingInfo(input_path=input_path, embedding=embedding)
signal_schema = create_signal_schema(signal, input_path, manifest.data_schema, embedding_info)

assert signal_schema, 'Signal schema should be defined for `TextEmbeddingSignal`.'

Expand Down
3 changes: 3 additions & 0 deletions lilac/data/dataset_select_rows_schema_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from ..schema import (
EMBEDDING_KEY,
PATH_WILDCARD,
EmbeddingInfo,
Field,
Item,
RichData,
Expand Down Expand Up @@ -382,6 +383,7 @@ def test_search_semantic_schema(make_test_data: TestDataMaker) -> None:
'test_embedding': field(
signal=test_embedding.model_dump(exclude_none=True),
fields=[field('string_span', fields={EMBEDDING_KEY: 'embedding'})],
embedding=EmbeddingInfo(input_path=('text',), embedding='test_embedding'),
),
expected_world_signal.key(): field(
signal=expected_world_signal.model_dump(exclude_none=True),
Expand Down Expand Up @@ -437,6 +439,7 @@ def test_search_concept_schema(make_test_data: TestDataMaker) -> None:
'test_embedding': field(
signal=test_embedding.model_dump(exclude_none=True),
fields=[field('string_span', fields={EMBEDDING_KEY: 'embedding'})],
embedding=EmbeddingInfo(input_path=('text',), embedding='test_embedding'),
),
expected_world_signal.key(): field(
signal=expected_world_signal.model_dump(exclude_none=True),
Expand Down
2 changes: 2 additions & 0 deletions lilac/data/dataset_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
EMBEDDING_KEY,
ROWID,
SPAN_KEY,
EmbeddingInfo,
Field,
Item,
MapType,
Expand Down Expand Up @@ -611,6 +612,7 @@ def test_config_from_dataset(make_test_data: TestDataMaker) -> None:
'test_embedding': field(
signal=TestEmbedding().model_dump(exclude_none=True),
fields=[field('string_span', fields={EMBEDDING_KEY: 'embedding'})],
embedding=EmbeddingInfo(input_path=('text',), embedding='test_embedding'),
),
},
)
Expand Down
8 changes: 7 additions & 1 deletion lilac/data/dataset_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
STRING,
TEXT_SPAN_END_FEATURE,
TEXT_SPAN_START_FEATURE,
EmbeddingInfo,
Field,
Item,
MapFn,
Expand Down Expand Up @@ -132,7 +133,10 @@ def create_json_map_output_schema(map_schema: Field, output_path: PathTuple) ->


def create_signal_schema(
signal: Signal, source_path: PathTuple, current_schema: Schema
signal: Signal,
source_path: PathTuple,
current_schema: Schema,
embedding_info: Optional[EmbeddingInfo] = None,
) -> Optional[Schema]:
"""Create a signal schema describing the enriched fields.
Expand All @@ -148,6 +152,8 @@ def create_signal_schema(
return None

signal_schema.signal = signal.model_dump(exclude_none=True)
if embedding_info:
signal_schema.embedding = embedding_info

enriched_schema = field(fields={signal.key(is_computed_signal=True): signal_schema})

Expand Down
96 changes: 0 additions & 96 deletions lilac/embeddings/palm.py

This file was deleted.

3 changes: 0 additions & 3 deletions lilac/env.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,6 @@ class LilacEnvironment(BaseModel):
COHERE_API_KEY: str = PydanticField(
description='The Cohere API key, used for computing `cohere` embeddings.'
)
PALM_API_KEY: str = PydanticField(
description='The PaLM API key, used for computing `palm` embeddings.'
)

# HuggingFace demo.
HF_ACCESS_TOKEN: str = PydanticField(
Expand Down
12 changes: 11 additions & 1 deletion lilac/load_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,16 @@
from .env import set_project_dir
from .load import load
from .project import PROJECT_CONFIG_FILENAME, init
from .schema import EMBEDDING_KEY, Field, Item, RichData, chunk_embedding, field, schema
from .schema import (
EMBEDDING_KEY,
EmbeddingInfo,
Field,
Item,
RichData,
chunk_embedding,
field,
schema,
)
from .signal import (
TextEmbeddingSignal,
TextSignal,
Expand Down Expand Up @@ -275,6 +284,7 @@ def test_load_embeddings(tmp_path: pathlib.Path) -> None:
'test_embedding': field(
signal=TestEmbedding().model_dump(exclude_none=True),
fields=[field('string_span', fields={EMBEDDING_KEY: 'embedding'})],
embedding=EmbeddingInfo(input_path=('str',), embedding='test_embedding'),
)
},
),
Expand Down
10 changes: 5 additions & 5 deletions lilac/router_concept.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@

@router.get('/', response_model_exclude_none=True)
def get_concepts(
user: Annotated[Optional[UserInfo], Depends(get_session_user)]
user: Annotated[Optional[UserInfo], Depends(get_session_user)],
) -> list[ConceptInfo]:
"""List the concepts."""
return DISK_CONCEPT_DB.list(user)
Expand Down Expand Up @@ -271,8 +271,9 @@ def generate_examples(description: str) -> list[str]:
client = instructor.patch(openai.OpenAI())

completion = client.chat.completions.create(
model='gpt-3.5-turbo',
functions=[Examples.openai_schema],
model='gpt-3.5-turbo-1106',
response_model=Examples,
temperature=0.0,
messages=[
{
'role': 'system',
Expand All @@ -284,8 +285,7 @@ def generate_examples(description: str) -> list[str]:
},
],
)
result = Examples.from_response(completion)
return result.examples
return completion.examples

except openai.AuthenticationError:
raise ValueError(
Expand Down
Loading

0 comments on commit e8cc67b

Please sign in to comment.