Skip to content

Commit

Permalink
Fix a few bugs related to concepts and clustering (#1121)
Browse files Browse the repository at this point in the history
https://huggingface.co/spaces/lilacai/daniel_staging (open capybara,
conversations.*.input has embedding index)

- Fix a bug where deleting a concept creates 404s
- Fix a bug with old instructor code that generates positive examples
- Fix a render bug with concepts with 0 positive examples
- Fix a bug with clustering where the text contains the "__value__"
substring, literally
- Fix a bug with lifecycle fastapi when server started in blocking mode
- Remove PALM which is timing out
- Add `EmbeddingInfo` to the field, so we can more easily find
embeddings
  • Loading branch information
dsmilkov authored Jan 24, 2024
1 parent 1a99b95 commit e8cc67b
Show file tree
Hide file tree
Showing 32 changed files with 105 additions and 497 deletions.
1 change: 0 additions & 1 deletion .env
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
# Get key from https://platform.openai.com/account/api-keys
# OPENAI_API_KEY=
# Get key from https://makersuite.google.com/app/apikey
# PALM_API_KEY=

# HuggingFace demos: machine that uploads to HuggingFace.

Expand Down
3 changes: 1 addition & 2 deletions docs/blog/introducing-lilac.md
Original file line number Diff line number Diff line change
Expand Up @@ -84,8 +84,7 @@ and usable with data staying on-premise. Lilac Concepts utilize powerful on-devi
[GTE](https://huggingface.co/thenlper/gte-small). However if your application is not sensitive to
data privacy (e.g. using open-source datasets), you may choose to use more powerful embeddings like
[OpenAI](https://platform.openai.com/docs/guides/embeddings),
[Cohere](https://docs.cohere.com/docs/embeddings), [PaLM](https://developers.generativeai.google/),
or your own! For more information on embeddings,
[Cohere](https://docs.cohere.com/docs/embeddings), or your own! For more information on embeddings,
[see our documentation](https://docs.lilacml.com/embeddings/embeddings.html).

### HuggingFace demo
Expand Down
2 changes: 1 addition & 1 deletion docs/concepts/concept_use.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ Applying a concept to text can be done in one of two ways:

A concept is simply a collection of positive and negative examples. To use a concept, we also need
to choose an _Embedding_. Some embeddings, like `gte` and `sbert` will run entirely on device. Other
embeddings like `openai`, `cohere`, or `palm` require an API key and will make a request to retrieve
embeddings like `openai` or `cohere` require an API key and will make a request to retrieve
embeddings. See [Embeddings](../embeddings/embeddings.md) for details on embeddings.

The quality of a concept depends on the chosen embedding. See [Concept Metrics](concept_metrics.md)
Expand Down
2 changes: 0 additions & 2 deletions docs/embeddings/embeddings.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,6 @@ Lilac has built-in **remote** embeddings. Using these will _send data to an exte
need to define `OPENAI_API_KEY` in your environment variables.
- [`cohere`](https://docs.cohere.com/docs/embeddings): Cohere embeddings. You will need to define
`COHERE_API_KEY` in your environment variables.
- [`palm`](https://cloud.google.com/vertex-ai/docs/generative-ai/embeddings/get-text-embeddings)
PaLM embeddings. You will need to define `PALM_API_KEY` in your environment variables.

## Register your own embedding

Expand Down
1 change: 0 additions & 1 deletion docs/getting_started/installation.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,4 +42,3 @@ To use hosted services for computing embeddings, add the following environment v
- `OPENAI_API_KEY`: OpenAI API key. You can get one
[here](https://platform.openai.com/account/api-keys).
- `COHERE_API_KEY`: Cohere API key. You can get one [here](https://dashboard.cohere.ai/api-keys).
- `PALM_API_KEY`: PaLM API key. You can get one [here](https://makersuite.google.com/app/apikey).
7 changes: 3 additions & 4 deletions docs/poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion lilac/data/clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,7 +324,7 @@ def _flatten_input(item: Item, input_path: PathTuple) -> str:
# Filter out Nones
texts = (t for t in texts if t)
# Deal with enriched items.
texts = (t[VALUE_KEY] if VALUE_KEY in t else t for t in texts)
texts = (t[VALUE_KEY] if (isinstance(t, dict) and VALUE_KEY in t) else t for t in texts)
return '\n'.join(texts)

def extract_text(item: Item) -> Item:
Expand Down
2 changes: 2 additions & 0 deletions lilac/data/dataset_compute_signal_chain_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

from ..schema import (
EMBEDDING_KEY,
EmbeddingInfo,
Field,
Item,
RichData,
Expand Down Expand Up @@ -144,6 +145,7 @@ def test_manual_embedding_signal(make_test_data: TestDataMaker, mocker: MockerFi
'test_embedding': field(
signal=TestEmbedding().model_dump(exclude_none=True),
fields=[field('string_span', fields={EMBEDDING_KEY: 'embedding'})],
embedding=EmbeddingInfo(input_path=('text',), embedding='test_embedding'),
),
},
)
Expand Down
4 changes: 4 additions & 0 deletions lilac/data/dataset_compute_signal_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from ..schema import (
EMBEDDING_KEY,
PATH_WILDCARD,
EmbeddingInfo,
Field,
Item,
RichData,
Expand Down Expand Up @@ -687,6 +688,7 @@ def test_embedding_signal(make_test_data: TestDataMaker, mocker: MockerFixture)
'test_embedding': field(
signal=embedding_signal.model_dump(exclude_none=True),
fields=[field('string_span', fields={EMBEDDING_KEY: 'embedding'})],
embedding=EmbeddingInfo(input_path=('text',), embedding='test_embedding'),
)
},
)
Expand Down Expand Up @@ -720,6 +722,7 @@ def test_embedding_signal_overwrite(make_test_data: TestDataMaker, mocker: Mocke
'test_embedding': field(
signal=embedding_signal.model_dump(exclude_none=True),
fields=[field('string_span', fields={EMBEDDING_KEY: 'embedding'})],
embedding=EmbeddingInfo(input_path=('text',), embedding='test_embedding'),
)
},
)
Expand Down Expand Up @@ -782,6 +785,7 @@ def test_delete_embedding(make_test_data: TestDataMaker, mocker: MockerFixture)
'test_embedding': field(
signal=embedding_signal.model_dump(exclude_none=True),
fields=[field('string_span', fields={EMBEDDING_KEY: 'embedding'})],
embedding=EmbeddingInfo(input_path=('text',), embedding='test_embedding'),
)
},
)
Expand Down
4 changes: 3 additions & 1 deletion lilac/data/dataset_duckdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@
TIMESTAMP,
VALUE_KEY,
Bin,
EmbeddingInfo,
Field,
Item,
MapFn,
Expand Down Expand Up @@ -1168,7 +1169,8 @@ def compute_embedding(

output_path = _col_destination_path(signal_col, is_computed_signal=True)
output_dir = os.path.join(self.dataset_path, _signal_dir(output_path))
signal_schema = create_signal_schema(signal, input_path, manifest.data_schema)
embedding_info = EmbeddingInfo(input_path=input_path, embedding=embedding)
signal_schema = create_signal_schema(signal, input_path, manifest.data_schema, embedding_info)

assert signal_schema, 'Signal schema should be defined for `TextEmbeddingSignal`.'

Expand Down
3 changes: 3 additions & 0 deletions lilac/data/dataset_select_rows_schema_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from ..schema import (
EMBEDDING_KEY,
PATH_WILDCARD,
EmbeddingInfo,
Field,
Item,
RichData,
Expand Down Expand Up @@ -382,6 +383,7 @@ def test_search_semantic_schema(make_test_data: TestDataMaker) -> None:
'test_embedding': field(
signal=test_embedding.model_dump(exclude_none=True),
fields=[field('string_span', fields={EMBEDDING_KEY: 'embedding'})],
embedding=EmbeddingInfo(input_path=('text',), embedding='test_embedding'),
),
expected_world_signal.key(): field(
signal=expected_world_signal.model_dump(exclude_none=True),
Expand Down Expand Up @@ -437,6 +439,7 @@ def test_search_concept_schema(make_test_data: TestDataMaker) -> None:
'test_embedding': field(
signal=test_embedding.model_dump(exclude_none=True),
fields=[field('string_span', fields={EMBEDDING_KEY: 'embedding'})],
embedding=EmbeddingInfo(input_path=('text',), embedding='test_embedding'),
),
expected_world_signal.key(): field(
signal=expected_world_signal.model_dump(exclude_none=True),
Expand Down
2 changes: 2 additions & 0 deletions lilac/data/dataset_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
EMBEDDING_KEY,
ROWID,
SPAN_KEY,
EmbeddingInfo,
Field,
Item,
MapType,
Expand Down Expand Up @@ -611,6 +612,7 @@ def test_config_from_dataset(make_test_data: TestDataMaker) -> None:
'test_embedding': field(
signal=TestEmbedding().model_dump(exclude_none=True),
fields=[field('string_span', fields={EMBEDDING_KEY: 'embedding'})],
embedding=EmbeddingInfo(input_path=('text',), embedding='test_embedding'),
),
},
)
Expand Down
8 changes: 7 additions & 1 deletion lilac/data/dataset_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
STRING,
TEXT_SPAN_END_FEATURE,
TEXT_SPAN_START_FEATURE,
EmbeddingInfo,
Field,
Item,
MapFn,
Expand Down Expand Up @@ -132,7 +133,10 @@ def create_json_map_output_schema(map_schema: Field, output_path: PathTuple) ->


def create_signal_schema(
signal: Signal, source_path: PathTuple, current_schema: Schema
signal: Signal,
source_path: PathTuple,
current_schema: Schema,
embedding_info: Optional[EmbeddingInfo] = None,
) -> Optional[Schema]:
"""Create a signal schema describing the enriched fields.
Expand All @@ -148,6 +152,8 @@ def create_signal_schema(
return None

signal_schema.signal = signal.model_dump(exclude_none=True)
if embedding_info:
signal_schema.embedding = embedding_info

enriched_schema = field(fields={signal.key(is_computed_signal=True): signal_schema})

Expand Down
96 changes: 0 additions & 96 deletions lilac/embeddings/palm.py

This file was deleted.

3 changes: 0 additions & 3 deletions lilac/env.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,6 @@ class LilacEnvironment(BaseModel):
COHERE_API_KEY: str = PydanticField(
description='The Cohere API key, used for computing `cohere` embeddings.'
)
PALM_API_KEY: str = PydanticField(
description='The PaLM API key, used for computing `palm` embeddings.'
)

# HuggingFace demo.
HF_ACCESS_TOKEN: str = PydanticField(
Expand Down
12 changes: 11 additions & 1 deletion lilac/load_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,16 @@
from .env import set_project_dir
from .load import load
from .project import PROJECT_CONFIG_FILENAME, init
from .schema import EMBEDDING_KEY, Field, Item, RichData, chunk_embedding, field, schema
from .schema import (
EMBEDDING_KEY,
EmbeddingInfo,
Field,
Item,
RichData,
chunk_embedding,
field,
schema,
)
from .signal import (
TextEmbeddingSignal,
TextSignal,
Expand Down Expand Up @@ -275,6 +284,7 @@ def test_load_embeddings(tmp_path: pathlib.Path) -> None:
'test_embedding': field(
signal=TestEmbedding().model_dump(exclude_none=True),
fields=[field('string_span', fields={EMBEDDING_KEY: 'embedding'})],
embedding=EmbeddingInfo(input_path=('str',), embedding='test_embedding'),
)
},
),
Expand Down
10 changes: 5 additions & 5 deletions lilac/router_concept.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@

@router.get('/', response_model_exclude_none=True)
def get_concepts(
user: Annotated[Optional[UserInfo], Depends(get_session_user)]
user: Annotated[Optional[UserInfo], Depends(get_session_user)],
) -> list[ConceptInfo]:
"""List the concepts."""
return DISK_CONCEPT_DB.list(user)
Expand Down Expand Up @@ -271,8 +271,9 @@ def generate_examples(description: str) -> list[str]:
client = instructor.patch(openai.OpenAI())

completion = client.chat.completions.create(
model='gpt-3.5-turbo',
functions=[Examples.openai_schema],
model='gpt-3.5-turbo-1106',
response_model=Examples,
temperature=0.0,
messages=[
{
'role': 'system',
Expand All @@ -284,8 +285,7 @@ def generate_examples(description: str) -> list[str]:
},
],
)
result = Examples.from_response(completion)
return result.examples
return completion.examples

except openai.AuthenticationError:
raise ValueError(
Expand Down
Loading

0 comments on commit e8cc67b

Please sign in to comment.