Commit
add llama-3 instruct. add snowflake and bge-m3 embedding models. fix device usage for embeds. improve labeling prompt. upgrade tiktoken and other requirements
enjalot committed May 15, 2024
1 parent e9083fd commit 304f995
Showing 6 changed files with 255 additions and 200 deletions.
32 changes: 15 additions & 17 deletions latentscope/models/chat_models.json
@@ -10,10 +10,10 @@
  },
  {
    "provider": "openai",
-   "name": "gpt-3.5-turbo-0125",
-   "id": "openai-gpt-3.5-turbo",
+   "name": "gpt-4o",
+   "id": "openai-gpt-4o",
    "params": {
-     "max_tokens": 4096
+     "max_tokens": 128000
    }
  },
  {
@@ -24,6 +24,14 @@
"max_tokens": 128000
}
},
{
"provider": "openai",
"name": "gpt-4-turbo",
"id": "openai-gpt-4-turbo",
"params": {
"max_tokens": 128000
}
},
{
"provider": "openai",
"name": "gpt-4",
@@ -58,21 +66,11 @@
  },
  {
    "provider": "transformers",
-   "name": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
-   "sanitized_name": "TinyLlama___TinyLlama-1.1B-Chat-v1.0",
-   "id": "transformers-TinyLlama___TinyLlama-1.1B-Chat-v1.0",
+   "name": "meta-llama/Meta-Llama-3-8B-Instruct",
+   "sanitized_name": "meta-llama___Meta-Llama-3-8B-Instruct",
+   "id": "transformers-meta-llama___Meta-Llama-3-8B-Instruct",
    "params": {
-     "max_tokens": 2048
+     "max_tokens": 8192
    }
- },
- {
-   "provider": "transformers",
-   "name": "HuggingFaceH4/zephyr-7b-beta",
-   "sanitized_name": "HuggingFaceH4___zephyr-7b-beta",
-   "id": "transformers-HuggingFaceH4___zephyr-7b-beta",
-   "params": {
-     "max_tokens": 4096
-   }
- }
+ }

]
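For context, each entry in this registry pairs a provider with a model name and a token budget, and downstream code resolves entries by `id`. A minimal sketch of such a lookup (the helper below is illustrative, not latentscope's actual API):

```python
import json

def find_chat_model(model_id, path="latentscope/models/chat_models.json"):
    """Illustrative helper: return the registry entry matching an id."""
    with open(path) as f:
        models = json.load(f)
    for entry in models:
        if entry["id"] == model_id:
            return entry
    raise KeyError(f"unknown chat model: {model_id}")

entry = find_chat_model("transformers-meta-llama___Meta-Llama-3-8B-Instruct")
print(entry["name"], entry["params"]["max_tokens"])
# meta-llama/Meta-Llama-3-8B-Instruct 8192
```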
41 changes: 40 additions & 1 deletion latentscope/models/embedding_models.json
@@ -55,6 +55,7 @@
"modality": "text",
"params": {
"max_tokens": 8192,
"rps": true,
"truncation": true,
"padding": true,
"pooling": "mean"
@@ -68,12 +69,39 @@
"modality": "text",
"params": {
"max_tokens": 8192,
"rps": true,
"truncation": true,
"padding": true,
"pooling": "mean",
"dimensions": [768, 512, 256, 128, 64]
}
},
{
"provider": "transformers",
"name": "Snowflake/snowflake-arctic-embed-s",
"sanitized_name": "Snowflake___snowflake-arctic-embed-s",
"id": "transformers-Snowflake___snowflake-arctic-embed-s",
"modality": "text",
"params": {
"truncation": true,
"padding": true,
"pooling": "cls"
}
},
{
"provider": "transformers",
"name": "Snowflake/snowflake-arctic-embed-m-long",
"sanitized_name": "Snowflake___snowflake-arctic-embed-m-long",
"id": "transformers-Snowflake___snowflake-arctic-embed-m-long",
"modality": "text",
"params": {
"max_tokens": 8192,
"rps": true,
"truncation": true,
"padding": true,
"pooling": "cls"
}
},
  {
    "provider": "transformers",
    "name": "intfloat/e5-large-v2",
@@ -97,7 +125,18 @@
"padding": true,
"pooling": "mean"
}
},
},{
"provider": "transformers",
"name": "BAAI/bge-m3",
"sanitized_name": "BAAI___bge-m3",
"id": "transformers-BAAI___bge-m3",
"modality": "text",
"params": {
"truncation": true,
"padding": true,
"pooling": "cls"
}
},
  {
    "provider": "transformers",
    "name": "BAAI/bge-large-en-v1.5",
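The new Snowflake and BGE-M3 entries request `"pooling": "cls"`, while the Jina and Nomic entries above use `"mean"`; `"rps": true` flags models whose rotary position embeddings should be scaled at load time (consumed by the transformers provider below). A rough sketch of the two pooling strategies, mirroring the provider's `cls_pooling`/`mean_pooling` (shapes invented for illustration):

```python
import torch

def cls_pool(last_hidden_state):
    # use the first ([CLS]) token's embedding as the sentence vector
    return last_hidden_state[:, 0]

def mean_pool(last_hidden_state, attention_mask):
    # average token embeddings, ignoring padded positions
    mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
    return (last_hidden_state * mask).sum(1) / mask.sum(1).clamp(min=1e-9)

hidden = torch.randn(2, 5, 8)                                 # (batch, tokens, dim)
attn = torch.tensor([[1, 1, 1, 0, 0], [1, 1, 1, 1, 1]])       # 1 = real token
print(cls_pool(hidden).shape, mean_pool(hidden, attn).shape)  # torch.Size([2, 8]) twice
```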
7 changes: 2 additions & 5 deletions latentscope/models/providers/openai.py
@@ -13,11 +13,7 @@ def load_model(self):
print("ERROR: No API key found for OpenAI")
print("Missing 'OPENAI_API_KEY' variable in:", f"{os.getcwd()}/.env")
self.client = OpenAI(api_key=api_key)
# special case for the new embedding models
if self.name in ["text-embedding-3-small", "text-embedding-3-large"]:
self.encoder = tiktoken.get_encoding("cl100k_base")
else:
self.encoder = tiktoken.encoding_for_model(self.name)
self.encoder = tiktoken.encoding_for_model(self.name)

    def embed(self, inputs, dimensions=None):
        time.sleep(0.01) # TODO proper rate limiting
@@ -46,6 +42,7 @@ def load_model(self):
        self.client = OpenAI(api_key=get_key("OPENAI_API_KEY"))
        self.encoder = tiktoken.encoding_for_model(self.name)

+
    def chat(self, messages):
        response = self.client.chat.completions.create(
            model=self.name,
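The deleted special case dates from when tiktoken did not yet know the `text-embedding-3-*` model names. With the upgraded tiktoken pinned by this commit, `encoding_for_model` resolves them directly, which is presumably why the branch could go (a quick check, assuming tiktoken >= 0.6):

```python
import tiktoken

# Newer tiktoken releases map the text-embedding-3-* models to cl100k_base
# themselves, so no manual get_encoding("cl100k_base") fallback is needed.
enc = tiktoken.encoding_for_model("text-embedding-3-small")
print(enc.name)                         # cl100k_base
print(len(enc.encode("hello world")))   # 2 tokens
```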
31 changes: 26 additions & 5 deletions latentscope/models/providers/transformers.py
@@ -5,6 +5,7 @@ def __init__(self, name, params):
        super().__init__(name, params)
        import torch
        self.torch = torch
+       self.device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")

    def cls_pooling(self, model_output):
        return model_output[0][:, 0]
@@ -19,17 +20,26 @@ def mean_pooling(self, model_output, attention_mask):
        return self.torch.sum(token_embeddings * input_mask_expanded, 1) / self.torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    def load_model(self):
-       from transformers import AutoTokenizer, AutoModel, pipeline
+       from transformers import AutoTokenizer, AutoModel
+
+       if "rps" in self.params and self.params["rps"]:
+           self.model = AutoModel.from_pretrained(self.name, trust_remote_code=True, safe_serialization=True, rotary_scaling_factor=2 )
+       else:
+           self.model = AutoModel.from_pretrained(self.name, trust_remote_code=True)
+
+       print("CONFIG", self.model.config)
+
        if self.name == "nomic-ai/nomic-embed-text-v1" or self.name == "nomic-ai/nomic-embed-text-v1.5":
            self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", model_max_length=self.params["max_tokens"])
-           self.model = AutoModel.from_pretrained("nomic-ai/nomic-embed-text-v1", trust_remote_code=True, rotary_scaling_factor=2 )
        else:
            self.tokenizer = AutoTokenizer.from_pretrained(self.name)
-           self.model = AutoModel.from_pretrained(self.name, trust_remote_code=True)
+
+       self.model.to(self.device)
        self.model.eval()

    def embed(self, inputs, dimensions=None):
        encoded_input = self.tokenizer(inputs, padding=self.params["padding"], truncation=self.params["truncation"], return_tensors='pt')
+       encoded_input = {key: value.to(self.device) for key, value in encoded_input.items()}
        pool = self.params["pooling"]
        # Compute token embeddings
        with self.torch.no_grad():
@@ -52,16 +62,27 @@ def embed(self, inputs, dimensions=None):


class TransformersChatProvider(ChatModelProvider):
+   def __init__(self, name, params):
+       super().__init__(name, params)
+       import torch
+       from transformers import pipeline
+       self.torch = torch
+       self.pipeline = pipeline
+
    def load_model(self):
        # self.pipe = pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", torch_dtype=torch.bfloat16, device_map="auto")
        # self.pipe = pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", torch_dtype=torch.bfloat16, device_map="cpu")
        # TODO: support bfloat16 for non mac environmentss
-       self.pipe = pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", torch_dtype=torch.float16, device_map="auto")
+       self.pipe = self.pipeline("text-generation", model=self.name, torch_dtype=self.torch.float16, device_map="auto")
        self.encoder = self.pipe.tokenizer

    def chat(self, messages, max_new_tokens=24):
        prompt = self.pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        outputs = self.pipe(prompt, max_new_tokens=max_new_tokens, do_sample=True, temperature=0.7, top_k=50, top_p=0.95)
        generated_text = outputs[0]["generated_text"]
        print("GENERATED TEXT", generated_text)
-       return generated_text.split("<|assistant|>")[1].strip()
+       if "<|start_header_id|>assistant<|end_header_id|>" in generated_text:
+           generated_text = generated_text.split("<|start_header_id|>assistant<|end_header_id|>")[1].strip()
+       elif "<|assistant|>" in generated_text:
+           generated_text = generated_text.split("<|assistant|>")[1].strip()
+       return generated_text
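The new `chat()` splitting handles both template families: Llama 3 marks the assistant turn with `<|start_header_id|>assistant<|end_header_id|>`, while Zephyr/TinyLlama-style templates use `<|assistant|>`. A toy illustration of why splitting on the header recovers just the completion (the prompt string is a hand-written approximation of the Llama 3 template, not output from the real tokenizer):

```python
# pipeline("text-generation") returns prompt + completion, so chat() has to
# cut everything up to and including the assistant header.
prompt = (
    "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
    "You summarize lists.<|eot_id|>"
    "<|start_header_id|>user<|end_header_id|>\n\n"
    "- apples\n- pears<|eot_id|>"
    "<|start_header_id|>assistant<|end_header_id|>\n\n"
)
generated_text = prompt + "Fruit Varieties"
label = generated_text.split("<|start_header_id|>assistant<|end_header_id|>")[1].strip()
print(label)  # Fruit Varieties
```

Note the split takes index `[1]`, so it assumes the marker appears exactly once; with a single assistant turn per request that holds.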
6 changes: 3 additions & 3 deletions latentscope/scripts/label_clusters.py
@@ -87,10 +87,10 @@ def labeler(dataset_id, text_column="text", cluster_id="cluster-001", model_id="
    model.load_model()
    enc = model.encoder

-   system_prompt = {"role":"system", "content": f"""You're job is to summarize lists of items with a short label of no more than 4 words.
+   system_prompt = {"role":"system", "content": f"""You're job is to summarize lists of items with a short label of no more than 4 words. The items are part of a cluster and the label will be used to distinguish this cluster from others, so pay attention to what makes this group of similar items distinct.
{context}
The user will submit a bulleted list of items and you should choose a label that best summarizes the theme of the list so that someone browsing the labels will have a good idea of what is in the list.
-Do not use punctuation, just return a few words that summarize the list."""}
+Do not use punctuation, Do not explain yourself, respond with only a few words that summarize the list."""}

    # TODO: why the extra 10 for openai?
    max_tokens = model.params["max_tokens"] - len(enc.encode(system_prompt["content"])) - 10
@@ -129,7 +129,7 @@ def labeler(dataset_id, text_column="text", cluster_id="cluster-001", model_id="
        try:
            time.sleep(0.01)
            messages=[
-               system_prompt, {"role":"user", "content": batch[0]} # TODO hardcoded batch size
+               system_prompt, {"role":"user", "content": "Here is a list of items, please summarize the list into a label using only a few words:\n" + batch[0]} # TODO hardcoded batch size
            ]
            label = model.chat(messages)
            labels.append(label)
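Putting the prompt changes together, a hedged sketch of the request the labeler now builds (cluster items invented; the system prompt is abbreviated and the real one also interpolates `{context}`):

```python
# Abbreviated version of the real system prompt, for illustration only.
system_prompt = {"role": "system", "content": (
    "You're job is to summarize lists of items with a short label of no more than 4 words. "
    "The items are part of a cluster and the label will be used to distinguish this cluster "
    "from others... Do not use punctuation, Do not explain yourself, respond with only a few "
    "words that summarize the list."
)}
batch = ["- refund request emails\n- return policy questions\n- shipping delay complaints"]
messages = [
    system_prompt,
    {"role": "user", "content": "Here is a list of items, please summarize the list into a label using only a few words:\n" + batch[0]},
]
# label = model.chat(messages)  # e.g. "Customer Order Issues"
```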
