Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Prompt templates #18

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
184 changes: 56 additions & 128 deletions toponymy/llm_wrappers.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,19 @@
import string
import re
from warnings import warn
import json

import tokenizers
import transformers

_GET_TOPIC_NAME_REGEX = r'\{\s*"topic_name":\s*.*?, "topic_specificity":\s*\d+\.\d+\s*\}'
_GET_TOPIC_CLUSTER_NAMES_REGEX = r'\{\s*"new_topic_name_mapping":\s*.*?, "topic_specificities": .*?\}'

try:

import llama_cpp


class LlamaCppWrapper:

def __init__(self, model_path, **kwargs):
Expand All @@ -32,37 +38,56 @@ def generate_topic_cluster_names(self, prompt, old_names, temperature=0.5):
try:
topic_name_info_raw = self.llm(prompt, temperature=temperature)
topic_name_info_text = topic_name_info_raw["choices"][0]["text"]
topic_name_info = json.loads(topic_name_info_text)
result = []
for old_name, name_mapping in zip(old_names, topic_name_info):
if old_name.lower() == list(name_mapping.keys())[0].lower():
result.append(list(name_mapping.values()[0]))
else:
result.append(old_name)

topic_name_info = re.findall(_GET_TOPIC_CLUSTER_NAMES_REGEX, topic_name_info_text)[0]
topic_name_info = json.loads(topic_name_info)
mapping = topic_name_info["new_topic_name_mapping"]
result = [mapping.get(f"{n}. {name}", name) for n, name in enumerate(old_names)]
return result
except:
return old_names

def llm_instruction(self, kind="base_layer"):
    """Return the instruction suffix appended to a prompt for this model.

    Parameters
    ----------
    kind : str
        One of ``"base_layer"``, ``"intermediate_layer"``, or ``"remedy"``,
        selecting the phrasing appropriate to the topic-naming stage.

    Returns
    -------
    str
        The instruction text to append to the prompt.

    Raises
    ------
    ValueError
        If ``kind`` is not one of the three recognized values.
    """
    if kind == "base_layer":
        # Fixed typo: "distinguising" -> "distinguishing"
        return "\nThe short distinguishing topic name is:\n"
    elif kind == "intermediate_layer":
        return "\nThe short topic name that encompasses the sub-topics is:\n"
    elif kind == "remedy":
        return "\nA better and more specific name that still captures the topic of these article titles is:\n"
    else:
        # Fixed typo in the error message: "llm_imnstruction" -> "llm_instruction"
        raise ValueError(
            f"Invalid llm_instruction kind; should be one of 'base_layer', 'intermediate_layer', or 'remedy' not '{kind}'"
        )

except ImportError:
pass

try:
import huggingface_hub
import transformers

class HuggingFaceWrapper:

import json
def __init__(self, model, **kwargs):
    """Wrap a Hugging Face model behind a text-generation pipeline.

    Parameters
    ----------
    model : str or model object
        Model identifier (or loaded model) passed straight to
        ``transformers.pipeline``.
    **kwargs
        Extra keyword arguments forwarded to ``transformers.pipeline``.
    """
    # Build the pipeline once up front; keep the identifier for reference.
    self.llm = transformers.pipeline("text-generation", model=model, **kwargs)
    self.model = model

def generate_topic_name(self, prompt, temperature=0.8):
    """Generate a single topic name from the LLM for the given prompt.

    The model's chat-style output is scanned for a JSON object matching
    ``_GET_TOPIC_NAME_REGEX`` and the ``"topic_name"`` field is extracted.

    Parameters
    ----------
    prompt : str
        Fully rendered prompt to send to the text-generation pipeline.
    temperature : float
        Sampling temperature forwarded to the pipeline.

    Returns
    -------
    str
        The extracted topic name, or ``""`` if generation or parsing fails.
    """
    try:
        topic_name_info_raw = self.llm(prompt, max_length=256, temperature=temperature)
        # Pipeline returns a list of generations; take the last chat
        # message's content from the first generation.
        topic_name_info_text = topic_name_info_raw[0]["generated_text"][-1]['content']
        # Pull the first JSON-looking span out of the free-form model text.
        topic_name_info = re.findall(_GET_TOPIC_NAME_REGEX, topic_name_info_text)[0]
        topic_name_info = json.loads(topic_name_info)
        topic_name = topic_name_info["topic_name"]
    except Exception:
        # Was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt; deliberately best-effort, so fall back to "".
        topic_name = ""

    return topic_name

def generate_topic_cluster_names(self, prompt, old_names, temperature=0.5):
    """Ask the LLM to rename a cluster of topics, falling back to old names.

    The model output is scanned for a JSON object matching
    ``_GET_TOPIC_CLUSTER_NAMES_REGEX``; its ``"new_topic_name_mapping"``
    dict maps keys of the form ``"<index>. <old name>"`` to new names.

    Parameters
    ----------
    prompt : str
        Fully rendered prompt to send to the text-generation pipeline.
    old_names : list of str
        Current topic names, used both to build lookup keys and as the
        fallback when no mapping entry exists (or generation fails).
    temperature : float
        Sampling temperature forwarded to the pipeline.

    Returns
    -------
    list of str
        New topic names, one per entry of ``old_names``.
    """
    try:
        topic_name_info_raw = self.llm(prompt, max_length=1024, temperature=temperature)
        topic_name_info_text = topic_name_info_raw[0]["generated_text"][-1]['content']
        topic_name_info = re.findall(_GET_TOPIC_CLUSTER_NAMES_REGEX, topic_name_info_text)[0]
        topic_name_info = json.loads(topic_name_info)
        mapping = topic_name_info["new_topic_name_mapping"]
        # NOTE(review): keys are assumed to be "0. name", "1. name", ... —
        # confirm the prompt template numbers topics from 0, not 1.
        result = [mapping.get(f"{n}. {name}", name) for n, name in enumerate(old_names)]
        return result
    except Exception:
        # Was a bare `except:`; keep best-effort semantics but stop
        # swallowing SystemExit/KeyboardInterrupt.
        return old_names


except ImportError:
pass

try:
import cohere

class CohereWrapper:
Expand Down Expand Up @@ -92,66 +117,29 @@ def generate_topic_name(self, prompt, temperature=0.5):
topic_name = ""
return topic_name

def generate_topic_cluster_names(self, prompt, old_names, temperature=0.5):
def generate_topic_cluster_names(self, prompt, old_names, temperature=0.8):
    """Rename a cluster of topics via the Cohere chat API.

    Requests a JSON response whose ``"new_topic_name_mapping"`` dict maps
    keys of the form ``"<index>. <old name>"`` to new names; entries with
    no mapping keep their old name. On any failure a warning is issued and
    ``old_names`` is returned unchanged.
    """
    try:
        topic_name_info_raw = self.llm.chat(
            message=prompt,
            model=self.model,
            temperature=temperature,
            # Ask Cohere to return strict JSON so json.loads below succeeds.
            response_format={ "type": "json_object" },
            max_tokens=2048,
        )
        topic_name_info_text = topic_name_info_raw.text
        topic_name_info = json.loads(topic_name_info_text)
        mapping = topic_name_info["new_topic_name_mapping"]
        # NOTE(review): assumes the prompt numbers topics from 0 — confirm
        # against the prompt template.
        result = [mapping.get(f"{n}. {name}", name) for n, name in enumerate(old_names)]
        return result
    except Exception as e:
        warn(f"Failed to generate topic cluster names with Cohere: {e}")
        return old_names

result = []
for old_name, name_mapping in zip(old_names, topic_name_info):
try:
if old_name.lower() == list(name_mapping.keys())[0].lower():
result.append(list(name_mapping.values())[0])
else:
warn(
f"Old name {old_name} does not match the new name {list(name_mapping.keys())[0]}"
)
# use old_name?
result.append(list(name_mapping.values())[0])
except:
result.append(old_name)

return result

def llm_instruction(self, kind="base_layer"):
    """Return the instruction block appended to a Cohere prompt.

    Parameters
    ----------
    kind : str
        One of ``"base_layer"``, ``"intermediate_layer"``, or ``"remedy"``.

    Returns
    -------
    str
        Instruction text describing the expected JSON response format.

    Raises
    ------
    ValueError
        If ``kind`` is not one of the three recognized values.
    """
    if kind == "base_layer":
        # Fixed grammar: "describing the all example texts" -> "describing all the example texts"
        return """
You are to give a brief (five to ten word) name describing this group.
The topic name should be as specific as you can reasonably make it, while still describing all the example texts.
The response should be in JSON formatted as {"topic_name":<NAME>, "topic_specificity":<SCORE>} where SCORE is a value in the range 0 to 1.
"""
    elif kind == "intermediate_layer":
        return """
You are to give a brief (three to five word) name describing this group of papers.
The topic should be the most specific topic that encompasses the breadth of sub-topics, with a focus on the major sub-topics.
The response should be in JSON formatted as {"topic_name":<NAME>, "topic_specificity":<SCORE>} where SCORE is a value in the range 0 to 1.
"""
    elif kind == "remedy":
        return """
You are to give a brief (three to ten word) name describing this group of papers that better captures the specific details of this group.
The topic should be the most specific topic that encompasses the full breadth of sub-topics.
The response should be in JSON formatted as {"topic_name":<NAME>, "less_specific_topic_name":<NAME>, "topic_specificity":<SCORE>} where SCORE is a value in the range 0 to 1.
"""
    else:
        # Fixed typo in the error message: "llm_imnstruction" -> "llm_instruction"
        raise ValueError(
            f"Invalid llm_instruction kind; should be one of 'base_layer', 'intermediate_layer', or 'remedy' not '{kind}'"
        )

except:
pass

try:

import json

import anthropic

class AnthropicWrapper:
Expand Down Expand Up @@ -187,47 +175,16 @@ def generate_topic_cluster_names(self, prompt, old_names, temperature=0.5):
)
topic_name_info_text = topic_name_info_raw.content[0].text
topic_name_info = json.loads(topic_name_info_text)
result = []
for old_name, name_mapping in zip(old_names, topic_name_info):
if old_name.lower() == list(name_mapping.keys())[0].lower():
result.append(list(name_mapping.values()[0]))
else:
result.append(old_name)

mapping = topic_name_info["new_topic_name_mapping"]
result = [mapping.get(f"{n}. {name}", name) for n, name in enumerate(old_names)]
return result
except:
return old_names

def llm_instruction(self, kind="base_layer"):
    """Return the instruction block appended to an Anthropic prompt.

    Parameters
    ----------
    kind : str
        One of ``"base_layer"``, ``"intermediate_layer"``, or ``"remedy"``.

    Returns
    -------
    str
        Instruction text describing the expected JSON-only response format.

    Raises
    ------
    ValueError
        If ``kind`` is not one of the three recognized values.
    """
    if kind == "base_layer":
        # Fixed grammar: "describing the all example texts" -> "describing all the example texts"
        return """
You are to give a brief (five to ten word) name describing this group.
The topic name should be as specific as you can reasonably make it, while still describing all the example texts.
The response should be only JSON with no preamble formatted as {"topic_name":<NAME>, "topic_specificity":<SCORE>} where SCORE is a value in the range 0 to 1.
"""
    elif kind == "intermediate_layer":
        return """
You are to give a brief (three to five word) name describing this group of papers.
The topic should be the most specific topic that encompasses the breadth of sub-topics, with a focus on the major sub-topics.
The response should be only JSON with no preamble formatted as {"topic_name":<NAME>, "topic_specificity":<SCORE>} where SCORE is a value in the range 0 to 1.
"""
    elif kind == "remedy":
        return """
You are to give a brief (five to ten word) name describing this group of papers that better captures the specific details of this group.
The topic should be the most specific topic that encompasses the full breadth of sub-topics.
The response should be only JSON with no preamble formatted as {"topic_name":<NAME>, "less_specific_topic_name":<NAME>, "topic_specificity":<SCORE>} where SCORE is a value in the range 0 to 1.
"""
    else:
        # Fixed typo in the error message: "llm_imnstruction" -> "llm_instruction"
        raise ValueError(
            f"Invalid llm_instruction kind; should be one of 'base_layer', 'intermediate_layer', or 'remedy' not '{kind}'"
        )

except:
pass

try:
import json

import openai

class OpenAIWrapper:
Expand Down Expand Up @@ -268,40 +225,11 @@ def generate_topic_cluster_names(self, prompt, old_names, temperature=0.5):
)
topic_name_info_text = topic_name_info_raw.choices[0].message.content
topic_name_info = json.loads(topic_name_info_text)
result = []
for old_name, name_mapping in zip(old_names, topic_name_info):
if old_name.lower() == list(name_mapping.keys())[0].lower():
result.append(list(name_mapping.values()[0]))
else:
result.append(old_name)

mapping = topic_name_info["new_topic_name_mapping"]
result = [mapping.get(f"{n}. {name}", name) for n, name in enumerate(old_names)]
return result
except:
return old_names

def llm_instruction(self, kind="base_layer"):
    """Return the instruction block appended to an OpenAI prompt.

    Parameters
    ----------
    kind : str
        One of ``"base_layer"``, ``"intermediate_layer"``, or ``"remedy"``.

    Returns
    -------
    str
        Instruction text describing the expected JSON-only response format.

    Raises
    ------
    ValueError
        If ``kind`` is not one of the three recognized values.
    """
    if kind == "base_layer":
        # Fixed grammar: "describing the all example texts" -> "describing all the example texts"
        return """
You are to give a brief (five to ten word) name describing this group.
The topic name should be as specific as you can reasonably make it, while still describing all the example texts.
The response must be **ONLY** JSON with no preamble formatted as {"topic_name":<NAME>, "topic_specificity":<SCORE>} where SCORE is a value in the range 0 to 1.
"""
    elif kind == "intermediate_layer":
        return """
You are to give a brief (three to five word) name describing this group of papers.
The topic should be the most specific topic that encompasses the breadth of sub-topics, with a focus on the major sub-topics.
The response should be only JSON with no preamble formatted as {"topic_name":<NAME>, "topic_specificity":<SCORE>} where SCORE is a value in the range 0 to 1.
"""
    elif kind == "remedy":
        return """
You are to give a brief (five to ten word) name describing this group of papers that better captures the specific details of this group.
The topic should be the most specific topic that encompasses the full breadth of sub-topics.
The response should be only JSON with no preamble formatted as {"topic_name":<NAME>, "less_specific_topic_name":<NAME>, "topic_specificity":<SCORE>} where SCORE is a value in the range 0 to 1.
"""
    else:
        # Fixed typo in the error message: "llm_imnstruction" -> "llm_instruction"
        raise ValueError(
            f"Invalid llm_instruction kind; should be one of 'base_layer', 'intermediate_layer', or 'remedy' not '{kind}'"
        )

except:
pass
Loading