Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Prompt templates #18

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
184 changes: 56 additions & 128 deletions toponymy/llm_wrappers.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,19 @@
import string
import re
from warnings import warn
import json

import tokenizers
import transformers

_GET_TOPIC_NAME_REGEX = r'\{\s*"topic_name":\s*.*?, "topic_specificity":\s*\d+\.\d+\s*\}'
_GET_TOPIC_CLUSTER_NAMES_REGEX = r'\{\s*"new_topic_name_mapping":\s*.*?, "topic_specificities": .*?\}'

try:

import llama_cpp


class LlamaCppWrapper:

def __init__(self, model_path, **kwargs):
Expand All @@ -32,37 +38,56 @@ def generate_topic_cluster_names(self, prompt, old_names, temperature=0.5):
try:
topic_name_info_raw = self.llm(prompt, temperature=temperature)
topic_name_info_text = topic_name_info_raw["choices"][0]["text"]
topic_name_info = json.loads(topic_name_info_text)
result = []
for old_name, name_mapping in zip(old_names, topic_name_info):
if old_name.lower() == list(name_mapping.keys())[0].lower():
result.append(list(name_mapping.values()[0]))
else:
result.append(old_name)

topic_name_info = re.findall(_GET_TOPIC_CLUSTER_NAMES_REGEX, topic_name_info_text)[0]
topic_name_info = json.loads(topic_name_info)
mapping = topic_name_info["new_topic_name_mapping"]
result = [mapping.get(f"{n}. {name}", name) for n, name in enumerate(old_names)]
return result
except:
return old_names

def llm_instruction(self, kind="base_layer"):
    """Return the instruction suffix appended to a prompt for this model.

    Parameters
    ----------
    kind : str
        One of ``"base_layer"``, ``"intermediate_layer"``, or ``"remedy"``,
        selecting the phrasing appropriate to the topic-naming stage.

    Returns
    -------
    str
        The instruction text to append to the prompt.

    Raises
    ------
    ValueError
        If ``kind`` is not one of the three recognized values.
    """
    if kind == "base_layer":
        # Fixed typo: "distinguising" -> "distinguishing"
        return "\nThe short distinguishing topic name is:\n"
    elif kind == "intermediate_layer":
        return "\nThe short topic name that encompasses the sub-topics is:\n"
    elif kind == "remedy":
        return "\nA better and more specific name that still captures the topic of these article titles is:\n"
    else:
        # Fixed typo in the error message: "llm_imnstruction" -> "llm_instruction"
        raise ValueError(
            f"Invalid llm_instruction kind; should be one of 'base_layer', 'intermediate_layer', or 'remedy' not '{kind}'"
        )

except ImportError:
pass

try:
import huggingface_hub
import transformers

class HuggingFaceWrapper:

import json
def __init__(self, model, **kwargs):
    """Wrap a Hugging Face model behind a text-generation pipeline.

    Parameters
    ----------
    model : str or model object
        Model identifier (or loaded model) passed straight to
        ``transformers.pipeline``.
    **kwargs
        Extra keyword arguments forwarded to ``transformers.pipeline``.
    """
    # Build the pipeline once up front; keep the identifier for reference.
    self.llm = transformers.pipeline("text-generation", model=model, **kwargs)
    self.model = model

def generate_topic_name(self, prompt, temperature=0.8):
    """Generate a single topic name from the LLM for the given prompt.

    The model's chat-style output is scanned for a JSON object matching
    ``_GET_TOPIC_NAME_REGEX`` and the ``"topic_name"`` field is extracted.

    Parameters
    ----------
    prompt : str
        Fully rendered prompt to send to the text-generation pipeline.
    temperature : float
        Sampling temperature forwarded to the pipeline.

    Returns
    -------
    str
        The extracted topic name, or ``""`` if generation or parsing fails.
    """
    try:
        topic_name_info_raw = self.llm(prompt, max_length=256, temperature=temperature)
        # Pipeline returns a list of generations; take the last chat
        # message's content from the first generation.
        topic_name_info_text = topic_name_info_raw[0]["generated_text"][-1]['content']
        # Pull the first JSON-looking span out of the free-form model text.
        topic_name_info = re.findall(_GET_TOPIC_NAME_REGEX, topic_name_info_text)[0]
        topic_name_info = json.loads(topic_name_info)
        topic_name = topic_name_info["topic_name"]
    except Exception:
        # Was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt; deliberately best-effort, so fall back to "".
        topic_name = ""

    return topic_name

def generate_topic_cluster_names(self, prompt, old_names, temperature=0.5):
    """Ask the LLM to rename a cluster of topics, falling back to old names.

    The model output is scanned for a JSON object matching
    ``_GET_TOPIC_CLUSTER_NAMES_REGEX``; its ``"new_topic_name_mapping"``
    dict maps keys of the form ``"<index>. <old name>"`` to new names.

    Parameters
    ----------
    prompt : str
        Fully rendered prompt to send to the text-generation pipeline.
    old_names : list of str
        Current topic names, used both to build lookup keys and as the
        fallback when no mapping entry exists (or generation fails).
    temperature : float
        Sampling temperature forwarded to the pipeline.

    Returns
    -------
    list of str
        New topic names, one per entry of ``old_names``.
    """
    try:
        topic_name_info_raw = self.llm(prompt, max_length=1024, temperature=temperature)
        topic_name_info_text = topic_name_info_raw[0]["generated_text"][-1]['content']
        topic_name_info = re.findall(_GET_TOPIC_CLUSTER_NAMES_REGEX, topic_name_info_text)[0]
        topic_name_info = json.loads(topic_name_info)
        mapping = topic_name_info["new_topic_name_mapping"]
        # NOTE(review): keys are assumed to be "0. name", "1. name", ... —
        # confirm the prompt template numbers topics from 0, not 1.
        result = [mapping.get(f"{n}. {name}", name) for n, name in enumerate(old_names)]
        return result
    except Exception:
        # Was a bare `except:`; keep best-effort semantics but stop
        # swallowing SystemExit/KeyboardInterrupt.
        return old_names


except ImportError:
pass

try:
import cohere

class CohereWrapper:
Expand Down Expand Up @@ -92,66 +117,29 @@ def generate_topic_name(self, prompt, temperature=0.5):
topic_name = ""
return topic_name

def generate_topic_cluster_names(self, prompt, old_names, temperature=0.5):
def generate_topic_cluster_names(self, prompt, old_names, temperature=0.8):
    """Rename a cluster of topics via the Cohere chat API.

    Requests a JSON response whose ``"new_topic_name_mapping"`` dict maps
    keys of the form ``"<index>. <old name>"`` to new names; entries with
    no mapping keep their old name. On any failure a warning is issued and
    ``old_names`` is returned unchanged.
    """
    try:
        topic_name_info_raw = self.llm.chat(
            message=prompt,
            model=self.model,
            temperature=temperature,
            # Ask Cohere to return strict JSON so json.loads below succeeds.
            response_format={ "type": "json_object" },
            max_tokens=2048,
        )
        topic_name_info_text = topic_name_info_raw.text
        topic_name_info = json.loads(topic_name_info_text)
        mapping = topic_name_info["new_topic_name_mapping"]
        # NOTE(review): assumes the prompt numbers topics from 0 — confirm
        # against the prompt template.
        result = [mapping.get(f"{n}. {name}", name) for n, name in enumerate(old_names)]
        return result
    except Exception as e:
        warn(f"Failed to generate topic cluster names with Cohere: {e}")
        return old_names

result = []
for old_name, name_mapping in zip(old_names, topic_name_info):
try:
if old_name.lower() == list(name_mapping.keys())[0].lower():
result.append(list(name_mapping.values())[0])
else:
warn(
f"Old name {old_name} does not match the new name {list(name_mapping.keys())[0]}"
)
# use old_name?
result.append(list(name_mapping.values())[0])
except:
result.append(old_name)

return result

def llm_instruction(self, kind="base_layer"):
    """Return the instruction block appended to a Cohere prompt.

    Parameters
    ----------
    kind : str
        One of ``"base_layer"``, ``"intermediate_layer"``, or ``"remedy"``.

    Returns
    -------
    str
        Instruction text describing the expected JSON response format.

    Raises
    ------
    ValueError
        If ``kind`` is not one of the three recognized values.
    """
    if kind == "base_layer":
        # Fixed grammar: "describing the all example texts" -> "describing all the example texts"
        return """
You are to give a brief (five to ten word) name describing this group.
The topic name should be as specific as you can reasonably make it, while still describing all the example texts.
The response should be in JSON formatted as {"topic_name":<NAME>, "topic_specificity":<SCORE>} where SCORE is a value in the range 0 to 1.
"""
    elif kind == "intermediate_layer":
        return """
You are to give a brief (three to five word) name describing this group of papers.
The topic should be the most specific topic that encompasses the breadth of sub-topics, with a focus on the major sub-topics.
The response should be in JSON formatted as {"topic_name":<NAME>, "topic_specificity":<SCORE>} where SCORE is a value in the range 0 to 1.
"""
    elif kind == "remedy":
        return """
You are to give a brief (three to ten word) name describing this group of papers that better captures the specific details of this group.
The topic should be the most specific topic that encompasses the full breadth of sub-topics.
The response should be in JSON formatted as {"topic_name":<NAME>, "less_specific_topic_name":<NAME>, "topic_specificity":<SCORE>} where SCORE is a value in the range 0 to 1.
"""
    else:
        # Fixed typo in the error message: "llm_imnstruction" -> "llm_instruction"
        raise ValueError(
            f"Invalid llm_instruction kind; should be one of 'base_layer', 'intermediate_layer', or 'remedy' not '{kind}'"
        )

except:
pass

try:

import json

import anthropic

class AnthropicWrapper:
Expand Down Expand Up @@ -187,47 +175,16 @@ def generate_topic_cluster_names(self, prompt, old_names, temperature=0.5):
)
topic_name_info_text = topic_name_info_raw.content[0].text
topic_name_info = json.loads(topic_name_info_text)
result = []
for old_name, name_mapping in zip(old_names, topic_name_info):
if old_name.lower() == list(name_mapping.keys())[0].lower():
result.append(list(name_mapping.values()[0]))
else:
result.append(old_name)

mapping = topic_name_info["new_topic_name_mapping"]
result = [mapping.get(f"{n}. {name}", name) for n, name in enumerate(old_names)]
return result
except:
return old_names

def llm_instruction(self, kind="base_layer"):
    """Return the instruction block appended to an Anthropic prompt.

    Parameters
    ----------
    kind : str
        One of ``"base_layer"``, ``"intermediate_layer"``, or ``"remedy"``.

    Returns
    -------
    str
        Instruction text describing the expected JSON-only response format.

    Raises
    ------
    ValueError
        If ``kind`` is not one of the three recognized values.
    """
    if kind == "base_layer":
        # Fixed grammar: "describing the all example texts" -> "describing all the example texts"
        return """
You are to give a brief (five to ten word) name describing this group.
The topic name should be as specific as you can reasonably make it, while still describing all the example texts.
The response should be only JSON with no preamble formatted as {"topic_name":<NAME>, "topic_specificity":<SCORE>} where SCORE is a value in the range 0 to 1.
"""
    elif kind == "intermediate_layer":
        return """
You are to give a brief (three to five word) name describing this group of papers.
The topic should be the most specific topic that encompasses the breadth of sub-topics, with a focus on the major sub-topics.
The response should be only JSON with no preamble formatted as {"topic_name":<NAME>, "topic_specificity":<SCORE>} where SCORE is a value in the range 0 to 1.
"""
    elif kind == "remedy":
        return """
You are to give a brief (five to ten word) name describing this group of papers that better captures the specific details of this group.
The topic should be the most specific topic that encompasses the full breadth of sub-topics.
The response should be only JSON with no preamble formatted as {"topic_name":<NAME>, "less_specific_topic_name":<NAME>, "topic_specificity":<SCORE>} where SCORE is a value in the range 0 to 1.
"""
    else:
        # Fixed typo in the error message: "llm_imnstruction" -> "llm_instruction"
        raise ValueError(
            f"Invalid llm_instruction kind; should be one of 'base_layer', 'intermediate_layer', or 'remedy' not '{kind}'"
        )

except:
pass

try:
import json

import openai

class OpenAIWrapper:
Expand Down Expand Up @@ -268,40 +225,11 @@ def generate_topic_cluster_names(self, prompt, old_names, temperature=0.5):
)
topic_name_info_text = topic_name_info_raw.choices[0].message.content
topic_name_info = json.loads(topic_name_info_text)
result = []
for old_name, name_mapping in zip(old_names, topic_name_info):
if old_name.lower() == list(name_mapping.keys())[0].lower():
result.append(list(name_mapping.values()[0]))
else:
result.append(old_name)

mapping = topic_name_info["new_topic_name_mapping"]
result = [mapping.get(f"{n}. {name}", name) for n, name in enumerate(old_names)]
return result
except:
return old_names

def llm_instruction(self, kind="base_layer"):
    """Return the instruction block appended to an OpenAI prompt.

    Parameters
    ----------
    kind : str
        One of ``"base_layer"``, ``"intermediate_layer"``, or ``"remedy"``.

    Returns
    -------
    str
        Instruction text describing the expected JSON-only response format.

    Raises
    ------
    ValueError
        If ``kind`` is not one of the three recognized values.
    """
    if kind == "base_layer":
        # Fixed grammar: "describing the all example texts" -> "describing all the example texts"
        return """
You are to give a brief (five to ten word) name describing this group.
The topic name should be as specific as you can reasonably make it, while still describing all the example texts.
The response must be **ONLY** JSON with no preamble formatted as {"topic_name":<NAME>, "topic_specificity":<SCORE>} where SCORE is a value in the range 0 to 1.
"""
    elif kind == "intermediate_layer":
        return """
You are to give a brief (three to five word) name describing this group of papers.
The topic should be the most specific topic that encompasses the breadth of sub-topics, with a focus on the major sub-topics.
The response should be only JSON with no preamble formatted as {"topic_name":<NAME>, "topic_specificity":<SCORE>} where SCORE is a value in the range 0 to 1.
"""
    elif kind == "remedy":
        return """
You are to give a brief (five to ten word) name describing this group of papers that better captures the specific details of this group.
The topic should be the most specific topic that encompasses the full breadth of sub-topics.
The response should be only JSON with no preamble formatted as {"topic_name":<NAME>, "less_specific_topic_name":<NAME>, "topic_specificity":<SCORE>} where SCORE is a value in the range 0 to 1.
"""
    else:
        # Fixed typo in the error message: "llm_imnstruction" -> "llm_instruction"
        raise ValueError(
            f"Invalid llm_instruction kind; should be one of 'base_layer', 'intermediate_layer', or 'remedy' not '{kind}'"
        )

except:
pass
Loading