From 603557d5a61a387bbfe293943ce390930e9115ad Mon Sep 17 00:00:00 2001
From: Kalpesh Krishna
Date: Mon, 29 May 2023 14:39:13 -0400
Subject: [PATCH 1/4] Adding cost estimates for OpenAI API usage

---
 factscore/atomic_facts.py | 57 +++++++++++++++++++++++++----------
 factscore/factscorer.py   | 62 ++++++++++++++++++++++++++++++++-------
 2 files changed, 93 insertions(+), 26 deletions(-)

diff --git a/factscore/atomic_facts.py b/factscore/atomic_facts.py
index 66c2b76..38133fb 100644
--- a/factscore/atomic_facts.py
+++ b/factscore/atomic_facts.py
@@ -41,17 +41,29 @@ def __init__(self, key_path, demon_dir, model_name=None, gpt3_cache_file=None):
     def save_cache(self):
         self.openai_lm.save_cache()
 
-    def run(self, generation):
-        """Convert the generation into a set of atomic facts."""
+    def estimate_cost(self, generation):
+        """Estimate the cost of generating the atomic facts."""
         if self.preprocess_fn:
             paragraphs = self.preprocess(generation)
         else:
             paragraphs = [para.strip() for para in generation.split("\n") if len(para.strip()) > 0]
 
-        atomic_facts, para_breaks = self.get_atomic_facts_from_paragraph(paragraphs)
-        return atomic_facts, para_breaks
+        num_words = 0
+        for para in paragraphs:
+            num_words += len(para.split(" "))
 
-    def get_atomic_facts_from_paragraph(self, paragraphs):
+        return num_words * 0.0008
+
+    def run(self, generation, estimate=False):
+        """Convert the generation into a set of atomic facts. Return a total words cost if estimate=True."""
+        if self.preprocess_fn:
+            paragraphs = self.preprocess(generation)
+        else:
+            paragraphs = [para.strip() for para in generation.split("\n") if len(para.strip()) > 0]
+
+        return self.get_atomic_facts_from_paragraph(paragraphs, estimate=estimate)
+
+    def get_atomic_facts_from_paragraph(self, paragraphs, estimate=False):
         sentences = []
         para_breaks = []
         for para_idx, paragraph in enumerate(paragraphs):
@@ -71,9 +83,14 @@ def get_atomic_facts_from_paragraph(self, paragraphs):
 
             sentences += curr_sentences
 
-        atoms = self.get_init_atomic_facts_from_sentence([sent for i, sent in enumerate(sentences) if not (not self.is_bio and ( \
-            (i==0 and (sent.startswith("Sure") or sent.startswith("Here are"))) or \
-            (i==len(sentences)-1 and (sent.startswith("Please") or sent.startswith("I hope") or sent.startswith("Here are")))))])
+        atoms_or_estimate = self.get_init_atomic_facts_from_sentence([sent for i, sent in enumerate(sentences) if not (not self.is_bio and ( \
+            (i==0 and (sent.startswith("Sure") or sent.startswith("Here are"))) or \
+            (i==len(sentences)-1 and (sent.startswith("Please") or sent.startswith("I hope") or sent.startswith("Here are")))))], estimate=estimate)
+
+        if estimate:
+            return atoms_or_estimate
+        else:
+            atoms = atoms_or_estimate
 
         atomic_facts_pairs = []
         for i, sent in enumerate(sentences):
@@ -98,7 +115,9 @@ def get_atomic_facts_from_paragraph(self, paragraphs):
 
         return atomic_facts_pairs, para_breaks
 
-    def get_init_atomic_facts_from_sentence(self, sentences):
+    def get_init_atomic_facts_from_sentence(self, sentences, estimate=False):
+        """Get the initial atomic facts from the sentences. Return a total words cost if estimate=True."""
+
         is_bio = self.is_bio
         demons = self.demons
 
@@ -129,15 +148,21 @@ def get_init_atomic_facts_from_sentence(self, sentences):
             prompts.append(prompt)
             prompt_to_sent[prompt] = sentence
 
-        for prompt in prompts:
-            output, _ = self.openai_lm.generate(prompt)
-            atoms[prompt_to_sent[prompt]] = text_to_sentences(output)
+        if estimate:
+            total_words_estimate = 0
+            for prompt in prompts:
+                total_words_estimate += len(prompt.split())
+            return total_words_estimate
+        else:
+            for prompt in prompts:
+                output, _ = self.openai_lm.generate(prompt)
+                atoms[prompt_to_sent[prompt]] = text_to_sentences(output)
 
-        for key, value in demons.items():
-            if key not in atoms:
-                atoms[key] = value
+            for key, value in demons.items():
+                if key not in atoms:
+                    atoms[key] = value
 
-        return atoms
+            return atoms
 
 
 def preprocess_fn(generation, model):
diff --git a/factscore/factscorer.py b/factscore/factscorer.py
index 3b702b3..84ff131 100644
--- a/factscore/factscorer.py
+++ b/factscore/factscorer.py
@@ -77,6 +77,25 @@ def register_knowledge_source(self, name="enwiki-20230401", db_path=None, data_p
                                  "npm-single",
                                  cache_file=os.path.join(self.cache_dir, f"npm-{name}.pkl"))
 
+
+    def print_cost_estimates(self, total_words, task, model):
+        # https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
+        # Number of tokens are roughly 4/3 of the number of words
+        total_tokens = total_words * 4.0 / 3
+
+        # https://openai.com/pricing
+        # if we use davinci-003, the cost is $0.02 per 1000 tokens
+        # if we use gpt-3.5-turbo, the cost is $0.002 per 1000 tokens
+        if model == "davinci-003":
+            rate = 0.02
+        elif model == "gpt-3.5-turbo":
+            rate = 0.002
+
+        total_cost = total_tokens * rate / 1000
+
+        # print the total words, tokens, and cost along with rate
+        logging.critical("Estimated OpenAI API cost for %s ($%.3f per 1000 tokens): $%.2f for %d words and %d tokens" % (task, rate, total_cost, total_words, total_tokens))
+
     def get_score(self,
                   topics,
                   generations,
@@ -108,6 +127,13 @@ def get_score(self,
                                                         demon_dir=os.path.join(self.data_dir, "demos"),
                                                         gpt3_cache_file=os.path.join(self.cache_dir, "InstructGPT.pkl"))
 
+            # estimate the total cost of atomic fact generation
+            total_words = 0
+            for gen in generations:
+                total_words += self.af_generator.run(gen, estimate=True)
+
+            self.print_cost_estimates(total_words, task="atomic fact generation", model="davinci-003")
+
             if verbose:
                 topics = tqdm(topics)
 
@@ -121,12 +147,21 @@ def get_score(self,
                     atomic_facts.append(curr_afs)
                 if len(atomic_facts) % 10 == 0:
                     self.af_generator.save_cache()
- 
+
             assert len(atomic_facts)==len(topics)
             self.af_generator.save_cache()
- 
+
         respond_ratio = np.mean([facts is not None for facts in atomic_facts])
 
+        if "ChatGPT" in self.model_name:
+            # estimate the total cost of response generation
+            total_words = 0
+            for topic, generation, facts in zip(topics, generations, atomic_facts):
+                if facts is not None:
+                    total_words += self._get_score(topic, generation, facts, knowledge_source, estimate=True)
+
+            self.print_cost_estimates(total_words, task="factscore evaluation", model="gpt-3.5-turbo")
+
         if verbose:
             topics = tqdm(topics)
 
@@ -142,16 +177,17 @@ def get_score(self,
                 scores.append(score)
                 if len(scores) % 10 == 0:
                     self.save_cache()
- 
+
         self.save_cache()
 
         return {"score": np.mean(scores),
                 "respond_ratio": respond_ratio,
                 "decisions": decisions,
-                "num_facts_per_response": np.mean([len(d) for d in decisions])}
+                "num_facts_per_response": np.mean([len(d) for d in decisions if d is not None])}
 
-    def _get_score(self, topic, generation, atomic_facts, knowledge_source):
+    def _get_score(self, topic, generation, atomic_facts, knowledge_source, estimate=False):
         decisions = []
+        total_words = 0
         for atom in atomic_facts:
             atom = atom.strip()
             if self.lm:
@@ -164,6 +200,9 @@ def _get_score(self, topic, generation, atomic_facts, knowledge_source):
                 if not definition[-1] in string.punctuation:
                     definition += "."
                 prompt = "{}\n\nInput: {} True or False?\nOutput:".format(definition.strip(), atom.strip())
+                if estimate:
+                    total_words += len(prompt.split())
+                    continue
                 output = self.lm.generate(prompt)
 
                 if type(output[1])==np.ndarray:
@@ -195,7 +234,10 @@ def _get_score(self, topic, generation, atomic_facts, knowledge_source):
 
             decisions.append({"atom": atom, "is_supported": is_supported})
 
-        return decisions
+        if estimate:
+            return total_words
+        else:
+            return decisions
 
 
 if __name__ == '__main__':
@@ -235,7 +277,7 @@ def _get_score(self, topic, generation, atomic_facts, knowledge_source):
     logging.basicConfig(format='%(asctime)s - %(name)s - %(message)s',
                         datefmt='%m/%d/%Y %H:%M:%S',
                         level=logging.ERROR if args.print_rate_limit_error else logging.CRITICAL)
- 
+
     fs = FactScorer(model_name=args.model_name,
                     data_dir=args.data_dir,
                     model_dir=args.model_dir,
@@ -264,9 +306,9 @@ def _get_score(self, topic, generation, atomic_facts, knowledge_source):
                        generations=generations,
                        atomic_facts=atomic_facts if args.use_atomic_facts else None,
                        verbose=args.verbose)
-    logging.critical("FActScore=%.1f%%" % (100*out["score"]))
-    logging.critical("Respond ratio=%.1f%%" % (100*out["respond_ratio"]))
-    logging.critical("# Atomic facts per response=%.1f" % (out["num_facts_per_response"]))
+    logging.critical("FActScore = %.1f%%" % (100*out["score"]))
+    logging.critical("Respond ratio = %.1f%%" % (100*out["respond_ratio"]))
+    logging.critical("# Atomic facts per valid response = %.1f" % (out["num_facts_per_response"]))

From 264124c4146e442da885681b876062fb35170aa8 Mon Sep 17 00:00:00 2001
From: Kalpesh Krishna
Date: Mon, 29 May 2023 14:41:54 -0400
Subject: [PATCH 2/4] Cleaning up code a bit

---
 factscore/atomic_facts.py | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/factscore/atomic_facts.py b/factscore/atomic_facts.py
index 38133fb..a857aa7 100644
--- a/factscore/atomic_facts.py
+++ b/factscore/atomic_facts.py
@@ -41,19 +41,6 @@ def __init__(self, key_path, demon_dir, model_name=None, gpt3_cache_file=None):
     def save_cache(self):
         self.openai_lm.save_cache()
 
-    def estimate_cost(self, generation):
-        """Estimate the cost of generating the atomic facts."""
-        if self.preprocess_fn:
-            paragraphs = self.preprocess(generation)
-        else:
-            paragraphs = [para.strip() for para in generation.split("\n") if len(para.strip()) > 0]
-
-        num_words = 0
-        for para in paragraphs:
-            num_words += len(para.split(" "))
-
-        return num_words * 0.0008
-
     def run(self, generation, estimate=False):
         """Convert the generation into a set of atomic facts. Return a total words cost if estimate=True."""
         if self.preprocess_fn:

From 8b40af0aaa16b3ecb9e718aa54d396216371f6c6 Mon Sep 17 00:00:00 2001
From: Kalpesh Krishna
Date: Fri, 2 Jun 2023 17:45:25 -0400
Subject: [PATCH 3/4] Adding cache into cost estimation

---
 factscore/atomic_facts.py | 20 +++++++++++---------
 factscore/factscorer.py   | 24 ++++++++++++++++++------
 2 files changed, 29 insertions(+), 15 deletions(-)

diff --git a/factscore/atomic_facts.py b/factscore/atomic_facts.py
index a857aa7..6e640dd 100644
--- a/factscore/atomic_facts.py
+++ b/factscore/atomic_facts.py
@@ -41,16 +41,16 @@ def __init__(self, key_path, demon_dir, model_name=None, gpt3_cache_file=None):
     def save_cache(self):
         self.openai_lm.save_cache()
 
-    def run(self, generation, estimate=False):
-        """Convert the generation into a set of atomic facts. Return a total words cost if estimate=True."""
+    def run(self, generation, cost_estimate=None):
+        """Convert the generation into a set of atomic facts. Return a total words cost if cost_estimate != None."""
         if self.preprocess_fn:
             paragraphs = self.preprocess(generation)
         else:
             paragraphs = [para.strip() for para in generation.split("\n") if len(para.strip()) > 0]
 
-        return self.get_atomic_facts_from_paragraph(paragraphs, estimate=estimate)
+        return self.get_atomic_facts_from_paragraph(paragraphs, cost_estimate=cost_estimate)
 
-    def get_atomic_facts_from_paragraph(self, paragraphs, estimate=False):
+    def get_atomic_facts_from_paragraph(self, paragraphs, cost_estimate=None):
         sentences = []
         para_breaks = []
         for para_idx, paragraph in enumerate(paragraphs):
@@ -72,9 +72,9 @@ def get_atomic_facts_from_paragraph(self, paragraphs, estimate=False):
 
         atoms_or_estimate = self.get_init_atomic_facts_from_sentence([sent for i, sent in enumerate(sentences) if not (not self.is_bio and ( \
             (i==0 and (sent.startswith("Sure") or sent.startswith("Here are"))) or \
-            (i==len(sentences)-1 and (sent.startswith("Please") or sent.startswith("I hope") or sent.startswith("Here are")))))], estimate=estimate)
+            (i==len(sentences)-1 and (sent.startswith("Please") or sent.startswith("I hope") or sent.startswith("Here are")))))], cost_estimate=cost_estimate)
 
-        if estimate:
+        if cost_estimate:
             return atoms_or_estimate
         else:
             atoms = atoms_or_estimate
@@ -102,8 +102,8 @@ def get_atomic_facts_from_paragraph(self, paragraphs, estimate=False):
 
         return atomic_facts_pairs, para_breaks
 
-    def get_init_atomic_facts_from_sentence(self, sentences, estimate=False):
-        """Get the initial atomic facts from the sentences. Return a total words cost if estimate=True."""
+    def get_init_atomic_facts_from_sentence(self, sentences, cost_estimate=None):
+        """Get the initial atomic facts from the sentences. Return a total words cost if cost_estimate != None."""
 
         is_bio = self.is_bio
         demons = self.demons
 
@@ -135,9 +135,11 @@ def get_init_atomic_facts_from_sentence(self, sentences, estimate=False):
             prompts.append(prompt)
             prompt_to_sent[prompt] = sentence
 
-        if estimate:
+        if cost_estimate:
             total_words_estimate = 0
             for prompt in prompts:
+                if cost_estimate == "consider_cache" and (prompt.strip() + "_0") in self.openai_lm.cache_dict:
+                    continue
                 total_words_estimate += len(prompt.split())
             return total_words_estimate
         else:
diff --git a/factscore/factscorer.py b/factscore/factscorer.py
index 84ff131..e3c9ba6 100644
--- a/factscore/factscorer.py
+++ b/factscore/factscorer.py
@@ -20,6 +20,7 @@ def __init__(self,
                  model_dir=".cache/factscore",
                  cache_dir=".cache/factscore",
                  openai_key="api.key",
+                 cost_estimate="consider_cache",
                  batch_size=256):
         assert model_name in ["retrieval+llama", "retrieval+llama+npm", "retrieval+ChatGPT", "npm", "retrieval+ChatGPT+npm"]
         self.model_name = model_name
@@ -36,6 +37,7 @@ def __init__(self,
             os.makedirs(cache_dir)
 
         self.af_generator = None
+        self.cost_estimate = cost_estimate
 
         if "llama" in model_name:
             self.lm = CLM("inst-llama-7B",
@@ -130,7 +132,7 @@ def get_score(self,
             # estimate the total cost of atomic fact generation
             total_words = 0
             for gen in generations:
-                total_words += self.af_generator.run(gen, estimate=True)
+                total_words += self.af_generator.run(gen, cost_estimate=self.cost_estimate)
 
             self.print_cost_estimates(total_words, task="atomic fact generation", model="davinci-003")
 
@@ -158,7 +160,7 @@ def get_score(self,
             total_words = 0
             for topic, generation, facts in zip(topics, generations, atomic_facts):
                 if facts is not None:
-                    total_words += self._get_score(topic, generation, facts, knowledge_source, estimate=True)
+                    total_words += self._get_score(topic, generation, facts, knowledge_source, cost_estimate=self.cost_estimate)
 
             self.print_cost_estimates(total_words, task="factscore evaluation", model="gpt-3.5-turbo")
 
@@ -185,7 +187,7 @@ def get_score(self,
                 "decisions": decisions,
                 "num_facts_per_response": np.mean([len(d) for d in decisions if d is not None])}
 
-    def _get_score(self, topic, generation, atomic_facts, knowledge_source, estimate=False):
+    def _get_score(self, topic, generation, atomic_facts, knowledge_source, cost_estimate=None):
         decisions = []
         total_words = 0
         for atom in atomic_facts:
@@ -200,9 +202,14 @@ def _get_score(self, topic, generation, atomic_facts, knowledge_source, estimate
                 if not definition[-1] in string.punctuation:
                     definition += "."
                 prompt = "{}\n\nInput: {} True or False?\nOutput:".format(definition.strip(), atom.strip())
-                if estimate:
+
+                if cost_estimate == "consider_cache" and (prompt.strip() + "_0") not in self.lm.cache_dict:
+                    total_words += len(prompt.split())
+                    continue
+                elif cost_estimate == "ignore_cache":
                     total_words += len(prompt.split())
                     continue
+
                 output = self.lm.generate(prompt)
 
                 if type(output[1])==np.ndarray:
@@ -234,7 +241,7 @@ def _get_score(self, topic, generation, atomic_facts, knowledge_source, estimate
 
             decisions.append({"atom": atom, "is_supported": is_supported})
 
-        if estimate:
+        if cost_estimate:
             return total_words
         else:
             return decisions
@@ -260,6 +267,10 @@ def _get_score(self, topic, generation, atomic_facts, knowledge_source, estimate
     parser.add_argument('--cache_dir',
                         type=str,
                         default=".cache/factscore/")
+    parser.add_argument('--cost_estimate',
+                        type=str,
+                        default="consider_cache",
+                        choices=["consider_cache", "ignore_cache"])
     parser.add_argument('--use_atomic_facts',
                         action="store_true")
     parser.add_argument('--verbose',
@@ -282,7 +293,8 @@ def _get_score(self, topic, generation, atomic_facts, knowledge_source, estimate
                     data_dir=args.data_dir,
                     model_dir=args.model_dir,
                     cache_dir=args.cache_dir,
-                    openai_key=args.openai_key)
+                    openai_key=args.openai_key,
+                    cost_estimate=args.cost_estimate)
 
     tot = 0
     topics, generations, atomic_facts = [], [], []

From 026faffba1deacb910d0d02c3e0668c242af9343 Mon Sep 17 00:00:00 2001
From: Kalpesh Krishna
Date: Fri, 2 Jun 2023 17:57:55 -0400
Subject: [PATCH 4/4] Bug fix in the FactScore get_score() cost estimation

---
 factscore/factscorer.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/factscore/factscorer.py b/factscore/factscorer.py
index e3c9ba6..f38f40d 100644
--- a/factscore/factscorer.py
+++ b/factscore/factscorer.py
@@ -203,11 +203,11 @@ def _get_score(self, topic, generation, atomic_facts, knowledge_source, cost_est
                     definition += "."
                 prompt = "{}\n\nInput: {} True or False?\nOutput:".format(definition.strip(), atom.strip())
 
-                if cost_estimate == "consider_cache" and (prompt.strip() + "_0") not in self.lm.cache_dict:
-                    total_words += len(prompt.split())
-                    continue
-                elif cost_estimate == "ignore_cache":
-                    total_words += len(prompt.split())
+                if cost_estimate:
+                    if cost_estimate == "consider_cache" and (prompt.strip() + "_0") not in self.lm.cache_dict:
+                        total_words += len(prompt.split())
+                    elif cost_estimate == "ignore_cache":
+                        total_words += len(prompt.split())
                 continue
 
             output = self.lm.generate(prompt)