Commit 90786c3

Merge pull request #12 from shmsw25/api-cost-estimates

Adding cost estimates for OpenAI API usage

shmsw25 authored Jun 6, 2023
2 parents 91637f8 + 026faff commit 90786c3

Showing 2 changed files with 95 additions and 27 deletions.
46 changes: 30 additions & 16 deletions factscore/atomic_facts.py
@@ -41,17 +41,16 @@ def __init__(self, key_path, demon_dir, model_name=None, gpt3_cache_file=None):
     def save_cache(self):
         self.openai_lm.save_cache()
 
-    def run(self, generation):
-        """Convert the generation into a set of atomic facts."""
+    def run(self, generation, cost_estimate=None):
+        """Convert the generation into a set of atomic facts. Return a total words cost if cost_estimate != None."""
         if self.preprocess_fn:
             paragraphs = self.preprocess(generation)
         else:
             paragraphs = [para.strip() for para in generation.split("\n") if len(para.strip()) > 0]
 
-        atomic_facts, para_breaks = self.get_atomic_facts_from_paragraph(paragraphs)
-        return atomic_facts, para_breaks
+        return self.get_atomic_facts_from_paragraph(paragraphs, cost_estimate=cost_estimate)
 
-    def get_atomic_facts_from_paragraph(self, paragraphs):
+    def get_atomic_facts_from_paragraph(self, paragraphs, cost_estimate=None):
         sentences = []
         para_breaks = []
         for para_idx, paragraph in enumerate(paragraphs):
@@ -71,9 +70,14 @@ def get_atomic_facts_from_paragraph(self, paragraphs):

         sentences += curr_sentences
 
-        atoms = self.get_init_atomic_facts_from_sentence([sent for i, sent in enumerate(sentences) if not (not self.is_bio and ( \
-            (i==0 and (sent.startswith("Sure") or sent.startswith("Here are"))) or \
-            (i==len(sentences)-1 and (sent.startswith("Please") or sent.startswith("I hope") or sent.startswith("Here are")))))])
+        atoms_or_estimate = self.get_init_atomic_facts_from_sentence([sent for i, sent in enumerate(sentences) if not (not self.is_bio and ( \
+            (i==0 and (sent.startswith("Sure") or sent.startswith("Here are"))) or \
+            (i==len(sentences)-1 and (sent.startswith("Please") or sent.startswith("I hope") or sent.startswith("Here are")))))], cost_estimate=cost_estimate)
+
+        if cost_estimate:
+            return atoms_or_estimate
+        else:
+            atoms = atoms_or_estimate
 
         atomic_facts_pairs = []
         for i, sent in enumerate(sentences):
@@ -98,7 +102,9 @@ def get_atomic_facts_from_paragraph(self, paragraphs):
         return atomic_facts_pairs, para_breaks
 
 
-    def get_init_atomic_facts_from_sentence(self, sentences):
+    def get_init_atomic_facts_from_sentence(self, sentences, cost_estimate=None):
+        """Get the initial atomic facts from the sentences. Return a total words cost if cost_estimate != None."""
+
         is_bio = self.is_bio
         demons = self.demons
 
@@ -129,15 +135,23 @@ def get_init_atomic_facts_from_sentence(self, sentences):
             prompts.append(prompt)
             prompt_to_sent[prompt] = sentence
 
-        for prompt in prompts:
-            output, _ = self.openai_lm.generate(prompt)
-            atoms[prompt_to_sent[prompt]] = text_to_sentences(output)
+        if cost_estimate:
+            total_words_estimate = 0
+            for prompt in prompts:
+                if cost_estimate == "consider_cache" and (prompt.strip() + "_0") in self.openai_lm.cache_dict:
+                    continue
+                total_words_estimate += len(prompt.split())
+            return total_words_estimate
+        else:
+            for prompt in prompts:
+                output, _ = self.openai_lm.generate(prompt)
+                atoms[prompt_to_sent[prompt]] = text_to_sentences(output)
 
-        for key, value in demons.items():
-            if key not in atoms:
-                atoms[key] = value
+            for key, value in demons.items():
+                if key not in atoms:
+                    atoms[key] = value
 
-        return atoms
+            return atoms
 
 
 def preprocess_fn(generation, model):
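Note: with this change, AtomicFactGenerator.run becomes dual-purpose. Called normally it returns (atomic_facts, para_breaks); called with cost_estimate set, it returns a single integer (the total number of prompt words that would be sent) without issuing any API calls. A minimal usage sketch, assuming the constructor signature shown in the hunk header above; the key and cache paths here are placeholders:

    from factscore.atomic_facts import AtomicFactGenerator

    # placeholder paths; point these at your own key file and caches
    generator = AtomicFactGenerator(key_path="api.key",
                                    demon_dir=".cache/factscore/demos",
                                    gpt3_cache_file=".cache/factscore/InstructGPT.pkl")

    generation = "Albert Einstein was a physicist. He developed the theory of relativity."

    # estimate mode: returns an int and makes no API calls
    total_words = generator.run(generation, cost_estimate="consider_cache")

    # normal mode: calls the OpenAI API and returns (atomic_facts, para_breaks)
    atomic_facts, para_breaks = generator.run(generation)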
76 changes: 65 additions & 11 deletions factscore/factscorer.py
@@ -20,6 +20,7 @@ def __init__(self,
                  model_dir=".cache/factscore",
                  cache_dir=".cache/factscore",
                  openai_key="api.key",
+                 cost_estimate="consider_cache",
                  batch_size=256):
         assert model_name in ["retrieval+llama", "retrieval+llama+npm", "retrieval+ChatGPT", "npm", "retrieval+ChatGPT+npm"]
         self.model_name = model_name
@@ -36,6 +37,7 @@ def __init__(self,
             os.makedirs(cache_dir)
 
         self.af_generator = None
+        self.cost_estimate = cost_estimate
 
         if "llama" in model_name:
             self.lm = CLM("inst-llama-7B",
@@ -77,6 +79,25 @@ def register_knowledge_source(self, name="enwiki-20230401", db_path=None, data_p
"npm-single",
cache_file=os.path.join(self.cache_dir, f"npm-{name}.pkl"))


def print_cost_estimates(self, total_words, task, model):
# https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
# Number of tokens are roughly 4/3 of the number of words
total_tokens = total_words * 4.0 / 3

# https://openai.com/pricing
# if we use davinci-003, the cost is $0.02 per 1000 tokens
# if we use gpt-3.5-turbo, the cost is $0.002 per 1000 tokens
if model == "davinci-003":
rate = 0.02
elif model == "gpt-3.5-turbo":
rate = 0.002

total_cost = total_tokens * rate / 1000

# print the total words, tokens, and cost along with rate
logging.critical("Estimated OpenAI API cost for %s ($%.3f per 1000 tokens): $%.2f for %d words and %d tokens" % (task, rate, total_cost, total_words, total_tokens))

def get_score(self,
topics,
generations,
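To make the arithmetic in print_cost_estimates concrete: a hypothetical batch of 30,000 prompt words is estimated at 30,000 × 4/3 = 40,000 tokens, which at the June 2023 rates hard-coded above comes to 40,000 × $0.02 / 1,000 = $0.80 on davinci-003 and 40,000 × $0.002 / 1,000 = $0.08 on gpt-3.5-turbo. The same calculation as a standalone snippet (the rates may have changed since this commit):

    total_words = 30000
    total_tokens = total_words * 4.0 / 3   # ~40,000 tokens

    print(total_tokens * 0.02 / 1000)      # davinci-003: $0.80
    print(total_tokens * 0.002 / 1000)     # gpt-3.5-turbo: $0.08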
@@ -108,6 +129,13 @@ def get_score(self,
                                                         demon_dir=os.path.join(self.data_dir, "demos"),
                                                         gpt3_cache_file=os.path.join(self.cache_dir, "InstructGPT.pkl"))
 
+            # estimate the total cost of atomic fact generation
+            total_words = 0
+            for gen in generations:
+                total_words += self.af_generator.run(gen, cost_estimate=self.cost_estimate)
+
+            self.print_cost_estimates(total_words, task="atomic fact generation", model="davinci-003")
+
             if verbose:
                 topics = tqdm(topics)

@@ -121,12 +149,21 @@ def get_score(self,
                 atomic_facts.append(curr_afs)
                 if len(atomic_facts) % 10 == 0:
                     self.af_generator.save_cache()
+
         assert len(atomic_facts)==len(topics)
         self.af_generator.save_cache()
 
         respond_ratio = np.mean([facts is not None for facts in atomic_facts])
 
         if "ChatGPT" in self.model_name:
+            # estimate the total cost of response generation
+            total_words = 0
+            for topic, generation, facts in zip(topics, generations, atomic_facts):
+                if facts is not None:
+                    total_words += self._get_score(topic, generation, facts, knowledge_source, cost_estimate=self.cost_estimate)
+
+            self.print_cost_estimates(total_words, task="factscore evaluation", model="gpt-3.5-turbo")
+
             if verbose:
                 topics = tqdm(topics)

@@ -142,16 +179,17 @@ def _get_score(self, topic, generation, atomic_facts, knowledge_source):
                 scores.append(score)
                 if len(scores) % 10 == 0:
                     self.save_cache()
 
         self.save_cache()
 
         return {"score": np.mean(scores),
                 "respond_ratio": respond_ratio,
                 "decisions": decisions,
-                "num_facts_per_response": np.mean([len(d) for d in decisions])}
+                "num_facts_per_response": np.mean([len(d) for d in decisions if d is not None])}
 
-    def _get_score(self, topic, generation, atomic_facts, knowledge_source):
+    def _get_score(self, topic, generation, atomic_facts, knowledge_source, cost_estimate=None):
         decisions = []
+        total_words = 0
         for atom in atomic_facts:
             atom = atom.strip()
             if self.lm:
@@ -164,6 +202,14 @@ def _get_score(self, topic, generation, atomic_facts, knowledge_source):
                 if not definition[-1] in string.punctuation:
                     definition += "."
                 prompt = "{}\n\nInput: {} True or False?\nOutput:".format(definition.strip(), atom.strip())
+
+                if cost_estimate:
+                    if cost_estimate == "consider_cache" and (prompt.strip() + "_0") not in self.lm.cache_dict:
+                        total_words += len(prompt.split())
+                    elif cost_estimate == "ignore_cache":
+                        total_words += len(prompt.split())
+                    continue
+
                 output = self.lm.generate(prompt)
 
                 if type(output[1])==np.ndarray:
@@ -195,7 +241,10 @@ def _get_score(self, topic, generation, atomic_facts, knowledge_source):

             decisions.append({"atom": atom, "is_supported": is_supported})
 
-        return decisions
+        if cost_estimate:
+            return total_words
+        else:
+            return decisions
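Both estimate paths probe the LM cache with the key prompt.strip() + "_0", which mirrors the cache-key convention the lm/openai_lm wrappers appear to use (the "_0" suffix presumably indexes the first sample for a prompt). Under "consider_cache", a cached prompt adds nothing to the estimate because it will not trigger a paid call; under "ignore_cache", every prompt is counted. A condensed sketch of that decision, with the helper name invented for illustration:

    def words_to_charge(prompt, cache_dict, cost_estimate):
        # hypothetical helper mirroring the logic above; "_0" follows
        # the cache-key convention seen in this diff
        if cost_estimate == "consider_cache" and (prompt.strip() + "_0") in cache_dict:
            return 0  # cache hit: no paid API call, so no cost
        return len(prompt.split())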

if __name__ == '__main__':

@@ -218,6 +267,10 @@ def _get_score(self, topic, generation, atomic_facts, knowledge_source):
     parser.add_argument('--cache_dir',
                         type=str,
                         default=".cache/factscore/")
+    parser.add_argument('--cost_estimate',
+                        type=str,
+                        default="consider_cache",
+                        choices=["consider_cache", "ignore_cache"])
     parser.add_argument('--use_atomic_facts',
                         action="store_true")
     parser.add_argument('--verbose',
@@ -235,12 +288,13 @@ def _get_score(self, topic, generation, atomic_facts, knowledge_source):
     logging.basicConfig(format='%(asctime)s - %(name)s - %(message)s',
                         datefmt='%m/%d/%Y %H:%M:%S',
                         level=logging.ERROR if args.print_rate_limit_error else logging.CRITICAL)
 
     fs = FactScorer(model_name=args.model_name,
                     data_dir=args.data_dir,
                     model_dir=args.model_dir,
                     cache_dir=args.cache_dir,
-                    openai_key=args.openai_key)
+                    openai_key=args.openai_key,
+                    cost_estimate=args.cost_estimate)
 
     tot = 0
     topics, generations, atomic_facts = [], [], []
@@ -264,9 +318,9 @@ def _get_score(self, topic, generation, atomic_facts, knowledge_source):
                        generations=generations,
                        atomic_facts=atomic_facts if args.use_atomic_facts else None,
                        verbose=args.verbose)
-    logging.critical("FActScore=%.1f%%" % (100*out["score"]))
-    logging.critical("Respond ratio=%.1f%%" % (100*out["respond_ratio"]))
-    logging.critical("# Atomic facts per response=%.1f" % (out["num_facts_per_response"]))
+    logging.critical("FActScore = %.1f%%" % (100*out["score"]))
+    logging.critical("Respond ratio = %.1f%%" % (100*out["respond_ratio"]))
+    logging.critical("# Atomic facts per valid response = %.1f" % (out["num_facts_per_response"]))


