From 603557d5a61a387bbfe293943ce390930e9115ad Mon Sep 17 00:00:00 2001
From: Kalpesh Krishna
Date: Mon, 29 May 2023 14:39:13 -0400
Subject: [PATCH 1/4] Adding cost estimates for OpenAI API usage

---
 factscore/atomic_facts.py | 57 +++++++++++++++++++++++++----------
 factscore/factscorer.py   | 62 ++++++++++++++++++++++++++++++++-------
 2 files changed, 93 insertions(+), 26 deletions(-)

diff --git a/factscore/atomic_facts.py b/factscore/atomic_facts.py
index 66c2b76..38133fb 100644
--- a/factscore/atomic_facts.py
+++ b/factscore/atomic_facts.py
@@ -41,17 +41,29 @@ def __init__(self, key_path, demon_dir, model_name=None, gpt3_cache_file=None):
     def save_cache(self):
         self.openai_lm.save_cache()
 
-    def run(self, generation):
-        """Convert the generation into a set of atomic facts."""
+    def estimate_cost(self, generation):
+        """Estimate the cost of generating the atomic facts."""
         if self.preprocess_fn:
             paragraphs = self.preprocess(generation)
         else:
             paragraphs = [para.strip() for para in generation.split("\n") if len(para.strip()) > 0]
 
-        atomic_facts, para_breaks = self.get_atomic_facts_from_paragraph(paragraphs)
-        return atomic_facts, para_breaks
+        num_words = 0
+        for para in paragraphs:
+            num_words += len(para.split(" "))
 
-    def get_atomic_facts_from_paragraph(self, paragraphs):
+        return num_words * 0.0008
+
+    def run(self, generation, estimate=False):
+        """Convert the generation into a set of atomic facts. Return a total words cost if estimate=True."""
+        if self.preprocess_fn:
+            paragraphs = self.preprocess(generation)
+        else:
+            paragraphs = [para.strip() for para in generation.split("\n") if len(para.strip()) > 0]
+
+        return self.get_atomic_facts_from_paragraph(paragraphs, estimate=estimate)
+
+    def get_atomic_facts_from_paragraph(self, paragraphs, estimate=False):
         sentences = []
         para_breaks = []
         for para_idx, paragraph in enumerate(paragraphs):
@@ -71,9 +83,14 @@ def get_atomic_facts_from_paragraph(self, paragraphs):
 
             sentences += curr_sentences
 
-        atoms = self.get_init_atomic_facts_from_sentence([sent for i, sent in enumerate(sentences) if not (not self.is_bio and ( \
-            (i==0 and (sent.startswith("Sure") or sent.startswith("Here are"))) or \
-            (i==len(sentences)-1 and (sent.startswith("Please") or sent.startswith("I hope") or sent.startswith("Here are")))))])
+        atoms_or_estimate = self.get_init_atomic_facts_from_sentence([sent for i, sent in enumerate(sentences) if not (not self.is_bio and ( \
+            (i==0 and (sent.startswith("Sure") or sent.startswith("Here are"))) or \
+            (i==len(sentences)-1 and (sent.startswith("Please") or sent.startswith("I hope") or sent.startswith("Here are")))))], estimate=estimate)
+
+        if estimate:
+            return atoms_or_estimate
+        else:
+            atoms = atoms_or_estimate
 
         atomic_facts_pairs = []
         for i, sent in enumerate(sentences):
@@ -98,7 +115,9 @@ def get_atomic_facts_from_paragraph(self, paragraphs):
 
         return atomic_facts_pairs, para_breaks
 
-    def get_init_atomic_facts_from_sentence(self, sentences):
+    def get_init_atomic_facts_from_sentence(self, sentences, estimate=False):
+        """Get the initial atomic facts from the sentences. Return a total words cost if estimate=True."""
+
         is_bio = self.is_bio
         demons = self.demons
 
@@ -129,15 +148,21 @@ def get_init_atomic_facts_from_sentence(self, sentences):
             prompts.append(prompt)
             prompt_to_sent[prompt] = sentence
 
-        for prompt in prompts:
-            output, _ = self.openai_lm.generate(prompt)
-            atoms[prompt_to_sent[prompt]] = text_to_sentences(output)
+        if estimate:
+            total_words_estimate = 0
+            for prompt in prompts:
+                total_words_estimate += len(prompt.split())
+            return total_words_estimate
+        else:
+            for prompt in prompts:
+                output, _ = self.openai_lm.generate(prompt)
+                atoms[prompt_to_sent[prompt]] = text_to_sentences(output)
 
-        for key, value in demons.items():
-            if key not in atoms:
-                atoms[key] = value
+            for key, value in demons.items():
+                if key not in atoms:
+                    atoms[key] = value
 
-        return atoms
+            return atoms
 
 
 def preprocess_fn(generation, model):
diff --git a/factscore/factscorer.py b/factscore/factscorer.py
index 3b702b3..84ff131 100644
--- a/factscore/factscorer.py
+++ b/factscore/factscorer.py
@@ -77,6 +77,25 @@ def register_knowledge_source(self, name="enwiki-20230401", db_path=None, data_p
                                  "npm-single",
                                  cache_file=os.path.join(self.cache_dir, f"npm-{name}.pkl"))
 
+
+    def print_cost_estimates(self, total_words, task, model):
+        # https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
+        # Number of tokens are roughly 4/3 of the number of words
+        total_tokens = total_words * 4.0 / 3
+
+        # https://openai.com/pricing
+        # if we use davinci-003, the cost is $0.02 per 1000 tokens
+        # if we use gpt-3.5-turbo, the cost is $0.002 per 1000 tokens
+        if model == "davinci-003":
+            rate = 0.02
+        elif model == "gpt-3.5-turbo":
+            rate = 0.002
+
+        total_cost = total_tokens * rate / 1000
+
+        # print the total words, tokens, and cost along with rate
+        logging.critical("Estimated OpenAI API cost for %s ($%.3f per 1000 tokens): $%.2f for %d words and %d tokens" % (task, rate, total_cost, total_words, total_tokens))
+
     def get_score(self,
                   topics,
                   generations,
@@ -108,6 +127,13 @@ def get_score(self,
                                                         demon_dir=os.path.join(self.data_dir, "demos"),
                                                         gpt3_cache_file=os.path.join(self.cache_dir, "InstructGPT.pkl"))
 
+            # estimate the total cost of atomic fact generation
+            total_words = 0
+            for gen in generations:
+                total_words += self.af_generator.run(gen, estimate=True)
+
+            self.print_cost_estimates(total_words, task="atomic fact generation", model="davinci-003")
+
             if verbose:
                 topics = tqdm(topics)
 
@@ -121,12 +147,21 @@ def get_score(self,
                     atomic_facts.append(curr_afs)
                 if len(atomic_facts) % 10 == 0:
                     self.af_generator.save_cache()
- 
+
             assert len(atomic_facts)==len(topics)
             self.af_generator.save_cache()
- 
+
         respond_ratio = np.mean([facts is not None for facts in atomic_facts])
 
+        if "ChatGPT" in self.model_name:
+            # estimate the total cost of response generation
+            total_words = 0
+            for topic, generation, facts in zip(topics, generations, atomic_facts):
+                if facts is not None:
+                    total_words += self._get_score(topic, generation, facts, knowledge_source, estimate=True)
+
+            self.print_cost_estimates(total_words, task="factscore evaluation", model="gpt-3.5-turbo")
+
         if verbose:
             topics = tqdm(topics)
 
@@ -142,16 +177,17 @@ def get_score(self,
                 scores.append(score)
                 if len(scores) % 10 == 0:
                     self.save_cache()
- 
+
         self.save_cache()
 
         return {"score": np.mean(scores),
                 "respond_ratio": respond_ratio,
                 "decisions": decisions,
-                "num_facts_per_response": np.mean([len(d) for d in decisions])}
+                "num_facts_per_response": np.mean([len(d) for d in decisions if d is not None])}
 
-    def _get_score(self, topic, generation, atomic_facts, knowledge_source):
+    def _get_score(self, topic, generation, atomic_facts, knowledge_source, estimate=False):
         decisions = []
+        total_words = 0
         for atom in atomic_facts:
             atom = atom.strip()
             if self.lm:
@@ -164,6 +200,9 @@ def _get_score(self, topic, generation, atomic_facts, knowledge_source):
                 if not definition[-1] in string.punctuation:
                     definition += "."
                 prompt = "{}\n\nInput: {} True or False?\nOutput:".format(definition.strip(), atom.strip())
+                if estimate:
+                    total_words += len(prompt.split())
+                    continue
                 output = self.lm.generate(prompt)
 
                 if type(output[1])==np.ndarray:
@@ -195,7 +234,10 @@ def _get_score(self, topic, generation, atomic_facts, knowledge_source):
 
             decisions.append({"atom": atom, "is_supported": is_supported})
 
-        return decisions
+        if estimate:
+            return total_words
+        else:
+            return decisions
 
 
 if __name__ == '__main__':
@@ -235,7 +277,7 @@ def _get_score(self, topic, generation, atomic_facts, knowledge_source):
     logging.basicConfig(format='%(asctime)s - %(name)s - %(message)s',
                         datefmt='%m/%d/%Y %H:%M:%S',
                         level=logging.ERROR if args.print_rate_limit_error else logging.CRITICAL)
- 
+
     fs = FactScorer(model_name=args.model_name,
                     data_dir=args.data_dir,
                     model_dir=args.model_dir,
@@ -264,9 +306,9 @@ def _get_score(self, topic, generation, atomic_facts, knowledge_source):
                        generations=generations,
                        atomic_facts=atomic_facts if args.use_atomic_facts else None,
                        verbose=args.verbose)
-    logging.critical("FActScore=%.1f%%" % (100*out["score"]))
-    logging.critical("Respond ratio=%.1f%%" % (100*out["respond_ratio"]))
-    logging.critical("# Atomic facts per response=%.1f" % (out["num_facts_per_response"]))
+    logging.critical("FActScore = %.1f%%" % (100*out["score"]))
+    logging.critical("Respond ratio = %.1f%%" % (100*out["respond_ratio"]))
+    logging.critical("# Atomic facts per valid response = %.1f" % (out["num_facts_per_response"]))

From 264124c4146e442da885681b876062fb35170aa8 Mon Sep 17 00:00:00 2001
From: Kalpesh Krishna
Date: Mon, 29 May 2023 14:41:54 -0400
Subject: [PATCH 2/4] Cleaning up code a bit

---
 factscore/atomic_facts.py | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/factscore/atomic_facts.py b/factscore/atomic_facts.py
index 38133fb..a857aa7 100644
--- a/factscore/atomic_facts.py
+++ b/factscore/atomic_facts.py
@@ -41,19 +41,6 @@ def __init__(self, key_path, demon_dir, model_name=None, gpt3_cache_file=None):
     def save_cache(self):
         self.openai_lm.save_cache()
 
-    def estimate_cost(self, generation):
-        """Estimate the cost of generating the atomic facts."""
-        if self.preprocess_fn:
-            paragraphs = self.preprocess(generation)
-        else:
-            paragraphs = [para.strip() for para in generation.split("\n") if len(para.strip()) > 0]
-
-        num_words = 0
-        for para in paragraphs:
-            num_words += len(para.split(" "))
-
-        return num_words * 0.0008
-
     def run(self, generation, estimate=False):
         """Convert the generation into a set of atomic facts. Return a total words cost if estimate=True."""
         if self.preprocess_fn:

From 8b40af0aaa16b3ecb9e718aa54d396216371f6c6 Mon Sep 17 00:00:00 2001
From: Kalpesh Krishna
Date: Fri, 2 Jun 2023 17:45:25 -0400
Subject: [PATCH 3/4] Adding cache into cost estimation

---
 factscore/atomic_facts.py | 20 +++++++++++---------
 factscore/factscorer.py   | 24 ++++++++++++++++++------
 2 files changed, 29 insertions(+), 15 deletions(-)

diff --git a/factscore/atomic_facts.py b/factscore/atomic_facts.py
index a857aa7..6e640dd 100644
--- a/factscore/atomic_facts.py
+++ b/factscore/atomic_facts.py
@@ -41,16 +41,16 @@ def __init__(self, key_path, demon_dir, model_name=None, gpt3_cache_file=None):
     def save_cache(self):
         self.openai_lm.save_cache()
 
-    def run(self, generation, estimate=False):
-        """Convert the generation into a set of atomic facts. Return a total words cost if estimate=True."""
+    def run(self, generation, cost_estimate=None):
+        """Convert the generation into a set of atomic facts. Return a total words cost if cost_estimate != None."""
         if self.preprocess_fn:
             paragraphs = self.preprocess(generation)
         else:
             paragraphs = [para.strip() for para in generation.split("\n") if len(para.strip()) > 0]
 
-        return self.get_atomic_facts_from_paragraph(paragraphs, estimate=estimate)
+        return self.get_atomic_facts_from_paragraph(paragraphs, cost_estimate=cost_estimate)
 
-    def get_atomic_facts_from_paragraph(self, paragraphs, estimate=False):
+    def get_atomic_facts_from_paragraph(self, paragraphs, cost_estimate=None):
         sentences = []
         para_breaks = []
         for para_idx, paragraph in enumerate(paragraphs):
@@ -72,9 +72,9 @@ def get_atomic_facts_from_paragraph(self, paragraphs, estimate=False):
 
         atoms_or_estimate = self.get_init_atomic_facts_from_sentence([sent for i, sent in enumerate(sentences) if not (not self.is_bio and ( \
             (i==0 and (sent.startswith("Sure") or sent.startswith("Here are"))) or \
-            (i==len(sentences)-1 and (sent.startswith("Please") or sent.startswith("I hope") or sent.startswith("Here are")))))], estimate=estimate)
+            (i==len(sentences)-1 and (sent.startswith("Please") or sent.startswith("I hope") or sent.startswith("Here are")))))], cost_estimate=cost_estimate)
 
-        if estimate:
+        if cost_estimate:
             return atoms_or_estimate
         else:
             atoms = atoms_or_estimate
@@ -102,8 +102,8 @@ def get_atomic_facts_from_paragraph(self, paragraphs, estimate=False):
 
         return atomic_facts_pairs, para_breaks
 
-    def get_init_atomic_facts_from_sentence(self, sentences, estimate=False):
-        """Get the initial atomic facts from the sentences. Return a total words cost if estimate=True."""
+    def get_init_atomic_facts_from_sentence(self, sentences, cost_estimate=None):
+        """Get the initial atomic facts from the sentences. Return a total words cost if cost_estimate != None."""
 
         is_bio = self.is_bio
         demons = self.demons
 
@@ -135,9 +135,11 @@ def get_init_atomic_facts_from_sentence(self, sentences, estimate=False):
             prompts.append(prompt)
             prompt_to_sent[prompt] = sentence
 
-        if estimate:
+        if cost_estimate:
             total_words_estimate = 0
             for prompt in prompts:
+                if cost_estimate == "consider_cache" and (prompt.strip() + "_0") in self.openai_lm.cache_dict:
+                    continue
                 total_words_estimate += len(prompt.split())
             return total_words_estimate
         else:
diff --git a/factscore/factscorer.py b/factscore/factscorer.py
index 84ff131..e3c9ba6 100644
--- a/factscore/factscorer.py
+++ b/factscore/factscorer.py
@@ -20,6 +20,7 @@ def __init__(self,
                  model_dir=".cache/factscore",
                  cache_dir=".cache/factscore",
                  openai_key="api.key",
+                 cost_estimate="consider_cache",
                  batch_size=256):
         assert model_name in ["retrieval+llama", "retrieval+llama+npm", "retrieval+ChatGPT", "npm", "retrieval+ChatGPT+npm"]
         self.model_name = model_name
@@ -36,6 +37,7 @@ def __init__(self,
             os.makedirs(cache_dir)
 
         self.af_generator = None
+        self.cost_estimate = cost_estimate
 
         if "llama" in model_name:
             self.lm = CLM("inst-llama-7B",
@@ -130,7 +132,7 @@ def get_score(self,
             # estimate the total cost of atomic fact generation
             total_words = 0
             for gen in generations:
-                total_words += self.af_generator.run(gen, estimate=True)
+                total_words += self.af_generator.run(gen, cost_estimate=self.cost_estimate)
 
             self.print_cost_estimates(total_words, task="atomic fact generation", model="davinci-003")
 
@@ -158,7 +160,7 @@ def get_score(self,
             total_words = 0
             for topic, generation, facts in zip(topics, generations, atomic_facts):
                 if facts is not None:
-                    total_words += self._get_score(topic, generation, facts, knowledge_source, estimate=True)
+                    total_words += self._get_score(topic, generation, facts, knowledge_source, cost_estimate=self.cost_estimate)
 
             self.print_cost_estimates(total_words, task="factscore evaluation", model="gpt-3.5-turbo")
 
@@ -185,7 +187,7 @@ def get_score(self,
                 "decisions": decisions,
                 "num_facts_per_response": np.mean([len(d) for d in decisions if d is not None])}
 
-    def _get_score(self, topic, generation, atomic_facts, knowledge_source, estimate=False):
+    def _get_score(self, topic, generation, atomic_facts, knowledge_source, cost_estimate=None):
         decisions = []
         total_words = 0
         for atom in atomic_facts:
@@ -200,9 +202,14 @@ def _get_score(self, topic, generation, atomic_facts, knowledge_source, estimate
                 if not definition[-1] in string.punctuation:
                     definition += "."
                 prompt = "{}\n\nInput: {} True or False?\nOutput:".format(definition.strip(), atom.strip())
-                if estimate:
+
+                if cost_estimate == "consider_cache" and (prompt.strip() + "_0") not in self.lm.cache_dict:
+                    total_words += len(prompt.split())
+                    continue
+                elif cost_estimate == "ignore_cache":
                     total_words += len(prompt.split())
                     continue
+
                 output = self.lm.generate(prompt)
 
                 if type(output[1])==np.ndarray:
@@ -234,7 +241,7 @@ def _get_score(self, topic, generation, atomic_facts, knowledge_source, estimate
 
             decisions.append({"atom": atom, "is_supported": is_supported})
 
-        if estimate:
+        if cost_estimate:
             return total_words
         else:
             return decisions
@@ -260,6 +267,10 @@ def _get_score(self, topic, generation, atomic_facts, knowledge_source, estimate
     parser.add_argument('--cache_dir',
                         type=str,
                         default=".cache/factscore/")
+    parser.add_argument('--cost_estimate',
+                        type=str,
+                        default="consider_cache",
+                        choices=["consider_cache", "ignore_cache"])
     parser.add_argument('--use_atomic_facts',
                         action="store_true")
     parser.add_argument('--verbose',
@@ -282,7 +293,8 @@ def _get_score(self, topic, generation, atomic_facts, knowledge_source, estimate
                     data_dir=args.data_dir,
                     model_dir=args.model_dir,
                     cache_dir=args.cache_dir,
-                    openai_key=args.openai_key)
+                    openai_key=args.openai_key,
+                    cost_estimate=args.cost_estimate)
 
     tot = 0
     topics, generations, atomic_facts = [], [], []

From 026faffba1deacb910d0d02c3e0668c242af9343 Mon Sep 17 00:00:00 2001
From: Kalpesh Krishna
Date: Fri, 2 Jun 2023 17:57:55 -0400
Subject: [PATCH 4/4] Bug fix in the FactScore get_score() cost estimation

---
 factscore/factscorer.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/factscore/factscorer.py b/factscore/factscorer.py
index e3c9ba6..f38f40d 100644
--- a/factscore/factscorer.py
+++ b/factscore/factscorer.py
@@ -203,11 +203,11 @@ def _get_score(self, topic, generation, atomic_facts, knowledge_source, cost_est
                     definition += "."
                 prompt = "{}\n\nInput: {} True or False?\nOutput:".format(definition.strip(), atom.strip())
 
-                if cost_estimate == "consider_cache" and (prompt.strip() + "_0") not in self.lm.cache_dict:
-                    total_words += len(prompt.split())
-                    continue
-                elif cost_estimate == "ignore_cache":
-                    total_words += len(prompt.split())
+                if cost_estimate:
+                    if cost_estimate == "consider_cache" and (prompt.strip() + "_0") not in self.lm.cache_dict:
+                        total_words += len(prompt.split())
+                    elif cost_estimate == "ignore_cache":
+                        total_words += len(prompt.split())
                 continue
 
             output = self.lm.generate(prompt)