From dc5fee44bb837efb9a6a6bb8fd85008111548731 Mon Sep 17 00:00:00 2001 From: Kalpesh Krishna Date: Tue, 27 Jun 2023 10:30:46 -0400 Subject: [PATCH 1/6] Bumping up to version 0.1.7 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 79c8f1b..4d3fa82 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "factscore" -version = "0.1.6" +version = "0.1.7" description = "FactScore is an automatic evaluation metric for factual precision in long-form text generation. It uses large language models and retrieval to break down generations into atomic facts and then measure the correctness with respect to a knowledge source (like Wikipedia)." authors = ["Sewon Min ", "Kalpesh Krishna ", "Xinxi Lyu "] license = "MIT" From 18192fcc97fff7a8fb24c1b1c41a4537fd3bfc64 Mon Sep 17 00:00:00 2001 From: Kalpesh Krishna Date: Wed, 30 Aug 2023 13:39:44 -0400 Subject: [PATCH 2/6] Small changes to ensure custom corpus addition works --- .gitignore | 1 + factscore/factscorer.py | 20 +++--- factscore/retrieval.py | 5 +- preprocessing/postprocess_acl.py | 23 +++++++ preprocessing/preprocess_acl.py | 102 +++++++++++++++++++++++++++++ preprocessing/preprocess_acl_kb.py | 16 +++++ 6 files changed, 158 insertions(+), 9 deletions(-) create mode 100644 preprocessing/postprocess_acl.py create mode 100644 preprocessing/preprocess_acl.py create mode 100644 preprocessing/preprocess_acl_kb.py diff --git a/.gitignore b/.gitignore index 2d75dd8..959ac62 100644 --- a/.gitignore +++ b/.gitignore @@ -66,3 +66,4 @@ fs-venv poetry.lock +acl-publication-info.74k.parquet diff --git a/factscore/factscorer.py b/factscore/factscorer.py index bb3c6b0..531659d 100644 --- a/factscore/factscorer.py +++ b/factscore/factscorer.py @@ -108,15 +108,12 @@ def get_score(self, atomic_facts=None, knowledge_source=None, verbose=False): - if knowledge_source is None: - # use the default one (enwiki-20230401) + # use the default knowledge source knowledge_source = "enwiki-20230401" - if knowledge_source not in self.retrieval: - self.register_knowledge_source(knowledge_source) - else: - assert knowledge_source in self.retrieval, \ - f"{knowledge_source} is not registered yet. 
Please use `register_knowledge_source()` function to register it with a database" + + if knowledge_source not in self.retrieval: + self.register_knowledge_source(knowledge_source) if type(topics)==len(generations)==str: topics = [topics] @@ -294,6 +291,10 @@ def _get_score(self, topic, generation, atomic_facts, knowledge_source, cost_est parser.add_argument('--cache_dir', type=str, default=".cache/factscore/") + parser.add_argument('--knowledge_source', + type=str, + default=None) + parser.add_argument('--cost_estimate', type=str, @@ -351,6 +352,7 @@ def _get_score(self, topic, generation, atomic_facts, knowledge_source, cost_est generations=generations, gamma=args.gamma, atomic_facts=atomic_facts if args.use_atomic_facts else None, + knowledge_source=args.knowledge_source, verbose=args.verbose) logging.critical("FActScore = %.1f%%" % (100*out["score"])) if "init_score" in out: @@ -358,5 +360,7 @@ def _get_score(self, topic, generation, atomic_facts, knowledge_source, cost_est logging.critical("Respond ratio = %.1f%%" % (100*out["respond_ratio"])) logging.critical("# Atomic facts per valid response = %.1f" % (out["num_facts_per_response"])) - + # Save out as a json file + with open(args.input_path.replace(".jsonl", f"_factscore_output.json"), 'w') as f: + f.write(json.dumps(out) + "\n") diff --git a/factscore/retrieval.py b/factscore/retrieval.py index 66576d4..c2733d5 100644 --- a/factscore/retrieval.py +++ b/factscore/retrieval.py @@ -56,6 +56,8 @@ def build_db(self, db_path, data_path): with open(data_path, "r") as f: for line in f: + if tot % 100 == 0: + print(f"Finished reading {tot} lines ({(time.time() - start_time) / 60:.2f} min)") dp = json.loads(line) title = dp["title"] text = dp["text"] @@ -77,7 +79,7 @@ def build_db(self, db_path, data_path): while offset < len(tokens): passages.append(tokens[offset:offset+MAX_LENGTH]) offset += MAX_LENGTH - + psgs = [tokenizer.decode(tokens) for tokens in passages if np.sum([t not in [0, 2] for t in tokens])>0] text = SPECIAL_SEPARATOR.join(psgs) output_lines.append((title, text)) @@ -99,6 +101,7 @@ def get_text_from_title(self, title): """Fetch the raw text of the doc for 'doc_id'.""" cursor = self.connection.cursor() cursor.execute("SELECT text FROM documents WHERE title = ?", (title,)) + # cursor.execute("SELECT title FROM documents") results = cursor.fetchall() results = [r for r in results] cursor.close() diff --git a/preprocessing/postprocess_acl.py b/preprocessing/postprocess_acl.py new file mode 100644 index 0000000..9859313 --- /dev/null +++ b/preprocessing/postprocess_acl.py @@ -0,0 +1,23 @@ +import csv +import json + +# read json file acl_chatgpt_outputs_factscore_output.json + +with open('acl_chatgpt_outputs_factscore_output.json') as f: + data = json.load(f) + +output_csv = [] + +for idx, instance in enumerate(data['decisions']): + for atomic in instance: + output_csv.append({ + 'atom': atomic['atom'], + 'factscore_is_supported': atomic['is_supported'], + 'instance_id': idx + }) + +# write to csv +with open('acl_chatgpt_outputs_factscore_output.csv', 'w') as f: + writer = csv.DictWriter(f, fieldnames=['instance_id', 'atom', 'factscore_is_supported']) + writer.writeheader() + writer.writerows(output_csv) diff --git a/preprocessing/preprocess_acl.py b/preprocessing/preprocess_acl.py new file mode 100644 index 0000000..9015230 --- /dev/null +++ b/preprocessing/preprocess_acl.py @@ -0,0 +1,102 @@ +import pandas as pd +import tqdm +import json +import openai +from factscore.openai_lm import call_ChatGPT + + +df = 
pd.read_parquet('acl-publication-info.74k.parquet') +titles = df['title'].tolist() +abstracts = df['abstract'].tolist() +full_text = df['full_text'].tolist() +years = df['year'].tolist() +authors = [[y.strip() for y in x.split("and\n")] if x is not None else None for x in df['author'].tolist()] + +# # build the corpus first +# output_corpus = [] +# for title, abstract, ftext, author in zip(titles, abstracts, full_text, authors): +# if author is not None and ftext.strip(): +# output_corpus.append({"title": title, "text": ftext}) + +# print(f"Number of papers in the corpus: {len(output_corpus)} ({len(titles) - len(output_corpus)} filtered)") + +# # write the corpus to a jsonl file +# with open("acl_corpus.jsonl", 'w') as f: +# for line in output_corpus: +# f.write(json.dumps(line) + "\n") + +prompt_titles = [ + "Dense Passage Retrieval for Open-Domain Question Answering", + "AmbigQA: Answering Ambiguous Open-domain Questions", + "MetaICL: Learning to Learn In Context", + "Noisy Channel Language Model Prompting for Few-Shot Text Classification", + "Joint Passage Ranking for Diverse Multi-Answer Retrieval", + "Reformulating Unsupervised Style Transfer as Paraphrase Generation", + "Syntactically Supervised Transformers for Faster Neural Machine Translation", + "Hurdles to Progress in Long-form Question Answering", + "Generating Question-Answer Hierarchies", + "Do Long-Range Language Models Actually Use Long-Range Context?" +] + +prompts_list = [] + +# find all papers whose author is Kalpesh Krishna +for title, abstract, ftext, author in zip(titles, abstracts, full_text, authors): + if title.strip() in prompt_titles: + assert ftext.strip() + +for title in prompt_titles: + prompts_list.append( + f"Give me a summary of the research paper titled \"{title}\"." 
+ ) + +with open("api.key", 'r') as f: + api_key = f.readline() +openai.api_key = api_key.strip() + +responses = [] +for ptitle, prompt in tqdm.tqdm(zip(prompt_titles, prompts_list)): + message = [{"role": "user", "content": prompt}] + response = call_ChatGPT(message, model_name="gpt-3.5-turbo-0301") + responses.append({ + "topic": ptitle, + "output": response["choices"][0]["message"]["content"] + }) + +# # write the corpus to a jsonl file +with open("acl_chatgpt_outputs.jsonl", 'w') as f: + for line in responses: + f.write(json.dumps(line) + "\n") + +# def count_freqs(counts, str_name="authors"): +# counts = [(k, v) for k, v in counts.items()] +# freq_list = [0, 1, 3, 10, 20, 50, 100000] +# print("") +# for i in range(len(freq_list) - 1): +# num_counts = [x for x in counts if x[1] > freq_list[i] and x[1] <= freq_list[i + 1]] +# print(f"Number of {str_name} with {freq_list[i]} < freq <= {freq_list[i + 1]}: {len(num_counts)} ({len(num_counts) / len(counts) * 100:.2f}%)") +# print("") + +# count_freqs(Counter(authors), "authors") + +# all_entities = [] +# for idx, abstract in tqdm.tqdm(enumerate(abstracts)): +# doc = nlp(abstract) +# curr_ents = [] +# for ent in doc.ents: +# curr_ents.append(ent.text.strip()) +# curr_ents = list(set(curr_ents)) +# all_entities.append(curr_ents) + +# if (idx + 1) % 3000 == 0: +# count_freqs(Counter([y for x in all_entities for y in x]), "entities") + +# # write all_entities to a pickle file +# with open("indexes/acl_entities.pkl", 'wb') as f: +# pickle.dump(all_entities, f) + +# acl_counts = Counter([y for x in all_entities for y in x]) +# # sort by frequency +# acl_counts = [(k, v) for k, v in acl_counts.items()] +# acl_counts = sorted(acl_counts, key=lambda x: x[1], reverse=True) + diff --git a/preprocessing/preprocess_acl_kb.py b/preprocessing/preprocess_acl_kb.py new file mode 100644 index 0000000..44ece90 --- /dev/null +++ b/preprocessing/preprocess_acl_kb.py @@ -0,0 +1,16 @@ +from factscore.factscorer import FactScorer + +fs = FactScorer() + +# this will create a database using your file +# for English Wikipedia (18GB)), it takes ~8 hours +# once DB file is created, you can reuse it by only specifying `db_path` +fs.register_knowledge_source("acl_corpus", + data_path="acl_corpus.jsonl", + db_path=None) + +# # now, when you compute a score, specify knowledge source to use +# out = fs.get_score(topics, generations, knowledge_source=name_of_your_knowledge_source) +# print (out["score"]) # FActScore +# print (out["respond_ratio"]) # % of responding (not abstaining from answering) +# print (out["num_facts_per_response"]) # average number of atomic facts per response From 99cd0e92366da4eeaf7912b1f5c24467fc67088d Mon Sep 17 00:00:00 2001 From: Kalpesh Krishna Date: Wed, 30 Aug 2023 13:49:39 -0400 Subject: [PATCH 3/6] Cleaning up custom corpus changes --- factscore/retrieval.py | 5 +-- preprocessing/postprocess_acl.py | 23 ---------- preprocessing/preprocess_acl.py | 71 +++++++----------------------- preprocessing/preprocess_acl_kb.py | 16 ------- 4 files changed, 18 insertions(+), 97 deletions(-) delete mode 100644 preprocessing/postprocess_acl.py delete mode 100644 preprocessing/preprocess_acl_kb.py diff --git a/factscore/retrieval.py b/factscore/retrieval.py index c2733d5..66576d4 100644 --- a/factscore/retrieval.py +++ b/factscore/retrieval.py @@ -56,8 +56,6 @@ def build_db(self, db_path, data_path): with open(data_path, "r") as f: for line in f: - if tot % 100 == 0: - print(f"Finished reading {tot} lines ({(time.time() - start_time) / 60:.2f} min)") dp = 
json.loads(line) title = dp["title"] text = dp["text"] @@ -79,7 +77,7 @@ def build_db(self, db_path, data_path): while offset < len(tokens): passages.append(tokens[offset:offset+MAX_LENGTH]) offset += MAX_LENGTH - + psgs = [tokenizer.decode(tokens) for tokens in passages if np.sum([t not in [0, 2] for t in tokens])>0] text = SPECIAL_SEPARATOR.join(psgs) output_lines.append((title, text)) @@ -101,7 +99,6 @@ def get_text_from_title(self, title): """Fetch the raw text of the doc for 'doc_id'.""" cursor = self.connection.cursor() cursor.execute("SELECT text FROM documents WHERE title = ?", (title,)) - # cursor.execute("SELECT title FROM documents") results = cursor.fetchall() results = [r for r in results] cursor.close() diff --git a/preprocessing/postprocess_acl.py b/preprocessing/postprocess_acl.py deleted file mode 100644 index 9859313..0000000 --- a/preprocessing/postprocess_acl.py +++ /dev/null @@ -1,23 +0,0 @@ -import csv -import json - -# read json file acl_chatgpt_outputs_factscore_output.json - -with open('acl_chatgpt_outputs_factscore_output.json') as f: - data = json.load(f) - -output_csv = [] - -for idx, instance in enumerate(data['decisions']): - for atomic in instance: - output_csv.append({ - 'atom': atomic['atom'], - 'factscore_is_supported': atomic['is_supported'], - 'instance_id': idx - }) - -# write to csv -with open('acl_chatgpt_outputs_factscore_output.csv', 'w') as f: - writer = csv.DictWriter(f, fieldnames=['instance_id', 'atom', 'factscore_is_supported']) - writer.writeheader() - writer.writerows(output_csv) diff --git a/preprocessing/preprocess_acl.py b/preprocessing/preprocess_acl.py index 9015230..9d7e80d 100644 --- a/preprocessing/preprocess_acl.py +++ b/preprocessing/preprocess_acl.py @@ -3,27 +3,30 @@ import json import openai from factscore.openai_lm import call_ChatGPT +from factscore.factscorer import FactScorer df = pd.read_parquet('acl-publication-info.74k.parquet') titles = df['title'].tolist() -abstracts = df['abstract'].tolist() full_text = df['full_text'].tolist() -years = df['year'].tolist() -authors = [[y.strip() for y in x.split("and\n")] if x is not None else None for x in df['author'].tolist()] -# # build the corpus first -# output_corpus = [] -# for title, abstract, ftext, author in zip(titles, abstracts, full_text, authors): -# if author is not None and ftext.strip(): -# output_corpus.append({"title": title, "text": ftext}) +acl_corpus = [] +for x, y in zip(titles, full_text): + if x.strip() == "" or y.strip() == "": + continue + acl_corpus.append({"title": x, "text": y}) -# print(f"Number of papers in the corpus: {len(output_corpus)} ({len(titles) - len(output_corpus)} filtered)") +with open("acl_corpus.jsonl", 'w') as f: + for line in acl_corpus: + f.write(json.dumps(line) + "\n") + +fs = FactScorer() +# this will create a database using your file +# once DB file is created, you can reuse it by only specifying `db_path` +fs.register_knowledge_source("acl_corpus", + data_path="acl_corpus.jsonl", + db_path=None) -# # write the corpus to a jsonl file -# with open("acl_corpus.jsonl", 'w') as f: -# for line in output_corpus: -# f.write(json.dumps(line) + "\n") prompt_titles = [ "Dense Passage Retrieval for Open-Domain Question Answering", @@ -40,15 +43,8 @@ prompts_list = [] -# find all papers whose author is Kalpesh Krishna -for title, abstract, ftext, author in zip(titles, abstracts, full_text, authors): - if title.strip() in prompt_titles: - assert ftext.strip() - for title in prompt_titles: - prompts_list.append( - f"Give me a summary of the 
research paper titled \"{title}\"." - ) + prompts_list.append(f"Give me a summary of the research paper titled \"{title}\".") with open("api.key", 'r') as f: api_key = f.readline() @@ -67,36 +63,3 @@ with open("acl_chatgpt_outputs.jsonl", 'w') as f: for line in responses: f.write(json.dumps(line) + "\n") - -# def count_freqs(counts, str_name="authors"): -# counts = [(k, v) for k, v in counts.items()] -# freq_list = [0, 1, 3, 10, 20, 50, 100000] -# print("") -# for i in range(len(freq_list) - 1): -# num_counts = [x for x in counts if x[1] > freq_list[i] and x[1] <= freq_list[i + 1]] -# print(f"Number of {str_name} with {freq_list[i]} < freq <= {freq_list[i + 1]}: {len(num_counts)} ({len(num_counts) / len(counts) * 100:.2f}%)") -# print("") - -# count_freqs(Counter(authors), "authors") - -# all_entities = [] -# for idx, abstract in tqdm.tqdm(enumerate(abstracts)): -# doc = nlp(abstract) -# curr_ents = [] -# for ent in doc.ents: -# curr_ents.append(ent.text.strip()) -# curr_ents = list(set(curr_ents)) -# all_entities.append(curr_ents) - -# if (idx + 1) % 3000 == 0: -# count_freqs(Counter([y for x in all_entities for y in x]), "entities") - -# # write all_entities to a pickle file -# with open("indexes/acl_entities.pkl", 'wb') as f: -# pickle.dump(all_entities, f) - -# acl_counts = Counter([y for x in all_entities for y in x]) -# # sort by frequency -# acl_counts = [(k, v) for k, v in acl_counts.items()] -# acl_counts = sorted(acl_counts, key=lambda x: x[1], reverse=True) - diff --git a/preprocessing/preprocess_acl_kb.py b/preprocessing/preprocess_acl_kb.py deleted file mode 100644 index 44ece90..0000000 --- a/preprocessing/preprocess_acl_kb.py +++ /dev/null @@ -1,16 +0,0 @@ -from factscore.factscorer import FactScorer - -fs = FactScorer() - -# this will create a database using your file -# for English Wikipedia (18GB)), it takes ~8 hours -# once DB file is created, you can reuse it by only specifying `db_path` -fs.register_knowledge_source("acl_corpus", - data_path="acl_corpus.jsonl", - db_path=None) - -# # now, when you compute a score, specify knowledge source to use -# out = fs.get_score(topics, generations, knowledge_source=name_of_your_knowledge_source) -# print (out["score"]) # FActScore -# print (out["respond_ratio"]) # % of responding (not abstaining from answering) -# print (out["num_facts_per_response"]) # average number of atomic facts per response From 2b45af1d4ecb6d339446c68991e1ae098bcd6bdc Mon Sep 17 00:00:00 2001 From: Kalpesh Krishna Date: Wed, 30 Aug 2023 13:55:36 -0400 Subject: [PATCH 4/6] Some small changes to README on the custom corpus --- README.md | 5 ++--- preprocessing/preprocess_acl.py | 3 ++- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index c0d1ab6..b0d62e0 100644 --- a/README.md +++ b/README.md @@ -74,8 +74,7 @@ python -m factscore.factscorer --input_path {input_path} --model_name {estimator - `--print_rate_limit_error`: It specified, it prints out rate limit errors from OpenAI API. - `--cost_estimate`: This flag decides the type of OpenAI API cost estimation that we provide before calling it. It can be `"consider_cache"` (default) or `"ignore_cache"`. - `--abstain_detection`: This flag optionally enables automatic detection of abstained responses. By default this is disabled, but it is recommended to add your own function tailored to your model. 
The currently supported detectors are `"generic"` and `"perplexity_ai"`, and their implementations can be found in [`factscore/abstain_detection.py`](factscore/abstain_detection.py). There are two methods to add your own abstain function: a) clone our GitHub repository to install `factscore` locally (`pip install --editable .`), and then add your function to [`factscore/abstain_detection.py`](factscore/abstain_detection.py) directly; b) process your abstain detection outside our package, and use empty strings in the `output` key for the JSONL file used in `--input_path`. - -This command uses the English Wikipedia from 2023/04/01 as a knowledge source. See [this section](#To-use-a-custom-knowledge-source) to use your own database as a knowledge source! +- `--knowledge_source`: In case the default knowledge source (Wikipedia - 2023/04/01) will not be used, preprocess it using the [instructions below](#To-use-a-custom-knowledge-source), and then specify the knowledge_source name under this flag. ## To evaluate your own LM @@ -143,4 +142,4 @@ print (out["respond_ratio"]) # % of responding (not abstaining from answering) print (out["num_facts_per_response"]) # average number of atomic facts per response ``` - +To see an example of constructing the ACL anthology knowledge source, see [`preprocessing/preprocess_acl.py`](preprocessing/preprocess_acl.py). diff --git a/preprocessing/preprocess_acl.py b/preprocessing/preprocess_acl.py index 9d7e80d..c5bbb7d 100644 --- a/preprocessing/preprocess_acl.py +++ b/preprocessing/preprocess_acl.py @@ -5,7 +5,8 @@ from factscore.openai_lm import call_ChatGPT from factscore.factscorer import FactScorer - +# File downloaded from https://github.com/shauryr/ACL-anthology-corpus +# https://drive.google.com/file/d/1CFCzNGlTls0H-Zcaem4Hg_ETj4ebhcDO/view?usp=sharing df = pd.read_parquet('acl-publication-info.74k.parquet') titles = df['title'].tolist() full_text = df['full_text'].tolist() From 8263550c46d61106b71510f86ba5e7edc8a2ee3f Mon Sep 17 00:00:00 2001 From: Kalpesh Krishna Date: Sat, 7 Oct 2023 16:34:04 -0400 Subject: [PATCH 5/6] Add EMNLP 2023 acceptance to the README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b0d62e0..818f14b 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ [![PyPI version factscore](https://badge.fury.io/py/factscore.svg)](https://pypi.python.org/pypi/factscore/) [![Downloads](https://pepy.tech/badge/factscore)](https://pepy.tech/project/factscore) -This is the official release accompanying our preprint, [FActScore: Fine-grained Atomic Evaluation of Factual Precision in Long Form Text Generation](https://arxiv.org/abs/2305.14251). FActScore is available as a PIP package as well. +This is the official release accompanying our EMNLP 2023 paper, [FActScore: Fine-grained Atomic Evaluation of Factual Precision in Long Form Text Generation](https://arxiv.org/abs/2305.14251). FActScore is available as a PIP package as well. 
If you find FActScore useful, please cite: ``` From 78f2fb412b37b1a23933debe9f9d4fd94a55174f Mon Sep 17 00:00:00 2001 From: Sewon Min Date: Sun, 8 Oct 2023 15:56:16 -0700 Subject: [PATCH 6/6] update bibtex --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 818f14b..6c776b4 100644 --- a/README.md +++ b/README.md @@ -9,11 +9,11 @@ This is the official release accompanying our EMNLP 2023 paper, [FActScore: Fine If you find FActScore useful, please cite: ``` -@article{ factscore, +@inproceedings{ factscore, title={ {FActScore}: Fine-grained Atomic Evaluation of Factual Precision in Long Form Text Generation }, author={ Min, Sewon and Krishna, Kalpesh and Lyu, Xinxi and Lewis, Mike and Yih, Wen-tau and Koh, Pang Wei and Iyyer, Mohit and Zettlemoyer, Luke and Hajishirzi, Hannaneh }, year={ 2023 }, - journal={ arXiv preprint arXiv:2305.14251 }, + booktitle = { EMNLP }, url={ https://arxiv.org/abs/2305.14251 } } ```
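Taken together, patches 2-4 enable scoring against a custom corpus rather than the default Wikipedia dump. The sketch below mirrors `preprocessing/preprocess_acl.py` and the README snippet added above; the file names (`acl_corpus.jsonl`, `acl_chatgpt_outputs.jsonl`) and record keys come from those files, while the wiring of the loop is illustrative rather than part of the patches.

```python
# Minimal sketch of the custom knowledge source workflow introduced in these
# patches. It follows preprocessing/preprocess_acl.py and the README example;
# anything beyond those snippets (e.g., re-reading the ChatGPT outputs here)
# is an assumption for illustration, not new API.
import json

from factscore.factscorer import FactScorer

fs = FactScorer()

# Build the knowledge-source DB once from a JSONL file of {"title", "text"} records.
fs.register_knowledge_source("acl_corpus",
                             data_path="acl_corpus.jsonl",
                             db_path=None)

# Load the generations written by preprocess_acl.py ({"topic", "output"} per line).
topics, generations = [], []
with open("acl_chatgpt_outputs.jsonl") as f:
    for line in f:
        dp = json.loads(line)
        topics.append(dp["topic"])
        generations.append(dp["output"])

# Score against the custom knowledge source instead of the default enwiki-20230401.
out = fs.get_score(topics, generations, knowledge_source="acl_corpus")
print(out["score"])                   # FActScore
print(out["respond_ratio"])           # % of responses that do not abstain
print(out["num_facts_per_response"])  # average number of atomic facts per response
```

The same run can presumably also go through the CLI once the knowledge source is registered, by passing `--knowledge_source acl_corpus` to `python -m factscore.factscorer` (the flag added in patch 2), which additionally writes the results next to the input file as `*_factscore_output.json`.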