diff --git a/.gitignore b/.gitignore
index 2d75dd8..959ac62 100644
--- a/.gitignore
+++ b/.gitignore
@@ -66,3 +66,4 @@ fs-venv
 poetry.lock
+acl-publication-info.74k.parquet
diff --git a/README.md b/README.md
index c0d1ab6..6c776b4 100644
--- a/README.md
+++ b/README.md
@@ -5,15 +5,15 @@
 [![PyPI version factscore](https://badge.fury.io/py/factscore.svg)](https://pypi.python.org/pypi/factscore/)
 [![Downloads](https://pepy.tech/badge/factscore)](https://pepy.tech/project/factscore)
 
-This is the official release accompanying our preprint, [FActScore: Fine-grained Atomic Evaluation of Factual Precision in Long Form Text Generation](https://arxiv.org/abs/2305.14251). FActScore is available as a PIP package as well.
+This is the official release accompanying our EMNLP 2023 paper, [FActScore: Fine-grained Atomic Evaluation of Factual Precision in Long Form Text Generation](https://arxiv.org/abs/2305.14251). FActScore is available as a PIP package as well.
 
 If you find FActScore useful, please cite:
 ```
-@article{ factscore,
+@inproceedings{ factscore,
     title={ {FActScore}: Fine-grained Atomic Evaluation of Factual Precision in Long Form Text Generation },
     author={ Min, Sewon and Krishna, Kalpesh and Lyu, Xinxi and Lewis, Mike and Yih, Wen-tau and Koh, Pang Wei and Iyyer, Mohit and Zettlemoyer, Luke and Hajishirzi, Hannaneh },
     year={ 2023 },
-    journal={ arXiv preprint arXiv:2305.14251 },
+    booktitle = { EMNLP },
     url={ https://arxiv.org/abs/2305.14251 }
 }
 ```
@@ -74,8 +74,7 @@ python -m factscore.factscorer --input_path {input_path} --model_name {estimator
 - `--print_rate_limit_error`: If specified, it prints out rate limit errors from OpenAI API.
 - `--cost_estimate`: This flag decides the type of OpenAI API cost estimation that we provide before calling it. It can be `"consider_cache"` (default) or `"ignore_cache"`.
 - `--abstain_detection`: This flag optionally enables automatic detection of abstained responses. By default this is disabled, but it is recommended to add your own function tailored to your model. The currently supported detectors are `"generic"` and `"perplexity_ai"`, and their implementations can be found in [`factscore/abstain_detection.py`](factscore/abstain_detection.py). There are two methods to add your own abstain function: a) clone our GitHub repository to install `factscore` locally (`pip install --editable .`), and then add your function to [`factscore/abstain_detection.py`](factscore/abstain_detection.py) directly; b) process your abstain detection outside our package, and use empty strings in the `output` key for the JSONL file used in `--input_path`.
-
-This command uses the English Wikipedia from 2023/04/01 as a knowledge source. See [this section](#To-use-a-custom-knowledge-source) to use your own database as a knowledge source!
+- `--knowledge_source`: To use a knowledge source other than the default (English Wikipedia from 2023/04/01), preprocess it following the [instructions below](#To-use-a-custom-knowledge-source), and then pass its name via this flag.
 
 ## To evaluate your own LM
 
@@ -143,4 +142,4 @@ print (out["respond_ratio"]) # % of responding (not abstaining from answering)
 print (out["num_facts_per_response"]) # average number of atomic facts per response
 ```
 
-
+For an example of constructing the ACL anthology knowledge source, see [`preprocessing/preprocess_acl.py`](preprocessing/preprocess_acl.py).
diff --git a/factscore/factscorer.py b/factscore/factscorer.py
index bb3c6b0..531659d 100644
--- a/factscore/factscorer.py
+++ b/factscore/factscorer.py
@@ -108,15 +108,12 @@ def get_score(self,
                   atomic_facts=None,
                   knowledge_source=None,
                   verbose=False):
-
         if knowledge_source is None:
-            # use the default one (enwiki-20230401)
+            # use the default knowledge source
             knowledge_source = "enwiki-20230401"
-            if knowledge_source not in self.retrieval:
-                self.register_knowledge_source(knowledge_source)
-        else:
-            assert knowledge_source in self.retrieval, \
-                f"{knowledge_source} is not registered yet. Please use `register_knowledge_source()` function to register it with a database"
+
+        if knowledge_source not in self.retrieval:
+            self.register_knowledge_source(knowledge_source)
 
         if type(topics)==type(generations)==str:
             topics = [topics]
@@ -294,6 +291,10 @@ def _get_score(self, topic, generation, atomic_facts, knowledge_source, cost_est
     parser.add_argument('--cache_dir',
                         type=str,
                         default=".cache/factscore/")
+    parser.add_argument('--knowledge_source',
+                        type=str,
+                        default=None)
+
     parser.add_argument('--cost_estimate',
                         type=str,
@@ -351,6 +352,7 @@ def _get_score(self, topic, generation, atomic_facts, knowledge_source, cost_est
                        generations=generations,
                        gamma=args.gamma,
                        atomic_facts=atomic_facts if args.use_atomic_facts else None,
+                       knowledge_source=args.knowledge_source,
                        verbose=args.verbose)
     logging.critical("FActScore = %.1f%%" % (100*out["score"]))
     if "init_score" in out:
@@ -358,5 +360,7 @@ def _get_score(self, topic, generation, atomic_facts, knowledge_source, cost_est
     logging.critical("Respond ratio = %.1f%%" % (100*out["respond_ratio"]))
     logging.critical("# Atomic facts per valid response = %.1f" % (out["num_facts_per_response"]))
-
+    # Save out as a json file
+    with open(args.input_path.replace(".jsonl", f"_factscore_output.json"), 'w') as f:
+        f.write(json.dumps(out) + "\n")
diff --git a/preprocessing/preprocess_acl.py b/preprocessing/preprocess_acl.py
new file mode 100644
index 0000000..c5bbb7d
--- /dev/null
+++ b/preprocessing/preprocess_acl.py
@@ -0,0 +1,66 @@
+import pandas as pd
+import tqdm
+import json
+import openai
+from factscore.openai_lm import call_ChatGPT
+from factscore.factscorer import FactScorer
+
+# File downloaded from https://github.com/shauryr/ACL-anthology-corpus
+# https://drive.google.com/file/d/1CFCzNGlTls0H-Zcaem4Hg_ETj4ebhcDO/view?usp=sharing
+df = pd.read_parquet('acl-publication-info.74k.parquet')
+titles = df['title'].tolist()
+full_text = df['full_text'].tolist()
+
+acl_corpus = []
+for x, y in zip(titles, full_text):
+    if x.strip() == "" or y.strip() == "":
+        continue
+    acl_corpus.append({"title": x, "text": y})
+
+with open("acl_corpus.jsonl", 'w') as f:
+    for line in acl_corpus:
+        f.write(json.dumps(line) + "\n")
+
+fs = FactScorer()
+# this will create a database using your file
+# once the DB file is created, you can reuse it by only specifying `db_path`
+fs.register_knowledge_source("acl_corpus",
+                             data_path="acl_corpus.jsonl",
+                             db_path=None)
+
+
+prompt_titles = [
+    "Dense Passage Retrieval for Open-Domain Question Answering",
+    "AmbigQA: Answering Ambiguous Open-domain Questions",
+    "MetaICL: Learning to Learn In Context",
+    "Noisy Channel Language Model Prompting for Few-Shot Text Classification",
+    "Joint Passage Ranking for Diverse Multi-Answer Retrieval",
+    "Reformulating Unsupervised Style Transfer as Paraphrase Generation",
+    "Syntactically Supervised Transformers for Faster Neural Machine Translation",
+    "Hurdles to Progress in Long-form Question Answering",
+    "Generating Question-Answer Hierarchies",
+    "Do Long-Range Language Models Actually Use Long-Range Context?"
+]
+
+prompts_list = []
+
+for title in prompt_titles:
+    prompts_list.append(f"Give me a summary of the research paper titled \"{title}\".")
+
+with open("api.key", 'r') as f:
+    api_key = f.readline()
+openai.api_key = api_key.strip()
+
+responses = []
+for ptitle, prompt in tqdm.tqdm(zip(prompt_titles, prompts_list)):
+    message = [{"role": "user", "content": prompt}]
+    response = call_ChatGPT(message, model_name="gpt-3.5-turbo-0301")
+    responses.append({
+        "topic": ptitle,
+        "output": response["choices"][0]["message"]["content"]
+    })
+
+# write the ChatGPT outputs to a jsonl file
+with open("acl_chatgpt_outputs.jsonl", 'w') as f:
+    for line in responses:
+        f.write(json.dumps(line) + "\n")
diff --git a/pyproject.toml b/pyproject.toml
index 825d6fc..947afa9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "factscore"
-version = "0.1.9"
+version = "0.2.0"
 description = "FactScore is an automatic evaluation metric for factual precision in long-form text generation. It uses large language models and retrieval to break down generations into atomic facts and then measure the correctness with respect to a knowledge source (like Wikipedia)."
 authors = ["Sewon Min ", "Kalpesh Krishna ", "Xinxi Lyu "]
 license = "MIT"