Bumping up to version 0.2
martiansideofthemoon committed Oct 14, 2023
2 parents 57e2880 + 78f2fb4 commit 03ff672
Showing 5 changed files with 85 additions and 15 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -66,3 +66,4 @@ fs-venv

poetry.lock

acl-publication-info.74k.parquet
11 changes: 5 additions & 6 deletions README.md
@@ -5,15 +5,15 @@
[![PyPI version factscore](https://badge.fury.io/py/factscore.svg)](https://pypi.python.org/pypi/factscore/)
[![Downloads](https://pepy.tech/badge/factscore)](https://pepy.tech/project/factscore)

This is the official release accompanying our preprint, [FActScore: Fine-grained Atomic Evaluation of Factual Precision in Long Form Text Generation](https://arxiv.org/abs/2305.14251). FActScore is available as a PIP package as well.
This is the official release accompanying our EMNLP 2023 paper, [FActScore: Fine-grained Atomic Evaluation of Factual Precision in Long Form Text Generation](https://arxiv.org/abs/2305.14251). FActScore is available as a PIP package as well.

If you find FActScore useful, please cite:
```
@article{ factscore,
@inproceedings{ factscore,
title={ {FActScore}: Fine-grained Atomic Evaluation of Factual Precision in Long Form Text Generation },
author={ Min, Sewon and Krishna, Kalpesh and Lyu, Xinxi and Lewis, Mike and Yih, Wen-tau and Koh, Pang Wei and Iyyer, Mohit and Zettlemoyer, Luke and Hajishirzi, Hannaneh },
year={ 2023 },
journal={ arXiv preprint arXiv:2305.14251 },
booktitle = { EMNLP },
url={ https://arxiv.org/abs/2305.14251 }
}
```
@@ -74,8 +74,7 @@ python -m factscore.factscorer --input_path {input_path} --model_name {estimator
- `--print_rate_limit_error`: If specified, it prints out rate limit errors from the OpenAI API.
- `--cost_estimate`: This flag controls whether the OpenAI API cost estimate shown before any calls are made accounts for the local cache. It can be `"consider_cache"` (default) or `"ignore_cache"`.
- `--abstain_detection`: This flag optionally enables automatic detection of abstained responses. By default this is disabled, but it is recommended to add your own function tailored to your model. The currently supported detectors are `"generic"` and `"perplexity_ai"`, and their implementations can be found in [`factscore/abstain_detection.py`](factscore/abstain_detection.py). There are two methods to add your own abstain function: a) clone our GitHub repository to install `factscore` locally (`pip install --editable .`), and then add your function to [`factscore/abstain_detection.py`](factscore/abstain_detection.py) directly; b) process your abstain detection outside our package, and use empty strings in the `output` key for the JSONL file used in `--input_path` (a minimal sketch of this appears after this list).

This command uses the English Wikipedia from 2023/04/01 as a knowledge source. See [this section](#To-use-a-custom-knowledge-source) to use your own database as a knowledge source!
- `--knowledge_source`: To use a knowledge source other than the default (Wikipedia - 2023/04/01), preprocess it using the [instructions below](#To-use-a-custom-knowledge-source), and then pass its name with this flag.
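
For option (b) above, a minimal sketch of preparing the input file is shown below. The file name, topic, and the `detect_abstention` helper are placeholders; the only requirement stated above is that abstained responses carry an empty string in the `output` key.

```python
import json

# Placeholder abstain detector; replace with logic tailored to your model.
def detect_abstention(text):
    return text.strip().lower().startswith("i'm sorry")

rows = [
    {"topic": "Dense Passage Retrieval for Open-Domain Question Answering",
     "output": "I'm sorry, I cannot summarize this paper."},
]

# Write the JSONL file passed to --input_path; abstentions get an empty `output`.
with open("my_generations.jsonl", "w") as f:
    for row in rows:
        text = "" if detect_abstention(row["output"]) else row["output"]
        f.write(json.dumps({"topic": row["topic"], "output": text}) + "\n")
```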

## To evaluate your own LM

@@ -143,4 +142,4 @@ print (out["respond_ratio"]) # % of responding (not abstaining from answering)
print (out["num_facts_per_response"]) # average number of atomic facts per response
```


To see an example of constructing the ACL anthology knowledge source, see [`preprocessing/preprocess_acl.py`](preprocessing/preprocess_acl.py).
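
A compact sketch of the same flow through the Python API is below; the source name, file paths, topic, and generation are placeholders, and the default `FactScorer` configuration is assumed to have OpenAI access set up as described above.

```python
from factscore.factscorer import FactScorer

fs = FactScorer()

# Register a custom knowledge source built from a JSONL file with "title" and "text" fields.
# Once the DB file is created, it can be reused by pointing `db_path` at it.
fs.register_knowledge_source("my_corpus",
                             data_path="my_corpus.jsonl",
                             db_path=None)

# Score generations against the newly registered source.
out = fs.get_score(topics=["Dense Passage Retrieval for Open-Domain Question Answering"],
                   generations=["The paper introduces a dense retriever for open-domain QA."],
                   knowledge_source="my_corpus")
print(out["score"])
```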
20 changes: 12 additions & 8 deletions factscore/factscorer.py
@@ -108,15 +108,12 @@ def get_score(self,
                  atomic_facts=None,
                  knowledge_source=None,
                  verbose=False):

        if knowledge_source is None:
            # use the default one (enwiki-20230401)
            # use the default knowledge source
            knowledge_source = "enwiki-20230401"
            if knowledge_source not in self.retrieval:
                self.register_knowledge_source(knowledge_source)
        else:
            assert knowledge_source in self.retrieval, \
                f"{knowledge_source} is not registered yet. Please use `register_knowledge_source()` function to register it with a database"

        if knowledge_source not in self.retrieval:
            self.register_knowledge_source(knowledge_source)

        if type(topics)==type(generations)==str:
            topics = [topics]
@@ -294,6 +291,10 @@ def _get_score(self, topic, generation, atomic_facts, knowledge_source, cost_est
    parser.add_argument('--cache_dir',
                        type=str,
                        default=".cache/factscore/")
    parser.add_argument('--knowledge_source',
                        type=str,
                        default=None)

    parser.add_argument('--cost_estimate',
                        type=str,
@@ -351,12 +352,15 @@ def _get_score(self, topic, generation, atomic_facts, knowledge_source, cost_est
                       generations=generations,
                       gamma=args.gamma,
                       atomic_facts=atomic_facts if args.use_atomic_facts else None,
                       knowledge_source=args.knowledge_source,
                       verbose=args.verbose)
    logging.critical("FActScore = %.1f%%" % (100*out["score"]))
    if "init_score" in out:
        logging.critical("FActScore w/o length penalty = %.1f%%" % (100*out["init_score"]))
    logging.critical("Respond ratio = %.1f%%" % (100*out["respond_ratio"]))
    logging.critical("# Atomic facts per valid response = %.1f" % (out["num_facts_per_response"]))

    # Save out as a json file
    with open(args.input_path.replace(".jsonl", f"_factscore_output.json"), 'w') as f:
        f.write(json.dumps(out) + "\n")

66 changes: 66 additions & 0 deletions preprocessing/preprocess_acl.py
@@ -0,0 +1,66 @@
import pandas as pd
import tqdm
import json
import openai
from factscore.openai_lm import call_ChatGPT
from factscore.factscorer import FactScorer

# File downloaded from https://github.com/shauryr/ACL-anthology-corpus
# https://drive.google.com/file/d/1CFCzNGlTls0H-Zcaem4Hg_ETj4ebhcDO/view?usp=sharing
df = pd.read_parquet('acl-publication-info.74k.parquet')
titles = df['title'].tolist()
full_text = df['full_text'].tolist()

acl_corpus = []
for x, y in zip(titles, full_text):
    if x.strip() == "" or y.strip() == "":
        continue
    acl_corpus.append({"title": x, "text": y})

with open("acl_corpus.jsonl", 'w') as f:
for line in acl_corpus:
f.write(json.dumps(line) + "\n")

fs = FactScorer()
# this will create a database using your file
# once DB file is created, you can reuse it by only specifying `db_path`
fs.register_knowledge_source("acl_corpus",
                             data_path="acl_corpus.jsonl",
                             db_path=None)


prompt_titles = [
    "Dense Passage Retrieval for Open-Domain Question Answering",
    "AmbigQA: Answering Ambiguous Open-domain Questions",
    "MetaICL: Learning to Learn In Context",
    "Noisy Channel Language Model Prompting for Few-Shot Text Classification",
    "Joint Passage Ranking for Diverse Multi-Answer Retrieval",
    "Reformulating Unsupervised Style Transfer as Paraphrase Generation",
    "Syntactically Supervised Transformers for Faster Neural Machine Translation",
    "Hurdles to Progress in Long-form Question Answering",
    "Generating Question-Answer Hierarchies",
    "Do Long-Range Language Models Actually Use Long-Range Context?"
]

prompts_list = []

for title in prompt_titles:
    prompts_list.append(f"Give me a summary of the research paper titled \"{title}\".")

with open("api.key", 'r') as f:
api_key = f.readline()
openai.api_key = api_key.strip()

responses = []
for ptitle, prompt in tqdm.tqdm(zip(prompt_titles, prompts_list)):
    message = [{"role": "user", "content": prompt}]
    response = call_ChatGPT(message, model_name="gpt-3.5-turbo-0301")
    responses.append({
        "topic": ptitle,
        "output": response["choices"][0]["message"]["content"]
    })

# write the ChatGPT outputs to a jsonl file
with open("acl_chatgpt_outputs.jsonl", 'w') as f:
    for line in responses:
        f.write(json.dumps(line) + "\n")
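
A possible follow-up, not part of the committed script: score the collected ChatGPT outputs against the `acl_corpus` source registered above. This sketch assumes the `fs` instance has working OpenAI access.

```python
out = fs.get_score(topics=[r["topic"] for r in responses],
                   generations=[r["output"] for r in responses],
                   knowledge_source="acl_corpus")
print(out["score"])                   # FActScore
print(out["respond_ratio"])           # % of responses that did not abstain
print(out["num_facts_per_response"])  # average number of atomic facts per response
```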
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "factscore"
version = "0.1.9"
version = "0.2.0"
description = "FactScore is an automatic evaluation metric for factual precision in long-form text generation. It uses large language models and retrieval to break down generations into atomic facts and then measure the correctness with respect to a knowledge source (like Wikipedia)."
authors = ["Sewon Min <[email protected]>", "Kalpesh Krishna <[email protected]>", "Xinxi Lyu <[email protected]>"]
license = "MIT"
