Skip to content

Commit

Permalink
Merge pull request #1 from aaronkossler/t5_web
Browse files Browse the repository at this point in the history
T5
  • Loading branch information
aaronkossler authored Dec 21, 2023
2 parents f0c585c + d05c8c8 commit 787668f
Show file tree
Hide file tree
Showing 213 changed files with 1,217 additions and 342 deletions.
111 changes: 82 additions & 29 deletions data_preprocessing/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,40 +7,57 @@
import pandas as pd
from sklearn.model_selection import train_test_split
import sys

sys.path.append("..")
import re


# Run create_splits() to build the required data splits and write the evaluation sets as JSON files

def build_abs_path():
    """Return the absolute path of the ``triviaqa_data/`` directory.

    The project root is located by searching the current working directory
    for the last occurrence of ``"trivia_qa"``. Everything after that name is
    dropped, so the result is the same whether the process was started from
    the project root or from one of its sub-directories.

    Returns:
        The data directory path as a string ending in ``/``.

    Raises:
        FileNotFoundError: If ``"trivia_qa"`` does not occur in the current
            working directory, i.e. the project root cannot be determined.
    """
    project_dir_name = "trivia_qa"
    current_working_directory = os.getcwd()

    # Find the last occurrence of the project directory name in the cwd
    last_occurrence_index = current_working_directory.rfind(project_dir_name)
    if last_occurrence_index == -1:
        raise FileNotFoundError(
            f"Cannot locate the '{project_dir_name}' project root from "
            f"'{current_working_directory}'"
        )

    # Cut right after the directory name and let os.path.join add the
    # separator; slicing one character further silently produced
    # ".../trivia_qatriviaqa_data/" when the cwd was the project root itself.
    project_root = current_working_directory[:last_occurrence_index + len(project_dir_name)]
    return os.path.join(project_root, "triviaqa_data") + "/"


# create data splits
# Alternatively, set "web" as domain
def create_splits(hf_datasets = False, as_list_of_dicts = False, create_eval = True, write_path = "../eval_splits", domain = "wikipedia"):
def create_splits(hf_datasets=False, as_list_of_dicts=False, create_eval=False, write_path="../eval_splits",
domain="wikipedia"):
if domain == "wikipedia":
val_size = 7900
elif domain == "web":
val_size = 9500
# download via datasets module
if hf_datasets:
if domain == "wikipedia":
trivia_qa = datasets.load_dataset('trivia_qa', name="rc.wikipedia")
elif domain == "web":
trivia_qa = datasets.load_dataset('trivia_qa', name="rc.web")
trivia_qa = datasets.load_dataset('trivia_qa', name=f"rc.{domain}")
train_split = trivia_qa["train"].train_test_split(shuffle=False, train_size=val_size)

train_split = trivia_qa["train"].train_test_split(shuffle=False, train_size=7900)
validation = train_split["train"]
train = train_split["test"]
test = trivia_qa["validation"]
# download from website
else:
data_path = "../triviaqa_data"
#print(bool(os.path.exists(data_path) and os.listdir(data_path)))
#exit()
data_path = build_abs_path()
# print(bool(os.path.exists(data_path) and os.listdir(data_path)))
# exit()
if not (os.path.exists(data_path) and os.listdir(data_path)):
print("Downloading data...")
wget.download("https://nlp.cs.washington.edu/triviaqa/data/triviaqa-rc.tar.gz", out="../triviaqa-rc.tar.gz")
with tarfile.open("../triviaqa-rc.tar.gz", "r:gz") as tar:
tar.extractall(path=data_path)

train_val = pd.DataFrame(pd.read_json(data_path+'/qa/wikipedia-train.json', encoding='utf-8'))["Data"]
validation, train = train_test_split(train_val, shuffle=False, train_size=7900)
test = pd.DataFrame(pd.read_json(data_path+'/qa/wikipedia-dev.json', encoding='utf-8'))["Data"]

#print(train.info(), train.tolist()[0])
train_val = pd.DataFrame(pd.read_json(data_path + f'/qa/{domain}-train.json', encoding='utf-8'))["Data"]
validation, train = train_test_split(train_val, shuffle=False, train_size=val_size)
test = pd.DataFrame(pd.read_json(data_path + f'/qa/{domain}-dev.json', encoding='utf-8'))["Data"]

if as_list_of_dicts:
splits = {
Expand All @@ -54,19 +71,21 @@ def create_splits(hf_datasets = False, as_list_of_dicts = False, create_eval = T
"validation": validation,
"test": test
}
"""
if create_eval:
#eval_data = preprocess_eval_datasets(splits)

if create_eval and as_list_of_dicts:
# eval_data = preprocess_eval_datasets(splits)
eval_data = {
"validation": splits["validation"],
"test": splits["test"]
"test": splits["test"],
"train": splits["train"]
}
write_files(eval_data, write_path, domain)"""
write_files(eval_data, write_path, domain)

return splits


# Convert the evaluation data (= validation and test) to the desired format
def preprocess_eval_datasets(data, convert_eval = ["validation", "test"]):
def preprocess_eval_datasets(data, convert_eval=["validation", "test", "train"]):
evaluation = {}

for split in convert_eval:
Expand All @@ -92,7 +111,16 @@ def preprocess_eval_datasets(data, convert_eval = ["validation", "test"]):
question = item["question"]
question_id = item["question_id"]
question_source = item["question_source"]
search_results = []
search_results = [
{
"Description": item["search_results"]["description"][index],
"Filename": item["search_results"]["filename"][index],
"Rank": item["search_results"]["rank"][index],
"Title": item["search_results"]["title"][index],
"Url": item["search_results"]["url"][index]
}
for index in range(len(item["search_results"]["filename"]))
]
data_item = {
"Answer": answer,
"EntityPages": entity_pages,
Expand All @@ -107,6 +135,7 @@ def preprocess_eval_datasets(data, convert_eval = ["validation", "test"]):

return evaluation


def write_files(eval_data, write_path, domain):
for key, val in eval_data.items():
output = {
Expand All @@ -121,12 +150,36 @@ def write_files(eval_data, write_path, domain):
with open(write_path + "/{}_{}.json".format(key, domain), "w") as f:
json.dump(output, f)

def build_context(item, domain):
texts = []
for pages in item["EntityPages"]:
filename = pages["Filename"]
text = open(f"../triviaqa_data/evidence/{domain}/{filename}", mode="r", encoding="utf-8").read()
texts.append(text)
context = " ".join(texts)

return context
def cleanup_context(text):
    """Strip bracketed markers and ``File:`` lines from an evidence text.

    Two substitutions are applied in order: first every shortest
    ``[...]`` span on a single line is removed, then every run starting at
    ``File:`` up to and including the next newline is removed.

    Args:
        text: The raw evidence text.

    Returns:
        The text with both kinds of spans deleted.
    """
    without_brackets = re.sub(r'\[.*?\]', '', text)
    return re.sub(r'File:.*\n', '', without_brackets)


def page_to_context(page, domain, format_text):
    """Load the evidence text that belongs to one page or search result.

    Args:
        page: Mapping with a ``"Filename"`` entry naming the evidence file.
        domain: Name of the sub-directory below ``evidence/`` to read from.
        format_text: When truthy, the text is passed through
            ``cleanup_context`` before it is returned.

    Returns:
        The (optionally cleaned) content of the evidence file.
    """
    filename = page["Filename"]
    evidence_path = f"{build_abs_path()}/evidence/{domain}/{filename}"
    # Read inside a context manager so the handle is closed right away;
    # this function is called once per evidence file.
    with open(evidence_path, mode="r", encoding="utf-8") as evidence_file:
        text = evidence_file.read()
    if format_text:
        text = cleanup_context(text)
    return text


def build_context(item, domain, format_text=False):
    """Collect the evidence text(s) for one question.

    Args:
        item: Question record with an ``"EntityPages"`` list and, for the
            ``"web"`` domain, a ``"SearchResults"`` list. Every entry needs a
            ``"Filename"`` key.
        domain: ``"wikipedia"`` or ``"web"``.
        format_text: Forwarded to ``page_to_context``; when truthy the texts
            are cleaned before use.

    Returns:
        * ``"wikipedia"``: all entity page texts joined by a single space.
        * ``"web"``: a dict mapping each filename to its text, entity pages
          first, then search results (a later entry with the same filename
          overwrites an earlier one).
        * any other domain: the empty string.
    """
    if domain == "wikipedia":
        return " ".join(
            page_to_context(page, domain, format_text)
            for page in item["EntityPages"]
        )

    if domain == "web":
        context = {}
        for page in item["EntityPages"]:
            # Entity pages are read from the wikipedia evidence folder even
            # for web questions, matching how the prediction pipeline loads
            # them; only search results come from the web folder.
            text = page_to_context(page, "wikipedia", format_text)
            context[page["Filename"]] = text
        for result in item["SearchResults"]:
            text = page_to_context(result, domain, format_text)
            context[result["Filename"]] = text
        return context

    return ""
File renamed without changes.
1 change: 1 addition & 0 deletions data_splits/train_wikipedia.json

Large diffs are not rendered by default.

File renamed without changes.
41 changes: 0 additions & 41 deletions evaluate_models/evaluate_results.py

This file was deleted.

17 changes: 17 additions & 0 deletions pretrained/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# HuggingFace Pipeline

## Script Execution

To run predictions for different HuggingFace models, the pre_pipeline script shall be executed. The following parameters need to be considered:

- _model_: The name of the model to be used. The model name can be found on the [HuggingFace website](https://huggingface.co/models).
- _domain_ (default → wikipedia): Specify the domain of the data to be fine-tuned on. The domain is used to load the correct dataset.
- _gpu_ (default → yes): Specify if GPU should be used for inference.

The following example shows what an invocation might look like:

python pre_pipeline.py --model deepset/minilm-uncased-squad2 --domain wikipedia --gpu yes

## Model results

The results can be found in the project report.
13 changes: 13 additions & 0 deletions pretrained/pipeline.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/bin/bash
# Slurm batch job: installs the Python requirements and then runs
# pre_pipeline.py. The #SBATCH lines below are the scheduler's resource
# requests (tasks, wall time, one GPU, memory) and the stdout/stderr log files.
#SBATCH --ntasks=40
#SBATCH --time=48:00:00
#SBATCH --gres=gpu:1
#SBATCH --output=outputs/output.txt
#SBATCH --error=outputs/error.txt
#SBATCH --job-name=t5_pipeline
#SBATCH --mem=128000

# Upgrade pip and install the project requirements from the parent directory.
pip install --upgrade pip
pip install -r ../requirements.txt

# Run predictions with the given HuggingFace model on the web domain, using the GPU.
python pre_pipeline.py --model deepset/roberta-base-squad2 --domain web --gpu yes
126 changes: 126 additions & 0 deletions pretrained/pre_functions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
import json
import os
from tqdm import tqdm
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import FARMReader
from haystack.nodes import BM25Retriever
from haystack.pipelines import ExtractiveQAPipeline


def save_predictions(predictions, path, filename):
    """Write ``predictions`` as JSON to ``{path}/{filename}``.

    Args:
        predictions: JSON-serializable object (the prediction dictionary).
        path: Target directory; created, including parents, if missing.
        filename: Name of the file inside ``path``; an existing file is
            overwritten.
    """
    # exist_ok=True removes the check-then-create race of
    # "if not exists: makedirs" when another process creates the directory
    # between the check and the call.
    os.makedirs(path, exist_ok=True)

    # Serialize before opening the file so a serialization error does not
    # leave a truncated output file behind.
    json_string = json.dumps(predictions)

    with open(f"{path}/{filename}", "w") as f:
        f.write(json_string)


def article_to_document_store(article, question_id):
    """Wrap a single article in its own BM25-enabled in-memory store.

    Args:
        article: Text stored as the document's ``content``.
        question_id: Stored under ``meta["question_id"]`` of the document.

    Returns:
        A new ``InMemoryDocumentStore`` holding exactly this one document.
    """
    store = InMemoryDocumentStore(use_bm25=True)
    store.write_documents([
        {
            "content": article,
            "meta": {"question_id": question_id},
        }
    ])
    return store


def read_json(path):
    """Return the full content of the UTF-8 text file at ``path``.

    Despite the name, the content is returned as a raw string and is not
    parsed as JSON.

    Args:
        path: Location of the file to read.

    Returns:
        The file content as ``str``.
    """
    # The context manager closes the handle immediately; the bare
    # open(...).read() leaked one descriptor per evidence file.
    with open(path, mode="r", encoding="utf-8") as source:
        return source.read()


def run_pipeline(documents, reader, query, top_k):
    """Answer ``query`` with a BM25 retriever + reader pipeline.

    Args:
        documents: Document store the BM25 retriever searches.
        reader: Reader node used to extract answers.
        query: The question text.
        top_k: Passed as ``top_k`` to both the retriever and the reader.

    Returns:
        The ``answer`` attribute of the first returned answer, or ``""``
        when the pipeline produced no answers.
    """
    pipeline = ExtractiveQAPipeline(reader, BM25Retriever(document_store=documents))
    run_params = {
        "Retriever": {"top_k": top_k},
        "Reader": {"top_k": top_k},
    }
    result = pipeline.run(query=query, params=run_params)
    if not result["answers"]:
        return ""
    return result["answers"][0].answer


class Predictor:
    """Run an extractive QA reader over the evidence files of a question set.

    For the ``"wikipedia"`` domain one document store is built per question
    (holding all of its entity pages) and one answer is predicted per
    question. For the ``"web"`` domain one store is built per evidence file
    (entity pages and search results) and one answer is predicted per
    question/file pair.
    """

    def __init__(self, model, domain, test, gpu, debug):
        """Store the configuration.

        Args:
            model: Model name or path handed to ``FARMReader``.
            domain: ``"wikipedia"`` or ``"web"``.
            test: Iterable of question records with ``"QuestionId"``,
                ``"Question"``, ``"EntityPages"`` and, for the web domain,
                ``"SearchResults"``.
            gpu: The string ``"yes"`` enables GPU use; anything else
                disables it.
            debug: The string ``"yes"`` enables debug printing in the web
                domain; anything else disables it.
        """
        self.model = model
        self.domain = domain
        self.test = test
        # Flags arrive as strings; only the exact value "yes" switches them on.
        self.gpu = gpu == "yes"
        self.debug = debug == "yes"

    @staticmethod
    def _web_sources(row):
        """Yield ``(evidence_folder, filename)`` for every evidence file of ``row``.

        Entity pages come first and live in the ``wikipedia`` folder, search
        results follow and live in the ``web`` folder.
        """
        for page in row["EntityPages"]:
            yield "wikipedia", page["Filename"]
        for result in row["SearchResults"]:
            yield "web", result["Filename"]

    def build_document_stores(self):
        """Build the document stores the predictions are run against.

        Returns:
            A dict of document stores, keyed by ``QuestionId`` in the
            wikipedia domain and by ``"{QuestionId}--{filename}"`` in the web
            domain. Empty for any other domain.
        """
        documents = {}
        if self.domain == "wikipedia":
            for row in tqdm(self.test, desc="Building Document Stores"):
                document_store = InMemoryDocumentStore(use_bm25=True)
                for page in row["EntityPages"]:
                    filename = page["Filename"]
                    article = read_json(f"../triviaqa_data/evidence/wikipedia/{filename}")
                    document = {
                        "content": article,
                        "meta": {
                            "question_id": row["QuestionId"]
                        },
                    }
                    document_store.write_documents([document])
                # Register the store once per question, after all of its
                # pages have been written.
                documents[row["QuestionId"]] = document_store
        if self.domain == "web":
            for row in self.test:
                for folder, filename in self._web_sources(row):
                    article = read_json(f"../triviaqa_data/evidence/{folder}/{filename}")
                    document_store = article_to_document_store(article, row["QuestionId"])
                    documents[f"{row['QuestionId']}--{filename}"] = document_store
        return documents

    def reader(self):
        """Create the ``FARMReader`` for the configured model and GPU flag."""
        return FARMReader(model_name_or_path=self.model, use_gpu=self.gpu)

    def predict(self):
        """Predict one answer per document store.

        Returns:
            A dict of answer strings using the same keys as
            ``build_document_stores``; the value is ``""`` when no answer was
            found.
        """
        documents = self.build_document_stores()
        reader = self.reader()
        predictions = {}
        if self.domain == "wikipedia":
            for entry in tqdm(self.test, desc="Predicting Answers"):
                prediction = run_pipeline(documents[entry['QuestionId']], reader, entry['Question'], 1)
                predictions[entry['QuestionId']] = prediction
        if self.domain == "web":
            for entry in tqdm(self.test, desc="Predicting Answers"):
                # Entity pages and search results are handled identically
                # here; only the folder differs, which matters when building
                # the stores but not when querying them.
                for _folder, filename in self._web_sources(entry):
                    key = f"{entry['QuestionId']}--{filename}"
                    prediction = run_pipeline(documents[key], reader, entry['Question'], 1)
                    if self.debug:
                        print(f"Question: {entry['Question']}")
                        print(f"Answers: {prediction}")
                        print(f"Filename: {filename}")
                    predictions[key] = prediction

        return predictions
Loading

0 comments on commit 787668f

Please sign in to comment.