Merge pull request #1 from aaronkossler/t5_web

T5
aaronkossler · Dec 21, 2023 · 787668f · 787668f
2 parents f0c585c + d05c8c8
commit 787668f
Show file tree

Hide file tree

Showing 213 changed files with 1,217 additions and 342 deletions.
diff --git a/data_preprocessing/preprocessing.py b/data_preprocessing/preprocessing.py
@@ -7,40 +7,57 @@
 import pandas as pd
 from sklearn.model_selection import train_test_split
 import sys
+
 sys.path.append("..")
+import re
+
 
 # Execute create splits to create the required data splits and write the evaluation sets as jsons
 
+def build_abs_path():
+    """Return the location of the ``triviaqa_data/`` directory as a string ending in "/".
+
+    The path is derived from the current working directory: everything up to
+    and including the last occurrence of "trivia_qa" (plus the one separator
+    character that follows it) is kept, and "triviaqa_data/" is appended.
+
+    If the working directory does not contain "trivia_qa" at all, the relative
+    path "../triviaqa_data/" is returned instead of slicing the working
+    directory at a meaningless position.
+    """
+    marker = "trivia_qa"
+    current_working_directory = os.getcwd()
+
+    # Find the last occurrence of "trivia_qa" in the current working directory
+    last_occurrence_index = current_working_directory.rfind(marker)
+    if last_occurrence_index == -1:
+        # str.rfind() reports "not found" as -1; using that value in the slice
+        # below would keep an arbitrary prefix of the working directory.
+        return "../triviaqa_data/"
+
+    # Truncate the path after the marker and the separator that follows it
+    root_end = last_occurrence_index + len(marker) + 1
+    truncated_path = current_working_directory[:root_end]
+    if root_end > len(current_working_directory):
+        # The working directory ends exactly with the marker, so there was no
+        # trailing separator to keep; add one before appending the data folder.
+        truncated_path += "/"
+
+    return truncated_path + "triviaqa_data/"
+
+
 # create data splits
 # Alternatively, set "web" as domain
-def create_splits(hf_datasets = False, as_list_of_dicts = False, create_eval = True, write_path = "../eval_splits", domain = "wikipedia"):
+def create_splits(hf_datasets=False, as_list_of_dicts=False, create_eval=False, write_path="../eval_splits",
+                  domain="wikipedia"):
+    if domain == "wikipedia":
+        val_size = 7900
+    elif domain == "web":
+        val_size = 9500
     # download via datasets module
     if hf_datasets:
-        if domain == "wikipedia":
-            trivia_qa = datasets.load_dataset('trivia_qa', name="rc.wikipedia")
-        elif domain == "web":
-            trivia_qa = datasets.load_dataset('trivia_qa', name="rc.web")
+        trivia_qa = datasets.load_dataset('trivia_qa', name=f"rc.{domain}")
+        train_split = trivia_qa["train"].train_test_split(shuffle=False, train_size=val_size)
 
-        train_split = trivia_qa["train"].train_test_split(shuffle=False, train_size=7900)
         validation = train_split["train"]
         train = train_split["test"]
         test = trivia_qa["validation"]
     # download from website
     else:
-        data_path = "../triviaqa_data"
-        #print(bool(os.path.exists(data_path) and os.listdir(data_path)))
-        #exit()
+        data_path = build_abs_path()
+        # print(bool(os.path.exists(data_path) and os.listdir(data_path)))
+        # exit()
         if not (os.path.exists(data_path) and os.listdir(data_path)):
             print("Downloading data...")
             wget.download("https://nlp.cs.washington.edu/triviaqa/data/triviaqa-rc.tar.gz", out="../triviaqa-rc.tar.gz")
             with tarfile.open("../triviaqa-rc.tar.gz", "r:gz") as tar:
                 tar.extractall(path=data_path)
 
-        train_val = pd.DataFrame(pd.read_json(data_path+'/qa/wikipedia-train.json', encoding='utf-8'))["Data"]
-        validation, train = train_test_split(train_val, shuffle=False, train_size=7900)
-        test = pd.DataFrame(pd.read_json(data_path+'/qa/wikipedia-dev.json', encoding='utf-8'))["Data"]
-
-    #print(train.info(), train.tolist()[0])
+        train_val = pd.DataFrame(pd.read_json(data_path + f'/qa/{domain}-train.json', encoding='utf-8'))["Data"]
+        validation, train = train_test_split(train_val, shuffle=False, train_size=val_size)
+        test = pd.DataFrame(pd.read_json(data_path + f'/qa/{domain}-dev.json', encoding='utf-8'))["Data"]
 
     if as_list_of_dicts:
         splits = {
@@ -54,19 +71,21 @@ def create_splits(hf_datasets = False, as_list_of_dicts = False, create_eval = T
             "validation": validation,
             "test": test
         }
-    """
-    if create_eval:
-        #eval_data = preprocess_eval_datasets(splits)
+
+    if create_eval and as_list_of_dicts:
+        # eval_data = preprocess_eval_datasets(splits)
         eval_data = {
             "validation": splits["validation"],
-            "test": splits["test"]
+            "test": splits["test"],
+            "train": splits["train"]
         }
-        write_files(eval_data, write_path, domain)"""
+        write_files(eval_data, write_path, domain)
 
     return splits
 
+
 # Convert the evaluation data (= validation and test) to the desired format
-def preprocess_eval_datasets(data, convert_eval = ["validation", "test"]):
+def preprocess_eval_datasets(data, convert_eval=["validation", "test", "train"]):
     evaluation = {}
 
     for split in convert_eval:
@@ -92,7 +111,16 @@ def preprocess_eval_datasets(data, convert_eval = ["validation", "test"]):
             question = item["question"]
             question_id = item["question_id"]
             question_source = item["question_source"]
-            search_results = []
+            search_results = [
+                {
+                    "Description": item["search_results"]["description"][index],
+                    "Filename": item["search_results"]["filename"][index],
+                    "Rank": item["search_results"]["rank"][index],
+                    "Title": item["search_results"]["title"][index],
+                    "Url": item["search_results"]["url"][index]
+                }
+                for index in range(len(item["search_results"]["filename"]))
+            ]
             data_item = {
                 "Answer": answer,
                 "EntityPages": entity_pages,
@@ -107,6 +135,7 @@ def preprocess_eval_datasets(data, convert_eval = ["validation", "test"]):
 
     return evaluation
 
+
 def write_files(eval_data, write_path, domain):
     for key, val in eval_data.items():
         output = {
@@ -121,12 +150,36 @@ def write_files(eval_data, write_path, domain):
         with open(write_path + "/{}_{}.json".format(key, domain), "w") as f:
             json.dump(output, f)
 
-def build_context(item, domain):
-    texts = []
-    for pages in item["EntityPages"]:
-        filename = pages["Filename"]
-        text = open(f"../triviaqa_data/evidence/{domain}/{filename}", mode="r", encoding="utf-8").read()
-        texts.append(text)
-    context = " ".join(texts)
 
-    return context
+def cleanup_context(text):
+    """Return ``text`` with bracketed spans and "File:" remainders removed.
+
+    Two substitutions are applied in order:
+    1. every shortest "[...]" span that does not cross a newline is dropped;
+    2. everything from "File:" through the next newline is dropped.
+    """
+    without_brackets = re.sub(r'\[.*?\]', '', text)
+    return re.sub(r'File:.*\n', '', without_brackets)
+
+
+def page_to_context(page, domain, format_text):
+    """Read the evidence file referenced by ``page`` and return its text.
+
+    Args:
+        page: mapping with a "Filename" entry naming the evidence file.
+        domain: sub-folder of ``evidence/`` in which the file is looked up.
+        format_text: when truthy, the text is passed through ``cleanup_context``.
+
+    Returns:
+        The file content as a string (cleaned up if ``format_text`` is set).
+    """
+    filename = page["Filename"]
+    evidence_path = f"{build_abs_path()}/evidence/{domain}/{filename}"
+    # Use a context manager so the handle is closed right after reading; this
+    # function is called once per evidence file and would otherwise leave every
+    # handle open until garbage collection.
+    with open(evidence_path, mode="r", encoding="utf-8") as evidence_file:
+        text = evidence_file.read()
+    if format_text:
+        text = cleanup_context(text)
+    return text
+
+
+def build_context(item, domain, format_text=False):
+    """Collect the evidence text belonging to ``item``.
+
+    Returns:
+        - domain "wikipedia": one string, the texts of all entity pages joined
+          by a single space;
+        - domain "web": a dict mapping each filename to its text, filled from
+          the entity pages first and the search results second (a search
+          result with the same filename overwrites the entity page entry);
+        - any other domain: the empty string.
+    """
+    if domain == "wikipedia":
+        return " ".join(
+            page_to_context(entity_page, domain, format_text)
+            for entity_page in item["EntityPages"]
+        )
+
+    if domain == "web":
+        by_filename = {}
+        for source_key in ("EntityPages", "SearchResults"):
+            for source in item[source_key]:
+                page_text = page_to_context(source, domain, format_text)
+                by_filename[source["Filename"]] = page_text
+        return by_filename
+
+    return ""
diff --git a/evaluate_models/test_wikipedia.json → data_splits/test_wikipedia.json b/evaluate_models/test_wikipedia.json → data_splits/test_wikipedia.json
diff --git a/data_splits/train_wikipedia.json b/data_splits/train_wikipedia.json
diff --git a/evaluate_models/validation_wikipedia.json → data_splits/validation_wikipedia.json b/evaluate_models/validation_wikipedia.json → data_splits/validation_wikipedia.json
diff --git a/evaluate_models/evaluate_results.py b/evaluate_models/evaluate_results.py
diff --git a/pretrained/README.md b/pretrained/README.md
@@ -0,0 +1,17 @@
+# HuggingFace Pipeline
+
+## Script Execution
+
+To run predictions with a HuggingFace model, execute the `pre_pipeline.py` script. It accepts the following parameters:
+
+- _model_: The name of the model to be used. The model name can be found on the [HuggingFace website](https://huggingface.co/models).
+- _domain_ (default &rarr; wikipedia): Specify the domain of the data to run predictions on (`wikipedia` or `web`). The domain is used to load the correct dataset.
+- _gpu_ (default &rarr; yes): Specify if GPU should be used for inference.
+
+An example invocation looks like this:
+
+    python pre_pipeline.py --model deepset/minilm-uncased-squad2 --domain wikipedia --gpu yes
+
+## Model results
+
+The results can be found in the project report.
diff --git a/pretrained/pipeline.sh b/pretrained/pipeline.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+# Batch job script for the prediction pipeline. The #SBATCH lines below are
+# scheduler directives (presumably Slurm — confirm against the cluster setup).
+#SBATCH --ntasks=40
+#SBATCH --time=48:00:00
+#SBATCH --gres=gpu:1
+#SBATCH --output=outputs/output.txt
+#SBATCH --error=outputs/error.txt
+#SBATCH --job-name=t5_pipeline
+#SBATCH --mem=128000
+
+# Upgrade pip, then install the dependencies listed in the parent directory's
+# requirements file.
+pip install --upgrade pip
+pip install -r ../requirements.txt
+
+# Run pre_pipeline.py with the deepset/roberta-base-squad2 model on the "web"
+# domain with GPU usage enabled.
+python pre_pipeline.py --model deepset/roberta-base-squad2 --domain web --gpu yes
diff --git a/pretrained/pre_functions.py b/pretrained/pre_functions.py
@@ -0,0 +1,126 @@
+import json
+import os
+from tqdm import tqdm
+from haystack.document_stores import InMemoryDocumentStore
+from haystack.nodes import FARMReader
+from haystack.nodes import BM25Retriever
+from haystack.pipelines import ExtractiveQAPipeline
+
+
+def save_predictions(predictions, path, filename):
+    """Serialize ``predictions`` as JSON into the file ``path``/``filename``.
+
+    Args:
+        predictions: JSON-serializable object (the callers in this file pass a dict).
+        path: target directory; it is created, including parents, if missing.
+        filename: name of the file inside ``path``; an existing file is overwritten.
+    """
+    # exist_ok=True replaces the separate "exists?" check, which could fail if
+    # another process created the directory between the check and makedirs().
+    os.makedirs(path, exist_ok=True)
+
+    # Stream the JSON straight into the file instead of building the complete
+    # string in memory first.
+    with open(f"{path}/{filename}", "w", encoding="utf-8") as f:
+        json.dump(predictions, f)
+
+
+def article_to_document_store(article, question_id):
+    """Create a BM25-enabled in-memory document store holding one document.
+
+    The document's content is ``article`` and its metadata records
+    ``question_id`` under the key "question_id".
+    """
+    store = InMemoryDocumentStore(use_bm25=True)
+    store.write_documents([
+        {"content": article, "meta": {"question_id": question_id}},
+    ])
+    return store
+
+
+def read_json(path):
+    """Return the complete text of the UTF-8 encoded file at ``path``.
+
+    Despite the name, the content is returned as a raw string; it is not
+    parsed as JSON.
+    """
+    # The context manager closes the handle immediately; the function is called
+    # once per evidence file, so unclosed handles would pile up quickly.
+    with open(path, mode="r", encoding="utf-8") as handle:
+        return handle.read()
+
+
+def run_pipeline(documents, reader, query, top_k):
+    """Answer ``query`` against ``documents`` and return the first answer text.
+
+    A BM25 retriever is built over ``documents`` and combined with ``reader``
+    in an extractive QA pipeline; ``top_k`` is passed to both the retriever
+    and the reader. Returns "" when the pipeline yields no answers.
+    """
+    pipeline = ExtractiveQAPipeline(reader, BM25Retriever(document_store=documents))
+    run_params = {"Retriever": {"top_k": top_k}, "Reader": {"top_k": top_k}}
+    answers = pipeline.run(query=query, params=run_params)["answers"]
+    return answers[0].answer if answers else ""
+
+
+class Predictor:
+    """Runs an extractive QA reader over the evidence files of a question set.
+
+    ``gpu`` and ``debug`` arrive as strings; only the exact value "yes"
+    switches the corresponding flag on.
+    """
+
+    def __init__(self, model, domain, test, gpu, debug):
+        self.model = model
+        self.domain = domain
+        self.test = test
+        self.gpu = gpu == "yes"
+        self.debug = debug == "yes"
+
+    def build_document_stores(self):
+        """Build the in-memory document stores for ``self.test``.
+
+        - "wikipedia": one store per question (key: question id) containing
+          all of that question's entity pages.
+        - "web": one store per evidence file (key: "<question id>--<filename>");
+          entity pages are read from the wikipedia evidence folder, search
+          results from the web evidence folder.
+        """
+        stores = {}
+        if self.domain == "wikipedia":
+            for row in tqdm(self.test, desc="Building Document Stores"):
+                store = InMemoryDocumentStore(use_bm25=True)
+                for page in row["EntityPages"]:
+                    filename = page["Filename"]
+                    article = read_json(f"../triviaqa_data/evidence/wikipedia/{filename}")
+                    store.write_documents([
+                        {"content": article, "meta": {"question_id": row["QuestionId"]}},
+                    ])
+                stores[row["QuestionId"]] = store
+        if self.domain == "web":
+            # (key in the row, evidence sub-folder holding the referenced files)
+            sources = (("EntityPages", "wikipedia"), ("SearchResults", "web"))
+            for row in self.test:
+                for source_key, folder in sources:
+                    for source in row[source_key]:
+                        filename = source["Filename"]
+                        article = read_json(f"../triviaqa_data/evidence/{folder}/{filename}")
+                        store = article_to_document_store(article, row["QuestionId"])
+                        stores[f"{row['QuestionId']}--{filename}"] = store
+        return stores
+
+    def reader(self):
+        """Create the FARM reader for ``self.model``, honouring the GPU flag."""
+        return FARMReader(model_name_or_path=self.model, use_gpu=self.gpu)
+
+    def predict(self):
+        """Predict one answer per document store and return them as a dict.
+
+        Keys match those produced by ``build_document_stores``: the question id
+        for "wikipedia", "<question id>--<filename>" for "web". With the debug
+        flag set, the web branch prints question, answer and filename for
+        every prediction.
+        """
+        documents = self.build_document_stores()
+        reader = self.reader()
+        predictions = {}
+        if self.domain == "wikipedia":
+            for entry in tqdm(self.test, desc="Predicting Answers"):
+                answer = run_pipeline(documents[entry['QuestionId']], reader, entry['Question'], 1)
+                predictions[entry['QuestionId']] = answer
+        if self.domain == "web":
+            for entry in tqdm(self.test, desc="Predicting Answers"):
+                # Entity pages are handled before search results
+                for source_key in ("EntityPages", "SearchResults"):
+                    for source in entry[source_key]:
+                        filename = source["Filename"]
+                        store_key = f"{entry['QuestionId']}--{filename}"
+                        answer = run_pipeline(documents[store_key], reader, entry['Question'], 1)
+                        if self.debug:
+                            print(f"Question: {entry['Question']}")
+                            print(f"Answers: {answer}")
+                            print(f"Filename: {filename}")
+                        predictions[store_key] = answer
+
+        return predictions