
## Overview

The **LFRQA dataset** (Long-form RobustQA) was introduced in the paper [_RAG-QA Arena: Evaluating Domain Robustness for Long-form Retrieval Augmented Question Answering_](https://arxiv.org/pdf/2407.13998). It provides **1,404 science questions** (alongside questions from several other domains), each human-annotated with a long-form answer. This tutorial walks through preparing the dataset for use.

## Step 1: Download the Annotations
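
As one way to get the annotations locally, here is a minimal sketch in plain Python; the hosting URL and file layout below are assumptions, so adjust them to wherever you obtain the file:

```python
# Sketch only: fetch the human-annotated science questions.
# The URL below is an assumption about where the benchmark hosts the file.
import os
import urllib.request

os.makedirs("rag-qa-benchmarking", exist_ok=True)
url = (
    "https://raw.githubusercontent.com/awslabs/rag-qa-arena/main/"
    "data/annotations_science_with_citation.jsonl"  # assumed location
)
urllib.request.urlretrieve(
    url, "rag-qa-benchmarking/annotations_science_with_citation.jsonl"
)
```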

Once the annotations and the document corpus have been downloaded, load both datasets with pandas (the `read_csv` arguments below assume a LoTTE-style science collection file; adjust the path to wherever your corpus actually lives):

```python
import os
import pandas as pd

# Load questions dataset
questions = pd.read_json("rag-qa-benchmarking/annotations_science_with_citation.jsonl", lines=True)
questions = pd.read_json(
"rag-qa-benchmarking/annotations_science_with_citation.jsonl", lines=True
)

# Load documents dataset
docs = pd.read_csv(
    "rag-qa-benchmarking/lotte/science/test/collection.tsv",  # assumed corpus path
    sep="\t",  # assumed tab-separated format
    names=["doc_id", "doc_text"],
)
```

If needed, we can limit the number of documents used:
```python
percentage_to_use = 100 # Adjust this to use a subset of documents
proportion_to_use = percentage_to_use / 100
papers_directory = "rag-qa-benchmarking/lfrqa"
amount_of_docs_to_use = int(len(docs) * proportion_to_use)
partial_docs = docs.head(amount_of_docs_to_use)
print(f"Using {amount_of_docs_to_use} out of {len(docs)} documents")
```

Next, create the folder for the formatted corpus and write each document to its own text file:

```python
os.makedirs(f"{papers_directory}/science_docs_for_paperqa/files", exist_ok=True)

for i, row in partial_docs.iterrows():
    doc_id = row["doc_id"]
    doc_text = row["doc_text"]

    # Write each document to its own text file, named after its doc_id
    with open(
        f"{papers_directory}/science_docs_for_paperqa/files/{doc_id}.txt",
        "w",
        encoding="utf-8",
    ) as f:
        f.write(doc_text)

    # Report progress roughly every 5% of the documents
    # (max(1, ...) guards against a modulo-by-zero on very small subsets)
    if i % max(1, int(len(partial_docs) * 0.05)) == 0:
        progress = (i + 1) / len(partial_docs)
        print(f"Progress: {progress:.2%}")
```

The **manifest file** keeps track of document metadata for the dataset:

```python
manifest = partial_docs.copy()
# One row of metadata per document file
manifest["file_location"] = manifest["doc_id"].apply(lambda x: f"files/{x}.txt")
manifest["doi"] = ""
manifest["overwrite_fields_from_metadata"] = False
manifest["title"] = manifest["doc_id"].apply(lambda x: x)  # use the doc_id as the title
manifest["key"] = manifest["title"]
manifest["docname"] = manifest["title"]
manifest["citation"] = "_"  # placeholder citation
manifest.drop(columns=["doc_id", "doc_text"], inplace=True)
manifest.to_csv(
    f"{papers_directory}/science_docs_for_paperqa/manifest.csv", index=False
)
```

## Step 7: Filter and Save Questions
Finally, we filter the question set so it only includes questions whose gold documents fall within the subset we selected:

```python
partial_questions = questions[
    questions.gold_doc_ids.apply(
        lambda ids: all(id < amount_of_docs_to_use for id in ids)
    )
]
partial_questions.to_csv(f"{papers_directory}/questions.csv", index=False)
```
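
As a quick sanity check (an optional snippet, not part of the original steps), you can confirm how many documents and questions made it into the subset:

```python
# Optional: report how much of the benchmark survived the filtering above.
print(f"Documents kept: {len(partial_docs)} / {len(docs)}")
print(f"Questions kept: {len(partial_questions)} / {len(questions)}")
```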

You now have the **LFRQA dataset** prepared for use! The dataset includes properly formatted documents and a manifest file that makes it easier to work with **PaperQA** or other retrieval-based QA frameworks.
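
From here you can point PaperQA at the generated folder and start asking questions. The snippet below is only a rough sketch: `ask` and `Settings` come from PaperQA's high-level API, but the exact settings fields (such as `paper_directory`) vary between versions, so treat the parameter names as assumptions and check the PaperQA documentation.

```python
# Rough sketch, not the tutorial's own code; settings field names are assumptions.
from paperqa import Settings, ask

answer = ask(
    "Why is it harder to sequence DNA from degraded samples?",  # any LFRQA-style science question
    settings=Settings(
        paper_directory="rag-qa-benchmarking/lfrqa/science_docs_for_paperqa",
    ),
)
print(answer)
```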

For more details, refer to the original paper: [_RAG-QA Arena: Evaluating Domain Robustness for Long-form Retrieval Augmented Question Answering_](https://arxiv.org/pdf/2407.13998).
