Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add --extractRawText to extract raw text from xml #63

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 74 additions & 0 deletions grobid_client/grobid_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,11 @@
import time
import concurrent.futures
import ntpath
from dataclasses import dataclass

import requests
import pathlib
import xml.etree.ElementTree as ET

from .client import ApiClient

Expand Down Expand Up @@ -348,6 +351,66 @@ def process_txt(

return (txt_file, status, res.text)


# Namespace prefix, in ElementTree's "{uri}" tag notation, prepended to local
# names ("text", "body", "div", "head") when matching TEI elements.
# NOTE(review): the identifier reads "TIE" while the URI is the TEI namespace;
# possibly a typo, but renaming requires updating every user of the constant.
TIE_NS = "{http://www.tei-c.org/ns/1.0}"


@dataclass
class RawText:
    """Mutable holder for text accumulated piece by piece.

    Wrapping the string in an object lets several calls append to the same
    buffer through the ``txt`` attribute, which a plain (immutable) ``str``
    argument would not allow.
    """

    # Text collected so far; a fresh instance starts with the empty string.
    txt: str = ""


def extract_raw_text_from_element_recursive(el, raw_txt):
    """Append the text of ``el`` and all of its descendants to ``raw_txt``.

    The element's own text comes first, then each child (depth-first, in
    document order), and finally the element's tail text, i.e. the text
    that follows its closing tag inside the parent.

    Formatting rules:
      * text of a TEI ``head`` element is preceded by a blank line and
        followed by a newline, so headings stand on their own line;
      * every other non-empty text or tail fragment is followed by a
        single space.

    Args:
        el: an ``xml.etree.ElementTree.Element`` to walk.
        raw_txt: accumulator object whose ``txt`` string attribute is
            extended in place.

    Returns:
        None; the result is written to ``raw_txt.txt``.
    """
    is_head = el.tag == f"{TIE_NS}head"

    txt = (el.text or "").strip()
    if txt:
        if is_head:
            raw_txt.txt += "\n\n"
        raw_txt.txt += txt
        raw_txt.txt += "\n" if is_head else " "

    for child in el:
        extract_raw_text_from_element_recursive(child, raw_txt)

    # Append the *stripped* tail. The surrounding whitespace of the raw tail
    # (XML indentation, newlines) must not leak into the output; the
    # separator space is added explicitly, exactly as for element text.
    tail = (el.tail or "").strip()
    if tail:
        raw_txt.txt += tail + " "


def extract_raw_text_from_tei_xml(dirpath, filename):
    """Return the plain text of the body of a TEI XML file.

    Only ``div`` elements that are direct children of the ``text/body``
    element under the document root are traversed; everything else
    (header, back matter, non-``div`` body children) is ignored.

    Args:
        dirpath: directory containing the TEI file.
        filename: name of the TEI file inside ``dirpath``.

    Returns:
        The extracted text with surrounding whitespace removed and a single
        trailing newline, or the empty string when nothing was extracted.

    Raises:
        xml.etree.ElementTree.ParseError: if the file is not well-formed XML.
        OSError: if the file cannot be opened.
    """
    root = ET.parse(f"{dirpath}/{filename}").getroot()
    raw_txt = RawText()

    # Compare against None explicitly: an Element without children is falsy,
    # and truth-testing elements is deprecated in recent Python versions.
    text_el = root.find(f"{TIE_NS}text")
    body_el = text_el.find(f"{TIE_NS}body") if text_el is not None else None
    if body_el is not None:
        # findall with a plain tag matches direct children only.
        for div_el in body_el.findall(f"{TIE_NS}div"):
            extract_raw_text_from_element_recursive(div_el, raw_txt)

    stripped = raw_txt.txt.strip()
    return stripped + "\n" if stripped else ""


def extract_raw_text(output_path, force):
    """Write a ``.txt`` companion for every ``.tei.xml`` file under a directory.

    The tree rooted at ``output_path`` is walked recursively. For each file
    named ``<stem>.tei.xml`` the raw text is extracted and stored next to it
    as ``<stem>.txt``.

    Args:
        output_path: root directory to scan.
        force: when false, an already existing ``.txt`` file is left
            untouched and a notice is printed; when true it is overwritten.

    Returns:
        None.
    """
    tei_suffix = ".tei.xml"
    for dirpath, _dirnames, filenames in os.walk(output_path):
        for filename in filenames:
            if not filename.endswith(tei_suffix):
                continue
            txt_file_name = filename[: -len(tei_suffix)] + ".txt"
            txt_file_path = pathlib.Path(dirpath, txt_file_name)
            if txt_file_path.exists() and not force:
                print(txt_file_name, "already exist, skipping... (use --force to reprocess txt output files)")
                continue
            txt = extract_raw_text_from_tei_xml(dirpath, filename)
            # Write UTF-8 explicitly: extracted text routinely contains
            # non-ASCII characters, and the platform default encoding may be
            # unable to represent them.
            txt_file_path.write_text(txt, encoding="utf-8")


def main():
valid_services = [
"processFulltextDocument",
Expand Down Expand Up @@ -415,6 +478,11 @@ def main():
action="store_true",
help="segment sentences in the text content of the document with additional <s> elements",
)
parser.add_argument(
"--extractRawText",
action="store_true",
help="In --processFulltextDocument mode, additionally extracts raw text from the pdf and stores it in .txt file"
)
parser.add_argument(
"--verbose",
action="store_true",
Expand Down Expand Up @@ -459,6 +527,10 @@ def main():
print("Missing or invalid service, must be one of", valid_services)
exit(1)

if args.extractRawText and service != "processFulltextDocument":
print("--extractRawText only allowed with processFulltextDocument")
exit(1)

try:
client = GrobidClient(config_path=config_path)
except ServerUnavailableException:
Expand All @@ -481,6 +553,8 @@ def main():
force=force,
verbose=verbose,
)
if args.extractRawText and service == "processFulltextDocument":
extract_raw_text(output_path or input_path, force)

runtime = round(time.time() - start_time, 3)
print("runtime: %s seconds " % (runtime))
Expand Down