Skip to content

Commit

Permalink
More document parsing
Browse files Browse the repository at this point in the history
caufieldjh committed Nov 29, 2023
1 parent 1e63682 commit 79597f4
Showing 1 changed file with 16 additions and 9 deletions.
25 changes: 16 additions & 9 deletions src/ontogpt/evaluation/craft/eval_craft_ner.py
Original file line number Diff line number Diff line change
@@ -57,6 +57,8 @@
# Directory containing this module; DATABASE_DIR is located relative to it.
THIS_DIR = Path(__file__).parent
DATABASE_DIR = THIS_DIR / "database" / "all"

# URI prefix that annotation parsing rewrites to the "MONDO:" CURIE prefix.
MONDO_PURL_PREFIX = "http://purl.obolibrary.org/obo/MONDO_"

# These are the entity types involved in this dataset.
TARGET_TYPES = [
"AnatomicalElement",
@@ -138,8 +140,12 @@ def load_test_cases(self) -> Iterable[Document]:
def load_cases(self, path: Path) -> Iterable[Document]:
logger.info(f"Loading {path}")

entities_by_text = defaultdict(list)

# Load documents
# Remove extra empty lines from input text
# Parse annotations to identifier only
# MONDO ids are given as full PURLs, so rewrite their prefix to the "MONDO:" CURIE form
for docfilepath in path.glob("*.txt"):
logger.info(f"Loading text doc {docfilepath}")
with open(docfilepath, "r") as docfile:
@@ -148,18 +154,19 @@ def load_cases(self, path: Path) -> Iterable[Document]:
logger.info(f"Loading corresponding annotation file {annfilepath}")
with open(annfilepath, "r") as annfile:
annotations = annfile.readlines()
print(doctext)
print(len(annotations))

doc = {}
these_annotations = []
for annotation in annotations:
uri = (annotation.split())[1]
if uri.startswith(MONDO_PURL_PREFIX):
uri = uri.replace(MONDO_PURL_PREFIX, 'MONDO:')
if uri not in these_annotations:
these_annotations.append(uri)


# Validate documents

# with gzip.open(str(path), "rb") as f:
# collection = biocxml.load(f)
# chemicals_by_text = defaultdict(list)
# diseases_by_text = defaultdict(list)
# for document in collection.documents:
# these_annotations = []
# doc = {}
# for p in document.passages:
# doc[p.infons["type"]] = p.text
# for a in p.annotations:

0 comments on commit 79597f4

Please sign in to comment.