diff --git a/.gitignore b/.gitignore
index 1fd389c..8b3ac10 100644
--- a/.gitignore
+++ b/.gitignore
@@ -33,3 +33,4 @@ snakejob.*
listings/
dist
+debug
diff --git a/src/bioconverters/constants.py b/src/bioconverters/constants.py
new file mode 100644
index 0000000..5d2f871
--- /dev/null
+++ b/src/bioconverters/constants.py
@@ -0,0 +1,50 @@
+GREEK_ALPHABET = {
+ '\u0391': 'Alpha',
+ '\u0392': 'Beta',
+ '\u0393': 'Gamma',
+ '\u0394': 'Delta',
+ '\u0395': 'Epsilon',
+ '\u0396': 'Zeta',
+ '\u0397': 'Eta',
+ '\u0398': 'Theta',
+ '\u0399': 'Iota',
+ '\u039A': 'Kappa',
+ '\u039B': 'Lambda',
+ '\u039C': 'Mu',
+ '\u039D': 'Nu',
+ '\u039E': 'Xi',
+ '\u039F': 'Omicron',
+ '\u03A0': 'Pi',
+ '\u03A1': 'Rho',
+ '\u03A3': 'Sigma',
+ '\u03A4': 'Tau',
+ '\u03A5': 'Upsilon',
+ '\u03A6': 'Phi',
+ '\u03A7': 'Chi',
+ '\u03A8': 'Psi',
+ '\u03A9': 'Omega',
+ '\u03B1': 'alpha',
+ '\u03B2': 'beta',
+ '\u03B3': 'gamma',
+ '\u03B4': 'delta',
+ '\u03B5': 'epsilon',
+ '\u03B6': 'zeta',
+ '\u03B7': 'eta',
+ '\u03B8': 'theta',
+ '\u03B9': 'iota',
+ '\u03BA': 'kappa',
+ '\u03BB': 'lambda',
+ '\u03BC': 'mu',
+ '\u03BD': 'nu',
+ '\u03BE': 'xi',
+ '\u03BF': 'omicron',
+ '\u03C0': 'pi',
+ '\u03C1': 'rho',
+ '\u03C3': 'sigma',
+ '\u03C4': 'tau',
+ '\u03C5': 'upsilon',
+ '\u03C6': 'phi',
+ '\u03C7': 'chi',
+ '\u03C8': 'psi',
+ '\u03C9': 'omega',
+}
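
A sketch of how this mapping is consumed (the replace loop mirrors the one added to cleanup_text below; the variable names here are illustrative, not part of the patch):

    from bioconverters.constants import GREEK_ALPHABET

    # spell out each greek codepoint using the mapping
    text = 'IFN-\u03b3 and TNF-\u03b1 levels'
    for greek_letter, replacement in GREEK_ALPHABET.items():
        text = text.replace(greek_letter, replacement)
    assert text == 'IFN-gamma and TNF-alpha levels'
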
diff --git a/src/bioconverters/utils.py b/src/bioconverters/utils.py
index 4e2b142..18e8d4c 100644
--- a/src/bioconverters/utils.py
+++ b/src/bioconverters/utils.py
@@ -2,9 +2,13 @@
import unicodedata
import uuid
import xml.etree.cElementTree as etree
+from copy import copy
from typing import Callable, Dict, Iterable, List, Optional, Tuple
import bioc
+from unidecode import unidecode
+
+from .constants import GREEK_ALPHABET
# XML elements to ignore the contents of
IGNORE_LIST = [
@@ -19,7 +23,7 @@
"tex-math",
"mml:math",
"object-id",
- "ext-link",
+ "ext-link", # TODO: should we keep URL content? some of these have text rather than the URL as inner content
]
# XML elements to separate text between (into different passages)
@@ -34,15 +38,18 @@
"label",
]
+
TABLE_DELIMITER = '\t'
TABLE_DELIMITED_TAGS = {'tr', 'th', 'td'}
+# Tags that should be prepended with a space on merge
+PSEUDO_SPACE_TAGS = {'sup', 'break'}
+ANNOTATION_MARKER_PATTERN = r'ANN_[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}'
class TextChunk:
text: str
xml_node: str
xml_path: str
- non_separating: bool = False
is_tail: bool = False
is_annotation: bool = False
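
A quick sanity check of what ANNOTATION_MARKER_PATTERN is built to match, namely the ANN_<uuid4> markers generated during chunk merging (a sketch, not part of the patch):

    import re
    import uuid

    from bioconverters.utils import ANNOTATION_MARKER_PATTERN

    # uuid4 renders as lowercase 8-4-4-4-12 hex, which the pattern encodes
    assert re.fullmatch(ANNOTATION_MARKER_PATTERN, f'ANN_{uuid.uuid4()}')
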
@@ -51,14 +58,12 @@ def __init__(
text,
xml_node,
xml_path=None,
- non_separating=False,
is_tail=False,
is_annotation=False,
):
self.text = text
self.xml_node = xml_node
self.xml_path = xml_path
- self.non_separating = non_separating or is_annotation
self.is_tail = is_tail
self.is_annotation = is_annotation
@@ -72,10 +77,10 @@ def __repr__(self):
tag = self.tag
if self.is_tail:
tag = f'{tag}#'
- ns = '-ns' if self.non_separating else ''
- tag = f'{tag}{ns}'
if self.text:
tag = f'{tag}+text[{len(self.text)}]'
+ if self.is_annotation:
+ tag = f'{tag}@'
return tag
@property
@@ -92,9 +97,9 @@ def remove_brackets_without_words(text: str) -> str:
changed = True
previous_text = text
while changed:
- fixed = re.sub(r"\([^\w\t]*\)", "", previous_text)
- fixed = re.sub(r"\[[^\w\t]*\]", "", fixed)
- fixed = re.sub(r"\{[^\w\t]*\}", "", fixed)
+ fixed = re.sub(r"\([^\w\t-]*\)", "", previous_text)
+ fixed = re.sub(r"\[[^\w\t-]*\]", "", fixed)
+ fixed = re.sub(r"\{[^\w\t-]*\}", "", fixed)
changed = bool(previous_text != fixed)
previous_text = fixed
return fixed
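
A sketch of the behaviour change from adding '-' to the excluded character classes: punctuation-only brackets are still dropped, but a lone hyphen, e.g. a receptor-status flag, now keeps its brackets (the example strings are illustrative):

    from bioconverters.utils import remove_brackets_without_words

    # empty brackets are still removed (leaving a doubled space for cleanup_text)
    assert remove_brackets_without_words('an empty () pair') == 'an empty  pair'
    # a lone hyphen no longer counts as word-less, so the brackets survive
    assert remove_brackets_without_words('HER2 (-) status') == 'HER2 (-) status'
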
@@ -115,22 +120,30 @@ def cleanup_text(text: str) -> str:
"""
# Remove some "control-like" characters (left/right separator)
text = text.replace(u"\u2028", " ").replace(u"\u2029", " ")
+ text = text.replace('°', ' ° ')
text = "".join(ch for ch in text if unicodedata.category(ch)[0] != "C" or ch == TABLE_DELIMITER)
text = "".join(ch if unicodedata.category(ch)[0] != "Z" else " " for ch in text)
+    # replace Greek letters with their spelled-out equivalents
+ for greek_letter, replacement in GREEK_ALPHABET.items():
+ text = text.replace(greek_letter, replacement)
+
+ text = unidecode(text, errors='preserve')
+
    # Remove repeated commas, and commas next to periods
text = re.sub(r",([^\S\t]*,)*", ",", text)
text = re.sub(r"(,[^\S\t]*)*\.", ".", text)
text = remove_brackets_without_words(text)
- # remove extra spaces from in-text figute/table citations
+ # remove extra spaces from in-text figure/table citations
text = re.sub(r'\([^\S\t]*([^)]*[^\s)])[^\S\t]*\)', r'(\1)', text)
# remove trailing spaces before periods
text = re.sub(r'[^\S\t]+\.(\s|$)', r'.\1', text)
# remove extra spaces around commas/semi-colons
- text = re.sub(r'[^\S\t]*([,;])[^\S\t]+', r'\1 ', text)
+ text = re.sub(r'[^\S\t]*([,;:])([^\S\t]+)', r'\1 ', text)
+ text = re.sub(r'[^\S\t]*([,;:])$', r'\1', text)
# trim leading and trailing non tab whitespace
text = re.sub(r'(^|\t)([^\S\t]+)', r'\1', text)
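
Putting the new cleanup steps together, Greek letters are spelled out and comma spacing is tightened; a rough example, assuming the function as patched above and no later steps altering the string:

    from bioconverters.utils import cleanup_text

    # beta is spelled out via GREEK_ALPHABET and the space before the comma is dropped
    assert cleanup_text('TGF-\u03b2 signalling , in vivo') == 'TGF-beta signalling, in vivo'
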
@@ -168,17 +181,14 @@ def merge_adjacent_xref_siblings(elem_list):
    If two adjacent XML elements in a list are both xrefs separated by nothing or only punctuation, merge them
"""
siblings = []
-
for elem in elem_list:
if siblings and elem.tag == 'xref' and siblings[-1].tag == 'xref':
# merge these 2 if the tail of the first element is a punctuation mark
prev_tail = (siblings[-1].tail or '').strip()
if (
- siblings[-1].tail
- and len(prev_tail) == 1
- and unicodedata.category(prev_tail)[0] == 'P'
- and elem.attrib.get('ref-type') == siblings[-1].attrib.get('ref-type')
- ):
+ not prev_tail
+ or (len(prev_tail) == 1 and unicodedata.category(prev_tail[0])[0] == 'P')
+ ) and elem.attrib.get('ref-type') == siblings[-1].attrib.get('ref-type'):
siblings[-1].text = (siblings[-1].text or '') + prev_tail + (elem.text or '')
siblings[-1].tail = elem.tail
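
With the relaxed condition, adjacent xrefs now merge when the first has no tail at all or a single punctuation tail; a toy example (this assumes the non-merging append branch of the function, not shown in this hunk, is unchanged):

    import xml.etree.ElementTree as etree

    from bioconverters.utils import merge_adjacent_xref_siblings

    para = etree.fromstring('<p><xref ref-type="bibr">1</xref>,<xref ref-type="bibr">2</xref> tail</p>')
    merged = merge_adjacent_xref_siblings(list(para))
    # the comma tail is folded into a single merged xref
    assert len(merged) == 1 and merged[0].text == '1,2'
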
@@ -189,7 +199,7 @@ def merge_adjacent_xref_siblings(elem_list):
def get_tag_path(mapping: Dict[etree.Element, etree.Element], node: etree.Element) -> str:
"""
- Get a string representing the path of the currentl XML node in the heirachry of the XML file
+ Get a string representing the path of the current XML node in the hierarchy of the XML file
"""
path = []
current_node = node
@@ -217,8 +227,8 @@ def tag_handler(
except NotImplementedError:
pass
# Extract any raw text directly in XML element or just after
- head = (elem.text or "").strip()
- tail = (elem.tail or "").strip()
+ head = elem.text or ""
+ tail = elem.tail or ""
# Then get the text from all child XML nodes recursively
child_passages = []
@@ -246,16 +256,14 @@ def tag_handler(
):
# Check if the tag should be ignored (so don't use main contents)
return [
- TextChunk(tail, elem, non_separating=True, is_tail=True),
+ TextChunk(tail, elem, is_tail=True),
]
return [TextChunk(head, elem)] + child_passages + [TextChunk(tail, elem, is_tail=True)]
def strip_annotation_markers(
- text: str,
- annotations_map: Dict[str, str],
- marker_pattern=r'ANN_[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}',
+ text: str, annotations_map: Dict[str, str], marker_pattern=ANNOTATION_MARKER_PATTERN
) -> Tuple[str, List[bioc.BioCAnnotation]]:
"""
Given a set of annotations, remove any which are found in the current text and return
@@ -271,7 +279,9 @@ def strip_annotation_markers(
transformed_text = text
pattern = (
- r'([^\S\t]*)([\(\[\{][^\S\t]*)?(' + marker_pattern + r')([^\S\t]*[\)\]\}])?([^\S\t]*)(\.)?'
+ r'([^\S\t]*)([\(\[\{][^\S\t]*)?('
+ + marker_pattern
+ + r')([^\S\t]*[\)\]\}])?([^\S\t]*)([\.,;:])?'
)
matched_annotations: List[Tuple[int, int, str]] = []
@@ -286,13 +296,17 @@ def strip_annotation_markers(
end_offset = 0
matched_brackets = (
- br_open and br_close and br_open.strip() + br_close.strip() in {'\{\}', '[]', '()'}
+ br_open and br_close and br_open.strip() + br_close.strip() in {r'{}', '[]', '()'}
)
if not matched_brackets and (br_open or br_close):
# do not include in the sequence to be removed from the text
- start_offset += len(ws_start or '') + len(br_open or '')
- end_offset += len(period or '') + len(ws_end or '') + len(br_close or '')
+            if br_open:
+                # keep the unmatched opening bracket (and leading whitespace) in the text
+                start_offset += len(ws_start or '') + len(br_open or '')
+            else:
+                # keep the unmatched closing bracket (and trailing whitespace/punctuation) in the text
+                end_offset += len(period or '') + len(ws_end or '') + len(br_close or '')
elif not period:
if ws_end:
end_offset += len(ws_end)
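
A usage sketch for strip_annotation_markers with the widened trailing-punctuation group (the marker UUID below is made up for illustration):

    from bioconverters.utils import strip_annotation_markers

    ann_id = 'ANN_12345678-9abc-4def-8123-456789abcdef'
    text = f'Mutations were frequent {ann_id}, especially in exon 2.'
    clean_text, annotations = strip_annotation_markers(text, {ann_id: '5'})
    # the marker itself is always removed from the returned text
    assert ann_id not in clean_text
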
@@ -327,7 +341,41 @@ def strip_annotation_markers(
return transformed_text, transformed_annotations
-def merge_text_chunks(chunk_list, annotations_map=None) -> TextChunk:
+def remove_style_tags(chunk_list_in: List[TextChunk], style_tags=['italic', 'bold', 'emph']):
+ """
+    Given a list of text chunks, simplify it by merging style-only tag chunks (e.g. italic/bold) into the preceding chunk
+ """
+ if len(chunk_list_in) < 4:
+ return chunk_list_in
+
+ start_index = 1
+ chunk_list = chunk_list_in[:]
+
+ while start_index < len(chunk_list) - 2:
+ current_tag = chunk_list[start_index]
+
+ if current_tag.tag not in style_tags or current_tag.is_tail:
+ start_index += 1
+ continue
+
+ closing_tag = chunk_list[start_index + 1]
+
+ if closing_tag.tag != current_tag.tag or not closing_tag.is_tail:
+ start_index += 1
+ continue
+
+ chunk_list[start_index - 1] = copy(chunk_list[start_index - 1])
+ chunk_list[start_index - 1].text = (
+ chunk_list[start_index - 1].text + current_tag.text + closing_tag.text
+ )
+ chunk_list = chunk_list[:start_index] + chunk_list[start_index + 2 :]
+
+ length_diff = sum([len(c.text) for c in chunk_list_in]) - sum([len(c.text) for c in chunk_list])
+ assert length_diff == 0, f'characters changed {length_diff}'
+ return chunk_list
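
A toy run of the new remove_style_tags helper: a style-only <italic> span is folded into the preceding chunk while the total character count is preserved, as the helper's assert checks (this sketch assumes TextChunk.tag mirrors the underlying element's tag):

    import xml.etree.ElementTree as etree

    from bioconverters.utils import TextChunk, remove_style_tags

    p = etree.Element('p')
    italic = etree.SubElement(p, 'italic')
    chunks = [
        TextChunk('Expression of ', p),
        TextChunk('BRCA1', italic),
        TextChunk(' was reduced.', italic, is_tail=True),
        TextChunk('', p, is_tail=True),
    ]
    flattened = remove_style_tags(chunks)
    assert [c.text for c in flattened] == ['Expression of BRCA1 was reduced.', '']
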
+
+
+def merge_text_chunks(chunk_list: List[TextChunk], annotations_map=None) -> TextChunk:
"""
    Merge a list of text chunks and pick the top-most XML node associated with the list as the node for the merged chunk
@@ -336,40 +384,48 @@ def merge_text_chunks(chunk_list, annotations_map=None) -> TextChunk:
if annotations_map is None:
# if no mapping is expected, simply drop annotation chunks
chunk_list = [c for c in chunk_list if not c.is_annotation]
-
+ chunk_list = remove_style_tags(chunk_list)
merge = []
for i, current_chunk in enumerate(chunk_list):
if i > 0:
previous_chunk = chunk_list[i - 1]
- join_char = ' '
+ join_char = ''
tags = {previous_chunk.tag, current_chunk.tag}
- if any(
- [
- previous_chunk.is_annotation,
- current_chunk.is_annotation,
- previous_chunk.non_separating,
- current_chunk.non_separating,
- current_chunk.is_tail and not (current_chunk.text or previous_chunk.text),
- ]
+ if current_chunk.tag == 'sup':
+ if not current_chunk.is_tail:
+ if re.match(r'^\s*(−|-)?\d+\s*$', current_chunk.text):
+ if (
+ previous_chunk.text
+ and unicodedata.category(previous_chunk.text[-1])[0] != 'P'
+ ):
+ join_char = '^'
+ elif (
+ current_chunk.text
+ and previous_chunk.text
+ and unicodedata.category(current_chunk.text[0])[0]
+ == unicodedata.category(previous_chunk.text[-1])[0]
+ ):
+ join_char = '-'
+ elif current_chunk.tag in PSEUDO_SPACE_TAGS or (
+ current_chunk.tag == 'xref' and not current_chunk.is_annotation
):
- join_char = ''
+ join_char = ' '
elif len(tags) == 1 and tags & TABLE_DELIMITED_TAGS and not current_chunk.is_tail:
join_char = TABLE_DELIMITER
merge.append(join_char)
- current_text = cleanup_text(current_chunk.text)
if current_chunk.is_annotation:
ann_id = f'ANN_{uuid.uuid4()}'
- annotations_map[ann_id] = current_text
+ annotations_map[ann_id] = current_chunk.text
merge.append(ann_id)
else:
- merge.append(current_text)
+ merge.append(current_chunk.text)
text = ''.join(merge)
    # Remove any newlines (as they cannot be trusted to be syntactically important)
- text = text.replace('\n', '')
+ text = text.replace('\n', ' ')
text = cleanup_text(text)
first_non_tail_node = chunk_list[0].xml_node
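
The new join rules in action: a purely numeric <sup> is joined with '^' so exponents survive merging (the chunks are built by hand here to mirror tag_handler output; a sketch, not part of the patch):

    import xml.etree.ElementTree as etree

    from bioconverters.utils import TextChunk, merge_text_chunks

    p = etree.Element('p')
    sup = etree.SubElement(p, 'sup')
    chunks = [
        TextChunk('dilutions of 10', p),
        TextChunk('6', sup),
        TextChunk(' were plated', sup, is_tail=True),
    ]
    assert merge_text_chunks(chunks).text == 'dilutions of 10^6 were plated'
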
@@ -399,6 +455,7 @@ def extract_text_chunks(
"""
if not isinstance(element_list, list):
element_list = [element_list]
+
raw_text_chunks = []
for elem in element_list:
raw_text_chunks.extend(tag_handler(elem, tag_handlers))
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 6c054cb..2c9eac0 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -5,6 +5,9 @@
from xml.sax.saxutils import escape
import pytest
+from hypothesis import given, infer
+from hypothesis import strategies as st
+
from bioconverters.utils import (
TABLE_DELIMITER,
cleanup_text,
@@ -13,8 +16,6 @@
remove_brackets_without_words,
strip_annotation_markers,
)
-from hypothesis import given, infer
-from hypothesis import strategies as st
from .util import data_file_path
@@ -62,6 +63,169 @@ def test_extract_text_chunks_sibling_xrefs():
assert locations == [113, 325]
+PARSING_CASES = [
+ ['incubator containing 5% CO2', 'incubator containing 5% CO2'],
+ [
+        'Class IA PI3K dimers are composed of a p110 catalytic subunit and a p85 regulatory subunit, each with three isoforms encoded by three genes',
+        'The AR, like other members of the steroid hormone receptor family, is a ligand-activated transcription factor which has distinct structural and functional domains',
+        'The predominant type of mutation i.e. loss of function, was well represented in the NTD. Mutations L57Q, E198G, D221H, A234T, S296R; S334P, P340L, P504L and D528G all displayed loss of function with E198G showing the greatest reduction (50% at 1 nM) and P340L also being present in AIS. The loss of transactivational ability was generally seen in both basal activity and across a wide range of DHT concentrations. A possible explanation for the loss of function of mutation A234T is that it is located at the start of the highly conserved motif (residues 234 to 247