diff --git a/.gitignore b/.gitignore index 1fd389c..8b3ac10 100644 --- a/.gitignore +++ b/.gitignore @@ -33,3 +33,4 @@ snakejob.* listings/ dist +debug diff --git a/src/bioconverters/constants.py b/src/bioconverters/constants.py new file mode 100644 index 0000000..5d2f871 --- /dev/null +++ b/src/bioconverters/constants.py @@ -0,0 +1,50 @@ +GREEK_ALPHABET = { + '\u0391': 'Alpha', + '\u0392': 'Beta', + '\u0393': 'Gamma', + '\u0394': 'Delta', + '\u0395': 'Epsilon', + '\u0396': 'Zeta', + '\u0397': 'Eta', + '\u0398': 'Theta', + '\u0399': 'Iota', + '\u039A': 'Kappa', + '\u039B': 'Lambda', + '\u039C': 'Mu', + '\u039D': 'Nu', + '\u039E': 'Xi', + '\u039F': 'Omicron', + '\u03A0': 'Pi', + '\u03A1': 'Rho', + '\u03A3': 'Sigma', + '\u03A4': 'Tau', + '\u03A5': 'Upsilon', + '\u03A6': 'Phi', + '\u03A7': 'Chi', + '\u03A8': 'Psi', + '\u03A9': 'Omega', + '\u03B1': 'alpha', + '\u03B2': 'beta', + '\u03B3': 'gamma', + '\u03B4': 'delta', + '\u03B5': 'epsilon', + '\u03B6': 'zeta', + '\u03B7': 'eta', + '\u03B8': 'theta', + '\u03B9': 'iota', + '\u03BA': 'kappa', + '\u03BB': 'lambda', + '\u03BC': 'mu', + '\u03BD': 'nu', + '\u03BE': 'xi', + '\u03BF': 'omicron', + '\u03C0': 'pi', + '\u03C1': 'rho', + '\u03C3': 'sigma', + '\u03C4': 'tau', + '\u03C5': 'upsilon', + '\u03C6': 'phi', + '\u03C7': 'chi', + '\u03C8': 'psi', + '\u03C9': 'omega', +} diff --git a/src/bioconverters/utils.py b/src/bioconverters/utils.py index 4e2b142..18e8d4c 100644 --- a/src/bioconverters/utils.py +++ b/src/bioconverters/utils.py @@ -2,9 +2,13 @@ import unicodedata import uuid import xml.etree.cElementTree as etree +from copy import copy from typing import Callable, Dict, Iterable, List, Optional, Tuple import bioc +from unidecode import unidecode + +from .constants import GREEK_ALPHABET # XML elements to ignore the contents of IGNORE_LIST = [ @@ -19,7 +23,7 @@ "tex-math", "mml:math", "object-id", - "ext-link", + "ext-link", # TODO: should we keep URL content? 
some of these have text rather than the URL as inner content ] # XML elements to separate text between (into different passages) @@ -34,15 +38,18 @@ "label", ] + TABLE_DELIMITER = '\t' TABLE_DELIMITED_TAGS = {'tr', 'th', 'td'} +# Tags that should be pre-pended with a space on merge +PSEUDO_SPACE_TAGS = {'sup', 'break'} +ANNOTATION_MARKER_PATTERN = r'ANN_[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}' class TextChunk: text: str xml_node: str xml_path: str - non_separating: bool = False is_tail: bool = False is_annotation: bool = False @@ -51,14 +58,12 @@ def __init__( text, xml_node, xml_path=None, - non_separating=False, is_tail=False, is_annotation=False, ): self.text = text self.xml_node = xml_node self.xml_path = xml_path - self.non_separating = non_separating or is_annotation self.is_tail = is_tail self.is_annotation = is_annotation @@ -72,10 +77,10 @@ def __repr__(self): tag = self.tag if self.is_tail: tag = f'{tag}#' - ns = '-ns' if self.non_separating else '' - tag = f'{tag}{ns}' if self.text: tag = f'{tag}+text[{len(self.text)}]' + if self.is_annotation: + tag = f'{tag}@' return tag @property @@ -92,9 +97,9 @@ def remove_brackets_without_words(text: str) -> str: changed = True previous_text = text while changed: - fixed = re.sub(r"\([^\w\t]*\)", "", previous_text) - fixed = re.sub(r"\[[^\w\t]*\]", "", fixed) - fixed = re.sub(r"\{[^\w\t]*\}", "", fixed) + fixed = re.sub(r"\([^\w\t-]*\)", "", previous_text) + fixed = re.sub(r"\[[^\w\t-]*\]", "", fixed) + fixed = re.sub(r"\{[^\w\t-]*\}", "", fixed) changed = bool(previous_text != fixed) previous_text = fixed return fixed @@ -115,22 +120,30 @@ def cleanup_text(text: str) -> str: """ # Remove some "control-like" characters (left/right separator) text = text.replace(u"\u2028", " ").replace(u"\u2029", " ") + text = text.replace('°', ' ° ') text = "".join(ch for ch in text if unicodedata.category(ch)[0] != "C" or ch == TABLE_DELIMITER) text = "".join(ch if unicodedata.category(ch)[0] != "Z" else " " for ch in text) + # replace greek letters with their long-form equivalent + for greek_letter, replacement in GREEK_ALPHABET.items(): + text = text.replace(greek_letter, replacement) + + text = unidecode(text, errors='preserve') + # Remove repeated commands and commas next to periods text = re.sub(r",([^\S\t]*,)*", ",", text) text = re.sub(r"(,[^\S\t]*)*\.", ".", text) text = remove_brackets_without_words(text) - # remove extra spaces from in-text figute/table citations + # remove extra spaces from in-text figure/table citations text = re.sub(r'\([^\S\t]*([^)]*[^\s)])[^\S\t]*\)', r'(\1)', text) # remove trailing spaces before periods text = re.sub(r'[^\S\t]+\.(\s|$)', r'.\1', text) # remove extra spaces around commas/semi-colons - text = re.sub(r'[^\S\t]*([,;])[^\S\t]+', r'\1 ', text) + text = re.sub(r'[^\S\t]*([,;:])([^\S\t]+)', r'\1 ', text) + text = re.sub(r'[^\S\t]*([,;:])$', r'\1', text) # trim leading and trailing non tab whitespace text = re.sub(r'(^|\t)([^\S\t]+)', r'\1', text) @@ -168,17 +181,14 @@ def merge_adjacent_xref_siblings(elem_list): If two XML elements in a list are adjacent and both xrefs separated only by punctuation, merge them """ siblings = [] - for elem in elem_list: if siblings and elem.tag == 'xref' and siblings[-1].tag == 'xref': # merge these 2 if the tail of the first element is a punctuation mark prev_tail = (siblings[-1].tail or '').strip() if ( - siblings[-1].tail - and len(prev_tail) == 1 - and unicodedata.category(prev_tail)[0] == 'P' - and elem.attrib.get('ref-type') == 
siblings[-1].attrib.get('ref-type') - ): + not prev_tail + or (len(prev_tail) == 1 and unicodedata.category(prev_tail[0])[0] == 'P') + ) and elem.attrib.get('ref-type') == siblings[-1].attrib.get('ref-type'): siblings[-1].text = (siblings[-1].text or '') + prev_tail + (elem.text or '') siblings[-1].tail = elem.tail @@ -189,7 +199,7 @@ def merge_adjacent_xref_siblings(elem_list): def get_tag_path(mapping: Dict[etree.Element, etree.Element], node: etree.Element) -> str: """ - Get a string representing the path of the currentl XML node in the heirachry of the XML file + Get a string representing the path of the current XML node in the hierarchy of the XML file """ path = [] current_node = node @@ -217,8 +227,8 @@ def tag_handler( except NotImplementedError: pass # Extract any raw text directly in XML element or just after - head = (elem.text or "").strip() - tail = (elem.tail or "").strip() + head = elem.text or "" + tail = elem.tail or "" # Then get the text from all child XML nodes recursively child_passages = [] @@ -246,16 +256,14 @@ def tag_handler( ): # Check if the tag should be ignored (so don't use main contents) return [ - TextChunk(tail, elem, non_separating=True, is_tail=True), + TextChunk(tail, elem, is_tail=True), ] return [TextChunk(head, elem)] + child_passages + [TextChunk(tail, elem, is_tail=True)] def strip_annotation_markers( - text: str, - annotations_map: Dict[str, str], - marker_pattern=r'ANN_[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}', + text: str, annotations_map: Dict[str, str], marker_pattern=ANNOTATION_MARKER_PATTERN ) -> Tuple[str, List[bioc.BioCAnnotation]]: """ Given a set of annotations, remove any which are found in the current text and return @@ -271,7 +279,9 @@ def strip_annotation_markers( transformed_text = text pattern = ( - r'([^\S\t]*)([\(\[\{][^\S\t]*)?(' + marker_pattern + r')([^\S\t]*[\)\]\}])?([^\S\t]*)(\.)?' + r'([^\S\t]*)([\(\[\{][^\S\t]*)?(' + + marker_pattern + + r')([^\S\t]*[\)\]\}])?([^\S\t]*)([\.,;:])?' 
) matched_annotations: List[Tuple[int, int, str]] = [] @@ -286,13 +296,17 @@ def strip_annotation_markers( end_offset = 0 matched_brackets = ( - br_open and br_close and br_open.strip() + br_close.strip() in {'\{\}', '[]', '()'} + br_open and br_close and br_open.strip() + br_close.strip() in {r'{}', '[]', '()'} ) if not matched_brackets and (br_open or br_close): # do not include in the sequence to be removed from the text - start_offset += len(ws_start or '') + len(br_open or '') - end_offset += len(period or '') + len(ws_end or '') + len(br_close or '') + if br_open: + start_offset += len(ws_start or '') + len(br_open or '') + # end_offset += len(period or '') + len(ws_end or '') + len(br_close or '') + else: + # start_offset += len(ws_start or '') + len(br_open or '') + end_offset += len(period or '') + len(ws_end or '') + len(br_close or '') elif not period: if ws_end: end_offset += len(ws_end) @@ -327,7 +341,41 @@ def strip_annotation_markers( return transformed_text, transformed_annotations -def merge_text_chunks(chunk_list, annotations_map=None) -> TextChunk: +def remove_style_tags(chunk_list_in: List[TextChunk], style_tags=['italic', 'bold', 'emph']): + """ + Given some list of text chunks, simplify the list to remove consecutive style-only tags + """ + if len(chunk_list_in) < 4: + return chunk_list_in + + start_index = 1 + chunk_list = chunk_list_in[:] + + while start_index < len(chunk_list) - 2: + current_tag = chunk_list[start_index] + + if current_tag.tag not in style_tags or current_tag.is_tail: + start_index += 1 + continue + + closing_tag = chunk_list[start_index + 1] + + if closing_tag.tag != current_tag.tag or not closing_tag.is_tail: + start_index += 1 + continue + + chunk_list[start_index - 1] = copy(chunk_list[start_index - 1]) + chunk_list[start_index - 1].text = ( + chunk_list[start_index - 1].text + current_tag.text + closing_tag.text + ) + chunk_list = chunk_list[:start_index] + chunk_list[start_index + 2 :] + + length_diff = sum([len(c.text) for c in chunk_list_in]) - sum([len(c.text) for c in chunk_list]) + assert length_diff == 0, f'characters changed {length_diff}' + return chunk_list + + +def merge_text_chunks(chunk_list: List[TextChunk], annotations_map=None) -> TextChunk: """ Merge some list of text chunks and pick the most top-level xml node associated with the list to be the new node for the chunk @@ -336,40 +384,48 @@ def merge_text_chunks(chunk_list, annotations_map=None) -> TextChunk: if annotations_map is None: # if no mapping is expected, simply drop annotation chunks chunk_list = [c for c in chunk_list if not c.is_annotation] - + chunk_list = remove_style_tags(chunk_list) merge = [] for i, current_chunk in enumerate(chunk_list): if i > 0: previous_chunk = chunk_list[i - 1] - join_char = ' ' + join_char = '' tags = {previous_chunk.tag, current_chunk.tag} - if any( - [ - previous_chunk.is_annotation, - current_chunk.is_annotation, - previous_chunk.non_separating, - current_chunk.non_separating, - current_chunk.is_tail and not (current_chunk.text or previous_chunk.text), - ] + if current_chunk.tag == 'sup': + if not current_chunk.is_tail: + if re.match(r'^\s*(−|-)?\d+\s*$', current_chunk.text): + if ( + previous_chunk.text + and unicodedata.category(previous_chunk.text[-1])[0] != 'P' + ): + join_char = '^' + elif ( + current_chunk.text + and previous_chunk.text + and unicodedata.category(current_chunk.text[0])[0] + == unicodedata.category(previous_chunk.text[-1])[0] + ): + join_char = '-' + elif current_chunk.tag in PSEUDO_SPACE_TAGS or ( + 
current_chunk.tag == 'xref' and not current_chunk.is_annotation ): - join_char = '' + join_char = ' ' elif len(tags) == 1 and tags & TABLE_DELIMITED_TAGS and not current_chunk.is_tail: join_char = TABLE_DELIMITER merge.append(join_char) - current_text = cleanup_text(current_chunk.text) if current_chunk.is_annotation: ann_id = f'ANN_{uuid.uuid4()}' - annotations_map[ann_id] = current_text + annotations_map[ann_id] = current_chunk.text merge.append(ann_id) else: - merge.append(current_text) + merge.append(current_chunk.text) text = ''.join(merge) # Remove any newlines (as they can be trusted to be syntactically important) - text = text.replace('\n', '') + text = text.replace('\n', ' ') text = cleanup_text(text) first_non_tail_node = chunk_list[0].xml_node @@ -399,6 +455,7 @@ def extract_text_chunks( """ if not isinstance(element_list, list): element_list = [element_list] + raw_text_chunks = [] for elem in element_list: raw_text_chunks.extend(tag_handler(elem, tag_handlers)) diff --git a/tests/test_utils.py b/tests/test_utils.py index 6c054cb..2c9eac0 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -5,6 +5,9 @@ from xml.sax.saxutils import escape import pytest +from hypothesis import given, infer +from hypothesis import strategies as st + from bioconverters.utils import ( TABLE_DELIMITER, cleanup_text, @@ -13,8 +16,6 @@ remove_brackets_without_words, strip_annotation_markers, ) -from hypothesis import given, infer -from hypothesis import strategies as st from .util import data_file_path @@ -62,6 +63,169 @@ def test_extract_text_chunks_sibling_xrefs(): assert locations == [113, 325] +PARSING_CASES = [ + ['incubator containing 5% CO2', 'incubator containing 5% CO2'], + [ + 'Activating mutations in ALK provide a therapeutic target in neuroblastoma', + 'Activating mutations in ALK provide a therapeutic target in neuroblastoma', + ], + ['104', '10^4'], + ['especially in CBL-W802* cells', 'especially in CBL-W802* cells'], + [ + 'influenced by the presence of allelic variants—GSTP1 Ile105Val (rs1695) and GSTP1 Ala114Val (rs1138272), with homozygote', + 'influenced by the presence of allelic variants--GSTP1 Ile105Val (rs1695) and GSTP1 Ala114Val (rs1138272), with homozygote', + ], + [ + '''breast cancer, clear cell renal carcinoma, and colon cancer6 + 7 + 8 + 9 + 10 have successfully identified''', + 'breast cancer, clear cell renal carcinoma, and colon cancer have successfully identified', + ], + ['Labela', 'Label a'], + [ + 'Introduction of the NTRK3 G623R mutation', + 'Introduction of the NTRK3 G623R mutation', + ], + ['Patientsample', 'Patient sample'], + [ + ''', and in the transgenic +GATA-1, +low mouse''', + ', and in the transgenic GATA-1, low mouse', + ], + [ + 'we selected an allele (designated cic4) that removes', + 'we selected an allele (designated cic^4) that removes', + ], + [ + 'whereas a CIC derivative lacking the HMG-box is mainly cytoplasmic [9], implying', + 'whereas a CIC derivative lacking the HMG-box is mainly cytoplasmic, implying', + ], + [ + 'inactivated by somatic mutations [2230], but', + 'inactivated by somatic mutations, but', + ], + [ + 'regulation of the Wnt-β-catenin pathway', + 'regulation of the Wnt-beta-catenin pathway', + ], + [ + 'previously reported cell lines (CAL27, CAL33, Detroit 562, UM-SCC-47, SCC-25, SCC-9, UM-SCC-11B and UM-SCC-17B) [6], while', + 'previously reported cell lines (CAL27, CAL33, Detroit 562, UM-SCC-47, SCC-25, SCC-9, UM-SCC-11B and UM-SCC-17B), while', + ], + [ + 'clinic-pathologic parameters, χ2 and Fisher exact tests', + 
'clinic-pathologic parameters, chi2 and Fisher exact tests', + ], + [ + 'due to RB1 inhibition [38], the specific', + 'due to RB1 inhibition, the specific', + ], + # TODO: discuss with jake best moves for cases below + [ + 'the specific HPV+ gene expression', + 'the specific HPV+ gene expression', + ], + [ + 'known to be resistant to 1st and 2nd generation EGFR-TKIS, osimertinib', + 'known to be resistant to 1st and 2nd generation EGFR-TKIS, osimertinib', + ], + [ + 'at 37°C in a humidified 5% CO2 incubator', + 'at 37 deg C in a humidified 5% CO2 incubator', + ], + [ + 'seeded at concentrations below 1 × 106/ml, selected', + 'seeded at concentrations below 1 x 10^6/ml, selected', + ], + [ + 'PCR cycling parameters were: one cycle of 95 °C for 15 min; 35 cycles of 95 °C for 20 s, 60 °C for 30 s, and 72 °C for 1 min; followed by one cycle of 72 °C for 3 min.', + 'PCR cycling parameters were: one cycle of 95 deg C for 15 min; 35 cycles of 95 deg C for 20 s, 60 deg C for 30 s, and 72 deg C for 1 min; followed by one cycle of 72 deg C for 3 min.', + ], + [ + '9 patients with a BRAF-mutant tumour', + '9 patients with a BRAF-mutant tumour', + ], + [ + 'patients with BRAFWT tumours', + 'patients with BRAF-WT tumours', + ], + ['MSIhi tumours', 'MSI-hi tumours'], + ['P53mutation', 'P53 mutation'], + [ + 'upper limit of normal, creatinine clearance ⩾30 ml min−1,', + 'upper limit of normal, creatinine clearance ⩾30 ml min^-1,', + ], + ['P = 1.0 × 10−6', 'P = 1.0 x 10^-6'], + [ + 'domains [13]: the N-terminal domain', + 'domains: the N-terminal domain', + ], + [ + 'motif (residues 234 to 247 [56]) immediately', + 'motif (residues 234 to 247) immediately', + ], + [ + 'the oncometabolite R(–)-2-hydroxyglutarate at the', + 'the oncometabolite R(-)-2-hydroxyglutarate at the', + ], + ['[3H]-Thymidine', '[3H]-Thymidine'], + [ + '

Class IA PI3K dimers are composed of a p110 catalytic subunit and a p85 regulatory subunit, each with three isoforms encoded by three genes17. Mutations in five of these genes have been observed in many human cancers3134. Our data show that mutations in the p85β (PIK3R2) regulatory and p110α (PIK3CA) catalytic subunits are a common cause of megalencephaly syndromes, albeit with a clear genotype-phenotype correlation as PIK3R2 and PIK3CA mutations are associated with MPPH (P = 3.3 × 10−6) and MCAP (P = 1.0 × 10−6), respectively (Supplementary Table 9, Online Methods). Both PIK3R1 and PIK3R2 have oncogenic potential, and mutations including the glycine-to-arginine substitution of PIK3R2 found in MPPH (p.Gly373Arg) and substitution of the homologous amino acid residue in PIK3R1 (p.Gly376Arg) have been found in cancer32. Available functional studies showed that several of these mutations disrupt the inactive conformation of the PI3K dimer and maintain the catalytic subunit in a high activity state32,35. Our observations in lymphoblastoid cells derived from patient LR00-016a1 show that the p.Gly373Arg mutation results in increased PI3K activity and elevated PI3K-mTOR signaling, further supporting this mechanism.

', + 'Class IA PI3K dimers are composed of a p110 catalytic subunit and a p85 regulatory subunit, each with three isoforms encoded by three genes. Mutations in five of these genes have been observed in many human cancers. Our data show that mutations in the p85beta (PIK3R2) regulatory and p110alpha (PIK3CA) catalytic subunits are a common cause of megalencephaly syndromes, albeit with a clear genotype-phenotype correlation as PIK3R2 and PIK3CA mutations are associated with MPPH (P = 3.3 x 10^-6) and MCAP (P = 1.0 x 10^-6), respectively (Supplementary Table 9,Online Methods). Both PIK3R1 and PIK3R2 have oncogenic potential, and mutations including the glycine-to-arginine substitution of PIK3R2 found in MPPH (p.Gly373Arg) and substitution of the homologous amino acid residue in PIK3R1 (p.Gly376Arg) have been found in cancer. Available functional studies showed that several of these mutations disrupt the inactive conformation of the PI3K dimer and maintain the catalytic subunit in a high activity state. Our observations in lymphoblastoid cells derived from patient LR00-016a1 show that the p.Gly373Arg mutation results in increased PI3K activity and elevated PI3K-mTOR signaling, further supporting this mechanism.', + ], + [ + '

The AR, like other members of the steroid hormone receptor family, is a ligand-activated transcription factor which has distinct structural and functional domains [13]: the N-terminal domain (NTD) important for transactivation; the DNA binding domain (DBD) and the C-terminal ligand binding domain (LBD). Upon ligand binding, the AR undergoes conformational transformation facilitating intra- and intermolecular interactions [14]. The transactivational capability of the AR is modulated by several signaling systems [15] through a range of post-translational modifications [13], [16]. Although the AR exerts most of its actions by functioning as a transcription factor binding to specific response elements, non-genomic effects can also contribute to the regulatory outcome. Activation of the phosphatidylinositol 3-kinase (PI3K)/Akt signaling pathway not only regulates AR activity through phosphorylation of the receptor, but also has a major role in the process leading to invasion and metastasis of PCa cells through downstream phosphorylation of affiliated substrates leading to protection from apoptosis and increased cell survival. The AR can stimulate PI3K/Akt signaling by interacting directly with the p85α regulatory subunit of PI3K in response to synthetic and natural androgens [17] through its NTD [18], and by binding and stimulating Akt1 within lipid rafts [19]. Many different processes are involved in the acquisition of hormone resistance [20] and they follow several diverse routes. Activation of sufficient levels of AR in a castration environment can occur through missense mutations within the AR [21], or splice variants, which result in: enhanced binding of androgens; creation of a constitutively active receptor [22][25]; promiscuous binding of other ligands [26][30] or altered recruitment of co-activators and co-repressors to the NTD and LBD. The levels of AR can be raised through increased expression, altered protein turnover and gene amplification [31][33]. In addition, aberrant intratumoral androgen synthesis can lead to activation of AR [34].

', + 'The AR, like other members of the steroid hormone receptor family, is a ligand-activated transcription factor which has distinct structural and functional domains: the N-terminal domain (NTD) important for transactivation; the DNA binding domain (DBD) and the C-terminal ligand binding domain (LBD). Upon ligand binding, the AR undergoes conformational transformation facilitating intra- and intermolecular interactions. The transactivational capability of the AR is modulated by several signaling systems through a range of post-translational modifications. Although the AR exerts most of its actions by functioning as a transcription factor binding to specific response elements, non-genomic effects can also contribute to the regulatory outcome. Activation of the phosphatidylinositol 3-kinase (PI3K)/Akt signaling pathway not only regulates AR activity through phosphorylation of the receptor, but also has a major role in the process leading to invasion and metastasis of PCa cells through downstream phosphorylation of affiliated substrates leading to protection from apoptosis and increased cell survival. The AR can stimulate PI3K/Akt signaling by interacting directly with the p85alpha regulatory subunit of PI3K in response to synthetic and natural androgens through its NTD, and by binding and stimulating Akt1 within lipid rafts. Many different processes are involved in the acquisition of hormone resistance and they follow several diverse routes. Activation of sufficient levels of AR in a castration environment can occur through missense mutations within the AR, or splice variants, which result in: enhanced binding of androgens; creation of a constitutively active receptor; promiscuous binding of other ligands or altered recruitment of co-activators and co-repressors to the NTD and LBD. The levels of AR can be raised through increased expression, altered protein turnover and gene amplification. In addition, aberrant intratumoral androgen synthesis can lead to activation of AR.', + ], + [ + '

The predominant type of mutation i.e. loss of function, was well represented in the NTD. Mutations L57Q, E198G, D221H, A234T, S296R; S334P, P340L, P504L and D528G all displayed loss of function with E198G showing the greatest reduction (50% at 1 nM) and P340L also being present in AIS. The loss of transactivational ability was generally seen in both basal activity and across a wide range of DHT concentrations. A possible explanation for the loss of function of mutation A234T is that it is located at the start of the highly conserved motif (residues 234 to 247 [56]) immediately carboxyl-terminal of TAU-1 which forms the interaction site for the Hsp70-interacting protein E3 ligase CHIP [57].

', + 'The predominant type of mutation i.e. loss of function, was well represented in the NTD. Mutations L57Q, E198G, D221H, A234T, S296R; S334P, P340L, P504L and D528G all displayed loss of function with E198G showing the greatest reduction (50% at 1 nM) and P340L also being present in AIS. The loss of transactivational ability was generally seen in both basal activity and across a wide range of DHT concentrations. A possible explanation for the loss of function of mutation A234T is that it is located at the start of the highly conserved motif (residues 234 to 247) immediately carboxyl-terminal of TAU-1 which forms the interaction site for the Hsp70-interacting protein E3 ligase CHIP.', + ], +] + + +@pytest.mark.parametrize('input_text,output_text', PARSING_CASES) +@pytest.mark.parametrize('annotations', [True, False]) +def test_extract_text_chunks(input_text, output_text, annotations): + xml_input = f'
{input_text}
' + root_nodes = [etree.fromstring(xml_input)] + + if annotations: + map = {} + chunks = extract_text_chunks(root_nodes, annotations_map=map) + result, _ = strip_annotation_markers(''.join(c.text for c in chunks), map) + else: + chunks = extract_text_chunks(root_nodes) + result = ''.join(c.text for c in chunks) + print([c.text for c in chunks]) + print('extracted', ''.join(chunk.text for chunk in chunks)) + print(chunks) + + print(len(result), len(output_text)) + diff_start = -1 + for i, (c1, c2) in enumerate(zip(result, output_text)): + if c1 != c2: + diff_start = i + break + if diff_start >= 0: + print( + [ + repr(output_text[max(diff_start - 10, 0) : diff_start]), + repr(output_text[diff_start : diff_start + 10]), + ] + ) + print( + [ + repr(result[max(diff_start - 10, 0) : diff_start]), + repr(result[diff_start : diff_start + 10]), + ] + ) + assert result == output_text + + def test_extract_figure_label(): xml_input = '
10.1371/journal.pone.0026760.g003Anchorage-independent growth of ERBB2 mutants.
' root_nodes = [etree.fromstring(xml_input)] @@ -106,6 +270,12 @@ def test_extract_figure_label(): 'This is a sentence with an in-text citation.', [42], ), + ( + '(residues 234 to 247 ANN_a2b8dd34-f190-41c7-98f6-259aa8d402e8) immediately', + {'ANN_a2b8dd34-f190-41c7-98f6-259aa8d402e8': '1'}, + '(residues 234 to 247) immediately', + [19], + ), ], ids=[ 'single citation', @@ -113,11 +283,12 @@ def test_extract_figure_label(): 'middle sentence citation', 'round-brackets', 'square-brackets', + 'end of brackets', ], ) def test_strip_annotation_markers(text, annotations_map, expected_text, expected_locations): text_result, annotations_result = strip_annotation_markers( - text, annotations_map, marker_pattern=r'ANN_\d+' + text, annotations_map, marker_pattern=r'ANN_[-\w]+' ) assert text_result == expected_text locations = [] @@ -127,16 +298,6 @@ def test_strip_annotation_markers(text, annotations_map, expected_text, expected assert locations == expected_locations -def test_extract_title_with_italics(): - xml = '
Activating mutations in ALK provide a therapeutic target in neuroblastoma
' - chunks = extract_text_chunks([etree.fromstring(xml)]) - assert len(chunks) == 1 - assert ( - 'Activating mutations in ALK provide a therapeutic target in neuroblastoma' - == chunks[0].text - ) - - @given( values=st.lists( st.text(alphabet=st.characters(blacklist_categories=['Cc', 'Cs'])), min_size=1, max_size=50 @@ -253,6 +414,14 @@ def test_floating_table(): ('extra space , before comma', 'extra space, before comma'), ('extra space ; before semi-colon', 'extra space; before semi-colon'), (' }{ \t}{ ', '}{\t}{'), + ( + 'A possible (residues 234 to 247 ) immediately', + 'A possible (residues 234 to 247) immediately', + ), + ( + 'the oncometabolite R(–)-2-hydroxyglutarate at the', + 'the oncometabolite R(-)-2-hydroxyglutarate at the', + ), ], ) def test_cleanup_text(input, output):
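As a quick way to exercise the reworked pipeline by hand, here is a minimal sketch mirroring the flow used in test_extract_text_chunks above: extract chunks with an annotations map, join the passage text, then strip the ANN_ markers back out. The JATS-like fragment and its ref-type/rid values are made up for illustration, and the printed expectations are approximate (they follow the PARSING_CASES entries rather than a verified run).

from xml.etree import ElementTree as etree

from bioconverters.utils import extract_text_chunks, strip_annotation_markers

# Made-up fragment: a superscript exponent plus a bracketed citation.
xml_input = (
    '<p>P = 1.0 × 10<sup>−6</sup> in BRAF-mutant tumours '
    '[<xref ref-type="bibr" rid="b38">38</xref>].</p>'
)

annotations_map = {}
chunks = extract_text_chunks([etree.fromstring(xml_input)], annotations_map=annotations_map)

# Citation chunks are replaced with ANN_<uuid> markers during the merge;
# strip_annotation_markers removes them again and reports their character offsets.
text, annotations = strip_annotation_markers(''.join(c.text for c in chunks), annotations_map)

print(text)         # roughly: 'P = 1.0 x 10^-6 in BRAF-mutant tumours.'
print(annotations)  # one BioCAnnotation recording where the '[38]' citation sat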