Skip to content

Commit

Permalink
Merge pull request #33 from BIH-CEI/31-auto-parse-ordinals-in-data-mo…
Browse files Browse the repository at this point in the history
…del-input

added auto ordinal parsing

we did it after doctest hell
  • Loading branch information
frehburg authored Sep 6, 2024
2 parents 082946c + af2d874 commit 36b0a25
Show file tree
Hide file tree
Showing 12 changed files with 212 additions and 82 deletions.
110 changes: 61 additions & 49 deletions src/defining_erdri_cds.ipynb

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions src/rarelink_phenopacket_mapper/data_standards/data_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ class DataField:

def __str__(self):
ret = "DataField(\n"
ret += f"\t\tordinal, section={self.ordinal} {self.section},\n"
ret += f"\t\tname={self.name},\n"
ret += f"\t\tsection={self.section},\n"
ret += f"\t\tordinal and name={self.ordinal} {self.name},\n"
ret += f"\t\tdata type={self.data_type}, required={self.required},\n"
ret += f"\t\tsepcification={self.specification}\n"
ret += "\t)"
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""Selection of rare disease specific data models"""
from .rarelink_datamodel import RARELINK_DATA_MODEL
from .erdri_cds import ERDRI_CDS
from .data_type_string_representation import parse_type_string_representation
from rarelink_phenopacket_mapper.utils.parsing.parse_data_type import parse_data_type

__all__ = ["RARELINK_DATA_MODEL", "ERDRI_CDS", "parse_type_string_representation"]
__all__ = ["RARELINK_DATA_MODEL", "ERDRI_CDS", "parse_data_type"]
40 changes: 25 additions & 15 deletions src/rarelink_phenopacket_mapper/pipeline/input.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@
from phenopackets.schema.v2 import Phenopacket

from rarelink_phenopacket_mapper.data_standards import DataModel, DataModelInstance, DataField, CodeSystem
from rarelink_phenopacket_mapper.data_standards.data_models import RARELINK_DATA_MODEL, parse_type_string_representation
from rarelink_phenopacket_mapper.data_standards.data_models import RARELINK_DATA_MODEL, parse_data_type
from rarelink_phenopacket_mapper.utils import loc_default
from rarelink_phenopacket_mapper.utils.parsing import parse_ordinal


def _read_csv(path: Path, data_model: DataModel) -> List[DataModelInstance]:
Expand Down Expand Up @@ -60,17 +62,18 @@ def read_data_model(
path: Union[str, Path],
file_type: Literal['csv', 'excel', 'unknown'] = 'unknown',
column_names: Dict[str, str] = MappingProxyType({
'name': 'name',
'section': '',
'description': 'description',
'data_type': 'data_type',
'required': 'required',
'specification': '',
'ordinal': ''
DataField.name.__name__: 'data_field_name',
DataField.section.__name__: 'data_model_section',
DataField.description.__name__: 'description',
DataField.data_type.__name__: 'data_type',
DataField.required.__name__: 'required',
DataField.specification.__name__: 'specification',
DataField.ordinal.__name__: 'ordinal'
}),
parse_data_types: bool = False,
compliance: Literal['soft', 'hard'] = 'soft',
remove_line_breaks: bool = False,
parse_ordinals: bool = True,
) -> DataModel:
"""Reads a Data Model from a file
Expand All @@ -85,6 +88,8 @@ def read_data_model(
:param compliance: Only applicable if `parse_data_types=True`, otherwise does nothing. `'soft'` raises warnings upon
encountering invalid data types, `'hard'` raises `ValueError`.
:param remove_line_breaks: Whether to remove line breaks from string values
:param parse_ordinals: Whether to extract the ordinal number from the field name. Warning: this can overwrite values
Ordinals could look like: "1.1.", "1.", "I.a.", or "ii.", etc.
"""
if isinstance(column_names, MappingProxyType):
inv_column_names = dict(column_names)
Expand Down Expand Up @@ -135,21 +140,25 @@ def remove_line_breaks_if_not_none(value):

data_fields = []
for i in range(len(df)):
data_field_name = df.loc[i, column_names.get('name', '')]
section = df.loc[i, column_names.get('section', '')]
data_type = df.loc[i, column_names.get('data_type', '')]
description = df.loc[i, column_names.get('description', '')]
required = bool(df.loc[i, column_names.get('required', '')])
specification = df.loc[i, column_names.get('specification', '')]
data_field_name = loc_default(df, row_index=i, column_name=column_names.get(DataField.name.__name__, ''))
section = loc_default(df, row_index=i, column_name=column_names.get(DataField.section.__name__, ''))
data_type = loc_default(df, row_index=i, column_name=column_names.get(DataField.data_type.__name__, ''))
description = loc_default(df, row_index=i, column_name=column_names.get(DataField.description.__name__, ''))
required = bool(loc_default(df, row_index=i, column_name=column_names.get(DataField.required.__name__, '')))
specification = loc_default(df, row_index=i, column_name=column_names.get(DataField.specification.__name__, ''))
ordinal = loc_default(df, row_index=i, column_name=column_names.get(DataField.ordinal.__name__, ''))

if remove_line_breaks:
data_field_name = remove_line_breaks_if_not_none(data_field_name)
section = remove_line_breaks_if_not_none(section)
description = remove_line_breaks_if_not_none(description)
specification = remove_line_breaks_if_not_none(specification)

if parse_ordinals:
ordinal, data_field_name = parse_ordinal(data_field_name)

if parse_data_types:
data_type = parse_type_string_representation(type_str=data_type, resources=resources, compliance=compliance)
data_type = parse_data_type(type_str=data_type, resources=resources, compliance=compliance)

data_fields.append(
DataField(
Expand All @@ -159,6 +168,7 @@ def remove_line_breaks_if_not_none(value):
description=description,
required=required,
specification=specification,
ordinal=ordinal
)
)

Expand Down
3 changes: 2 additions & 1 deletion src/rarelink_phenopacket_mapper/utils/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""This submodule contains utility functions that are used throughout the package."""
from .create_ipynb_in_code import NotebookBuilder
from .pandas_utils import loc_default

__all__ = ["NotebookBuilder"]
__all__ = ["NotebookBuilder", "loc_default"]
9 changes: 9 additions & 0 deletions src/rarelink_phenopacket_mapper/utils/get_field_name.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from rarelink_phenopacket_mapper.data_standards import DataField


def get_field_name(field):
    """Return the ``__name__`` attribute of the given object.

    :param field: any object that exposes a ``__name__`` attribute
    :return: the value of ``field.__name__``
    :raises AttributeError: if ``field`` has no ``__name__`` attribute
    """
    name = getattr(field, "__name__")
    return name


# Manual smoke check: when this module is run as a script, print the name
# that `get_field_name` resolves for `DataField.data_type`.
if __name__ == "__main__":
    print(get_field_name(DataField.data_type))
19 changes: 19 additions & 0 deletions src/rarelink_phenopacket_mapper/utils/pandas_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import logging
from typing import Any

import pandas as pd


def loc_default(df: pd.DataFrame, row_index: int, column_name: str, default: Any = '') -> Any:
    """Safely performs ``loc`` on a `pd.DataFrame`, returns a default value if the lookup fails

    The lookup is deliberately best-effort: any exception raised by ``df.loc`` (e.g. a `KeyError` for an unknown
    row or column label) is swallowed and `default` is returned instead. The swallowed exception is reported through
    the `logging` module at debug level rather than printed, so that repeated misses (e.g. one per row for a column
    that does not exist) do not flood stdout.

    :param df: the dataframe
    :param row_index: index of the row
    :param column_name: name of the column
    :param default: default value to return if some error occurs
    :return: Value at the row and column specified, or `default` if the lookup raised
    """
    try:
        return df.loc[row_index, column_name]
    except Exception as e:
        # Best-effort contract: never propagate, but keep the reason discoverable for debugging.
        logging.getLogger(__name__).debug(
            "loc_default: returning default for row %r, column %r: %r", row_index, column_name, e
        )
        return default
9 changes: 9 additions & 0 deletions src/rarelink_phenopacket_mapper/utils/parsing/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
"""This module contains utility functions concerning the parsing of strings to python values"""

from .parse_data_type import parse_data_type
from .parse_ordinal import parse_ordinal

__all__ = [
"parse_data_type",
"parse_ordinal"
]
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
}


def parse_type_string_representation(
def parse_data_type(
type_str: str,
resources: List[CodeSystem],
compliance: Literal['soft', 'hard'] = 'soft'
Expand All @@ -29,7 +29,7 @@ def parse_type_string_representation(
the case described above.
E.g.
>>> parse_type_string_representation("integer, str, Boolean", [])
>>> parse_data_type("integer, str, Boolean", [])
[<class 'int'>, <class 'str'>, <class 'bool'>]
:param type_str:
Expand All @@ -43,23 +43,23 @@ def parse_type_string_representation(
single_type_strings = type_str.split(',')
types = []
for single in single_type_strings:
types.append(_parse_single_string_type_repr(type_str=single, resources=resources, compliance=compliance))
types.append(_parse_single_data_type(type_str=single, resources=resources, compliance=compliance))

if not types:
return [Any]

return types


def _parse_single_string_type_repr(
def _parse_single_data_type(
type_str: str,
resources: List[CodeSystem],
compliance: Literal['soft', 'hard'] = 'soft'
) -> Union[Any, CodeSystem, type, str]:
"""Parses a string representing a data type to the `type` in Python
E.g.:
>>> _parse_single_string_type_repr('date', [])
>>> _parse_single_data_type('date', [])
<class 'rarelink_phenopacket_mapper.data_standards.date.Date'>
:param type_str:
Expand Down
41 changes: 41 additions & 0 deletions src/rarelink_phenopacket_mapper/utils/parsing/parse_ordinal.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
from typing import Tuple
import re


# Matches an ordinal prefix followed by the actual field name.
# The ordinal must be terminated by a dot and/or whitespace so that ordinary
# words at the start of a name are not mistaken for an ordinal.
_ORDINAL_PATTERN = re.compile(
    r"""
    \s*                                  # tolerate leading whitespace
    (?P<ordinal>
        [0-9]+(?:\.[0-9]+)*              # numeric levels: 1, 1.1, 10.2.3
      |
        (?:
            (?=[ivxlc])                  # roman numeral (up to CCCXCIX), never empty
            c{0,3}(?:xc|xl|l?x{0,3})(?:ix|iv|v?i{0,3})
          |
            [a-z](?=\.)                  # single letter, only if followed by a dot
        )
        (?:\.[a-z]+)*                    # alphabetic sub-levels, as in I.a
    )
    (?:\.\s*|\s+)                        # separator between ordinal and name
    (?P<name>\S.*)                       # the field name itself (may span lines)
    """,
    re.IGNORECASE | re.VERBOSE | re.DOTALL,
)


def parse_ordinal(field_name_str: str) -> Tuple[str, str]:
    """
    Parsing `DataField.name` string to separate strings containing the ordinal and the name respectively

    This method is meant as part of reading in a `DataModel` from a file, where data model fields might have an ordinal
    attached to them (e.g., "1.1. Pseudonym"), which this method can then neatly separate into ordinal="1.1" and
    name="Pseudonym". The trailing dot of the ordinal is not part of the returned ordinal.

    Recognised ordinals are dot-separated numbers ("1", "1.1"), roman numerals ("ii", "IV"), single letters that are
    followed by a dot ("a."), and roman numerals or letters with alphabetic sub-levels ("I.a"). The ordinal has to be
    separated from the name by a dot and/or whitespace. Names that do not start with such an ordinal are returned
    unchanged together with an empty ordinal. Note that a name starting with a word that is itself a valid roman
    numeral (e.g. "I ...") cannot be told apart from an ordinal and is treated as one.

    >>> parse_ordinal("1.1. Pseudonym")
    ('1.1', 'Pseudonym')
    >>> parse_ordinal("1. Pseudonym")
    ('1', 'Pseudonym')
    >>> parse_ordinal("I.a. Pseudonym")
    ('I.a', 'Pseudonym')
    >>> parse_ordinal("ii. Pseudonym")
    ('ii', 'Pseudonym')
    >>> parse_ordinal("Pseudonym")
    ('', 'Pseudonym')

    :param field_name_str: name of the field, possibly containing an ordinal, to parse
    :returns: a tuple containing the ordinal (empty string if none was found) and the name
    """
    # Missing cells (e.g. NaN/None from a dataframe) are passed through instead of crashing the import.
    if not isinstance(field_name_str, str):
        return '', field_name_str

    match = _ORDINAL_PATTERN.match(field_name_str)
    if match is None:
        # since this is more for optics, do not raise error and just do "nothing"
        return '', field_name_str

    return match.group('ordinal'), match.group('name').strip()
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
from typing import List, Any
from typing import Any

import pytest

from rarelink_phenopacket_mapper.data_standards import Date
from rarelink_phenopacket_mapper.data_standards.code_system import HPO, SNOMED_CT, ICD9
from rarelink_phenopacket_mapper.data_standards.data_models import parse_type_string_representation
from rarelink_phenopacket_mapper.data_standards.data_models.data_type_string_representation import \
_parse_single_string_type_repr
from rarelink_phenopacket_mapper.data_standards.data_models import parse_data_type
from rarelink_phenopacket_mapper.utils.parsing.parse_data_type import \
_parse_single_data_type


@pytest.fixture
Expand Down Expand Up @@ -42,8 +42,8 @@ def resources():
("integer, boolean, string, date, HP", [int, bool, str, Date, HPO]),
]
)
def test_parse_string_type_representation(type_str, result, resources):
assert parse_type_string_representation(type_str, resources) == result
def test_parse_data_type(type_str, result, resources):
assert parse_data_type(type_str, resources) == result


@pytest.mark.parametrize(
Expand All @@ -65,5 +65,5 @@ def test_parse_string_type_representation(type_str, result, resources):
("icd-9", ICD9), # synonym
]
)
def test__parse_single_string_type_repr(type_str, result, resources):
assert _parse_single_string_type_repr(type_str, resources) == result
def test__parse_single_data_type(type_str, result, resources):
assert _parse_single_data_type(type_str, resources) == result
29 changes: 29 additions & 0 deletions tests/utils/parsing/test_parse_ordinal.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import pytest

from rarelink_phenopacket_mapper.utils.parsing import parse_ordinal


# Each case pairs a raw field name with the `(ordinal, name)` tuple expected from `parse_ordinal`.
@pytest.mark.parametrize(
    "field_name, result", [
        # two-level numeric ordinals terminated by a dot
        ("1.1. Pseudonym", ("1.1", "Pseudonym")),
        ("2.1. Date of Birth", ("2.1", "Date of Birth")),
        ("2.2. Sex", ("2.2", "Sex")),
        ("3.1. Patient's status", ("3.1", "Patient's status")),
        ("3.2. Date of death", ("3.2", "Date of death")),
        ("4.1. First contact with specialised centre", ("4.1", "First contact with specialised centre")),
        ("5.1. Age at onset", ("5.1", "Age at onset")),
        ("5.2. Age at diagnosis", ("5.2", "Age at diagnosis")),
        ("6.1. Diagnosis of the rare disease", ("6.1", "Diagnosis of the rare disease")),
        ("6.2. Genetic diagnosis", ("6.2", "Genetic diagnosis")),
        ("6.3. Undiagnosed case", ("6.3", "Undiagnosed case")),
        # numeric ordinals without the trailing dot and/or with a single level
        ("1.1 Pseudonym", ("1.1", "Pseudonym")),
        ("1. Pseudonym", ("1", "Pseudonym")),
        ("1 Pseudonym", ("1", "Pseudonym")),
        # roman-numeral / letter ordinals, with and without the trailing dot
        ("I.a. Pseudonym", ("I.a", "Pseudonym")),
        ("I.a Pseudonym", ("I.a", "Pseudonym")),
        ("ii. Pseudonym", ("ii", "Pseudonym")),
        ("ii Pseudonym", ("ii", "Pseudonym")),
    ]
)
def test_parse_ordinal(field_name, result):
    """`parse_ordinal` must return the expected `(ordinal, name)` tuple for every parametrized field name."""
    assert parse_ordinal(field_name) == result

0 comments on commit 36b0a25

Please sign in to comment.