Skip to content

Commit

Permalink
Merge pull request #91 from BIH-CEI/65-implement-data-loading
Browse files Browse the repository at this point in the history
65 implement data loading
  • Loading branch information
frehburg authored Sep 19, 2024
2 parents f3ff7ea + c3e9c96 commit 87ca9eb
Show file tree
Hide file tree
Showing 13 changed files with 662 additions and 177 deletions.
390 changes: 303 additions & 87 deletions notebooks/erdri_cds_definition_in_code.ipynb

Large diffs are not rendered by default.

Binary file modified res/test_data/erdri/erdri_cds_test_data.xlsx
Binary file not shown.
2 changes: 1 addition & 1 deletion src/phenopacket_mapper/data_standards/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from .date import Date
from .code_system import CodeSystem, SNOMED_CT, HPO, MONDO, OMIM, ORDO, LOINC
from .code import Coding, CodeableConcept
from .data_model import DataModel, DataField, DataModelInstance, DataFieldValue
from .data_model import DataModel, DataField, DataModelInstance, DataFieldValue, DataSet
from . import data_models

__all__ = [
Expand Down
5 changes: 5 additions & 0 deletions src/phenopacket_mapper/data_standards/code_system.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,11 @@ def __str__(self):
def __repr__(self):
return str(self)

def __contains__(self, item):
from phenopacket_mapper.data_standards import Coding
if isinstance(item, Coding):
return self == item.system


SNOMED_CT = CodeSystem(
name="SNOMED CT",
Expand Down
232 changes: 207 additions & 25 deletions src/phenopacket_mapper/data_standards/data_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,14 @@
`DataModelInstance` class is used to define an instance of a `DataModel`, i.e. a record in a dataset.
"""

from dataclasses import dataclass
from dataclasses import dataclass, field
from pathlib import Path
from types import MappingProxyType
from typing import Union, List, Literal, Dict
from typing import Union, List, Literal, Dict, Optional, Any
import warnings

import pandas as pd

from phenopacket_mapper.data_standards import CodeSystem
from phenopacket_mapper.data_standards.date import Date
from phenopacket_mapper.data_standards.value_set import ValueSet
Expand All @@ -26,24 +28,44 @@ class DataField:
A dataa field is the equivalent of a column in a table. It has a name, a value set, a description, a section, a
required flag, a specification, and an ordinal.
The string for the `id` field is generated from the `name` field using the `str_to_valid_id` function from the
`phenopacket_mapper.utils` module. This attempts to convert the `name` field. Sometimes this might not work as
desired, in which case the `id` field can be set manually.
Naming rules for the `id` field:
- The `id` field must be a valid Python identifier
- The `id` field must start with a letter or the underscore character
- The `id` field must cannot start with a number
- The `id` field can only contain lowercase alpha-numeric characters and underscores (a-z, 0-9, and _ )
- The `id` field cannot be any of the Python keywords (e.g. `in`, `is`, `not`, `class`, etc.).
- The `id` field must be unique within a `DataModel`
:ivar name: Name of the field
:ivar value_set: Value set of the field
:ivar id: Id of the field, adhering to the naming rules stated above
:ivar description: Description of the field
:ivar section: Section of the field (Only applicable if the data model is divided into sections)
:ivar required: Required flag of the field
:ivar specification: Text specification of the field (a description of the value set and field)
:ivar ordinal: Ordinal of the field (E.g. 1.1, 1.2, 2.1, etc.)
"""
name: str
value_set: ValueSet
description: str = ''
section: str = ''
required: bool = True
specification: str = None
ordinal: str = None
name: str = field()
value_set: ValueSet = field()
id: str = field(default=None)
description: str = field(default='')
section: str = field(default='')
required: bool = field(default=True)
specification: str = field(default='')
ordinal: str = field(default='')

def __post_init__(self):
if not self.id:
from phenopacket_mapper.utils import str_to_valid_id
object.__setattr__(self, 'id', str_to_valid_id(self.name))

def __str__(self):
ret = "DataField(\n"
ret += f"\t\tid: {self.id},\n"
ret += f"\t\tsection: {self.section},\n"
ret += f"\t\tordinal, name: ({self.ordinal}, {self.name}),\n"
ret += f"\t\tvalue_set: {self.value_set}, required: {self.required},\n"
Expand All @@ -58,9 +80,11 @@ class DataFieldValue:
Equivalent to a cell value in a table.
:ivar row_no: The id of the value, i.e. the row number
:ivar field: DataField: The `DataField` to which this value belongs and which defines the value set for the field.
:ivar value: The value of the field.
"""
row_no: Union[str, int]
field: DataField
value: Union[int, float, str, bool, Date, CodeSystem]

Expand All @@ -72,13 +96,28 @@ def validate(self) -> bool:
:return: True if the instance is valid, False otherwise
"""
if self.field.required and self.value is None:
if self.field.required and self.value is None: # no value
warnings.warn(f"Field {self.field.name} is required but has no value")
return False
if self.value is not None and self.field.value_set:
if self.value in self.field.value_set:
elif self.value is not None and self.field.value_set:
if Any in self.field.value_set: # value set allows any
return True
elif self.value in self.field.value_set: # raw value (likely a primitive) is in the value set
return True
warnings.warn(f"Value {self.value} is not in the value set of field {self.field.name}")
else: # check if the value matches one of the types in the value set
for e in self.field.value_set:
if isinstance(e, type):
cur_type = e
if cur_type is type(self.value):
return True
elif isinstance(e, CodeSystem):
cs = e
from phenopacket_mapper.data_standards import Coding
if isinstance(self.value, Coding) and self.value.system == cs:
return True

warnings.warn(f"Value {self.value} of type {type(self.value)} is not in the value set of field "
f"{self.field.name} (row {self.row_no})")
return False


Expand All @@ -88,13 +127,32 @@ class DataModel:
A data model can be used to import data and map it to the Phenopacket schema. It is made up of a list of `DataField`
Given that all `DataField` objects in a `DataModel` have unique names, the `id` field is generated from the `name`.
E.g.: `DataField(name='Date of Birth', ...)` will have an `id` of `'date_of_birth'`. The `DataField` objects can
be accessed using the `id` as an attribute of the `DataModel` object. E.g.: `data_model.date_of_birth`. This is
useful in the data reading and mapping processes.
>>> data_model = DataModel("Test data model", [DataField(name="Field 1", value_set=ValueSet())])
>>> data_model.field_1
DataField(name='Field 1', value_set=ValueSet(elements=[], name='', description=''), id='field_1', description='', section='', required=True, specification='', ordinal='')
:ivar data_model_name: Name of the data model
:ivar fields: List of `DataField` objects
:ivar resources: List of `CodeSystem` objects
"""
data_model_name: str
fields: List[DataField]
resources: List[CodeSystem]
data_model_name: str = field()
fields: List[DataField] = field()
resources: List[CodeSystem] = field(default_factory=list)

def __post_init__(self):
if len(self.fields) != len(set([f.id for f in self.fields])):
raise ValueError("All fields in a DataModel must have unique identifiers")

def __getattr__(self, var_name: str) -> DataField:
for f in self.fields:
if f.id == var_name:
return f
raise AttributeError(f"'DataModel' object has no attribute '{var_name}'")

def __str__(self):
ret = f"DataModel(name={self.data_model_name}\n"
Expand All @@ -106,21 +164,63 @@ def __str__(self):
ret += ")"
return ret

def get_field(self, field_id: str, default: Optional = None) -> Optional[DataField]:
"""Returns a DataField object by its id
:param field_id: The id of the field
:param default: The default value to return if the field is not found
:return: The DataField object
"""
for f in self.fields:
if f.id == field_id:
return f
if default or default is None:
return default
raise ValueError(f"Field with id {field_id} not found in DataModel")

def get_field_ids(self) -> List[str]:
"""Returns a list of the ids of the DataFields in the DataModel"""
return [f.id for f in self.fields]

def load_data(
self,
path: Union[str, Path],
compliance: Literal['soft', 'hard'] = 'soft'
) -> List['DataModelInstance']:
compliance: Literal['soft', 'hard'] = 'soft',
**kwargs
) -> 'DataSet':
"""Loads data from a file using a DataModel definition
To call this method, pass the column name for each field in the DataModel as a keyword argument. This is done
by passing the field id followed by '_column'. E.g. if the DataModel has a field with id 'date_of_birth', the
column name in the file should be passed as 'date_of_birth_column'. The method will raise an error if any of
the fields are missing.
E.g.:
```python
data_model = DataModel("Test data model", [DataField(name="Field 1", value_set=ValueSet())])
data_model.load_data("data.csv", field_1_column="column_name_in_file")
```
:param path: Path to the file containing the data
:param compliance: Compliance level to use when loading the data. If 'soft', the data will be loaded even if it
does not comply with the data model definition. If 'hard', the data will be loaded only if it
complies with the data model definition.
:param compliance: Compliance level to use when loading the data.
:param kwargs: Dynamically passed parameters that match {id}_column for each item
:return: A list of `DataModelInstance` objects
"""
column_names = dict()
for f in self.fields:
column_param = f"{f.id}_column"
if column_param not in kwargs:
raise TypeError(f"load_data() missing 1 required argument: '{column_param}'")
else:
column_names[f.id] = kwargs[column_param]

from phenopacket_mapper.pipeline import load_data_using_data_model
return load_data_using_data_model(path, self, compliance)
return load_data_using_data_model(
path=path,
data_model=self,
column_names=column_names,
compliance=compliance
)

@staticmethod
def from_file(
Expand Down Expand Up @@ -171,17 +271,26 @@ def from_file(
def load_data_using_data_model(
path: Union[str, Path],
data_model: 'DataModel',
column_names: Dict[str, str],
compliance: Literal['soft', 'hard'] = 'soft',
) -> List['DataModelInstance']:
) -> 'DataSet':
"""Loads data from a file using a DataModel definition
:param path: Path to formatted csv or excel file
:param data_model: DataModel to use for reading the file
:param column_names: A dictionary mapping from the id of each field of the `DataField` to the name of a
column in the file
:param compliance: Compliance level to enforce when reading the file. If 'soft', the file can have extra fields
that are not in the DataModel. If 'hard', the file must have all fields in the DataModel.
:return: List of DataModelInstances
"""
return data_model.load_data(path, compliance)
from phenopacket_mapper.pipeline import load_data_using_data_model
return load_data_using_data_model(
path=path,
data_model=data_model,
column_names=column_names,
compliance=compliance
)


@dataclass(slots=True)
Expand All @@ -190,12 +299,14 @@ class DataModelInstance:
This class is used to define an instance of a `DataModel`, i.e. a record or row in a dataset.
:ivar row_no: The id of the instance, i.e. the row number
:ivar data_model: The `DataModel` object that defines the data model for this instance
:ivar values: A list of `DataFieldValue` objects, each adhering to the `DataField` definition in the `DataModel`
:ivar compliance: Compliance level to enforce when validating the instance. If 'soft', the instance can have extra
fields that are not in the DataModel. If 'hard', the instance must have all fields in the
DataModel.
"""
row_no: Union[int, str]
data_model: DataModel
values: List[DataFieldValue]
compliance: Literal['soft', 'hard'] = 'soft'
Expand All @@ -211,7 +322,7 @@ def validate(self) -> bool:
:return: True if the instance is valid, False otherwise
"""
error_msg = f"Instance values do not comply with their respective fields' valuesets. {self}"
error_msg = f"Instance values do not comply with their respective fields' valuesets. (row {self.row_no})"
for v in self.values:
if not v.validate():
if self.compliance == 'hard':
Expand All @@ -221,4 +332,75 @@ def validate(self) -> bool:
return False
else:
raise ValueError(f"Compliance level {self.compliance} is not valid")

is_required = set(f.id for f in self.data_model.fields if f.required)
fields_present = set(v.field.id for v in self.values)

if len(missing_fields := (is_required - fields_present)) > 0:
error_msg = (f"Required fields are missing in the instance. (row {self.row_no}) "
f"\n(missing_fields={', '.join(missing_fields)})")
if self.compliance == 'hard':
raise ValueError(error_msg)
elif self.compliance == 'soft':
warnings.warn(error_msg)
return False
else:
raise ValueError(f"Compliance level {self.compliance} is not valid")
return True

def __iter__(self):
return iter(self.values)

def __getattr__(self, var_name: str) -> DataFieldValue:
fields = [v.field.id for v in self.values]
if var_name in fields:
return self.values[fields.index(var_name)]
raise AttributeError(f"'DataModelInstance' object has no attribute '{var_name}'")


@dataclass(slots=True, frozen=True)
class DataSet:
"""This class defines a dataset as defined by a `DataModel`
This class is used to define a dataset as defined by a `DataModel`. It is a collection of `DataModelInstance`
objects.
:ivar data_model: The `DataModel` object that defines the data model for this dataset
:ivar data: A list of `DataModelInstance` objects, each adhering to the `DataField` definition in the `DataModel`
:ivar data_frame: A pandas DataFrame representation of the dataset
"""
data_model: 'DataModel' = field()
data: List[DataModelInstance] = field()

@property
def height(self):
return len(self.data)

@property
def width(self):
return len(self.data_model.fields)

@property
def data_frame(self) -> pd.DataFrame:
column_names = [f.id for f in self.data_model.fields]
data_dict = {c: list() for c in column_names}
for instance in self.data:
for f in self.data_model.fields:
field_id = f.id
try:
dfv: DataFieldValue = getattr(instance, field_id)
value = dfv.value
except AttributeError:
value = None
finally:
data_dict[field_id].append(value)
return pd.DataFrame(data_dict, columns=column_names)

def __iter__(self):
return iter(self.data)

def head(self, n: int = 5):
if self.data_frame is not None:
return self.data_frame.head(n)
else:
warnings.warn("No data frame object available for this dataset")
4 changes: 2 additions & 2 deletions src/phenopacket_mapper/pipeline/__init__.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
"""This module includes the pipeline for mapping data to phenopackets."""

from .input import read_file, read_data_model, read_phenopackets, read_phenopacket_from_json, load_data_using_data_model
from .input import read_data_model, read_phenopackets, read_phenopacket_from_json, load_data_using_data_model
from .mapper import PhenopacketMapper
from .output import write
from .validate import validate, read_validate

__all__ = [
'read_file', 'read_data_model', 'read_phenopackets', 'read_phenopacket_from_json', 'load_data_using_data_model',
'read_data_model', 'read_phenopackets', 'read_phenopacket_from_json', 'load_data_using_data_model',
'write',
'PhenopacketMapper'
]
Loading

0 comments on commit 87ca9eb

Please sign in to comment.