Skip to content

Commit

Permalink
Merge pull request #118 from BIH-CEI/115-add-preprocessing-2
Browse files Browse the repository at this point in the history
115 add preprocessing 2
  • Loading branch information
frehburg authored Sep 24, 2024
2 parents c19b614 + 2b28cca commit 0b73035
Show file tree
Hide file tree
Showing 6 changed files with 135 additions and 4 deletions.
4 changes: 2 additions & 2 deletions src/phenopacket_mapper/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@

__version__ = "0.0.1"

from . import cli, data_standards, pipeline
from . import cli, data_standards, pipeline, preprocessing

from .pipeline import PhenopacketMapper

__all__ = ["cli", "data_standards", "pipeline", "PhenopacketMapper"]
__all__ = ["cli", "data_standards", "pipeline", "PhenopacketMapper", "preprocessing"]
59 changes: 57 additions & 2 deletions src/phenopacket_mapper/data_standards/data_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,15 @@
from dataclasses import dataclass, field
from pathlib import Path
from types import MappingProxyType
from typing import Union, List, Literal, Dict, Optional, Any
from typing import Union, List, Literal, Dict, Optional, Any, Callable
import warnings

import pandas as pd

from phenopacket_mapper.data_standards import CodeSystem
from phenopacket_mapper.data_standards.date import Date
from phenopacket_mapper.data_standards.value_set import ValueSet
from phenopacket_mapper.preprocessing import preprocess, preprocess_method


@dataclass(slots=True, frozen=True)
Expand Down Expand Up @@ -83,7 +84,7 @@ def __str__(self):
return ret


@dataclass(slots=True, frozen=True)
@dataclass(slots=True)
class DataFieldValue:
"""This class defines the value of a `DataField` in a `DataModelInstance`
Expand Down Expand Up @@ -411,6 +412,60 @@ def data_frame(self) -> pd.DataFrame:
def __iter__(self):
    """Return an iterator over the elements of ``self.data``."""
    return iter(self.data)

def preprocess(
        self,
        fields: Union[str, DataField, List[Union[str, DataField]]],
        mapping: Union[Dict, Callable],
        **kwargs
):
    """Preprocess one or more fields of the dataset.

    For a single field, preprocessing happens in place: every matching value in ``self.data`` is
    replaced by the result of ``preprocess(value, mapping, **kwargs)``.

    For several fields, ``mapping`` must be a callable. For each instance in ``self.data`` it is
    called once with the list of that instance's values for the requested fields (in the order the
    fields were given). E.g.:

    ```python
    def combine(values, **kwargs):
        field1, field2 = values
        # do something with values
        return "preprocessed_values" + kwargs["arg1"] + kwargs["arg2"]

    dataset.preprocess(["field_1", "field_2"], combine, arg1="value1", arg2="value2")
    ```

    :param fields: A field id, a `DataField`, or a list of either, selecting the fields to preprocess
    :param mapping: A dictionary or method to use for preprocessing
    :param kwargs: Additional keyword arguments passed on to `mapping` when it is a method
    :raises ValueError: If an entry of `fields` is neither `str` nor `DataField`, if no fields are
        given, or if a dictionary is used to preprocess multiple fields
    """
    # A bare str or DataField denotes exactly one field. Wrapping it keeps a str from being
    # iterated character by character and a DataField from being iterated at all.
    if isinstance(fields, (str, DataField)):
        fields = [fields]

    field_ids = list()
    for f in fields:
        if isinstance(f, str):
            field_ids.append(f)
        elif isinstance(f, DataField):
            field_ids.append(f.id)
        else:
            raise ValueError(f"Field {f} is not of type str or DataField")

    if len(field_ids) == 0:
        raise ValueError("No fields to preprocess")
    elif len(field_ids) == 1:
        field_id = field_ids[0]
        for instance in self.data:
            for v in instance.values:
                if v.field.id == field_id:
                    v.value = preprocess(v.value, mapping, **kwargs)
    else:
        if isinstance(mapping, dict):
            raise ValueError("Mapping dictionary cannot be used to preprocess multiple fields")
        elif isinstance(mapping, Callable):
            for instance in self.data:
                # Collect the values per instance so that the method receives exactly one
                # value per requested field, as shown in the docstring example.
                values = [
                    v.value
                    for field_id in field_ids
                    for v in instance.values
                    if v.field.id == field_id
                ]

                # NOTE(review): the return value is not written back to any field, because it
                # is not specified which field should receive it - confirm the intended target.
                preprocess_method(values, mapping, **kwargs)

def head(self, n: int = 5):
if self.data_frame is not None:
return self.data_frame.head(n)
Expand Down
7 changes: 7 additions & 0 deletions src/phenopacket_mapper/preprocessing/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
"""Methods for preprocessing data before mapping to Phenopackets."""

from .preprocess_dict import preprocess_dict
from .preprocess_method import preprocess_method
from .preprocess import preprocess

__all__ = ["preprocess_dict", "preprocess_method", "preprocess"]
23 changes: 23 additions & 0 deletions src/phenopacket_mapper/preprocessing/preprocess.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import warnings
from typing import Any, Union, Dict, Callable

from phenopacket_mapper.preprocessing import preprocess_dict, preprocess_method


def preprocess(
        value: Any,
        mapping: Union[Dict, Callable],
        **kwargs
) -> Any:
    """Preprocess a single value with either a mapping dictionary or a method.

    Dispatches on the type of `mapping`: dictionaries are handled by `preprocess_dict`, callables by
    `preprocess_method` (which also receives `kwargs`). Please consult the documentation of these
    functions for more information. Any other mapping type triggers a warning and leaves the value
    untouched.

    :param value: The value to preprocess.
    :param mapping: A dictionary or method to use for preprocessing.
    :param kwargs: Additional keyword arguments, only passed on when `mapping` is a method.
    :return: The preprocessed value, or the original value if the mapping type is not supported.
    """
    if isinstance(mapping, dict):
        result = preprocess_dict(value, mapping)
    elif isinstance(mapping, Callable):
        result = preprocess_method(value, mapping, **kwargs)
    else:
        warnings.warn(f"Mapping type {type(mapping)} in preprocessing not supported. Returning original value.")
        result = value

    return result
21 changes: 21 additions & 0 deletions src/phenopacket_mapper/preprocessing/preprocess_dict.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import warnings
from typing import Any, Dict


def preprocess_dict(value: Any, mapping_dict: Dict) -> Any:
    """Takes a value and uses a mapping dictionary to preprocess it.

    If the value is in the mapping dictionary, the corresponding value is returned.
    If the value is not in the mapping dictionary, a warning is issued and the original value is
    returned. Unhashable values (e.g. lists) can never be dictionary keys and are therefore treated
    as not found as well.

    :param value: The value to preprocess.
    :param mapping_dict: A dictionary containing the mapping rules.
    :return: The preprocessed value.
    """
    try:
        return mapping_dict[value]
    except (KeyError, TypeError):
        # TypeError is raised when looking up an unhashable value; such a value cannot be a key,
        # so it is handled like any other missing value instead of aborting the preprocessing.
        warnings.warn(f"Value {value} not found in mapping dictionary.")
        return value
25 changes: 25 additions & 0 deletions src/phenopacket_mapper/preprocessing/preprocess_method.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from typing import Any, Callable


def preprocess_method(value: Any, method: Callable, **kwargs) -> Any:
    """Takes a value and uses a method to preprocess it.

    The method is called with the value as an argument.
    If the method raises an exception, a warning is issued and the original value is returned.
    If the method requires additional arguments, they can be passed as keyword arguments in `kwargs`.
    Please write the method such that it is callable as `method(value, **kwargs)`.

    :param value: The value to preprocess.
    :param method: The method to use for preprocessing.
    :param kwargs: Additional arguments for the method.
    :return: The preprocessed value.
    """
    try:
        return method(value, **kwargs)
    except Exception as e:
        # Deliberately broad: preprocessing is best-effort, so any failure of the user-supplied
        # method falls back to the original value. The failure is reported through `warnings`
        # (like the other preprocessing helpers) so that callers can filter or capture it.
        import warnings

        warnings.warn(f"Error while preprocessing value {value} with method {method}. Error message: {e}")
        return value

0 comments on commit 0b73035

Please sign in to comment.