Skip to content

Commit

Permalink
Merge pull request #174 from BIH-CEI/156-add-util-method-that-loads-d…
Browse files Browse the repository at this point in the history
…ata-in-whatever-format-and-provides-an-iterator-for-instances

156 add util method that loads data in whatever format and provides an iterator for instances
frehburg authored Oct 15, 2024
2 parents f7471b4 + 55ad3f1 commit cb9221f
Showing 20 changed files with 311 additions and 181 deletions.
9 changes: 8 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -27,7 +27,14 @@ classifiers = [
"Topic :: Scientific/Engineering :: Machine Learning"
]
dependencies = [
"loguru", "phenopackets", "pandas", "openpyxl", "jupyter", "requests", "bs4",
"loguru",
"phenopackets",
"pandas",
"openpyxl",
"jupyter",
"requests",
"bs4",
"xmltodict==0.14.1",
]
dynamic = ["version"]

14 changes: 11 additions & 3 deletions src/phenopacket_mapper/__init__.py
Original file line number Diff line number Diff line change
@@ -2,8 +2,16 @@

__version__ = "0.0.1"

from . import cli, data_standards, pipeline, preprocessing, api_requests
from . import data_standards, validate, preprocessing, api_requests, mapping, utils

from .pipeline import PhenopacketMapper
from .data_standards import DataModel
from .mapping import PhenopacketMapper

__all__ = ["cli", "data_standards", "pipeline", "PhenopacketMapper", "preprocessing", "api_requests"]
__all__ = [
"data_standards", "DataModel",
"validate",
"preprocessing",
"api_requests",
"mapping", "PhenopacketMapper",
"utils",
]
1 change: 0 additions & 1 deletion src/phenopacket_mapper/cli/__init__.py

This file was deleted.

50 changes: 0 additions & 50 deletions src/phenopacket_mapper/cli/main.py

This file was deleted.

32 changes: 0 additions & 32 deletions src/phenopacket_mapper/cli/mapping_command.py

This file was deleted.

50 changes: 0 additions & 50 deletions src/phenopacket_mapper/cli/quickstart_command.py

This file was deleted.

19 changes: 0 additions & 19 deletions src/phenopacket_mapper/cli/validate_command.py

This file was deleted.

3 changes: 2 additions & 1 deletion src/phenopacket_mapper/data_standards/data_model.py
Original file line number Diff line number Diff line change
@@ -256,6 +256,7 @@ def load_data(
:param kwargs: Dynamically passed parameters that match {id}_column for each item
:return: A list of `DataModelInstance` objects
"""
# TODO: move the dynamic params to the load method in utils.io
column_names = dict()
for f in self.fields:
column_param = f"{f.id}_column"
@@ -264,7 +265,7 @@ def load_data(
else:
column_names[f.id] = kwargs[column_param]

from phenopacket_mapper.pipeline import load_data_using_data_model
from phenopacket_mapper.utils.io import load_data_using_data_model
return load_data_using_data_model(
path=path,
data_model=self,
12 changes: 0 additions & 12 deletions src/phenopacket_mapper/pipeline/__init__.py

This file was deleted.

19 changes: 19 additions & 0 deletions src/phenopacket_mapper/utils/io/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
"""This module handles the input and output of data."""

from .read_json import read_json
from .read_xml import read_xml, parse_xml
from .data_reader import DataReader
from .input import read_data_model, read_phenopackets, read_phenopacket_from_json, load_data_using_data_model
from .output import write

__all__ = [
'read_json',
'read_xml', 'parse_xml',
'DataReader',
'read_data_model',
'read_phenopackets',
'read_phenopacket_from_json',
'load_data_using_data_model',

'write',
]
111 changes: 111 additions & 0 deletions src/phenopacket_mapper/utils/io/data_reader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
from pathlib import Path
from typing import Union, Tuple, List, Iterable, Literal, Dict
from io import IOBase, TextIOWrapper, BytesIO, BufferedIOBase, TextIOBase

import pandas as pd

from phenopacket_mapper.utils.io import read_json, read_xml


class DataReader:
    """Reads data of a supported format ('csv', 'xlsx', 'json', 'xml') into memory.

    Accepts a path to a single file, a path to a directory of same-typed files,
    or an already-open buffer. After construction, the parsed content is
    available as ``self.data`` and a row-/document-wise iterable representation
    as ``self.iterable``.
    """

    def __init__(
            self,
            file: Union[str, Path, IOBase, List[str], List[Path], List[IOBase]],
            encoding: str = 'utf-8',
            file_extension: Literal['csv', 'xlsx', 'json', 'xml'] = None
    ):
        """Initializes the data reader.

        :param file: a `str`, :class:`Path` or :class:`IOBase` to read from. If `str` or
            :class:`Path`, the input is interpreted as a path to a local file or directory.
        :param encoding: The encoding to use when reading the file. Default is 'utf-8'.
        :param file_extension: The file extension of the file to read. If `None`, the extension
            is inferred from the file path; it must be passed explicitly for buffers.
            Default is `None`.
        :raises FileNotFoundError: if a :class:`Path` input does not exist.
        :raises ValueError: on an unrecognized extension, a buffer without an explicit
            extension, or an unsupported ``file`` type.
        """
        # TODO: fix read xml
        # TODO: add option to pass a list of files to read
        self.is_dir = False
        self.file_extension = None

        if isinstance(file, str):
            file = Path(file)  # normalize: strings are interpreted as local paths

        if isinstance(file, Path):
            if not file.exists():
                raise FileNotFoundError(f"File {file} does not exist.")
            self.path = file
            if file.is_dir():
                self.is_dir = True
            else:
                # NOTE(review): this handle is consumed by _read but never closed explicitly
                self.file = open(self.path, "r", encoding=encoding)
                if file_extension is None:  # extract the file extension from the file path
                    file_extension = self.path.suffix[1:]
                self.handle_file_extension(file_extension)
        elif isinstance(file, IOBase):
            if isinstance(file, (TextIOWrapper, TextIOBase)):
                # fix: this branch previously never assigned self.file, so every
                # text-stream input crashed later in _read with AttributeError
                self.file = file
            elif isinstance(file, (BytesIO, BufferedIOBase)):
                self.file = TextIOWrapper(file, encoding=encoding)

            if file_extension is None:
                raise ValueError("File extension must be provided when passing a file buffer.")
            self.handle_file_extension(file_extension)
        else:
            # fix: previously fell through silently and crashed later in _read
            raise ValueError(f"Invalid input type {type(file)}.")

        self.data, self.iterable = self._read()

    def handle_file_extension(self, fe: str):
        """Stores the normalized (lower-cased) extension, raising ValueError if unsupported."""
        if fe.lower() in ('csv', 'xlsx', 'json', 'xml'):
            self.file_extension = fe.lower()
        else:
            raise ValueError(f"File extension {fe} not recognized.")

    def _read(self) -> Tuple[Union[pd.DataFrame, List, Dict], Iterable]:
        """Reads the data from ``self.file`` (or all files in ``self.path`` for a directory).

        :return: The data and an iterable representation of the data.
        :raises ValueError: for unsupported extensions or an empty / mixed-type directory.
        """
        if not self.is_dir:
            if self.file_extension == 'csv':
                df = pd.read_csv(self.file)
                return df, list(df.iterrows())
            elif self.file_extension == 'xlsx':
                # NOTE(review): xlsx is binary; a handle opened in text mode likely
                # breaks pd.read_excel — confirm with a real workbook
                df = pd.read_excel(self.file)
                return df, list(df.iterrows())
            elif self.file_extension == 'json':
                return (file_contents := read_json(self.file)), [file_contents]
            elif self.file_extension == 'xml':
                return (file_contents := read_xml(self.file)), [file_contents]
            else:
                raise ValueError(f'Unknown file type with extension {self.file_extension}')
        else:
            # collect list of all files in the folder
            files: List[Path] = [f for f in self.path.iterdir() if f.is_file()]
            extensions = list(set(f.suffix[1:] for f in files))
            if len(extensions) > 1:
                raise ValueError(f"Cannot read files of different types: {extensions}")
            elif len(extensions) == 0:
                # fix: message previously referenced self.file, which is unset for directories
                raise ValueError(f"No files found in the directory specified: {self.path}")

            self.handle_file_extension(extensions[0])

            if self.file_extension == 'json':
                jsons = [read_json(f) for f in files]
                return jsons, jsons
            elif self.file_extension == 'xml':
                xmls = [read_xml(f) for f in files]
                return xmls, xmls
            else:
                raise ValueError(f"File extension {extensions} not recognized or not supported for reading files "
                                 f"from a directory. Specified directory: {self.path}. Extensions found: "
                                 f"{extensions}")
Original file line number Diff line number Diff line change
@@ -12,6 +12,7 @@
DataSet
from phenopacket_mapper.utils import loc_default
from phenopacket_mapper.utils import parsing
from phenopacket_mapper.utils.io.data_reader import DataReader
from phenopacket_mapper.utils.parsing import parse_ordinal


@@ -165,13 +166,11 @@ def load_data_using_data_model(
else:
raise ValueError(f'Path must be a string or Path object, not {type(path)}')

file_extension = path.suffix[1:]
if file_extension == 'csv':
df = pd.read_csv(path)
elif file_extension == 'xlsx':
df = pd.read_excel(path)
else:
raise ValueError(f'Unknown file type with extension {file_extension}')
dr = DataReader(path)
data, data_iterable = dr.data, dr.iterable

# TODO: for the moment assume that the data is a pandas DataFrame
df = data

# check column_names is in the correct format
if isinstance(column_names, MappingProxyType):
@@ -226,15 +225,15 @@ def read_phenopackets(dir_path: Path) -> List[Phenopacket]:
return phenopackets_list


def read_phenopacket_from_json(file_path: Union[str, Path]) -> Phenopacket:
def read_phenopacket_from_json(path: Union[str, Path]) -> Phenopacket:
"""Reads a Phenopacket from a JSON file.
:param file_path: The path to the JSON file.
:type file_path: Union[str, Path]
:param path: The path to the JSON file.
:type path: Union[str, Path]
:return: The loaded Phenopacket.
:rtype: Phenopacket
"""
with open(file_path, 'r') as fh:
with open(path, 'r') as fh:
json_data = fh.read()
phenopacket = Phenopacket()
Parse(json_data, phenopacket)
File renamed without changes.
18 changes: 18 additions & 0 deletions src/phenopacket_mapper/utils/io/read_json.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import json
from io import IOBase
from pathlib import Path
from typing import Union, Dict


def read_json(path: Union[str, Path, IOBase]) -> Dict:
    """Reads a JSON document into a dictionary.

    :param path: A path (``str`` or :class:`Path`) to a local JSON file, or an
        already-open readable buffer.
    :return: The parsed JSON content as a dictionary.
    :raises ValueError: If ``path`` is not a ``str``, :class:`Path`, or :class:`IOBase`.
    """
    if isinstance(path, str):
        path = Path(path)

    if isinstance(path, Path):
        # fix: close the file handle after reading (it was previously leaked)
        with open(path) as f:
            return json.load(f)
    elif isinstance(path, IOBase):
        return json.load(path)
    else:
        raise ValueError(f"Invalid input type {type(path)}.")
64 changes: 64 additions & 0 deletions src/phenopacket_mapper/utils/io/read_xml.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
from io import IOBase
from pathlib import Path
from typing import Union, Dict
import xmltodict


def read_xml(path: Union[str, Path, IOBase], encoding='utf-8') -> Dict:
    """Reads an XML document from a path or an open buffer into a dictionary.

    :param path: A path (``str`` or :class:`Path`) to a local XML file, or an
        already-open readable buffer.
    :param encoding: Text encoding used when opening a file by path.
    :return: The parsed XML content with inferred primitive types.
    :raises ValueError: If ``path`` is not a ``str``, :class:`Path`, or :class:`IOBase`.
    """
    if isinstance(path, str):
        path = Path(path)

    # Guard-clause style: handle buffers first, then paths, then fail.
    if isinstance(path, IOBase):
        return parse_xml(path)
    if isinstance(path, Path):
        with open(path, 'r', encoding=encoding) as fh:
            return parse_xml(fh)
    raise ValueError(f"Invalid input type {type(path)}.")


def _post_process_xml_dict(dict_: Dict) -> Dict:
    """Recursively converts the string values of an xmltodict result to Python types.

    Strings are parsed to ``int``, ``bool``, or ``float`` where possible; nested
    dicts and lists are processed recursively; the xsi:nil idiom
    (``{'@xsi:nil': 'true'}``) is mapped to ``None``. Mutates and returns ``dict_``.
    """
    def parse_primitive_value(value):
        # xmltodict yields None for empty tags; pass it through unchanged
        if value is None:
            return None
        # fix: try int() directly so negative integers become int, not float
        try:
            return int(value)
        except ValueError:
            pass
        if value.lower() == "true":
            return True
        if value.lower() == "false":
            return False
        try:
            return float(value)
        except ValueError:
            return value

    # fix: removed leftover debug print statements
    for k, v in dict_.items():
        if isinstance(v, dict):
            if v == {'@xsi:nil': 'true'}:  # resolves <null xsi:nil="true"/>
                dict_[k] = None
            else:
                dict_[k] = _post_process_xml_dict(v)
        elif isinstance(v, list):
            dict_[k] = [
                _post_process_xml_dict(item) if isinstance(item, dict) else parse_primitive_value(item)
                for item in v
            ]
        elif isinstance(v, str):
            dict_[k] = parse_primitive_value(v)

    return dict_


def parse_xml(file: IOBase) -> Dict:
    """Parse an XML file into a dictionary with inferred types.

    :param file: An open, readable buffer containing the XML document.
    :return: The parsed document with string values converted to primitive types.
    """
    # fix: removed leftover debug print statements
    dict_ = xmltodict.parse(file.read())
    return _post_process_xml_dict(dict_)

8 changes: 8 additions & 0 deletions src/phenopacket_mapper/validate/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
"""This module includes the validate for mapping data to phenopackets."""

from .validate import validate, read_validate

__all__ = [
'validate',
'read_validate'
]
Original file line number Diff line number Diff line change
@@ -3,7 +3,7 @@

from phenopackets.schema.v2 import Phenopacket

from phenopacket_mapper.pipeline import read_phenopackets
from phenopacket_mapper.utils.io import read_phenopackets


class Validator:
Empty file.
25 changes: 25 additions & 0 deletions tests/utils/io/test_read_json.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from io import StringIO

import pytest

from phenopacket_mapper.utils.io import read_json


# Each case pairs a raw JSON string with the dict read_json must produce.
_READ_JSON_CASES = [
    ('{"string": "Hello World"}', {"string": "Hello World"}),
    ('{"object": {"a": "b","c": "d"}}', {"object": {"a": "b","c": "d"}}),
    ('{"number": 123}', {"number": 123}),
    ('{"number": -123}', {"number": -123}),
    ('{"number": 123.4}', {"number": 123.4}),
    ('{"null": null}', {"null": None}),
    ('{"color": "gold"}', {"color": "gold"}),
    ('{"boolean": true}', {"boolean": True}),
    ('{"boolean": false}', {"boolean": False}),
    ('{"array": [1,2,3]}', {"array": [1,2,3]}),
    ('{"array": [1,2,3],"boolean": true, "color": "gold","null": null,"number": 123, "object": {"a": "b","c": "d"}, "string": "Hello World"}', {"array": [1,2,3],"boolean": True, "color": "gold","null": None, "number": 123, "object": {"a": "b","c": "d"}, "string": "Hello World"}),
]


@pytest.mark.parametrize("raw,parsed", _READ_JSON_CASES)
def test_read_json(raw, parsed):
    """read_json parses JSON primitives, objects, arrays, and null from a text buffer."""
    assert read_json(StringIO(raw)) == parsed
34 changes: 34 additions & 0 deletions tests/utils/io/test_read_xml.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
from phenopacket_mapper.utils.io import read_xml

import pytest
from io import StringIO

# Each case pairs a raw XML string with the dict read_xml must produce,
# covering primitives, nil/empty tags, repeated elements, and nesting.
_READ_XML_CASES = [
    ('<string>Hello World</string>', {"string": "Hello World"}),
    ('<object><a>b</a><c>d</c></object>', {"object": {"a": "b", "c": "d"}}),
    ('<number>123</number>', {"number": 123}),
    ('<number>-123</number>', {"number": -123}),
    ('<number>123.4</number>', {"number": 123.4}),
    ('<null></null>', {"null": None}),  # empty tag
    ('<null />', {"null": None}),  # empty tag
    ('<null xsi:nil="true"/>', {"null": None}),  # explicit null
    ('<color>gold</color>', {"color": "gold"}),
    ('<boolean>true</boolean>', {"boolean": True}),
    ('<boolean>false</boolean>', {"boolean": False}),
    ('<array><item>1</item><item>2</item><item>3</item></array>', {"array": {"item": [1, 2, 3]}}),
    (
        '<root>'
        '<array><item>1</item><item>2</item><item>3</item></array>'
        '<boolean>true</boolean>'
        '<color>gold</color>'
        '<number>123</number>'
        '<object><a>b</a><c>d</c></object>'
        '<string>Hello World</string>'
        '</root>',
        {"root":{"array": {"item": [1, 2, 3]}, "boolean": True, "color": "gold", "number": 123, "object": {"a": "b", "c": "d"}, "string": "Hello World"}}
    ),
]


@pytest.mark.parametrize("raw,parsed", _READ_XML_CASES)
def test_read_xml(raw, parsed):
    """read_xml parses XML documents into dicts with inferred primitive types."""
    assert read_xml(StringIO(raw)) == parsed

0 comments on commit cb9221f

Please sign in to comment.