generated from frehburg/TemplateForPythonProjects
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Loading status checks…
Merge pull request #174 from BIH-CEI/156-add-util-method-that-loads-d…
…ata-in-whatever-format-and-provides-an-iterator-for-instances 156 add util method that loads data in whatever format and provides an iterator for instances
Showing
20 changed files
with
311 additions
and
181 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
"""This module handles the input and output of data.""" | ||
|
||
from .read_json import read_json | ||
from .read_xml import read_xml, parse_xml | ||
from .data_reader import DataReader | ||
from .input import read_data_model, read_phenopackets, read_phenopacket_from_json, load_data_using_data_model | ||
from .output import write | ||
|
||
# Public API of the io package: readers first, then the writer.
__all__ = [
    "read_json",
    "read_xml",
    "parse_xml",
    "DataReader",
    "read_data_model",
    "read_phenopackets",
    "read_phenopacket_from_json",
    "load_data_using_data_model",
    "write",
]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,111 @@ | ||
from pathlib import Path | ||
from typing import Union, Tuple, List, Iterable, Literal, Dict | ||
from io import IOBase, TextIOWrapper, BytesIO, BufferedIOBase, TextIOBase | ||
|
||
import pandas as pd | ||
|
||
from phenopacket_mapper.utils.io import read_json, read_xml | ||
|
||
|
||
class DataReader:
    """Load data from a file, directory or buffer and expose it with an iterable of instances.

    After construction, ``data`` holds the parsed content and ``iterable`` holds a list
    with one entry per instance (table rows for csv/xlsx, parsed documents for json/xml).
    """

    def __init__(
            self,
            file: Union[str, Path, IOBase, List[str], List[Path], List[IOBase]],
            encoding: str = 'utf-8',
            file_extension: Literal['csv', 'xlsx', 'json', 'xml'] = None
    ):
        """Initializes the data reader.

        :param file: a `str`, :class:`Path` or :class:`IOBase` to read from. If `str` or :class:`Path`, then the
            input is interpreted as a path to a local file or directory.
        :param encoding: The encoding to use when reading the file. Default is 'utf-8'.
        :param file_extension: The file extension of the file to read. If `None`, the file extension is inferred
            from the file path. Must be given when a buffer is passed. Default is `None`.
        :raises FileNotFoundError: if a path is given that does not exist.
        :raises ValueError: if the extension is missing or unsupported, or the input type is unsupported.
        """
        # TODO: fix read xml
        # TODO: add option to pass a list of files to read
        self.is_dir = False
        self.file_extension = None
        # Always define both attributes so later code and error messages can rely on them.
        self.path = None
        self.file = None
        opened_here = False  # only handles opened by this reader are closed by it

        # Treat `str` exactly like `Path` so both get existence and directory handling.
        if isinstance(file, str):
            file = Path(file)

        if isinstance(file, Path):
            if not file.exists():
                raise FileNotFoundError(f"File {file} does not exist.")
            self.path = file
            if file.is_dir():
                self.is_dir = True
            else:
                if file_extension is None:  # extract the file extension from the file path
                    file_extension = file.suffix[1:]
                self.handle_file_extension(file_extension)

                # Excel workbooks are binary; everything else is decoded as text.
                if self.file_extension == 'xlsx':
                    self.file = open(file, "rb")
                else:
                    self.file = open(file, "r", encoding=encoding)
                opened_here = True
        elif isinstance(file, IOBase):
            if file_extension is None:
                raise ValueError("File extension must be provided when passing a file buffer.")
            self.handle_file_extension(file_extension)

            if isinstance(file, TextIOBase) or self.file_extension == 'xlsx':
                # Text buffers are usable as they are; xlsx must stay binary for pandas.
                self.file = file
            else:
                self.file = TextIOWrapper(file, encoding=encoding)
        else:
            raise ValueError(f"Unsupported input type {type(file)}.")

        try:
            self.data, self.iterable = self._read()
        finally:
            if opened_here:
                self.file.close()

    def handle_file_extension(self, fe: str):
        """Validates `fe` and stores its lower-cased form in `self.file_extension`.

        :param fe: file extension without the leading dot
        :raises ValueError: if the extension is not one of csv, xlsx, json or xml.
        """
        if fe.lower() in ['csv', 'xlsx', 'json', 'xml']:
            self.file_extension = fe.lower()
        else:
            raise ValueError(f"File extension {fe} not recognized.")

    def _read(self) -> Tuple[Union[pd.DataFrame, List, Dict], Iterable]:
        """Reads the data.

        :return: The data and an iterable representation of the data.
        :raises ValueError: if the file type is unknown, or a directory is empty, mixes file types or holds
            files that are neither json nor xml.
        """
        if not self.is_dir:
            if self.file_extension == 'csv':
                df = pd.read_csv(self.file)
                return df, list(df.iterrows())
            elif self.file_extension == 'xlsx':
                df = pd.read_excel(self.file)
                return df, list(df.iterrows())
            elif self.file_extension == 'json':
                file_contents = read_json(self.file)
                return file_contents, [file_contents]
            elif self.file_extension == 'xml':
                file_contents = read_xml(self.file)
                return file_contents, [file_contents]
            else:
                raise ValueError(f'Unknown file type with extension {self.file_extension}')

        # Directory: collect all files; sorting keeps the instance order reproducible.
        files: List[Path] = sorted(entry for entry in self.path.iterdir() if entry.is_file())
        file_extension = sorted({entry.suffix[1:].lower() for entry in files})
        if len(file_extension) > 1:
            raise ValueError(f"Cannot read files of different types: {file_extension}")
        elif len(file_extension) == 0:
            raise ValueError(f"No files found in the directory specified: {self.path}")

        self.handle_file_extension(file_extension[0])

        if self.file_extension == 'json':
            jsons = [read_json(entry) for entry in files]
            return jsons, jsons
        elif self.file_extension == 'xml':
            xmls = [read_xml(entry) for entry in files]
            return xmls, xmls
        else:
            raise ValueError(f"File extension {file_extension} not recognized or not supported for reading files "
                             f"from a directory. Specified directory: {self.path}. Extensions found: "
                             f"{file_extension}")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
import json | ||
from io import IOBase | ||
from pathlib import Path | ||
from typing import Union, Dict | ||
|
||
|
||
def read_json(path: Union[str, Path, IOBase], encoding: str = 'utf-8') -> Dict:
    """Read JSON from a file path or an already opened buffer.

    :param path: a `str` or :class:`Path` pointing to a local file, or an :class:`IOBase` buffer to read from
    :param encoding: encoding used when opening a file path; ignored for buffers. Default is 'utf-8'.
    :return: the parsed JSON content
    :raises ValueError: if `path` is neither a `str`, a :class:`Path` nor an :class:`IOBase`
    """
    if isinstance(path, (str, Path)):
        # Files opened here are closed again as soon as parsing is done.
        with open(path, 'r', encoding=encoding) as f:
            return json.load(f)

    if isinstance(path, IOBase):
        # The caller owns the buffer, so it is left open.
        return json.load(path)

    raise ValueError(f"Invalid input type {type(path)}.")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
from io import IOBase | ||
from pathlib import Path | ||
from typing import Union, Dict | ||
import xmltodict | ||
|
||
|
||
def read_xml(path: Union[str, Path, IOBase], encoding='utf-8') -> Dict:
    """Read XML from a file path or an open buffer and return it as a dictionary.

    :param path: a `str` or :class:`Path` pointing to a local file, or an :class:`IOBase` buffer
    :param encoding: encoding used when opening a file path. Default is 'utf-8'.
    :return: the dictionary produced by :func:`parse_xml`
    :raises ValueError: if `path` is neither a `str`, a :class:`Path` nor an :class:`IOBase`
    """
    source = Path(path) if isinstance(path, str) else path

    # Buffers are handed to the parser directly.
    if isinstance(source, IOBase):
        return parse_xml(source)

    if not isinstance(source, Path):
        raise ValueError(f"Invalid input type {type(path)}.")

    with open(source, 'r', encoding=encoding) as handle:
        return parse_xml(handle)
|
||
|
||
def _post_process_xml_dict(dict_: Dict) -> Dict: | ||
def parse_primitive_value(value: str): | ||
if value.isdigit(): | ||
return int(value) | ||
elif value.lower() == "true": | ||
return True | ||
elif value.lower() == "false": | ||
return False | ||
else: | ||
try: | ||
return float(value) | ||
except ValueError: | ||
pass | ||
return value | ||
|
||
for k, v in dict_.items(): | ||
print(f"{k=}, {type(k)=}, {v=}, {type(v)=}") | ||
if isinstance(v, dict): | ||
if v == {'@xsi:nil': 'true'}: # resolves <null xsi:nil="true"/> | ||
dict_[k] = None | ||
else: | ||
dict_[k] = _post_process_xml_dict(v) | ||
elif isinstance(v, list): | ||
list_ = [] | ||
print(f"{v=}") | ||
for i, item in enumerate(v): | ||
print(f"{item=}, {type(item)=}") | ||
if isinstance(item, dict): | ||
list_.append(_post_process_xml_dict(item)) | ||
else: | ||
list_.append(parse_primitive_value(item)) | ||
dict_[k] = list_ | ||
elif isinstance(v, str): | ||
dict_[k] = parse_primitive_value(v) | ||
|
||
return dict_ | ||
|
||
|
||
def parse_xml(file: IOBase) -> Dict:
    """Parse an XML file into a dictionary with inferred types.

    :param file: buffer whose full content is read and parsed with `xmltodict`
    :return: the parsed dictionary after post-processing by `_post_process_xml_dict`
    """
    dict_ = xmltodict.parse(file.read())
    return _post_process_xml_dict(dict_)
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
"""This module includes the validate for mapping data to phenopackets.""" | ||
|
||
from .validate import validate, read_validate | ||
|
||
# Names re-exported from the validate module.
__all__ = [
    "validate",
    "read_validate",
]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
from io import StringIO | ||
|
||
import pytest | ||
|
||
from phenopacket_mapper.utils.io import read_json | ||
|
||
|
||
@pytest.mark.parametrize(
    "inp,expected",
    [
        ('{"string": "Hello World"}', {"string": "Hello World"}),
        ('{"object": {"a": "b","c": "d"}}', {"object": {"a": "b", "c": "d"}}),
        ('{"number": 123}', {"number": 123}),
        ('{"number": -123}', {"number": -123}),
        ('{"number": 123.4}', {"number": 123.4}),
        ('{"null": null}', {"null": None}),
        ('{"color": "gold"}', {"color": "gold"}),
        ('{"boolean": true}', {"boolean": True}),
        ('{"boolean": false}', {"boolean": False}),
        ('{"array": [1,2,3]}', {"array": [1, 2, 3]}),
        # one document combining every value kind from the cases above
        (
            '{"array": [1,2,3],"boolean": true, "color": "gold","null": null,"number": 123, "object": {"a": "b","c": "d"}, "string": "Hello World"}',
            {
                "array": [1, 2, 3],
                "boolean": True,
                "color": "gold",
                "null": None,
                "number": 123,
                "object": {"a": "b", "c": "d"},
                "string": "Hello World",
            },
        ),
    ],
)
def test_read_json(inp, expected):
    """read_json parses JSON text supplied through an in-memory text buffer."""
    buffer = StringIO(inp)
    assert read_json(buffer) == expected
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
from phenopacket_mapper.utils.io import read_xml | ||
|
||
import pytest | ||
from io import StringIO | ||
|
||
@pytest.mark.parametrize(
    "inp,expected",
    [
        ('<string>Hello World</string>', {"string": "Hello World"}),
        ('<object><a>b</a><c>d</c></object>', {"object": {"a": "b", "c": "d"}}),
        ('<number>123</number>', {"number": 123}),
        ('<number>-123</number>', {"number": -123}),
        ('<number>123.4</number>', {"number": 123.4}),
        ('<null></null>', {"null": None}),  # empty tag
        ('<null />', {"null": None}),  # empty tag
        ('<null xsi:nil="true"/>', {"null": None}),  # explicit null
        ('<color>gold</color>', {"color": "gold"}),
        ('<boolean>true</boolean>', {"boolean": True}),
        ('<boolean>false</boolean>', {"boolean": False}),
        (
            '<array><item>1</item><item>2</item><item>3</item></array>',
            {"array": {"item": [1, 2, 3]}},
        ),
        # one document combining the value kinds from the cases above
        (
            '<root>'
            '<array><item>1</item><item>2</item><item>3</item></array>'
            '<boolean>true</boolean>'
            '<color>gold</color>'
            '<number>123</number>'
            '<object><a>b</a><c>d</c></object>'
            '<string>Hello World</string>'
            '</root>',
            {
                "root": {
                    "array": {"item": [1, 2, 3]},
                    "boolean": True,
                    "color": "gold",
                    "number": 123,
                    "object": {"a": "b", "c": "d"},
                    "string": "Hello World",
                }
            },
        ),
    ],
)
def test_read_xml(inp, expected):
    """read_xml parses XML text from an in-memory buffer and infers value types."""
    buffer = StringIO(inp)
    assert read_xml(buffer) == expected