Skip to content

Commit

Permalink
Merge pull request #174 from BIH-CEI/156-add-util-method-that-loads-d…
Browse files Browse the repository at this point in the history
…ata-in-whatever-format-and-provides-an-iterator-for-instances

156 add util method that loads data in whatever format and provides an iterator for instances
frehburg authored Oct 15, 2024
2 parents f7471b4 + 55ad3f1 commit cb9221f
Showing 20 changed files with 311 additions and 181 deletions.
9 changes: 8 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -27,7 +27,14 @@ classifiers = [
"Topic :: Scientific/Engineering :: Machine Learning"
]
dependencies = [
"loguru", "phenopackets", "pandas", "openpyxl", "jupyter", "requests", "bs4",
"loguru",
"phenopackets",
"pandas",
"openpyxl",
"jupyter",
"requests",
"bs4",
"xmltodict==0.14.1",
]
dynamic = ["version"]

14 changes: 11 additions & 3 deletions src/phenopacket_mapper/__init__.py
Original file line number Diff line number Diff line change
@@ -2,8 +2,16 @@

__version__ = "0.0.1"

from . import cli, data_standards, pipeline, preprocessing, api_requests
from . import data_standards, validate, preprocessing, api_requests, mapping, utils

from .pipeline import PhenopacketMapper
from .data_standards import DataModel
from .mapping import PhenopacketMapper

__all__ = ["cli", "data_standards", "pipeline", "PhenopacketMapper", "preprocessing", "api_requests"]
__all__ = [
"data_standards", "DataModel",
"validate",
"preprocessing",
"api_requests",
"mapping", "PhenopacketMapper",
"utils",
]
1 change: 0 additions & 1 deletion src/phenopacket_mapper/cli/__init__.py

This file was deleted.

50 changes: 0 additions & 50 deletions src/phenopacket_mapper/cli/main.py

This file was deleted.

32 changes: 0 additions & 32 deletions src/phenopacket_mapper/cli/mapping_command.py

This file was deleted.

50 changes: 0 additions & 50 deletions src/phenopacket_mapper/cli/quickstart_command.py

This file was deleted.

19 changes: 0 additions & 19 deletions src/phenopacket_mapper/cli/validate_command.py

This file was deleted.

3 changes: 2 additions & 1 deletion src/phenopacket_mapper/data_standards/data_model.py
Original file line number Diff line number Diff line change
@@ -256,6 +256,7 @@ def load_data(
:param kwargs: Dynamically passed parameters that match {id}_column for each item
:return: A list of `DataModelInstance` objects
"""
# TODO: move the dynamic params to the load method in utils.io
column_names = dict()
for f in self.fields:
column_param = f"{f.id}_column"
@@ -264,7 +265,7 @@ def load_data(
else:
column_names[f.id] = kwargs[column_param]

from phenopacket_mapper.pipeline import load_data_using_data_model
from phenopacket_mapper.utils.io import load_data_using_data_model
return load_data_using_data_model(
path=path,
data_model=self,
12 changes: 0 additions & 12 deletions src/phenopacket_mapper/pipeline/__init__.py

This file was deleted.

19 changes: 19 additions & 0 deletions src/phenopacket_mapper/utils/io/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
"""This module handles the input and output of data."""

from .read_json import read_json
from .read_xml import read_xml, parse_xml
from .data_reader import DataReader
from .input import read_data_model, read_phenopackets, read_phenopacket_from_json, load_data_using_data_model
from .output import write

__all__ = [
'read_json',
'read_xml', 'parse_xml',
'DataReader',
'read_data_model',
'read_phenopackets',
'read_phenopacket_from_json',
'load_data_using_data_model',

'write',
]
111 changes: 111 additions & 0 deletions src/phenopacket_mapper/utils/io/data_reader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
from pathlib import Path
from typing import Union, Tuple, List, Iterable, Literal, Dict
from io import IOBase, TextIOWrapper, BytesIO, BufferedIOBase, TextIOBase

import pandas as pd

from phenopacket_mapper.utils.io import read_json, read_xml


class DataReader:
    """Reads data of a supported format ('csv', 'xlsx', 'json', 'xml') into memory.

    Accepts a path to a single file, a path to a directory of same-typed files,
    or an already-open buffer. After construction, the parsed content is
    available as ``self.data`` and a row-/document-wise iterable representation
    as ``self.iterable``.
    """

    def __init__(
            self,
            file: Union[str, Path, IOBase, List[str], List[Path], List[IOBase]],
            encoding: str = 'utf-8',
            file_extension: Literal['csv', 'xlsx', 'json', 'xml'] = None
    ):
        """Initializes the data reader.

        :param file: a `str`, :class:`Path` or :class:`IOBase` to read from. If `str` or
            :class:`Path`, the input is interpreted as a path to a local file or directory.
        :param encoding: The encoding to use when reading the file. Default is 'utf-8'.
        :param file_extension: The file extension of the file to read. If `None`, the extension
            is inferred from the file path; it must be passed explicitly for buffers.
            Default is `None`.
        :raises FileNotFoundError: if a :class:`Path` input does not exist.
        :raises ValueError: on an unrecognized extension, a buffer without an explicit
            extension, or an unsupported ``file`` type.
        """
        # TODO: fix read xml
        # TODO: add option to pass a list of files to read
        self.is_dir = False
        self.file_extension = None

        if isinstance(file, str):
            file = Path(file)  # normalize: strings are interpreted as local paths

        if isinstance(file, Path):
            if not file.exists():
                raise FileNotFoundError(f"File {file} does not exist.")
            self.path = file
            if file.is_dir():
                self.is_dir = True
            else:
                # NOTE(review): this handle is consumed by _read but never closed explicitly
                self.file = open(self.path, "r", encoding=encoding)
                if file_extension is None:  # extract the file extension from the file path
                    file_extension = self.path.suffix[1:]
                self.handle_file_extension(file_extension)
        elif isinstance(file, IOBase):
            if isinstance(file, (TextIOWrapper, TextIOBase)):
                # fix: this branch previously never assigned self.file, so every
                # text-stream input crashed later in _read with AttributeError
                self.file = file
            elif isinstance(file, (BytesIO, BufferedIOBase)):
                self.file = TextIOWrapper(file, encoding=encoding)

            if file_extension is None:
                raise ValueError("File extension must be provided when passing a file buffer.")
            self.handle_file_extension(file_extension)
        else:
            # fix: previously fell through silently and crashed later in _read
            raise ValueError(f"Invalid input type {type(file)}.")

        self.data, self.iterable = self._read()

    def handle_file_extension(self, fe: str):
        """Stores the normalized (lower-cased) extension, raising ValueError if unsupported."""
        if fe.lower() in ('csv', 'xlsx', 'json', 'xml'):
            self.file_extension = fe.lower()
        else:
            raise ValueError(f"File extension {fe} not recognized.")

    def _read(self) -> Tuple[Union[pd.DataFrame, List, Dict], Iterable]:
        """Reads the data from ``self.file`` (or all files in ``self.path`` for a directory).

        :return: The data and an iterable representation of the data.
        :raises ValueError: for unsupported extensions or an empty / mixed-type directory.
        """
        if not self.is_dir:
            if self.file_extension == 'csv':
                df = pd.read_csv(self.file)
                return df, list(df.iterrows())
            elif self.file_extension == 'xlsx':
                # NOTE(review): xlsx is binary; a handle opened in text mode likely
                # breaks pd.read_excel — confirm with a real workbook
                df = pd.read_excel(self.file)
                return df, list(df.iterrows())
            elif self.file_extension == 'json':
                return (file_contents := read_json(self.file)), [file_contents]
            elif self.file_extension == 'xml':
                return (file_contents := read_xml(self.file)), [file_contents]
            else:
                raise ValueError(f'Unknown file type with extension {self.file_extension}')
        else:
            # collect list of all files in the folder
            files: List[Path] = [f for f in self.path.iterdir() if f.is_file()]
            extensions = list(set(f.suffix[1:] for f in files))
            if len(extensions) > 1:
                raise ValueError(f"Cannot read files of different types: {extensions}")
            elif len(extensions) == 0:
                # fix: message previously referenced self.file, which is unset for directories
                raise ValueError(f"No files found in the directory specified: {self.path}")

            self.handle_file_extension(extensions[0])

            if self.file_extension == 'json':
                jsons = [read_json(f) for f in files]
                return jsons, jsons
            elif self.file_extension == 'xml':
                xmls = [read_xml(f) for f in files]
                return xmls, xmls
            else:
                raise ValueError(f"File extension {extensions} not recognized or not supported for reading files "
                                 f"from a directory. Specified directory: {self.path}. Extensions found: "
                                 f"{extensions}")
Original file line number Diff line number Diff line change
@@ -12,6 +12,7 @@
DataSet
from phenopacket_mapper.utils import loc_default
from phenopacket_mapper.utils import parsing
from phenopacket_mapper.utils.io.data_reader import DataReader
from phenopacket_mapper.utils.parsing import parse_ordinal


@@ -165,13 +166,11 @@ def load_data_using_data_model(
else:
raise ValueError(f'Path must be a string or Path object, not {type(path)}')

file_extension = path.suffix[1:]
if file_extension == 'csv':
df = pd.read_csv(path)
elif file_extension == 'xlsx':
df = pd.read_excel(path)
else:
raise ValueError(f'Unknown file type with extension {file_extension}')
dr = DataReader(path)
data, data_iterable = dr.data, dr.iterable

# TODO: for the moment assume that the data is a pandas DataFrame
df = data

# check column_names is in the correct format
if isinstance(column_names, MappingProxyType):
@@ -226,15 +225,15 @@ def read_phenopackets(dir_path: Path) -> List[Phenopacket]:
return phenopackets_list


def read_phenopacket_from_json(file_path: Union[str, Path]) -> Phenopacket:
def read_phenopacket_from_json(path: Union[str, Path]) -> Phenopacket:
"""Reads a Phenopacket from a JSON file.
:param file_path: The path to the JSON file.
:type file_path: Union[str, Path]
:param path: The path to the JSON file.
:type path: Union[str, Path]
:return: The loaded Phenopacket.
:rtype: Phenopacket
"""
with open(file_path, 'r') as fh:
with open(path, 'r') as fh:
json_data = fh.read()
phenopacket = Phenopacket()
Parse(json_data, phenopacket)
File renamed without changes.
18 changes: 18 additions & 0 deletions src/phenopacket_mapper/utils/io/read_json.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import json
from io import IOBase
from pathlib import Path
from typing import Union, Dict


def read_json(path: Union[str, Path, IOBase]) -> Dict:
    """Reads a JSON document into a dictionary.

    :param path: A path (``str`` or :class:`Path`) to a local JSON file, or an
        already-open readable buffer.
    :return: The parsed JSON content as a dictionary.
    :raises ValueError: If ``path`` is not a ``str``, :class:`Path`, or :class:`IOBase`.
    """
    if isinstance(path, str):
        path = Path(path)

    if isinstance(path, Path):
        # fix: close the file handle after reading (it was previously leaked)
        with open(path) as f:
            return json.load(f)
    elif isinstance(path, IOBase):
        return json.load(path)
    else:
        raise ValueError(f"Invalid input type {type(path)}.")
64 changes: 64 additions & 0 deletions src/phenopacket_mapper/utils/io/read_xml.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
from io import IOBase
from pathlib import Path
from typing import Union, Dict
import xmltodict


def read_xml(path: Union[str, Path, IOBase], encoding='utf-8') -> Dict:
    """Reads an XML document from a path or an open buffer into a dictionary.

    :param path: A path (``str`` or :class:`Path`) to a local XML file, or an
        already-open readable buffer.
    :param encoding: Text encoding used when opening a file by path.
    :return: The parsed XML content with inferred primitive types.
    :raises ValueError: If ``path`` is not a ``str``, :class:`Path`, or :class:`IOBase`.
    """
    if isinstance(path, str):
        path = Path(path)

    # Guard-clause style: handle buffers first, then paths, then fail.
    if isinstance(path, IOBase):
        return parse_xml(path)
    if isinstance(path, Path):
        with open(path, 'r', encoding=encoding) as fh:
            return parse_xml(fh)
    raise ValueError(f"Invalid input type {type(path)}.")


def _post_process_xml_dict(dict_: Dict) -> Dict:
    """Recursively converts the string values of an xmltodict result to Python types.

    Strings are parsed to ``int``, ``bool``, or ``float`` where possible; nested
    dicts and lists are processed recursively; the xsi:nil idiom
    (``{'@xsi:nil': 'true'}``) is mapped to ``None``. Mutates and returns ``dict_``.
    """
    def parse_primitive_value(value):
        # xmltodict yields None for empty tags; pass it through unchanged
        if value is None:
            return None
        # fix: try int() directly so negative integers become int, not float
        try:
            return int(value)
        except ValueError:
            pass
        if value.lower() == "true":
            return True
        if value.lower() == "false":
            return False
        try:
            return float(value)
        except ValueError:
            return value

    # fix: removed leftover debug print statements
    for k, v in dict_.items():
        if isinstance(v, dict):
            if v == {'@xsi:nil': 'true'}:  # resolves <null xsi:nil="true"/>
                dict_[k] = None
            else:
                dict_[k] = _post_process_xml_dict(v)
        elif isinstance(v, list):
            dict_[k] = [
                _post_process_xml_dict(item) if isinstance(item, dict) else parse_primitive_value(item)
                for item in v
            ]
        elif isinstance(v, str):
            dict_[k] = parse_primitive_value(v)

    return dict_


def parse_xml(file: IOBase) -> Dict:
    """Parse an XML file into a dictionary with inferred types.

    :param file: An open, readable buffer containing the XML document.
    :return: The parsed document with string values converted to primitive types.
    """
    # fix: removed leftover debug print statements
    dict_ = xmltodict.parse(file.read())
    return _post_process_xml_dict(dict_)

8 changes: 8 additions & 0 deletions src/phenopacket_mapper/validate/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
"""This module includes the validate for mapping data to phenopackets."""

from .validate import validate, read_validate

__all__ = [
'validate',
'read_validate'
]
Original file line number Diff line number Diff line change
@@ -3,7 +3,7 @@

from phenopackets.schema.v2 import Phenopacket

from phenopacket_mapper.pipeline import read_phenopackets
from phenopacket_mapper.utils.io import read_phenopackets


class Validator:
Empty file.
25 changes: 25 additions & 0 deletions tests/utils/io/test_read_json.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from io import StringIO

import pytest

from phenopacket_mapper.utils.io import read_json


# Each case pairs a raw JSON string with the dict read_json must produce.
_READ_JSON_CASES = [
    ('{"string": "Hello World"}', {"string": "Hello World"}),
    ('{"object": {"a": "b","c": "d"}}', {"object": {"a": "b","c": "d"}}),
    ('{"number": 123}', {"number": 123}),
    ('{"number": -123}', {"number": -123}),
    ('{"number": 123.4}', {"number": 123.4}),
    ('{"null": null}', {"null": None}),
    ('{"color": "gold"}', {"color": "gold"}),
    ('{"boolean": true}', {"boolean": True}),
    ('{"boolean": false}', {"boolean": False}),
    ('{"array": [1,2,3]}', {"array": [1,2,3]}),
    ('{"array": [1,2,3],"boolean": true, "color": "gold","null": null,"number": 123, "object": {"a": "b","c": "d"}, "string": "Hello World"}', {"array": [1,2,3],"boolean": True, "color": "gold","null": None, "number": 123, "object": {"a": "b","c": "d"}, "string": "Hello World"}),
]


@pytest.mark.parametrize("raw,parsed", _READ_JSON_CASES)
def test_read_json(raw, parsed):
    """read_json parses JSON primitives, objects, arrays, and null from a text buffer."""
    assert read_json(StringIO(raw)) == parsed
34 changes: 34 additions & 0 deletions tests/utils/io/test_read_xml.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
from phenopacket_mapper.utils.io import read_xml

import pytest
from io import StringIO

# Each case pairs a raw XML string with the dict read_xml must produce,
# covering primitives, nil/empty tags, repeated elements, and nesting.
_READ_XML_CASES = [
    ('<string>Hello World</string>', {"string": "Hello World"}),
    ('<object><a>b</a><c>d</c></object>', {"object": {"a": "b", "c": "d"}}),
    ('<number>123</number>', {"number": 123}),
    ('<number>-123</number>', {"number": -123}),
    ('<number>123.4</number>', {"number": 123.4}),
    ('<null></null>', {"null": None}),  # empty tag
    ('<null />', {"null": None}),  # empty tag
    ('<null xsi:nil="true"/>', {"null": None}),  # explicit null
    ('<color>gold</color>', {"color": "gold"}),
    ('<boolean>true</boolean>', {"boolean": True}),
    ('<boolean>false</boolean>', {"boolean": False}),
    ('<array><item>1</item><item>2</item><item>3</item></array>', {"array": {"item": [1, 2, 3]}}),
    (
        '<root>'
        '<array><item>1</item><item>2</item><item>3</item></array>'
        '<boolean>true</boolean>'
        '<color>gold</color>'
        '<number>123</number>'
        '<object><a>b</a><c>d</c></object>'
        '<string>Hello World</string>'
        '</root>',
        {"root":{"array": {"item": [1, 2, 3]}, "boolean": True, "color": "gold", "number": 123, "object": {"a": "b", "c": "d"}, "string": "Hello World"}}
    ),
]


@pytest.mark.parametrize("raw,parsed", _READ_XML_CASES)
def test_read_xml(raw, parsed):
    """read_xml parses XML documents into dicts with inferred primitive types."""
    assert read_xml(StringIO(raw)) == parsed

0 comments on commit cb9221f

Please sign in to comment.