Add dataframe columns / Filter Fhir elements by FhirPath #105

Status: Open. Wants to merge 7 commits into base: develop.
25 changes: 24 additions & 1 deletion README.md
@@ -67,7 +67,10 @@ print(df.info())
```

## Columns
* see df.columns

Because the columns are prepopulated automatically, in most cases you do not have to define mappings from FHIR elements to dataframe columns manually in order to analyse your FHIR resources.

See `df.columns`:

```
patientId
@@ -82,6 +85,26 @@ resource.gender
...
```

### Add columns by filtering FHIR elements or values by FHIRPath (optional)

To add extra pandas dataframe columns via [FHIRPath](http://hl7.org/fhir/fhirpath.html), for example to filter FHIR elements such as codings by certain code system(s), map FHIRPath expressions to (custom) column names with the parameter `columns_by_fhirpaths` (implemented for the import of FHIR bundle files and FHIR search results, but not yet for the ndjson format). The expressions must be supported by the FHIRPath implementation [`fhirpathpy`](https://github.com/beda-software/fhirpath-py).

#### Example: Additional column containing only the SNOMED coding

This example adds a dataframe column named `code_snomed` containing only the SNOMED coding from the FHIR element `code`, even if FHIR resources carry multiple codings with different code systems in that element:

```python
import fhiry.parallel as fp

mappings_columns_by_fhirpaths = {
"code_snomed": "code.coding.where(system = 'http://snomed.info/sct').code",
}

df = fp.process('/path/to/fhir/resources', columns_by_fhirpaths = mappings_columns_by_fhirpaths)

print(df.info())
```
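Under the hood, the FHIRPath expression above behaves like the following plain-Python filter (an illustrative sketch only, not the `fhirpathpy` implementation; the helper `extract_codes` and the sample resource are hypothetical):

```python
def extract_codes(resource, system):
    # Mimic code.coding.where(system = <system>).code:
    # collect the codes of all codings that match the given code system.
    codings = resource.get("code", {}).get("coding", [])
    return [c["code"] for c in codings if c.get("system") == system]

# Hypothetical Condition resource with two codings in the element "code"
condition = {
    "resourceType": "Condition",
    "code": {"coding": [
        {"system": "http://snomed.info/sct", "code": "162864005"},
        {"system": "http://fhir.de/CodeSystem/bfarm/icd-10-gm", "code": "A04.0"},
    ]},
}

print(extract_codes(condition, "http://snomed.info/sct"))  # → ['162864005']
```

Like `fhirpathpy.evaluate`, this returns a list, which is why the resulting dataframe cells hold lists such as `['162864005']`.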

### [Documentation](https://dermatologist.github.io/fhiry/)

## Contributors
33 changes: 31 additions & 2 deletions fhir-search.md
@@ -4,7 +4,7 @@ Import resources from [FHIR Search API](https://www.hl7.org/fhir/search.html) re

## FHIR search query parameters

For filter options you can set by `search_parameters` see [FHIR search common parameters for all resource types](https://www.hl7.org/fhir/search.html#standard) and additional FHIR search parameters for certain resource types like [Patient](https://www.hl7.org/fhir/patient.html#search), [Condition](https://www.hl7.org/fhir/condition.html#search), [Observation](https://www.hl7.org/fhir/observation.html#search), ...
For filter options that you can set via `search_parameters`, see the [standard FHIR search](https://www.hl7.org/fhir/search.html) parameters, i.e. the [common parameters for all resource types](https://www.hl7.org/fhir/search.html#standard) and the additional search parameters for specific resource types such as [Patient](https://www.hl7.org/fhir/patient.html#search), [Condition](https://www.hl7.org/fhir/condition.html#search), [Observation](https://www.hl7.org/fhir/observation.html#search), ...
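A `search_parameters` dict simply maps FHIR search parameter names to values; the parameter values below are hypothetical and only illustrate the format:

```python
from urllib.parse import urlencode

# Hypothetical filter: active conditions recorded on or after 2022-01-01
my_fhir_search_parameters = {
    "clinical-status": "active",
    "recorded-date": "ge2022-01-01",
}

# On the wire these become an ordinary FHIR search query string:
print(urlencode(my_fhir_search_parameters))
# clinical-status=active&recorded-date=ge2022-01-01
```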

## Example: Import all observations from FHIR server

@@ -39,7 +39,36 @@ print(df.info())
```

## Columns
* see [`df.columns`](README.md#columns)

Because the columns are prepopulated automatically, in most cases you do not have to define mappings from FHIR elements to dataframe columns manually in order to analyse your FHIR resources.

See [`df.columns`](README.md#columns)

### Add columns by filtering FHIR elements or values by FHIRPath (optional)

To add extra pandas dataframe columns via [FHIRPath](http://hl7.org/fhir/fhirpath.html), for example to filter FHIR elements such as codings by certain code system(s), map FHIRPath expressions to (custom) column names with the parameter `columns_by_fhirpaths`. The expressions must be supported by the FHIRPath implementation [`fhirpathpy`](https://github.com/beda-software/fhirpath-py).

#### Example: Additional column containing only the SNOMED coding

This example adds a dataframe column named `code_snomed` containing only the SNOMED coding from the FHIR element `code`, even if FHIR resources carry multiple codings with different code systems in that element:

```python
from fhiry.fhirsearch import Fhirsearch

fs = Fhirsearch(fhir_base_url = "http://fhir-server:8080/fhir")

my_fhir_search_parameters = {
}

mappings_columns_by_fhirpaths = {
"code_snomed": "code.coding.where(system = 'http://snomed.info/sct').code",
}

df = fs.search(resource_type = "Condition", search_parameters = my_fhir_search_parameters, columns_by_fhirpaths = mappings_columns_by_fhirpaths)

print(df.info())
```

## Connection settings

4 changes: 4 additions & 0 deletions requirements.txt
@@ -4,6 +4,10 @@
#
# pip-compile
#
antlr4-python3-runtime==4.8
# via fhirpathpy
fhirpathpy==0.1.1
# via fhiry (setup.py)
numpy==1.23.5
# via pandas
pandas==1.5.2
1 change: 1 addition & 0 deletions setup.cfg
@@ -51,6 +51,7 @@ package_dir =
# For more information, check out https://semver.org/.
install_requires =
importlib-metadata; python_version<"3.8"
fhirpathpy
pandas

[options.packages.find]
9 changes: 5 additions & 4 deletions src/fhiry/fhirsearch.py
@@ -20,7 +20,7 @@ def __init__(self, fhir_base_url):
# SSL Certificates: https://requests.readthedocs.io/en/latest/user/advanced/#ssl-cert-verification
self.requests_kwargs = {}

def search(self, resource_type="Patient", search_parameters={}):
def search(self, resource_type="Patient", search_parameters={}, columns_by_fhirpaths={}):

headers = {"Content-Type": "application/fhir+json"}

@@ -33,15 +33,15 @@ def search(self, resource_type="Patient", search_parameters={}):
bundle_dict = r.json()

if 'entry' in bundle_dict:
df = process_bundle(bundle_dict)
df = process_bundle(bundle_dict, columns_by_fhirpaths=columns_by_fhirpaths)

next_page_url = get_next_page_url(bundle_dict)

while next_page_url:
r = requests.get(next_page_url, headers=headers, **self.requests_kwargs)
r.raise_for_status()
bundle_dict = r.json()
df_page = process_bundle(bundle_dict)
df_page = process_bundle(bundle_dict, columns_by_fhirpaths=columns_by_fhirpaths)
df = pd.concat([df, df_page])

next_page_url = get_next_page_url(bundle_dict)
@@ -51,8 +51,9 @@ def search(self, resource_type="Patient", search_parameters={}):
return df


def process_bundle(bundle_dict):
def process_bundle(bundle_dict, columns_by_fhirpaths={}):
f = Fhiry()
f.columns_by_fhirpaths = columns_by_fhirpaths
f.process_bundle_dict(bundle_dict)
return f.df

35 changes: 30 additions & 5 deletions src/fhiry/fhiry.py
@@ -9,14 +9,16 @@
import pandas as pd
import json
import os

import fhirpathpy

class Fhiry(object):
def __init__(self):
self._df = None
self._filename = ""
self._folder = ""

self._columns_by_fhirpaths = {}

# Codes from the FHIR datatype "coding"
# (e.g. element resource.code.coding or element resource.clinicalStatus.coding)
# are extracted to a column "codingcodes"
@@ -42,6 +44,10 @@ def filename(self):
def folder(self):
return self._folder

@property
def columns_by_fhirpaths(self):
return self._columns_by_fhirpaths

@property
def delete_col_raw_coding(self):
return self._delete_col_raw_coding
@@ -59,14 +65,33 @@ def folder(self, folder):
def delete_col_raw_coding(self, delete_col_raw_coding):
self._delete_col_raw_coding = delete_col_raw_coding

@columns_by_fhirpaths.setter
def columns_by_fhirpaths(self, columns_by_fhirpaths):
self._columns_by_fhirpaths = columns_by_fhirpaths

def read_bundle_from_file(self, filename):
with open(filename, 'r') as f:
json_in = f.read()
json_in = json.loads(json_in)
return pd.json_normalize(json_in['entry'])
bundle_dict = json.load(f)

df = self.read_bundle_from_bundle_dict(bundle_dict)

return df

def read_bundle_from_bundle_dict(self, bundle_dict):
return pd.json_normalize(bundle_dict['entry'])

# Flatten nested object structure to flat table structure
df = pd.json_normalize(bundle_dict['entry'])

# If (optional/additional) FHIRPath-to-dataframe-column mappings are set,
# add the values evaluated from the FHIR paths to the mapped columns
for col, fhirpath in self._columns_by_fhirpaths.items():
df[col] = None
i = 0
for entry in bundle_dict['entry']:
df.at[i, col] = fhirpathpy.evaluate(entry['resource'], path=fhirpath)
i += 1

return df

def delete_unwanted_cols(self):
if 'resource.text.div' in self._df.columns:
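The new column-filling loop in `read_bundle_from_bundle_dict` can be illustrated in isolation. This sketch uses a dummy extractor in place of `fhirpathpy.evaluate` (the sample entries are hypothetical):

```python
import pandas as pd

entries = [
    {"resource": {"id": "c1", "code": "x"}},
    {"resource": {"id": "c2", "code": "y"}},
]

# Flatten the nested bundle entries, as fhiry does with pd.json_normalize
df = pd.json_normalize(entries)

# Fill a new column row by row; fhiry stores the list returned by
# fhirpathpy.evaluate here, so one list per cell is intentional.
df["code_snomed"] = None
for i, entry in enumerate(entries):
    df.at[i, "code_snomed"] = [entry["resource"]["code"]]

print(df["code_snomed"].tolist())  # [['x'], ['y']]
```

Initialising the column with `None` makes its dtype `object`, which allows `df.at` to store a whole list in a single cell.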
12 changes: 8 additions & 4 deletions src/fhiry/parallel.py
@@ -4,8 +4,9 @@
import pandas as pd


def process_file(file):
def process_file(file, columns_by_fhirpaths={}):
f = Fhiry()
f.columns_by_fhirpaths = columns_by_fhirpaths
return f.process_file(file)


@@ -14,16 +15,19 @@ def process_ndjson(file):
return f.process_file(file)


def process(folder):
def process(folder, columns_by_fhirpaths={}):

pool = mp.Pool(mp.cpu_count())

filenames = []
for filename in os.listdir(folder):
if filename.endswith(".json"):
filenames.append(folder + '/' + filename)
# tuple with both arguments of process_file(file, columns_by_fhirpaths)
process_file_args = (folder + '/' + filename, columns_by_fhirpaths)
# append to the list of argument tuples for starmap
filenames.append(process_file_args)

list_of_dataframes = pool.map(process_file, filenames)
list_of_dataframes = pool.starmap(process_file, filenames)
pool.close()
return pd.concat(list_of_dataframes)

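`pool.starmap` unpacks each tuple into the worker's positional arguments; `itertools.starmap` has the same calling convention without spawning processes, so the change can be illustrated with a stand-in worker (file names and mapping below are hypothetical):

```python
from itertools import starmap

def process_file(file, columns_by_fhirpaths):
    # Stand-in worker: report the file name and the number of mappings
    return (file, len(columns_by_fhirpaths))

args = [
    ("bundle1.json", {"code_snomed": "code.coding.where(...).code"}),
    ("bundle2.json", {}),
]
print(list(starmap(process_file, args)))
# [('bundle1.json', 1), ('bundle2.json', 0)]
```

`pool.map` would pass each tuple as a single argument, which is why the switch to `starmap` was needed once the worker gained a second parameter.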
9 changes: 8 additions & 1 deletion tests/test_fhirsearch.py
@@ -42,7 +42,11 @@ def test_fhirsearch():
fs = Fhirsearch(fhir_base_url="http://fhir-server/fhir")
fs.page_size = 2

df = fs.search(resource_type="Condition", search_parameters={})
my_columns_by_fhirpaths = {
"snomed_code": "code.coding.where(system = 'http://snomed.info/sct').code",
"icd10gm_code": "code.coding.where(system = 'http://fhir.de/CodeSystem/bfarm/icd-10-gm').code",
}
df = fs.search(resource_type="Condition", search_parameters={}, columns_by_fhirpaths=my_columns_by_fhirpaths)

# resulting df must include all 5 condition resources (processed from all three mocked search results pages)
assert len(df) == 5
@@ -56,3 +60,6 @@

# There is no resource with code A05.0 in the FHIR search results
assert len(df[df['resource.code.codingcodes'].astype('string') == "['A05.0']"]) == 0

# Test that the additional column "icd10gm_code" was extracted by the mapped FHIRPath
assert len(df[df['icd10gm_code'].astype('string') == "['A04.0']"]) == 1
50 changes: 50 additions & 0 deletions tests/test_fhiry.py
@@ -41,3 +41,53 @@ def test_process_parallel(capsys):
print(df.info())
captured = capsys.readouterr()
assert '1194' in captured.out


def test_columns_by_fhirpaths(f, capsys):

# set a mapping to extract the SNOMED code (of a condition) by FHIRPath to a new column "code_snomed"
f.columns_by_fhirpaths = {
"code_snomed": "code.coding.where(system = 'http://snomed.info/sct').code",
}

f.filename = resource_filename(__name__, 'resources') + '/afhir.json'
f.process_df()

# filter df by the resource id of a condition
testcondition = f.df[f.df['resource.id'] == "5dcd2d71-207e-46e9-b948-f5c9121580dd"]
assert len(testcondition) == 1

# the condition still has its index number from the unfiltered dataframe,
# so reset the index so that the (only) condition/row in the filtered
# dataframe can be accessed by index number 0
testcondition = testcondition.reset_index(drop=True)

# was a new column "code_snomed" created in which the correct SNOMED code
# of this condition was extracted by the mapped FHIRPath?
assert testcondition.at[0, 'code_snomed'] == ['162864005']

def test_process_parallel_with_columns_by_fhirpaths(capsys):
folder = resource_filename(__name__, 'resources')
import src.fhiry.parallel as fp

# set a mapping to extract the SNOMED code (of a condition) by FHIRPath to a new column "code_snomed"
my_columns_by_fhirpaths = {
"code_snomed": "code.coding.where(system = 'http://snomed.info/sct').code",
}

df = fp.process(folder, columns_by_fhirpaths=my_columns_by_fhirpaths)
print(df.info())
captured = capsys.readouterr()
assert '1194' in captured.out

# filter df by the resource id of a condition
testcondition = df[df['resource.id'] == "5dcd2d71-207e-46e9-b948-f5c9121580dd"]

# the condition still has its index number from the unfiltered dataframe,
# so reset the index so that the (only) condition/row in the filtered
# dataframe can be accessed by index number 0
testcondition = testcondition.reset_index(drop=True)

# was a new column "code_snomed" created in which the correct SNOMED code
# of this condition was extracted by the mapped FHIRPath?
assert testcondition.at[0, 'code_snomed'] == ['162864005']