Address change in pandas 2.x value_counts(). #422

Merged: 2 commits, Aug 24, 2023
4 changes: 3 additions & 1 deletion .github/workflows/main.yml
@@ -88,4 +88,6 @@ jobs:
- name: run the test
env:
SIMPLE_ITK_MEMORY_CONSTRAINED_ENVIRONMENT: 1
run: pytest -v --tb=short -k "${{matrix.inputs}}" tests/test_notebooks.py::Test_notebooks::test_python_notebook
run: |
pytest -v --tb=short -k "${{matrix.inputs}}" tests/test_notebooks.py::Test_notebooks::test_python_notebook
pytest -v --tb=short tests/test_scripts.py
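Because the diff view above drops YAML indentation, here is a minimal sketch of how the updated step presumably reads in context (the exact nesting depth and the surrounding job keys are assumed). Switching `run:` to a literal block scalar lets the step execute the notebook tests and the new script tests in sequence:

```yaml
      - name: run the test
        env:
          SIMPLE_ITK_MEMORY_CONSTRAINED_ENVIRONMENT: 1
        run: |
          pytest -v --tb=short -k "${{matrix.inputs}}" tests/test_notebooks.py::Test_notebooks::test_python_notebook
          pytest -v --tb=short tests/test_scripts.py
```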
4 changes: 2 additions & 2 deletions Python/71_Trust_But_Verify.ipynb
@@ -6,7 +6,7 @@
"source": [
"# Trust but Verify - Inspection of Large Image Collections\n",
"\n",
"This notebook and accompanying [Python script](characterize_data.py) illustrate the use of SimpleITK as a tool for efficient data inspection on large image collections, as part of familiarizing oneself with the data and performing cleanup prior to its use in deep learning or any other supervised machine learning approach.\n",
"This notebook and accompanying [Python script](scripts/characterize_data.py) illustrate the use of SimpleITK as a tool for efficient data inspection on large image collections, as part of familiarizing oneself with the data and performing cleanup prior to its use in deep learning or any other supervised machine learning approach.\n",
"\n",
"The reasons for inspecting your data before using it include:\n",
"1. Identification of corrupt images.\n",
@@ -118,7 +118,7 @@
"source": [
"## Characterizing image set\n",
"\n",
"To characterize the image set we have written a [Python script](characterize_data.py) that you should run from the command line. This script is very flexible and allows you to robustly characterize your image set. Try the various options and learn more about your data. You'd be surprised how many times the data isn't what you thought it is when only relying on visual inspection. The script allows you to inspect your data both on a file by file basis and as DICOM series where an image (volume) is stored in multiple files.\n",
"To characterize the image set we have written a [Python script](scripts/characterize_data.py) that you should run from the command line. This script is very flexible and allows you to robustly characterize your image set. Try the various options and learn more about your data. You'd be surprised how many times the data isn't what you thought it is when only relying on visual inspection. The script allows you to inspect your data both on a file by file basis and as DICOM series where an image (volume) is stored in multiple files.\n",
"\n",
"File by file:\n",
"```\n",
Python/scripts/characterize_data.py
@@ -68,6 +68,11 @@
Additionally, minimal analysis of the raw information is performed:
1. If there are duplicate images these are reported in output_duplicates.csv.
2. Two figures: output_image_size_distribution.pdf and output_min_max_intensity_distribution.pdf

NOTE: For the same directory structure, the order of the rows in the output csv file will vary
across operating systems (order of files in the "files" column also varies). This is a consequence
of using os.walk to traverse the file system (internally os.walk uses os.scandir and that method's
documentation says "The entries are yielded in arbitrary order.").
"""


@@ -179,7 +184,9 @@ def inspect_single_file(file_name, imageIO="", meta_data_keys=[], external_progr
will be the file name (all other values will be either None or NaN).
"""
file_info = [None] * (9 + len(meta_data_keys) + len(external_programs))
file_info[0] = file_name
# Using a list so that the returned csv is consistent with the series-based analysis (an
# image is defined by multiple files).
file_info[0] = [file_name]
current_index = 1
try:
reader = sitk.ImageFileReader()
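One reason the list matters downstream (a sketch, not code from the PR; the file names and hash are made up): when a DataFrame cell holds a Python list, `to_csv` writes the list's repr as a string, which is exactly what the new test below undoes with `eval` before sorting the "files" column. `ast.literal_eval` would be a stricter way to parse the same string.

```python
import pandas as pd
from io import StringIO

# A "files" cell holding a list round-trips through csv as the string repr of that list.
df = pd.DataFrame({"files": [["b.dcm", "a.dcm"]], "MD5 intensity hash": ["abc123"]})
csv_text = df.to_csv(index=False)

df2 = pd.read_csv(StringIO(csv_text))
cell = df2["files"].iloc[0]   # the string "['b.dcm', 'a.dcm']", not a list
parsed = sorted(eval(cell))   # what the new test does to normalize the column
print(cell, parsed)
```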
@@ -241,7 +248,7 @@ def inspect_files(
if len(meta_data_keys) + len(external_programs) != len(additional_column_names):
raise ValueError("Number of additional column names does not match expected.")
column_names = [
"file name",
"files",
"MD5 intensity hash",
"image size",
"image spacing",
@@ -405,7 +412,7 @@ def inspect_series(root_dir, meta_data_keys=[], additional_column_names=[]):
return pd.DataFrame(res, columns=column_names)


def main(argv=None):
def characterize_data(argv=None):
parser = argparse.ArgumentParser()
parser.add_argument(
"root_of_data_directory", help="path to the topmost directory containing data"
@@ -483,7 +490,9 @@ def main(argv=None):
df["MD5 intensity hash"].dropna().value_counts().reset_index(name="count")
)
duplicates = df[
df["MD5 intensity hash"].isin(image_counts[image_counts["count"] > 1]["index"])
df["MD5 intensity hash"].isin(
image_counts[image_counts["count"] > 1]["MD5 intensity hash"]
)
].sort_values(by=["MD5 intensity hash"])
if not duplicates.empty:
duplicates.to_csv(
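This hunk is the heart of the PR: in pandas 2.x, `Series.value_counts()` returns a Series named "count" whose index is named after the original Series, so `reset_index()` no longer produces a column literally called "index". A minimal illustration with made-up data (the same renaming explains the later switch from `x="index"` to `x="image size"` in the bar-chart call):

```python
import pandas as pd

s = pd.Series(["a", "a", "b"], name="MD5 intensity hash")
counts = s.value_counts().reset_index(name="count")

# pandas 1.x: columns were ["index", "count"], so the hashes lived in counts["index"].
# pandas 2.x: columns are ["MD5 intensity hash", "count"], because the index of the
#             value_counts() result now carries the original Series name.
print(counts.columns.tolist())
```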
@@ -522,7 +531,7 @@ def main(argv=None):
ax.tick_params(axis="x", labelsize=fontsize_pt)
ax.xaxis.get_major_locator().set_params(integer=True)
ax = size_counts.plot.barh(
x="index",
x="image size",
y="count",
xlabel="image size",
ylabel="# of images",
@@ -564,8 +573,8 @@ def main(argv=None):
bbox_inches="tight",
)

sys.exit(0)
return 0


if __name__ == "__main__":
sys.exit(main())
sys.exit(characterize_data())
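Taken together, the rename from `main` to `characterize_data` and the switch from `sys.exit(0)` to `return 0` are what let the new test module import the entry point and call it in-process. A sketch of the resulting pattern (the argument list in the comment is a placeholder):

```python
import sys


def characterize_data(argv=None):
    # ... parse argv, characterize the data, write the csv/pdf outputs ...
    return 0  # callers such as tests see an ordinary return value


if __name__ == "__main__":
    # Only the command-line path converts the return value into a process exit code.
    sys.exit(characterize_data())

# In a test: assert characterize_data(["<data_dir>", "<output.csv>", "per_file"]) == 0
```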
100 changes: 100 additions & 0 deletions tests/test_scripts.py
@@ -0,0 +1,100 @@
import os

import pytest
import pathlib
import hashlib
import sys
import pandas as pd

# Add the script source directory to the path so that we can import characterize_data
sys.path.append(str(pathlib.Path(__file__).parent.parent.absolute() / "Python/scripts"))

from characterize_data import characterize_data


class TestScripts:
    def setup_method(self):
        # Path to testing data is expected in the following location:
        self.data_path = pathlib.Path(__file__).parent.parent.absolute() / "Data"

    def files_md5(self, ascii_file_list, binary_file_list):
        """
        Compute a single/combined md5 hash for a list of ascii and binary files.
        We can't read all files as binary because of platform-specific differences in
        ascii files. For ascii files we need to open in text mode and use the read() method,
        which, to quote the documentation:
        In text mode, the default when reading is to convert platform-specific line endings (\n on Unix, \r\n on
        Windows) to just \n.

        This ensures that we get the same md5 hash on all platforms. If we opened the text files as binary, the hashes
        would become platform-dependent (\r\n vs. \n).
        """
        md5 = hashlib.md5()
        for file_name in ascii_file_list:
            with open(file_name, "r") as fp:
                file_contents = fp.read()
            md5.update(file_contents.encode("utf-8"))
        for file_name in binary_file_list:
            with open(file_name, "rb") as fp:
                file_contents = fp.read()
            md5.update(file_contents)
        return md5.hexdigest()

    @pytest.mark.parametrize(
        "output_file, analysis_type, result_md5hash",
        [
            (
                "per_file_data_characteristics.csv",
                "per_file",
                "912ede9ecfe519346f3a519f59215f6d",
            ),
            (
                "per_series_data_characteristics.csv",
                "per_series",
                "8a806fa717739b9c6f2132a719b1ab8f",
            ),
        ],
    )
    def test_characterize_data(
        self, output_file, analysis_type, result_md5hash, tmp_path
    ):
        # NOTE: For now we are not testing the pdf files. Setting SOURCE_DATE_EPOCH
        # didn't resolve the variability across platforms, we still got different
        # md5 hash values. Not sure if it is possible to do regression testing
        # with the pdf files.
        # Setting the SOURCE_DATE_EPOCH environment variable ensures that the pdf/ps files
        # created have the same date. The file content includes the date and time and we want
        # to ignore that difference.
        # https://github.com/matplotlib/matplotlib/issues/6317/
        # os.environ["SOURCE_DATE_EPOCH"] = "42"
        output_dir = tmp_path
        # Run the script; the csv and pdf output files are written to output_dir.
        characterize_data(
            [
                str(self.data_path / "CIRS057A_MR_CT_DICOM"),
                str(output_dir / output_file),
                analysis_type,
            ]
        )
        # The csv files need to be modified as follows before comparing to expected values:
        # 1. Modify absolute file paths to only include the file name so that they are independent
        #    of file location.
        # 2. Sort the file names in the "files" column; os.walk returns directories and file
        #    names in arbitrary order and the order is different across operating systems.
        # 3. Sort the image entries (per series or per file) according to MD5 hash as the row order
        #    depends on the directory order, which isn't consistent, same issue as in 2.
        result_files = output_dir.glob("*.csv")
        for file in result_files:
            df = pd.read_csv(file).sort_values(by="MD5 intensity hash")
            df["files"] = df["files"].apply(
                lambda x: sorted([pathlib.Path(fname).name for fname in eval(x)])
            )
            df.to_csv(file, index=False)
        assert (
            self.files_md5(
                ascii_file_list=output_dir.glob("*.csv"),
                binary_file_list=[],  # output_dir.glob("*.pdf"),
            )
            == result_md5hash
        )