Create a workflow to convert radiomics features in JSON files to dataframe #65

Merged
merged 1 commit on Jun 10, 2024
8 changes: 7 additions & 1 deletion .dockstore.yml
@@ -23,7 +23,13 @@ workflows:
    primaryDescriptorPath: /workflows/TotalSegmentator/Terra/postProcessing/perFrame.wdl
    authors:
      - name: ImagingDataCommons
        email: [email protected]
        email: [email protected]
  - name: convertRadiomicsJsonToDataframeOnTerra
    subclass: WDL
    primaryDescriptorPath: /workflows/TotalSegmentator/Terra/postProcessing/convertRadiomicsJsonToDataframe.wdl
    authors:
      - name: ImagingDataCommons
        email: [email protected]
  - name: TotalSegmentatortwoVmWorkflowOnSB-CGC
    subclass: CWL
    primaryDescriptorPath: /workflows/TotalSegmentator/SevenBridges/splitWorkflow/twoVM.cwl
63 changes: 63 additions & 0 deletions .github/workflows/convert_radiomics_features_json_to_df.yml
@@ -0,0 +1,63 @@
name: convert-radiomics-features-json-dataframe

on:
  push:
    branches:
      - "main"
    paths:
      - ".github/workflows/convert_radiomics_features_json_to_df.yml"
      - "workflows/TotalSegmentator/Dockerfiles/convert_radiomics_features_json_to_df/Dockerfile"
    tags:
      - 'v*'
  pull_request:
    branches:
      - "main"
    paths:
      - ".github/workflows/convert_radiomics_features_json_to_df.yml"
      - "workflows/TotalSegmentator/Dockerfiles/convert_radiomics_features_json_to_df/Dockerfile"
  workflow_dispatch:

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Git
        run: |
          git config --global user.email "[email protected]"
          git config --global user.name "GitHub Actions"

      - name: Get Git Commit Hash
        id: git-commit-hash
        run: |
          COMMIT_HASH=$(git rev-parse HEAD)
          echo "COMMIT_HASH=$COMMIT_HASH" >> $GITHUB_ENV

      - name: Docker meta
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: |
            imagingdatacommons/convert_radiomics_features_json_to_df

      - name: Login to Docker Hub
        if: github.event_name != 'pull_request'
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_PASSWORD }}

      - name: Build and Push Docker image to Docker Hub
        uses: docker/build-push-action@v5
        with:
          context: .
          file: ./workflows/TotalSegmentator/Dockerfiles/convert_radiomics_features_json_to_df/Dockerfile
          push: ${{ github.event_name != 'pull_request' }}
          tags: ${{ steps.meta.outputs.tags }}
          build-args: |
            GIT_HASH=${{ env.COMMIT_HASH }}
        env:
          COMMIT_HASH: ${{ env.COMMIT_HASH }}
25 changes: 25 additions & 0 deletions workflows/TotalSegmentator/Dockerfiles/convert_radiomics_features_json_to_df/Dockerfile
@@ -0,0 +1,25 @@
FROM python@sha256:5b287042a6150052420e6a7fb7c1606b6403740880897ae9610faf434da28693

ARG GIT_HASH

LABEL BASE_DOCKER_IMAGE="python:3.11.2-slim-buster" \
      MAINTAINER="IDC <[email protected]>" \
      GIT_HASH=${GIT_HASH} \
      PATH_TO_DOCKER_FILE="https://github.com/imagingdatacommons/CloudSegmentator/blob/${GIT_HASH}/workflows/TotalSegmentator/Dockerfiles/convert_radiomics_features_json_to_dataframe/Dockerfile" \
      IMAGE_NAME_ON_DOCKERHUB="imagingdatacommons/convert_radiomics_features_json_to_dataframe"

RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    lz4 \
    curl \
    wget \
    && rm -rf /var/lib/apt/lists/*

RUN pip3 install --no-cache-dir \
    ipykernel==6.22.0 \
    ipython==8.11.0 \
    ipywidgets==8.0.5 \
    jupyter==1.0.0 \
    pandas==1.5.3 \
    papermill==2.4.0 \
    pyarrow==16.1.0
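
The image pins pandas and pyarrow and also ships papermill, which suggests the notebook below is executed non-interactively with its "parameters"-tagged cell overridden. A minimal sketch of such an invocation in Python; the notebook and archive file names here are hypothetical, and the real call presumably comes from the convertRadiomicsJsonToDataframe.wdl workflow registered above, which is not part of this diff:

import papermill as pm

# Sketch only: input/output notebook paths and archive names are made-up examples.
# papermill injects these parameters right after the notebook's "parameters"-tagged cell;
# the notebook's cloud branch then splits the comma-separated string into a list.
pm.execute_notebook(
    "postProcessingRadiomicsJsonToDataFrame.ipynb",
    "postProcessingRadiomicsJsonToDataFrame.output.ipynb",
    parameters={"rawJsonRadiomicsFiles": "shard-0.tar.lz4,shard-1.tar.lz4"},
)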
246 changes: 246 additions & 0 deletions workflows/TotalSegmentator/Notebooks/postProcessingRadiomicsJsonToDataFrame.ipynb
@@ -0,0 +1,246 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "3e33103f",
"metadata": {
"colab_type": "text",
"id": "view-in-github"
},
"source": [
"<a href=\"https://colab.research.google.com/github/vkt1414/CloudSegmentator/blob/feat-convert-raw-radiomics-to-dataframe/workflows/TotalSegmentator/Notebooks/postProcessingRadiomicsJsonToDataFrame.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"id": "kdOyoySwehlv",
"metadata": {
"id": "kdOyoySwehlv"
},
"source": [
"## **This notebook converts raw radiomics features in JSON format to a pandas dataframe. It takes the raw radiomics files in lz4 format as input, decompresses them, and flattens them to a dataframe, output a csv.lz4**"
]
},
{
"cell_type": "markdown",
"id": "-LdyuHrrPigD",
"metadata": {
"id": "-LdyuHrrPigD"
},
"source": [
"### **Installing Packages**"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "B5KuCCFhOgpA",
"metadata": {
"id": "B5KuCCFhOgpA"
},
"outputs": [],
"source": [
"%%capture\n",
"import sys\n",
"if 'google.colab' in sys.modules:\n",
" !sudo apt-get update \\\n",
" && apt-get install -y --no-install-recommends \\\n",
" lz4"
]
},
{
"cell_type": "markdown",
"id": "73LFRISZPn6Q",
"metadata": {
"id": "73LFRISZPn6Q"
},
"source": [
"### **Importing Packages**"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "db4585bb-9bfb-4de3-882a-c7454c7ec3af",
"metadata": {
"id": "db4585bb-9bfb-4de3-882a-c7454c7ec3af"
},
"outputs": [],
"source": [
"import os\n",
"import sys\n",
"import subprocess\n",
"import json\n",
"import pandas as pd\n",
"from pandas import json_normalize"
]
},
{
"cell_type": "markdown",
"id": "Clrt8j70Pb5a",
"metadata": {
"id": "Clrt8j70Pb5a"
},
"source": [
"### **Parameters for papermill**"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "4527d881",
"metadata": {
"id": "4527d881",
"tags": [
"parameters"
]
},
"outputs": [],
"source": [
"if 'google.colab' in sys.modules:\n",
" !wget -q https://github.com/vkt1414/CloudSegmentator/releases/download/test/pyradiomicsRadiomicsFeatures.tar.lz4\n",
" rawJsonRadiomicsFiles=[\"pyradiomicsRadiomicsFeatures.tar.lz4\"]\n"
]
},
{
"cell_type": "markdown",
"id": "efd80612",
"metadata": {
"id": "efd80612"
},
"source": [
"### **This is the cell used on cloud, as the file paths are passed to the notebook as a string**"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "557de167",
"metadata": {
"id": "557de167"
},
"outputs": [],
"source": [
"if not 'google.colab' in sys.modules:\n",
" rawJsonRadiomicsFiles=rawJsonRadiomicsFiles.split(',')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "ae662d02",
"metadata": {
"id": "ae662d02"
},
"outputs": [],
"source": [
"def flatten_json(seriesInstanceUID, radiomics_file_path):\n",
" try:\n",
" with open(radiomics_file_path, 'r', encoding='utf-8', errors='ignore') as f:\n",
" data = json.load(f, strict=False)\n",
" except json.JSONDecodeError as e:\n",
" print(f\"Error decoding JSON: {e}\")\n",
" # Write the file path to the error log\n",
" with open('error_file.txt', 'a') as error_file:\n",
" error_file.write(f\"{radiomics_file_path}\\n\")\n",
" return None \n",
"\n",
" # Create an empty list to store DataFrames\n",
" df_list = []\n",
"\n",
" # Iterate over the items in the dictionary and flatten each to a row\n",
" for organ, properties in data.items():\n",
" # Normalize the nested dictionary\n",
" organ_df = json_normalize(properties)\n",
" # Add SeriesInstanceUID\n",
" organ_df['seriesInstanceUID'] = seriesInstanceUID\n",
" # Add the organ name as a column\n",
" organ_df['organ'] = organ\n",
" # Append the result to the list\n",
" df_list.append(organ_df)\n",
"\n",
" # Concatenate all DataFrames in the list\n",
" df = pd.concat(df_list, ignore_index=True)\n",
"\n",
" return df\n"
]
},
{
"cell_type": "markdown",
"id": "nbzivnkRnb_B",
"metadata": {
"id": "nbzivnkRnb_B"
},
"source": [
"### **Convert Radiomics features in JSON to DataFrame, finally to a csv**"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ZHSbg5CrWX1Q",
"metadata": {
"id": "ZHSbg5CrWX1Q"
},
"outputs": [],
"source": [
"# Main script to decompress files and flatten JSON\n",
"all_dataframes = [] # List to store all DataFrames\n",
"\n",
"for rawJsonRadiomicsFile in rawJsonRadiomicsFiles:\n",
" # Decompress the LZ4 file and extract the tar file\n",
" !lz4 -d --rm $rawJsonRadiomicsFile -c | tar -xvf -\n",
" # Assuming 'radiomics' is a directory in the current working directory\n",
" for series_folder in os.listdir('radiomics'):\n",
" # The directory name is the seriesInstanceUID\n",
" seriesInstanceUID = series_folder\n",
" for file in os.listdir(os.path.join('radiomics',series_folder)):\n",
" if file.endswith('_raw.json.lz4'):\n",
" # Construct the full file path\n",
" file_path = os.path.join('radiomics',series_folder, file)\n",
" # Decompress the file using the lz4 command\n",
" !lz4 -d --rm $file_path\n",
" # Truncate to get the JSON filename\n",
" json_file_path=file_path[:-4]\n",
" # Flatten the JSON file into a DataFrame\n",
" df = flatten_json(seriesInstanceUID, json_file_path)\n",
" # Add the DataFrame to the list\n",
" all_dataframes.append(df)\n",
" !rm -r radiomics\n",
"\n",
"# Concatenate all DataFrames in the list\n",
"final_df = pd.concat(all_dataframes, ignore_index=True)\n",
"\n",
"# Save the final DataFrame to a CSV file\n",
"final_df.to_parquet('raw_radiomics.parquet', index=False)\n",
"\n"
]
}
],
"metadata": {
"celltoolbar": "Tags",
"colab": {
"include_colab_link": true,
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
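
For quick reference, a small standalone sketch of the flattening step that flatten_json performs, using a made-up single-organ payload; the organ name, feature names, values, and series UID are purely illustrative, and real pyradiomics output carries many more feature groups and entries:

import pandas as pd
from pandas import json_normalize

# Hypothetical miniature of one *_raw.json payload: organ -> nested feature groups.
sample = {
    "liver": {
        "firstorder": {"Mean": 42.7, "Entropy": 3.1},
        "shape": {"VoxelVolume": 1350.0},
    }
}

rows = []
for organ, properties in sample.items():
    organ_df = json_normalize(properties)        # nested keys become dotted column names
    organ_df["seriesInstanceUID"] = "1.2.840.0"  # placeholder UID
    organ_df["organ"] = organ
    rows.append(organ_df)

df = pd.concat(rows, ignore_index=True)
print(df.columns.tolist())
# ['firstorder.Mean', 'firstorder.Entropy', 'shape.VoxelVolume', 'seriesInstanceUID', 'organ']

Each organ contributes one row, so a series with N segmented structures yields N rows keyed by seriesInstanceUID and organ.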