Skip to content

Commit

Permalink
add files to extract radiomics features from json files
Browse files Browse the repository at this point in the history
  • Loading branch information
vkt1414 committed Jun 10, 2024
1 parent 1a08044 commit 0ef86cd
Show file tree
Hide file tree
Showing 7 changed files with 1,229 additions and 1 deletion.
8 changes: 7 additions & 1 deletion .dockstore.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,13 @@ workflows:
primaryDescriptorPath: /workflows/TotalSegmentator/Terra/postProcessing/perFrame.wdl
authors:
- name: ImagingDataCommons
email: [email protected]
email: [email protected]
- name: convertRadiomicsJsonToDataframeOnTerra
subclass: WDL
primaryDescriptorPath: /workflows/TotalSegmentator/Terra/postProcessing/convertRadiomicsJsonToDataframe.wdl
authors:
- name: ImagingDataCommons
email: [email protected]
- name: TotalSegmentatortwoVmWorkflowOnSB-CGC
subclass: CWL
primaryDescriptorPath: /workflows/TotalSegmentator/SevenBridges/splitWorkflow/twoVM.cwl
Expand Down
63 changes: 63 additions & 0 deletions .github/workflows/convert_radiomics_features_json_to_df.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
name: convert-radiomics-features-json-dataframe

on:
push:
branches:
- "main"
paths:
- ".github/workflows/convert_radiomics_features_json_to_df.yml"
- "workflows/TotalSegmentator/Dockerfiles/convert_radiomics_features_json_to_df/Dockerfile"
tags:
- 'v*'
pull_request:
branches:
- "main"
paths:
- ".github/workflows/convert_radiomics_features_json_to_df.yml"
- "workflows/TotalSegmentator/Dockerfiles/convert_radiomics_features_json_to_df/Dockerfile"
workflow_dispatch:

jobs:
build:
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4

- name: Set up Git
run: |
git config --global user.email "[email protected]"
git config --global user.name "GitHub Actions"
- name: Get Git Commit Hash
id: git-commit-hash
run: |
COMMIT_HASH=$(git rev-parse HEAD)
echo "COMMIT_HASH=$COMMIT_HASH" >> $GITHUB_ENV
- name: Docker meta
id: meta
uses: docker/metadata-action@v5
with:
images: |
imagingdatacommons/convert_radiomics_features_json_to_df
- name: Login to Docker Hub
if: github.event_name != 'pull_request'
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}

- name: Build and Push Docker image to Docker Hub
if:
uses: docker/build-push-action@v5
with:
context: .
file: ./workflows/TotalSegmentator/Dockerfiles/convert_radiomics_features_json_to_df/Dockerfile
push: ${{ github.event_name != 'pull_request' }}
tags: ${{ steps.meta.outputs.tags }}
build-args: |
GIT_HASH=${{ env.COMMIT_HASH }}
env:
COMMIT_HASH: ${{ env.COMMIT_HASH }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
FROM python@sha256:5b287042a6150052420e6a7fb7c1606b6403740880897ae9610faf434da28693

ARG GIT_HASH

LABEL BASE_DOCKER_IMAGE="python:3.11.2-slim-buster"\
MAINTAINER="IDC <[email protected]>" \
GIT_HASH=${GIT_HASH}\
PATH_TO_DOCKER_FILE="https://github.com/imagingdatacommons/CloudSegmentator/blob/${GIT_HASH}/workflows/TotalSegmentator/Dockerfiles/convert_radiomics_features_json_to_dataframe/Dockerfile"\
IMAGE_NAME_ON_DOCKERHUB="imagingdatacommons/convert_radiomics_features_json_to_dataframe"

RUN apt-get update && \
apt-get install -y --no-install-recommends \
lz4\
curl\
wget\
&& rm -rf /var/lib/apt/lists/*

RUN pip3 install --no-cache-dir\
ipykernel==6.22.0\
ipython==8.11.0\
ipywidgets==8.0.5\
jupyter==1.0.0\
pandas==1.5.3\
papermill==2.4.0\
pyarrow==16.1.0
Original file line number Diff line number Diff line change
@@ -0,0 +1,246 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "3e33103f",
"metadata": {
"colab_type": "text",
"id": "view-in-github"
},
"source": [
"<a href=\"https://colab.research.google.com/github/vkt1414/CloudSegmentator/blob/feat-convert-raw-radiomics-to-dataframe/workflows/TotalSegmentator/Notebooks/postProcessingRadiomicsJsonToDataFrame.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"id": "kdOyoySwehlv",
"metadata": {
"id": "kdOyoySwehlv"
},
"source": [
"## **This notebook converts raw radiomics features in JSON format to a pandas dataframe. It takes the raw radiomics files in lz4 format as input, decompresses them, and flattens them to a dataframe, output a csv.lz4**"
]
},
{
"cell_type": "markdown",
"id": "-LdyuHrrPigD",
"metadata": {
"id": "-LdyuHrrPigD"
},
"source": [
"### **Installing Packages**"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "B5KuCCFhOgpA",
"metadata": {
"id": "B5KuCCFhOgpA"
},
"outputs": [],
"source": [
"%%capture\n",
"import sys\n",
"if 'google.colab' in sys.modules:\n",
" !sudo apt-get update \\\n",
" && apt-get install -y --no-install-recommends \\\n",
" lz4"
]
},
{
"cell_type": "markdown",
"id": "73LFRISZPn6Q",
"metadata": {
"id": "73LFRISZPn6Q"
},
"source": [
"### **Importing Packages**"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "db4585bb-9bfb-4de3-882a-c7454c7ec3af",
"metadata": {
"id": "db4585bb-9bfb-4de3-882a-c7454c7ec3af"
},
"outputs": [],
"source": [
"import os\n",
"import sys\n",
"import subprocess\n",
"import json\n",
"import pandas as pd\n",
"from pandas import json_normalize"
]
},
{
"cell_type": "markdown",
"id": "Clrt8j70Pb5a",
"metadata": {
"id": "Clrt8j70Pb5a"
},
"source": [
"### **Parameters for papermill**"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "4527d881",
"metadata": {
"id": "4527d881",
"tags": [
"parameters"
]
},
"outputs": [],
"source": [
"if 'google.colab' in sys.modules:\n",
" !wget -q https://github.com/vkt1414/CloudSegmentator/releases/download/test/pyradiomicsRadiomicsFeatures.tar.lz4\n",
" rawJsonRadiomicsFiles=[\"pyradiomicsRadiomicsFeatures.tar.lz4\"]\n"
]
},
{
"cell_type": "markdown",
"id": "efd80612",
"metadata": {
"id": "efd80612"
},
"source": [
"### **This is the cell used on cloud, as the file paths are passed to the notebook as a string**"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "557de167",
"metadata": {
"id": "557de167"
},
"outputs": [],
"source": [
"if not 'google.colab' in sys.modules:\n",
" rawJsonRadiomicsFiles=rawJsonRadiomicsFiles.split(',')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "ae662d02",
"metadata": {
"id": "ae662d02"
},
"outputs": [],
"source": [
"def flatten_json(seriesInstanceUID, radiomics_file_path):\n",
" try:\n",
" with open(radiomics_file_path, 'r', encoding='utf-8', errors='ignore') as f:\n",
" data = json.load(f, strict=False)\n",
" except json.JSONDecodeError as e:\n",
" print(f\"Error decoding JSON: {e}\")\n",
" # Write the file path to the error log\n",
" with open('error_file.txt', 'a') as error_file:\n",
" error_file.write(f\"{radiomics_file_path}\\n\")\n",
" return None \n",
"\n",
" # Create an empty list to store DataFrames\n",
" df_list = []\n",
"\n",
" # Iterate over the items in the dictionary and flatten each to a row\n",
" for organ, properties in data.items():\n",
" # Normalize the nested dictionary\n",
" organ_df = json_normalize(properties)\n",
" # Add SeriesInstanceUID\n",
" organ_df['seriesInstanceUID'] = seriesInstanceUID\n",
" # Add the organ name as a column\n",
" organ_df['organ'] = organ\n",
" # Append the result to the list\n",
" df_list.append(organ_df)\n",
"\n",
" # Concatenate all DataFrames in the list\n",
" df = pd.concat(df_list, ignore_index=True)\n",
"\n",
" return df\n"
]
},
{
"cell_type": "markdown",
"id": "nbzivnkRnb_B",
"metadata": {
"id": "nbzivnkRnb_B"
},
"source": [
"### **Convert Radiomics features in JSON to DataFrame, finally to a csv**"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ZHSbg5CrWX1Q",
"metadata": {
"id": "ZHSbg5CrWX1Q"
},
"outputs": [],
"source": [
"# Main script to decompress files and flatten JSON\n",
"all_dataframes = [] # List to store all DataFrames\n",
"\n",
"for rawJsonRadiomicsFile in rawJsonRadiomicsFiles:\n",
" # Decompress the LZ4 file and extract the tar file\n",
" !lz4 -d --rm $rawJsonRadiomicsFile -c | tar -xvf -\n",
" # Assuming 'radiomics' is a directory in the current working directory\n",
" for series_folder in os.listdir('radiomics'):\n",
" # The directory name is the seriesInstanceUID\n",
" seriesInstanceUID = series_folder\n",
" for file in os.listdir(os.path.join('radiomics',series_folder)):\n",
" if file.endswith('_raw.json.lz4'):\n",
" # Construct the full file path\n",
" file_path = os.path.join('radiomics',series_folder, file)\n",
" # Decompress the file using the lz4 command\n",
" !lz4 -d --rm $file_path\n",
" # Truncate to get the JSON filename\n",
" json_file_path=file_path[:-4]\n",
" # Flatten the JSON file into a DataFrame\n",
" df = flatten_json(seriesInstanceUID, json_file_path)\n",
" # Add the DataFrame to the list\n",
" all_dataframes.append(df)\n",
" !rm -r radiomics\n",
"\n",
"# Concatenate all DataFrames in the list\n",
"final_df = pd.concat(all_dataframes, ignore_index=True)\n",
"\n",
"# Save the final DataFrame to a CSV file\n",
"final_df.to_parquet('raw_radiomics.parquet', index=False)\n",
"\n"
]
}
],
"metadata": {
"celltoolbar": "Tags",
"colab": {
"include_colab_link": true,
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Loading

0 comments on commit 0ef86cd

Please sign in to comment.