Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Data Preparation for Report Generation #36

Merged
merged 20 commits into from
Oct 24, 2024
171 changes: 171 additions & 0 deletions m3/data_prepare/report/2-1_collect-sentence-pool.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
# Copyright (c) MONAI Consortium
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import logging
import os

from openai import OpenAI

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Read the API key from the environment instead of shipping a hard-coded
# placeholder literal. Falls back to the original sentinel value so behavior
# inside NGC (where no key is required) is unchanged.
client = OpenAI(
    base_url="https://integrate.api.nvidia.com/v1",
    api_key=os.getenv("NVIDIA_API_KEY", "$API_KEY_REQUIRED_IF_EXECUTING_OUTSIDE_NGC"),
)

# Constants
MODEL_NAME = "meta/llama-3.1-8b-instruct"  # or "meta/llama-3.1-70b-instruct"
# The model names are from the following URL:
# https://build.nvidia.com/meta/llama-3_1-8b-instruct
# https://build.nvidia.com/meta/llama-3_1-70b-instruct
# We are utilizing the Llama 3.1 model provided by NVIDIA NIM. Alternatively, if you have a local copy of the model,
# you can apply the same approach to use it.

INPUT_JSON_FILENAME = "mimic_annotation.json"  # MIMIC annotation file with "train"/"test" splits
TEMPLATES_FILENAME = "sentence-pool.txt"  # output file for the distilled sentence pool
BATCH_SIZE = 100  # number of sentences sent to the LLM per request


def load_file_lines(file_path):
    """
    Extract individual sentences from the 'report' fields of the 'test'
    split in a JSON annotation file.

    Reports are split on period characters; each fragment is whitespace
    trimmed, and empty fragments are discarded.

    Args:
        file_path (str): Path to the JSON file containing the reports.

    Returns:
        list: All non-empty sentences found across the 'test' reports.

    Raises:
        FileNotFoundError: If the file at `file_path` is not found.
        Exception: For any other error encountered while reading the file.
    """
    try:
        with open(file_path, "r") as fp:
            annotations = json.load(fp)

        collected = []
        for entry in annotations.get("test", []):
            # Period-delimited split; discard blank fragments after trimming.
            for fragment in entry.get("report", "").split("."):
                fragment = fragment.strip()
                if fragment:
                    collected.append(fragment)

        return collected
    except FileNotFoundError:
        logger.error(f"File not found: {file_path}")
        raise
    except Exception as e:
        logger.error(f"Error reading or parsing JSON file {file_path}: {e}")
        raise


def make_api_call(sentences, templates):
    """
    Make a call to the OpenAI API using the provided sentences and templates.

    Args:
        sentences (list): A list of sentences to process.
        templates (str): A string of template sentences to use for replacement.

    Returns:
        str: The processed text returned by the OpenAI API.

    Raises:
        Exception: If the OpenAI API call fails for any reason.
    """
    combined_sentences = "\n".join(sentences) + f"\n{templates}"

    messages = [
        {
            "role": "system",
            "content": "You are an expert radiologist.",
        },
        {
            "role": "user",
            "content": f"Please simplify the following list of sentences according to these instructions: \
1. **Break Down**: Separate each sentence into its simplest components. \
Each resulting sentence should be straightforward and free of transitional words like 'and,' 'or,' 'but,' 'then,' 'therefore,' etc. \
2. **Extract Similarities**: Identify sentences with similar meanings. Group these sentences based on the main idea they convey. \
3. **Unify**: For each group of similar sentences, create a single sentence that captures the core meaning. Ensure this unified sentence is concise, clear, and without transitional words. \
4. **Create the Final Pool**: Compile a final list of these unified, simplified sentences that represent the main content of the original list. \
**Important**: Make sure none of the sentences include transitional words such as 'and,' 'or,' 'but,' 'then,' etc. \
Each sentence should stand alone, conveying a single idea. \
Here is the list of sentences to process: {combined_sentences}. \
Please provide the final list of simplified sentences, focusing only on the most common meanings.",
        },
    ]

    try:
        # NOTE: do NOT pass stream=True here. With streaming enabled the v1
        # OpenAI SDK returns an iterator of chunk objects, not a completed
        # response, so reading the message content below would fail. The
        # response is also a pydantic object, not a dict, so it must be
        # accessed with attributes rather than subscripts.
        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=messages,
            max_tokens=2048,  # Adjust the token limit based on expected response length
            temperature=0.2,  # Set temperature for more deterministic results
            top_p=0.7,
        )
        return response.choices[0].message.content
    except Exception as e:
        logger.error(f"OpenAI API call failed: {e}")
        raise


def save_template(new_template, output_path):
    """
    Persist the generated sentence-pool template to disk.

    Args:
        new_template (str): The content to be saved as the template.
        output_path (str): Path where the new template will be saved.

    Raises:
        Exception: If there is any issue saving the template to the file.
    """
    try:
        with open(output_path, "w") as handle:
            handle.write(new_template)
        logger.info(f"New template saved to {output_path}")
    except Exception as e:
        logger.error(f"Failed to save template to {output_path}: {e}")
        raise


def main():
    """
    Entry point: build the sentence pool from the MIMIC annotation file.

    Loads sentences from the JSON file, walks them in fixed-size batches,
    asks the LLM to fold each batch into the running template pool, and
    writes the updated pool to disk after every batch.
    """
    try:
        sentences = load_file_lines(INPUT_JSON_FILENAME)

        # Running pool of template sentences; starts empty and is replaced
        # by the model output after each batch.
        templates = ""

        for start in range(0, len(sentences), BATCH_SIZE):
            logger.info(f"Processing chunk {start // BATCH_SIZE + 1} of {len(sentences) // BATCH_SIZE + 1}")
            batch = sentences[start : start + BATCH_SIZE]

            templates = make_api_call(batch, templates)
            save_template(templates, TEMPLATES_FILENAME)

    except Exception as e:
        logger.error(f"An error occurred during execution: {e}")


if __name__ == "__main__":
    main()
224 changes: 224 additions & 0 deletions m3/data_prepare/report/2-2_convert-text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,224 @@
# Copyright (c) MONAI Consortium
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import logging
import os
import sys

from openai import OpenAI

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Read the API key from the environment instead of shipping a hard-coded
# placeholder literal. Falls back to the original sentinel value so behavior
# inside NGC (where no key is required) is unchanged.
client = OpenAI(
    base_url="https://integrate.api.nvidia.com/v1",
    api_key=os.getenv("NVIDIA_API_KEY", "$API_KEY_REQUIRED_IF_EXECUTING_OUTSIDE_NGC"),
)

# Constants
MODEL_NAME = "meta/llama-3.1-8b-instruct"  # or "meta/llama-3.1-70b-instruct"
# The model names are from the following URL:
# https://build.nvidia.com/meta/llama-3_1-8b-instruct
# https://build.nvidia.com/meta/llama-3_1-70b-instruct
# We are utilizing the Llama 3.1 model provided by NVIDIA NIM. Alternatively, if you have a local copy of the model,
# you can apply the same approach to use it.

INPUT_JSON_FILENAME = "mimic_annotation.json"  # MIMIC annotation file with "train"/"test" splits
OUTPUT_DIR = "./report_new"  # destination directory for rewritten reports
TEMPLATES_FILENAME = "sentence-pool.txt"  # sentence pool produced by step 2-1


def load_json_data(file_path):
    """
    Read and parse a JSON file.

    Args:
        file_path (str): The path to the JSON file.

    Returns:
        dict: The parsed JSON content.

    Raises:
        FileNotFoundError: If the file at `file_path` is not found.
        Exception: For any other error encountered while reading the file.
    """
    try:
        with open(file_path, "r") as fp:
            parsed = json.load(fp)
        return parsed
    except FileNotFoundError:
        logger.error(f"File not found: {file_path}")
        raise
    except Exception as e:
        logger.error(f"Error reading or parsing JSON file {file_path}: {e}")
        raise


def load_templates(file_path):
    """
    Read the sentence-pool template file as a single string.

    Args:
        file_path (str): The path to the file containing template sentences.

    Returns:
        str: The content of the file as a string.

    Raises:
        FileNotFoundError: If the template file is not found at the specified path.
        Exception: For other errors encountered while reading the file.
    """
    try:
        with open(file_path, "r") as fp:
            contents = fp.read()
        return contents
    except FileNotFoundError:
        logger.error(f"Template file not found: {file_path}")
        raise
    except Exception as e:
        logger.error(f"Error loading template file: {e}")
        raise


def initialize_output_directory(output_dir):
    """
    Create the output directory if it does not already exist.

    Args:
        output_dir (str): The path to the output directory to be created.

    Raises:
        Exception: If there is an error creating the directory.
    """
    try:
        # exist_ok makes this idempotent across repeated runs.
        os.makedirs(output_dir, exist_ok=True)
        logger.info(f"Output directory created: {output_dir}")
    except Exception as e:
        logger.error(f"Failed to create output directory: {e}")
        raise


def process_files(data, templates, output_dir):
    """
    Rewrite every report under the "train" and "test" keys of the JSON data
    via the OpenAI API and save each result as a text file.

    The output filename is derived from each associated image path by
    swapping the .jpg suffix for .txt.

    Args:
        data (dict): The loaded JSON data.
        templates (str): The template content to be used for generating reports.
        output_dir (str): The directory where the processed files will be saved.

    Raises:
        Exception: If there are issues calling the OpenAI API or saving files.
    """
    for split in ["train", "test"]:
        for record in data.get(split, []):
            # One API call per record; the result is shared by all of the
            # record's images.
            rewritten = generate_new_report(templates, record.get("report", ""))

            for image_path in record.get("image_path", []):
                txt_name = os.path.basename(image_path).replace(".jpg", ".txt")
                save_report(rewritten, os.path.join(output_dir, txt_name))


def generate_new_report(templates, report):
    """
    Call OpenAI API to generate a new report based on the template and input report.

    Args:
        templates (str): The template content used for generating the new report.
        report (str): The input report that needs to be processed.

    Returns:
        str: The generated report after processing with the OpenAI API,
        with any leading 'new report:' prefix stripped.

    Raises:
        Exception: If the OpenAI API call fails or an unexpected error occurs.
    """
    messages = [
        {
            "role": "system",
            "content": "You are an expert radiologist.",
        },
        {
            "role": "user",
            "content": f"{templates}\n\nPlease replace sentences with similar meanings in the contents below with the exact sentences from the template provided, "
            f"ensuring no other parts of the content are altered. Please directly output the updated report in the format 'new report: ...'.\n\n{report}",
        },
    ]

    try:
        # NOTE: do NOT pass stream=True here. With streaming enabled the v1
        # OpenAI SDK returns an iterator of chunk objects, not a completed
        # response, so reading the message content below would fail. The
        # response is also a pydantic object, not a dict, so it must be
        # accessed with attributes rather than subscripts.
        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=messages,
            max_tokens=2048,  # Adjust max tokens based on the expected response length.
            temperature=0.2,  # Set temperature for more deterministic results.
            top_p=0.7,
        )
        new_report = response.choices[0].message.content
        # Strip the instructed 'new report:' prefix in either capitalization.
        return new_report.replace("new report:", "").replace("New report:", "").strip()
    except Exception as e:
        logger.error(f"OpenAI API call failed: {e}")
        raise


def save_report(new_report, output_path):
    """
    Persist a generated report to the given path.

    Args:
        new_report (str): The generated report content to be saved.
        output_path (str): The file path where the report will be saved.

    Raises:
        Exception: If the report cannot be saved to the specified path.
    """
    try:
        with open(output_path, "w") as handle:
            handle.write(new_report)
        logger.info(f"New report saved to {output_path}")
    except Exception as e:
        logger.error(f"Failed to save report to {output_path}: {e}")
        raise


def main():
    """
    Entry point: rewrite all MIMIC reports against the sentence pool.

    Loads the annotation JSON and the template file, ensures the output
    directory exists, then processes every record in the "train" and
    "test" splits, saving one rewritten report per image. Exits with a
    non-zero status on any failure.
    """
    try:
        data = load_json_data(INPUT_JSON_FILENAME)
        templates = load_templates(TEMPLATES_FILENAME)
        initialize_output_directory(OUTPUT_DIR)
        process_files(data, templates, OUTPUT_DIR)
    except Exception as e:
        logger.error(f"An error occurred during execution: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()
Loading
Loading