Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Data Preparation for Report Generation #36

Merged
merged 20 commits into from
Oct 24, 2024
171 changes: 171 additions & 0 deletions m3/data_prepare/report/2-1_collect-sentence-pool.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
# Copyright (c) MONAI Consortium
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import logging
import os

from openai import OpenAI

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Read the API key from the environment instead of shipping a hard-coded
# placeholder literal. Falls back to the original sentinel value so behavior
# inside NGC (where no key is required) is unchanged.
client = OpenAI(
    base_url="https://integrate.api.nvidia.com/v1",
    api_key=os.getenv("NVIDIA_API_KEY", "$API_KEY_REQUIRED_IF_EXECUTING_OUTSIDE_NGC"),
)

# Constants
MODEL_NAME = "meta/llama-3.1-8b-instruct"  # or "meta/llama-3.1-70b-instruct"
# The model names are from the following URL:
# https://build.nvidia.com/meta/llama-3_1-8b-instruct
# https://build.nvidia.com/meta/llama-3_1-70b-instruct
# We are utilizing the Llama 3.1 model provided by NVIDIA NIM. Alternatively, if you have a local copy of the model,
# you can apply the same approach to use it.

INPUT_JSON_FILENAME = "mimic_annotation.json"  # MIMIC annotation file with "train"/"test" splits
TEMPLATES_FILENAME = "sentence-pool.txt"  # output file for the distilled sentence pool
BATCH_SIZE = 100  # number of sentences sent to the LLM per request


def load_file_lines(file_path):
    """
    Extract individual sentences from the 'report' fields of the 'test'
    split in a JSON annotation file.

    Reports are split on period characters; each fragment is whitespace
    trimmed, and empty fragments are discarded.

    Args:
        file_path (str): Path to the JSON file containing the reports.

    Returns:
        list: All non-empty sentences found across the 'test' reports.

    Raises:
        FileNotFoundError: If the file at `file_path` is not found.
        Exception: For any other error encountered while reading the file.
    """
    try:
        with open(file_path, "r") as fp:
            annotations = json.load(fp)

        collected = []
        for entry in annotations.get("test", []):
            # Period-delimited split; discard blank fragments after trimming.
            for fragment in entry.get("report", "").split("."):
                fragment = fragment.strip()
                if fragment:
                    collected.append(fragment)

        return collected
    except FileNotFoundError:
        logger.error(f"File not found: {file_path}")
        raise
    except Exception as e:
        logger.error(f"Error reading or parsing JSON file {file_path}: {e}")
        raise


def make_api_call(sentences, templates):
    """
    Make a call to the OpenAI API using the provided sentences and templates.

    Args:
        sentences (list): A list of sentences to process.
        templates (str): A string of template sentences to use for replacement.

    Returns:
        str: The processed text returned by the OpenAI API.

    Raises:
        Exception: If the OpenAI API call fails for any reason.
    """
    combined_sentences = "\n".join(sentences) + f"\n{templates}"

    messages = [
        {
            "role": "system",
            "content": "You are an expert radiologist.",
        },
        {
            "role": "user",
            "content": f"Please simplify the following list of sentences according to these instructions: \
1. **Break Down**: Separate each sentence into its simplest components. \
Each resulting sentence should be straightforward and free of transitional words like 'and,' 'or,' 'but,' 'then,' 'therefore,' etc. \
2. **Extract Similarities**: Identify sentences with similar meanings. Group these sentences based on the main idea they convey. \
3. **Unify**: For each group of similar sentences, create a single sentence that captures the core meaning. Ensure this unified sentence is concise, clear, and without transitional words. \
4. **Create the Final Pool**: Compile a final list of these unified, simplified sentences that represent the main content of the original list. \
**Important**: Make sure none of the sentences include transitional words such as 'and,' 'or,' 'but,' 'then,' etc. \
Each sentence should stand alone, conveying a single idea. \
Here is the list of sentences to process: {combined_sentences}. \
Please provide the final list of simplified sentences, focusing only on the most common meanings.",
        },
    ]

    try:
        # NOTE: do NOT pass stream=True here. With streaming enabled the v1
        # OpenAI SDK returns an iterator of chunk objects, not a completed
        # response, so reading the message content below would fail. The
        # response is also a pydantic object, not a dict, so it must be
        # accessed with attributes rather than subscripts.
        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=messages,
            max_tokens=2048,  # Adjust the token limit based on expected response length
            temperature=0.2,  # Set temperature for more deterministic results
            top_p=0.7,
        )
        return response.choices[0].message.content
    except Exception as e:
        logger.error(f"OpenAI API call failed: {e}")
        raise


def save_template(new_template, output_path):
    """
    Persist the generated sentence-pool template to disk.

    Args:
        new_template (str): The content to be saved as the template.
        output_path (str): Path where the new template will be saved.

    Raises:
        Exception: If there is any issue saving the template to the file.
    """
    try:
        with open(output_path, "w") as handle:
            handle.write(new_template)
        logger.info(f"New template saved to {output_path}")
    except Exception as e:
        logger.error(f"Failed to save template to {output_path}: {e}")
        raise


def main():
    """
    Entry point: build the sentence pool from the MIMIC annotation file.

    Loads sentences from the JSON file, walks them in fixed-size batches,
    asks the LLM to fold each batch into the running template pool, and
    writes the updated pool to disk after every batch.
    """
    try:
        sentences = load_file_lines(INPUT_JSON_FILENAME)

        # Running pool of template sentences; starts empty and is replaced
        # by the model output after each batch.
        templates = ""

        for start in range(0, len(sentences), BATCH_SIZE):
            logger.info(f"Processing chunk {start // BATCH_SIZE + 1} of {len(sentences) // BATCH_SIZE + 1}")
            batch = sentences[start : start + BATCH_SIZE]

            templates = make_api_call(batch, templates)
            save_template(templates, TEMPLATES_FILENAME)

    except Exception as e:
        logger.error(f"An error occurred during execution: {e}")


if __name__ == "__main__":
    main()
224 changes: 224 additions & 0 deletions m3/data_prepare/report/2-2_convert-text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,224 @@
# Copyright (c) MONAI Consortium
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import logging
import os
import sys

from openai import OpenAI

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Read the API key from the environment instead of shipping a hard-coded
# placeholder literal. Falls back to the original sentinel value so behavior
# inside NGC (where no key is required) is unchanged.
client = OpenAI(
    base_url="https://integrate.api.nvidia.com/v1",
    api_key=os.getenv("NVIDIA_API_KEY", "$API_KEY_REQUIRED_IF_EXECUTING_OUTSIDE_NGC"),
)

# Constants
MODEL_NAME = "meta/llama-3.1-8b-instruct"  # or "meta/llama-3.1-70b-instruct"
# The model names are from the following URL:
# https://build.nvidia.com/meta/llama-3_1-8b-instruct
# https://build.nvidia.com/meta/llama-3_1-70b-instruct
# We are utilizing the Llama 3.1 model provided by NVIDIA NIM. Alternatively, if you have a local copy of the model,
# you can apply the same approach to use it.

INPUT_JSON_FILENAME = "mimic_annotation.json"  # MIMIC annotation file with "train"/"test" splits
OUTPUT_DIR = "./report_new"  # destination directory for rewritten reports
TEMPLATES_FILENAME = "sentence-pool.txt"  # sentence pool produced by step 2-1


def load_json_data(file_path):
    """
    Read and parse a JSON file.

    Args:
        file_path (str): The path to the JSON file.

    Returns:
        dict: The parsed JSON content.

    Raises:
        FileNotFoundError: If the file at `file_path` is not found.
        Exception: For any other error encountered while reading the file.
    """
    try:
        with open(file_path, "r") as fp:
            parsed = json.load(fp)
        return parsed
    except FileNotFoundError:
        logger.error(f"File not found: {file_path}")
        raise
    except Exception as e:
        logger.error(f"Error reading or parsing JSON file {file_path}: {e}")
        raise


def load_templates(file_path):
    """
    Read the sentence-pool template file as a single string.

    Args:
        file_path (str): The path to the file containing template sentences.

    Returns:
        str: The content of the file as a string.

    Raises:
        FileNotFoundError: If the template file is not found at the specified path.
        Exception: For other errors encountered while reading the file.
    """
    try:
        with open(file_path, "r") as fp:
            contents = fp.read()
        return contents
    except FileNotFoundError:
        logger.error(f"Template file not found: {file_path}")
        raise
    except Exception as e:
        logger.error(f"Error loading template file: {e}")
        raise


def initialize_output_directory(output_dir):
    """
    Create the output directory if it does not already exist.

    Args:
        output_dir (str): The path to the output directory to be created.

    Raises:
        Exception: If there is an error creating the directory.
    """
    try:
        # exist_ok makes this idempotent across repeated runs.
        os.makedirs(output_dir, exist_ok=True)
        logger.info(f"Output directory created: {output_dir}")
    except Exception as e:
        logger.error(f"Failed to create output directory: {e}")
        raise


def process_files(data, templates, output_dir):
    """
    Rewrite every report under the "train" and "test" keys of the JSON data
    via the OpenAI API and save each result as a text file.

    The output filename is derived from each associated image path by
    swapping the .jpg suffix for .txt.

    Args:
        data (dict): The loaded JSON data.
        templates (str): The template content to be used for generating reports.
        output_dir (str): The directory where the processed files will be saved.

    Raises:
        Exception: If there are issues calling the OpenAI API or saving files.
    """
    for split in ["train", "test"]:
        for record in data.get(split, []):
            # One API call per record; the result is shared by all of the
            # record's images.
            rewritten = generate_new_report(templates, record.get("report", ""))

            for image_path in record.get("image_path", []):
                txt_name = os.path.basename(image_path).replace(".jpg", ".txt")
                save_report(rewritten, os.path.join(output_dir, txt_name))


def generate_new_report(templates, report):
    """
    Call OpenAI API to generate a new report based on the template and input report.

    Args:
        templates (str): The template content used for generating the new report.
        report (str): The input report that needs to be processed.

    Returns:
        str: The generated report after processing with the OpenAI API,
        with any leading 'new report:' prefix stripped.

    Raises:
        Exception: If the OpenAI API call fails or an unexpected error occurs.
    """
    messages = [
        {
            "role": "system",
            "content": "You are an expert radiologist.",
        },
        {
            "role": "user",
            "content": f"{templates}\n\nPlease replace sentences with similar meanings in the contents below with the exact sentences from the template provided, "
            f"ensuring no other parts of the content are altered. Please directly output the updated report in the format 'new report: ...'.\n\n{report}",
        },
    ]

    try:
        # NOTE: do NOT pass stream=True here. With streaming enabled the v1
        # OpenAI SDK returns an iterator of chunk objects, not a completed
        # response, so reading the message content below would fail. The
        # response is also a pydantic object, not a dict, so it must be
        # accessed with attributes rather than subscripts.
        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=messages,
            max_tokens=2048,  # Adjust max tokens based on the expected response length.
            temperature=0.2,  # Set temperature for more deterministic results.
            top_p=0.7,
        )
        new_report = response.choices[0].message.content
        # Strip the instructed 'new report:' prefix in either capitalization.
        return new_report.replace("new report:", "").replace("New report:", "").strip()
    except Exception as e:
        logger.error(f"OpenAI API call failed: {e}")
        raise


def save_report(new_report, output_path):
    """
    Persist a generated report to the given path.

    Args:
        new_report (str): The generated report content to be saved.
        output_path (str): The file path where the report will be saved.

    Raises:
        Exception: If the report cannot be saved to the specified path.
    """
    try:
        with open(output_path, "w") as handle:
            handle.write(new_report)
        logger.info(f"New report saved to {output_path}")
    except Exception as e:
        logger.error(f"Failed to save report to {output_path}: {e}")
        raise


def main():
    """
    Entry point: rewrite all MIMIC reports against the sentence pool.

    Loads the annotation JSON and the template file, ensures the output
    directory exists, then processes every record in the "train" and
    "test" splits, saving one rewritten report per image. Exits with a
    non-zero status on any failure.
    """
    try:
        data = load_json_data(INPUT_JSON_FILENAME)
        templates = load_templates(TEMPLATES_FILENAME)
        initialize_output_directory(OUTPUT_DIR)
        process_files(data, templates, OUTPUT_DIR)
    except Exception as e:
        logger.error(f"An error occurred during execution: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()
Loading
Loading