diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/Prompt_engineer/.DS_Store b/Prompt_engineer/.DS_Store new file mode 100644 index 0000000..f165c78 Binary files /dev/null and b/Prompt_engineer/.DS_Store differ diff --git a/Prompt_engineer/__pycache__/synthea_data.cpython-310.pyc b/Prompt_engineer/__pycache__/synthea_data.cpython-310.pyc new file mode 100644 index 0000000..9d7c514 Binary files /dev/null and b/Prompt_engineer/__pycache__/synthea_data.cpython-310.pyc differ diff --git a/Prompt_engineer/__pycache__/synthea_data.cpython-37.pyc b/Prompt_engineer/__pycache__/synthea_data.cpython-37.pyc new file mode 100644 index 0000000..0a5c570 Binary files /dev/null and b/Prompt_engineer/__pycache__/synthea_data.cpython-37.pyc differ diff --git a/Prompt_engineer/config.yaml b/Prompt_engineer/config.yaml new file mode 100644 index 0000000..a950330 --- /dev/null +++ b/Prompt_engineer/config.yaml @@ -0,0 +1,12 @@ +# Configuration file for the medical report generator +cleaned_data_csv_path: '/Users/ayodejioyesanya/Desktop/Tdata/cleaned_medical_data.csv' +prompt_template: | + [INST] <> You are an experienced medical AI assistant trained to provide helpful and accurate + information to patients. You have extensive knowledge of human anatomy, common medical conditions, and + evidence-based treatments. Your responses should be empathetic, informative, and adhere to medical best + practices. You will not provide any medical advice that could be harmful. [/INST] + + Diagnosis: {diagnosis}. Patient's age: {age}. Gender: {gender}. + Conditions: {conditions}. Observations: {observations}. Care plans: {care_plans}. + Imaging modality: {modality}. Body area: {body_area}. + Please provide a comprehensive report based on the patient's data and imaging results. 
# Fallback prompt used when the configuration has no 'prompt_template' entry.
# It exposes the same placeholders that generate_medical_prompt() fills in.
DEFAULT_PROMPT_TEMPLATE = (
    "Diagnosis: {diagnosis}. Patient's age: {age}. Gender: {gender}. "
    "Conditions: {conditions}. Observations: {observations}. Care plans: {care_plans}. "
    "Imaging modality: {modality}. Body area: {body_area}. "
    "Please provide a comprehensive report based on the patient's data and imaging results."
)


def load_config(config_path: str) -> Dict[str, Any]:
    """
    Load the YAML configuration file.

    Args:
        config_path: Path of the YAML file to read.

    Returns:
        The parsed mapping; an empty dict when the file is empty.

    Raises:
        ValueError: If the top-level YAML node is not a mapping.
    """
    with open(config_path, 'r', encoding='utf-8') as file:
        config = yaml.safe_load(file) or {}
    if not isinstance(config, dict):
        raise ValueError(f"Configuration in {config_path!r} must be a mapping at the top level.")
    return config


def calculate_age(birthdate: str, today: Optional[datetime] = None) -> int:
    """
    Calculate age in whole years given the birthdate.

    Args:
        birthdate: Date in "%Y-%m-%d" form. A trailing time part separated by a
            space or "T" is ignored, and datetime instances are accepted as-is.
        today: Reference date; defaults to the current local time.

    Returns:
        Completed years between ``birthdate`` and ``today``.

    Raises:
        ValueError: If the birthdate is missing (None, NaN, blank) or malformed.
    """
    if isinstance(birthdate, datetime):
        birth_date = birthdate
    elif isinstance(birthdate, str) and birthdate.strip():
        # Keep only the date part so "1990-01-01 00:00:00" style values parse too.
        date_part = birthdate.strip().replace('T', ' ').split(' ', 1)[0]
        birth_date = datetime.strptime(date_part, "%Y-%m-%d")
    else:
        # A missing CSV cell arrives as NaN (a float); report it clearly.
        raise ValueError(f"Invalid birthdate: {birthdate!r}")

    if today is None:
        today = datetime.now()
    # Subtract one year when this year's birthday has not been reached yet.
    birthday_pending = (today.month, today.day) < (birth_date.month, birth_date.day)
    return today.year - birth_date.year - (1 if birthday_pending else 0)


def _field_or_default(record: Dict[str, Any], key: str, default: str) -> Any:
    """Return record[key], or default when it is absent, None, NaN or a blank string."""
    value = record.get(key)
    if value is None:
        return default
    # Rows converted from a DataFrame keep missing cells as NaN, so the key exists
    # and dict.get() alone would never fall back to the default.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return default
    if isinstance(value, str) and not value.strip():
        return default
    return value


def generate_medical_prompt(patient_data: Dict[str, Any], diagnosis: str, config: Dict[str, Any]) -> str:
    """
    Generate a medical prompt for the medical report from patient data and a diagnosis.

    The template comes from ``config['prompt_template']``; when that entry is
    missing, DEFAULT_PROMPT_TEMPLATE is used. REASONDESCRIPTION is the source
    for the patient's conditions.

    Args:
        patient_data: One patient record. 'BIRTHDATE' and 'GENDER' are required;
            the other fields fall back to a default text when missing or empty.
        diagnosis: Diagnosis to insert into the prompt.
        config: Loaded configuration mapping.

    Returns:
        The formatted prompt.

    Raises:
        KeyError: If 'BIRTHDATE' or 'GENDER' is absent from ``patient_data``.
        ValueError: If the birthdate cannot be parsed.
    """
    prompt_template = config.get('prompt_template') or DEFAULT_PROMPT_TEMPLATE
    age = calculate_age(patient_data['BIRTHDATE'])
    return prompt_template.format(
        diagnosis=diagnosis,
        age=age,
        gender=patient_data['GENDER'],
        conditions=_field_or_default(patient_data, 'REASONDESCRIPTION', 'No reason description provided'),
        observations=_field_or_default(patient_data, 'observation', 'No observations recorded'),
        care_plans=_field_or_default(patient_data, 'DESCRIPTION_careplan', 'No care plans recorded'),
        modality=_field_or_default(patient_data, 'modality', "Not specified"),
        body_area=_field_or_default(patient_data, 'body_area', "Not specified"),
    )


def load_patient_data(csv_path: str) -> pd.DataFrame:
    """
    Load patient data from a CSV file.
    """
    return pd.read_csv(csv_path)


def select_random_patient_data(patient_data: pd.DataFrame) -> Dict[str, Any]:
    """
    Select one random patient record from the dataset, independent of the diagnosis.

    Returns:
        The sampled row as a dict, or an empty dict when the frame has no rows
        (sampling from an empty frame would raise instead).
    """
    if patient_data.empty:
        return {}
    return patient_data.sample(n=1).iloc[0].to_dict()


def parse_arguments(argv: Optional[list] = None) -> argparse.Namespace:
    """
    Parse command line arguments.

    Args:
        argv: Argument list to parse; defaults to the process arguments.

    Returns:
        Namespace with ``config_path``, ``csv_path`` (None when not given) and
        ``diagnosis``.
    """
    from pathlib import Path

    # Default to the config.yaml that sits next to this script rather than an
    # absolute path that only exists on one machine.
    default_config = str(Path(__file__).resolve().parent / 'config.yaml')

    parser = argparse.ArgumentParser(description='Medical Report Generator')
    parser.add_argument('--config_path', type=str, default=default_config,
                        help='Path to the YAML configuration file.')
    parser.add_argument('--csv_path', type=str, default=None,
                        help="Path to the CSV file containing patient data. "
                             "Defaults to 'cleaned_data_csv_path' from the configuration.")
    parser.add_argument('--diagnosis', type=str, required=True,
                        help='Diagnosis determined by the image classifier.')
    return parser.parse_args(argv)


def main() -> None:
    """
    Main function to orchestrate the workflow.

    Loads the configuration and the patient CSV, picks a random patient and
    prints the generated prompt, or a notice when the CSV has no rows.
    """
    args = parse_arguments()
    config = load_config(args.config_path)

    # The command line wins; otherwise use the path stored in the configuration.
    csv_path = args.csv_path or config.get('cleaned_data_csv_path')
    if not csv_path:
        raise SystemExit("No patient CSV given: pass --csv_path or set "
                         "'cleaned_data_csv_path' in the configuration.")

    patient_data = load_patient_data(csv_path)
    random_patient_data = select_random_patient_data(patient_data)

    if random_patient_data:
        print(generate_medical_prompt(random_patient_data, args.diagnosis, config))
    else:
        print("No patient data available.")


if __name__ == '__main__':
    main()
# Example static data, simulating a patient database
PATIENT_DATA = [
    {
        'BIRTHDATE': '1990-01-01',
        'GENDER': 'Male',
        'REASONDESCRIPTION': 'lung cancer',
        'observation': 'increased cough',
        'DESCRIPTION_careplan': 'regular monitoring',
        'modality': 'X-Ray',
        'body_area': 'Chest'
    },
    {
        'BIRTHDATE': '1985-05-15',
        'GENDER': 'Female',
        'REASONDESCRIPTION': '',  # This patient has no diagnosis specified.
        'observation': 'shortness of breath',
        'DESCRIPTION_careplan': 'oxygen therapy',
        'modality': 'CT Scan',
        'body_area': 'Chest'
    },
    # Additional records can be added here.
]


class SyntheaData:
    """
    Simulates data retrieval from a static dataset (PATIENT_DATA).
    """

    # Text reported as the patient's conditions when REASONDESCRIPTION is empty.
    NO_HISTORY = "Nil significant past medical history"

    @staticmethod
    def _normalize(text: Any) -> str:
        """Return text stripped and lower-cased; anything that is not a string becomes ''."""
        return text.strip().lower() if isinstance(text, str) else ""

    def get_patient_data_by_diagnosis(self, diagnosis: str) -> Tuple[Dict[str, Any], Optional[str], Optional[str]]:
        """
        Retrieve a random patient whose REASONDESCRIPTION equals the given diagnosis.

        The comparison ignores case and surrounding whitespace on both sides.
        An empty (or None) diagnosis selects the patients whose REASONDESCRIPTION
        is empty; their conditions are reported as NO_HISTORY.

        Args:
            diagnosis: Diagnosis text to look for.

        Returns:
            A tuple ``(patient_data, modality, body_area)``. When no patient
            matches, ``patient_data`` is a placeholder dict whose fields are None
            (except 'conditions', which is NO_HISTORY) and both ``modality`` and
            ``body_area`` are None.
        """
        wanted = self._normalize(diagnosis)
        # One normalised comparison covers both cases: a real diagnosis, and the
        # empty diagnosis that must match records with an empty REASONDESCRIPTION.
        matching_patients = [
            patient for patient in PATIENT_DATA
            if self._normalize(patient.get('REASONDESCRIPTION')) == wanted
        ]

        if not matching_patients:
            # Default case when no patients match the criteria.
            return ({
                'age': None,
                'gender': None,
                'conditions': self.NO_HISTORY,
                'observations': None,
                'care_plans': None,
                'modality': None,
                'body_area': None
            }, None, None)

        selected_patient = random.choice(matching_patients)

        # Whole years elapsed; one less when this year's birthday is still ahead.
        born = datetime.strptime(selected_patient['BIRTHDATE'], '%Y-%m-%d')
        now = datetime.now()
        age = now.year - born.year - (1 if (now.month, now.day) < (born.month, born.day) else 0)

        reason = selected_patient.get('REASONDESCRIPTION')
        has_reason = isinstance(reason, str) and bool(reason.strip())

        patient_data = {
            'age': age,
            'gender': selected_patient['GENDER'],
            'conditions': reason if has_reason else self.NO_HISTORY,
            'observations': selected_patient['observation'],
            'care_plans': selected_patient['DESCRIPTION_careplan'],
            'modality': selected_patient.get('modality', "Not specified"),
            'body_area': selected_patient.get('body_area', "Not specified")
        }
        return patient_data, selected_patient.get('modality'), selected_patient.get('body_area')
- ) - - prompt = prompt_template.format( - diagnosis=diagnosis, - age=patient_data['age'], - gender=patient_data['gender'], - conditions=', '.join(patient_data['conditions']), - observations=', '.join(patient_data['observations']), - care_plans=', '.join(patient_data['care_plans']), - modality=modality or "Not specified", - body_area=body_area or "Not specified" - ) - return prompt - -def parse_arguments() -> argparse.Namespace: - """ - Parse command line arguments. - """ - parser = argparse.ArgumentParser(description='Medical Report Generator') - parser.add_argument('--config_path', type=str, default='config.yaml', help='Path to the YAML configuration file.') - return parser.parse_args() - -def main() -> None: - """ - Main function to orchestrate the workflow. - """ - args = parse_arguments() - config = load_config(args.config_path) - - synthea_data = SyntheaData(config) - diagnosis = "lung cancer" # Example diagnosis - patient_data, modality, body_area = synthea_data.get_patient_data_by_diagnosis(diagnosis) - - if patient_data: - medical_prompt = generate_medical_prompt(patient_data, diagnosis, modality, body_area) - print(medical_prompt) - # Further processing with LLAMA or other models can be done here - else: - print("No patient data found for the given diagnosis.") - -if __name__ == '__main__': - main() diff --git a/synthea_data.py b/synthea_data.py deleted file mode 100644 index 34e32e8..0000000 --- a/synthea_data.py +++ /dev/null @@ -1,52 +0,0 @@ -import pandas as pd -import sqlite3 -from datetime import datetime -from dateutil.relativedelta import relativedelta -from typing import Dict, Any, Tuple -import random - -class SyntheaData: - """ - Handles operations related to processing and retrieving patient data from a consolidated dataset. - """ - def __init__(self, config: Dict[str, Any]) -> None: - """ - Initializes with configuration settings. 
- """ - self.config = config - - def import_cleaned_data_to_sqlite(self) -> None: - """ - Imports cleaned medical data from a CSV file into an SQLite database. - """ - conn = sqlite3.connect(self.config['database_path']) - df = pd.read_csv(self.config['cleaned_data_csv_path']) - df.to_sql('cleaned_medical_data', conn, if_exists='replace', index=False) - conn.close() - - def get_patient_data_by_diagnosis(self, diagnosis: str) -> Tuple[Dict[str, Any], Optional[str], Optional[str]]: - """ - Retrieves a random patient's data who has been diagnosed with a specified condition. - """ - conn = sqlite3.connect(self.config['database_path']) - query = """ - SELECT * FROM cleaned_medical_data - WHERE description_cond LIKE ? OR reasondescription LIKE ? - """ - df = pd.read_sql_query(query, conn, params=(f'%{diagnosis}%', f'%{diagnosis}%')) - conn.close() - - if not df.empty: - selected_row = df.sample(n=1).iloc[0] - patient_data = { - 'age': relativedelta(datetime.now(), datetime.strptime(selected_row['BIRTHDATE'], '%Y-%m-%d')).years, - 'gender': selected_row['GENDER'], - 'conditions': [selected_row['description_cond']], - 'observations': [selected_row.get('observation', '')], - 'care_plans': [selected_row.get('DESCRIPTION_careplan', '')], - } - modality = selected_row.get('modality', None) - body_area = selected_row.get('body_area', None) - return patient_data, modality, body_area - else: - return {}, None, None