run vista3d dataset creation

Project-MONAI · Sep 23, 2024 · 392065d · 392065d
1 parent c810abc
commit 392065d
Show file tree

Hide file tree

Showing 7 changed files with 353 additions and 1 deletion.
diff --git a/.gitignore b/.gitignore
@@ -85,3 +85,6 @@ venv.bak/
 
 # VSCode
 .vscode/
+
+# Data files
+data/
diff --git a/monai_vila2d/data_prepare/data_utils.py b/monai_vila2d/data_prepare/data_utils.py
@@ -34,10 +34,12 @@
 )
 
 
-def read_txt(filename):
+def read_txt(filename, ignore_newline=True):
     assert ".txt" in filename
     with open(filename, "r") as f:
         data = f.readlines()
+    if ignore_newline:
+        data = [d.replace("\n", "") for d in data]
     return data
 
 

diff --git a/monai_vila2d/data_prepare/experts/expert_train_data_vista3d.py b/monai_vila2d/data_prepare/experts/expert_train_data_vista3d.py
@@ -0,0 +1,246 @@
+import os
+from data_utils import read_txt, read_json, write_json
+import uuid
+from tqdm import tqdm
+import random
+random.seed(0)
+
+from expert_utils import model_list
+assert isinstance(model_list, str)
+
+root_dir = "./"
+
+n = 100_000
+test_frac = 0.5
+# TODO: upsample tumors
+
+
+def get_qa_pairs(qas):
+    qa_pairs = []
+    while qas.find("Q:") and qas.find("A:"):
+        _start = qas.find("Q:")
+        qas = qas[_start::]
+        _end = qas.find("\n")
+        question = qas[qas.find("Q:") + 3:_end]
+        qas = qas[_end + 1::]
+        _end = qas.find("\n")
+        answer = qas[qas.find("A:") + 3:_end]
+        qas = qas[_end::]
+        #print(question, answer)
+        assert "Q:" not in answer
+        assert "A:" not in question
+        if len(question) == 0 or len(answer) == 0:
+            break
+        qa_pairs.append((question, answer))
+
+    assert len(qa_pairs) > 0
+    return qa_pairs
+
+
+def get_questions(reply):
+    questions = []
+    lines = reply.split("\n")
+    for l in lines:
+        if len(l) > 4:
+            if "." == l[1]:  # "e.g., '1.'"
+                question = l[3::]
+                questions.append(question)
+
+    assert len(questions) > 0
+    return questions
+
+
+def parse_qas(seg_qas_raw_file, lesions_q_raw_file, how_many_q_raw_file):
+    seg_qas_raw = read_json(seg_qas_raw_file)
+    lesions_q_raw = read_json(lesions_q_raw_file)
+    how_many_q_raw = read_json(how_many_q_raw_file)
+
+    # segmentation qa
+    seg_qas = {}
+    for v in seg_qas_raw:
+        qas = get_qa_pairs(v["reply"])
+        seg_qas[v["object_type"]] = {
+            "reply": v["reply"],
+            "exp_model": v["exp_model"],
+            "qas": qas
+        }
+        print(f"Added {len(qas)} QA pairs for {v['object_type']}")
+
+    lesions_qs = {}
+    for v in lesions_q_raw:
+        qs = get_questions(v["reply"])
+        lesions_qs[v["tumor"]] = {
+            "reply": v["reply"],
+            "tumor": v["tumor"],
+            "questions": qs
+        }
+        print(f"Added {len(qs)} lesion questions for {v['tumor']}")
+
+    how_many_qs = {}
+    for v in how_many_q_raw:
+        qs = get_questions(v["reply"])
+        how_many_qs[v["tumor"]] = {
+            "reply": v["reply"],
+            "tumor": v["tumor"],
+            "questions": qs
+        }
+        print(f"Added {len(qs)} how many questions for {v['tumor']}")
+
+    return seg_qas, lesions_qs, how_many_qs
+
+
+def read_meta_files(root, datasets):
+    assert isinstance(root, str)
+    assert isinstance(datasets, list)
+    meta_files = [os.path.join(root, ds, "extracted_slices_meta.json")for ds in datasets]
+
+    meta = []
+    out_datasets = []
+    counts = []
+    for meta_file, ds in zip(meta_files, datasets):
+        data = read_json(meta_file)
+        counts.append(len(data))
+        meta.extend(data)
+        out_datasets.extend(len(data) * [ds])
+
+    print(f"Joined {counts} meta entries. Total {len(meta)}.")
+    return meta, out_datasets
+
+
+def find_image(images, image, dataset):
+    assert isinstance(images, list)
+    assert isinstance(image, str)
+    assert isinstance(dataset, str)
+    assert len(images) > 0
+    for img in images:
+        if dataset in img:
+            if image in img:
+                return img
+    raise ValueError(f"Did not find a matching image for {image} and dataset {dataset}")
+
+
+def main():
+
+    images = read_txt("./vista3d/ct2D_vista3d_images.txt")
+    assert n < len(images)
+
+    incl_ds = ["Task03", "Task07", "Task09", "TotalSegmentatorV2"]
+    #incl_ds = ["Task03", "Task09"]
+    meta, datasets = read_meta_files(root="../experts/vista3d", datasets=incl_ds)
+
+    out_fileprefix = "../../data/experts/vista3d/llama_gen_expert_data_vista3d_what"
+
+    # TODO: add tumor questions
+
+    what_questions = read_txt("./llama_output/llama_gen_expert_what.txt")
+
+    # convert raw to dict
+    seg_qas, lesions_qs, how_many_qs = parse_qas("./llama_output/llama_gen_expert_qa_vista3d.json",
+                                                 "./llama_output/llama_gen_expert_qa_lesions.json",
+                                                 "./llama_output/llama_gen_expert_qa_how_many.json")
+
+    meta_ds = [(m, d) for m, d in zip(meta, datasets)]
+    meta_ds = random.sample(meta_ds, k=n)
+
+    all_conversations = []
+    n_neg_tumors, n_pos_tumors, n_seg, n_what = 0, 0, 0, 0
+    for md in tqdm(meta_ds, desc="creating train data..."):
+        m, ds = md[0], md[1]
+        image = find_image(images, m["image"], ds).replace(root_dir, "").replace("\n", "")
+        label = image.replace("_img.png", "_label.png")
+        group_name = m["group_name"]
+
+        id = str(uuid.uuid4())
+
+        entry = {
+            "image": image,
+            "id": id
+        }
+
+        if "tumor" in group_name or "lesion" in group_name:
+            # tumor task
+            if group_name in lesions_qs:
+                les_qs = lesions_qs[group_name]
+            else:
+                les_qs = lesions_qs[group_name+"s"]
+
+            question = random.choice(les_qs["questions"])
+
+            conv = list()
+            conv.append({"from": "human", "value": model_list + f" <image>This is a CT image.\n" + question})
+            conv.append({"from": "gpt", "value": f"This looks like a CT image. Let me trigger <VISTA3D({group_name})>. "})
+            conv.append({"from": "human", "value": f"The results are <segmentation>. The colors in this image describe {m['label_colors']}. Use this result to respond to this prompt:\n{question}."})
+            if len(m["num_tumors"]) > 0:
+                n_pos_tumors += 1
+                answer = "yes"
+                conv.append({"from": "gpt", "value": answer})
+                how_many = random.choice(how_many_qs["Any"]["questions"])
+                conv.append({"from": "human", "value": how_many})
+                answers = []
+                for t, nt in zip(m["found_tumor"], m["num_tumors"]):
+                    add_s = False
+                    if nt > 1:
+                        if not t.endswith("s"):
+                            add_s = True
+                    if add_s:
+                        answer = f"There are {nt} {t}s."
+                    else:
+                        answer = f"There are {nt} {t}."
+                    answers.append(answer)
+                    if len(answers) > 1:
+                        answers = " ".join(answers)
+                    else:
+                        answers = answers[0]
+                conv.append({"from": "gpt", "value": answers})
+
+                # TODO: Follow-up with what type of tumor question
+            else:
+                n_neg_tumors += 1
+                answer = "no"
+                conv.append({"from": "gpt", "value": answer})
+
+            entry["segmentation"] = label
+        else:  # segmentation or what is task
+            segment_task = True if random.random() > 0.5 else False
+            if segment_task:
+                # segmentation task
+                n_seg += 1
+                seg_qa = seg_qas[group_name]
+                qa = random.choice(seg_qa["qas"])
+                question, answer = qa[0], qa[1]
+
+                conv = list()
+                conv.append({"from": "human", "value": model_list + f" <image>This is a CT image.\n" + question})
+                conv.append({"from": "gpt", "value": answer})
+            else:
+                n_what += 1
+                question = random.choice(what_questions)
+                conv = list()
+                conv.append({"from": "human", "value": model_list + f" <image>This is a CT image.\n" + question})
+                conv.append({"from": "gpt", "value": f"This looks like a CT image. Let me trigger <VISTA3D({group_name})>. "})
+                conv.append({"from": "human", "value": f"The results are <segmentation>. The colors in this image describe {m['label_colors']}. Use this result to respond to this prompt:\n{question}."})
+                conv.append({"from": "gpt", "value": f"This a CT image. It contains several anatomical structures such as identified by VISTA3D: {m['label_colors']}."})
+
+        entry["conversations"] = conv
+
+        all_conversations.append(entry)
+
+    print(f"Converted {len(all_conversations)} conversations")
+
+    out_train_file = out_fileprefix + "_train.json"
+    out_test_file = out_fileprefix + "_test.json"
+
+    split_idx = int(test_frac*len(all_conversations))
+
+    random.shuffle(all_conversations)
+    test_conversations = all_conversations[0:split_idx]
+    train_conversations = all_conversations[split_idx::]
+
+    write_json(train_conversations, out_train_file)
+    write_json(test_conversations, out_test_file)
+
+    print(f"Saved neg tumors: {n_neg_tumors}, pos tumors: {n_pos_tumors}, seg: {n_seg}, what: {n_what}, total: {len(all_conversations)}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/monai_vila2d/data_prepare/experts/llama_output/llama_gen_expert_qa_how_many.json b/monai_vila2d/data_prepare/experts/llama_output/llama_gen_expert_qa_how_many.json
@@ -0,0 +1,7 @@
+[
+    {
+        "reply": "Here are some rephrased versions of the question \"How many are there?\" in a biomedical professional, precise, and accurate manner, with a mix of polite and command forms:\n\n**Polite Forms:**\n\n1. Can you please quantify the number of lesions present?\n2. Would you be able to tell me the total count of abnormalities in this image?\n3. Could you kindly provide the number of structures visible in this scan?\n4. May I request the quantity of findings in this medical image?\n5. Can you help me determine the number of affected areas?\n\n**Command Forms:**\n\n1. Quantify the number of lesions present in this image.\n2. Report the total count of abnormalities in this scan.\n3. Provide the exact number of structures visible in this medical image.\n4. Specify the quantity of findings in this radiograph.\n5. Determine the number of affected areas in this patient's scan.\n\n**Additional Variations:**\n\n1. What is the total number of [condition/abnormality] present in this image?\n2. How many [condition/abnormality] can be seen in this scan?\n3. What is the count of [condition/abnormality] in this medical image?\n4. Can you give me an exact number of [condition/abnormality] in this patient's scan?\n5. What is the number of [condition/abnormality] visible in this radiograph?\n\nThese rephrased questions aim to provide a diverse set of queries that can help train your multimodal model for medical visual question answering. Feel free to adjust or modify them as needed to fit your specific requirements.",
+        "question": "How many are there?",
+        "tumor": "Any"
+    }
+]
diff --git a/monai_vila2d/data_prepare/experts/llama_output/llama_gen_expert_qa_lesions.json b/monai_vila2d/data_prepare/experts/llama_output/llama_gen_expert_qa_lesions.json
@@ -0,0 +1,22 @@
+[
+    {
+        "reply": "Here are some rephrased versions of the question \"Are there any hepatic tumors?\" in a biomedical professional, precise, and accurate manner, with a mix of polite and command forms:\n\n**Polite Forms:**\n\n1. Can you identify any liver masses or tumors?\n2. Are hepatic lesions or tumors present in this case?\n3. Would you please check for any liver tumors or abnormalities?\n4. Could you determine if there are any hepatic malignancies?\n5. Are there any notable liver nodules or tumors visible?\n\n**Command Forms:**\n\n1. Identify all hepatic tumors or lesions.\n2. Determine the presence of liver cancer or tumors.\n3. Detect any liver abnormalities, including tumors.\n4. Check for hepatic metastases or primary tumors.\n5. Confirm the absence or presence of liver tumors.\n\n**Additional Variations:**\n\n1. Is there evidence of hepatic neoplasia?\n2. Are there any focal liver lesions suspicious for tumor?\n3. Can you rule out liver cancer or tumors?\n4. Are there any abnormalities in the liver parenchyma suggestive of tumor?\n5. Is there a presence of liver tumor or cyst?\n\n**Using Synonyms and Related Terms:**\n\n1. Are there any liver carcinomas or sarcomas?\n2. Can you identify any hepatic adenomas or hemangiomas?\n3. Are there any benign or malignant liver lesions?\n4. Can you detect any liver metastases from other primary cancers?\n5. Are there any signs of liver cancer, such as hepatocellular carcinoma?\n\nThese rephrased questions should provide a good starting point for generating a diverse set of questions for your multimodal model. Remember to consider various medical conditions, use precise medical terminology, and include a mix of polite and command forms to simulate real-world clinical scenarios.",
+        "question": "Are there any hepatic tumors?",
+        "tumor": "hepatic tumors"
+    },
+    {
+        "reply": "Here are some rephrased versions of the question \"Are there any pancreatic tumors?\" in a biomedical professional, precise, and accurate manner, with a mix of polite and command forms:\n\n**Polite Forms**\n\n1. Can you identify any pancreatic tumors in this image?\n2. Could you please check for the presence of pancreatic tumors?\n3. Would you be able to detect any tumors in the pancreas?\n4. Is there evidence of pancreatic tumors in this scan?\n5. Can you confirm the absence or presence of pancreatic tumors?\n6. Would you kindly look for any pancreatic tumors in this image?\n7. Can you tell me if there are any pancreatic tumors visible?\n8. Is it possible to identify any pancreatic tumors in this study?\n\n**Command Forms**\n\n1. Identify any pancreatic tumors in this image.\n2. Check for the presence of pancreatic tumors.\n3. Detect any tumors in the pancreas.\n4. Report any evidence of pancreatic tumors in this scan.\n5. Confirm the presence or absence of pancreatic tumors.\n6. Look for any pancreatic tumors in this image.\n7. Determine if there are any pancreatic tumors visible.\n8. Highlight any pancreatic tumors in this study.\n\n**Additional Variations**\n\n1. Are there any masses or lesions in the pancreas that could indicate a tumor?\n2. Is there any indication of pancreatic cancer or neoplasia?\n3. Can you identify any abnormal cell growth in the pancreas?\n4. Are there any signs of pancreatic adenocarcinoma or other types of pancreatic tumors?\n5. Can you detect any pancreatic neuroendocrine tumors (PNETs)?\n\nThese rephrased questions should provide a good starting point for generating a diverse set of questions for your multimodal model. Remember to consider various medical conditions, imaging modalities, and clinical scenarios to create a comprehensive dataset.",
+        "question": "Are there any pancreatic tumors?",
+        "tumor": "pancreatic tumors"
+    },
+    {
+        "reply": "Here are some rephrased versions of the question \"Are there any lung tumors?\" in a biomedical professional, precise, and accurate manner, with a mix of polite and command forms:\n\n**Polite Forms:**\n\n1. Can you identify any lung tumors in this image?\n2. Would you please check for the presence of lung tumors?\n3. Could you determine if there are any lung tumors visible?\n4. May I ask if lung tumors are present in this case?\n5. Would you kindly assess for lung tumors in this patient?\n\n**Command Forms:**\n\n1. Identify any lung tumors in this image.\n2. Check for the presence of lung tumors.\n3. Determine if there are any lung tumors visible.\n4. Assess for lung tumors in this patient.\n5. Detect any lung tumors in this radiograph.\n\n**Additional Variations:**\n\n1. Are there any masses or nodules in the lungs that could indicate a tumor?\n2. Is there evidence of lung cancer or tumor growth?\n3. Can you rule out the presence of lung tumors?\n4. Are there any abnormal cell growths or lesions in the lungs?\n5. Is there a tumor or neoplasm present in the lung tissue?\n\n**Using Technical Terms:**\n\n1. Are there any pulmonary neoplasms or malignancies present?\n2. Can you identify any lung lesions or masses that may be indicative of a tumor?\n3. Is there evidence of bronchogenic carcinoma or other lung cancers?\n4. Are there any abnormalities in the lung parenchyma that could suggest a tumor?\n5. Can you detect any thoracic tumors or cancers?\n\nThese rephrased questions should provide a good starting point for generating a diverse set of questions for your multimodal model. Remember to consider various linguistic and medical nuances to ensure that your model is trained on a comprehensive and accurate dataset.",
+        "question": "Are there any lung tumors?",
+        "tumor": "lung tumors"
+    },
+    {
+        "reply": "Here are some rephrased versions of the question \"Are there any bone lesions?\" in a biomedical professional, precise, and accurate manner, with a mix of polite and command forms:\n\n**Polite Forms:**\n\n1. Can you identify any bone lesions in this image?\n2. Would you please check for the presence of bone lesions?\n3. Could you kindly look for any bone abnormalities or lesions?\n4. May I request that you examine the image for bone lesions?\n5. Would you be so kind as to determine if there are any bone lesions present?\n\n**Command Forms:**\n\n1. Identify all bone lesions in this image.\n2. Check for bone lesions and report any findings.\n3. Examine the image for the presence of bone abnormalities or lesions.\n4. Determine if there are any bone lesions present in this image.\n5. Detect and describe any bone lesions in the image.\n\n**Alternative Phrasings:**\n\n1. Are there any abnormalities in the bone structure?\n2. Is there evidence of bone pathology?\n3. Can you detect any bone tumors or cysts?\n4. Are there any signs of bone infection or inflammation?\n5. Can you identify any bone fractures or defects?\n\n**More Specific Phrasings:**\n\n1. Are there any lytic or blastic bone lesions present?\n2. Can you identify any bone metastases?\n3. Are there any signs of osteomyelitis or bone abscess?\n4. Can you detect any bone cysts or tumors, such as osteosarcoma?\n5. Are there any degenerative changes in the bone, such as osteoarthritis?\n\nThese rephrased questions should provide a good starting point for generating a diverse set of questions for your multimodal model. Remember to consider the specific medical condition, imaging modality, and relevant clinical information when generating questions to ensure they are accurate and relevant.",
+        "question": "Are there any bone lesions?",
+        "tumor": "bone lesions"
+    }
+]