Properly separating samples and drug/omics file generation for beataml and hcmi. Allowing hcmi to use only a single Dockerfile.
PNNL-CompBio · Jan 3, 2024 · 72d56d4 · 72d56d4
1 parent 589a2c6
commit 72d56d4
Show file tree

Hide file tree

Showing 4 changed files with 148 additions and 110 deletions.
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -21,13 +21,17 @@ jobs:
     runs-on: ubuntu-20.04
     environment: build
     steps:
-      #Add upload artifact from CPTAC Samples 
       - name: Checkout  HCMI
         uses: actions/checkout@v2
+      # TODO: replace with an artifact download from the CPTAC samples job when ready.
+      # Temporary fix: copy the checked-in CPTAC samples file into the Docker build context.
+      - name: Copy CPTAC sample from build location.
+        run: |
+          cp build/cptac/cptac_samples.csv .
       - name: Build HCMI Image
-        run: docker build -t hcmi-samples-builder -f build/docker/Dockerfile.hcmi.samples .
-      - name: Run HCMI Container
-        run: docker run --name hcmi-samples-container hcmi-samples-builder
+        run: docker build -t hcmi-samples-builder -f build/docker/Dockerfile.hcmi .
+      - name: Run HCMI samples Container
+        run: docker run --name hcmi-samples-container hcmi-samples-builder python 01-createHCMISamplesFile.py
       - name: Copy file from HCMI Samples Container
         run: |
           mkdir hcmi-samples
@@ -36,11 +40,6 @@ jobs:
           done
       - name: Get HCMI container logs
         run: docker logs hcmi-samples-container 
-      # - name: dummy hcmi
-      #   run: |
-      #     mkdir hcmi-files
-      #     echo "This is some sample text." > hcmi-files/d.txt
-      #     echo "Another line of text." > hcmi-files/c.gz
       - name: Upload artifacts for HCMI
         uses: actions/upload-artifact@v2
         with:
@@ -64,7 +63,8 @@ jobs:
       - name: Build HCMI Image
         run: docker build -t hcmi-builder -f build/docker/Dockerfile.hcmi .
       - name: Run HCMI Container
-        run: docker run --name hcmi-container hcmi-builder
+        run: |
+          docker run --name hcmi-container hcmi-builder /bin/bash -c "python 02-getHCMIData.py -m full_manifest.txt -t transcriptomics -o hcmi_transcriptomics.csv && python 02-getHCMIData.py -m full_manifest.txt -M full_manifest_files -t copy_number -o hcmi_copy_number.csv && python 02-getHCMIData.py -m full_manifest.txt -M full_manifest_files -t mutations -o hcmi_mutations.csv"
       - name: Copy files from HCMI Container
         run: |
           mkdir hcmi-files
@@ -83,10 +83,39 @@ jobs:
         with:
           name: hcmi-files
           path: hcmi-files
-
-  build-beataml:
+  build-beataml-samples:
     if: github.actor != 'github-actions[bot]'
+    runs-on: ubuntu-20.04
+    environment: build
     needs: [build-hcmi-samples]
+    steps:
+      #Add upload artifact from CPTAC Samples 
+      - name: Checkout  BeatAML
+        uses: actions/checkout@v2
+      - name: Download artifacts from HCMI
+        uses: actions/download-artifact@v2
+        with:
+          name: hcmi-samples
+      - name: Build BeatAML Image
+        run: docker build -t beataml-builder -f build/docker/Dockerfile.beataml .
+      - name: Run BeatAML Container with samples option  # bash -c + single quotes: SYNAPSE_TOKEN (set via -e) expands inside the container, not on the runner
+        run: docker run --name beataml-samples-container -e SYNAPSE_TOKEN=${{ secrets.SYNAPSE_TOKEN_SECRET }} beataml-builder /bin/bash -c 'python GetBeatAML.py --token "${SYNAPSE_TOKEN}" --samples'
+      - name: Copy file from BeatAML Samples Container
+        run: |
+          mkdir beataml-samples
+          for file in beataml_samples.csv; do
+            docker cp beataml-samples-container:/usr/src/app/$file beataml-samples/$file
+          done
+      - name: Get BeatAML container logs
+        run: docker logs beataml-samples-container 
+      - name: Upload artifacts for BeatAML
+        uses: actions/upload-artifact@v2
+        with:
+          name: beataml-samples
+          path: beataml-samples
+  build-beataml:
+    if: github.actor != 'github-actions[bot]'
+    needs: [build-beataml-samples]
     runs-on: ubuntu-20.04
     environment: build
     steps:
@@ -95,13 +124,15 @@ jobs:
       - name: Download artifacts from HCMI
         uses: actions/download-artifact@v2
         with:
-          name: hcmi-samples
+          name: beataml-samples
       - name: List files cwd
         run: ls -lah 
       - name: Build BeatAML Image
         run: docker build -t beataml-builder -f build/docker/Dockerfile.beataml .
+      # Dockerfile.beataml no longer defines a CMD, so the command is passed explicitly here.
+      # It runs via bash -c in single quotes so SYNAPSE_TOKEN (set via -e) expands inside the container, not on the runner.
       - name: Run BeatAML Container
-        run: docker run --name beataml-container -e SYNAPSE_TOKEN=${{ secrets.SYNAPSE_TOKEN_SECRET }} beataml-builder
+        run: docker run --name beataml-container -e SYNAPSE_TOKEN=${{ secrets.SYNAPSE_TOKEN_SECRET }} beataml-builder /bin/bash -c 'python GetBeatAML.py --token "${SYNAPSE_TOKEN}"'
       - name: Copy files from BeatAML Container
         run: |
           mkdir beataml-files

diff --git a/build/beatAML/GetBeatAML.py b/build/beatAML/GetBeatAML.py
@@ -508,6 +508,7 @@ def generate_raw_drug_file(original_drug_file, sample_mapping_file, updated_raw_
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description='Process some integers and a string.')
     parser.add_argument('-t', '--token', type=str, help='Synapse Token')
+    parser.add_argument('-s', '--samples', action='store_true', help='Only Run Samples File Generation')
     args = parser.parse_args()
 
     print("Logging into Synapse")
@@ -541,102 +542,104 @@ def generate_raw_drug_file(original_drug_file, sample_mapping_file, updated_raw_
     additional_mapping_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_sample_mapping.xlsx"
     sample_mapping_file = "beataml_waves1to4_sample_mapping.xlsx"
     download_from_github(additional_mapping_url, sample_mapping_file)
-
-    original_drug_file = "beataml_wv1to4_raw_inhibitor_v4_dbgap.txt"
-    original_drug_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_wv1to4_raw_inhibitor_v4_dbgap.txt"
-    download_from_github(original_drug_url, original_drug_file)
-
-    updated_raw_drug_file = "beatAML_drug_raw.tsv"
-
-    drug_path = "beatAML_drug_processed.tsv.0"
-    drug_map_path = retrieve_figshare_data("https://figshare.com/ndownloader/files/43112314?private_link=0ea222d9bd461c756fb0")
-
-    transcriptomics_file = "beataml_waves1to4_norm_exp_dbgap.txt"
-    transcriptomics_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_norm_exp_dbgap.txt"
-    download_from_github(transcriptomics_url, transcriptomics_file)
-
-    mutations_file = "beataml_wes_wv1to4_mutations_dbgap.txt"
-    mutations_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_wes_wv1to4_mutations_dbgap.txt"
-    download_from_github(mutations_url, mutations_file)
-
-    mutation_map_file = "beataml_waves1to4_sample_mapping.xlsx"
-    mutation_map_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_sample_mapping.xlsx"
-    download_from_github(mutation_map_url, mutation_map_file)
-
+
     supplementary_url = 'https://ars.els-cdn.com/content/image/1-s2.0-S1535610822003129-mmc2.xlsx'
     supplimentary_file = '1-s2.0-S1535610822003129-mmc2.xlsx'
     download_from_github(supplementary_url, supplimentary_file)
 
     prev_samples_path = "hcmi_samples.csv"
-
-    #Generate Samples File
-    generate_samples_file(prev_samples_path)
     improve_map_file = "beataml_samples.csv"
 
-    print("Starting Raw Drug File Generation ")
-    # Generate Raw Drugs File to use in Curve fitting algorithm
-    generate_raw_drug_file(original_drug_file,sample_mapping_file, updated_raw_drug_file,supplimentary_file)
-
-    print("Starting Curve Fitting Algorithm")
-    # Run Curve fitting algorithm from scripts directory.
-    # Note the file path to fit_curve.py may need to be changed.
-    command = ['python', 'fit_curve.py' ,'--input', 'beatAML_drug_raw.tsv', '--output', 'beatAML_drug_processed.tsv']
-    result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
-    if result.returncode == 0:
-        print("Curve Fitting executed successfully!")
+    if args.samples:
+        print("Only running Samples File Generation")
+        prev_samples_path = "../hcmi/hcmi_samples.csv"
+        # Samples-only mode: generate just the samples file. NOTE(review): "../hcmi/hcmi_samples.csv" points outside the image's WORKDIR (/usr/src/app), while Dockerfile.beataml copies *_samples.csv next to this script - confirm this path works in the container.
+        generate_samples_file(prev_samples_path)
     else:
-        print("Curve fitting failed.")
-        print("Out:", result.stdout)
-        print("Error:", result.stderr)
-
-    # New Transcriptomics Data
-    print("Starting Transcriptomics Data")
-    t_df = pd.read_csv(transcriptomics_file, sep = '\t')
-    t_df.index = t_df.display_label
-    t_df = t_df.iloc[:, 4:]
-    t_df = t_df.reset_index().rename(columns={'display_label': 'Gene'})
-    t_df = pd.melt(t_df, id_vars=['Gene'], var_name='sample_id', value_name='transcriptomics')
-    t_df = map_and_combine(t_df, "transcriptomics", entrez_map_file, "beataml_samples.csv", sample_mapping_file)
-    t_df = t_df[t_df.entrez_id.notna()]
-    t_df = t_df[["improve_sample_id","transcriptomics","entrez_id","source","study"]]
-    t_df.to_csv("beataml_transcriptomics.csv",index=False)
-
-    # New Proteomics Data
-    print("Starting Proteomics Data")
-    p_df = pd.read_csv("ptrc_ex10_crosstab_global_gene_corrected.txt", sep = '\t')
-    p_df = p_df.reset_index().rename(columns={'index': 'Protein'})
-    p_df = pd.melt(p_df, id_vars=['Protein'], var_name='id', value_name='proteomics')
-    p_df = map_and_combine(p_df, "proteomics", entrez_map_file, improve_map_file, proteomics_map)
-    p_df = p_df[["improve_sample_id","proteomics","entrez_id","source","study"]]
-    p_df.to_csv("beataml_proteomics.csv",index=False)
-
-    # New Mutation Data
-    print("Starting Mutation Data")
-    m_df = pd.read_csv(mutations_file, sep = '\t')
-    m_df = map_and_combine(m_df, "mutations", entrez_map_file, "beataml_samples.csv", mutation_map_file)
-    m_df = m_df[["improve_sample_id","mutations", "entrez_id","variant_classification","source","study"]]
-    m_df.to_csv("beataml_mutations.csv",index=False)
-
-    # Drug and Experiment Data
-    print("Starting Drug Data")
-    drug_map = format_drug_map(drug_map_path)
-    d_df = format_drug_df(drug_path)
-    d_df = update_dataframe_with_pubchem(d_df)
-    d_res = merge_drug_info(d_df, drug_map)
-    d_res = add_improve_id(drug_map, d_res)
-    #Drug Data
-    drug_res = d_res[["improve_drug_id","chem_name","formula","weight","inchikey","canSMILES","isoSMILES"]]
-    drug_res.rename(columns={"inchikey": "inCHIKey"}, inplace=True)
-    drug_res.to_csv("beataml_drugs.tsv",sep="\t", index=False)
-
-    print("Starting Experiment Data")
-    # Experiment Data
-    d_res = d_res.rename(columns={"CELL":"sample_id","AUC":"auc"})
-    exp_res = map_exp_to_improve(d_res,"beataml_samples.csv")
-    exp_res = exp_res[["source","improve_sample_id","improve_drug_id","study","auc","ic50","ec50","ec50se","r2fit","einf","hs","aac1","auc1","dss1"]]
-    exp_res.to_csv("beataml_experiments.csv", index=False)
-    print("Finished Pipeline")
-
-
-
+        print("Generating all drug/omics files")
+        original_drug_file = "beataml_wv1to4_raw_inhibitor_v4_dbgap.txt"
+        original_drug_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_wv1to4_raw_inhibitor_v4_dbgap.txt"
+        download_from_github(original_drug_url, original_drug_file)
+
+        updated_raw_drug_file = "beatAML_drug_raw.tsv"
+
+        drug_path = "beatAML_drug_processed.tsv.0"
+        drug_map_path = retrieve_figshare_data("https://figshare.com/ndownloader/files/43112314?private_link=0ea222d9bd461c756fb0")
+
+        transcriptomics_file = "beataml_waves1to4_norm_exp_dbgap.txt"
+        transcriptomics_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_norm_exp_dbgap.txt"
+        download_from_github(transcriptomics_url, transcriptomics_file)
+
+        mutations_file = "beataml_wes_wv1to4_mutations_dbgap.txt"
+        mutations_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_wes_wv1to4_mutations_dbgap.txt"
+        download_from_github(mutations_url, mutations_file)
 
+        mutation_map_file = "beataml_waves1to4_sample_mapping.xlsx"
+        mutation_map_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_sample_mapping.xlsx"
+        download_from_github(mutation_map_url, mutation_map_file)
+
+
+        print("Starting Raw Drug File Generation ")
+        # Generate Raw Drugs File to use in Curve fitting algorithm
+        generate_raw_drug_file(original_drug_file,sample_mapping_file, updated_raw_drug_file,supplimentary_file)
+
+        print("Starting Curve Fitting Algorithm")
+        # Run Curve fitting algorithm from scripts directory.
+        # Note the file path to fit_curve.py may need to be changed.
+        command = ['python', 'fit_curve.py' ,'--input', 'beatAML_drug_raw.tsv', '--output', 'beatAML_drug_processed.tsv']
+        result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+        if result.returncode == 0:
+            print("Curve Fitting executed successfully!")
+        else:
+            print("Curve fitting failed.")
+            print("Out:", result.stdout)
+            print("Error:", result.stderr)
+
+        # New Transcriptomics Data
+        print("Starting Transcriptomics Data")
+        t_df = pd.read_csv(transcriptomics_file, sep = '\t')
+        t_df.index = t_df.display_label
+        t_df = t_df.iloc[:, 4:]
+        t_df = t_df.reset_index().rename(columns={'display_label': 'Gene'})
+        t_df = pd.melt(t_df, id_vars=['Gene'], var_name='sample_id', value_name='transcriptomics')
+        t_df = map_and_combine(t_df, "transcriptomics", entrez_map_file, "beataml_samples.csv", sample_mapping_file)
+        t_df = t_df[t_df.entrez_id.notna()]
+        t_df = t_df[["improve_sample_id","transcriptomics","entrez_id","source","study"]]
+        t_df.to_csv("beataml_transcriptomics.csv",index=False)
+
+        # New Proteomics Data
+        print("Starting Proteomics Data")
+        p_df = pd.read_csv("ptrc_ex10_crosstab_global_gene_corrected.txt", sep = '\t')
+        p_df = p_df.reset_index().rename(columns={'index': 'Protein'})
+        p_df = pd.melt(p_df, id_vars=['Protein'], var_name='id', value_name='proteomics')
+        p_df = map_and_combine(p_df, "proteomics", entrez_map_file, improve_map_file, proteomics_map)
+        p_df = p_df[["improve_sample_id","proteomics","entrez_id","source","study"]]
+        p_df.to_csv("beataml_proteomics.csv",index=False)
+
+        # New Mutation Data
+        print("Starting Mutation Data")
+        m_df = pd.read_csv(mutations_file, sep = '\t')
+        m_df = map_and_combine(m_df, "mutations", entrez_map_file, "beataml_samples.csv", mutation_map_file)
+        m_df = m_df[["improve_sample_id","mutations", "entrez_id","variant_classification","source","study"]]
+        m_df.to_csv("beataml_mutations.csv",index=False)
+
+        # Drug and Experiment Data
+        print("Starting Drug Data")
+        drug_map = format_drug_map(drug_map_path)
+        d_df = format_drug_df(drug_path)
+        d_df = update_dataframe_with_pubchem(d_df)
+        d_res = merge_drug_info(d_df, drug_map)
+        d_res = add_improve_id(drug_map, d_res)
+        #Drug Data
+        drug_res = d_res[["improve_drug_id","chem_name","formula","weight","inchikey","canSMILES","isoSMILES"]]
+        drug_res.rename(columns={"inchikey": "inCHIKey"}, inplace=True)
+        drug_res.to_csv("beataml_drugs.tsv",sep="\t", index=False)
+
+        print("Starting Experiment Data")
+        # Experiment Data
+        d_res = d_res.rename(columns={"CELL":"sample_id","AUC":"auc"})
+        exp_res = map_exp_to_improve(d_res,"beataml_samples.csv")
+        exp_res = exp_res[["source","improve_sample_id","improve_drug_id","study","auc","ic50","ec50","ec50se","r2fit","einf","hs","aac1","auc1","dss1"]]
+        exp_res.to_csv("beataml_experiments.csv", index=False)
+        print("Finished Pipeline")
+
diff --git a/build/docker/Dockerfile.beataml b/build/docker/Dockerfile.beataml
@@ -2,11 +2,11 @@ FROM --platform=linux/x86_64 python:3.9
 
 WORKDIR /usr/src/app
 
-COPY hcmi_samples.csv .
+COPY *_samples.csv .
 COPY build/beatAML/GetBeatAML.py . 
 COPY build/utils/fit_curve.py .
 
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
 
-CMD python GetBeatAML.py --token ${SYNAPSE_TOKEN}
+# CMD python GetBeatAML.py --token ${SYNAPSE_TOKEN}
diff --git a/build/docker/Dockerfile.hcmi b/build/docker/Dockerfile.hcmi
@@ -2,13 +2,17 @@ FROM --platform=linux/x86_64 python:3.9
 
 WORKDIR /usr/src/app
 
+COPY build/hcmi/01-createHCMISamplesFile.py .
 COPY build/hcmi/02-getHCMIData.py .
 COPY build/hcmi/full_manifest.txt .
-COPY hcmi_samples.csv .
+
+COPY *_samples.csv .
+# COPY build/cptac/cptac_samples.csv .
 
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
 
-CMD python 02-getHCMIData.py -m full_manifest.txt -t transcriptomics -o hcmi_transcriptomics.csv && \
-    python 02-getHCMIData.py -m full_manifest.txt -M full_manifest_files -t copy_number -o hcmi_copy_number.csv && \
-    python 02-getHCMIData.py -m full_manifest.txt -M full_manifest_files -t mutations -o hcmi_mutations.csv
+# CMD python 01-createHCMISamplesFile.py && \
+#     python 02-getHCMIData.py -m full_manifest.txt -t transcriptomics -o hcmi_transcriptomics.csv && \
+#     python 02-getHCMIData.py -m full_manifest.txt -M full_manifest_files -t copy_number -o hcmi_copy_number.csv && \
+#     python 02-getHCMIData.py -m full_manifest.txt -M full_manifest_files -t mutations -o hcmi_mutations.csv