Skip to content

Commit

Permalink
Properly Separating samples and drug omics file generation for beataml and hcmi. Allowing hcmi to only use a single dockerfile.
Browse files Browse the repository at this point in the history
  • Loading branch information
jjacobson95 committed Jan 3, 2024
1 parent 589a2c6 commit 72d56d4
Show file tree
Hide file tree
Showing 4 changed files with 148 additions and 110 deletions.
59 changes: 45 additions & 14 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,17 @@ jobs:
runs-on: ubuntu-20.04
environment: build
steps:
#Add upload artifact from CPTAC Samples
- name: Checkout HCMI
uses: actions/checkout@v2
# TODO: add the artifact upload from CPTAC Samples when it is ready.
# Temporary fix: the next step copies cptac_samples.csv from build/cptac instead.
- name: Copy CPTAC sample from build location.
run: |
cp build/cptac/cptac_samples.csv .
- name: Build HCMI Image
run: docker build -t hcmi-samples-builder -f build/docker/Dockerfile.hcmi.samples .
- name: Run HCMI Container
run: docker run --name hcmi-samples-container hcmi-samples-builder
run: docker build -t hcmi-samples-builder -f build/docker/Dockerfile.hcmi .
- name: Run HCMI samples Container
run: docker run --name hcmi-samples-container hcmi-samples-builder python 01-createHCMISamplesFile.py
- name: Copy file from HCMI Samples Container
run: |
mkdir hcmi-samples
Expand All @@ -36,11 +40,6 @@ jobs:
done
- name: Get HCMI container logs
run: docker logs hcmi-samples-container
# - name: dummy hcmi
# run: |
# mkdir hcmi-files
# echo "This is some sample text." > hcmi-files/d.txt
# echo "Another line of text." > hcmi-files/c.gz
- name: Upload artifacts for HCMI
uses: actions/upload-artifact@v2
with:
Expand All @@ -64,7 +63,8 @@ jobs:
- name: Build HCMI Image
run: docker build -t hcmi-builder -f build/docker/Dockerfile.hcmi .
- name: Run HCMI Container
run: docker run --name hcmi-container hcmi-builder
run: |
docker run --name hcmi-container hcmi-builder /bin/bash -c "python 02-getHCMIData.py -m full_manifest.txt -t transcriptomics -o hcmi_transcriptomics.csv && python 02-getHCMIData.py -m full_manifest.txt -M full_manifest_files -t copy_number -o hcmi_copy_number.csv && python 02-getHCMIData.py -m full_manifest.txt -M full_manifest_files -t mutations -o hcmi_mutations.csv"
- name: Copy files from HCMI Container
run: |
mkdir hcmi-files
Expand All @@ -83,10 +83,39 @@ jobs:
with:
name: hcmi-files
path: hcmi-files

build-beataml:
# Builds the BeatAML image, runs GetBeatAML.py with --samples inside it, and
# publishes the resulting beataml_samples.csv as the "beataml-samples" artifact.
# Runs after build-hcmi-samples: the "hcmi-samples" artifact is downloaded into
# the workspace first so the Dockerfile's `COPY *_samples.csv .` can pick it up.
build-beataml-samples:
  if: github.actor != 'github-actions[bot]'
  runs-on: ubuntu-20.04
  environment: build
  needs: [build-hcmi-samples]
  steps:
    # Add upload artifact from CPTAC Samples
    - name: Checkout BeatAML
      uses: actions/checkout@v2
    - name: Download artifacts from HCMI
      uses: actions/download-artifact@v2
      with:
        name: hcmi-samples
    - name: Build BeatAML Image
      run: docker build -t beataml-builder -f build/docker/Dockerfile.beataml .
    - name: Run BeatAML Container with samples option
      # The secret is exposed to this step as an environment variable and
      # forwarded into the container by name (`-e SYNAPSE_TOKEN`).
      env:
        SYNAPSE_TOKEN: ${{ secrets.SYNAPSE_TOKEN_SECRET }}
      # The command is wrapped in single-quoted `sh -c` so that $SYNAPSE_TOKEN
      # is expanded by the shell inside the container. Written unquoted on the
      # docker command line it would be expanded by the runner's shell, where
      # `-e` has no effect, leaving `--token` without a value.
      run: |
        docker run --name beataml-samples-container -e SYNAPSE_TOKEN beataml-builder \
          sh -c 'python GetBeatAML.py --token "$SYNAPSE_TOKEN" --samples'
    - name: Copy file from BeatAML Samples Container
      run: |
        mkdir beataml-samples
        for file in beataml_samples.csv; do
          docker cp beataml-samples-container:/usr/src/app/$file beataml-samples/$file
        done
    - name: Get BeatAML container logs
      run: docker logs beataml-samples-container
    - name: Upload artifacts for BeatAML
      uses: actions/upload-artifact@v2
      with:
        name: beataml-samples
        path: beataml-samples
build-beataml:
if: github.actor != 'github-actions[bot]'
needs: [build-beataml-samples]
runs-on: ubuntu-20.04
environment: build
steps:
Expand All @@ -95,13 +124,15 @@ jobs:
- name: Download artifacts from HCMI
uses: actions/download-artifact@v2
with:
name: hcmi-samples
name: beataml-samples
- name: List files cwd
run: ls -lah
- name: Build BeatAML Image
run: docker build -t beataml-builder -f build/docker/Dockerfile.beataml .
# - name: Run BeatAML Container
# run: docker run --name beataml-container -e SYNAPSE_TOKEN=${{ secrets.SYNAPSE_TOKEN_SECRET }} beataml-builder
- name: Run BeatAML Container
run: docker run --name beataml-container -e SYNAPSE_TOKEN=${{ secrets.SYNAPSE_TOKEN_SECRET }} beataml-builder
run: docker run --name beataml-container -e SYNAPSE_TOKEN=${{ secrets.SYNAPSE_TOKEN_SECRET }} beataml-builder python GetBeatAML.py --token ${SYNAPSE_TOKEN}
- name: Copy files from BeatAML Container
run: |
mkdir beataml-files
Expand Down
183 changes: 93 additions & 90 deletions build/beatAML/GetBeatAML.py
Original file line number Diff line number Diff line change
Expand Up @@ -508,6 +508,7 @@ def generate_raw_drug_file(original_drug_file, sample_mapping_file, updated_raw_
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Process some integers and a string.')
parser.add_argument('-t', '--token', type=str, help='Synapse Token')
parser.add_argument('-s', '--samples', action='store_true', help='Only Run Samples File Generation')
args = parser.parse_args()

print("Logging into Synapse")
Expand Down Expand Up @@ -541,102 +542,104 @@ def generate_raw_drug_file(original_drug_file, sample_mapping_file, updated_raw_
additional_mapping_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_sample_mapping.xlsx"
sample_mapping_file = "beataml_waves1to4_sample_mapping.xlsx"
download_from_github(additional_mapping_url, sample_mapping_file)

original_drug_file = "beataml_wv1to4_raw_inhibitor_v4_dbgap.txt"
original_drug_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_wv1to4_raw_inhibitor_v4_dbgap.txt"
download_from_github(original_drug_url, original_drug_file)

updated_raw_drug_file = "beatAML_drug_raw.tsv"

drug_path = "beatAML_drug_processed.tsv.0"
drug_map_path = retrieve_figshare_data("https://figshare.com/ndownloader/files/43112314?private_link=0ea222d9bd461c756fb0")

transcriptomics_file = "beataml_waves1to4_norm_exp_dbgap.txt"
transcriptomics_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_norm_exp_dbgap.txt"
download_from_github(transcriptomics_url, transcriptomics_file)

mutations_file = "beataml_wes_wv1to4_mutations_dbgap.txt"
mutations_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_wes_wv1to4_mutations_dbgap.txt"
download_from_github(mutations_url, mutations_file)

mutation_map_file = "beataml_waves1to4_sample_mapping.xlsx"
mutation_map_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_sample_mapping.xlsx"
download_from_github(mutation_map_url, mutation_map_file)


supplementary_url = 'https://ars.els-cdn.com/content/image/1-s2.0-S1535610822003129-mmc2.xlsx'
supplimentary_file = '1-s2.0-S1535610822003129-mmc2.xlsx'
download_from_github(supplementary_url, supplimentary_file)

prev_samples_path = "hcmi_samples.csv"

#Generate Samples File
generate_samples_file(prev_samples_path)
improve_map_file = "beataml_samples.csv"

print("Starting Raw Drug File Generation ")
# Generate Raw Drugs File to use in Curve fitting algorithm
generate_raw_drug_file(original_drug_file,sample_mapping_file, updated_raw_drug_file,supplimentary_file)

print("Starting Curve Fitting Algorithm")
# Run Curve fitting algorithm from scripts directory.
# Note the file path to fit_curve.py may need to be changed.
command = ['python', 'fit_curve.py' ,'--input', 'beatAML_drug_raw.tsv', '--output', 'beatAML_drug_processed.tsv']
result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
if result.returncode == 0:
print("Curve Fitting executed successfully!")
if args.samples:
print("Only running Samples File Generation")
prev_samples_path = "../hcmi/hcmi_samples.csv"
#Generate Samples File
generate_samples_file(prev_samples_path)
else:
print("Curve fitting failed.")
print("Out:", result.stdout)
print("Error:", result.stderr)

# New Transcriptomics Data
print("Starting Transcriptomics Data")
t_df = pd.read_csv(transcriptomics_file, sep = '\t')
t_df.index = t_df.display_label
t_df = t_df.iloc[:, 4:]
t_df = t_df.reset_index().rename(columns={'display_label': 'Gene'})
t_df = pd.melt(t_df, id_vars=['Gene'], var_name='sample_id', value_name='transcriptomics')
t_df = map_and_combine(t_df, "transcriptomics", entrez_map_file, "beataml_samples.csv", sample_mapping_file)
t_df = t_df[t_df.entrez_id.notna()]
t_df = t_df[["improve_sample_id","transcriptomics","entrez_id","source","study"]]
t_df.to_csv("beataml_transcriptomics.csv",index=False)

# New Proteomics Data
print("Starting Proteomics Data")
p_df = pd.read_csv("ptrc_ex10_crosstab_global_gene_corrected.txt", sep = '\t')
p_df = p_df.reset_index().rename(columns={'index': 'Protein'})
p_df = pd.melt(p_df, id_vars=['Protein'], var_name='id', value_name='proteomics')
p_df = map_and_combine(p_df, "proteomics", entrez_map_file, improve_map_file, proteomics_map)
p_df = p_df[["improve_sample_id","proteomics","entrez_id","source","study"]]
p_df.to_csv("beataml_proteomics.csv",index=False)

# New Mutation Data
print("Starting Mutation Data")
m_df = pd.read_csv(mutations_file, sep = '\t')
m_df = map_and_combine(m_df, "mutations", entrez_map_file, "beataml_samples.csv", mutation_map_file)
m_df = m_df[["improve_sample_id","mutations", "entrez_id","variant_classification","source","study"]]
m_df.to_csv("beataml_mutations.csv",index=False)

# Drug and Experiment Data
print("Starting Drug Data")
drug_map = format_drug_map(drug_map_path)
d_df = format_drug_df(drug_path)
d_df = update_dataframe_with_pubchem(d_df)
d_res = merge_drug_info(d_df, drug_map)
d_res = add_improve_id(drug_map, d_res)
#Drug Data
drug_res = d_res[["improve_drug_id","chem_name","formula","weight","inchikey","canSMILES","isoSMILES"]]
drug_res.rename(columns={"inchikey": "inCHIKey"}, inplace=True)
drug_res.to_csv("beataml_drugs.tsv",sep="\t", index=False)

print("Starting Experiment Data")
# Experiment Data
d_res = d_res.rename(columns={"CELL":"sample_id","AUC":"auc"})
exp_res = map_exp_to_improve(d_res,"beataml_samples.csv")
exp_res = exp_res[["source","improve_sample_id","improve_drug_id","study","auc","ic50","ec50","ec50se","r2fit","einf","hs","aac1","auc1","dss1"]]
exp_res.to_csv("beataml_experiments.csv", index=False)
print("Finished Pipeline")



print("Generating all drug/omics files")
original_drug_file = "beataml_wv1to4_raw_inhibitor_v4_dbgap.txt"
original_drug_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_wv1to4_raw_inhibitor_v4_dbgap.txt"
download_from_github(original_drug_url, original_drug_file)

updated_raw_drug_file = "beatAML_drug_raw.tsv"

drug_path = "beatAML_drug_processed.tsv.0"
drug_map_path = retrieve_figshare_data("https://figshare.com/ndownloader/files/43112314?private_link=0ea222d9bd461c756fb0")

transcriptomics_file = "beataml_waves1to4_norm_exp_dbgap.txt"
transcriptomics_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_norm_exp_dbgap.txt"
download_from_github(transcriptomics_url, transcriptomics_file)

mutations_file = "beataml_wes_wv1to4_mutations_dbgap.txt"
mutations_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_wes_wv1to4_mutations_dbgap.txt"
download_from_github(mutations_url, mutations_file)

mutation_map_file = "beataml_waves1to4_sample_mapping.xlsx"
mutation_map_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_sample_mapping.xlsx"
download_from_github(mutation_map_url, mutation_map_file)


print("Starting Raw Drug File Generation ")
# Generate Raw Drugs File to use in Curve fitting algorithm
generate_raw_drug_file(original_drug_file,sample_mapping_file, updated_raw_drug_file,supplimentary_file)

print("Starting Curve Fitting Algorithm")
# Run Curve fitting algorithm from scripts directory.
# Note the file path to fit_curve.py may need to be changed.
command = ['python', 'fit_curve.py' ,'--input', 'beatAML_drug_raw.tsv', '--output', 'beatAML_drug_processed.tsv']
result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
if result.returncode == 0:
print("Curve Fitting executed successfully!")
else:
print("Curve fitting failed.")
print("Out:", result.stdout)
print("Error:", result.stderr)

# New Transcriptomics Data
print("Starting Transcriptomics Data")
t_df = pd.read_csv(transcriptomics_file, sep = '\t')
t_df.index = t_df.display_label
t_df = t_df.iloc[:, 4:]
t_df = t_df.reset_index().rename(columns={'display_label': 'Gene'})
t_df = pd.melt(t_df, id_vars=['Gene'], var_name='sample_id', value_name='transcriptomics')
t_df = map_and_combine(t_df, "transcriptomics", entrez_map_file, "beataml_samples.csv", sample_mapping_file)
t_df = t_df[t_df.entrez_id.notna()]
t_df = t_df[["improve_sample_id","transcriptomics","entrez_id","source","study"]]
t_df.to_csv("beataml_transcriptomics.csv",index=False)

# New Proteomics Data
print("Starting Proteomics Data")
p_df = pd.read_csv("ptrc_ex10_crosstab_global_gene_corrected.txt", sep = '\t')
p_df = p_df.reset_index().rename(columns={'index': 'Protein'})
p_df = pd.melt(p_df, id_vars=['Protein'], var_name='id', value_name='proteomics')
p_df = map_and_combine(p_df, "proteomics", entrez_map_file, improve_map_file, proteomics_map)
p_df = p_df[["improve_sample_id","proteomics","entrez_id","source","study"]]
p_df.to_csv("beataml_proteomics.csv",index=False)

# New Mutation Data
print("Starting Mutation Data")
m_df = pd.read_csv(mutations_file, sep = '\t')
m_df = map_and_combine(m_df, "mutations", entrez_map_file, "beataml_samples.csv", mutation_map_file)
m_df = m_df[["improve_sample_id","mutations", "entrez_id","variant_classification","source","study"]]
m_df.to_csv("beataml_mutations.csv",index=False)

# Drug and Experiment Data
print("Starting Drug Data")
drug_map = format_drug_map(drug_map_path)
d_df = format_drug_df(drug_path)
d_df = update_dataframe_with_pubchem(d_df)
d_res = merge_drug_info(d_df, drug_map)
d_res = add_improve_id(drug_map, d_res)
#Drug Data
drug_res = d_res[["improve_drug_id","chem_name","formula","weight","inchikey","canSMILES","isoSMILES"]]
drug_res.rename(columns={"inchikey": "inCHIKey"}, inplace=True)
drug_res.to_csv("beataml_drugs.tsv",sep="\t", index=False)

print("Starting Experiment Data")
# Experiment Data
d_res = d_res.rename(columns={"CELL":"sample_id","AUC":"auc"})
exp_res = map_exp_to_improve(d_res,"beataml_samples.csv")
exp_res = exp_res[["source","improve_sample_id","improve_drug_id","study","auc","ic50","ec50","ec50se","r2fit","einf","hs","aac1","auc1","dss1"]]
exp_res.to_csv("beataml_experiments.csv", index=False)
print("Finished Pipeline")

4 changes: 2 additions & 2 deletions build/docker/Dockerfile.beataml
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@ FROM --platform=linux/x86_64 python:3.9

WORKDIR /usr/src/app

COPY hcmi_samples.csv .
COPY *_samples.csv .
COPY build/beatAML/GetBeatAML.py .
COPY build/utils/fit_curve.py .

COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

CMD python GetBeatAML.py --token ${SYNAPSE_TOKEN}
# CMD python GetBeatAML.py --token ${SYNAPSE_TOKEN}
12 changes: 8 additions & 4 deletions build/docker/Dockerfile.hcmi
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,17 @@ FROM --platform=linux/x86_64 python:3.9

WORKDIR /usr/src/app

COPY build/hcmi/01-createHCMISamplesFile.py .
COPY build/hcmi/02-getHCMIData.py .
COPY build/hcmi/full_manifest.txt .
COPY hcmi_samples.csv .

COPY *_samples.csv .
# COPY build/cptac/cptac_samples.csv .

COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

CMD python 02-getHCMIData.py -m full_manifest.txt -t transcriptomics -o hcmi_transcriptomics.csv && \
python 02-getHCMIData.py -m full_manifest.txt -M full_manifest_files -t copy_number -o hcmi_copy_number.csv && \
python 02-getHCMIData.py -m full_manifest.txt -M full_manifest_files -t mutations -o hcmi_mutations.csv
# CMD python 01-createHCMISamplesFile.py && \
# python 02-getHCMIData.py -m full_manifest.txt -t transcriptomics -o hcmi_transcriptomics.csv && \
# python 02-getHCMIData.py -m full_manifest.txt -M full_manifest_files -t copy_number -o hcmi_copy_number.csv && \
# python 02-getHCMIData.py -m full_manifest.txt -M full_manifest_files -t mutations -o hcmi_mutations.csv

0 comments on commit 72d56d4

Please sign in to comment.