Merge pull request #295 from PNNL-CompBio/sample_id_mapping_update
Cross-Build Sample & Drug ID Mapping
jjacobson95 authored Jan 27, 2025
2 parents c39a21a + 358e95c commit 8fadc34
Showing 9 changed files with 1,156 additions and 48 deletions.
51 changes: 45 additions & 6 deletions build/broad_sanger/05b_separate_datasets.py
@@ -1,7 +1,8 @@
import gc
import polars as pl


import os
import gzip
import shutil

def main():
    datasets_to_process = ["CCLE", "CTRPv2", "PRISM", "GDSCv1", "GDSCv2", "FIMM", "gCSI", "NCI60"]
@@ -23,58 +24,96 @@ def main():
    }

    for dataset in datasets_to_process:
        exp = pl.read_csv("broad_sanger_experiments.tsv", separator="\t") # Keeping memory down, so I will not be making copies.
        exp_in_filename = "broad_sanger_experiments.tsv"
        if os.path.isfile(exp_in_filename + ".gz"):
            exp_in_filename = exp_in_filename + ".gz"

        exp = pl.read_csv(exp_in_filename, separator="\t") # Keeping memory down, so I will not be making copies.
        exp = exp.filter(pl.col("study") == dataset)

        # Extract information to separate out datasets
        exp_improve_sample_ids = exp["improve_sample_id"].unique().to_list()
        exp_improve_drug_ids = exp["improve_drug_id"].unique().to_list()

        # Write Filtered Experiments File to TSV. Then delete it from memory.
        exp_filename = f"/tmp/{dataset}_experiments.tsv".lower()
        exp.write_csv(exp_filename, separator="\t")
        exp_filename_out = f"/tmp/{dataset}_experiments.tsv".lower()
        exp.write_csv(exp_filename_out, separator="\t")
        #Rewrite as gzipped if needed
        if exp_in_filename.endswith(".gz"):
            with open(exp_filename_out, 'rb') as f_in, gzip.open(exp_filename_out + ".gz", 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
            os.remove(exp_filename_out)

        del exp
        gc.collect()


        #Filter Samples files, write to file, delete from mem.
        for samples in samples_datatypes:
            samples_filename_in = f"broad_sanger_{samples}.csv"
            if os.path.isfile(samples_filename_in + ".gz"):
                samples_filename_in += ".gz"

            samples_filename_out = f"/tmp/{dataset}_{samples}.csv".lower()
            samples_df = pl.read_csv(samples_filename_in)
            samples_df = samples_df.filter(pl.col("improve_sample_id").is_in(exp_improve_sample_ids))
            samples_df.write_csv(samples_filename_out) #csv

            #Rewrite as gzipped if needed
            if samples_filename_in.endswith(".gz"):
                with open(samples_filename_out, 'rb') as f_in, gzip.open(samples_filename_out + ".gz", 'wb') as f_out:
                    shutil.copyfileobj(f_in, f_out)
                os.remove(samples_filename_out)

            del samples_df
            gc.collect()

        #One by one, filter other Omics files, write to file, delete from mem.
        for omics in omics_datatypes:
            omics_filename_in = f"broad_sanger_{omics}.csv"
            if os.path.isfile(omics_filename_in + ".gz"):
                omics_filename_in += ".gz"

            omics_filename_out = f"/tmp/{dataset}_{omics}.csv".lower()
            omics_df = pl.read_csv(omics_filename_in)
            omics_df = omics_df.filter(pl.col("improve_sample_id").is_in(exp_improve_sample_ids))
            omics_df = omics_df.filter(pl.col("source").is_in(dataset_sources[dataset]))
            omics_df.write_csv(omics_filename_out) #csv

            #Rewrite as gzipped if needed
            if omics_filename_in.endswith(".gz"):
                with open(omics_filename_out, 'rb') as f_in, gzip.open(omics_filename_out + ".gz", 'wb') as f_out:
                    shutil.copyfileobj(f_in, f_out)
                os.remove(omics_filename_out)

            del omics_df
            gc.collect()


        #One by one, filter other Drugs files, write to file, delete from mem.
        for drugs in drugs_datatypes:
            drugs_filename_in = f"broad_sanger_{drugs}.tsv"
            if os.path.isfile(drugs_filename_in + ".gz"):
                drugs_filename_in += ".gz"

            drugs_filename_out = f"/tmp/{dataset}_{drugs}.tsv".lower()
            if drugs == "drug_descriptors":
                drugs_df = pl.read_csv(drugs_filename_in,separator="\t",
                                       dtypes={"improve_drug_id": pl.Utf8,
                                               "structural_descriptor": pl.Utf8,
                                               "descriptor_value": pl.Utf8}
                                       )

            else:
                drugs_df = pl.read_csv(drugs_filename_in,separator="\t")

            drugs_df = drugs_df.filter(pl.col("improve_drug_id").is_in(exp_improve_drug_ids))
            drugs_df.write_csv(drugs_filename_out,separator="\t") #tsv

            if drugs_filename_in.endswith(".gz"):
                with open(drugs_filename_out, 'rb') as f_in, gzip.open(drugs_filename_out + ".gz", 'wb') as f_out:
                    shutil.copyfileobj(f_in, f_out)
                os.remove(drugs_filename_out)

            del drugs_df
            gc.collect()

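Reviewer note: the gzip round trip above (write the filtered frame as plain text, re-compress it with `gzip` plus `shutil.copyfileobj`, then delete the uncompressed copy) is repeated for the experiments, samples, omics, and drugs outputs. A minimal helper capturing that pattern could look like the sketch below; this is hypothetical and not part of the commit, which inlines the logic at each call site.

```python
import gzip
import os
import shutil

def gzip_in_place(path: str) -> str:
    """Compress path to path + '.gz' and remove the uncompressed original."""
    gz_path = path + ".gz"
    with open(path, "rb") as f_in, gzip.open(gz_path, "wb") as f_out:
        shutil.copyfileobj(f_in, f_out)
    os.remove(path)
    return gz_path

# Usage mirroring the experiments branch above:
# exp.write_csv(exp_filename_out, separator="\t")
# if exp_in_filename.endswith(".gz"):
#     gzip_in_place(exp_filename_out)
```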
87 changes: 64 additions & 23 deletions build/build_all.py
@@ -11,6 +11,7 @@
import gzip
from glob import glob
import sys
import requests

def main():
parser=argparse.ArgumentParser(
@@ -131,7 +132,7 @@ def process_docker(datasets):
datasets_to_build.extend(dataset_map.get(dataset, []))

# Build the docker-compose command, adding specific datasets
compose_command = ['docker-compose', '-f', compose_file, 'build', '--parallel'] + datasets_to_build
compose_command = ['docker', 'compose', '-f', compose_file, 'build', '--parallel'] + datasets_to_build

log_file_path = 'local/docker.log'
env = os.environ.copy()
@@ -266,9 +267,11 @@ def run_docker_upload_cmd(cmd_arr, all_files_dir, name, version):
docker_run = ['docker', 'run', '--rm', '-v', f"{env['PWD']}/local/{all_files_dir}:/tmp", '-e', f"VERSION={version}"]

# Add Appropriate Environment Variables
if name == "validate":
docker_run.extend(['upload'])
if 'FIGSHARE_TOKEN' in env and name == 'Figshare':
docker_run.extend(['-e', f"FIGSHARE_TOKEN={env['FIGSHARE_TOKEN']}", 'upload'])
if name == "validate":
if name == "Map_Drugs" or name == "Map_Samples":
docker_run.extend(['upload'])
if 'GITHUB_TOKEN' in env and name == "GitHub":
docker_run.extend(['-e', f"GITHUB_TOKEN={env['GITHUB_TOKEN']}", 'upload'])
@@ -300,6 +303,18 @@ def compress_file(file_path):
with gzip.open(compressed_file_path, 'wb') as f_out:
shutil.copyfileobj(f_in, f_out)
os.remove(file_path)

def get_latest_commit_hash(owner, repo, branch='main'):
"""
Returns the SHA of the latest commit on the specified branch.
"""
url = f"https://api.github.com/repos/{owner}/{repo}/commits/{branch}"
response = requests.get(url)
response.raise_for_status()

# The commit data is in JSON format; the 'sha' field is the full commit hash.
commit_data = response.json()
return commit_data['sha']

######
### Pre-Build Environment Token Check
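Reviewer note: the `get_latest_commit_hash` helper added in this hunk queries the GitHub REST API endpoint `/repos/{owner}/{repo}/commits/{branch}` and returns the full commit SHA. A minimal usage sketch follows; the owner/repo/branch values are assumptions (the call site is not shown in these hunks), and unauthenticated requests are subject to GitHub's anonymous rate limit.

```python
# Illustrative call only; repository coordinates are assumptions, not taken from this diff.
sha = get_latest_commit_hash("PNNL-CompBio", "coderdata", branch="main")
print(sha)  # full 40-character commit hash
```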
@@ -388,17 +403,17 @@ def compress_file(file_path):
######
### Begin Upload and/or validation
#####

if args.figshare or args.validate:
if args.figshare or args.validate or github_token:
# if args.figshare or args.validate:
# FigShare File Prefixes:

prefixes = ['beataml', 'hcmi', 'cptac', 'mpnst', 'genes', 'drugs']
broad_sanger_datasets = ["ccle","ctrpv2","fimm","gdscv1","gdscv2","gcsi","prism","nci60"]
if "broad_sanger" in datasets:
prefixes.extend(broad_sanger_datasets)
datasets.extend(broad_sanger_datasets)
datasets.remove("broad_sanger")


figshare_token = os.getenv('FIGSHARE_TOKEN')

all_files_dir = 'local/all_files_dir'
@@ -422,6 +437,13 @@ def compress_file(file_path):
for file in glob(os.path.join(all_files_dir, '*.gz')):
decompress_file(file)

### These should be done before schema checking.
sample_mapping_command = ['python3', 'scripts/map_improve_sample_ids.py', '--local_dir', "/tmp", '--version', args.version]
run_docker_upload_cmd(sample_mapping_command, 'all_files_dir', 'Map_Samples', args.version)

drug_mapping_command = ['python3', 'scripts/map_improve_drug_ids.py', '--local_dir', "/tmp", '--version', args.version]
run_docker_upload_cmd(drug_mapping_command, 'all_files_dir', 'Map_Drugs', args.version)

# Run schema checker - This will always run if uploading data.
schema_check_command = ['python3', 'scripts/check_schema.py', '--datasets'] + datasets
run_docker_upload_cmd(schema_check_command, 'all_files_dir', 'validate', args.version)
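Reviewer note: the two mapping commands above are handed to `run_docker_upload_cmd`, which (per the earlier hunk) now appends the `upload` service for the `Map_Samples` and `Map_Drugs` names. Under that assumption, the sample-mapping call should expand to roughly the argument list below; this is only a sketch, since the way `cmd_arr` is appended to `docker_run` falls outside the visible hunks, and the mount path mirrors the `local/all_files_dir` layout used above.

```python
import os

version = "0.1.41"  # illustrative stand-in for args.version
docker_run = [
    "docker", "run", "--rm",
    "-v", f"{os.getcwd()}/local/all_files_dir:/tmp",  # host dir mounted as /tmp in the container
    "-e", f"VERSION={version}",
    "upload",                                         # service/image selected for Map_* (and validate)
    # assumed: the command array is appended after the service name
    "python3", "scripts/map_improve_sample_ids.py",
    "--local_dir", "/tmp", "--version", version,
]
```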
@@ -438,28 +460,47 @@ def compress_file(file_path):

print("File compression and decompression adjustments are complete.")

# Upload to Figshare using Docker
### Upload to Figshare using Docker
if args.figshare and args.version and figshare_token:
figshare_command = ['python3', 'scripts/push_to_figshare.py', '--directory', "/tmp", '--title', f"CODERData{args.version}", '--token', os.getenv('FIGSHARE_TOKEN'), '--project_id', '189342', '--publish']
figshare_command = ['python3', 'scripts/push_to_figshare.py', '--directory', "/tmp", '--title', f"CODERData{args.version}", '--token', os.getenv('FIGSHARE_TOKEN'), '--project_id', '189342', '--version', args.version, '--publish']
run_docker_upload_cmd(figshare_command, 'all_files_dir', 'Figshare', args.version)

### Push changes to GitHub using Docker
# if args.version and args.figshare and figshare_token and github_token and args.github_username and args.github_email:

# Push changes to GitHub using Docker
if args.version and args.figshare and figshare_token and github_token and args.github_username and args.github_email:
git_command = [
'bash', '-c', (
f'git config --global user.name "{args.github_username}" '
f'&& git config --global user.email "{args.github_email}" '
f'&& cp /tmp/figshare_latest.yml /usr/src/app/coderdata/docs/_data/figshare_latest.yml '
f'&& git add docs/_data/figshare_latest.yml '
f'&& git commit -m "Data Built and Uploaded. New Tag: {args.version}" '
f'&& git tag {args.version} '
f'&& git push https://{args.github_username}:{github_token}@github.com/PNNL-CompBio/coderdata.git main '
f'&& git push https://{args.github_username}:{github_token}@github.com/PNNL-CompBio/coderdata.git --tags'
)
]
run_docker_upload_cmd(git_command, 'all_files_dir', 'GitHub', args.version)
# You can only upload to Github after Figshare upload is completed - otherwise figshare_latest.yml and dataset.yml won't be available.
if args.version and github_token and args.github_username and args.github_email:

git_command = [
'bash', '-c', (
f'git config --global user.name "{args.github_username}" '
f'&& git config --global user.email "{args.github_email}" '

# Checkout a new branch
f'&& git checkout -b testing-auto-build-pr-{args.version} '

# Copy and add the necessary files
f'&& cp /tmp/improve_sample_mapping.json.gz /usr/src/app/coderdata/build/improve_sample_mapping.json.gz '
f'&& cp /tmp/improve_drug_mapping.json.gz /usr/src/app/coderdata/build/improve_drug_mapping.json.gz '
f'&& gunzip /usr/src/app/coderdata/build/*.gz '
f'&& git add -f build/improve_sample_mapping.json build/improve_drug_mapping.json '
f'&& cp /tmp/figshare_latest.yml /usr/src/app/coderdata/docs/_data/figshare_latest.yml '
f'&& cp /tmp/dataset.yml /usr/src/app/coderdata/coderdata/dataset.yml '
f'&& git add -f docs/_data/figshare_latest.yml coderdata/dataset.yml'

# Tag and push
f'&& git commit -m "Data Built and Uploaded. New Tag: {args.version}" '
f'&& git tag {args.version} '
f'&& git push https://{args.github_username}:{github_token}@github.com/PNNL-CompBio/coderdata.git testing-auto-build-pr-{args.version} '

# Create a PR using GitHub CLI
f'&& gh pr create --title "Testing Auto PR instead of auto Merge {args.version}" '
f'--body "This PR was automatically generated by the build process." '
f'--base main --head testing-auto-build-pr-{args.version}'
)
]

run_docker_upload_cmd(git_command, 'all_files_dir', 'GitHub', args.version)


if __name__ == '__main__':
main()
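Reviewer note: the in-line comment in this hunk points out an ordering constraint, since the GitHub step copies `figshare_latest.yml`, `dataset.yml`, and the two mapping JSONs out of `/tmp`, it can only run after the Figshare upload and the ID-mapping scripts have produced them. A defensive pre-check along these lines (hypothetical, not part of this commit; paths assume the `local/all_files_dir` layout used above) would make that dependency explicit:

```python
import os

# Hypothetical guard: confirm the artifacts the git/gh step expects are present
# on the host before launching the upload container.
required = [
    "local/all_files_dir/figshare_latest.yml",
    "local/all_files_dir/dataset.yml",
    "local/all_files_dir/improve_sample_mapping.json.gz",
    "local/all_files_dir/improve_drug_mapping.json.gz",
]
missing = [p for p in required if not os.path.isfile(p)]
if missing:
    raise FileNotFoundError(f"GitHub push/PR step is missing prerequisites: {missing}")
```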
4 changes: 2 additions & 2 deletions build/build_dataset.py
@@ -56,7 +56,7 @@ def process_docker(dataset,validate):

datasets_to_build.extend(dataset_map.get(dataset, []))

compose_command = ['docker-compose', '-f', compose_file, 'build'] + datasets_to_build
compose_command = ['docker','compose', '-f', compose_file, 'build'] + datasets_to_build

log_file_path = 'local/docker.log'
env = os.environ.copy()
@@ -260,7 +260,7 @@ def run_schema_checker(dataset):
decompress_file(os.path.join('local', all_files_dir, file))

# Run schema checker
schema_check_command = ['python3', 'check_schema.py', '--datasets'] + datasets
schema_check_command = ['python3', 'scripts/check_schema.py', '--datasets'] + datasets
run_docker_validate_cmd(schema_check_command, all_files_dir, 'Validation')

def main():
16 changes: 15 additions & 1 deletion build/docker/Dockerfile.upload
@@ -6,6 +6,20 @@ RUN python -m pip install --upgrade pip pyyaml requests linkml

RUN apt-get update && apt-get install -y git

# Install necessary system packages: git, curl, and gpg
RUN apt-get update && \
apt-get install -y git curl gnupg && \
rm -rf /var/lib/apt/lists/*

# Install GitHub CLI (gh)
RUN curl -fsSL https://cli.github.com/packages/githubcli-archive-keyring.gpg | \
gpg --dearmor -o /usr/share/keyrings/githubcli-archive-keyring.gpg && \
echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/githubcli-archive-keyring.gpg] https://cli.github.com/packages stable main" | \
tee /etc/apt/sources.list.d/github-cli.list > /dev/null && \
apt-get update && \
apt-get install -y gh && \
rm -rf /var/lib/apt/lists/*


RUN git clone https://github.com/PNNL-CompBio/coderdata.git
WORKDIR /usr/src/app/coderdata
WORKDIR /usr/src/app/coderdata
1 change: 0 additions & 1 deletion build/genes/00-buildGeneFile.R
@@ -55,4 +55,3 @@ write.table(full.df,'/tmp/genes.csv',sep=',',row.names=F,quote=T)

##store this file somewhere!

