Commit

Merge pull request #302 from PNNL-CompBio/panc-pdo-check
updated panc pdo files for proper DMSO control and added in genomic data
jjacobson95 authored Jan 27, 2025
2 parents 5675103 + cffa309 commit c39a21a
Showing 8 changed files with 232 additions and 23 deletions.
3 changes: 2 additions & 1 deletion build/build_all.py
@@ -39,7 +39,7 @@ def main():
    parser.add_argument('--figshare', action='store_true', help="Upload all local data to Figshare. FIGSHARE_TOKEN must be set in local environment.")
    parser.add_argument('--all',dest='all',default=False,action='store_true', help="Run all data build commands. This includes docker, samples, omics, drugs, exp arguments. This does not run the validate or figshare commands")
    parser.add_argument('--high_mem',dest='high_mem',default=False,action='store_true',help = "If you have 32 or more CPUs, this option is recommended. It will run many code portions in parallel. If you don't have enough memory, this will cause a run failure.")
    parser.add_argument('--dataset',dest='datasets',default='broad_sanger,hcmi,beataml,cptac,mpnst,mpnstpdx',help='Datasets to process. Defaults to all available.')
    parser.add_argument('--dataset',dest='datasets',default='broad_sanger,hcmi,beataml,cptac,mpnst,mpnstpdx,pancpdo',help='Datasets to process. Defaults to all available.')
    parser.add_argument('--version', type=str, required=False, help='Version number for the Figshare upload title (e.g., "0.1.29"). This is required for Figshare upload. This must be a higher version than previously published versions.')
    parser.add_argument('--github-username', type=str, required=False, help='GitHub username for the repository.')
    parser.add_argument('--github-email', type=str, required=False, help='GitHub email for the repository.')
@@ -119,6 +119,7 @@ def process_docker(datasets):
        'beataml': ['beataml'],
        'mpnst': ['mpnst'],
        'mpnstpdx': ['mpnstpdx'],
        'pancpdo': ['pancpdo'],
        'cptac': ['cptac'],
        'genes': ['genes'],
        'upload': ['upload']
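For context, the `--dataset` flag takes a comma-separated string that `process_docker` resolves to Docker build targets through the mapping above. A minimal sketch of that lookup (the `images_for` helper and the silent skip of unknown names are illustrative assumptions, not the repository's exact code; mapping entries elided from the excerpt are omitted):

    # Sketch: resolve a comma-separated --dataset value to docker targets.
    dataset_map = {
        'beataml': ['beataml'],
        'mpnst': ['mpnst'],
        'mpnstpdx': ['mpnstpdx'],
        'pancpdo': ['pancpdo'],
        'cptac': ['cptac'],
        'genes': ['genes'],
        'upload': ['upload'],
    }

    def images_for(datasets: str) -> list[str]:
        # Unknown dataset names resolve to nothing rather than raising.
        return [img for ds in datasets.split(',') for img in dataset_map.get(ds.strip(), [])]

    print(images_for('pancpdo,mpnst'))  # ['pancpdo', 'mpnst']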
2 changes: 2 additions & 0 deletions build/docker/Dockerfile.pancpdo
@@ -4,6 +4,7 @@ WORKDIR /usr/src/app

COPY build/pancpdo/01-createPancPDOSamplesFile.py .
COPY build/pancpdo/02-getPancPDOData.py .
COPY build/pancpdo/02a-getPancPDODataFromSynapse.py .
COPY build/pancpdo/03-getPancPDODrugs.py .
COPY build/pancpdo/04-getPancPDOExperiments.py .
COPY build/pancpdo/05-addPrecalcAUC.py .
@@ -18,4 +19,5 @@ ENV MPLCONFIGDIR=/app/tmp/matplotlib
RUN mkdir -p /app/tmp/matplotlib

RUN pip install --no-cache-dir -r requirements.txt

VOLUME ["/tmp"]
37 changes: 24 additions & 13 deletions build/genes/00-buildGeneFile.R
@@ -10,6 +10,8 @@ library(dplyr)
##get entrez ids to symbol
entrez<-as.data.frame(org.Hs.egALIAS2EG)

sym <- as.data.frame(org.Hs.egSYMBOL)

##get entrez ids to ensembl
ens<-as.data.frame(org.Hs.egENSEMBL2EG)

@@ -22,25 +24,34 @@ ensembl <- useEnsembl(biomart = "genes", dataset = "hsapiens_gene_ensembl")
tab <- getBM(attributes=c('ensembl_gene_id'),filters='biotype', values=c('protein_coding'),mart=ensembl)


joined.df<-entrez%>%full_join(ens)%>%
dplyr::rename(entrez_id='gene_id',gene_symbol='alias_symbol',other_id='ensembl_id')%>%
mutate(other_id_source='ensembl_gene')|>
mutate(is_protein=other_id%in%tab$ensembl_gene_id)|>
subset(is_protein)|>
dplyr::select(-is_protein)
joined.df<-entrez|>
left_join(sym)|>
dplyr::rename(entrez_id='gene_id',other_id='alias_symbol',gene_symbol='symbol')%>%
mutate(other_id_source='entrez_alias')

##now get ensembl gene ids for each symbol
edf <- sym|>
inner_join(ens)|>
dplyr::rename(entrez_id='gene_id',gene_symbol='symbol',other_id='ensembl_id')%>%
mutate(other_id_source='ensembl_gene')


tdf<-entrez|>
full_join(enst)|>
dplyr::rename(entrez_id='gene_id',gene_symbol='alias_symbol',other_id='trans_id')|>
subset(entrez_id%in%joined.df$entrez_id)|>
subset(gene_symbol%in%joined.df$gene_symbol)|>
tdf<-sym|>
inner_join(enst)|>
dplyr::rename(entrez_id='gene_id',gene_symbol='symbol',other_id='trans_id')|>
subset(entrez_id%in%edf$entrez_id)|>
# subset(gene_symbol%in%edf$gene_symbol)|>
dplyr::mutate(other_id_source='ensembl_transcript')

joined.df<-rbind(joined.df,tdf)|>

prots<-subset(edf,other_id%in%tab$ensembl_gene_id)

full.df<-rbind(joined.df,edf,tdf)|>
subset(entrez_id%in%prots$entrez_id)|>
distinct()

#save to file and version
write.table(joined.df,'/tmp/genes.csv',sep=',',row.names=F,quote=T)
write.table(full.df,'/tmp/genes.csv',sep=',',row.names=F,quote=T)

##store this file somewhere!

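The rewritten script still emits a single gene table to /tmp/genes.csv. A quick downstream sanity check in Python (column names follow the renames in the R code above; this is a sketch, not part of the build itself):

    import pandas as pd

    genes = pd.read_csv('/tmp/genes.csv')
    # Columns per the renames above: entrez_id, gene_symbol, other_id, other_id_source
    print(genes.columns.tolist())
    # other_id_source should only contain the three tags assigned above
    print(genes['other_id_source'].value_counts())  # entrez_alias / ensembl_gene / ensembl_transcript
    assert not genes.duplicated().any()  # distinct() in the R script should guarantee this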
16 changes: 12 additions & 4 deletions build/pancpdo/01-createPancPDOSamplesFile.py
@@ -270,18 +270,26 @@ def filter_and_subset_data(df, maxval, mapfile):
    # Convert 'other_names' to string to ensure consistency
    longtab['other_names'] = longtab['other_names'].astype(str)

    #print(longtab)
    # Reassign 'improve_sample_id's at the end
    unique_other_names = longtab['other_names'].unique()
    print("Number of unique 'other_names' after filtering:", len(unique_other_names))

    ##UPDATE: assign them to common_names instead!
    unique_common_names = longtab['common_name'].unique()
    print("Number of unique 'common_names' after filtering:", len(unique_common_names))
    # Create a new mapping
    #mapping = pd.DataFrame({
    #    'other_names': unique_other_names,
    #    'improve_sample_id': range(int(maxval) + 1, int(maxval) + len(unique_other_names) + 1)
    #})
    mapping = pd.DataFrame({
        'other_names': unique_other_names,
        'improve_sample_id': range(int(maxval) + 1, int(maxval) + len(unique_other_names) + 1)
    })
        'common_name': unique_common_names,
        'improve_sample_id': range(int(maxval) + 1, int(maxval) + len(unique_common_names) + 1)
    })

    # Merge the mapping back into 'longtab'
    longtab = pd.merge(longtab, mapping, on='other_names', how='left')
    longtab = pd.merge(longtab, mapping, on='common_name', how='left')

    # Debugging: Check longtab after reassigning IDs
    print("\nlongtab columns after reassigning 'improve_sample_id':", longtab.columns)
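The practical effect of keying the new IDs on 'common_name' rather than 'other_names' is that every alias of one organoid collapses to a single improve_sample_id. A minimal sketch on made-up data:

    import pandas as pd

    longtab = pd.DataFrame({
        'common_name': ['PDO1', 'PDO1', 'PDO2'],
        'other_names': ['PDO1_tumor', 'PDO1_organoid', 'PDO2_tumor'],  # invented aliases
    })
    maxval = 100
    unique_common_names = longtab['common_name'].unique()
    mapping = pd.DataFrame({
        'common_name': unique_common_names,
        'improve_sample_id': range(int(maxval) + 1, int(maxval) + len(unique_common_names) + 1),
    })
    longtab = pd.merge(longtab, mapping, on='common_name', how='left')
    print(longtab)  # both PDO1 aliases share id 101; PDO2 gets 102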
8 changes: 7 additions & 1 deletion build/pancpdo/02-getPancPDOData.py
@@ -183,7 +183,7 @@ def use_gdc_tool(manifest_data, data_type, download_data):

    # Initialize retry variables
    retries = 0
    max_retries = 5
    max_retries = 1

    # Function to get downloaded file IDs
    def get_downloaded_ids(manifest_loc):
@@ -683,6 +683,12 @@ def main():
    final_data = align_to_schema(combined_data,args.type,7500,args.samples)
    gc.collect()

    ##what if we shrink samples to only include the values that have transcriptional data
    #this fails
    #newsamps = pd.read_csv(args.samples)
    #newsamps = newsamps[newsamps.improve_sample_id.isin(final_data.improve_sample_id)]
    #newsamps.to_csv(args.samples)

    combined_data = None

    print(f"final data:\n{final_data}")
171 changes: 171 additions & 0 deletions build/pancpdo/02a-getPancPDODataFromSynapse.py
@@ -0,0 +1,171 @@
import pandas as pd
import synapseclient
import argparse
import math


def get_copy_call(a):
    """
    Helper Function - Determine copy call for a value.
    """

    if a is None:
        return float('nan')

    if math.isnan(a):
        return float('nan')

    a_val = a  ##was 2**float(a); if transformed at all this should be log2(float(a)+0.000001), not an exponent
    if a_val < 0.0:  #previous threshold: 0.5210507
        return 'deep del'
    elif a_val < 0.7311832:
        return 'het loss'
    elif a_val < 1.214125:
        return 'diploid'
    elif a_val < 1.731183:
        return 'gain'
    else:
        return 'amp'
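# A quick check of the thresholds above (illustrative copy-ratio values, not pipeline data):
#   [get_copy_call(v) for v in [-0.5, 0.5, 1.0, 1.5, 2.0]]
#   -> ['deep del', 'het loss', 'diploid', 'gain', 'amp']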

def parseCNVFile(fpath, sampid, genes):
    log2data = pd.read_csv(fpath, sep='\t', header=None)
    log2data.columns = ['gene_symbol','copy_number','Region','Type','Pos']
    log2data['improve_sample_id'] = sampid
    newdat = pd.merge(log2data,genes)[['improve_sample_id','entrez_id','copy_number']].drop_duplicates()
    newdat['study'] = 'pancpdo'
    newdat['source'] = 'TiriacEtAl'
    newdat = newdat[['improve_sample_id','entrez_id','copy_number','source','study']]
    newdat['copy_call'] = [get_copy_call(a) for a in newdat['copy_number']]
    return newdat


mutmap = {'CODON_CHANGE_PLUS_CODON_DELETION': 'In_Frame_Del',  ##this isn't a great mapping
          'CODON_CHANGE_PLUS_CODON_INSERTION': 'In_Frame_Ins',  ##this isn't a great mapping
          'CODON_DELETION': 'In_Frame_Del',
          'CODON_INSERTION': 'In_Frame_Ins',
          'DOWNSTREAM': "3'Flank",
          'FRAME_SHIFT': 'Frameshift_Variant',
          'FRAME_SHIFT+SPLICE_SITE_ACCEPTOR+SPLICE_SITE_REGION+INTRON': 'Frameshift_Variant',
          'FRAME_SHIFT+SPLICE_SITE_REGION': 'Frameshift_Variant',
          'INTERGENIC': 'IGR',
          'INTRON': 'Intron',
          'NON_SYNONYMOUS_CODING': 'Missense_Mutation',
          'NON_SYNONYMOUS_CODING+SPLICE_SITE_REGION': 'Missense_Mutation',
          'SPLICE_SITE_ACCEPTOR+INTRON': 'Splice_Site',
          'SPLICE_SITE_DONOR+INTRON': 'Splice_Site',
          'SPLICE_SITE_REGION+INTRON': 'Splice_Site',
          'SPLICE_SITE_REGION+NON_CODING_EXON_VARIANT': 'Splice_Site',
          'SPLICE_SITE_REGION+SYNONYMOUS_CODING': 'Silent',
          'START_GAINED+UTR_5_PRIME': 'Start_Codon_Ins',
          'STOP_GAINED': 'Stop_Codon_Ins',
          'STOP_GAINED+CODON_CHANGE_PLUS_CODON_INSERTION': 'Stop_Codon_Ins',
          'SYNONYMOUS_CODING': 'Silent',
          'UPSTREAM': "5'Flank",
          'UTR_3_PRIME': "3'UTR",
          'UTR_5_PRIME': "5'UTR"
          }

def parseMutFile(fpath, sampid, genes):
    '''
    move mutations to following headers:
    entrez_id, improve_sample_id, source, study, mutation, variant_classification
    '''
    mutfile = pd.read_csv(fpath,sep='\t')[['SNPEFF_GENE_NAME','SNPEFF_EFFECT','SNPEFF_CDS_CHANGE']]
    mutfile = mutfile.dropna(subset='SNPEFF_CDS_CHANGE')
    mutfile.columns = ['gene_symbol','SNPEFF_EFFECT','mutation']
    fullfile = pd.merge(mutfile,pd.DataFrame({'SNPEFF_EFFECT':mutmap.keys(),'variant_classification':mutmap.values()}))
    fullfile = pd.merge(fullfile,genes)
    fullfile['improve_sample_id'] = sampid
    fullfile['source'] = 'TiriacEtAl'
    fullfile['study'] = 'pancpdo'
    fullfile = fullfile[['improve_sample_id','entrez_id','source','study','mutation','variant_classification']]
    fullfile = fullfile.dropna().drop_duplicates()
    return fullfile
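# Toy illustration of the mutmap merge above (made-up row, not pipeline data):
#   toy = pd.DataFrame({'gene_symbol': ['KRAS'], 'SNPEFF_EFFECT': ['STOP_GAINED'], 'mutation': ['c.35G>T']})
#   pd.merge(toy, pd.DataFrame({'SNPEFF_EFFECT': mutmap.keys(), 'variant_classification': mutmap.values()}))
#   -> the KRAS row gains variant_classification 'Stop_Codon_Ins'; effects absent from mutmap drop out of the inner merge.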

def main():
    parser = argparse.ArgumentParser(description = 'Script that collects WES and CNV data from Synapse for Coderdata')
    parser.add_argument('-s', '--samples', help='Path to sample file', default=None)
    parser.add_argument('-g', '--genes', help='Path to genes file', default=None)
    parser.add_argument('-c', '--copy', help='Flag to capture copy number data', action='store_true', default=False)
    parser.add_argument('-m', '--mutation', help='Flag to capture mutation data', action='store_true', default=False)
    parser.add_argument('-t', '--token', help='Synapse token')

    args = parser.parse_args()
    if args.samples is None or args.genes is None:
        print('We need at least a genes and samples file to continue')
        exit()
    samps = pd.read_csv(args.samples)
    genes = pd.read_csv(args.genes)

    print("Logging into synapse")
    sc = synapseclient.Synapse()
    sc.login(authToken=args.token)

    ##to double check identifiers, we use transcriptomics data since that determines what samples were sequenced
    #UPDATE: this step isn't needed anymore
    trans = pd.read_csv('/tmp/pancpdo_transcriptomics.csv.gz')
    tsamps = samps[samps.improve_sample_id.isin(trans.improve_sample_id)]
    print(samps.shape)
    print(tsamps.shape)

    missingsamples = []
    if args.copy:
        ##query synapse view for files
        cnvs = sc.tableQuery("select * from syn64608378 where parentId='syn64608163'").asDataFrame()
        alldats = []
        ##go through table and get every file
        for index, row in cnvs.iterrows():
            sid = row.id
            sname = row['name'].split('--')[0]
            print(sid, sname)
            path = sc.get(sid).path
            if sname in set(tsamps.other_id):
                print(sname+' in transcriptomics, using that id')
                sampid = tsamps.loc[tsamps.other_id==sname]['improve_sample_id'].values[0]
                missingsamples.append('copy,trans,'+sname)
            elif sname in set(samps.other_id):
                print(sname+' in samples but not transcriptomics, using other id')
                sampid = samps.loc[samps.other_id==sname]['improve_sample_id'].values[0]
                missingsamples.append('copy,notrans,'+sname)
            else:
                print('Missing sample id for '+sname+', skipping for now')
                missingsamples.append('copy,missed,'+sname)
                continue
            res = parseCNVFile(path, sampid, genes)
            alldats.append(res)
        newcnv = pd.concat(alldats)
        newcnv.to_csv('/tmp/pancpdo_copy_number.csv.gz', compression='gzip', index=False)

    if args.mutation:
        wes = sc.tableQuery("select * from syn64608378 where parentId='syn64608263'").asDataFrame()
        alldats = []
        ##go through and get every mutation file
        for index, row in wes.iterrows():
            sname = row['name'].split('--')[0]
            sid = row.id
            print(sid, sname)
            if sname in set(tsamps.other_id):
                print(sname+' in transcriptomics, using that id')
                sampid = tsamps.loc[tsamps.other_id==sname]['improve_sample_id'].values[0]
                missingsamples.append('mutation,trans,'+sname)
            elif sname in set(samps.other_id):
                print(sname+' in samples but not transcriptomics, using other id')
                sampid = samps.loc[samps.other_id==sname]['improve_sample_id'].values[0]
                missingsamples.append('mutation,notrans,'+sname)
            else:
                print('Missing sample id for '+sname)
                missingsamples.append('mutation,missed,'+sname)
                continue
            path = sc.get(sid).path
            res = parseMutFile(path, sampid, genes)
            alldats.append(res)
        newmut = pd.concat(alldats)
        newmut.to_csv("/tmp/pancpdo_mutations.csv.gz", compression='gzip', index=False)

    #pd.DataFrame(missingsamples).to_csv('missing.csv', index=False, quoting=None, header=False)


if __name__ == '__main__':
    main()
15 changes: 11 additions & 4 deletions build/pancpdo/04-getPancPDOExperiments.py
@@ -95,15 +95,22 @@ def get_data(token):

    ##renormalize values to max
    ##IMPORTANT: this is how we normalize without DMSO. We need to consider how we're doing this for EACH ORGANOID
    ##currently we take the max value of each organoid/replicate.
    rtab["MaxRep"] = rtab.groupby(['Drug','Organoid','Rep']).Response.transform('max')
    rtab['PercResponse'] = (rtab.Response/rtab.MaxRep)*100.00
    ##currently we take the max value of each organoid/replicate.
    ##UPDATE: see below
    # rtab["MaxRep"] = rtab.groupby(['Drug','Organoid','Rep']).Response.transform('max')
    # rtab['PercResponse'] = (rtab.Response/rtab.MaxRep)*100.00


    ##dosenum is a dummy value used for merging, since we need to repeat the concentrations over and over
    dosenum = [a for a in range(15)]
    rtab['Dosenum'] = dosenum*int(rtab.shape[0]/15)

    ##The last dose (Dosenum == 14) is the control value per Herve; we now must normalize to that

    dmso_vals = rtab[rtab.Dosenum==14][['Organoid','Drug','Rep','Response']].rename({'Response':'DMSO'},axis=1)
    full_res = rtab.merge(dmso_vals,on=['Organoid','Drug','Rep'])
    full_res['PercResponse'] = 100*(full_res.Response/full_res.DMSO)

    #print(set(rtab.Drug))
    ##merge the concentrations
    concs = concs.dropna().melt(value_vars=concs.columns,var_name='Drug',value_name='Dose')
@@ -114,7 +121,7 @@
    concs['Dosenum'] = dosenum*int(concs.shape[0]/15)  ##creating dosenum here to merge
    #print(set(concs.Drug))

    return rtab.merge(concs)
    return full_res.merge(concs)

if __name__=='__main__':
    main()
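To make the new DMSO normalization concrete, here is a minimal, self-contained sketch on toy data (all values invented; the real table comes from get_data above):

    import pandas as pd

    # One organoid, one drug, one replicate, 15 doses (Dosenum 0-14).
    rtab = pd.DataFrame({
        'Organoid': ['O1'] * 15,
        'Drug': ['gemcitabine'] * 15,
        'Rep': [1] * 15,
        'Response': list(range(10, 160, 10)),  # last value (150) acts as the DMSO control
    })
    rtab['Dosenum'] = range(15)

    # Dosenum == 14 is the control; normalize every response to it.
    dmso_vals = rtab[rtab.Dosenum == 14][['Organoid', 'Drug', 'Rep', 'Response']].rename({'Response': 'DMSO'}, axis=1)
    full_res = rtab.merge(dmso_vals, on=['Organoid', 'Drug', 'Rep'])
    full_res['PercResponse'] = 100 * (full_res.Response / full_res.DMSO)
    print(full_res[['Dosenum', 'Response', 'PercResponse']])  # last row is 100% by construction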
3 changes: 3 additions & 0 deletions build/pancpdo/build_omics.sh
@@ -6,6 +6,9 @@ trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit
echo "Running 02-getPancPDOData.py for transcriptomics."
python 02-getPancPDOData.py -m full_manifest.txt -t transcriptomics -o /tmp/pancpdo_transcriptomics.csv.gz -g $1 -s $2

echo 'Running 02a-getPancPDODataFromSynapse.py for copy number and mutations'
python 02a-getPancPDODataFromSynapse.py -g $1 -s $2 -t $SYNAPSE_AUTH_TOKEN -c -m

#echo "Running 02-getPancPDOData.py for copy_number."
#python 02-getPancPDOData.py -m full_manifest.txt -t copy_number -o /tmp/pancpdo_copy_number.csv.gz -g $1 -s $2

