Commit

Merge pull request #302 from PNNL-CompBio/panc-pdo-check
updated panc pdo files for proper DMSO control and added in genomic data
jjacobson95 authored Jan 27, 2025
2 parents 5675103 + cffa309 commit c39a21a
Showing 8 changed files with 232 additions and 23 deletions.
3 changes: 2 additions & 1 deletion build/build_all.py
@@ -39,7 +39,7 @@ def main():
    parser.add_argument('--figshare', action='store_true', help="Upload all local data to Figshare. FIGSHARE_TOKEN must be set in local environment.")
    parser.add_argument('--all',dest='all',default=False,action='store_true', help="Run all data build commands. This includes docker, samples, omics, drugs, exp arguments. This does not run the validate or figshare commands")
    parser.add_argument('--high_mem',dest='high_mem',default=False,action='store_true',help = "If you have 32 or more CPUs, this option is recommended. It will run many code portions in parallel. If you don't have enough memory, this will cause a run failure.")
    parser.add_argument('--dataset',dest='datasets',default='broad_sanger,hcmi,beataml,cptac,mpnst,mpnstpdx',help='Datasets to process. Defaults to all available.')
    parser.add_argument('--dataset',dest='datasets',default='broad_sanger,hcmi,beataml,cptac,mpnst,mpnstpdx,pancpdo',help='Datasets to process. Defaults to all available.')
    parser.add_argument('--version', type=str, required=False, help='Version number for the Figshare upload title (e.g., "0.1.29"). This is required for Figshare upload. This must be a higher version than previously published versions.')
    parser.add_argument('--github-username', type=str, required=False, help='GitHub username for the repository.')
    parser.add_argument('--github-email', type=str, required=False, help='GitHub email for the repository.')
@@ -119,6 +119,7 @@ def process_docker(datasets):
        'beataml': ['beataml'],
        'mpnst': ['mpnst'],
        'mpnstpdx': ['mpnstpdx'],
        'pancpdo': ['pancpdo'],
        'cptac': ['cptac'],
        'genes': ['genes'],
        'upload': ['upload']
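For context, the `--dataset` flag takes a comma-separated string that `process_docker` resolves to Docker build targets through the mapping above. A minimal sketch of that lookup (the `images_for` helper and the silent skip of unknown names are illustrative assumptions, not the repository's exact code; mapping entries elided from the excerpt are omitted):

    # Sketch: resolve a comma-separated --dataset value to docker targets.
    dataset_map = {
        'beataml': ['beataml'],
        'mpnst': ['mpnst'],
        'mpnstpdx': ['mpnstpdx'],
        'pancpdo': ['pancpdo'],
        'cptac': ['cptac'],
        'genes': ['genes'],
        'upload': ['upload'],
    }

    def images_for(datasets: str) -> list[str]:
        # Unknown dataset names resolve to nothing rather than raising.
        return [img for ds in datasets.split(',') for img in dataset_map.get(ds.strip(), [])]

    print(images_for('pancpdo,mpnst'))  # ['pancpdo', 'mpnst']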
2 changes: 2 additions & 0 deletions build/docker/Dockerfile.pancpdo
@@ -4,6 +4,7 @@ WORKDIR /usr/src/app

COPY build/pancpdo/01-createPancPDOSamplesFile.py .
COPY build/pancpdo/02-getPancPDOData.py .
COPY build/pancpdo/02a-getPancPDODataFromSynapse.py .
COPY build/pancpdo/03-getPancPDODrugs.py .
COPY build/pancpdo/04-getPancPDOExperiments.py .
COPY build/pancpdo/05-addPrecalcAUC.py .
@@ -18,4 +19,5 @@ ENV MPLCONFIGDIR=/app/tmp/matplotlib
RUN mkdir -p /app/tmp/matplotlib

RUN pip install --no-cache-dir -r requirements.txt

VOLUME ["/tmp"]
37 changes: 24 additions & 13 deletions build/genes/00-buildGeneFile.R
@@ -10,6 +10,8 @@ library(dplyr)
##get entrez ids to symbol
entrez<-as.data.frame(org.Hs.egALIAS2EG)

sym <- as.data.frame(org.Hs.egSYMBOL)

##get entrez ids to ensembl
ens<-as.data.frame(org.Hs.egENSEMBL2EG)

@@ -22,25 +24,34 @@ ensembl <- useEnsembl(biomart = "genes", dataset = "hsapiens_gene_ensembl")
tab <- getBM(attributes=c('ensembl_gene_id'),filters='biotype', values=c('protein_coding'),mart=ensembl)


joined.df<-entrez%>%full_join(ens)%>%
dplyr::rename(entrez_id='gene_id',gene_symbol='alias_symbol',other_id='ensembl_id')%>%
mutate(other_id_source='ensembl_gene')|>
mutate(is_protein=other_id%in%tab$ensembl_gene_id)|>
subset(is_protein)|>
dplyr::select(-is_protein)
joined.df<-entrez|>
left_join(sym)|>
dplyr::rename(entrez_id='gene_id',other_id='alias_symbol',gene_symbol='symbol')%>%
mutate(other_id_source='entrez_alias')

##now get ensembl gene ids for each symbol
edf <- sym|>
inner_join(ens)|>
dplyr::rename(entrez_id='gene_id',gene_symbol='symbol',other_id='ensembl_id')%>%
mutate(other_id_source='ensembl_gene')


tdf<-entrez|>
full_join(enst)|>
dplyr::rename(entrez_id='gene_id',gene_symbol='alias_symbol',other_id='trans_id')|>
subset(entrez_id%in%joined.df$entrez_id)|>
subset(gene_symbol%in%joined.df$gene_symbol)|>
tdf<-sym|>
inner_join(enst)|>
dplyr::rename(entrez_id='gene_id',gene_symbol='symbol',other_id='trans_id')|>
subset(entrez_id%in%edf$entrez_id)|>
# subset(gene_symbol%in%edf$gene_symbol)|>
dplyr::mutate(other_id_source='ensembl_transcript')

joined.df<-rbind(joined.df,tdf)|>

prots<-subset(edf,other_id%in%tab$ensembl_gene_id)

full.df<-rbind(joined.df,edf,tdf)|>
subset(entrez_id%in%prots$entrez_id)|>
distinct()

#save to file and version
write.table(joined.df,'/tmp/genes.csv',sep=',',row.names=F,quote=T)
write.table(full.df,'/tmp/genes.csv',sep=',',row.names=F,quote=T)

##store this file somewhere!

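The rewritten script still emits a single gene table to /tmp/genes.csv. A quick downstream sanity check in Python (column names follow the renames in the R code above; this is a sketch, not part of the build itself):

    import pandas as pd

    genes = pd.read_csv('/tmp/genes.csv')
    # Columns per the renames above: entrez_id, gene_symbol, other_id, other_id_source
    print(genes.columns.tolist())
    # other_id_source should only contain the three tags assigned above
    print(genes['other_id_source'].value_counts())  # entrez_alias / ensembl_gene / ensembl_transcript
    assert not genes.duplicated().any()  # distinct() in the R script should guarantee this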
16 changes: 12 additions & 4 deletions build/pancpdo/01-createPancPDOSamplesFile.py
@@ -270,18 +270,26 @@ def filter_and_subset_data(df, maxval, mapfile):
    # Convert 'other_names' to string to ensure consistency
    longtab['other_names'] = longtab['other_names'].astype(str)

    #print(longtab)
    # Reassign 'improve_sample_id's at the end
    unique_other_names = longtab['other_names'].unique()
    print("Number of unique 'other_names' after filtering:", len(unique_other_names))

    ##UPDATE: assign them to common_names instead!
    unique_common_names = longtab['common_name'].unique()
    print("Number of unique 'common_names' after filtering:", len(unique_common_names))
    # Create a new mapping
    #mapping = pd.DataFrame({
    #    'other_names': unique_other_names,
    #    'improve_sample_id': range(int(maxval) + 1, int(maxval) + len(unique_other_names) + 1)
    #})
    mapping = pd.DataFrame({
        'other_names': unique_other_names,
        'improve_sample_id': range(int(maxval) + 1, int(maxval) + len(unique_other_names) + 1)
    })
        'common_name': unique_common_names,
        'improve_sample_id': range(int(maxval) + 1, int(maxval) + len(unique_common_names) + 1)
    })

    # Merge the mapping back into 'longtab'
    longtab = pd.merge(longtab, mapping, on='other_names', how='left')
    longtab = pd.merge(longtab, mapping, on='common_name', how='left')

    # Debugging: Check longtab after reassigning IDs
    print("\nlongtab columns after reassigning 'improve_sample_id':", longtab.columns)
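The practical effect of keying the new IDs on 'common_name' rather than 'other_names' is that every alias of one organoid collapses to a single improve_sample_id. A minimal sketch on made-up data:

    import pandas as pd

    longtab = pd.DataFrame({
        'common_name': ['PDO1', 'PDO1', 'PDO2'],
        'other_names': ['PDO1_tumor', 'PDO1_organoid', 'PDO2_tumor'],  # invented aliases
    })
    maxval = 100
    unique_common_names = longtab['common_name'].unique()
    mapping = pd.DataFrame({
        'common_name': unique_common_names,
        'improve_sample_id': range(int(maxval) + 1, int(maxval) + len(unique_common_names) + 1),
    })
    longtab = pd.merge(longtab, mapping, on='common_name', how='left')
    print(longtab)  # both PDO1 aliases share id 101; PDO2 gets 102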
8 changes: 7 additions & 1 deletion build/pancpdo/02-getPancPDOData.py
@@ -183,7 +183,7 @@ def use_gdc_tool(manifest_data, data_type, download_data):

    # Initialize retry variables
    retries = 0
    max_retries = 5
    max_retries = 1

    # Function to get downloaded file IDs
    def get_downloaded_ids(manifest_loc):
@@ -683,6 +683,12 @@ def main():
    final_data = align_to_schema(combined_data,args.type,7500,args.samples)
    gc.collect()

    ##what if we shrink samples to only include the values that have transcriptional data
    #this fails
    #newsamps = pd.read_csv(args.samples)
    #newsamps = newsamps[newsamps.improve_sample_id.isin(final_data.improve_sample_id)]
    #newsamps.to_csv(args.samples)

    combined_data = None

    print(f"final data:\n{final_data}")
171 changes: 171 additions & 0 deletions build/pancpdo/02a-getPancPDODataFromSynapse.py
@@ -0,0 +1,171 @@
import pandas as pd
import synapseclient
import argparse
import math


def get_copy_call(a):
    """
    Helper Function - Determine copy call for a value.
    """

    if a is None:
        return float('nan')

    if math.isnan(a):
        return float('nan')

    a_val = a  ##was 2**float(a); if transformed at all this should be log2(float(a)+0.000001), not an exponent
    if a_val < 0.0:  #previous threshold: 0.5210507
        return 'deep del'
    elif a_val < 0.7311832:
        return 'het loss'
    elif a_val < 1.214125:
        return 'diploid'
    elif a_val < 1.731183:
        return 'gain'
    else:
        return 'amp'
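# A quick check of the thresholds above (illustrative copy-ratio values, not pipeline data):
#   [get_copy_call(v) for v in [-0.5, 0.5, 1.0, 1.5, 2.0]]
#   -> ['deep del', 'het loss', 'diploid', 'gain', 'amp']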

def parseCNVFile(fpath, sampid, genes):
    log2data = pd.read_csv(fpath, sep='\t', header=None)
    log2data.columns = ['gene_symbol','copy_number','Region','Type','Pos']
    log2data['improve_sample_id'] = sampid
    newdat = pd.merge(log2data,genes)[['improve_sample_id','entrez_id','copy_number']].drop_duplicates()
    newdat['study'] = 'pancpdo'
    newdat['source'] = 'TiriacEtAl'
    newdat = newdat[['improve_sample_id','entrez_id','copy_number','source','study']]
    newdat['copy_call'] = [get_copy_call(a) for a in newdat['copy_number']]
    return newdat


mutmap = {'CODON_CHANGE_PLUS_CODON_DELETION': 'In_Frame_Del',  ##this isn't a great mapping
          'CODON_CHANGE_PLUS_CODON_INSERTION': 'In_Frame_Ins',  ##this isn't a great mapping
          'CODON_DELETION': 'In_Frame_Del',
          'CODON_INSERTION': 'In_Frame_Ins',
          'DOWNSTREAM': "3'Flank",
          'FRAME_SHIFT': 'Frameshift_Variant',
          'FRAME_SHIFT+SPLICE_SITE_ACCEPTOR+SPLICE_SITE_REGION+INTRON': 'Frameshift_Variant',
          'FRAME_SHIFT+SPLICE_SITE_REGION': 'Frameshift_Variant',
          'INTERGENIC': 'IGR',
          'INTRON': 'Intron',
          'NON_SYNONYMOUS_CODING': 'Missense_Mutation',
          'NON_SYNONYMOUS_CODING+SPLICE_SITE_REGION': 'Missense_Mutation',
          'SPLICE_SITE_ACCEPTOR+INTRON': 'Splice_Site',
          'SPLICE_SITE_DONOR+INTRON': 'Splice_Site',
          'SPLICE_SITE_REGION+INTRON': 'Splice_Site',
          'SPLICE_SITE_REGION+NON_CODING_EXON_VARIANT': 'Splice_Site',
          'SPLICE_SITE_REGION+SYNONYMOUS_CODING': 'Silent',
          'START_GAINED+UTR_5_PRIME': 'Start_Codon_Ins',
          'STOP_GAINED': 'Stop_Codon_Ins',
          'STOP_GAINED+CODON_CHANGE_PLUS_CODON_INSERTION': 'Stop_Codon_Ins',
          'SYNONYMOUS_CODING': 'Silent',
          'UPSTREAM': "5'Flank",
          'UTR_3_PRIME': "3'UTR",
          'UTR_5_PRIME': "5'UTR"
          }

def parseMutFile(fpath, sampid, genes):
    '''
    move mutations to following headers:
    entrez_id, improve_sample_id, source, study, mutation, variant_classification
    '''
    mutfile = pd.read_csv(fpath,sep='\t')[['SNPEFF_GENE_NAME','SNPEFF_EFFECT','SNPEFF_CDS_CHANGE']]
    mutfile = mutfile.dropna(subset='SNPEFF_CDS_CHANGE')
    mutfile.columns = ['gene_symbol','SNPEFF_EFFECT','mutation']
    fullfile = pd.merge(mutfile,pd.DataFrame({'SNPEFF_EFFECT':mutmap.keys(),'variant_classification':mutmap.values()}))
    fullfile = pd.merge(fullfile,genes)
    fullfile['improve_sample_id'] = sampid
    fullfile['source'] = 'TiriacEtAl'
    fullfile['study'] = 'pancpdo'
    fullfile = fullfile[['improve_sample_id','entrez_id','source','study','mutation','variant_classification']]
    fullfile = fullfile.dropna().drop_duplicates()
    return fullfile
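# Toy illustration of the mutmap merge above (made-up row, not pipeline data):
#   toy = pd.DataFrame({'gene_symbol': ['KRAS'], 'SNPEFF_EFFECT': ['STOP_GAINED'], 'mutation': ['c.35G>T']})
#   pd.merge(toy, pd.DataFrame({'SNPEFF_EFFECT': mutmap.keys(), 'variant_classification': mutmap.values()}))
#   -> the KRAS row gains variant_classification 'Stop_Codon_Ins'; effects absent from mutmap drop out of the inner merge.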

def main():
    parser = argparse.ArgumentParser(description = 'Script that collects WES and CNV data from Synapse for Coderdata')
    parser.add_argument('-s', '--samples', help='Path to sample file', default=None)
    parser.add_argument('-g', '--genes', help='Path to genes file', default=None)
    parser.add_argument('-c', '--copy', help='Flag to capture copy number data', action='store_true', default=False)
    parser.add_argument('-m', '--mutation', help='Flag to capture mutation data', action='store_true', default=False)
    parser.add_argument('-t', '--token', help='Synapse token')

    args = parser.parse_args()
    if args.samples is None or args.genes is None:
        print('We need at least a genes and samples file to continue')
        exit()
    samps = pd.read_csv(args.samples)
    genes = pd.read_csv(args.genes)

    print("Logging into synapse")
    sc = synapseclient.Synapse()
    sc.login(authToken=args.token)

    ##to double check identifiers, we use transcriptomics data since that determines what samples were sequenced
    #UPDATE: this step isn't needed anymore
    trans = pd.read_csv('/tmp/pancpdo_transcriptomics.csv.gz')
    tsamps = samps[samps.improve_sample_id.isin(trans.improve_sample_id)]
    print(samps.shape)
    print(tsamps.shape)

    missingsamples = []
    if args.copy:
        ##query synapse view for files
        cnvs = sc.tableQuery("select * from syn64608378 where parentId='syn64608163'").asDataFrame()
        alldats = []
        ##go through table and get every file
        for index, row in cnvs.iterrows():
            sid = row.id
            sname = row['name'].split('--')[0]
            print(sid, sname)
            path = sc.get(sid).path
            if sname in set(tsamps.other_id):
                print(sname+' in transcriptomics, using that id')
                sampid = tsamps.loc[tsamps.other_id==sname]['improve_sample_id'].values[0]
                missingsamples.append('copy,trans,'+sname)
            elif sname in set(samps.other_id):
                print(sname+' in samples but not transcriptomics, using other id')
                sampid = samps.loc[samps.other_id==sname]['improve_sample_id'].values[0]
                missingsamples.append('copy,notrans,'+sname)
            else:
                print('Missing sample id for '+sname+', skipping for now')
                missingsamples.append('copy,missed,'+sname)
                continue
            res = parseCNVFile(path, sampid, genes)
            alldats.append(res)
        newcnv = pd.concat(alldats)
        newcnv.to_csv('/tmp/pancpdo_copy_number.csv.gz', compression='gzip', index=False)

    if args.mutation:
        wes = sc.tableQuery("select * from syn64608378 where parentId='syn64608263'").asDataFrame()
        alldats = []
        ##go through and get every mutation file
        for index, row in wes.iterrows():
            sname = row['name'].split('--')[0]
            sid = row.id
            print(sid, sname)
            if sname in set(tsamps.other_id):
                print(sname+' in transcriptomics, using that id')
                sampid = tsamps.loc[tsamps.other_id==sname]['improve_sample_id'].values[0]
                missingsamples.append('mutation,trans,'+sname)
            elif sname in set(samps.other_id):
                print(sname+' in samples but not transcriptomics, using other id')
                sampid = samps.loc[samps.other_id==sname]['improve_sample_id'].values[0]
                missingsamples.append('mutation,notrans,'+sname)
            else:
                print('Missing sample id for '+sname)
                missingsamples.append('mutation,missed,'+sname)
                continue
            path = sc.get(sid).path
            res = parseMutFile(path, sampid, genes)
            alldats.append(res)
        newmut = pd.concat(alldats)
        newmut.to_csv("/tmp/pancpdo_mutations.csv.gz", compression='gzip', index=False)

    #pd.DataFrame(missingsamples).to_csv('missing.csv', index=False, quoting=None, header=False)


if __name__ == '__main__':
    main()
15 changes: 11 additions & 4 deletions build/pancpdo/04-getPancPDOExperiments.py
@@ -95,15 +95,22 @@ def get_data(token):

    ##renormalize values to max
    ##IMPORTANT: this is how we normalize without DMSO. We need to consider how we're doing this for EACH ORGANOID
    ##currently we take the max value of each organoid/replicate.
    rtab["MaxRep"] = rtab.groupby(['Drug','Organoid','Rep']).Response.transform('max')
    rtab['PercResponse'] = (rtab.Response/rtab.MaxRep)*100.00
    ##currently we take the max value of each organoid/replicate.
    ##UPDATE: see below
    # rtab["MaxRep"] = rtab.groupby(['Drug','Organoid','Rep']).Response.transform('max')
    # rtab['PercResponse'] = (rtab.Response/rtab.MaxRep)*100.00


    ##dosenum is a dummy value used for merging, since we need to repeat the concentrations over and over
    dosenum = [a for a in range(15)]
    rtab['Dosenum'] = dosenum*int(rtab.shape[0]/15)

    ##The last dose (Dosenum == 14) is the control value per Herve; we now must normalize to that

    dmso_vals = rtab[rtab.Dosenum==14][['Organoid','Drug','Rep','Response']].rename({'Response':'DMSO'},axis=1)
    full_res = rtab.merge(dmso_vals,on=['Organoid','Drug','Rep'])
    full_res['PercResponse'] = 100*(full_res.Response/full_res.DMSO)

    #print(set(rtab.Drug))
    ##merge the concentrations
    concs = concs.dropna().melt(value_vars=concs.columns,var_name='Drug',value_name='Dose')
@@ -114,7 +121,7 @@
    concs['Dosenum'] = dosenum*int(concs.shape[0]/15)  ##creating dosenum here to merge
    #print(set(concs.Drug))

    return rtab.merge(concs)
    return full_res.merge(concs)

if __name__=='__main__':
    main()
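To make the new DMSO normalization concrete, here is a minimal, self-contained sketch on toy data (all values invented; the real table comes from get_data above):

    import pandas as pd

    # One organoid, one drug, one replicate, 15 doses (Dosenum 0-14).
    rtab = pd.DataFrame({
        'Organoid': ['O1'] * 15,
        'Drug': ['gemcitabine'] * 15,
        'Rep': [1] * 15,
        'Response': list(range(10, 160, 10)),  # last value (150) acts as the DMSO control
    })
    rtab['Dosenum'] = range(15)

    # Dosenum == 14 is the control; normalize every response to it.
    dmso_vals = rtab[rtab.Dosenum == 14][['Organoid', 'Drug', 'Rep', 'Response']].rename({'Response': 'DMSO'}, axis=1)
    full_res = rtab.merge(dmso_vals, on=['Organoid', 'Drug', 'Rep'])
    full_res['PercResponse'] = 100 * (full_res.Response / full_res.DMSO)
    print(full_res[['Dosenum', 'Response', 'PercResponse']])  # last row is 100% by construction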
3 changes: 3 additions & 0 deletions build/pancpdo/build_omics.sh
@@ -6,6 +6,9 @@ trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit
echo "Running 02-getPancPDOData.py for transcriptomics."
python 02-getPancPDOData.py -m full_manifest.txt -t transcriptomics -o /tmp/pancpdo_transcriptomics.csv.gz -g $1 -s $2

echo 'Running 02a-getPancPDODataFromSynapse.py for copy number and mutations'
python 02a-getPancPDODataFromSynapse.py -g $1 -s $2 -t $SYNAPSE_AUTH_TOKEN -c -m

#echo "Running 02-getPancPDOData.py for copy_number."
#python 02-getPancPDOData.py -m full_manifest.txt -t copy_number -o /tmp/pancpdo_copy_number.csv.gz -g $1 -s $2

