diff --git a/MAVEN/Example_Data/lapatinib/.Rhistory b/MAVEN/Example_Data/lapatinib/.Rhistory deleted file mode 100755 index 7564a46..0000000 --- a/MAVEN/Example_Data/lapatinib/.Rhistory +++ /dev/null @@ -1,512 +0,0 @@ -foreach(i = compound_names) %dopar% { # Loop over every compound -print(i) -fname = paste0(i,".txt") # Set the file name for the signature file -compound_sig = cbind(test_df[1],test_df[i]) # Take the gene names and corresponding measurements -write.table(compound_sig,fname,sep="\t",quote=F,row.names=F,col.names=T,append=T) # Write -df = read.table(fname,sep="\t",header=T,row.names=1) # Read in the signature .txt file -TF_genesymbol<-try( # run DoRothEA, confidence levels A, B and C -runDoRothEA(df, regulon=viper_regulon, confidence_level=c('A','B','C')), -silent = T -) -if(inherits(TF_genesymbol,"try-error")){ # If there is an error for some reason, skip the compound -next -} -TF_uniprot<-GeneSymbol2Uniprot(TF_genesymbol, map, 1, 2) # Map to UniProt -folder = paste0(i,"_measurements/") # Set folder name -generate_measfile(measurements=TF_uniprot, topnumber=50, write2folder=folder) # Write TF activities to folder -} -# Stop the cluster -stopCluster(myCluster) -# Script to prepare DoRoTHea TF activities -# Input: Matrix of gene expression data (Rows are genes (Entrez), columns are compounds) -# Output: 1. .txt file for each signature found in the input matrix -# 2. Folder for each compound (compoundname_measurements) -# with TF activities (meas_50.txt) as UniProt ID -# Import packages -library(CARNIVAL) -library(org.Hs.eg.db) -library(foreach) -library(doParallel) -library(plyr) -library(data.table) -# Initialise cluster -n = 2 # change to number of cores needed -myCluster <- makeCluster(n, type="FORK",outfile="") -registerDoParallel(myCluster) -# Load files for dorothea -file.copy(from=system.file("dorothea_TF_mapping.csv",package="CARNIVAL"),to=getwd(),overwrite=TRUE) -load(file = system.file("BEST_viperRegulon.rdata",package="CARNIVAL")) -map<-read.csv("dorothea_TF_mapping.csv") -#Open matrix -gexfile = "HT29_6h_10uM.csv" -gex_df = fread(gexfile,header=TRUE,sep=",") # First row is python header -gex_df = as.data.frame(gex_df) # Change into df -compound_names = names(gex_df) # Column names are compounds -compound_names = compound_names[!compound_names %in% 'Compound_id'] # Get rid of 'Compound_id' -#Now change gene ids to gene symbols using metadata -gene_info = fread('gene_info.csv',header=TRUE) # Import metadata -gene_info = as.data.frame(gene_info) # Read as df -converted = merge(test_df,gene_info,by.x='Compound_id',by.y='pr_gene_id') # Map to gene symbol -converted_symbols = converted$pr_gene_symbol # Extract symbols -test_df$Compound_id = converted_symbols # Make row names into symbols -compound_names = compound_names[0:10] # test e.g. 10 compounds -# Run -output <- -foreach(i = compound_names) %dopar% { # Loop over every compound -print(i) -fname = paste0(i,".txt") # Set the file name for the signature file -compound_sig = cbind(test_df[1],test_df[i]) # Take the gene names and corresponding measurements -write.table(compound_sig,fname,sep="\t",quote=F,row.names=F,col.names=T,append=T) # Write -df = read.table(fname,sep="\t",header=T,row.names=1) # Read in the signature .txt file -TF_genesymbol<-try( # run DoRothEA, confidence levels A, B and C -runDoRothEA(df, regulon=viper_regulon, confidence_level=c('A','B','C')), -silent = T -) -if(inherits(TF_genesymbol,"try-error")){ # If there is an error for some reason, skip the compound -next -} -TF_uniprot<-GeneSymbol2Uniprot(TF_genesymbol, map, 1, 2) # Map to UniProt -folder = paste0(i,"_measurements/") # Set folder name -generate_measfile(measurements=TF_uniprot, topnumber=50, write2folder=folder) # Write TF activities to folder -} -# Stop the cluster -stopCluster(myCluster) -# Load files for progeny -file.copy(from=system.file("model_NatComm+14_human.csv",package="CARNIVAL"),to=getwd(),overwrite=TRUE) -weight_matrix<-read.csv("model_NatComm+14_human.csv") -View(weight_matrix) -# Import matrix -gexfile = "HT29_6h_10uM.csv" -# Import matrix -gexfile = "HT29_6h_10uM.csv" -gex_df = fread(gexfile,header=TRUE,sep=",") # First row is python header -gex_df = as.data.frame(gex_df) # As df -compound_names = names(gex_df) # Column names are compounds -compound_names = compound_names[!compound_names %in% 'Compound_id'] # Get rid of 'Compound_id' -i = compound_names[[1]] -print(i) -fname <- paste0("/",i, ".txt") # get signature .txt -fname <- paste0(i, ".txt") # get signature .txt -df = read.table(fname,sep="\t",header=TRUE,row.names=1) # Read back in file -df_genenames <- data.frame('gene'=rownames(df),df) -df_genenames -#Run progeny -pathway_scores <- try( -runPROGENy(df_genenames,weight_matrix,z_scores=F), -silent = T) -pathway_scores -#Generate input files -folder = paste0(i,"_measurements/scores_") -scores <- rbind(rownames(pathway_scores),pathway_scores[,1]) -scores -write.table(scores,paste0(folder,i,".txt"),col.names=F,row.names=F,quote=F,sep='\t') # save -# Script to prepare PROGENy pathway scores -# Input: Matrix of gene expression data (Rows are genes (Entrez), columns are compounds), and -# The .txt file for each signature (From prepare_input_parellel.R) -# And a measurement folder for each compound (From prepare_input_parellel.R) -# Output: PROGEny pathway weights .txt in each compound's measurement folder -# Import packages -library(CARNIVAL) -library(org.Hs.eg.db) -library(foreach) -library(doParallel) -library(plyr) -library(data.table) -# set n to number of cores -myCluster <- makeCluster(n, type="FORK",outfile="") -registerDoParallel(myCluster) -# Load files for progeny -file.copy(from=system.file("model_NatComm+14_human.csv",package="CARNIVAL"),to=getwd(),overwrite=TRUE) -weight_matrix<-read.csv("model_NatComm+14_human.csv") -# Import matrix -gexfile = "HT29_6h_10uM.csv" -gex_df = fread(gexfile,header=TRUE,sep=",") # First row is python header -gex_df = as.data.frame(gex_df) # As df -compound_names = names(gex_df) # Column names are compounds -compound_names = compound_names[!compound_names %in% 'Compound_id'] # Get rid of 'Compound_id' -# test for e.g. 10 compounds -compound_names = compound_names[1:10] -output <- -foreach(i = compound_names) %dopar% { # loop over each compound -print(i) -fname <- paste0(i, ".txt") # get signature .txt -df = read.table(fname,sep="\t",header=TRUE,row.names=1) # Read back in file -df_genenames <- data.frame('gene'=rownames(df),df) # make df with rownames = gene symbols -#Run progeny -pathway_scores <- try( -runPROGENy(df_genenames,weight_matrix,z_scores=F), -silent = T) -if(inherits(pathway_scores,"try-error")){ # if it fails then skip -next -} -#Generate input files -folder = paste0(i,"_measurements/scores_") # get folder name -scores <- rbind(rownames(pathway_scores),pathway_scores[,1]) # put into correct format -write.table(scores,paste0(folder,i,".txt"),col.names=F,row.names=F,quote=F,sep='\t') # save -} -stopCluster(myCluster) -# Script to prepare PROGENy pathway scores -# Input: Matrix of gene expression data (Rows are genes (Entrez), columns are compounds), and -# The .txt file for each signature (From prepare_input_parellel.R) -# And a measurement folder for each compound (From prepare_input_parellel.R) -# Output: PROGEny pathway weights .txt in each compound's measurement folder -# Import packages -library(CARNIVAL) -library(org.Hs.eg.db) -library(foreach) -library(doParallel) -library(plyr) -library(data.table) -# set n to number of cores -myCluster <- makeCluster(n, type="FORK",outfile="") -registerDoParallel(myCluster) -# Load files for progeny -file.copy(from=system.file("model_NatComm+14_human.csv",package="CARNIVAL"),to=getwd(),overwrite=TRUE) -weight_matrix<-read.csv("model_NatComm+14_human.csv") -# Import matrix -gexfile = "HT29_6h_10uM.csv" -gex_df = fread(gexfile,header=TRUE,sep=",") # First row is python header -gex_df = as.data.frame(gex_df) # As df -compound_names = names(gex_df) # Column names are compounds -compound_names = compound_names[!compound_names %in% 'Compound_id'] # Get rid of 'Compound_id' -# test for e.g. 10 compounds -compound_names = compound_names[1:10] -output <- -foreach(i = compound_names) %dopar% { # loop over each compound -print(i) -fname <- paste0(i, ".txt") # get signature .txt -df = read.table(fname,sep="\t",header=TRUE,row.names=1) # Read back in file -df_genenames <- data.frame('gene'=rownames(df),df) # make df with rownames = gene symbols -#Run progeny -pathway_scores <- try( -runPROGENy(df_genenames,weight_matrix,z_scores=F), -silent = T) -if(inherits(pathway_scores,"try-error")){ # if it fails then skip -next -} -#Generate input files -folder = paste0(i,"_measurements/scores_") # get folder name -scores <- rbind(rownames(pathway_scores),pathway_scores[,1]) # put into correct format -write.table(scores,paste0(folder,i,".txt"),col.names=F,row.names=F,quote=F,sep='\t') # save -} -stopCluster(myCluster) -# Script to prepare PROGENy pathway scores -# Input: Matrix of gene expression data (Rows are genes (Entrez), columns are compounds), and -# The .txt file for each signature (From prepare_input_parellel.R) -# And a measurement folder for each compound (From prepare_input_parellel.R) -# Output: PROGEny pathway weights .txt in each compound's measurement folder -# Import packages -library(CARNIVAL) -library(org.Hs.eg.db) -library(foreach) -library(doParallel) -library(plyr) -library(data.table) -# set n to number of cores -n = 2 -myCluster <- makeCluster(n, type="FORK",outfile="") -registerDoParallel(myCluster) -# Load files for progeny -file.copy(from=system.file("model_NatComm+14_human.csv",package="CARNIVAL"),to=getwd(),overwrite=TRUE) -weight_matrix<-read.csv("model_NatComm+14_human.csv") -# Import matrix -gexfile = "HT29_6h_10uM.csv" -gex_df = fread(gexfile,header=TRUE,sep=",") # First row is python header -gex_df = as.data.frame(gex_df) # As df -compound_names = names(gex_df) # Column names are compounds -compound_names = compound_names[!compound_names %in% 'Compound_id'] # Get rid of 'Compound_id' -# test for e.g. 10 compounds -compound_names = compound_names[1:10] -output <- -foreach(i = compound_names) %dopar% { # loop over each compound -print(i) -fname <- paste0(i, ".txt") # get signature .txt -df = read.table(fname,sep="\t",header=TRUE,row.names=1) # Read back in file -df_genenames <- data.frame('gene'=rownames(df),df) # make df with rownames = gene symbols -#Run progeny -pathway_scores <- try( -runPROGENy(df_genenames,weight_matrix,z_scores=F), -silent = T) -if(inherits(pathway_scores,"try-error")){ # if it fails then skip -next -} -#Generate input files -folder = paste0(i,"_measurements/scores_") # get folder name -scores <- rbind(rownames(pathway_scores),pathway_scores[,1]) # put into correct format -write.table(scores,paste0(folder,i,".txt"),col.names=F,row.names=F,quote=F,sep='\t') # save -} -stopCluster(myCluster) -# Create output dir -dir.create(file.path("RESULTS_CARNIVAL"),showWarnings = FALSE) -compound_folders = list.dirs(recursive=FALSE) -compound_dirs -compound_folders -# get ones that have already finished and exclude (checkpointing) -done_folders = list.dirs(path="RESULTS_CARNIVAL",full.names=FALSE,recursive=FALSE) -info = file.info(list.dirs(path="RESULTS_CARNIVAL",recursive=FALSE)) -info = info[with(info, order(as.POSIXct(ctime))),] -donecomps = rownames(info) -exclude = tail(donecomps,n=1) -exclude = unlist(strsplit(exclude,"RESULTS_CARNIVAL/"))[2] -done_final = done_folders[!done_folders %in% exclude] -compound_folders -compound = compound_folders[1] -compound -drug = unlist(strsplit(unlist(strsplit(compound,"/"))[2],"_"))[1] # get the compound name -drug -#dir.create(file.path(paste0("RESULTS_CARNIVAL/",drug)),showWarnings = FALSE) -results_dir = paste0("RESULTS_CARNIVAL/",drug) -results_dir -#results_dir = "RESULTS_CARNIVAL/" -tf_activities = list.files(path=compound,pattern="_50.txt",full.names=TRUE) -progeny_pathways = list.files(path=compound,pattern="scores_",full.names=TRUE) -tf_activities -dir.create(file.path(paste0("RESULTS_CARNIVAL/",drug)),showWarnings = FALSE) -# load tf + progeny -tf_activities = list.files(path=compound,pattern="_50.txt",full.names=TRUE) -progeny_pathways = list.files(path=compound,pattern="scores_",full.names=TRUE) -progeny_pathways -R.version.string -# Script to install packages required -install_required <- function(x){ -for( i in x ){ -# require returns TRUE invisibly if it was able to load package -if( ! require( i , character.only = TRUE ) ){ -# If package was not able to be loaded then re-install -install.packages( i , dependencies = TRUE ) -# Load package after installing -require( i , character.only = TRUE ) -} -} -} -install_required(c(shiny,shinyjs)) -install_required(c("shiny","shinyjs")) -install_required(c("shiny","shinyjs","igraph","DT","miniUI","shinysky","shinyalert")) -runApp('OneDrive - University Of Cambridge/MoA_Tool') -library(shiny) -library(shinyjs) -library(igraph) -library(DT) -library(miniUI) -library(chemdoodle) -library(rhandsontable) -library(shinysky) -library(shinyBS) -library(shinythemes) -library(shinyFiles) -library(org.Hs.eg.db) -library(dorothea) -library(dplyr) -library(tibble) -library(ggplot2) -library(progeny) -library(CARNIVAL) -library(visNetwork) -library(piano) -library(HGNChelper) -library(shinyalert) -library(shinyWidgets) -shiny::runApp('OneDrive - University Of Cambridge/MoA_Tool') -runApp('OneDrive - University Of Cambridge/MoA_Tool') -runApp('OneDrive - University Of Cambridge/MoA_Tool') -runApp('OneDrive - University Of Cambridge/MoA_Tool') -runApp('OneDrive - University Of Cambridge/MoA_Tool') -runApp('OneDrive - University Of Cambridge/MoA_Tool') -runApp('OneDrive - University Of Cambridge/MoA_Tool') -runApp('OneDrive - University Of Cambridge/MoA_Tool') -runApp('OneDrive - University Of Cambridge/MoA_Tool') -runApp('OneDrive - University Of Cambridge/MoA_Tool') -runApp('OneDrive - University Of Cambridge/MoA_Tool') -shiny::runApp('OneDrive - University Of Cambridge/MoA_Tool') -install.packages("sortable") -runApp('OneDrive - University Of Cambridge/MoA_Tool') -# Version info: R 3.2.3, Biobase 2.30.0, GEOquery 2.40.0, limma 3.26.8 -################################################################ -# Differential expression analysis with limma -library(GEOquery) -BiocManager::install("GEOQuery") -BiocManager::install("GEOquery") -# Version info: R 3.2.3, Biobase 2.30.0, GEOquery 2.40.0, limma 3.26.8 -################################################################ -# Differential expression analysis with limma -library(GEOquery) -library(limma) -library(umap) -gset <- getGEO("GSE129254", GSEMatrix =TRUE, AnnotGPL=TRUE) -if (length(gset) > 1) idx <- grep("GPL10558", attr(gset, "names")) else idx <- 1 -gset <- gset[[idx]] -# make proper column names to match toptable -fvarLabels(gset) <- make.names(fvarLabels(gset)) -# group membership for all samples -gsms <- "XXXXXXXXX000111XXX" -sml <- strsplit(gsms, split="")[[1]] -# filter out excluded samples (marked as "X") -sel <- which(sml != "X") -sml <- sml[sel] -gset <- gset[ ,sel] -# log2 transformation -ex <- exprs(gset) -qx <- as.numeric(quantile(ex, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T)) -LogC <- (qx[5] > 100) || -(qx[6]-qx[1] > 50 && qx[2] > 0) -if (LogC) { ex[which(ex <= 0)] <- NaN -exprs(gset) <- log2(ex) } -# assign samples to groups and set up design matrix -gs <- factor(sml) -groups <- make.names(c("Control","Treatment")) -levels(gs) <- groups -gset$group <- gs -design <- model.matrix(~group + 0, gset) -colnames(design) <- levels(gs) -fit <- lmFit(gset, design) # fit linear model -# set up contrasts of interest and recalculate model coefficients -cts <- paste(groups[1], groups[2], sep="-") -cont.matrix <- makeContrasts(contrasts=cts, levels=design) -fit2 <- contrasts.fit(fit, cont.matrix) -# compute statistics and table of top significant genes -fit2 <- eBayes(fit2, 0.01) -tT <- topTable(fit2, adjust="fdr", sort.by="B", number=250) -?topTable -fit2 -design -# group membership for all samples -gsms <- "XXXXXXXXX111000XXX" -sml <- strsplit(gsms, split="")[[1]] -# filter out excluded samples (marked as "X") -sel <- which(sml != "X") -sml <- sml[sel] -gset <- gset[ ,sel] -gset <- getGEO("GSE129254", GSEMatrix =TRUE, AnnotGPL=TRUE) -if (length(gset) > 1) idx <- grep("GPL10558", attr(gset, "names")) else idx <- 1 -gset <- gset[[idx]] -# make proper column names to match toptable -fvarLabels(gset) <- make.names(fvarLabels(gset)) -# group membership for all samples -gsms <- "XXXXXXXXX111000XXX" -sml <- strsplit(gsms, split="")[[1]] -# filter out excluded samples (marked as "X") -sel <- which(sml != "X") -sml <- sml[sel] -gset <- gset[ ,sel] -# log2 transformation -ex <- exprs(gset) -qx <- as.numeric(quantile(ex, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T)) -LogC <- (qx[5] > 100) || -(qx[6]-qx[1] > 50 && qx[2] > 0) -if (LogC) { ex[which(ex <= 0)] <- NaN -exprs(gset) <- log2(ex) } -# assign samples to groups and set up design matrix -gs <- factor(sml) -groups <- make.names(c("Treatment","Control")) -levels(gs) <- groups -gset$group <- gs -design <- model.matrix(~group + 0, gset) -colnames(design) <- levels(gs) -fit <- lmFit(gset, design) # fit linear model -design -fit -design -# set up contrasts of interest and recalculate model coefficients -cts <- paste(groups[1], groups[2], sep="-") -cont.matrix <- makeContrasts(contrasts=cts, levels=design) -fit2 <- contrasts.fit(fit, cont.matrix) -# compute statistics and table of top significant genes -fit2 <- eBayes(fit2, 0.01) -fit2 -47318+5 -tT <- topTable(fit2, adjust="fdr", sort.by="B", number=47323) -tT <- subset(tT, select=c("ID","adj.P.Val","P.Value","t","B","logFC","Gene.symbol","Gene.title")) -View(tT) -View(tT) -tT <- topTable(fit2, adjust="fdr", sort.by="B", number=Inf) -tT <- subset(tT, select=c("ID","adj.P.Val","P.Value","t","B","logFC","Gene.symbol","Gene.title")) -tT2 <- tT %>% -dplyr::select(Gene.symbol, t) %>% -na.omit() %>% -distinct() %>% -group_by(Gene.symbol) %>% -summarize(t=mean(t)) -R -version() -version -sessionInfo(0) -sessionInfo() -library(dplyr) -sessionInfo() -# Version info: R 4.0.3, GEOquery 2.58.0, limma 3.46.0, dplyr 1.0.6 -# R code from GEO2R -################################################################ -# Differential expression analysis with limma -library(GEOquery) -library(limma) -library(dplyr) -# load series and platform data from GEO -gset <- getGEO("GSE129254", GSEMatrix =TRUE, AnnotGPL=TRUE) -if (length(gset) > 1) idx <- grep("GPL10558", attr(gset, "names")) else idx <- 1 -gset <- gset[[idx]] -# make proper column names to match toptable -fvarLabels(gset) <- make.names(fvarLabels(gset)) -# group membership for all samples -gsms <- "XXXXXXXXX111000XXX" -sml <- strsplit(gsms, split="")[[1]] -# filter out excluded samples (marked as "X") -sel <- which(sml != "X") -sml <- sml[sel] -gset <- gset[ ,sel] -# log2 transformation -ex <- exprs(gset) -qx <- as.numeric(quantile(ex, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T)) -LogC <- (qx[5] > 100) || -(qx[6]-qx[1] > 50 && qx[2] > 0) -if (LogC) { ex[which(ex <= 0)] <- NaN -exprs(gset) <- log2(ex) } -# assign samples to groups and set up design matrix -gs <- factor(sml) -groups <- make.names(c("Treatment","Control")) -levels(gs) <- groups -gset$group <- gs -design <- model.matrix(~group + 0, gset) -colnames(design) <- levels(gs) -fit <- lmFit(gset, design) # fit linear model -# set up contrasts of interest and recalculate model coefficients -cts <- paste(groups[1], groups[2], sep="-") -cont.matrix <- makeContrasts(contrasts=cts, levels=design) -fit2 <- contrasts.fit(fit, cont.matrix) -# compute statistics and table of top significant genes -fit2 <- eBayes(fit2, 0.01) -tT <- topTable(fit2, adjust="fdr", sort.by="B", number=Inf) -# -tT <- subset(tT, select=c("ID","adj.P.Val","P.Value","t","B","logFC","Gene.symbol","Gene.title")) -tT2 <- tT %>% -dplyr::select(Gene.symbol, t) %>% -na.omit() %>% -distinct() %>% -group_by(Gene.symbol) %>% -summarize(t=mean(t)) -#write.table(tT, file=, row.names=F, sep="\t") -setwd("~/OneDrive - University Of Cambridge/MoA_Tool/lapatinib") -View(tT2) -View(tT) -?eBayes -# get rid of empty gene symbol rows -tT = tT[complete.cases(tT),] -View(tT) -# compute statistics and table of top significant genes -fit2 <- eBayes(fit2, 0.01) -tT <- topTable(fit2, adjust="fdr", sort.by="B", number=Inf) -tT <- subset(tT, select=c("ID","adj.P.Val","P.Value","t","B","logFC","Gene.symbol","Gene.title")) -# get rid of empty gene symbol rows, get mean t-stat per gene symbol -tT[tT==""] <- NA -tT2 <- tT %>% -dplyr::select(Gene.symbol, t) %>% -na.omit() %>% -distinct() %>% -group_by(Gene.symbol) %>% -summarize(t=mean(t)) -View(tT2) -# save tt -write.table(tT2, file="GSE129254_lapatinib_BT474", row.names=F, sep="\t") -# save tt -write.table(tT2, file="GSE129254_lapatinib_BT474.txt", row.names=F, sep="\t") -# save tt -write.table(tT2, file="GSE129254_lapatinib_BT474.txt", row.names=F, sep="\t",quote=F)