From 7341fb2068c97f9f082a67e42615f1f91469269c Mon Sep 17 00:00:00 2001
From: tma1 <107221080+tma1@users.noreply.github.com>
Date: Fri, 9 Aug 2024 12:30:32 -0400
Subject: [PATCH] Add files via upload

---
Review note: the hunks below were reformatted and corrected by hand, so the
blob ids on the "index" lines still refer to the files as first uploaded.

 petagraph/code/preprocessing/4DN_LOOP.R      | 49 ++++++++++++++++++++
 petagraph/code/preprocessing/4DN_Q.R         | 29 ++++++++++++
 petagraph/code/preprocessing/CLINVAR.R       | 47 +++++++++++++++++++
 petagraph/code/preprocessing/CMAP.R          | 26 +++++++++++
 petagraph/code/preprocessing/HSCLO_GENCODE.R | 18 +++++++
 petagraph/code/preprocessing/L1000.R         | 26 +++++++++++
 petagraph/code/preprocessing/MSIGDB.R        | 26 +++++++++++
 7 files changed, 221 insertions(+)
 create mode 100644 petagraph/code/preprocessing/4DN_LOOP.R
 create mode 100644 petagraph/code/preprocessing/4DN_Q.R
 create mode 100644 petagraph/code/preprocessing/CLINVAR.R
 create mode 100644 petagraph/code/preprocessing/CMAP.R
 create mode 100644 petagraph/code/preprocessing/HSCLO_GENCODE.R
 create mode 100644 petagraph/code/preprocessing/L1000.R
 create mode 100644 petagraph/code/preprocessing/MSIGDB.R

diff --git a/petagraph/code/preprocessing/4DN_LOOP.R b/petagraph/code/preprocessing/4DN_LOOP.R
new file mode 100644
index 0000000..6b56094
--- /dev/null
+++ b/petagraph/code/preprocessing/4DN_LOOP.R
@@ -0,0 +1,49 @@
+# Builds DNL4, a three-column character matrix of edges (loop identifier,
+# predicate, HSCLO code) with four rows per 4DN loop: one for each start/end
+# coordinate of the upstream (us) and downstream (ds) anchors.
+# `DS` must already exist in the workspace: DS[i, 1] (i = 1..4) names the
+# tab-delimited file "~/<name>.txt" that holds one loop per row.
+# NOTE(review): columns 1-6 are assumed to be chrom/start/end of the upstream
+# anchor followed by chrom/start/end of the downstream anchor; confirm
+# against the 4DN loop files.
+# NOTE(review): the all-blank rows (one at the top of DNL4, one ahead of each
+# dataset) and the column names c("u", "", "") are what the original
+# rbind()/cbind() loop produced. They are kept so downstream code sees the
+# same matrix; confirm whether they are actually wanted.
+
+# Plain decimal text: paste() alone renders a double such as 1e5 as "1e+05".
+fmt <- function(x) format(x, scientific = FALSE, trim = TRUE)
+
+# HSCLO code for the 1000 coordinates that end at `end`.
+hsclo <- function(chrom, end) {
+  paste0("HSCLO ", chrom, ".", fmt(end - 999), "-", fmt(end))
+}
+
+blank <- array("", c(1, 3))
+DNL4 <- blank
+for (i in 1:4) {
+  D <- read.delim(paste0("~/", DS[i, 1], ".txt"))
+  # Column 12 is selected as in the original script but is not used here.
+  D <- D[, c(1:6, 12)]
+  U <- blank
+  # Guarded so a file without rows contributes only its blank row instead of
+  # looping over 1:0.
+  if (nrow(D) > 0) {
+    u <- paste0(
+      "4DNL ", DS[i, 1], ".", D[, 1], ".", fmt(D[, 2]), "-", fmt(D[, 3]),
+      ".", D[, 4], ".", fmt(D[, 5]), "-", fmt(D[, 6])
+    )
+    # rbind() stacks the four vectors as rows; reading that matrix column by
+    # column yields us_start, us_end, ds_start, ds_end for loop 1, then loop 2.
+    bins <- as.vector(rbind(
+      hsclo(D[, 1], D[, 2]), hsclo(D[, 1], D[, 3]),
+      hsclo(D[, 4], D[, 5]), hsclo(D[, 4], D[, 6])
+    ))
+    preds <- rep(
+      c("loop_us_start", "loop_us_end", "loop_ds_start", "loop_ds_end"),
+      times = nrow(D)
+    )
+    edges <- cbind(u = rep(u, each = 4), preds, bins, deparse.level = 0)
+    U <- rbind(blank, edges)
+  }
+  DNL4 <- rbind(DNL4, U)
+}
\ No newline at end of file
diff --git a/petagraph/code/preprocessing/4DN_Q.R b/petagraph/code/preprocessing/4DN_Q.R
new file mode 100644
index 0000000..7cf2ac8
--- /dev/null
+++ b/petagraph/code/preprocessing/4DN_Q.R
@@ -0,0 +1,29 @@
+# Builds DNQ4, a three-column character matrix linking each 4DN loop
+# identifier to its q-value bin through "loop_has_qvalue_bin".
+# `DS` must already exist in the workspace: DS[i, 1] (i = 1..4) names the
+# tab-delimited file "~/<name>.txt" that holds one loop per row.
+# NOTE(review): column 12 is assumed to hold the loop q-value; confirm.
+# NOTE(review): the leading all-blank row of DNQ4 comes from the original
+# array("", c(1, 3)) seed. It is kept so downstream code sees the same matrix.
+
+# Plain decimal text: paste() alone renders a double such as 1e5 as "1e+05".
+fmt <- function(x) format(x, scientific = FALSE, trim = TRUE)
+
+DNQ4 <- array("", c(1, 3))
+for (i in 1:4) {
+  D <- read.delim(paste0("~/", DS[i, 1], ".txt"))
+  D <- D[, c(1:6, 12)]
+  U <- array("", c(nrow(D), 3))
+  # Guarded so a file without rows adds no rows; a 1:0 loop would index past U.
+  if (nrow(D) > 0) {
+    U[, 1] <- paste0(
+      "4DNL ", DS[i, 1], ".", D[, 1], ".", fmt(D[, 2]), "-", fmt(D[, 3]),
+      ".", D[, 4], ".", fmt(D[, 5]), "-", fmt(D[, 6])
+    )
+    U[, 2] <- "loop_has_qvalue_bin"
+    # Bin = the powers of ten bracketing the q-value, e.g. 3e-5 -> 1e-5.1e-4.
+    lq <- log10(D[, 7])
+    U[, 3] <- paste0("4DNQ ", "1e", floor(lq), ".", "1e", ceiling(lq))
+  }
+  DNQ4 <- rbind(DNQ4, U)
+}
diff --git a/petagraph/code/preprocessing/CLINVAR.R b/petagraph/code/preprocessing/CLINVAR.R
new file mode 100644
index 0000000..1d30a28
--- /dev/null
+++ b/petagraph/code/preprocessing/CLINVAR.R
@@ -0,0 +1,47 @@
+# Requires the latest variant_summary.txt from the ClinVar website at NCBI
+# Builds ClinVar_Edgelist (subject, object, predicate): one row per distinct
+# pair of a column-6 value and one of the separated tokens of column 13,
+# taken from rows whose column 7 contains 'pathogenic' and that survive the
+# exclusions below.
+# NOTE(review): columns 6, 7, 13 and 25 are presumably the gene identifier,
+# clinical significance, phenotype identifiers and review status; verify
+# against the header of the variant_summary.txt release in use.
+library(stringr)
+
+# Removes rows `idx` from `x`; an empty `idx` removes nothing. x[-idx, ] would
+# instead lose every row, because x[-integer(0), ] is x[integer(0), ].
+drop_rows <- function(x, idx) {
+  x[setdiff(seq_len(nrow(x)), idx), , drop = FALSE]
+}
+
+VS <- read.delim(
+  "~/Path/variant_summary.txt",
+  header = FALSE, comment.char = "#"
+)
+VS <- unique(VS[, c(6, 7, 13, 25)])
+VS <- VS[str_which(VS[, 2], 'pathogenic'), ]
+VS <- drop_rows(VS, str_which(VS[, 2], 'uncertain'))
+VS <- drop_rows(VS, str_which(VS[, 2], 'conflicting'))
+VS <- drop_rows(VS, which(VS[, 4] == 'no assertion criteria provided'))
+# Turn the '|' and ';' separators into ',' so one split covers all three.
+VS[, 3] <- str_replace_all(VS[, 3], '\\|', ',')
+VS[, 3] <- str_replace_all(VS[, 3], ';', ',')
+V <- strsplit(VS[, 3], ',')
+# One row per token, pairing it with column 1 of the row it came from. Built
+# in one step instead of rbind() per variant; row order differs from the
+# original prepend loop, the final edge set does not.
+VR <- matrix(c(rep(VS[, 1], lengths(V)), unlist(V)), ncol = 2)
+VR <- unique(VR)
+VR <- drop_rows(VR, which(VR[, 2] == ""))
+VR <- drop_rows(VR, str_which(VR[, 2], 'condition'))
+VR <- drop_rows(VR, which(VR[, 2] == '-'))
+VR <- drop_rows(VR, which(VR[, 1] == '-'))
+VR[, 2] <- str_remove_all(VR[, 2], 'Human Phenotype Ontology:')
+VR[, 2] <- str_replace_all(VR[, 2], 'MONDO:MONDO:', 'MONDO:')
+# VR stays in the workspace: the edge list below is built from it.
+rm(VS, V, drop_rows)
+ClinVar_Edgelist <- cbind(
+  VR,
+  array('gene_associated_with_disease_or_phenotype', c(dim(VR)[1], 1))
+)
+colnames(ClinVar_Edgelist) <- c('subject', 'object', 'predicate')
\ No newline at end of file
diff --git a/petagraph/code/preprocessing/CMAP.R b/petagraph/code/preprocessing/CMAP.R
new file mode 100644
index 0000000..6148065
--- /dev/null
+++ b/petagraph/code/preprocessing/CMAP.R
@@ -0,0 +1,26 @@
+#CMAP edgelist data is required and can be obtained from https://maayanlab.cloud/Harmonizome/resource/Connectivity+Map
+# Builds CM: the source/target/weight columns of the edge list (renamed
+# subject/object/predicate) merged with an 'object-' column that holds each
+# distinct target without its last '-'-separated token.
+CM <- read.delim("~/CMAP.txt")
+CM <- cbind(CM$source, CM$target, CM$weight)
+colnames(CM) <- c('subject', 'object', 'predicate')
+CM <- as.data.frame(CM)
+# NOTE(review): drops the first data row; presumably a second header line in
+# the Harmonizome export - confirm.
+CM <- CM[-1, ]
+SM <- cbind(unique(CM[, 2]), unique(CM[, 2]))
+SMS <- strsplit(SM[, 1], '-')
+# Targets that split into fewer than two tokens keep their first token as is
+# (NA when there is none); vapply() also copes with an empty target list,
+# where the former 1:length(SMS) loop failed.
+SM[, 2] <- vapply(
+  SMS,
+  function(v) {
+    if (length(v) < 2) v[1] else paste(v[-length(v)], collapse = '-')
+  },
+  character(1),
+  USE.NAMES = FALSE
+)
+colnames(SM) <- c('object', 'object-')
+CM <- merge(CM, SM, by = 'object')
\ No newline at end of file
diff --git a/petagraph/code/preprocessing/HSCLO_GENCODE.R b/petagraph/code/preprocessing/HSCLO_GENCODE.R
new file mode 100644
index 0000000..e3f75f2
--- /dev/null
+++ b/petagraph/code/preprocessing/HSCLO_GENCODE.R
@@ -0,0 +1,18 @@
+# Fills L[, 3] with the HSCLO code of the 1000-coordinate bin containing the
+# position in G[, 2], labelled with G[, 6]: positions 1-1000 map to "1-1000",
+# 1001-2000 to "1001-2000", and so on.
+# `G` and `L` must already exist in the workspace.
+# NOTE(review): the row count was hard-coded as 295559; it now follows
+# nrow(G), so L needs at least that many rows.
+
+# Plain decimal text: paste() alone renders a double such as 1e5 as "1e+05".
+fmt <- function(x) format(x, scientific = FALSE, trim = TRUE)
+
+if (nrow(G) > 0) {
+  # Convert once so the arithmetic is numeric even if G[, 2] is stored as text.
+  # The bin's last coordinate is the position rounded up to a multiple of 1000.
+  upper <- ceiling(as.numeric(G[, 2]) / 1000) * 1000
+  L[seq_len(nrow(G)), 3] <- paste0(
+    "HSCLO", " ", G[, 6], ".", fmt(upper - 999), "-", fmt(upper)
+  )
+}
\ No newline at end of file
diff --git a/petagraph/code/preprocessing/L1000.R b/petagraph/code/preprocessing/L1000.R
new file mode 100644
index 0000000..23bc76a
--- /dev/null
+++ b/petagraph/code/preprocessing/L1000.R
@@ -0,0 +1,26 @@
+#L1000 edgelist data is required and can be obtained from https://maayanlab.cloud/Harmonizome/resource/LINCS+L1000+Connectivity+Map
+# Builds L1000, a data frame of distinct (subject, predicate, object) rows:
+# subject is "PUBCHEM <pubchem_cid>" looked up in SM.csv from the text before
+# the first "_" of each target, object is the source column, and predicate is
+# the weight, relabelled when it equals 1 or -1.
+L1000 <- read.delim("~/L1000.txt")
+L1000 <- as.data.frame(L1000)
+Target <- strsplit(L1000$target, "_")
+# First token of every target; vapply() also handles zero rows, where the
+# former 1:length(Target) loop failed.
+Drugs <- vapply(Target, function(v) v[1], character(1), USE.NAMES = FALSE)
+L1000 <- cbind(L1000$source, Drugs, L1000$weight)
+Small_Molecules <- read.csv("SM.csv")
+colnames(L1000) <- c("subject", "object", "predicate")
+colnames(Small_Molecules) <- c("object", "pubchem_cid")
+L1000 <- merge(L1000, Small_Molecules, by = "object")
+# After the merge the columns are object, subject, predicate, pubchem_cid.
+L1000 <- L1000[, c(4, 3, 2)]
+colnames(L1000) <- c("subject", "predicate", "object")
+# Parse the weights once, before any label is written: re-parsing the column
+# after the first relabel would coerce the new label to NA with a warning.
+weight <- as.numeric(L1000$predicate)
+L1000[which(weight == 1), 2] <- "positively_correlated_with_gene"
+L1000[which(weight == -1), 2] <- "negatively_correlated_with_gene"
+L1000$subject <- paste("PUBCHEM", L1000$subject)
+L1000 <- unique(L1000)
\ No newline at end of file
diff --git a/petagraph/code/preprocessing/MSIGDB.R b/petagraph/code/preprocessing/MSIGDB.R
new file mode 100644
index 0000000..695d2e4
--- /dev/null
+++ b/petagraph/code/preprocessing/MSIGDB.R
@@ -0,0 +1,26 @@
+# Builds MSIGDB, a data frame of distinct (subject, predicate, object) rows
+# from five MSigDB collections, one fixed predicate per collection.
+# NOTE(review): columns 3 and 7 of the msigdbr() result are presumably the
+# gene-set name and a gene symbol; positions depend on the installed msigdbr
+# version, so verify them.
+library('msigdbr')
+library('stringr')
+
+# Columns 3, 2, 7 of one collection; column 2 is replaced by `predicate`.
+load_collection <- function(category, predicate) {
+  genes <- msigdbr(species = 'human', category = category)
+  genes <- as.data.frame(genes[, c(3, 2, 7)])
+  genes[, 2] <- predicate
+  genes
+}
+
+C1 <- load_collection('C1', 'chr_band_contains_gene')
+C2 <- load_collection('C2', 'pathway_associated_with_gene')
+# Exclude rows whose column 1 contains 'KEGG'. setdiff() keeps every row when
+# nothing matches, whereas C2[-str_which(...), ] would then return zero rows.
+C2 <- C2[setdiff(seq_len(nrow(C2)), str_which(C2[, 1], 'KEGG')), ]
+C3 <- load_collection('C3', 'targets_expression_of_gene')
+C8 <- load_collection('C8', 'has_marker_gene')
+H <- load_collection('H', 'has_signature_gene')
+MSIGDB <- unique(rbind(C1, C2, C3, C8, H))
+colnames(MSIGDB) <- c('subject', 'predicate', 'object')
\ No newline at end of file