From 7341fb2068c97f9f082a67e42615f1f91469269c Mon Sep 17 00:00:00 2001
From: tma1 <107221080+tma1@users.noreply.github.com>
Date: Fri, 9 Aug 2024 12:30:32 -0400
Subject: [PATCH] Add files via upload

---
 petagraph/code/preprocessing/4DN_LOOP.R      |  7 ++++++
 petagraph/code/preprocessing/4DN_Q.R         |  3 +++
 petagraph/code/preprocessing/CLINVAR.R       | 23 ++++++++++++++++++++
 petagraph/code/preprocessing/CMAP.R          | 12 ++++++++++
 petagraph/code/preprocessing/HSCLO_GENCODE.R |  1 +
 petagraph/code/preprocessing/L1000.R         | 17 +++++++++++++++
 petagraph/code/preprocessing/MSIGDB.R        | 15 +++++++++++++
 7 files changed, 78 insertions(+)
 create mode 100644 petagraph/code/preprocessing/4DN_LOOP.R
 create mode 100644 petagraph/code/preprocessing/4DN_Q.R
 create mode 100644 petagraph/code/preprocessing/CLINVAR.R
 create mode 100644 petagraph/code/preprocessing/CMAP.R
 create mode 100644 petagraph/code/preprocessing/HSCLO_GENCODE.R
 create mode 100644 petagraph/code/preprocessing/L1000.R
 create mode 100644 petagraph/code/preprocessing/MSIGDB.R

diff --git a/petagraph/code/preprocessing/4DN_LOOP.R b/petagraph/code/preprocessing/4DN_LOOP.R
new file mode 100644
index 0000000..6b56094
--- /dev/null
+++ b/petagraph/code/preprocessing/4DN_LOOP.R
@@ -0,0 +1,61 @@
+# Build DNL4, the edge list that links every 4DN chromatin loop to the HSCLO
+# 1 kb bins ending at the start and end of its upstream and downstream anchors.
+#
+# Requires DS in the workspace: rows 1-4 of its first column hold the dataset
+# ids, and each dataset is read from "~/<id>.txt". File columns 1-3 are used as
+# the upstream anchor (chromosome, start, end) and columns 4-6 as the
+# downstream anchor.
+#
+# Result: DNL4, a 3-column character matrix (loop node, predicate, HSCLO node)
+# with four rows per loop. As before, it opens with one all-blank row and has
+# one more blank row ahead of each dataset's edges.
+
+# Print a coordinate as plain digits. paste() alone writes round doubles such
+# as 100000 as "1e+05", which would corrupt the node id.
+format_coordinate <- function(position) {
+  sprintf("%.0f", position)
+}
+
+# HSCLO node id for the 1 kb window that ends at `position`.
+hsclo_node <- function(chromosome, position) {
+  paste0(
+    "HSCLO ", chromosome, ".",
+    format_coordinate(position - 999), "-", format_coordinate(position)
+  )
+}
+
+# Anchor edges of one dataset, preceded by the blank separator row.
+loop_anchor_edges <- function(dataset_id) {
+  loops <- read.delim(paste0("~/", dataset_id, ".txt"))[, 1:6]
+  blank_row <- array("", c(1, 3))
+  loop_count <- nrow(loops)
+  # paste0() would turn zero-length columns into "", so stop early when empty.
+  if (loop_count == 0) {
+    return(blank_row)
+  }
+  # Loop ids paste the raw column values, exactly as 4DN_Q.R does, so both
+  # scripts name a loop identically.
+  loop_id <- paste0(
+    "4DNL ", dataset_id, ".", loops[, 1], ".", loops[, 2], "-", loops[, 3],
+    ".", loops[, 4], ".", loops[, 5], "-", loops[, 6]
+  )
+  predicates <- c("loop_us_start", "loop_us_end", "loop_ds_start", "loop_ds_end")
+  # rbind() followed by as.vector() interleaves the four anchors loop by loop.
+  hsclo_ids <- as.vector(rbind(
+    hsclo_node(loops[, 1], loops[, 2]),
+    hsclo_node(loops[, 1], loops[, 3]),
+    hsclo_node(loops[, 4], loops[, 5]),
+    hsclo_node(loops[, 4], loops[, 6])
+  ))
+  edges <- cbind(
+    rep(loop_id, each = 4),
+    rep(predicates, times = loop_count),
+    hsclo_ids
+  )
+  rbind(blank_row, unname(edges))
+}
+
+DNL4 <- do.call(rbind, c(
+  list(array("", c(1, 3))),
+  lapply(1:4, function(i) loop_anchor_edges(DS[i, 1]))
+))
\ No newline at end of file
diff --git a/petagraph/code/preprocessing/4DN_Q.R b/petagraph/code/preprocessing/4DN_Q.R
new file mode 100644
index 0000000..7cf2ac8
--- /dev/null
+++ b/petagraph/code/preprocessing/4DN_Q.R
@@ -0,0 +1,36 @@
+# Build DNQ4, the edge list that assigns every 4DN chromatin loop to a q-value
+# bin node.
+#
+# Requires DS in the workspace: rows 1-4 of its first column hold the dataset
+# ids, and each dataset is read from "~/<id>.txt". File columns 1-6 form the
+# loop id and column 12 holds the value that is binned.
+#
+# Result: DNQ4, a 3-column character matrix (loop node, predicate, bin node)
+# that opens with one all-blank row, as before.
+
+# Edges of one dataset, built for the whole file at once with paste0().
+loop_qvalue_edges <- function(dataset_id) {
+  loops <- read.delim(paste0("~/", dataset_id, ".txt"))[, c(1:6, 12)]
+  # paste0() would turn zero-length columns into "", so stop early when empty.
+  if (nrow(loops) == 0) {
+    return(array("", c(0, 3)))
+  }
+  loop_id <- paste0(
+    "4DNL ", dataset_id, ".", loops[, 1], ".", loops[, 2], "-", loops[, 3],
+    ".", loops[, 4], ".", loops[, 5], "-", loops[, 6]
+  )
+  # The bin is bounded by the powers of ten around the value, e.g. a value of
+  # 3e-5 gives "4DNQ 1e-5.1e-4".
+  # NOTE(review): a value of exactly 0 gives "1e-Inf" bounds and an exact power
+  # of ten gives two equal bounds; confirm whether either needs special handling.
+  magnitude <- log10(loops[, 7])
+  bin_id <- paste0(
+    "4DNQ ", "1e", floor(magnitude), ".", "1e", ceiling(magnitude)
+  )
+  unname(cbind(loop_id, "loop_has_qvalue_bin", bin_id))
+}
+
+DNQ4 <- do.call(rbind, c(
+  list(array("", c(1, 3))),
+  lapply(1:4, function(i) loop_qvalue_edges(DS[i, 1]))
+))
diff --git a/petagraph/code/preprocessing/CLINVAR.R b/petagraph/code/preprocessing/CLINVAR.R
new file mode 100644
index 0000000..1d30a28
--- /dev/null
+++ b/petagraph/code/preprocessing/CLINVAR.R
@@ -0,0 +1,45 @@
+# Build ClinVar_Edgelist: gene -> disease/phenotype edges from ClinVar records
+# whose significance text contains 'pathogenic'.
+# Requires the latest variant_summary.txt from the ClinVar website at NCBI
+library(stringr)
+
+# TRUE where `pattern` occurs; missing text counts as "no match", which is how
+# the earlier which()/str_which() filters treated it.
+has_pattern <- function(text, pattern) {
+  str_detect(text, pattern) %in% TRUE
+}
+
+VS <- read.delim(
+  "~/Path/variant_summary.txt",
+  header = FALSE, comment.char = "#"
+)
+# Columns 6, 7, 13 and 25 serve as gene id, significance, phenotype ids and
+# review status - presumably HGNC_ID, ClinicalSignificance, PhenotypeIDS and
+# ReviewStatus in the current file layout; TODO confirm.
+VS <- unique(VS[, c(6, 7, 13, 25)])
+# Rows are filtered with logical masks: a negative index built from an empty
+# which() result selects nothing, so "no match" used to delete every row.
+# NOTE(review): the patterns are case-sensitive, so text spelled "Pathogenic"
+# or "Uncertain significance" is not matched; confirm that this is intended.
+VS <- VS[has_pattern(VS[, 2], "pathogenic"), ]
+VS <- VS[!has_pattern(VS[, 2], "uncertain"), ]
+VS <- VS[!has_pattern(VS[, 2], "conflicting"), ]
+VS <- VS[!(VS[, 4] %in% "no assertion criteria provided"), ]
+
+# Phenotype ids are separated by '|', ';' or ','; emit one row per gene/id pair.
+V <- strsplit(str_replace_all(VS[, 3], "[|;]", ","), ",")
+VR <- unique(cbind(rep(VS[, 1], lengths(V)), as.character(unlist(V))))
+VR <- VR[!(VR[, 2] %in% c("", "-")), , drop = FALSE]
+VR <- VR[!has_pattern(VR[, 2], "condition"), , drop = FALSE]
+VR <- VR[!(VR[, 1] %in% "-"), , drop = FALSE]
+VR[, 2] <- str_remove_all(VR[, 2], "Human Phenotype Ontology:")
+VR[, 2] <- str_replace_all(VR[, 2], "MONDO:MONDO:", "MONDO:")
+# Normalising the prefixes can make two ids identical, so de-duplicate again.
+VR <- unique(VR)
+
+# The predicate was previously misspelled 'gene_assoicated_...'.
+ClinVar_Edgelist <- cbind(VR, "gene_associated_with_disease_or_phenotype")
+colnames(ClinVar_Edgelist) <- c("subject", "object", "predicate")
+# Temporaries are removed only now: VR used to be deleted before the edge list
+# was built from it, which stopped the script.
+rm(VS, V, VR, has_pattern)
\ No newline at end of file
diff --git a/petagraph/code/preprocessing/CMAP.R b/petagraph/code/preprocessing/CMAP.R
new file mode 100644
index 0000000..6148065
--- /dev/null
+++ b/petagraph/code/preprocessing/CMAP.R
@@ -0,0 +1,29 @@
+# Build CM, the CMAP edge list, with an extra 'object-' column that holds each
+# object name minus its last '-'-separated part.
+# CMAP edgelist data is required and can be obtained from https://maayanlab.cloud/Harmonizome/resource/Connectivity+Map
+CM <- read.delim("~/CMAP.txt")
+CM <- cbind(CM$source, CM$target, CM$weight)
+colnames(CM) <- c("subject", "object", "predicate")
+CM <- as.data.frame(CM)
+# The first data row is discarded - presumably a second header line in the
+# download; TODO confirm against the file.
+CM <- CM[-1, ]
+
+# Join all but the last part of a split name: c("a", "b", "c") gives "a-b",
+# c("a", "b") gives "a", and a single part is returned unchanged.
+strip_last_part <- function(parts) {
+  if (length(parts) > 1) {
+    paste(parts[-length(parts)], collapse = "-")
+  } else {
+    parts[1]
+  }
+}
+
+# vapply() over the split names also copes with an empty table, where the
+# former 1:length() loop would have run on indices 1 and 0.
+SMS <- strsplit(unique(CM[, 2]), "-")
+SM <- cbind(
+  object = unique(CM[, 2]),
+  `object-` = vapply(SMS, strip_last_part, character(1), USE.NAMES = FALSE)
+)
+CM <- merge(CM, SM, by = "object")
\ No newline at end of file
diff --git a/petagraph/code/preprocessing/HSCLO_GENCODE.R b/petagraph/code/preprocessing/HSCLO_GENCODE.R
new file mode 100644
index 0000000..e3f75f2
--- /dev/null
+++ b/petagraph/code/preprocessing/HSCLO_GENCODE.R
@@ -0,0 +1,17 @@
+# Fill column 3 of L with the HSCLO 1 kb bin node that contains the position
+# held in column 2 of G, on the chromosome named in column 6 of G.
+#
+# Requires G and L in the workspace, row-aligned. Bins run from ...001 to
+# ...000, so a position that is an exact multiple of 1000 belongs to the bin
+# that ends on it. Every row of G is processed; the row count used to be
+# hard-coded as 295559.
+position <- as.numeric(G[, 2])
+# ceiling() on the numeric position replaces the per-row `G[i,2] > l` test,
+# which compared text whenever G held its positions as strings.
+bin_end <- ceiling(position / 1000) * 1000
+# sprintf() prints the bounds as plain digits; paste() alone writes round
+# doubles such as 100000 as "1e+05".
+L[seq_len(nrow(G)), 3] <- paste0(
+  "HSCLO ", G[, 6], ".",
+  sprintf("%.0f", bin_end - 999), "-", sprintf("%.0f", bin_end)
+)
\ No newline at end of file
diff --git a/petagraph/code/preprocessing/L1000.R b/petagraph/code/preprocessing/L1000.R
new file mode 100644
index 0000000..23bc76a
--- /dev/null
+++ b/petagraph/code/preprocessing/L1000.R
@@ -0,0 +1,28 @@
+# Build L1000, the edge list linking PubChem compounds to the genes they are
+# positively or negatively correlated with.
+# L1000 edgelist data is required and can be obtained from https://maayanlab.cloud/Harmonizome/resource/LINCS+L1000+Connectivity+Map
+# SM.csv, read from the working directory, must have exactly two columns; they
+# are used as the drug name and its PubChem CID.
+L1000 <- read.delim("~/L1000.txt")
+# The drug name is the part of `target` in front of the first "_".
+Target <- strsplit(L1000$target, "_")
+Drugs <- vapply(
+  Target, function(parts) parts[1], character(1),
+  USE.NAMES = FALSE
+)
+L1000 <- cbind(L1000$source, Drugs, L1000$weight)
+colnames(L1000) <- c("subject", "object", "predicate")
+Small_Molecules <- read.csv("SM.csv")
+colnames(Small_Molecules) <- c("object", "pubchem_cid")
+# merge() puts the key first: object, subject, predicate, pubchem_cid.
+L1000 <- merge(L1000, Small_Molecules, by = "object")
+# Reorder to compound, weight, gene and rename to the edge-list convention.
+L1000 <- L1000[, c(4, 3, 2)]
+colnames(L1000) <- c("subject", "predicate", "object")
+# Convert the weight once, before any label is written. Converting the column
+# again after the first relabelling raised "NAs introduced by coercion".
+weight <- as.numeric(L1000$predicate)
+L1000$predicate[weight %in% 1] <- "positively_correlated_with_gene"
+L1000$predicate[weight %in% -1] <- "negatively_correlated_with_gene"
+L1000$subject <- paste("PUBCHEM", L1000$subject)
+L1000 <- unique(L1000)
\ No newline at end of file
diff --git a/petagraph/code/preprocessing/MSIGDB.R b/petagraph/code/preprocessing/MSIGDB.R
new file mode 100644
index 0000000..695d2e4
--- /dev/null
+++ b/petagraph/code/preprocessing/MSIGDB.R
@@ -0,0 +1,26 @@
+# Build MSIGDB, the gene-set -> gene edge list for the MSigDB collections C1,
+# C2 (without the sets whose name contains 'KEGG'), C3, C8 and H.
+library('msigdbr')
+library('stringr')
+
+# Fetch one collection and keep columns 3, 2 and 7 of the msigdbr table as
+# subject, predicate and object; column 2 is overwritten with `predicate`.
+# NOTE(review): the positions presumably select gs_name, gs_subcat and
+# human_gene_symbol - confirm against the installed msigdbr version.
+collection_edges <- function(category, predicate) {
+  gene_sets <- msigdbr(species = 'human', category = category)
+  edges <- as.data.frame(gene_sets[, c(3, 2, 7)])
+  edges[, 2] <- predicate
+  edges
+}
+
+C1 <- collection_edges("C1", "chr_band_contains_gene")
+C2 <- collection_edges("C2", "pathway_associated_with_gene")
+# A logical mask keeps every row when no name contains 'KEGG'; the negative
+# index built from an empty str_which() result used to drop all of them.
+C2 <- C2[!(str_detect(C2[, 1], "KEGG") %in% TRUE), ]
+C3 <- collection_edges("C3", "targets_expression_of_gene")
+C8 <- collection_edges("C8", "has_marker_gene")
+H <- collection_edges("H", "has_signature_gene")
+MSIGDB <- unique(rbind(C1, C2, C3, C8, H))
+colnames(MSIGDB) <- c("subject", "predicate", "object")
\ No newline at end of file