You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
1. Use lineage 5 as root for just the smith data.
2. Remove all non-common genes and see how the Smith/Severo cells cluster - expect them to be dispersed around, not clustering in a single spot.
3. Do severo data by themselves
4. Find genes with expression that define the populations.
# Some genes have very high counts in the Severo data, so the counts need to be adjusted; on average the Severo data has thousands more reads.
# The Severo genes with high counts are not associated with especially large genes, so the counts appear to be normalized already.
less -S AllGenesSmithSeveroFormat.txt |awk 'NR>1' |sort -k2,2nr |less -S
awk '$3=="gene" {print $5-$4,$9}' Anopheles_gambiae.AgamP4.46.gff3 |sort -k1,1nr |less
less Cell_metadata_Combine |cut -f 1,4 |awk '{if($2!=9){print $1"\t0"}else {print $1"\t1"}}' >ConditionTable.txt
cp AllGenesSmithSeveroFormat.txt DeseqTable.txt
#modified first col of header
vi DeseqTable.txt
#add 1 to every gene count to allow for deseq log transformation
#single awk pass: the header line (NR==1) and the gene-name column (field 1) pass through unchanged,
#every other tab-separated field gets +1, and no trailing tabs are written at the end of the rows
awk 'BEGIN{FS=OFS="\t"} NR>1{for(i=2;i<=NF;i++)$i+=1} 1' DeseqTable.txt >DeseqTable2.txt
#using DESEQ to normalize genes
ml r-deseq2/1.20.0-py2-r3.5-openmpi3-zhebatp
R
library("DESeq2")
# Count table with the first column used as row names; ceiling() rounds any
# fractional values up so DESeq2 receives whole-number counts.
dat <- as.matrix(ceiling(read.table("DeseqTable2.txt", header = TRUE,
                                    quote = "", row.names = 1)))
# Two-level design assigned purely by column position: the first 25 columns
# are condition "1", the remaining 239 are condition "0" (reference level).
condition <- factor(c(rep("1", 25), rep("0", 239)), levels = c("0", "1"))
# Fail early with a clear message if the table does not have 264 columns.
stopifnot(ncol(dat) == length(condition))
# NOTE(review): ConditionTable.txt used to be read into coldata here and was
# overwritten on the very next line, so it never influenced the design; the
# dead read has been dropped. Confirm the 25/239 column order matches the
# per-cell labels in ConditionTable.txt.
coldata <- data.frame(row.names = colnames(dat), condition)
dds <- DESeqDataSetFromMatrix(countData = dat, colData = coldata,
                              design = ~ condition)
dds <- DESeq(dds)
###############################################################################
> dat<-ceiling(read.table("DeseqTable2.txt",header = T,quote = "",row.names = 1))
> dat <- as.matrix(dat)
> condition <- factor(c(rep("1",25),rep("0",239)))
> condition=relevel(condition,ref = "0")
> coldata <-read.table("ConditionTable.txt",header = T,row.names = 1)
> coldata <- data.frame(row.names=colnames(dat), condition)
> dds <- DESeqDataSetFromMatrix(countData = dat, colData = coldata,design=~ condition)
converting counts to integer mode
> dds <- DESeq(dds)
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
-- note: fitType='parametric', but the dispersion trend was not well captured by the
function: y = a/x + b, and a local regression fit was automatically substituted.
specify fitType='local' or 'mean' to avoid this message next time.
final dispersion estimates
fitting model and testing
-- replacing outliers and refitting for 5446 genes
-- DESeq argument 'minReplicatesForReplace' = 7
-- original counts are preserved in counts(dds)
estimating dispersions
fitting model and testing
Warning message:
In fitDisp(ySEXP = ySEXP, xSEXP = xSEXP, mu_hatSEXP = mu_hatSEXP, :
'.Random.seed[1]' is not a valid integer, so ignored
#########################################################################################
# Pull the DESeq2 results, tabulate how many genes pass padj < 0.05, and sort
# the result rows by adjusted p-value.
res <- results(dds)
table(res$padj < 0.05)
res <- res[order(res$padj), ]
# Attach the size-factor-normalized counts to the statistics (joined on the
# row names), name the key column "Gene", and write everything to disk.
normalized_counts <- as.data.frame(counts(dds, normalized = TRUE))
resdata <- merge(as.data.frame(res), normalized_counts,
                 by = "row.names", sort = FALSE)
names(resdata)[1] <- "Gene"
write.csv(resdata, file = "SmithSeveroNormalized",
          quote = FALSE, row.names = FALSE)
#################################################################################
FALSE TRUE
2766 10666
> res <- res[order(res$padj), ]
> resdata <- merge(as.data.frame(res), as.data.frame(counts(dds, normalized=TRUE)), by="row.names", sort=FALSE)
> names(resdata)[1] <- "Gene"
> write.csv(resdata, file="SmithSeveroNormalized",quote = FALSE,row.names = F)
> rld <- rlogTransformation(dds)
rlog() may take a long time with 50 or more samples,
vst() is a much faster transformation
^C
> help(vst)
> rld <- vst(dds)
-- note: fitType='parametric', but the dispersion trend was not well captured by the
function: y = a/x + b, and a local regression fit was automatically substituted.
specify fitType='local' or 'mean' to avoid this message next time.
# Variance-stabilizing transformation of the DESeq2 object, then a PCA plot
# with every point labelled by its "name" column.
library(ggplot2)
vsd <- vst(dds)
pca_plot <- plotPCA(vsd) +
  geom_text(aes_string(x = "PC1", y = "PC2", label = "name"),
            color = "black")
print(pca_plot)
q()
mv Rplots.pdf DESEQPCA.pdf
#convert the DESeq2 csv into a tab-delimited table for monocle: commas become tabs, then only column 1 (Gene)
#and columns 8 onward are kept (presumably the per-cell normalized counts that follow the DESeq2 statistics
#columns -- confirm); data rows are version-sorted by column 1 and the original header line is put back on top
less -S SmithSeveroNormalized| sed 's/,/\t/g' |cut -f 1,8- |awk 'NR>1' |sort -k1,1V |cat <(awk 'NR==1' SmithSeveroNormalized| sed 's/,/\t/g' |cut -f 1,8- ) - >NormalizedSmithSeveroCounts4Monocle.tab
#fix the header to match col1 of nothing
vi NormalizedSmithSeveroCounts4Monocle.tab
#DESEQ PCA
Run monocle to see if this normalized dataset accomplishes the clustering
library(monocle3)
library(dplyr)
library(Matrix)
# Expression table (first column = row names), rounded to whole numbers,
# plus the gene and cell annotation tables.
expression_matrix2 <- round(as.matrix(
  read.table("NormalizedSmithSeveroCounts4Monocle.tab",
             header = TRUE, row.names = 1)
))
gene_metadata2 <- as.matrix(read.table("AllGeneMetadata",
                                       header = TRUE, row.names = 1))
cell_metadata2 <- as.matrix(read.table("Cell_metadata_Combine",
                                       header = TRUE, row.names = 1))
M1 <- as(expression_matrix2, "dgCMatrix")
#if this breaks, your files are wrong
cds <- new_cell_data_set(M1, cell_metadata = cell_metadata2,
                         gene_metadata = gene_metadata2)
cds <- preprocess_cds(cds, num_dim = 100)
cds <- reduce_dimension(cds, reduction_method = "UMAP")
cds <- cluster_cells(cds)
cds <- learn_graph(cds)
# Root the trajectory at the third cell (column 3 of the cell data set).
# The earlier form colData(cds[,3], on=5) passed on=5, which is not a
# colData argument and had no effect on which cell was selected.
# NOTE(review): confirm that cell 3 is really the intended "lineage 5" root.
cds <- order_cells(cds, root_cells = colnames(cds)[3])
plot_cells(cds, reduction_method = "UMAP", color_cells_by = "group",
           cell_size = 1.3, label_cell_groups = FALSE, label_leaves = TRUE,
           graph_label_size = 2, label_branch_points = TRUE)
#This places the severo samples further away from the smith samples, so normalization of both datasets made this worse.
Try to use upper quartile normalization on severo data
# Calculated the upper quartile in Excel, divided all values by their upper quartile, then averaged all of the upper quartiles and multiplied all values by the mean of all upper quartiles.
ModifiedForExcelAllGenesSmithSeveroFormatNormalized.txt
library(monocle3)
library(dplyr)
library(Matrix)
# Upper-quartile-normalized expression table (first column = row names),
# rounded to whole numbers, plus the gene and cell annotation tables.
expression_matrix2 <- round(as.matrix(
  read.table("ModifiedForExcelAllGenesSmithSeveroFormatNormalized.txt",
             header = TRUE, row.names = 1)
))
gene_metadata2 <- as.matrix(read.table("AllGeneMetadata",
                                       header = TRUE, row.names = 1))
cell_metadata2 <- as.matrix(read.table("Cell_metadata_Combine",
                                       header = TRUE, row.names = 1))
M1 <- as(expression_matrix2, "dgCMatrix")
#if this breaks, your files are wrong
cds <- new_cell_data_set(M1, cell_metadata = cell_metadata2,
                         gene_metadata = gene_metadata2)
cds <- preprocess_cds(cds, num_dim = 100)
cds <- reduce_dimension(cds, reduction_method = "UMAP")
cds <- cluster_cells(cds)
cds <- learn_graph(cds)
# Root the trajectory at the third cell (column 3 of the cell data set).
# The earlier form colData(cds[,3], on=5) passed on=5, which is not a
# colData argument and had no effect on which cell was selected.
# NOTE(review): confirm that cell 3 is really the intended "lineage 5" root.
cds <- order_cells(cds, root_cells = colnames(cds)[3])
plot_cells(cds, reduction_method = "UMAP", color_cells_by = "group",
           cell_size = 1.3, label_cell_groups = FALSE, label_leaves = TRUE,
           graph_label_size = 2, label_branch_points = TRUE)
This was worse than the original plot without any normalization.
Create monocle plots that associate genes with clusters of cells