Overall_Analysis.Rmd

---
title: "Overall Analysis"
author: "Alexander Schulz"
date: "4 2 2020"
output: html_notebook
---

```{r include=FALSE}
library("seqinr")
library("ggplot2")
library("ccmotif") # Version 0.6.6
library("Biostrings")
library("knitr")
library("xlsx")
source("Scripts/DataAnalysis.R")
```

Change here: set the directories that contain the input FASTA files and the matching RDS matrix files.
```{r}
# Input locations. Both paths must end with "/" because the file names are
# appended to them by plain string concatenation further down.
#
# `pattern` is a regular expression, not a shell glob, so the extension is
# escaped and anchored to the end of the file name.
pathSeq <- "../BA Circular Code/Workspace/"
filenames <- list.files(pathSeq, pattern = "\\.fasta$")

pathMatrix <- "../BA Circular Code/Workspace/"
matrixnames <- list.files(pathMatrix, pattern = "\\.RDS$")
```

Iterate through all files. Result is stored in data frame:
```{r echo=FALSE}
# Build the per-file summary tables `df` and `df_extra`.
#
# Every FASTA file in `filenames` is paired BY POSITION with the RDS matrix in
# `matrixnames`. For each pair the chunk prints diagnostics, draws the codon
# usage and motif length plots, and collects one row for each table.
# File names are expected to look like "<organism>_<code>_<frame>.fasta".

if (length(filenames) == 0) {
  stop("No FASTA files found in '", pathSeq, "'.", call. = FALSE)
}
if (length(filenames) != length(matrixnames)) {
  stop(
    "Found ", length(filenames), " FASTA files but ", length(matrixnames),
    " RDS matrices; they are paired by index and must match one-to-one.",
    call. = FALSE
  )
}

# Expected mean value of the motif length for a given code usage p.
E <- function(p) 1 / (1 - p)

# Collect the rows in lists and bind once after the loop instead of growing
# the data frames with rbind() on every iteration.
rows <- vector("list", length(filenames))
rows_extra <- vector("list", length(filenames))

for (i in seq_along(filenames)) {

  ff <- read.fasta(paste0(pathSeq, filenames[i]), as.string = TRUE, forceDNAtolower = FALSE)
  rds <- readRDS(paste0(pathMatrix, matrixnames[i]))

  # Decode "<organism>_<code>_<frame>.fasta"
  name_parts <- unlist(strsplit(filenames[i], "_"))
  organism <- name_parts[1]
  code <- name_parts[2]
  frame <- sub("\\.fasta$", "", name_parts[3])

  print(filenames[i])
  print(matrixnames[i])
  print(organism)
  print(code)
  print(frame)

  # The code object is needed by several calls below; look it up once.
  c3code <- codes.c3[[as.numeric(code)]]
  label <- paste(organism, code, frame, sep = "_")

  # Code with stop codons?
  # With ccmotif 0.6.6 the result is inverted: TRUE == no stop codon in code,
  # FALSE == contains stop codon. The negated value is stored in the tables.
  no_stop <- codes.containsStop(c3code$codons)

  # Codon distribution
  codons <- codon.splitlist(ff)
  cu <- codon.usage(codons)
  plot(cu, species = label)

  # Code usage
  p <- codes.usage(cu, c3code)
  print(paste("Achieved code Usage", round(p, 2)))

  # Amount of codons
  ca <- sum(rds)
  print(paste("Amount of codons: ", ca))

  # Unchanged codons (diagonal of the matrix)
  nc <- sum(diag(rds))
  nc_p <- nc / ca
  print(paste("Unchanged codons: ", nc, round(nc_p, 2)))

  # Codons that could not be changed
  unc <- unchangednonCCCodons(rds, c3code)
  unc_p <- unc / ca
  print(paste("--------- because no good subsitution: ", unc, round(unc_p, 2)))

  # Codons that did not need to be changed
  ucc <- nc - unc
  ucc_p <- ucc / ca
  print(paste("--------- because part of code before: ", ucc, round(ucc_p, 2)))

  # Changed codons
  cc <- ca - nc
  cc_p <- cc / ca
  print(paste("Changed codons: ", cc, round(cc_p, 2)))

  # Motif lengths
  ml <- ccmotif.lengthslist(ff, c3code)
  print("Incode:")
  sum_ml_incode <- summary(ml$incode)
  print(sum_ml_incode)
  max_ml <- sum_ml_incode[["Max."]] # max motif length, picked by name
  print("Outcode:")
  print(summary(ml$outcode))

  # Motif length distribution: theoretical distribution (red) compared with
  # the sample (blue)
  r <- ccmotif.classes(ml$incode, p, K = 18)
  print(ccmotif.barplotDiff(r$sample, r$geom, codeid = label))

  # Expected mean value of the motif length vs. real mean value
  mmt <- E(p)
  mm <- mean(ml$incode)
  print(round(c(sample = mm, expected = mmt), digits = 2))
  mmd <- mm - mmt   # difference
  mmi <- mmd / mmt  # relative increase (stored as % below)

  # Edit score and edit distance
  editScore <- getEditScore(rds)
  print(paste("Edit Score:", round(editScore, 4)))
  editDistance <- getEditDistance(rds, editScore, c3code)
  print(paste("Edit Distance:", round(editDistance, 2)))

  # Summary row
  rows[[i]] <- data.frame(Organism = organism,
                          Code = code,
                          HasStop = !no_stop,
                          Frame = frame,
                          Changed_Codons = round(cc_p * 100, 2),
                          Code_Usage = round(p * 100, 2),
                          Edit_Score = round(editScore, 3),
                          Edit_Distance = round(editDistance, 2),
                          Increase_MML = round(mmi * 100, 2),
                          Max_ML = max_ml)

  # Extra data: share of codons that were already part of the code
  # (stored as a fraction, not as a percentage)
  rows_extra[[i]] <- data.frame(Organism = organism,
                                Code = code,
                                HasStop = !no_stop,
                                Frame = frame,
                                bio_Code_Usage = round(ucc_p, 2))
}

df <- do.call(rbind, rows)
df_extra <- do.call(rbind, rows_extra)

names(df) <- c("Organism","Code","Stop-Codons in Code?","Frame","Mutated Codons (in %)","Achieved Code Usage (in %)","Edit-Score","Edit-Distance","Increase in Mean Motif Length (in %)","Max. Motif Length")

names(df_extra) <- c("Organism","Code","Stop-Codons in Code?","Frame", "bio Code usage")

```

Summary:
```{r}

# Render the summary table. It holds one row per analysed file, so the caption
# lists every organism present in the table rather than only the value left
# in the loop variable `organism` after the last iteration.
kable(
  df,
  caption = paste("Summary for", paste(unique(df$Organism), collapse = ", "))
)
```
```{r}
  # Persist the summary table so it can be reloaded later with readRDS().
  saveRDS(df, file = "Workspace/Summary_Analysis.RDS")
```

Complete data frame:
```{r}
# Unsorted table at the original output location (kept unchanged).
write.xlsx(df, file = "Workspace/Summary_Analysis.xlsx",
      sheetName = "Complete", append = FALSE)

# The sorted sheets further down are appended to the workbook in
# "analysis_results/". Start that workbook afresh here so that it also
# contains the "Complete" sheet and a re-run does not append sheets that
# already exist.
dir.create("analysis_results", showWarnings = FALSE, recursive = TRUE)
write.xlsx(df, file = "analysis_results/Summary_Analysis.xlsx",
      sheetName = "Complete", append = FALSE)
```

Sorted by mutated Codons
```{r}
# Rows in descending order of the mutated-codon share, appended as a sheet.
by_mutated <- order(-df[["Mutated Codons (in %)"]])
df_change <- df[by_mutated, ]

write.xlsx(
  df_change,
  file = "analysis_results/Summary_Analysis.xlsx",
  sheetName = "Sorted by Mutated Codons",
  append = TRUE
)
```

Sorted by achieved code usage:
```{r}
# Rows in descending order of the achieved code usage, appended as a sheet.
by_code_usage <- order(-df[["Achieved Code Usage (in %)"]])
df_cu <- df[by_code_usage, ]

write.xlsx(
  df_cu,
  file = "analysis_results/Summary_Analysis.xlsx",
  sheetName = "Sorted by Code Usage",
  append = TRUE
)
```
Sorted by edit-score (S)
```{r}
# Rows in descending order of the edit score, appended as a sheet.
by_edit_score <- order(-df[["Edit-Score"]])
df_s <- df[by_edit_score, ]

write.xlsx(
  df_s,
  file = "analysis_results/Summary_Analysis.xlsx",
  sheetName = "Sorted by Edit-Score",
  append = TRUE
)
```

Sorted by edit-distance (D)
```{r}
# Rows in descending order of the edit distance, appended as a sheet.
by_edit_distance <- order(-df[["Edit-Distance"]])
df_d <- df[by_edit_distance, ]

write.xlsx(
  df_d,
  file = "analysis_results/Summary_Analysis.xlsx",
  sheetName = "Sorted by Edit-Distance",
  append = TRUE
)
```

Sorted by increase in motif length
```{r}
# Rows in descending order of the mean-motif-length increase, appended as a
# sheet.
by_mml_increase <- order(-df[["Increase in Mean Motif Length (in %)"]])
df_inc <- df[by_mml_increase, ]

write.xlsx(
  df_inc,
  file = "analysis_results/Summary_Analysis.xlsx",
  sheetName = "Sorted by Increase MML",
  append = TRUE
)
```

Sorted by max motif length
```{r}
# Rows in descending order of the maximum motif length, appended as a sheet.
by_max_length <- order(-df[["Max. Motif Length"]])
df_max <- df[by_max_length, ]

write.xlsx(
  df_max,
  file = "analysis_results/Summary_Analysis.xlsx",
  sheetName = "Sorted by max Motif Length",
  append = TRUE
)
```

Durchschnittlicher Anteil an mutierten Codons über alle Organismen
```{r}
# Mean share of mutated codons (in %), over all rows and per frame.
mutated_pct <- df$`Mutated Codons (in %)`
round(mean(mutated_pct), 2) # all frames

round(mean(mutated_pct[which(df$Frame == 0)]), 2) # frame 0
round(mean(mutated_pct[which(df$Frame == 1)]), 2) # frame 1
round(mean(mutated_pct[which(df$Frame == 2)]), 2) # frame 2
```
Durchschnittlicher Anteil der Code Usage vor den Mutationen
```{r}
# Mean code usage before the mutations, per frame. The column holds fractions,
# hence the factor 100 to report percentages. Values and frame mask are both
# taken from `df_extra`, so the subset and its mean refer to the same table.
bio_usage <- df_extra$`bio Code usage`
round(mean(bio_usage[which(df_extra$Frame == 0)]) * 100, 2) # frame 0
round(mean(bio_usage[which(df_extra$Frame == 1)]) * 100, 2) # frame 1
round(mean(bio_usage[which(df_extra$Frame == 2)]) * 100, 2) # frame 2
```

Durchschnittlich erreichte Code Usage
```{r}
# Mean achieved code usage (in %), over all rows and per frame.
achieved_pct <- df$`Achieved Code Usage (in %)`
round(mean(achieved_pct), 2) # all frames

round(mean(achieved_pct[which(df$Frame == 0)]), 2) # frame 0
round(mean(achieved_pct[which(df$Frame == 1)]), 2) # frame 1
round(mean(achieved_pct[which(df$Frame == 2)]), 2) # frame 2
```
Erreichte Code usage unterschieden zwischen Stopp und nicht-Stopp:
```{r}
# Average achieved code usage (in %), split by the "Stop-Codons in Code?" flag.
usage_flag_false <- df[["Achieved Code Usage (in %)"]][which(df[["Stop-Codons in Code?"]] == FALSE)]
round(sum(usage_flag_false) / length(usage_flag_false), 2)

usage_flag_true <- df[["Achieved Code Usage (in %)"]][which(df[["Stop-Codons in Code?"]] == TRUE)]
round(sum(usage_flag_true) / length(usage_flag_true), 2)
```

Durchschnittlicher Edit-Score
```{r}
# Mean edit score, over all rows and per frame.
edit_score <- df$`Edit-Score`
round(mean(edit_score), 2) # all frames

round(mean(edit_score[which(df$Frame == 0)]), 2) # frame 0
round(mean(edit_score[which(df$Frame == 1)]), 2) # frame 1
round(mean(edit_score[which(df$Frame == 2)]), 2) # frame 2
```

Durchschnittliche Edit-Distance
```{r}
# Mean edit distance, over all rows and per frame.
edit_distance <- df$`Edit-Distance`
round(mean(edit_distance), 2) # all frames

round(mean(edit_distance[which(df$Frame == 0)]), 2) # frame 0
round(mean(edit_distance[which(df$Frame == 1)]), 2) # frame 1
round(mean(edit_distance[which(df$Frame == 2)]), 2) # frame 2
```

Durchschnittliche Motif-Length
```{r}
# Mean increase in mean motif length (in %), over all rows and per frame.
mml_increase <- df$`Increase in Mean Motif Length (in %)`
round(mean(mml_increase), 2) # all frames

round(mean(mml_increase[which(df$Frame == 0)]), 2) # frame 0
round(mean(mml_increase[which(df$Frame == 1)]), 2) # frame 1
round(mean(mml_increase[which(df$Frame == 2)]), 2) # frame 2
```

Durchschnittliches längstes Motif
```{r}
# Mean of the maximum motif length (rounded to whole numbers), over all rows
# and per frame.
max_motif <- df$`Max. Motif Length`
round(mean(max_motif), 0) # all frames

round(mean(max_motif[which(df$Frame == 0)]), 0) # frame 0
round(mean(max_motif[which(df$Frame == 1)]), 0) # frame 1
round(mean(max_motif[which(df$Frame == 2)]), 0) # frame 2
```