renozao · ccpalu · Oct 19, 2018
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: NMF
 Type: Package
 Title: Algorithms and framework for Nonnegative Matrix Factorization (NMF)
-Version: 0.23.6
-Date: 2017-06-29
+Version: 0.24.0
+Date: 2018-10-19
 Author: Renaud Gaujoux, Cathal Seoighe
 Maintainer: Renaud Gaujoux <[email protected]>
 Description: This package provides a framework to perform Non-negative Matrix
@@ -110,5 +110,6 @@ Collate:
     'setNMFClass.R'
     'simulation.R'
     'tests.R'
+    'exploration.R'
 RoxygenNote: 6.0.1.9000
 Roxygen: list(markdown = TRUE, namespace_unsorted = TRUE, roclets = c('collate', 'namespace', 'roclets::rd2_roclet'))
diff --git a/NAMESPACE b/NAMESPACE
@@ -237,3 +237,8 @@ import(reshape2)
 export(nmfReport)
 export(syntheticNMF)
 export(nmfCheck)
+export(exploreNmf)
+export(exportAnalysis)
+export(getFeatures)
+export(nmfExplore)
+export(runNmf)
diff --git a/R/exploration.R b/R/exploration.R
@@ -0,0 +1,202 @@
+# Functions to easily test and compare ranks and algorithms.
+# Functions to extract information from the final NMF analysis.
+# 
+# Author: Cintia C Palu
+# Creation: 25 May 2018
+# Integrated to NMF: 19 Oct 2018
+###############################################################################
+
+#' Survey NMF algorithms and ranks on a dataset
+#'
+#' Runs `nmf()` on `data` and on a randomized copy of it for every rank in
+#' `r`, then saves both results and their diagnostic PNGs in a new folder.
+#' @param data table to be analysed
+#' @param ann information on the data to be displayed on the heatmaps
+#' @param r range of ranks to be tested
+#' @param prefix a string used to name the output folder and files
+#' @param seed a number to initialise the algorithm (used for both runs)
+#' @param .opt run options passed on to `nmf()`
+#' @return the `nmf()` result obtained on the original `data`
+#' @export
+#' @rdname nmfExplore
+#' @aliases nmfExplore
+nmfExplore <- function(data, ann, r, prefix, seed = 123456, .opt = "vP") {
+  # Folder = sanitised prefix, plus 1, 2, ... until the name is unused.
+  base <- gsub("[^[:alnum:] ]", ".", prefix)
+  folder <- base
+  i <- 1
+  while (file.exists(folder)) {
+    folder <- paste0(base, i)
+    i <- i + 1
+  }
+  dir.create(folder)
+  postfix <- paste(r[1], r[length(r)], sep = "to")
+  # Open "<folder>/<prefix>_<algorithm>_<postfix><suffix>" as a PNG device.
+  open_png <- function(alg, suffix) {
+    png(paste0(folder, "/", prefix, "_", gsub("\\/", ".", alg), "_", postfix,
+               suffix), width = 1600, height = 1200, units = "px")
+  }
+  nrange <- nmf(data, r, method = nmfAlgorithm(), nrun = 50, .opt = .opt,
+                seed = seed)
+  save(nrange, file = paste0(folder, "/", prefix, "_nrange.rda"))
+  for (alg in names(nrange)) {
+    open_png(alg, "_consensusmap.png")
+    # finally: the device is closed even when plotting fails
+    tryCatch({
+      par(mfrow = c(4, 3))
+      consensusmap(nrange[[alg]], annCol = ann)
+    }, finally = dev.off())
+  }
+  # Quality measures from shuffled data, only for the algorithms that did
+  # not fail in nrange; reuses `seed` instead of a hard-coded 123456.
+  nrand <- nmf(randomize(data), r, method = names(nrange), nrun = 50,
+               .opt = .opt, seed = seed)
+  save(nrand, file = paste0(folder, "/", prefix, "_nrand.rda"))
+  # NOTE(review): global assignment kept only for backward compatibility.
+  assign(paste0(prefix, "_nrand"), nrand, .GlobalEnv)
+  for (alg in names(nrange)) {
+    open_png(alg, "_ranksurvey.png")
+    tryCatch(print(plot(nrange[[alg]], nrand[[alg]])), finally = dev.off())
+  }
+  nrange
+}
+
+#' Assign each column of a matrix to the row holding its largest value
+#' @param meta numeric matrix with one column per row of `data`
+#' @param fileID file name prefix; "metagene.tsv" is appended to it
+#' @param data table whose rows are in the same order as `meta`'s columns
+#' @return for each column of `meta`, the row index of its largest value
+#' @export
+#' @rdname exportAnalysis
+#' @aliases exportAnalysis
+exportAnalysis <- function(meta, fileID, data) {
+  # which.max() keeps the first maximum: one index per column, even on ties
+  g <- apply(meta, 2, which.max)
+  # sort rows by position so missing/duplicated column names lose no row
+  data <- cbind(g, data)[order(g), , drop = FALSE]
+  write.table(data, file = paste0(fileID, "metagene.tsv"), row.names = TRUE,
+              col.names = NA, sep = "\t")
+  g
+}
+
+#' Print and return the top contributing features of each metagene
+#' @param nmf.result object handed to `extractFeatures()`
+#' @param method selection method handed to `extractFeatures()`
+#' @param original.data table whose row names label the printed features
+#' @return the value of `extractFeatures(nmf.result, method)`
+#' @export
+#' @rdname getFeatures
+#' @aliases getFeatures
+getFeatures <- function(nmf.result, method = 'max', original.data) {
+  groups <- extractFeatures(nmf.result, method)
+  cat(paste('\n\nExtract Groups -', toupper(method), '\n'))
+  if (all(is.na(groups))) {
+    cat('No metagene has a top most contributing feature associated\n')
+    return(groups)
+  }
+  for (idx in seq_along(groups)) {
+    cat(paste('\nGroup', idx, '\n'))
+    if (is.na(groups[idx])) {
+      cat('No top most contributing feature associated\n')
+    } else {
+      # translate the selected row indices into row names
+      print(rownames(original.data)[groups[[idx]]])
+    }
+  }
+  groups
+}
+
+#' Export the metagene information of a single NMF fit
+#'
+#' Writes the fit summary, the fitted matrix, the basis (W) and coefficient
+#' (H) matrices, the metagene assignments and an `.rda` bundle into the
+#' folder `prefix`, which must already exist.
+#'
+#' @param original.data the original data matrix
+#' @param nmf.result nmf object from which we will extract information
+#' @param prefix the name of the folder to save the results
+#' @param postfix a string informing the algorithm and rank used, it will
+#'   also be used in the file names
+#' @return `NULL`, invisibly; called for its side effects
+#' @export
+#' @rdname exploreNmf
+#' @aliases exploreNmf
+exploreNmf <- function(original.data, nmf.result, prefix, postfix) {
+  # Every output file is named "<prefix>.<postfix>.<suffix>".
+  out_file <- function(suffix) paste(prefix, postfix, suffix, sep = ".")
+
+  # Work inside the results folder, and restore the caller's working
+  # directory on exit even if one of the steps below fails.
+  old_wd <- setwd(paste0("./", prefix))
+  on.exit(setwd(old_wd), add = TRUE)
+
+  write.table(summary(nmf.result), file = out_file("rankSum.tsv"),
+              sep = "\t", row.names = TRUE, col.names = NA, dec = ".")
+
+  cat("\nStarting analysis\n")
+  w <- basis(nmf.result)
+  h <- coef(nmf.result)
+  V.hat <- fitted(nmf.result)
+  write.table(V.hat, file = out_file("V.HAT.tsv"), row.names = TRUE,
+              col.names = NA, sep = "\t")
+
+  if (is.null(rownames(original.data))) {
+    cat("\nNaming rows in the original dataset\n")
+    rownames(original.data) <- paste0("R", seq_len(nrow(original.data)))
+  }
+
+  groups_kim <- getFeatures(nmf.result, "kim", original.data)
+  groups_max <- getFeatures(nmf.result, "max", original.data)
+
+  # Assign each column (from H) and each row (from W) of the original data
+  # to a metagene. exportAnalysis() appends "metagene.tsv" to fileID, so the
+  # trailing "." makes the files end in ".H.metagene.tsv" and
+  # ".W.metagene.tsv".
+  cat("\nExporting metagenes information\n")
+  groups_h <- exportAnalysis(h, fileID = out_file("H."), t(original.data))
+  groups_w <- exportAnalysis(t(w), fileID = out_file("W."), original.data)
+  write.table(w, file = out_file("W.tsv"), row.names = TRUE, col.names = NA,
+              sep = "\t")
+  write.table(h, file = out_file("H.tsv"), row.names = TRUE, col.names = NA,
+              sep = "\t")
+
+  cat("\nSaving variables\n")
+  save(list = c("w", "h", "V.hat", "groups_w", "groups_h", "groups_max",
+                "groups_kim"), file = out_file("exploreNMF.rda"))
+  cat("\nExploratory Analysis finished. ")
+  cat("Environment objects saved in the file:\n\"")
+  cat(out_file("exploreNMF.rda\"\n"))
+  invisible(NULL)
+}
+
+#' Run NMF with one rank and one algorithm, then export the results
+#'
+#' @param original.data data passed to `nmf()` and to `exploreNmf()`
+#' @param ann currently unused
+#' @param r single factorization rank
+#' @param prefix output folder; chars other than alphanumerics/space become "."
+#' @param .opt run options passed to `nmf()`
+#' @param alg single algorithm, passed to `nmf()` as `method`
+#' @param maxIter passed to `nmf()`
+#' @param nrun passed to `nmf()`
+#' @return the object returned by `nmf()`
+#' @export
+#' @rdname runNmf
+#' @aliases runNmf
+runNmf <- function(original.data, ann, r, prefix, .opt = "vP", alg,
+                   maxIter = 30000, nrun = 1000) {
+  # Check inputs and create the folder before the long fit, not after it:
+  # a vector `r` or `alg` would otherwise only fail at save(), after the run.
+  stopifnot(length(r) == 1, length(alg) == 1)
+  folder <- gsub("[^[:alnum:] ]", ".", prefix)
+  stopifnot(file.exists(folder) || dir.create(folder))
+  postfix <- paste0(alg, ".r", r)
+  nmf.result <- nmf(original.data, r, method = alg, .opt = .opt,
+                    maxIter = maxIter, nrun = nrun)
+  save(nmf.result, file = paste0(folder, "/", folder, ".", postfix, ".rda"))
+  exploreNmf(original.data = original.data, nmf.result = nmf.result,
+             prefix = folder, postfix = postfix)
+  nmf.result
+}
+
+
diff --git a/man/exploreNmf.Rd b/man/exploreNmf.Rd
@@ -0,0 +1,27 @@
+\name{exploreNmf}
+\alias{exploreNmf}
+\title{Function to extract information on the metagenes obtained after running \code{\link{nmf}}}
+\description{It identifies the main features that contribute to defining each metagene,
+as well as identifying to which metagene each sample belongs, based on the frequency with which it was assigned to it. This function is mainly called by the function \code{\link{runNmf}}.}
+\usage{
+exploreNmf(original.data, nmf.result, prefix, postfix)
+}
+\arguments{
+  \item{original.data}{Matrix with the original data submitted to \code{\link{nmf}}.}
+  \item{nmf.result}{Object returned by \code{\link{nmf}} function, run with a single rank and a single algorithm.}
+  \item{prefix}{A string used to name the generated files and the folder in which they are saved.}
+  \item{postfix}{A string that informs the algorithm and rank used, to be included as part of the file names.}
+}
+\value{
+It generates files to be saved in the folder named after the \code{prefix}.
+\item{*.rankSum.tsv}{Tab-separated table with the \code{summary} information of \code{nmf.result}.}
+\item{*.V.HAT.tsv}{Tab-separated table with the \code{fitted} results of \code{nmf.result}.}
+\item{*.W.tsv}{Tab-separated table with the \code{basis} results for \code{nmf.result}.}
+\item{*.H.tsv}{Tab-separated table with the \code{coef} results for \code{nmf.result}.}
+\item{*.metagene.tsv}{Tab-separated table informing to which metagene a row or a column of the original data matrix was assigned based on the \code{\link{nmf}} analysis. It is generated by the \code{exportAnalysis} function.}
+\item{*.exploreNMF.rda}{File with R objects that contain relevant information regarding the metagenes.
+They are exported to enable the user to review the information in the future.}
+}
+\author{
+Cintia C Palu
+}
diff --git a/man/exportAnalysis.Rd b/man/exportAnalysis.Rd
@@ -0,0 +1,20 @@
+\name{exportAnalysis}
+\alias{exportAnalysis}
+\title{Function to find to which metagene a row or column of a matrix was more frequently assigned according to the \code{\link{nmf}} results}
+\description{
+It searches for the highest value in each column of a matrix; in the context of \code{\link{nmf}} the highest value indicates the metagene more frequently assigned to the variable represented by that column during the multiple \code{\link{nmf}} runs. The metagene information is added to the original data matrix and saved as a .tsv file. It is designed for users interested in using \code{\link{nmf}} to find clusters in their data. \code{exportAnalysis} is automatically called by the \code{\link{exploreNmf}} function.}
+\usage{
+exportAnalysis(meta, fileID, data)
+}
+\arguments{
+  \item{meta}{Data matrix generated by the \code{coef} or \code{basis} functions.}
+  \item{fileID}{A string providing the file name prefix to be used to save the results.}
+  \item{data}{Original matrix submitted to \code{\link{nmf}} analysis.}
+}
+\value{
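+It returns, for each column of \code{meta}, the index of the row of \code{meta} holding its highest value. As a side effect, it writes the following file, using \code{fileID} as the file name prefix: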
+\item{*.metagene.tsv}{Tab-separated table informing to which metagene a row or a column of the original data matrix was assigned based on the \code{\link{nmf}} analysis.}
+}
+\author{
+Cintia C Palu
+}
diff --git a/man/getFeatures.Rd b/man/getFeatures.Rd
@@ -0,0 +1,19 @@
+\name{getFeatures}
+\alias{getFeatures}
+\title{It retrieves the top most contributing feature associated with a metagene estimated by \code{\link{nmf}}}
+\description{It applies the \code{\link{extractFeatures}} function with the method of choice (default is 'max')
+to identify the top most contributing features of the metagenes.}
+\usage{
+getFeatures(nmf.result, method = "max", original.data)
+}
+
+\arguments{
+  \item{nmf.result}{Object returned by \code{\link{nmf}} function, run with a single rank and a single algorithm.}
+  \item{method}{Scoring or selection method ("kim" or "max").}
+  \item{original.data}{The original data matrix submitted to the \code{\link{nmf}} analysis.}
+}
+\value{list with the selected features.}
+
+\author{
+Cintia C Palu
+}