DirectReadFasta.txt

An alternative way of reading a FASTA file directly into R, using the "Biostrings" library.

#Reading a fasta file into R
library(Biostrings)                      #using this package

# Holding the sequences plus the one-hot matrix built further down in this
# script may not be as memory efficient as using a text table, so the memory
# ceiling is raised before reading the file to make it work.
#
# memory.limit() is Windows-specific and has been a no-op stub (that only
# emits a warning) since R 4.2.0, so it is called only where it can still
# take effect. `size` is given in MB.
if (.Platform$OS.type == "windows" && getRversion() < "4.2.0") {
  memory.limit(size = 2^19)
}

# Run a garbage collection and reset the "max used" statistics so that later
# gc() reports reflect only the work done from this point on.
gc(reset = TRUE)

# Read the FASTA file into a DNAStringSet (one element per sample).
fasta_file <- "samples.fas"
fasta <- readDNAStringSet(fasta_file, format = "fasta")

n_sample <- length(fasta)
if (n_sample == 0L) {
  stop("no sequences found in '", fasta_file, "'", call. = FALSE)
}

# Use the public accessors (width(), names()) instead of reaching into the
# object's S4 slots, whose layout is an implementation detail of the package.
seq_widths <- width(fasta)
n_seq <- seq_widths[1]

# The matrix layout below takes its number of positions from the first
# sequence. Sequences of a different length are still accepted (shorter ones
# simply leave their trailing positions at 0, longer ones are cut off), but
# that is almost always a sign of unaligned input, so say so.
if (any(seq_widths != n_seq)) {
  warning(
    "sequences in '", fasta_file, "' differ in length; using the width of ",
    "the first sequence (", n_seq, ") for all samples",
    call. = FALSE
  )
}

# One row per sample and five consecutive groups of n_seq columns:
# A_1..A_n, T_1..T_n, G_1..G_n, C_1..C_n, s_1..s_n ("s" is the gap group).
# Cells start at 0 and are switched to 1 by the encoding step.
base_labels <- c("A", "T", "G", "C", "s")
bool <- matrix(
  0,
  nrow = n_sample,
  ncol = 5 * n_seq,
  dimnames = list(
    names(fasta),
    # sprintf() returns character(0) for zero-length input, unlike paste().
    sprintf(
      "%s_%d",
      rep(base_labels, each = n_seq),
      rep(seq_len(n_seq), times = length(base_labels))
    )
  )
)

  ## storage
  ## One-hot encode each sample into its row of `bool`.
  ## `bool` holds five consecutive groups of n_seq columns, in this order:
  ## a, t, g, c, "-" (gap). For position p, a base belonging to group k
  ## (1-based) is flagged in column (k - 1) * n_seq + p. Any other symbol
  ## (e.g. an ambiguity code) leaves all five cells for that position at 0.
  base_order <- c("a", "t", "g", "c", "-")

  for (s in seq_len(n_sample)) {
    # Convert the DNAString to a plain lower-case string explicitly.
    se <- tolower(as.character(fasta[[s]]))

    # Split into single characters once per sample instead of calling
    # substr() once per position.
    bases <- strsplit(se, "", fixed = TRUE)[[1]]
    # Force exactly n_seq positions: longer sequences are truncated, shorter
    # ones are padded with NA, which matches no group below.
    length(bases) <- n_seq

    group <- match(bases, base_order)
    hit <- which(!is.na(group))
    bool[s, (group[hit] - 1L) * n_seq + hit] <- 1
  }