-
Notifications
You must be signed in to change notification settings - Fork 0
/
create_matrix.R
28 lines (18 loc) · 1.36 KB
/
create_matrix.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
#-------------------------------------------------------------------------------------------------------------
#Creating a Document Term Matrix to analyze data
#' Build a document-term matrix from one or more text columns
#'
#' The columns of `textColumns` are pasted together row by row (separated by a
#' single space), converted to UTF-8, wrapped in a corpus and passed to
#' `DocumentTermMatrix()` along with a control list assembled from the
#' remaining arguments.
#'
#' @param textColumns Text to index. It is coerced with `as.matrix()`; every
#'   row becomes one document.
#' @param language Language name; forwarded to the corpus reader control, to
#'   the `DocumentTermMatrix()` control list and to `wordStem()`.
#' @param minDocFreq Forwarded as control option `minDocFreq`.
#' @param minWordLength Forwarded as control option `minWordLength`.
#' @param removeNumbers Forwarded as control option `removeNumbers`.
#' @param removePunctuation Forwarded as control option `removePunctuation`.
#' @param removeSparseTerms If greater than 0, the matrix is post-processed
#'   with `removeSparseTerms()` using this value as its second argument.
#' @param removeStopwords Forwarded as control option `stopwords`.
#' @param stemWords If `TRUE`, a `wordStem()`-based stemmer is added to the
#'   control list as option `stemming`.
#' @param stripWhitespace Forwarded as control option `stripWhitespace`.
#' @param toLower Forwarded as control option `tolower`.
#' @return The object returned by `DocumentTermMatrix()`, reduced by
#'   `removeSparseTerms()` when `removeSparseTerms > 0`.
create_matrix <- function(textColumns, language = "english", minDocFreq = 1,
                          minWordLength = 3, removeNumbers = TRUE,
                          removePunctuation = TRUE, removeSparseTerms = 0,
                          removeStopwords = TRUE, stemWords = FALSE,
                          stripWhitespace = TRUE, toLower = TRUE) {
  # Stem every word found in every element of `x`. Taking only the first
  # element of the strsplit() result would silently discard all remaining
  # strings whenever the stemmer is handed a vector, and would fail outright
  # on an empty vector.
  stem_words <- function(x) {
    words <- as.character(unlist(strsplit(x, " "), use.names = FALSE))
    wordStem(words, language = language)
  }

  control <- list(
    language = language,
    tolower = toLower,
    removeNumbers = removeNumbers,
    removePunctuation = removePunctuation,
    stripWhitespace = stripWhitespace,
    minWordLength = minWordLength,
    stopwords = removeStopwords,
    minDocFreq = minDocFreq
  )
  if (stemWords == TRUE) {
    # Slot the stemmer in right after `minWordLength` (position 6), i.e.
    # before `stopwords` and `minDocFreq`.
    control <- append(control, list(stemming = stem_words), after = 6)
  }

  # One document per row: glue all text columns of a row together.
  documents <- apply(as.matrix(textColumns), 1, paste, collapse = " ")
  # iconv() is vectorised, so no per-element sapply() is needed. Calling it
  # directly also avoids attaching every document's full text as an element
  # name and always yields a character vector, even for empty input.
  # Bytes that cannot be converted are kept in <xx> hex form (sub = "byte").
  documents <- iconv(as.vector(documents, mode = "character"),
                     to = "UTF-8", sub = "byte")

  corpus <- Corpus(VectorSource(documents),
                   readerControl = list(language = language))
  dtm <- DocumentTermMatrix(corpus, control = control)

  if (removeSparseTerms > 0) {
    # The numeric argument shares its name with the function being called;
    # R skips non-function bindings when resolving a call, so the function
    # from the enclosing scope is still the one invoked.
    dtm <- removeSparseTerms(dtm, removeSparseTerms)
  }

  # Explicit collection kept from the original implementation.
  gc()
  dtm
}
#-------------------------------------------------------------------------------------------------------------
# R-Complaint-Classifier