---
title: "Compare_Daddit_Mommit"
output: html_notebook
---

This notebook compares term frequencies in two corpora, `Daddit.txt` (the target) and `Mommit.txt` (the comparison). Both are preprocessed with the tm package, and the log-likelihood ratio statistic is then used to identify terms that are significantly over- or underused in one corpus relative to the other.
Load the required packages and the English stopword list:

```{r}
options(stringsAsFactors = FALSE)
library(tm)         # corpus handling and preprocessing
library(SnowballC)  # Porter stemmer used by stemDocument()
library(slam)       # col_sums() for sparse document-term matrices
library(wordcloud)  # visualization
english_stopwords <- stopwords("en")
```
lines <- readLines("Daddit.txt", encoding = "UTF-8")
targetCorpus <- Corpus(VectorSource(lines))
targetCorpus <- tm_map(targetCorpus, removePunctuation, preserve_intra_word_dashes = TRUE)
targetCorpus <- tm_map(targetCorpus, removeNumbers)
targetCorpus <- tm_map(targetCorpus, content_transformer(tolower))
targetCorpus <- tm_map(targetCorpus, removeWords, english_stopwords)
targetCorpus <- tm_map(targetCorpus, stemDocument, language = "en")
targetCorpus <- tm_map(targetCorpus, stripWhitespace)
targetCorpus <- tm_map(targetCorpus, content_transformer(removePunctuation))
targetDTM <- DocumentTermMatrix(targetCorpus)
termCountsTarget <- col_sums(targetDTM)
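Not part of the original script, but a quick way to sanity-check the preprocessing is to look at the most frequent stems:

```{r}
# Sketch: inspect the ten most frequent stems in the target corpus
head(sort(termCountsTarget, decreasing = TRUE), 10)
```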
lines <- readLines("Mommit.txt", encoding = "UTF-8")
comparisonCorpus <- Corpus(VectorSource(lines))
comparisonCorpus <- tm_map(comparisonCorpus, removePunctuation, preserve_intra_word_dashes = TRUE)
comparisonCorpus <- tm_map(comparisonCorpus, removeNumbers)
comparisonCorpus <- tm_map(comparisonCorpus, content_transformer(tolower))
comparisonCorpus <- tm_map(comparisonCorpus, removeWords, english_stopwords)
comparisonCorpus <- tm_map(comparisonCorpus, stemDocument, language = "en")
comparisonCorpus <- tm_map(comparisonCorpus, stripWhitespace)
comparisonCorpus <- tm_map(comparisonCorpus, content_transformer(removePunctuation))
comparisonDTM <- DocumentTermMatrix(comparisonCorpus)
termCountsComparison <- col_sums(comparisonDTM)
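Since both corpora get identical preprocessing, the pipeline could be factored into a small helper. A minimal sketch (the function name `preprocessCorpus` is not part of the original script):

```{r}
preprocessCorpus <- function(lines) {
  corpus <- Corpus(VectorSource(lines))
  corpus <- tm_map(corpus, removePunctuation, preserve_intra_word_dashes = TRUE)
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removeWords, english_stopwords)
  corpus <- tm_map(corpus, stemDocument, language = "en")
  corpus <- tm_map(corpus, stripWhitespace)
  tm_map(corpus, content_transformer(removePunctuation))
}
# e.g.:
# targetCorpus <- preprocessCorpus(readLines("Daddit.txt", encoding = "UTF-8"))
```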
Compute the log-likelihood statistic for a single term. This assumes the term occurs in both corpora; a missing term would yield `NA` counts.

```{r}
# Log-likelihood for a single term
term <- "care"
# Observed counts
a <- termCountsTarget[term]       # term frequency in the target corpus
b <- termCountsComparison[term]   # term frequency in the comparison corpus
c <- sum(termCountsTarget)        # total token count of the target corpus
d <- sum(termCountsComparison)    # total token count of the comparison corpus
# (c shadows base::c here; calls like c(...) still resolve to the function)
# Expected frequencies if both corpora used the term at the same rate
Expected1 <- c * (a + b) / (c + d)
Expected2 <- d * (a + b) / (c + d)
t1 <- a * log(a / Expected1)
t2 <- b * log(b / Expected2)
logLikelihood <- 2 * (t1 + t2)
print(logLikelihood)
```
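In formula form, with $a$, $b$ the observed term frequencies and $c$, $d$ the corpus sizes, the chunk above computes the expected frequencies

$$
E_1 = \frac{c\,(a+b)}{c+d}, \qquad E_2 = \frac{d\,(a+b)}{c+d}
$$

and the log-likelihood ratio statistic (Dunning's $G^2$)

$$
G^2 = 2\left(a \ln\frac{a}{E_1} + b \ln\frac{b}{E_2}\right).
$$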
Terms that occur only in the target corpus have no entry in the comparison counts, so give them explicit zero counts before comparing:

```{r}
# Use a set operation to get terms occurring only in the target corpus
uniqueTerms <- setdiff(names(termCountsTarget), names(termCountsComparison))
# Have a look at a random selection of terms unique to the target corpus
# (call set.seed() first if the sample should be reproducible)
sample(uniqueTerms, 20)
# Create a vector of zeros to append to the comparison counts
zeroCounts <- rep(0, length(uniqueTerms))
names(zeroCounts) <- uniqueTerms
termCountsComparison <- c(termCountsComparison, zeroCounts)
```
Now calculate the same statistic for every term at once. After the zero-padding above, the intersection of the two vocabularies covers the full target vocabulary.

```{r}
# Terms to compare: intersection of target and comparison vocabulary
termsToCompare <- intersect(names(termCountsTarget), names(termCountsComparison))
# Calculate statistics (same as above, but now with vectors!)
a <- termCountsTarget[termsToCompare]
b <- termCountsComparison[termsToCompare]
c <- sum(termCountsTarget)
d <- sum(termCountsComparison)
Expected1 <- c * (a + b) / (c + d)
Expected2 <- d * (a + b) / (c + d)
# (x == 0) adds 1 inside the log for zero counts, turning
# 0 * log(0) = NaN into the intended 0 * log(1) = 0
t1 <- a * log((a / Expected1) + (a == 0))
t2 <- b * log((b / Expected2) + (b == 0))
logLikelihood <- 2 * (t1 + t2)
```
Relative frequencies indicate whether a term is over- or underused in the target corpus; underused terms get a negative sign so both directions can be read off a single ranking:

```{r}
# Compare relative frequencies to indicate over-/underuse
relA <- a / c
relB <- b / d
# Underused terms are multiplied by -1
logLikelihood[relA < relB] <- logLikelihood[relA < relB] * -1
# Terms most characteristic of the target corpus ...
sort(logLikelihood, decreasing = TRUE)[1:100]
# ... and of the comparison corpus
sort(logLikelihood, decreasing = FALSE)[1:100]
```
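A possible extension, not in the original script: under the null hypothesis, $G^2$ is approximately $\chi^2$-distributed with one degree of freedom, so its magnitude can be read against the usual critical values (3.84 for $p < 0.05$, 10.83 for $p < 0.001$):

```{r}
# Sketch: count terms whose log-likelihood exceeds the chi-squared
# critical values for one degree of freedom
sum(abs(logLikelihood) > 3.84)   # significant at p < 0.05
sum(abs(logLikelihood) > 10.83)  # significant at p < 0.001
```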
Finally, write the full rankings to tab-separated files:

```{r}
sorted_loglikelihood_increase <- sort(logLikelihood, decreasing = TRUE)
sorted_loglikelihood_decrease <- sort(logLikelihood, decreasing = FALSE)
write.table(sorted_loglikelihood_increase, "loglikelihood_All_Throw_Conv.csv", row.names = TRUE, col.names = TRUE, sep = "\t")
write.table(sorted_loglikelihood_decrease, "loglikelihood_All_Pseud_Conv.csv", row.names = TRUE, col.names = TRUE, sep = "\t")
```
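The `wordcloud` package is loaded at the top but never used. One plausible use, sketched here, is to visualize the terms most characteristic of the target corpus, sized by their log-likelihood scores:

```{r}
# Sketch: word cloud of the 50 terms most overused in the target corpus
topTerms <- head(sorted_loglikelihood_increase, 50)
wordcloud(words = names(topTerms), freq = topTerms,
          min.freq = 1, scale = c(3, 0.5), random.order = FALSE)
```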