-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathindex.html
330 lines (304 loc) · 18.9 KB
/
index.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="generator" content="pandoc">
<meta name="author" content="" />
<meta name="dcterms.date" content="2016-10-24" />
<title> Creating LDAvis - A Tutorial</title>
<meta name="apple-mobile-web-app-capable" content="yes">
<meta name="apple-mobile-web-app-status-bar-style" content="black-translucent">
<meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no, minimal-ui">
<link rel="stylesheet" href="index_files/reveal.js-3.3.0/css/reveal.css"/>
<style type="text/css">
/*
 * Styles for highlighted code listings (elements carrying the "sourceCode"
 * class). The first rules lay out the listing container and the optional
 * line-number table; the "code > span.xx" rules colour individual tokens,
 * where each two-letter class is the token kind named in the trailing
 * comment on the same line.
 * NOTE(review): presumably emitted by pandoc (see the generator meta tag) --
 * confirm before hand-editing, as a regenerated file would overwrite changes.
 */
div.sourceCode { overflow-x: auto; }
table.sourceCode, tr.sourceCode, td.lineNumbers, td.sourceCode {
margin: 0; padding: 0; vertical-align: baseline; border: none; }
table.sourceCode { width: 100%; line-height: 100%; }
td.lineNumbers { text-align: right; padding-right: 4px; padding-left: 4px; color: #aaaaaa; border-right: 1px solid #aaaaaa; }
td.sourceCode { padding-left: 5px; }
code > span.kw { color: #007020; font-weight: bold; } /* Keyword */
code > span.dt { color: #902000; } /* DataType */
code > span.dv { color: #40a070; } /* DecVal */
code > span.bn { color: #40a070; } /* BaseN */
code > span.fl { color: #40a070; } /* Float */
code > span.ch { color: #4070a0; } /* Char */
code > span.st { color: #4070a0; } /* String */
code > span.co { color: #60a0b0; font-style: italic; } /* Comment */
code > span.ot { color: #007020; } /* Other */
code > span.al { color: #ff0000; font-weight: bold; } /* Alert */
code > span.fu { color: #06287e; } /* Function */
code > span.er { color: #ff0000; font-weight: bold; } /* Error */
code > span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warning */
code > span.cn { color: #880000; } /* Constant */
code > span.sc { color: #4070a0; } /* SpecialChar */
code > span.vs { color: #4070a0; } /* VerbatimString */
code > span.ss { color: #bb6688; } /* SpecialString */
code > span.im { } /* Import */
code > span.va { color: #19177c; } /* Variable */
code > span.cf { color: #007020; font-weight: bold; } /* ControlFlow */
code > span.op { color: #666666; } /* Operator */
code > span.bu { } /* BuiltIn */
code > span.ex { } /* Extension */
code > span.pp { color: #bc7a00; } /* Preprocessor */
code > span.at { color: #7d9029; } /* Attribute */
code > span.do { color: #ba2121; font-style: italic; } /* Documentation */
code > span.an { color: #60a0b0; font-weight: bold; font-style: italic; } /* Annotation */
code > span.cv { color: #60a0b0; font-weight: bold; font-style: italic; } /* CommentVar */
code > span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Information */
</style>
<link rel="stylesheet" href="index_files/reveal.js-3.3.0/css/theme/black.css" id="theme">
<!-- Local overrides layered on top of the reveal.js theme stylesheet -->
<style type="text/css">
  /* Give slide images a light, mostly opaque backing. */
  .reveal section img {
    background: rgba(255, 255, 255, 0.85);
  }

  /* Heading scale. */
  .reveal h1 { font-size: 2.0em; }
  .reveal h2 { font-size: 1.5em; }
  .reveal h3 { font-size: 1.25em; }
  .reveal h4 { font-size: 1em; }

  /* No padding on horizontal or nested (vertical) slides. */
  .reveal .slides>section,
  .reveal .slides>section>section {
    padding: 0px 0px;
  }

  /* Tables: thin dotted gray borders, slightly smaller text. */
  .reveal table {
    border-width: 1px;
    border-spacing: 2px;
    border-style: dotted;
    border-color: gray;
    border-collapse: collapse;
    font-size: 0.7em;
  }

  /* Header and data cells share the same border and horizontal padding. */
  .reveal table th,
  .reveal table td {
    border-width: 1px;
    padding-left: 10px;
    padding-right: 25px;
    border-style: dotted;
    border-color: gray;
  }

  /* Only header cells are bold. */
  .reveal table th {
    font-weight: bold;
  }

  /* Keep whitespace inside code elements exactly as written. */
  code{white-space: pre;}
</style>
<!-- Print stylesheet: PDF-export layout when the URL query contains "print-pdf", paper layout otherwise -->
<script>
  var link = document.createElement( 'link' );
  link.rel = 'stylesheet';
  link.type = 'text/css';

  // Choose the stylesheet from the query string (case-insensitive match).
  if ( window.location.search.match( /print-pdf/gi ) ) {
    link.href = 'index_files/reveal.js-3.3.0/css/print/pdf.css';
  } else {
    link.href = 'index_files/reveal.js-3.3.0/css/print/paper.css';
  }

  document.getElementsByTagName( 'head' )[0].appendChild( link );
</script>
<!--[if lt IE 9]>
<script src="index_files/reveal.js-3.3.0/lib/js/html5shiv.js"></script>
<![endif]-->
<link href="index_files/font-awesome-4.5.0/css/font-awesome.min.css" rel="stylesheet" />
</head>
<body>
<div class="reveal">
<div class="slides">
<section data-background-image="imgs/bg-small.png">
<h1 class="title"><br><small> Creating <a href="https://cran.r-project.org/web/packages/LDAvis/index.html">LDAvis</a> - A Tutorial </small></h1>
<h1 class="subtitle"><small> <br><a href="http://r-addict.com/About.html">Marcin Kosiński</a> </small></h1>
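<h2 class="author"><small><a href='https://r-addict.com' aria-label='r-addict.com blog'><i class='fa fa-comment' aria-hidden='true'></i></a> <a href='https://stackoverflow.com/users/3857701' aria-label='Stack Overflow profile'><i class='fa fa-stack-overflow' aria-hidden='true'></i></a> <a href='https://github.com/MarcinKosinski' aria-label='GitHub profile'><i class='fa fa-github' aria-hidden='true'></i></a> <a href='mailto:[email protected]' aria-label='Email'><i class='fa fa-envelope-o' aria-hidden='true'></i></a></small><br></h2>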
<h3 class="date">October 24, 2016</h3>
</section>
<section><section id="motivation" class="titleslide slide level1"><h1>Motivation</h1></section><section id="motivation-1" class="slide level2">
<h1>Motivation</h1>
<small>
<ul>
<li>Text mining is a new challenge for machine learning practitioners.</li>
<li>The increased interest in text mining is driven by the growing number of internet users and by the rapid growth of internet data, <em>80% of which is said to be text data</em>.</li>
<li>Extracting information from articles, news, posts and comments has become a desirable skill, but what is needed even more are tools for diagnosing and visualizing text mining models.</li>
<li>Such visualizations make it easier to understand the insights from a model and provide an easy interface for presenting your research results to a wider audience.</li>
</ul>
<p></small></p>
</section></section>
<section><section id="lda-overview" class="titleslide slide level1"><h1>LDA overview</h1></section><section id="lda-overview-1" class="slide level2">
<h1>LDA overview</h1>
<p>From <a href="https://en.wikipedia.org/wiki/Latent_Dirichlet_allocation">Wikipedia</a></p>
<small>
<blockquote>
<p>In natural language processing, latent Dirichlet allocation (LDA) is a generative statistical model that allows sets of observations to be explained by unobserved groups that explain why some parts of the data are similar. For example, if observations are words collected into documents, it posits that each document is a mixture of a small number of topics and that each word’s creation is attributable to one of the document’s topics.</p>
</blockquote>
<p></small></p>
</section></section>
<section><section id="data" class="titleslide slide level1"><h1>Data</h1></section><section id="show-case-on-r-bloggers" class="slide level2">
<h1>Show Case on R-Bloggers</h1>
<small>
<p>For this presentation I have used articles from <a href="http://r-bloggers.com/">R-Bloggers</a>.</p>
<p>They can be downloaded from <a href="https://github.com/MarcinKosinski/r-bloggers-harvesting">this repository</a>.</p>
<p>The data harvesting process is explained in this post: <a href="http://r-addict.com/2016/06/21/LDAvis-RBloggers.html">LDAvis Show Case on R-Bloggers</a></p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">library</span>(RSQLite)
db.conn <-<span class="st"> </span>
<span class="st"> </span><span class="kw">dbConnect</span>(
<span class="kw">dbDriver</span>(<span class="st">"SQLite"</span>),
<span class="st">"r-bloggers.db"</span>
)
posts <-<span class="st"> </span><span class="kw">dbGetQuery</span>(db.conn,
<span class="st">"SELECT text from posts"</span>)
<span class="kw">dbDisconnect</span>(db.conn)</code></pre></div>
<p></small></p>
</section><section id="data-preparation-1" class="slide level2">
<h1>Data preparation 1</h1>
<small>
<p>Normally I would use the <code>LDA()</code> function from the <a href="https://cran.r-project.org/web/packages/topicmodels/topicmodels.pdf">topicmodels</a> package to fit an LDA model, because its input can be of class <code>DocumentTermMatrix</code>, which is an object from the <a href="https://cran.r-project.org/web/packages/tm/vignettes/tm.pdf">tm</a> package.</p>
<p>A <code>DocumentTermMatrix</code> object is very convenient for working with text data (<a href="http://www.rexamine.com/2014/06/text-mining-in-r-automatic-categorization-of-wikipedia-articles/">check this Norbert Ryciak’s post</a>) because there exists a <code>tm_map</code> function which can be applied to all documents for stop words removal, lowering capital letters and removal of words that did not occur in x % of documents.</p>
<p>I haven’t seen <code>LDAvis</code> examples for models generated with the topicmodels package, so we will use a traditional approach to text processing.</p>
<p>The <a href="https://en.wikipedia.org/wiki/Lemmatisation">stemming</a> and stopwords removal were performed during the data collection.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">## the following fragment of code in this section
## is motivated by
## http://cpsievert.github.io/LDAvis/reviews/reviews.html
<span class="co"># tokenize on space and output as a list:</span>
doc.list <-<span class="st"> </span><span class="kw">strsplit</span>(posts[, <span class="dv">1</span>], <span class="st">"[[:space:]]+"</span>)
<span class="co"># compute the table of terms:</span>
term.table <-<span class="st"> </span><span class="kw">table</span>(<span class="kw">unlist</span>(doc.list))
term.table <-<span class="st"> </span><span class="kw">sort</span>(term.table, <span class="dt">decreasing =</span> <span class="ot">TRUE</span>)
<span class="co"># remove terms that occur 5 times or fewer:</span>
term.table <-<span class="st"> </span>term.table[term.table ><span class="st"> </span><span class="dv">5</span>]
vocab <-<span class="st"> </span><span class="kw">names</span>(term.table)</code></pre></div>
<p></small></p>
</section><section id="data-preparation-2" class="slide level2">
<h1>Data preparation 2</h1>
<small>
<p>The <code>lda.collapsed.gibbs.sampler()</code> function from the <code>lda</code> package has an inconvenient input format (compared to <code>LDA()</code> from the <code>topicmodels</code> package), so I basically used <a href="https://github.com/cpsievert">cpsievert</a>’s snippets.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># now put the documents into the format required by the lda package:</span>
get.terms <-<span class="st"> </span>function(x) {
index <-<span class="st"> </span><span class="kw">match</span>(x, vocab)
index <-<span class="st"> </span>index[!<span class="kw">is.na</span>(index)]
<span class="kw">rbind</span>(<span class="kw">as.integer</span>(index -<span class="st"> </span><span class="dv">1</span>), <span class="kw">as.integer</span>(<span class="kw">rep</span>(<span class="dv">1</span>, <span class="kw">length</span>(index))))
}
documents <-<span class="st"> </span><span class="kw">lapply</span>(doc.list, get.terms)
<span class="co"># Compute some statistics related to the data set:</span>
D <-<span class="st"> </span><span class="kw">length</span>(documents) <span class="co"># number of documents (3740)</span>
W <-<span class="st"> </span><span class="kw">length</span>(vocab) <span class="co"># number of terms in the vocab (18,536)</span>
doc.length <-<span class="st"> </span><span class="kw">sapply</span>(documents,
function(x) <span class="kw">sum</span>(x[<span class="dv">2</span>, ]))
<span class="co"># number of tokens per document [312, 288, 170, 436, 291, ...]</span>
N <-<span class="st"> </span><span class="kw">sum</span>(doc.length) <span class="co"># total number of tokens in the data (546,827)</span>
term.frequency <-<span class="st"> </span><span class="kw">as.integer</span>(term.table)
<span class="co"># frequencies of terms in the corpus [8939, 5544, 2411, 2410, 2143, ...]</span></code></pre></div>
<p></small></p>
</section></section>
<section><section id="fitting-the-model" class="titleslide slide level1"><h1>Fitting the Model</h1></section><section id="fitting-the-model-1" class="slide level2">
<h1>Fitting the Model</h1>
<small>
<p>From <code>lda</code> package documentation</p>
<blockquote>
<p>… [this function] takes sparsely represented input documents, perform inference, and return point estimates of the latent parameters using the state at the last iteration of Gibbs sampling.</p>
</blockquote>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># MCMC and model tuning parameters:</span>
K <-<span class="st"> </span><span class="dv">20</span>
G <-<span class="st"> </span><span class="dv">5000</span>
alpha <-<span class="st"> </span><span class="fl">0.02</span>
eta <-<span class="st"> </span><span class="fl">0.02</span>
<span class="co"># Fit the model:</span>
<span class="kw">library</span>(lda)
<span class="kw">set.seed</span>(<span class="dv">456</span>)
fit <-<span class="st"> </span><span class="kw">lda.collapsed.gibbs.sampler</span>(
<span class="dt">documents =</span> documents, <span class="dt">K =</span> K,
<span class="dt">vocab =</span> vocab, <span class="dt">num.iterations =</span> G,
<span class="dt">alpha =</span> alpha, <span class="dt">eta =</span> eta,
<span class="dt">initial =</span> <span class="ot">NULL</span>, <span class="dt">burnin =</span> <span class="dv">0</span>,
<span class="dt">compute.log.likelihood =</span> <span class="ot">TRUE</span>
)</code></pre></div>
<p></small></p>
</section><section id="getting-the-model" class="slide level2">
<h1>Getting the Model</h1>
<small>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">library</span>(archivist)
<span class="kw">saveToRepo</span>(fit, <span class="dt">repoDir =</span> <span class="st">"../Museum"</span>)</code></pre></div>
<p>The computations took very long, so in case you would like to get the model faster, I have archived my model on GitHub with the help of the <a href="http://r-bloggers.com/r-hero-saves-backup-city-with-archivist-and-github/">archivist</a> package. You can easily load this model into R with</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">archivist::<span class="kw">aread</span>(<span class="st">'MarcinKosinski/Museum/fa93abf0ff93a7f6f3f0c42b7935ad4d'</span>) -><span class="st"> </span>fit</code></pre></div>
<p></small></p>
</section></section>
<section><section id="ldavis-use-case" class="titleslide slide level1"><h1>LDAvis use case</h1></section><section id="ldavis-use-case-1" class="slide level2">
<h1>LDAvis use case</h1>
<small>
<p>If you search for it, you’ll find that the LDAvis description is</p>
<blockquote>
<p>Tools to create an interactive web-based visualization of a topic model that has been fit to a corpus of text data using Latent Dirichlet Allocation (LDA). Given the estimated parameters of the topic model, it computes various summary statistics as input to an interactive visualization built with D3.js that is accessed via a browser. The goal is to help users interpret the topics in their LDA topic model.</p>
</blockquote>
<p><a href="https://cran.r-project.org/web/packages/LDAvis/vignettes/details.pdf">Detailed vignette about LDAvis input and output can be found here</a>.</p>
<p></small></p>
</section><section id="ldavis-preparations" class="slide level2">
<h1>LDAvis Preparations</h1>
<small>
<p>To visualize the result using LDAvis, we’ll need estimates of the document-topic distributions, which we denote by the D×K matrix theta, and the set of topic-term distributions, which we denote by the K×W matrix phi.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">theta <-<span class="st"> </span><span class="kw">t</span>(<span class="kw">apply</span>(fit$document_sums +<span class="st"> </span>alpha,
<span class="dv">2</span>,
function(x) x/<span class="kw">sum</span>(x)))
phi <-<span class="st"> </span><span class="kw">t</span>(<span class="kw">apply</span>(<span class="kw">t</span>(fit$topics) +<span class="st"> </span>eta,
<span class="dv">2</span>,
function(x) x/<span class="kw">sum</span>(x)))</code></pre></div>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">library</span>(LDAvis)
<span class="co"># create the JSON object to feed the visualization:</span>
json <-<span class="st"> </span><span class="kw">createJSON</span>(
<span class="dt">phi =</span> phi,
<span class="dt">theta =</span> theta,
<span class="dt">doc.length =</span> doc.length,
<span class="dt">vocab =</span> vocab,
<span class="dt">term.frequency =</span> term.frequency
)
<span class="kw">serVis</span>(json, <span class="dt">out.dir =</span> <span class="st">'vis'</span>,
<span class="dt">open.browser =</span> <span class="ot">FALSE</span>)</code></pre></div>
<p></small></p>
</section><section id="result" class="slide level2">
<h1>Result</h1>
<small>
<p>The result is published under this link</p>
<p><a href="http://r-addict.com/r-bloggers-harvesting/" class="uri">http://r-addict.com/r-bloggers-harvesting/</a></p>
<p>where you can check Intertopic Distance Map (via multidimensional scaling) and top N relevant terms for a topic.</p>
<p></small></p>
</section></section>
</div>
</div>
<script src="index_files/reveal.js-3.3.0/lib/js/head.min.js"></script>
<script src="index_files/reveal.js-3.3.0/js/reveal.js"></script>
<script>
  // Start the presentation. The complete option list is documented at
  // https://github.com/hakimel/reveal.js#configuration
  Reveal.initialize({
    slideNumber: true,               // show the page number of the current slide
    history: true,                   // push every slide change to the browser history
    center: true,                    // centre slides vertically
    transition: 'default',           // slide transition: none/fade/slide/convex/concave/zoom
    backgroundTransition: 'default', // full-page background transition: none/fade/slide/convex/concave/zoom
    dependencies: []                 // optional reveal.js plugins (none configured)
  });
</script>
<!-- dynamically load mathjax for compatibility with self-contained -->
<script>
  // Inject the MathJax script tag at runtime rather than declaring it
  // statically in the markup.
  (function () {
    var script = document.createElement("script");
    script.type = "text/javascript";
    // The original host, cdn.mathjax.org, was retired in 2017. Load a pinned
    // MathJax 2.x release from cdnjs instead, with the same TeX-AMS-MML
    // configuration, so the deck does not depend on a defunct CDN or on a
    // moving "latest" alias.
    script.src = "https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.7/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
    document.getElementsByTagName("head")[0].appendChild(script);
  })();
</script>
<script>
  // When jQuery is present on the page, mirror reveal.js slide changes as
  // jQuery events: 'hidden' on the slide being left, 'shown' on the slide
  // being entered. Does nothing when jQuery is not loaded.
  (function() {
    function relaySlideChange(evt) {
      // jQuery is looked up at event time, exactly as the handler needs it.
      window.jQuery(evt.previousSlide).trigger('hidden');
      window.jQuery(evt.currentSlide).trigger('shown');
    }

    if (!window.jQuery) {
      return;
    }
    Reveal.addEventListener( 'slidechanged', relaySlideChange );
  })();
</script>
</body>
</html>