<!DOCTYPE html>
<!-- index.html -->
<html lang="en">
<head>
  <meta charset="utf-8">
  <meta name="generator" content="pandoc">
  <meta name="author" content="Marcin Kosiński" />
  <meta name="dcterms.date" content="2016-10-24" />
  <title>  Creating LDAvis - A Tutorial</title>
  <meta name="apple-mobile-web-app-capable" content="yes">
  <meta name="apple-mobile-web-app-status-bar-style" content="black-translucent">
  <meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no, minimal-ui">
  <link rel="stylesheet" href="index_files/reveal.js-3.3.0/css/reveal.css"/>


<style type="text/css">
div.sourceCode { overflow-x: auto; }
table.sourceCode, tr.sourceCode, td.lineNumbers, td.sourceCode {
  margin: 0; padding: 0; vertical-align: baseline; border: none; }
table.sourceCode { width: 100%; line-height: 100%; }
td.lineNumbers { text-align: right; padding-right: 4px; padding-left: 4px; color: #aaaaaa; border-right: 1px solid #aaaaaa; }
td.sourceCode { padding-left: 5px; }
code > span.kw { color: #007020; font-weight: bold; } /* Keyword */
code > span.dt { color: #902000; } /* DataType */
code > span.dv { color: #40a070; } /* DecVal */
code > span.bn { color: #40a070; } /* BaseN */
code > span.fl { color: #40a070; } /* Float */
code > span.ch { color: #4070a0; } /* Char */
code > span.st { color: #4070a0; } /* String */
code > span.co { color: #60a0b0; font-style: italic; } /* Comment */
code > span.ot { color: #007020; } /* Other */
code > span.al { color: #ff0000; font-weight: bold; } /* Alert */
code > span.fu { color: #06287e; } /* Function */
code > span.er { color: #ff0000; font-weight: bold; } /* Error */
code > span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warning */
code > span.cn { color: #880000; } /* Constant */
code > span.sc { color: #4070a0; } /* SpecialChar */
code > span.vs { color: #4070a0; } /* VerbatimString */
code > span.ss { color: #bb6688; } /* SpecialString */
code > span.im { } /* Import */
code > span.va { color: #19177c; } /* Variable */
code > span.cf { color: #007020; font-weight: bold; } /* ControlFlow */
code > span.op { color: #666666; } /* Operator */
code > span.bu { } /* BuiltIn */
code > span.ex { } /* Extension */
code > span.pp { color: #bc7a00; } /* Preprocessor */
code > span.at { color: #7d9029; } /* Attribute */
code > span.do { color: #ba2121; font-style: italic; } /* Documentation */
code > span.an { color: #60a0b0; font-weight: bold; font-style: italic; } /* Annotation */
code > span.cv { color: #60a0b0; font-weight: bold; font-style: italic; } /* CommentVar */
code > span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Information */
</style>

<link rel="stylesheet" href="index_files/reveal.js-3.3.0/css/theme/black.css" id="theme">

<style type="text/css">
.reveal section img {
  background: rgba(255, 255, 255, 0.85);
}
</style>

  <!-- some tweaks to reveal css -->
  <style type="text/css">
    .reveal h1 { font-size: 2.0em; }
    .reveal h2 { font-size: 1.5em;  }
    .reveal h3 { font-size: 1.25em;	}
    .reveal h4 { font-size: 1em;	}

    .reveal .slides>section,
    .reveal .slides>section>section {
      padding: 0px 0px;
    }


    .reveal table {
      border-width: 1px;
      border-spacing: 2px;
      border-style: dotted;
      border-color: gray;
      border-collapse: collapse;
      font-size: 0.7em;
    }

    .reveal table th {
      border-width: 1px;
      padding-left: 10px;
      padding-right: 25px;
      font-weight: bold;
      border-style: dotted;
      border-color: gray;
    }

    .reveal table td {
      border-width: 1px;
      padding-left: 10px;
      padding-right: 25px;
      border-style: dotted;
      border-color: gray;
    }

  </style>

    <style type="text/css">code{white-space: pre;}</style>

    <!-- Printing and PDF exports -->
    <script>
      var link = document.createElement( 'link' );
      link.rel = 'stylesheet';
      link.type = 'text/css';
      link.href = window.location.search.match( /print-pdf/gi ) ? 'index_files/reveal.js-3.3.0/css/print/pdf.css' : 'index_files/reveal.js-3.3.0/css/print/paper.css';
      document.getElementsByTagName( 'head' )[0].appendChild( link );
    </script>
    <!--[if lt IE 9]>
    <script src="index_files/reveal.js-3.3.0/lib/js/html5shiv.js"></script>
    <![endif]-->

    <link href="index_files/font-awesome-4.5.0/css/font-awesome.min.css" rel="stylesheet" />
</head>
<body>
  <div class="reveal">
    <div class="slides">

<section data-background-image="imgs/bg-small.png">
    <h1 class="title"><br><small> Creating <a href="https://cran.r-project.org/web/packages/LDAvis/index.html">LDAvis</a> - A Tutorial </small></h1>
  <h1 class="subtitle"><small> <br><a href="http://r-addict.com/About.html">Marcin Kosiński</a> </small></h1>
    <h2 class="author"><small><a href='https://r-addict.com'><i class='fa fa-comment'></i></a>  <a href='https://stackoverflow.com/users/3857701'><i class='fa fa-stack-overflow'></i></a>  <a href='https://github.com/MarcinKosinski'><i class='fa fa-github'></i></a>  <a href='mailto:m.p.kosinski@gmail.com'><i class='fa fa-envelope-o'></i></a></small><br></h2>
    <h3 class="date">October 24, 2016</h3>
</section>

<section><section id="motivation" class="titleslide slide level1"><h1>Motivation</h1></section><section id="motivation-1" class="slide level2">
<h1>Motivation</h1>
<small>
<ul>
<li>Text mining is a new challenge for machine learning practitioners.</li>
<li>The increased interest in text mining is driven by the growing number of internet users and by the rapid growth of internet data, of which <em>80% is said to be text data</em>.</li>
<li>Extracting information from articles, news, posts and comments has become a desirable skill, but what is needed even more are tools for diagnosing and visualizing text mining models.</li>
<li>Such visualizations make it easier to understand the insights from a model and provide an easy interface for presenting your research results to a wider audience.</li>
</ul>
<p></small></p>
</section></section>
<section><section id="lda-overview" class="titleslide slide level1"><h1>LDA overview</h1></section><section id="lda-overview-1" class="slide level2">
<h1>LDA overview</h1>
<p>From <a href="https://en.wikipedia.org/wiki/Latent_Dirichlet_allocation">Wikipedia</a></p>
<small>
<blockquote>
<p>In natural language processing, latent Dirichlet allocation (LDA) is a generative statistical model that allows sets of observations to be explained by unobserved groups that explain why some parts of the data are similar. For example, if observations are words collected into documents, it posits that each document is a mixture of a small number of topics and that each word’s creation is attributable to one of the document’s topics.</p>
</blockquote>
<p></small></p>
</section></section>
<section><section id="data" class="titleslide slide level1"><h1>Data</h1></section><section id="show-case-on-r-bloggers" class="slide level2">
<h1>Show Case on R-Bloggers</h1>
<small>
<p>For this presentation I have used articles from <a href="http://r-bloggers.com/">R-Bloggers</a>.</p>
<p>They can be downloaded from <a href="https://github.com/MarcinKosinski/r-bloggers-harvesting">this repository</a>.</p>
<p>The data harvesting process is explained in this post: <a href="http://r-addict.com/2016/06/21/LDAvis-RBloggers.html">LDAvis Show Case on R-Bloggers</a></p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">library</span>(RSQLite)
db.conn &lt;-<span class="st"> </span>
<span class="st">  </span><span class="kw">dbConnect</span>(
    <span class="kw">dbDriver</span>(<span class="st">&quot;SQLite&quot;</span>),
    <span class="st">&quot;r-bloggers.db&quot;</span>
)
posts &lt;-<span class="st"> </span><span class="kw">dbGetQuery</span>(db.conn, 
           <span class="st">&quot;SELECT text from posts&quot;</span>)
<span class="kw">dbDisconnect</span>(db.conn)</code></pre></div>
<p></small></p>
</section><section id="data-preparation-1" class="slide level2">
<h1>Data preparation 1</h1>
<small>
<p>Normally I would use <code>LDA()</code> function from <a href="https://cran.r-project.org/web/packages/topicmodels/topicmodels.pdf">topicmodels</a> package to fit LDA model because the input can be of class <code>DocumentTermMatrix</code> which is an object from <a href="https://cran.r-project.org/web/packages/tm/vignettes/tm.pdf">tm</a> package.</p>
<p>A <code>DocumentTermMatrix</code> object is very convenient for working with text data (<a href="http://www.rexamine.com/2014/06/text-mining-in-r-automatic-categorization-of-wikipedia-articles/">see this post by Norbert Ryciak</a>) because there is a <code>tm_map</code> function which can be applied to all documents for stop word removal, converting capital letters to lowercase, and removal of words that did not occur in x % of documents.</p>
<p>I haven’t seen <code>LDAvis</code> examples for models generated with the <code>topicmodels</code> package, so we will use a traditional approach to text processing.</p>
<p>The <a href="https://en.wikipedia.org/wiki/Lemmatisation">stemming</a> and stopwords removal was performed during the data collection.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">## the following fragment of code in this section
## is motivated by
## http://cpsievert.github.io/LDAvis/reviews/reviews.html
<span class="co"># tokenize on space and output as a list:</span>
doc.list &lt;-<span class="st"> </span><span class="kw">strsplit</span>(posts[, <span class="dv">1</span>], <span class="st">&quot;[[:space:]]+&quot;</span>)
<span class="co"># compute the table of terms:</span>
term.table &lt;-<span class="st"> </span><span class="kw">table</span>(<span class="kw">unlist</span>(doc.list))
term.table &lt;-<span class="st"> </span><span class="kw">sort</span>(term.table, <span class="dt">decreasing =</span> <span class="ot">TRUE</span>)
<span class="co"># keep only terms that occur more than 5 times:</span>
term.table &lt;-<span class="st"> </span>term.table[term.table &gt;<span class="st"> </span><span class="dv">5</span>]
vocab &lt;-<span class="st"> </span><span class="kw">names</span>(term.table)</code></pre></div>
<p></small></p>
</section><section id="data-preparation-2" class="slide level2">
<h1>Data preparation 2</h1>
<small>
<p>The <code>lda.collapsed.gibbs.sampler()</code> function from the <code>lda</code> package has an inconvenient input format (compared to <code>LDA()</code> from the <code>topicmodels</code> package), so I basically used <a href="https://github.com/cpsievert">cpsievert</a>’s snippets.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># now put the documents into the format required by the lda package:</span>
get.terms &lt;-<span class="st"> </span>function(x) {
  index &lt;-<span class="st"> </span><span class="kw">match</span>(x, vocab)
  index &lt;-<span class="st"> </span>index[!<span class="kw">is.na</span>(index)]
  <span class="kw">rbind</span>(<span class="kw">as.integer</span>(index -<span class="st"> </span><span class="dv">1</span>), <span class="kw">as.integer</span>(<span class="kw">rep</span>(<span class="dv">1</span>, <span class="kw">length</span>(index))))
}
documents &lt;-<span class="st"> </span><span class="kw">lapply</span>(doc.list, get.terms)

<span class="co"># Compute some statistics related to the data set:</span>
D &lt;-<span class="st"> </span><span class="kw">length</span>(documents)  <span class="co"># number of documents (3740)</span>
W &lt;-<span class="st"> </span><span class="kw">length</span>(vocab)  <span class="co"># number of terms in the vocab (18,536)</span>
doc.length &lt;-<span class="st"> </span><span class="kw">sapply</span>(documents, 
                     function(x) <span class="kw">sum</span>(x[<span class="dv">2</span>, ]))  
<span class="co"># number of tokens per document [312, 288, 170, 436, 291, ...]</span>
N &lt;-<span class="st"> </span><span class="kw">sum</span>(doc.length)  <span class="co"># total number of tokens in the data (546,827)</span>
term.frequency &lt;-<span class="st"> </span><span class="kw">as.integer</span>(term.table)  
<span class="co"># frequencies of terms in the corpus [8939, 5544, 2411, 2410, 2143, ...]</span></code></pre></div>
<p></small></p>
</section></section>
<section><section id="fitting-the-model" class="titleslide slide level1"><h1>Fitting the Model</h1></section><section id="fitting-the-model-1" class="slide level2">
<h1>Fitting the Model</h1>
<small>
<p>From <code>lda</code> package documentation</p>
<blockquote>
<p>… [this function] takes sparsely represented input documents, perform inference, and return point estimates of the latent parameters using the state at the last iteration of Gibbs sampling.</p>
</blockquote>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># MCMC and model tuning parameters:</span>
K &lt;-<span class="st"> </span><span class="dv">20</span>
G &lt;-<span class="st"> </span><span class="dv">5000</span>
alpha &lt;-<span class="st"> </span><span class="fl">0.02</span>
eta &lt;-<span class="st"> </span><span class="fl">0.02</span>

<span class="co"># Fit the model:</span>
<span class="kw">library</span>(lda)
<span class="kw">set.seed</span>(<span class="dv">456</span>)
fit &lt;-<span class="st"> </span><span class="kw">lda.collapsed.gibbs.sampler</span>(
  <span class="dt">documents =</span> documents, <span class="dt">K =</span> K, 
  <span class="dt">vocab =</span> vocab, <span class="dt">num.iterations =</span> G, 
  <span class="dt">alpha =</span> alpha, <span class="dt">eta =</span> eta, 
  <span class="dt">initial =</span> <span class="ot">NULL</span>, <span class="dt">burnin =</span> <span class="dv">0</span>,
  <span class="dt">compute.log.likelihood =</span> <span class="ot">TRUE</span>
)</code></pre></div>
<p></small></p>
</section><section id="getting-the-model" class="slide level2">
<h1>Getting the Model</h1>
<small>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">library</span>(archivist)
<span class="kw">saveToRepo</span>(fit,  <span class="dt">repoDir =</span> <span class="st">&quot;../Museum&quot;</span>)</code></pre></div>
<p>The computations took a very long time, so in case you would like to get the model faster, I have archived my model on GitHub with the help of the <a href="http://r-bloggers.com/r-hero-saves-backup-city-with-archivist-and-github/">archivist</a> package. You can easily load this model into R with</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">archivist::<span class="kw">aread</span>(<span class="st">&#39;MarcinKosinski/Museum/fa93abf0ff93a7f6f3f0c42b7935ad4d&#39;</span>) -&gt;<span class="st"> </span>fit</code></pre></div>
<p></small></p>
</section></section>
<section><section id="ldavis-use-case" class="titleslide slide level1"><h1>LDAvis use case</h1></section><section id="ldavis-use-case-1" class="slide level2">
<h1>LDAvis use case</h1>
<small>
<p>If you search for it, you’ll find that the LDAvis description is:</p>
<blockquote>
<p>Tools to create an interactive web-based visualization of a topic model that has been fit to a corpus of text data using Latent Dirichlet Allocation (LDA). Given the estimated parameters of the topic model, it computes various summary statistics as input to an interactive visualization built with D3.js that is accessed via a browser. The goal is to help users interpret the topics in their LDA topic model.</p>
</blockquote>
<p><a href="https://cran.r-project.org/web/packages/LDAvis/vignettes/details.pdf">Detailed vignette about LDAvis input and output can be found here</a>.</p>
<p></small></p>
</section><section id="ldavis-preparations" class="slide level2">
<h1>LDAvis Preparations</h1>
<small>
<p>To visualize the result using LDAvis, we’ll need estimates of the document-topic distributions, which we denote by the D×K matrix theta, and the set of topic-term distributions, which we denote by the K×W matrix phi.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">theta &lt;-<span class="st"> </span><span class="kw">t</span>(<span class="kw">apply</span>(fit$document_sums +<span class="st"> </span>alpha,
                 <span class="dv">2</span>,
                 function(x) x/<span class="kw">sum</span>(x)))
phi &lt;-<span class="st"> </span><span class="kw">t</span>(<span class="kw">apply</span>(<span class="kw">t</span>(fit$topics) +<span class="st"> </span>eta,
               <span class="dv">2</span>,
               function(x) x/<span class="kw">sum</span>(x)))</code></pre></div>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">library</span>(LDAvis)
<span class="co"># create the JSON object to feed the visualization:</span>
json &lt;-<span class="st"> </span><span class="kw">createJSON</span>(
  <span class="dt">phi =</span> phi, 
  <span class="dt">theta =</span> theta, 
  <span class="dt">doc.length =</span> doc.length, 
  <span class="dt">vocab =</span> vocab, 
  <span class="dt">term.frequency =</span> term.frequency
)
<span class="kw">serVis</span>(json, <span class="dt">out.dir =</span> <span class="st">&#39;vis&#39;</span>, 
       <span class="dt">open.browser =</span> <span class="ot">FALSE</span>)</code></pre></div>
<p></small></p>
</section><section id="result" class="slide level2">
<h1>Result</h1>
<small>
<p>The result is published under this link</p>
<p><a href="http://r-addict.com/r-bloggers-harvesting/" class="uri">http://r-addict.com/r-bloggers-harvesting/</a></p>
<p>where you can check Intertopic Distance Map (via multidimensional scaling) and top N relevant terms for a topic.</p>
<p></small></p>
</section></section>
    </div>
  </div>

  <script src="index_files/reveal.js-3.3.0/lib/js/head.min.js"></script>
  <script src="index_files/reveal.js-3.3.0/js/reveal.js"></script>

  <script>

      // Full list of configuration options available at:
      // https://github.com/hakimel/reveal.js#configuration
      Reveal.initialize({
        // Display the page number of the current slide
        slideNumber: true,
        // Push each slide change to the browser history
        history: true,
        // Vertical centering of slides
        center: true,
        // Transition style
        transition: 'default', // none/fade/slide/convex/concave/zoom
        // Transition style for full page slide backgrounds
        backgroundTransition: 'default', // none/fade/slide/convex/concave/zoom

        // Optional reveal.js plugins
        dependencies: [
        ]
      });
    </script>
  <!-- dynamically load mathjax for compatibility with self-contained -->
  <script>
    (function () {
      var script = document.createElement("script");
      script.type = "text/javascript";
      script.src  = "https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
      document.getElementsByTagName("head")[0].appendChild(script);
    })();
  </script>

<script>
  (function() {
    if (window.jQuery) {
      Reveal.addEventListener( 'slidechanged', function(event) {  
        window.jQuery(event.previousSlide).trigger('hidden');
        window.jQuery(event.currentSlide).trigger('shown');
      });
    }
  })();
</script>


  </body>
</html>