-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathBTM.Rmd
111 lines (83 loc) · 2.74 KB
/
BTM.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
---
title: "BTM for KC"
author: "smn"
date: "11/03/2025"
output: pdf_document
---
```{r}
library(tidytext)
library(tidyverse)
library(BTM)
library(textplot)
library(ggraph)
```
```{r}
# ---- BTM on OSM tags (KC) ----
# Fits a biterm topic model on the tokenized OSM tag values, saves the topic
# plot as a PNG, and writes the per-document topic scores to CSV.
# NOTE(review): paths in this chunk start at "docs/..." while the wiki chunk
# uses "../../docs/..." -- confirm which working directory is intended.

# Load data
kc_osm_dbscan_tag <- read.csv("docs/docs_agile/kc_osm_dbscan_tag.csv")

# Pre-process: replace underscores with spaces (presumably so multi-word tags
# are split into separate tokens below -- confirm)
kc_osm_dbscan_tag$value <- gsub("_", " ", kc_osm_dbscan_tag$value)

# Tokenize with tidytext. BTM() and predict() read the FIRST column as the
# document id and the SECOND as the token, so keep exactly those two columns
# instead of relying on the column layout of the CSV.
kc_osm_dbscan_tag2 <- kc_osm_dbscan_tag %>%
  rename(doc_id = cluster, text = value) %>%
  unnest_tokens(output = token, input = text, token = "words") %>%
  dplyr::select(doc_id, token)

# Run the BTM model for KC (detailed = TRUE is kept from the original call;
# the plot below is drawn from this fitted model)
model_osm_kc <- BTM(
  kc_osm_dbscan_tag2,
  k = 10,
  iter = 2000,
  detailed = TRUE
)

# Visualize the topic terms; each topic is labelled with its theta as a percent
png(
  "docs/docs_agile/btm_osm_kc_agile2.png",
  width = 1400, height = 1200,
  res = 200
)
plot(
  model_osm_kc,
  top_n = 10,
  title = "BTM for OSM tags",
  labels = paste0(round(model_osm_kc$theta * 100, 2), "%")
)
# Close the PNG device so the file is flushed to disk and later plots do not
# end up in it
invisible(dev.off())

# Top 10 tokens of each topic
topicterms_osm <- terms(model_osm_kc, top_n = 10)

# Topic scores per document; Topic is the name of the highest-scoring column.
# ties.method = "first" keeps the label reproducible (the default is "random").
topic_prob_df <- as.data.frame(
  predict(
    model_osm_kc,
    newdata = kc_osm_dbscan_tag2,
    type = "sum_b"
  )
) %>%
  mutate(Topic = names(.)[max.col(., ties.method = "first")])

# Row names of the prediction are stored as the cluster id
# (presumably the doc_id values -- verify against predict.BTM output)
topic_prob_df$cluster <- rownames(topic_prob_df)
write.csv(topic_prob_df, "docs/docs_agile/topic_prob_kc_osm.csv")
```
```{r}
# ---- BTM on Wikipedia summaries (KC) ----
# Fits a biterm topic model on the tokenized wiki summaries, saves the topic
# plot as a PNG, and writes the per-document topic scores to CSV.
# NOTE(review): paths in this chunk start at "../../docs/..." while the OSM
# chunk uses "docs/..." -- confirm which working directory is intended.

# Load data
wiki_kc <- read.csv("../../docs/docs_agile/kc_wiki_dbscan_tag.csv")

# Tokenize with tidytext. BTM() and predict() read the FIRST column as the
# document id and the SECOND as the token, so keep exactly those two columns.
# This also drops the `X` column that was previously removed by name, along
# with any other extra column that would otherwise shift the token column.
wiki_kc_new <- wiki_kc %>%
  rename(doc_id = cluster, text = clean_summary) %>%
  unnest_tokens(output = word, input = text, token = "words") %>%
  dplyr::select(doc_id, word)

# Run the BTM model (the plot below is drawn from this fitted model)
model_wiki_sum <- BTM(
  wiki_kc_new,
  k = 6,
  iter = 2000,
  background = TRUE,
  trace = 100,
  detailed = TRUE
)

# Visualize the topic terms; each topic is labelled with its theta as a percent
png(
  "../../docs/docs_agile/btm_wiki_kc_agile.png",
  width = 1400, height = 1200,
  res = 200
)
plot(
  model_wiki_sum,
  top_n = 10,
  title = "BTM for wiki summary",
  labels = paste0(round(model_wiki_sum$theta * 100, 2), "%")
)
# Close the PNG device so the file is flushed to disk and later plots do not
# end up in it
invisible(dev.off())

# Show the top tokens in each topic
topicterms_wiki <- terms(model_wiki_sum, top_n = 10)

# Show the topic scores for each doc; Topic is the name of the highest-scoring
# column. ties.method = "first" keeps the label reproducible (default "random").
topic_prob_df <- as.data.frame(
  predict(
    model_wiki_sum,
    newdata = wiki_kc_new,
    type = "sum_b"
  )
) %>%
  mutate(Topic = names(.)[max.col(., ties.method = "first")])

# Row names of the prediction are stored as the cluster id
# (presumably the doc_id values -- verify against predict.BTM output)
topic_prob_df$cluster <- rownames(topic_prob_df)
write.csv(topic_prob_df, "../../docs/docs_agile/wiki_kc_btm.csv")
```