-
Notifications
You must be signed in to change notification settings - Fork 0
/
STM_validation_export.R
139 lines (89 loc) · 4.13 KB
/
STM_validation_export.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
# This is a script that exports sample texts for topics into an Excel file
# one topic == one sheet
# Load libraries ----------------------------------------------------------
library(XLConnect)
library(stm)
library(dplyr)
library(stringr)
library(purrr)
# Load topic objects ------------------------------------------------------
# NOTE: These objects are exported from the STM R script to avoid re-running
# the model (saves time).
stmmodel <- readRDS(file = "stmmodel")
stmdata <- readRDS(file = "stmdata")
# Topic labels (n = 10), kept for the sheet headings written further down.
labels <- stmmodel %>%
  labelTopics(n = 10)
# saveRDS(labels, "labels")
# Getting topic texts ---------------------------------------------------
# Number of topics to export.
topicnr <- 90
# Clean encoding issues in the document texts.
# The replacements run in the order listed and the result is written back to
# stmdata$meta$alltext in a single assignment.
stmdata$meta$alltext <- stmdata$meta$alltext %>%
  # mojibake of the right single quotation mark (its UTF-8 bytes E2 80 99
  # read as single-byte characters) -> plain apostrophe
  str_replace_all("â\u0080\u0099", "'") %>%
  # decode the HTML ampersand entity; the former pattern replaced "&" with
  # "&", which changed nothing
  str_replace_all("&amp;", "&") %>%
  # zero-width space (U+200B) -> ordinary space
  # NOTE(review): if the texts also carry the undecoded entity "&#8203;",
  # it is not handled here - confirm against the data
  str_replace_all("\u200b", " ") %>%
  # replace anything starting with "http" up to the next space or ")"
  str_replace_all("(http[^ )]*)", "LINK REMOVED")
# Helper to iterate over: fetch example texts for one topic.
#   i: topic number, handed to findThoughts() as `topics`.
# Reads the globals `stmmodel` and `stmdata`; asks for n = 25 documents and
# returns the `docs` element of the findThoughts() result.
get_thoughts <- function(i) {
  thoughts <- findThoughts(
    model = stmmodel,
    texts = stmdata$meta$alltext,
    topics = i,
    n = 25
  )
  thoughts[["docs"]]
}
# Iterate get_thoughts() over each topic and convert the resulting list to a
# data frame; column i is read later as the texts of topic i.
# seq_len() replaces seq(1:topicnr): it is the direct form and yields an empty
# sequence (instead of 1:2) should topicnr ever be 0.
df_thoughts <- as.data.frame(lapply(seq_len(topicnr), get_thoughts))
#saveRDS(df_thoughts, "df_thoughts")
# Creating Excel workbook -------------------------------------------------
# load/create empty excel workbook
# NOTE(review): with create = TRUE a file that already exists is presumably
# opened as-is rather than replaced - confirm that re-runs start from the
# intended state.
db <- loadWorkbook("topic_examples_FIN.xlsx", create = TRUE)
# Function to iterate over later: write one topic into its own worksheet.
#   i: topic number; selects column i of `df_thoughts` and row i of
#      `labels$frex`.
# Works on the global workbook `db` (side effects only):
#   * creates the sheet "Topic <i>"
#   * writes the example texts, split into two columns, starting at row 3
#   * writes a heading with the topic's frex terms into A1
make_excel <- function(i) {
  # build the sheet name once instead of repeating paste() in every call
  sheet_name <- paste("Topic", i)
  createSheet(db, sheet_name)

  # split each text at the first "//" into two pieces (title and text)
  topic_texts <- str_split_fixed(df_thoughts[, i], fixed("//"), 2)
  writeWorksheet(db, topic_texts,
    sheet = sheet_name, startRow = 3, startCol = 1,
    header = TRUE
  )

  # heading for A1: "Topic <i> : term1, term2, ..."
  heading <- paste(
    sheet_name, ":",
    str_c(labels$frex[i, 1:10], collapse = ", ")
  )
  # FALSE spelled out: `F` is an ordinary variable that can be reassigned
  writeWorksheet(db, heading,
    sheet = sheet_name, startRow = 1, startCol = 1,
    header = FALSE
  )
}
# run function over each topic - create the final Excel file for topic validation
# walk() instead of map(): make_excel() is called only for its side effects,
# so no list of return values needs to be collected.
walk(seq_len(topicnr), make_excel)
# Styling the excel -------------------------------------------------------
# add styling (widen cells to make them readable)
# sheets are addressed by index 1..topicnr here
setColumnWidth(db, seq_len(topicnr), 1, 10000)
setColumnWidth(db, seq_len(topicnr), 2, 30000)
# create an empty style
cs <- createCellStyle(db)
# Specify to wrap the text
setWrapText(cs, wrap = TRUE)
# apply wrapping-style to rows 4-30 of both columns on every sheet
## NOTE: applying in two steps because XLConnect gets weird if done in one go
walk(
  seq_len(topicnr),
  ~ setCellStyle(db, sheet = .x, row = 4:30, col = 2, cellstyle = cs)
)
walk(
  seq_len(topicnr),
  ~ setCellStyle(db, sheet = .x, row = 4:30, col = 1, cellstyle = cs)
)
# making the heading (A1) yellow
get_yellow <- createCellStyle(db)
# Specify the fill foreground color
setFillForegroundColor(get_yellow, color = XLC$COLOR.GOLD)
# specify fill type
setFillPattern(get_yellow, fill = XLC$FILL.SOLID_FOREGROUND)
# make it wrap
setWrapText(get_yellow, wrap = TRUE)
# Set the cell style created above for the top left cell (A1) of every sheet.
# Each iteration styles its own sheet (.x); previously the lambda ignored .x
# and passed all sheets at once, restyling every sheet topicnr times.
walk(
  seq_len(topicnr),
  ~ setCellStyle(db, sheet = .x, row = 1, col = 1, cellstyle = get_yellow)
)
# Saving the Excel workbook -----------------------------------------------
# save the result
saveWorkbook(db, "topic_examples_FIN.xlsx")
# Alternative solution: long format (used for shiny app) ------------------
# ## iterate over all topics
# df_thoughts_long <- as.data.frame(lapply(seq(1:topicnr), get_thoughts)) %>%
# # pivoting to longer format
# pivot_longer(cols = starts_with("Topic"), names_to = "topic", values_to = "text") %>%
# mutate(topic = as.numeric(str_remove(topic, "Topic."))) %>%
# separate(text, into = c("title", "text"), sep = "//") %>%
# arrange(topic)
#
# saveRDS(df_thoughts_long, "topic_explorer/df_thoughts_long")