app.py
import base64
import os
import random
import string
import time
from collections import Counter

import pandas as pd
import plotly.express as px
import requests
import streamlit as st
from bs4 import BeautifulSoup
from cryptography.fernet import Fernet
from newspaper import Article
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

from scraping import scrape_articles, get_urls_from_google, filter_links
from text_analysis import plot_n_most_frequent_words, stopwords_removal, extract_ngrams, extract_collocations, display_concordance
from urlscraper import scrape_articles_from_urls_with_progress
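
# The four corpus-based features below repeat the same "merge the uploaded CSV
# files and show an overview" logic. The helper below is a hypothetical sketch
# of how that pattern could be consolidated; it is not part of the original app
# and is never called by app(). It assumes each CSV has "Text" and "Publication"
# columns, as produced by the URL Scraper feature.
def load_corpus_sketch(uploaded_files):
    """Merge uploaded CSV files into one DataFrame and display basic corpus stats."""
    corpus = pd.DataFrame()
    total_words = 0
    for file in uploaded_files:
        file.seek(0)  # rewind the upload buffer before reading
        data = pd.read_csv(file)
        corpus = pd.concat([corpus, data], ignore_index=True)
        total_words += data["Text"].str.split().str.len().sum()
    st.subheader("Overview")
    st.write(f"Total articles: {len(corpus)}")
    st.write(f"Total words: {total_words}")
    publication_counts = corpus["Publication"].value_counts().to_frame().reset_index()
    publication_counts.columns = ["Publication", "Count"]
    st.write("Count of articles by Publication:")
    st.table(publication_counts)
    return corpus
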
def app():
    st.title("SemanText")

    # Features
    option = st.sidebar.selectbox("Select a feature", ["URL Scraper", "Most frequent words", "N-gram",
                                                       "Rule-Based Collocation", "Key Words in Context"])

    # Article scraping from URLs
    if option == "URL Scraper":
        st.markdown("---")
        st.subheader("URL Scraper")
        uploaded_urls = st.file_uploader("Upload a text file", type=["txt"],
                                         help="Upload a text file containing one URL per line, without quotes or separators.")
        if st.button("Scrape the URLs"):
            if uploaded_urls is not None:
                with st.spinner("Scraping in progress..."):
                    with open("temp_url_file.txt", "wb") as temp_file:
                        temp_file.write(uploaded_urls.read())
                    # Scrape articles from the URLs with progress
                    scraped_df = scrape_articles_from_urls_with_progress("temp_url_file.txt")
                st.success("Scraping complete!")
                # Display the scraped data
                st.write(scraped_df)

                # Download the scraped articles
                def download_corpus(scraped_df):
                    csv = scraped_df.to_csv(index=False)
                    b64 = base64.b64encode(csv.encode()).decode()
                    href = f'<a href="data:file/csv;base64,{b64}" download="corpus_by_semantext.csv">Export to CSV</a>'
                    return href

                st.markdown(download_corpus(scraped_df), unsafe_allow_html=True)
                st.markdown("---")
    elif option == "Most frequent words":
        st.markdown("---")
        st.subheader("Most frequent words")
        # Add a file uploader for the CSV files
        uploaded_files = st.file_uploader("Upload CSV file(s)", accept_multiple_files=True, type=["csv"],
                                          help="Upload CSV file(s) containing the articles scraped with the URL Scraper feature.")
        # Merge the uploaded CSV files into a single DataFrame
        if uploaded_files:
            corpus = pd.DataFrame()  # Empty DataFrame to hold the merged corpus
            total_words = 0  # Running total of words across all files
            for file in uploaded_files:
                file.seek(0)  # Reset the file pointer to the beginning of the file
                data = pd.read_csv(file)
                corpus = pd.concat([corpus, data], ignore_index=True)
                total_words += data["Text"].str.split().str.len().sum()  # Word count for the current file
            # Display the total number of rows and words
            st.subheader("Overview")
            st.write(f"Total articles: {len(corpus)}")
            st.write(f"Total words: {total_words}")
            # Display the number of articles per value of the Publication column
            publication_counts = corpus["Publication"].value_counts().to_frame().reset_index()
            publication_counts.columns = ["Publication", "Count"]
            st.write("Count of articles by Publication:")
            st.table(publication_counts)
            st.markdown("---")
            st.subheader("Merged corpus")
            st.write(corpus)
        # Create a table for the most frequent words
        n_words = st.text_input("Maximum words", "30", help="Enter the maximum number of words to display.")
        if 'corpus' in locals() and st.button("Display most frequent words"):
            try:
                n_most_frequent = int(n_words)
            except ValueError:
                st.error("Please enter a valid integer for the number of words.")
                st.stop()
            # Remove stop words and punctuation from the corpus
            corpus['Text'] = corpus['Text'].apply(lambda x: stopwords_removal(x.lower().split()))
            top_n_words = plot_n_most_frequent_words(corpus, "Text", n=n_most_frequent)
            # Build a table of the most frequent words
            words_freq = pd.DataFrame(top_n_words, columns=["Word", "Frequency"])
            # Remove apostrophes and commas from the words in the table
            words_freq["Word"] = words_freq["Word"].str.replace("'", "").str.replace(",", "")
            # Drop any row with an empty string in the Word column
            words_freq = words_freq[words_freq["Word"] != ""]
            words_freq = words_freq.sort_values(by="Frequency", ascending=False)
            st.table(words_freq)

            # Download the most frequent words
            def download_mostfrequentwords(words_freq):
                csv = words_freq.to_csv(index=False)
                b64 = base64.b64encode(csv.encode()).decode()
                href = f'<a href="data:file/csv;base64,{b64}" download="mostfrequentwords_by_semantext.csv">Export to CSV</a>'
                return href

            st.markdown(download_mostfrequentwords(words_freq), unsafe_allow_html=True)
            st.markdown("---")
            # Create a horizontal bar chart using Plotly
            fig = px.bar(
                words_freq,
                x='Frequency',
                y='Word',
                orientation='h',
                labels={'y': 'Word', 'x': 'Frequency'}
            )
            fig.update_layout(
                title="Most frequent words",
                xaxis_title="Frequency",
                yaxis_title="Word",
                autosize=False,
                width=800,
                height=500
            )
            st.plotly_chart(fig)
    elif option == "Rule-Based Collocation":
        st.markdown("---")
        st.subheader("Rule-Based Collocation")
        # Add a file uploader for the CSV files
        uploaded_files = st.file_uploader("Upload CSV file(s)", accept_multiple_files=True, type=["csv"],
                                          help="Upload CSV file(s) containing the articles scraped with the URL Scraper feature.")
        # Merge the uploaded CSV files into a single DataFrame
        if uploaded_files:
            corpus = pd.DataFrame()  # Empty DataFrame to hold the merged corpus
            total_words = 0  # Running total of words across all files
            for file in uploaded_files:
                file.seek(0)  # Reset the file pointer to the beginning of the file
                data = pd.read_csv(file)
                corpus = pd.concat([corpus, data], ignore_index=True)
                total_words += data["Text"].str.split().str.len().sum()  # Word count for the current file
            # Display the total number of rows and words
            st.subheader("Overview")
            st.write(f"Total articles: {len(corpus)}")
            st.write(f"Total words: {total_words}")
            # Display the number of articles per value of the Publication column
            publication_counts = corpus["Publication"].value_counts().to_frame().reset_index()
            publication_counts.columns = ["Publication", "Count"]
            st.write("Count of articles by Publication:")
            st.table(publication_counts)
            st.markdown("---")
            st.subheader("Merged corpus")
            st.write(corpus)

        # Save collocations as CSV
        def download_collocations(collocations_df):
            csv = collocations_df.to_csv(index=False)
            b64 = base64.b64encode(csv.encode()).decode()
            href = f'<a href="data:file/csv;base64,{b64}" download="collocations_by_semantext.csv">Export to CSV</a>'
            return href

        st.markdown("---")
        # Extract the possible collocations
        if 'corpus' in locals() and st.button("Extract collocations"):
            # Show a spinner while the collocations are being extracted
            with st.spinner("Extracting collocations..."):
                collocations_df = extract_collocations(corpus, st)
            # Hide the spinner and display the top collocations and their frequencies in a table
            st.success("Collocations extracted!")
            st.subheader("Top collocations:")
            st.table(collocations_df.head(50))
            st.markdown(download_collocations(collocations_df), unsafe_allow_html=True)
            st.markdown("---")
            st.subheader("'NOUN + ADJ' Collocations:")
            noun_adj_df = collocations_df[collocations_df['pos_pattern'] == 'NOUN + ADJ']
            st.table(noun_adj_df)
            st.subheader("'NOUN + NOUN' Collocations:")
            noun_noun_df = collocations_df[collocations_df['pos_pattern'] == 'NOUN + NOUN']
            st.table(noun_noun_df)
            st.subheader("'VERB + NOUN' Collocations:")
            verb_noun_df = collocations_df[collocations_df['pos_pattern'] == 'VERB + NOUN']
            st.table(verb_noun_df)
    elif option == "N-gram":
        st.markdown("---")
        st.subheader("N-gram")
        n_value = st.number_input("Enter the value of 'n' for n-grams", min_value=2, step=1, value=2,
                                  help="Enter the value of 'n' for n-grams.")
        # Add a file uploader for the CSV files
        uploaded_files = st.file_uploader("Upload CSV file(s)", accept_multiple_files=True, type=["csv"],
                                          help="Upload CSV file(s) containing the articles scraped with the URL Scraper feature.")
        # Merge the uploaded CSV files into a single DataFrame
        if uploaded_files:
            corpus = pd.DataFrame()  # Empty DataFrame to hold the merged corpus
            total_words = 0  # Running total of words across all files
            for file in uploaded_files:
                file.seek(0)  # Reset the file pointer to the beginning of the file
                data = pd.read_csv(file)
                corpus = pd.concat([corpus, data], ignore_index=True)
                total_words += data["Text"].str.split().str.len().sum()  # Word count for the current file
            # Display the total number of rows and words
            st.subheader("Overview")
            st.write(f"Total articles: {len(corpus)}")
            st.write(f"Total words: {total_words}")
            # Display the number of articles per value of the Publication column
            publication_counts = corpus["Publication"].value_counts().to_frame().reset_index()
            publication_counts.columns = ["Publication", "Count"]
            st.write("Count of articles by Publication:")
            st.table(publication_counts)
            st.subheader("Merged corpus")
            st.write(corpus)
        # Extract n-grams
        if 'corpus' in locals() and st.button("Extract n-grams"):
            # Show a spinner while n-grams are being extracted
            with st.spinner("Extracting n-grams..."):
                ngrams_df = extract_ngrams(corpus, n=n_value)
            # Hide the spinner and display the n-grams in a table
            st.success("N-grams extracted!")
            st.write(f"Top {n_value}-grams:")
            st.table(ngrams_df.head(50))

            # Download the extracted n-grams
            def download_ngram(ngrams_df):
                csv = ngrams_df.to_csv(index=False)
                b64 = base64.b64encode(csv.encode()).decode()
                href = f'<a href="data:file/csv;base64,{b64}" download="n_gram_by_semantext.csv">Export to CSV</a>'
                return href

            st.markdown(download_ngram(ngrams_df), unsafe_allow_html=True)
            st.markdown("---")
    elif option == "Key Words in Context":
        st.markdown("---")
        st.subheader("Key Words in Context")
        # Add a file uploader for the CSV files
        uploaded_files = st.file_uploader("Upload CSV file(s)", accept_multiple_files=True, type=["csv"],
                                          help="Upload CSV file(s) containing the articles scraped with the URL Scraper feature.")
        # Merge the uploaded CSV files into a single DataFrame
        if uploaded_files:
            corpus = pd.DataFrame()  # Empty DataFrame to hold the merged corpus
            total_words = 0  # Running total of words across all files
            for file in uploaded_files:
                file.seek(0)  # Reset the file pointer to the beginning of the file
                data = pd.read_csv(file)
                corpus = pd.concat([corpus, data], ignore_index=True)
                total_words += data["Text"].str.split().str.len().sum()  # Word count for the current file
            # Display the total number of rows and words
            st.subheader("Overview")
            st.write(f"Total articles: {len(corpus)}")
            st.write(f"Total words: {total_words}")
            # Display the number of articles per value of the Publication column
            publication_counts = corpus["Publication"].value_counts().to_frame().reset_index()
            publication_counts.columns = ["Publication", "Count"]
            st.write("Count of articles by Publication:")
            st.table(publication_counts)
            st.markdown("---")
            st.subheader("Merged corpus")
            st.write(corpus)
            st.markdown("---")
            st.subheader("Filter")
            # Let the user choose the column to search and the keyword to search for
            col = st.selectbox("Select a column to search:", options=corpus.columns,
                               help="Select a column to search for keywords.")
            keyword = st.text_input("Enter a keyword to search for:", help="Enter a keyword to search for.")
            # Generate and display the concordance
            if st.button("Generate Concordance"):
                concordance_df = display_concordance(corpus, col, keyword)
                st.subheader("Concordance")
                st.table(concordance_df)
    # Footer
    with st.container():
        st.markdown("---")
        st.markdown("Developed by MW Hidayat. Find me on [Twitter](https://twitter.com/casecrit)")


# Run the Streamlit app
if __name__ == '__main__':
    st.set_page_config(page_title='SemanText', page_icon=':newspaper:')
    app()
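
# Usage note: the app is launched with `streamlit run app.py`; the scraping and
# text-analysis helpers come from the project's own scraping, text_analysis and
# urlscraper modules.
#
# For illustration only: a minimal sketch of the kind of logic the imported
# scrape_articles_from_urls_with_progress() helper might contain, assuming it
# reads one URL per line, fetches each article with newspaper3k, and reports
# progress through a Streamlit progress bar. The actual implementation lives in
# urlscraper.py and may differ; this function is hypothetical and never called.
def example_scrape_with_progress(url_file):
    with open(url_file, encoding="utf-8") as f:
        urls = [line.strip() for line in f if line.strip()]
    progress = st.progress(0)
    total = len(urls) or 1  # avoid division by zero on an empty file
    rows = []
    for i, url in enumerate(urls, start=1):
        try:
            article = Article(url)
            article.download()
            article.parse()
            rows.append({"Publication": article.source_url, "Title": article.title,
                         "Text": article.text, "URL": url})
        except Exception as exc:
            st.warning(f"Could not scrape {url}: {exc}")
        progress.progress(i / total)  # fraction between 0.0 and 1.0
    return pd.DataFrame(rows)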