# aby_warburg_analysis3.py
# Streamlit word-analysis tool (Aby Warburg-inspired): upload a CSV, pick a
# text column, and render frequency, network, topic, and embedding views.
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import nltk
from nltk.corpus import stopwords
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud
from nltk import ngrams
from gensim import corpora
from gensim.models import LdaModel
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
from sklearn.decomposition import PCA
from gensim.models import Word2Vec
import spacy
from spacy import displacy
# Ensure the NLTK English stopword corpus is available; download it once on
# first run if missing. (The scraped source had its indentation stripped,
# which made this try/except a SyntaxError — restored here.)
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# Lowercase English stopwords used to filter tokens during analysis.
stop_words = set(stopwords.words('english'))
# ---------------------------------------------------------------------------
# Streamlit app body: upload a CSV, choose a text column, then render a
# battery of word-level visualizations — frequency table, word cloud,
# adjacent-token network, co-occurrence heatmap, bigram network, LDA topics,
# Word2Vec embeddings projected with PCA, and a spaCy dependency parse.
# (Indentation restored; the scraped source had lost all nesting.)
# ---------------------------------------------------------------------------
st.title("Aby Warburg-inspired Word Analysis Tool")

# Step 1: Upload Dataset
uploaded_file = st.file_uploader("Upload your dataset (CSV)", type=["csv"])
if uploaded_file:
    df = pd.read_csv(uploaded_file)
    st.write("Dataset Preview:", df.head())

    # Select the text column for analysis
    text_column = st.selectbox("Select the text column for analysis", df.columns)

    # Step 2: Preprocess and Analyze Text
    if st.button("Analyze Words"):
        # Combine all rows into one string; dropna() keeps NaN cells from
        # turning into literal "nan" tokens.
        all_text = ' '.join(df[text_column].dropna().astype(str))

        # Basic whitespace tokenization, lowercased.
        words = all_text.lower().split()

        # Keep alphabetic, non-stopword tokens only.
        words = [word for word in words if word.isalpha() and word not in stop_words]

        if not words:
            # Every downstream step (WordCloud, LDA, Word2Vec) fails on
            # empty input — bail out with a message instead of a traceback.
            st.warning("No analyzable words found in the selected column.")
            st.stop()

        # --- Word frequencies -------------------------------------------
        word_counts = Counter(words)
        common_words = word_counts.most_common(20)
        st.write("Most Common Words:", common_words)

        # --- Word Cloud -------------------------------------------------
        st.subheader("Word Cloud")
        wordcloud = WordCloud(width=800, height=400,
                              background_color='white').generate(' '.join(words))
        fig, ax = plt.subplots(figsize=(10, 5))
        ax.imshow(wordcloud, interpolation='bilinear')
        ax.axis('off')
        st.pyplot(fig)   # pass the figure explicitly; st.pyplot(plt) is deprecated
        plt.close(fig)   # release the figure to avoid memory growth across reruns

        # --- Word connection network (edges between adjacent tokens) ----
        st.subheader("Word Connection Network")
        G = nx.Graph()
        for left, right in zip(words, words[1:]):
            G.add_edge(left, right)
        fig, ax = plt.subplots(figsize=(10, 8))
        nx.draw_networkx(G, ax=ax, with_labels=True, node_size=20,
                         font_size=10, node_color='skyblue', edge_color='gray')
        ax.set_title('Connections Between Words')
        st.pyplot(fig)
        plt.close(fig)

        # --- Co-occurrence matrix heatmap -------------------------------
        # NOTE(review): X.T * X builds a full vocab x vocab matrix from the
        # single joined document; for large vocabularies this heatmap can be
        # slow and unreadable — consider capping with max_features.
        st.subheader("Co-occurrence Matrix Heatmap")
        vectorizer = CountVectorizer(ngram_range=(1, 1))
        X = vectorizer.fit_transform([' '.join(words)])
        Xc = (X.T * X)   # document-level term-term co-occurrence counts
        Xc.setdiag(0)    # zero the diagonal (a term's co-occurrence with itself)
        vocab = vectorizer.get_feature_names_out()
        co_occurrence_matrix = pd.DataFrame(data=Xc.toarray(), columns=vocab, index=vocab)
        fig, ax = plt.subplots(figsize=(10, 8))
        sns.heatmap(co_occurrence_matrix, cmap="YlGnBu", linewidths=0.1, ax=ax)
        ax.set_title('Co-occurrence Matrix Heatmap')
        st.pyplot(fig)
        plt.close(fig)

        # --- Bigram network (edges weighted by bigram frequency) --------
        st.subheader("Bigram Network")
        bigram_counts = Counter(ngrams(words, 2))
        G = nx.Graph()
        for (first, second), count in bigram_counts.items():
            G.add_edge(first, second, weight=count)
        fig, ax = plt.subplots(figsize=(10, 8))
        pos = nx.spring_layout(G, k=1.5)
        nx.draw_networkx(G, pos, ax=ax, node_color='skyblue', edge_color='gray',
                         node_size=500, font_size=10, with_labels=True)
        ax.set_title('Bigram Network')
        st.pyplot(fig)
        plt.close(fig)

        # --- LDA topic modeling (single-document corpus) ----------------
        st.subheader("LDA Topic Modeling")
        dictionary = corpora.Dictionary([words])
        corpus = [dictionary.doc2bow(words)]
        lda_model = LdaModel(corpus, num_topics=5, id2word=dictionary, passes=15)
        lda_vis = gensimvis.prepare(lda_model, corpus, dictionary)
        html_string = pyLDAvis.prepared_data_to_html(lda_vis)
        st.components.v1.html(html_string, height=800)

        # --- Word embeddings visualized via 2-D PCA ---------------------
        st.subheader("Word Embeddings (PCA)")
        model = Word2Vec([words], vector_size=100, window=5, min_count=1, workers=4)
        word_vectors = model.wv[model.wv.key_to_index]
        pca_result = PCA(n_components=2).fit_transform(word_vectors)
        fig, ax = plt.subplots(figsize=(10, 8))
        ax.scatter(pca_result[:, 0], pca_result[:, 1])
        for i, word in enumerate(model.wv.key_to_index):
            ax.annotate(word, xy=(pca_result[i, 0], pca_result[i, 1]))
        ax.set_title('Word Embeddings Visualization (PCA)')
        st.pyplot(fig)
        plt.close(fig)

        # --- Dependency parsing with spaCy ------------------------------
        st.subheader("Dependency Parsing (spaCy)")
        try:
            nlp = spacy.load('en_core_web_sm')
        except OSError:
            # spacy.load raises OSError when the model package is absent;
            # surface an actionable message instead of crashing the app.
            st.error("spaCy model 'en_core_web_sm' is not installed. "
                     "Run: python -m spacy download en_core_web_sm")
        else:
            doc = nlp(" ".join(words[:10]))  # Example sentence
            html = displacy.render(doc, style='dep', jupyter=False)
            st.components.v1.html(html, height=300)