# charts_data.py
# Data reading
import pandas as pd
import numpy as np
# NLP
import nltk
from wordcloud import WordCloud
import re
# Network
import networkx as nx
from pyvis.network import Network
import community.community_louvain as cl
# Download the NLTK stopword list and extend it with filler words common in dialogue
nltk.download('stopwords', quiet=True)
stop_words = nltk.corpus.stopwords.words('english')
stop_words.extend(["uh", "oh", "okay", "im", "dont", "know", "yeah", "thats", "youre", "well", "what", "ok", "isnt",
                   "yes", "no", "theres", "cant", "didnt", "whats"])

def compute_basic_analytics(script, characters):
    """Return the headline counts: episodes, characters, lines, and seasons."""
    total_seasons = int(script['Season'].max())
    total_episodes = int(characters['num_episodes'].max())
    total_characters = int(characters['name'].count())
    total_lines = int(script['Line'].count())
    return total_episodes, total_characters, total_lines, total_seasons

def count_lines_by_character(script):
    """Count lines per character, plus each character's share of all dialogue."""
    char_lines = (script.groupby(['Character'], as_index=False)['Line']
                  .count().sort_values(['Line'], ascending=False))
    char_lines['Percent'] = char_lines['Line'] / char_lines['Line'].sum()
    return char_lines

def num_characters_per_season(script):
    """Count the distinct characters that speak in each season."""
    seasons_char = (script.groupby(['Season'], as_index=False)['Character']
                    .nunique().sort_values('Character', ascending=False))
    return seasons_char

def num_lines_per_season(script):
    """Count the lines of dialogue in each season."""
    count_lines = (script.groupby(['Season'], as_index=False)['Line']
                   .count().sort_values('Line', ascending=True))
    return count_lines

def extract_character_lines(character_name, script):
    """Return a Series with every line spoken by the given character."""
    character_lines = script[script['Character'] == character_name].reset_index(drop=True)
    return character_lines['Line']

def clean_data(character_lines):
    """Join a character's lines into one lowercase string with punctuation and stopwords removed."""
    data = " ".join(character_lines)  # Join the lines into a single text
    data = re.sub(r'[^\w\s]', '', data)  # Strip punctuation
    tokens = [word.lower() for word in data.split()]  # Lowercase every token
    tokens = [word for word in tokens if word not in stop_words]  # Drop stopwords
    text = " ".join(tokens)  # Re-join into one cleaned string
    return text
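
# Quick illustration of clean_data() on hypothetical input:
#   clean_data(pd.Series(["Oh, I know!", "Coffee, please."]))  ->  "coffee please"
#   ("oh", "i", and "know" are all filtered out as stopwords)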

def word_count(text):
    """Count how often each whitespace-separated word occurs in `text`."""
    counts = dict()
    for word in text.split():
        if word in counts:
            # Seen before: increment its frequency
            counts[word] += 1
        else:
            # First occurrence: start the count at 1
            counts[word] = 1
    return counts
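
# For reference, the standard library computes the same frequencies in one
# call; this equivalent helper is a sketch and is not used elsewhere here.
from collections import Counter

def word_count_counter(text):
    """Equivalent of word_count() built on collections.Counter."""
    return dict(Counter(text.split()))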

def generate_wordcloud(text):
    """Render a word-cloud image from the cleaned text."""
    word_cloud = WordCloud(width=1920, height=1080, background_color="white",
                           colormap='tab20b', collocations=False).generate(text)
    return word_cloud
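
# Usage sketch (this helper is an assumption, not part of the original app):
# a WordCloud object can be written straight to disk with to_file().
def save_wordcloud(text, path="wordcloud.png"):
    generate_wordcloud(text).to_file(path)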

def prepare_network_data(script, characters):
    """Build a Source/Target/Count edge list from characters who share scenes.

    The dataset has no scene markers, so every 5 consecutive lines of the
    script are treated as one 'scene'.
    """
    window = 5
    script = script[7:].copy()  # Skip the leading non-dialogue rows
    script['Scene'] = np.arange(len(script)) // window  # Label each 5-line window with a scene id
    char_dict = {}
    for _, group_df in script.groupby('Scene'):
        # Get the characters that appear in the scene, sorted alphabetically
        char_in_scene = ", ".join(group_df['Character'].sort_values().unique())
        # Count how often this exact group of characters shares a scene
        char_dict[char_in_scene] = char_dict.get(char_in_scene, 0) + 1
    # Sort the dictionary by count, descending
    sorted_dict = dict(sorted(char_dict.items(), key=lambda item: item[1], reverse=True))
    # Build every alphabetically ordered pair of characters
    relations = []
    for x in characters['name']:
        for y in characters['name']:
            if x < y:  # Alphabetical order also rules out x == y
                relations.append(x + ', ' + y)
    # Keep only the pairs that actually appear together in scenes
    rel_dict = {x: sorted_dict[x] for x in relations if x in sorted_dict}
    # List of pairs sorted by count
    sorted_rel = sorted(rel_dict.items(), key=lambda item: item[1], reverse=True)
    # Turn the pair counts into an edge-list dataframe
    network_df = pd.DataFrame(sorted_rel, columns=['Source', 'Count'])
    network_df[['Source', 'Target']] = network_df['Source'].str.split(',', expand=True)
    # Keep only pairs that share more than 2 scenes
    network_df = network_df[network_df['Count'] > 2].reset_index(drop=True)
    network_df['Count'] = network_df['Count'].astype(int)
    # Reorder the columns
    network_df = network_df.loc[:, ['Source', 'Target', 'Count']]
    return network_df
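
# Illustration of the expected edge-list shape (the numbers are hypothetical):
#      Source   Target  Count
#   0  LORELAI    RORY    812
#   1  LORELAI    LUKE    403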

def build_network(network_df):
    """Build a networkx graph from the edge list and render it as interactive HTML with pyvis."""
    # Edges
    network_df.columns = ['Source', 'Target', 'Count']
    network_df['Source'] = network_df['Source'].str.strip()  # Trim the stray spaces left by the split
    network_df['Target'] = network_df['Target'].str.strip()
    # Create a graph from the pandas dataframe
    G = nx.from_pandas_edgelist(network_df,
                                source="Source",
                                target="Target",
                                edge_attr="Count",
                                create_using=nx.Graph())
    # Detect communities with the Louvain method
    communities = cl.best_partition(G)
    # node_degree = nx.betweenness_centrality(G)  # Alternative sizing metric
    node_degree = dict(G.degree)
    # Color nodes by community
    nx.set_node_attributes(G, communities, 'group')
    # Size nodes by their number of interactions
    nx.set_node_attributes(G, node_degree, 'size')
    # Render the interactive graph and save it as HTML
    net = Network(notebook=False, width="1000px", height="900px", bgcolor='white', font_color='black')
    net.from_nx(G)
    net.save_graph('GilmoreGirlsNetwork.html')
    with open('GilmoreGirlsNetwork.html', 'r', encoding='utf-8') as f:
        html_file = f.read()
    html_file = html_file.replace('border: 1px solid lightgray;', '')  # Drop pyvis's default border
    return G, html_file
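
# The returned HTML string is presumably embedded in the app's front end; with
# Streamlit, for example, a minimal sketch would be:
#   import streamlit.components.v1 as components
#   components.html(html_file, height=900, width=1000)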

def centralities_charts(G):
    """Compute degree, betweenness, closeness, and eigenvector centrality tables."""
    def to_df(centrality_dict):
        return (pd.DataFrame(centrality_dict.items(), columns=['name', 'centrality'])
                .sort_values(['centrality'], ascending=False))

    degree_df = to_df(nx.degree_centrality(G))
    betweenness_df = to_df(nx.betweenness_centrality(G))
    closeness_df = to_df(nx.closeness_centrality(G))
    eigenvector_df = to_df(nx.eigenvector_centrality(G))
    return degree_df, betweenness_df, closeness_df, eigenvector_df
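
if __name__ == "__main__":
    # End-to-end sketch of how these helpers fit together. The CSV paths and
    # the character name are assumptions; substitute the files the app loads.
    script = pd.read_csv("Gilmore_Girls_Lines.csv")
    characters = pd.read_csv("characters.csv")
    print(compute_basic_analytics(script, characters))

    # Most-frequent words for a single character
    text = clean_data(extract_character_lines("LORELAI", script))
    top_words = sorted(word_count(text).items(), key=lambda kv: kv[1], reverse=True)
    print(top_words[:10])

    # Character co-occurrence network and its centrality tables
    G, _ = build_network(prepare_network_data(script, characters))
    degree_df, betweenness_df, closeness_df, eigenvector_df = centralities_charts(G)
    print(degree_df.head())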