-
Notifications
You must be signed in to change notification settings - Fork 2
/
statistics.py
134 lines (105 loc) · 4.38 KB
/
statistics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import extractors
from constants import *
import pandas as pd
from numpy import mean
from scipy.stats import trim_mean
import plotly.express as px
from collections import Counter
import enchant
# US-English dictionary used later to test whether survey tags are real words.
dictionary = enchant.Dict("en_US")
# Accumulates participation/timing stats; rendered by print_stats() below.
raw_stats = {}
def print_stats(title, library):
    """Print the key/value pairs of *library* as a left-aligned stats table.

    Numeric values (int or float) are rendered with two decimal places;
    string values are printed verbatim; values of any other type are
    silently skipped. A blank line is printed after the table.
    """
    print('-------------- %s --------------'%title)
    print('{:<35s}{:<8s}'.format('STAT', 'VALUE'))
    print('-----------------------------------------------')
    for stat_name, stat_value in library.items():
        if isinstance(stat_value, (int, float)):
            print('{:<35s}{:<8.2f}'.format(stat_name, float(stat_value)))
        elif isinstance(stat_value, str):
            print('{:<35s}{:<35s}'.format(stat_name, stat_value))
    print()
# Load the raw Qualtrics export; QUALTRICS_FILE comes from constants.
dataframe = pd.read_csv(QUALTRICS_FILE)
# get participation stats
progress_df = dataframe['Progress']
# NOTE(review): the first two rows are skipped — presumably the Qualtrics
# header/metadata rows that precede real responses; confirm against the export.
progress_df = progress_df[2:]
progress = progress_df.values.tolist()
# Progress values are compared as strings ('100'), so the CSV column is
# evidently read as text rather than numbers.
completion_count = progress.count('100')
total_responses = len(progress)
abandon_count = total_responses - completion_count
raw_stats['total responses'] = total_responses
raw_stats['completion count'] = completion_count
raw_stats['abandon count'] = abandon_count
# (total - abandon) is by construction equal to completion_count.
raw_stats['completion rate %'] = (total_responses - abandon_count) / total_responses * 100
# get completion time stats
# Only fully finished responses ('Finished' column is the string 'True').
completes_df = dataframe[dataframe.Finished == 'True']
duration_df = completes_df['Duration (in seconds)']
duration = duration_df.tolist()
# assumes every duration is a whole number of seconds (int-parsable string)
# — TODO confirm against the export; convert seconds -> minutes.
completion_times = list(map(int, duration))
completion_times = [seconds / 60 for seconds in completion_times]
raw_stats['average completion time (m)'] = mean(completion_times)
raw_stats['min completion time (m)'] = min(completion_times)
raw_stats['max completion time (m)'] = max(completion_times)
# Trimmed mean discards the fastest/slowest 20% on each end (scipy trim_mean).
raw_stats['trim mean completion time (m)'] = trim_mean(completion_times, 0.2)
# Per-question response data; RESPONSE_FILE comes from constants.
response_df = pd.read_csv(RESPONSE_FILE)
# find questions with no tags
# A question with no tags stores the literal string '[]' in 'Emotion Tags'.
empty_questions_df = response_df.loc[response_df['Emotion Tags'] == '[]']
empty_questions = list(empty_questions_df['Question Number'])
# Stored as a string so print_stats() will render it (it only prints str/num).
raw_stats['empty questions'] = str(empty_questions)
print_stats('SURVEY STATS', raw_stats)
# map the locations
# Plot each completed response's reported coordinates as red dots on a world
# map. (The previous latlong_df slice was assigned but never used — removed;
# scatter_geo takes the full frame plus column names directly.)
fig = px.scatter_geo(completes_df, lat='LocationLatitude', lon='LocationLongitude').update_traces(marker=dict(color='red'))
fig.update_layout(title='World map', title_x=0.5)
fig.show()
# get clean survey tags
# extract_survey() returns the full tag list plus the descriptor-class and
# emotion-class sub-lists (with repeats); de-duplicate each for unique counts.
all_tags, descriptor_tags, emotion_tags = extractors.extract_survey()
descriptor_tags_set = set(descriptor_tags)
emotion_tags_set = set(emotion_tags)
all_tags_set = set(all_tags)
# Tags that participants used both as a descriptor and as an emotion.
tags_in_both = descriptor_tags_set & emotion_tags_set
tag_stats = {
    'total tags': len(all_tags),
    'total unique tags': len(all_tags_set),
    'total unique tag/class pairs': len(emotion_tags_set) + len(descriptor_tags_set),
    'total descriptor tags': len(descriptor_tags),
    'unique descriptor tags': len(descriptor_tags_set),
    'total emotion tags': len(emotion_tags),
    'unique emotion tags': len(emotion_tags_set),
    'tags described as both': len(tags_in_both),
}
# Share of unique tags that are real English words per the en_US dictionary.
indict_count = sum(1 for tag in all_tags_set if dictionary.check(tag))
tag_stats['% tags in dictionary'] = indict_count/len(all_tags_set)*100
# read literature tags from xlsx
# LIBRARY_FILE comes from constants; one row per literature sound descriptor.
df = pd.read_excel(LIBRARY_FILE, sheet_name='Sound Descriptors')
lit_tag_stats = {}
lit_tag_stats['total tags'] = len(df)
# Normalise the literature vocabulary to lower case for comparison with tags.
lit_words = df['Word'].to_list()
lit_words = [w.lower() for w in lit_words]
# detect duplicates
# Counter makes this O(n) — the previous list.count() inside the loop was
# O(n^2) and emitted the warning once per occurrence rather than per word.
for item, occurrences in Counter(lit_words).items():
    if occurrences > 1:
        print(f'WARNING: found duplicate in lit: {item}')
# How much of the literature vocabulary also appears among the survey tags.
survey_coverage = sum(1 for t in lit_words if t in all_tags_set)
lit_tag_stats['% words in survey tags'] = survey_coverage/len(lit_words)*100
lit_tag_stats['% words not in survey tags'] = 100 - lit_tag_stats['% words in survey tags']
# How much of the survey vocabulary also appears in the literature words.
# Build a set once: membership tests against the lit_words *list* made the
# original loop accidentally quadratic; set lookup is O(1) per tag.
lit_words_set = set(lit_words)
lit_coverage = sum(1 for t in all_tags_set if t in lit_words_set)
tag_stats['% words in lit words'] = lit_coverage/len(all_tags_set)*100
tag_stats['% not in lit words'] = 100 - tag_stats['% words in lit words']
# print the stats
print_stats('SURVEY TAG STATS', tag_stats)
print_stats('LITERATURE TAG STATS', lit_tag_stats)
print('-------TAGS IN BOTH EMOTION AND DESCRIPTOR---------------')
print(tags_in_both)
print()
print('-------MOST COMMON SURVEY TAGS------------')
# Ten most frequent tags across all survey responses, with their counts.
occurence_count = Counter(all_tags).most_common(10)
print(occurence_count)
print()