-
Notifications
You must be signed in to change notification settings - Fork 2
/
extractors.py
124 lines (100 loc) · 4.46 KB
/
extractors.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import csv
import re
import pandas as pd
from constants import *
# Extracts tags from the current processed tags file SURVEY_FILE
def extract_survey():
descriptor_tags = []
emotion_tags = []
all_tags = []
with open(SURVEY_FILE) as data_file:
csv_reader = csv.reader(data_file, delimiter=',')
complete_counter = 0
for row in csv_reader:
# in current layour data is from row[17] to row[218]
# skip column label rows
if row[6] != "TRUE" and row[6] != "FALSE":
continue
# skip the response if not complete
if row[6] == 'FALSE':
continue
# count complete responses
if row[6] == 'TRUE':
complete_counter += 1
# go through row values
for idx, item in enumerate(row):
if(idx > 16 and idx < 219 and row[idx]):
# split string on a variety of symbols
# print(row[idx])
line_tags = re.split(pattern=r"[,./&]", string=row[idx])
# strip spaces from front end end of string
line_tags = [x.strip() for x in line_tags]
# convert to lower case
line_tags = [x.lower() for x in line_tags]
# remove empty strings
line_tags = [x for x in line_tags if x]
# split on or and and
for index, tag in enumerate(line_tags):
and_str = ' and '
or_str = ' or '
if and_str in tag:
split_tag = tag.split(and_str)
line_tags = line_tags[:index] + \
split_tag + line_tags[index+1:]
if or_str in tag:
split_tag = tag.split(or_str)
line_tags = line_tags[:index] + \
split_tag + line_tags[index+1:]
# store tags in appropriate lists
if idx % 2 == 0:
descriptor_tags = descriptor_tags + line_tags
else:
emotion_tags = emotion_tags + line_tags
all_tags = all_tags + line_tags
# print('extracted results from %d complete resplonses'% complete_counter)
# print('returning %d descripton and %d emotion tags'% (len(descriptor_tags), len(emotion_tags),))
return all_tags, descriptor_tags, emotion_tags
# Extracts tags from the hand made descriptors sheet. 'SoundDescriptorsParsed'
def extract_literature():
read_file = pd.read_excel (LIBRARY_FILE, sheet_name='SoundDescriptors')
descriptors = list(read_file['Word'])
# lower case
descriptors = [x.lower() for x in descriptors]
# remove duplicates
descriptors = list(set(descriptors))
print('returning %d tags from lit'% len(descriptors))
return descriptors
def extract_question_num():
read_file = pd.read_csv(SURVEY_FILE)
descriptor_questions = []
emotion_questions = []
descriptor_tags = []
emotion_tags = []
for q_num in range(1,101):
for sub_q in range(1,3):
col_name = 'Q' + str(q_num) + '_' + str(sub_q)
mylist = list(read_file[col_name])
clean_tags = [item.split(',') for item in mylist[2:] if not(pd.isnull(item)) == True]
clean_tags = [item for sublist in clean_tags for item in sublist]
# convert to lower case
clean_tags = [x.lower().strip() for x in clean_tags]
# remove empty strings
clean_tags = [x for x in clean_tags if x]
clean_counter = 0
if sub_q == 1:
for t in clean_tags:
if t not in emotion_tags:
clean_counter +=1
emotion_tags.append(t)
for x in range(1, clean_counter + 1):
emotion_questions.append(q_num)
elif sub_q == 2:
for t in clean_tags:
if t not in descriptor_tags:
clean_counter +=1
descriptor_tags.append(t)
for x in range(1, clean_counter + 1):
descriptor_questions.append(q_num)
else:
print('error')
return descriptor_tags, emotion_tags, descriptor_questions, emotion_questions