forked from renanj/IAT
-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathdata_preparation.py
153 lines (125 loc) · 5.79 KB
/
data_preparation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import shutil
from clustering import prepare_data
from random import shuffle
from math import ceil
from os import listdir, mkdir, system
from os.path import isfile, isdir, join, exists
from knn_labeling import run_knn
import pandas as pd
def check_input(text, max_num):
try:
assert int(text) > 0
assert int(text) <= max_num
return True
except:
print('Batch size should be an integer between 1 and', max_num)
def check_case(text, dataframes_path = None):
if text == 'ALL':
for i in range(num_batches):
prepare_data('dataframes/batch{:04d}'.format(i+1), join(batches_path, 'batch{:04d}'.format(i+1)))
print('\nBatch', i+1, 'prepared.\n')
return False
elif text == 'SKIP':
return False
else:
while not check_input(text, num_batches):
text = input('\nChoose batch number for feature extraction: ')
batch_id = int(text)
prepare_data(dataframes_path + 'batch{:04d}'.format(batch_id), join(batches_path, 'batch{:04d}'.format(batch_id)))
return True
def show_status(batches_path, dataframes_path):
batch1_is_labeled = False
batches_list = listdir(batches_path)
batches_list.sort()
print()
for batch in batches_list:
batch_file = dataframes_path + batch + '.csv'
if isfile(batch_file):
df = pd.read_csv(batch_file)
not_labeled = 0
value_counts = df['colors'].value_counts()
if 0 in value_counts:
not_labeled = value_counts[0]
num_rows = len(df)
num_labeled = num_rows-not_labeled
if not_labeled == 0:
print(batch, '-> features are already extracted. All ' + str(num_rows) + ' images are labeled.')
if batch == 'batch0001':
batch1_is_labeled = True
else:
print(batch, '-> features are already extracted. ' + str(num_labeled) + ' out of ' + str(num_rows) + ' images are labeled.')
else:
print(batch, '-> needs features extraction.')
return batch1_is_labeled
projects_path = 'main/assets/'
print('\nList of ongoing projects:')
list_of_projects = listdir(projects_path)
for folder in list_of_projects:
if isdir(join(projects_path, folder)):
print(folder)
text = input('\nType name of ongoing project, or "NEW" to begin a project: ')
while text not in list_of_projects and text != 'NEW':
text = input('\nType name of ongoing project, or "NEW" to begin a project: ')
if text == 'NEW':
text = input('\nChoose a name for the new project: ')
while text in list_of_projects:
print(text, 'is already an ongoing project.')
text = input('\nChoose a name for the new project: ')
mkdir(join(projects_path, text))
mkdir(join(projects_path, text, 'samples'))
mkdir(join(projects_path, text, 'dataframes'))
mkdir(join(projects_path, text, 'images'))
project_name = text
dataframes_path = 'main/assets/' + project_name + '/dataframes/'
samples_path = 'main/assets/' + project_name + '/samples/' + project_name
batches_path = 'main/assets/' + project_name + '/images/'
print('\nChecking dataset', project_name, '...\n')
if project_name not in list_of_projects:
if not exists(project_name):
raise Exception('Dataset ' + project_name + ' not found.')
shutil.move(project_name, samples_path)
num_batches = len(listdir(batches_path))
if num_batches == 0: #creating batches if they do not exist yet
files = [f for f in listdir(samples_path) if isfile(join(samples_path, f))]
print(str(len(files)) + ' images found.')
text = input('\nChoose batch size (default = 1000): ')
while not check_input(text, len(files)):
text = input('\nChoose batch size (default = 1000): ')
if text == '':
batch_size = 1000
else:
batch_size = int(text)
print('\nProceeding with batch size = ' + str(batch_size))
shuffle(files)
num_batches = ceil(len(files)/batch_size)
for i in range(num_batches):
folder_name = 'batch{:04d}'.format(i+1)
inner_folder_name = folder_name + '/samples'
if exists(join(batches_path, folder_name)):
shutil.rmtree(join(batches_path, folder_name))
mkdir(join(batches_path, folder_name))
mkdir(join(batches_path, inner_folder_name))
folder_files = files[i*batch_size:(i+1)*batch_size]
for f in folder_files:
shutil.move(join(samples_path, f), join(batches_path, inner_folder_name))
shutil.rmtree(samples_path)
print('\n' + str(num_batches) + ' batches created:')
batch1_is_labeled = show_status(batches_path, dataframes_path)
text = input('\nChoose batch number for feature extraction, "ALL" for all batches, or "SKIP" to proceed to labeling: ')
while check_case(text, dataframes_path):
batch1_is_labeled = show_status(batches_path, dataframes_path)
text = input('\nChoose batch number for feature extraction, "ALL" for all batches, or "SKIP" to proceed to labeling: ')
text = input('\nChoose batch for labeling: ')
while not check_input(text, num_batches):
text = input('\nChoose batch for labeling: ')
batch_id = int(text)
path_to_images = join(batches_path, 'batch{:04d}'.format(batch_id), 'samples/')
path_to_csv = dataframes_path + 'batch{:04d}'.format(batch_id) + '.csv'
if batch_id != 1 and batch1_is_labeled:
text = input("\nGenerate labels from batch 1? ('YES', 'NO'): ").lower()
while(text != 'yes' and text != 'no'):
text = input("\nGenerate labels from batch 1? ('YES', 'NO'): ").lower()
if text == 'yes':
path_to_csv = run_knn(join(dataframes_path, 'batch0001.csv'), path_to_csv)
print('\nUsing KNN to infer the labels for', path_to_csv)
system('python main/app.py ' + path_to_images + ' ' + path_to_csv)