-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathdataset_definition.py
144 lines (125 loc) · 5.5 KB
/
dataset_definition.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import numpy as np
import matplotlib.pyplot as plt
import os
import cv2
import json
import glob
import pandas as pd
import tensorflow as tf
import shutil
from sklearn.model_selection import StratifiedKFold
from sklearn import preprocessing
from PIL import Image
from tqdm import tqdm
# Default dataset locations, relative to the current working directory.
# IMG_PATH and JSON_PATH end with a separator; TRAIN_PATH and TEST_PATH do not.
IMG_PATH = "../synthetic_dataset/images/"
JSON_PATH = "../synthetic_dataset/labels/"
TRAIN_PATH = "../synthetic_dataset/train"
TEST_PATH = "../synthetic_dataset/test"


def extractLabels(img_path=IMG_PATH,json_path=JSON_PATH):
    '''
    Collect the label of every annotated shape together with its image path.

    Every ``.json`` file directly inside ``json_path`` is parsed. For each
    entry of its ``'shapes'`` list one (image path, label) pair is recorded,
    so an image with several shapes appears several times in the result.

    Parameters:
        img_path: directory that is joined with each file's ``'imagePath'``.
        json_path: directory holding the annotation files; a trailing
            separator is optional.

    Returns:
        A tuple ``(img_names, labels, nlabels)`` where ``img_names`` is a
        numpy array of image paths, ``labels`` is a numpy array of the label
        strings and ``nlabels`` is the same labels as a plain list. The three
        sequences are index-aligned.

    Raises:
        OSError: if ``json_path`` cannot be listed or a file cannot be opened.
        KeyError: if a file lacks ``'imagePath'``, ``'shapes'`` or a shape
            lacks ``'label'``.
    '''
    # These classes do not have the minimum number of samples needed to be
    # split, so their shapes are left out.
    excluded_labels = ('Metal_Chain', 'WashingMachine')

    nlabels = []
    img_names = []
    # os.listdir returns entries in arbitrary order; sorting keeps the output
    # (and any order-dependent split built from it) reproducible.
    for file_name in sorted(os.listdir(json_path)):
        if not file_name.endswith(".json"):
            continue
        # os.path.join works whether or not json_path ends with a separator.
        with open(os.path.join(json_path, file_name)) as json_file:
            content = json.load(json_file)
        # Image this annotation file refers to.
        image = os.path.join(img_path, content['imagePath'])
        for shape in content['shapes']:
            label = shape['label']
            if label in excluded_labels:
                continue
            img_names.append(image)
            nlabels.append(label)

    labels = np.array(nlabels)
    img_names = np.array(img_names)
    return img_names, labels, nlabels
def encodeLabels(labels, nlabels):
    '''
    Encode string labels as integer class indices and print the class counts.

    The distinct values of ``nlabels`` are sorted and each label is replaced
    by the position of its class in that sorted order. One line of the form
    ``<class> : <count>`` is printed per class.

    Parameters:
        labels: not used; kept so existing callers do not break.
        nlabels: sequence of label strings to encode.

    Returns:
        A numpy integer array, index-aligned with ``nlabels``, holding the
        class index of every label.
    '''
    # A single pass yields the sorted classes, the per-label class index and
    # the per-class counts, so no separate encoder fit/transform is needed.
    classes, encoded, counts = np.unique(
        nlabels, return_inverse=True, return_counts=True
    )
    # Show how many samples each class has.
    for class_name, n_samples in zip(classes, counts):
        print(class_name, ':', n_samples)
    return encoded
def _ensure_directory(path):
    '''Create ``path`` (and missing parents) unless it already exists.'''
    if os.path.isdir(path):
        return
    try:
        os.makedirs(path, exist_ok=True)
    except OSError:
        print ("Creation of the directory %s failed" % path)
        # Copying into a directory that could not be created cannot succeed.
        raise
    else:
        print ("Successfully created the directory %s " % path)


def _index_annotations(json_path, img_path):
    '''Map each image path to the annotation files whose 'imagePath' names it.'''
    index = {}
    for file_name in os.listdir(json_path):
        if not file_name.endswith(".json"):
            continue
        full_json_name = os.path.join(json_path, file_name)
        with open(full_json_name) as json_file:
            content = json.load(json_file)
        image = os.path.join(img_path, content['imagePath'])
        index.setdefault(image, []).append(full_json_name)
    return index


def _copy_with_annotations(images, annotations, destination):
    '''Copy every existing image and its annotation files into ``destination``.'''
    for image in tqdm(images):
        if not os.path.isfile(image):
            continue
        for full_json_name in annotations.get(image, ()):
            shutil.copy(full_json_name, destination)
        shutil.copy(image, destination)


def train_test_split(img_names, Y, img_path=IMG_PATH, json_path=JSON_PATH,
                     train_path=TRAIN_PATH, test_path=TEST_PATH):
    '''
    Split the samples into train and test and copy the files accordingly.

    The first fold of a 5-split ``StratifiedKFold`` is used, so roughly one
    fifth of the samples go to test. Images that exist on disk are copied,
    together with the annotation files that reference them, into
    ``train_path`` or ``test_path``.

    Parameters:
        img_names: numpy array of image paths, one entry per sample.
        Y: numpy array of encoded labels, index-aligned with ``img_names``.
        img_path: directory joined with each annotation's ``'imagePath'`` to
            match it against ``img_names``.
        json_path: directory holding the ``.json`` annotation files.
        train_path: destination directory of the train partition.
        test_path: destination directory of the test partition.

    Raises:
        OSError: if a destination directory cannot be created or a file
            cannot be read or copied.
    '''
    # Split the data 4/5 - 1/5 using only the first fold.
    skf = StratifiedKFold(n_splits=5)
    train_index, test_index = next( skf.split(img_names, Y) )

    # Drop repeated paths (keeping order) so each image is copied once per
    # partition.
    # NOTE(review): duplicates are removed within each partition only; an
    # image listed several times in img_names may still land in both train
    # and test - confirm whether that overlap is acceptable.
    X_train = list(dict.fromkeys(img_names[train_index]))
    X_test = list(dict.fromkeys(img_names[test_index]))

    _ensure_directory(train_path)
    _ensure_directory(test_path)

    # Parse every annotation file once instead of once per image.
    annotations = _index_annotations(json_path, img_path)

    _copy_with_annotations(X_train, annotations, train_path)
    _copy_with_annotations(X_test, annotations, test_path)


def dataset_definition(img_path = IMG_PATH, json_path=JSON_PATH):
    '''
    Extract the labels, encode them and build the train/test folders.

    The split is stratified and uses only the first of five folds.

    Parameters:
        img_path: directory containing the images.
        json_path: directory containing the ``.json`` annotation files.
    '''
    img_names,labels,nlabels = extractLabels(img_path,json_path)
    encoded_labels = encodeLabels(labels,nlabels)
    # Forward the chosen directories so the copy step looks in the same
    # places the labels were read from.
    train_test_split(img_names, encoded_labels,
                     img_path=img_path, json_path=json_path)
def main():
    '''Script entry point: build the dataset with the default paths.'''
    dataset_definition()


if __name__ == "__main__":
    main()