-
Notifications
You must be signed in to change notification settings - Fork 3
/
imageMaid.py
273 lines (226 loc) · 10.4 KB
/
imageMaid.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
# No copyright [email protected] 6/12/18
# Pseudo Code:
# iterate through the images in a given directory to: (simple to add a recursive function to do multiple dirs)
# identify the average file size (on disk) of the images
# remove files that are less than 10% of the file size of the average file size - those are too small to work with
# rename or convert all image files to .jpg for consistency
# hash each image and use the hash to check for duplicates - remove any duplicates discovered
# create a list of the different image sizes (height and width) - note the minimum and recommend uniform dimension
# resize images to the chosen uniform dimension
import time
import os
import cv2
import numpy as np
from glob import glob
from PIL import Image
import imagehash
from pydblite.pydblite import Base
import sys
# Root directory that holds one sub-directory of images per class.
basepath = "./dataset/"
# Class (sub-directory) names to process, one per non-blank line of cars.txt.
subdirlist = []
# supply a list of classes (objects) to create a dataset
# The context manager closes the file handle once the names have been read.
with open('cars.txt', 'r') as listitems:
    for item in listitems:
        # drop the trailing newline and any surrounding whitespace
        item = item.strip()
        # A blank entry joined onto basepath is the dataset root itself, so it
        # must never be queued as a class name.
        if item:
            subdirlist.append(item)
def resize(classdir, height, width):
    """Scale every .jpg in classdir to `width` pixels wide and pad short ones.

    Each image is scaled to exactly `width` columns with its aspect ratio
    kept. When the scaled image has fewer than `height` rows, the missing
    rows are split between the top and bottom edges (the extra row of an odd
    difference goes to the bottom) and filled by replicating the border
    pixels. Images that come out taller than `height` are written as they
    are; nothing is cropped.

    Results are written under basepath into a directory named
    "<class>_<height>x<width>", created when missing. The source files are
    not modified. A failure on one image is reported and the remaining
    images are still processed.

    classdir -- directory containing the .jpg files of one class
    height   -- minimum output height in pixels; shorter images are padded
    width    -- output width in pixels
    """
    # glob once so the progress total and the loop agree on the same listing
    image_paths = glob(classdir + "/*.jpg")
    filecount = len(image_paths)
    # by default save images to a new directory
    newdir = os.path.basename(os.path.normpath(classdir))
    newdir = newdir + "_" + str(height) + "x" + str(width)
    newpath = basepath + newdir
    if not os.path.isdir(newpath):
        os.makedirs(newpath)
    for index, imagePath in enumerate(image_paths, 1):
        try:
            img = cv2.imread(imagePath)
            if img is None:
                # cv2.imread reports an unreadable file by returning None
                raise IOError('image could not be read')
            # no cropping - scale the image width to the width parameter and
            # keep the aspect ratio
            scale = float(width) / img.shape[1]
            # cv2.resize cannot produce a zero-row image
            h = max(1, int(img.shape[0] * scale))
            resized = cv2.resize(img, (width, h), interpolation=cv2.INTER_AREA)
            # do we pad? If the scaled height is below the target we will pad.
            if h < height:
                # split the difference between top and bottom; integer
                # division keeps both paddings ints on Python 2 and 3
                diff = height - h
                toppad = diff // 2
                botpad = diff - toppad
                resized = cv2.copyMakeBorder(resized, toppad, botpad, 0, 0, cv2.BORDER_REPLICATE)
            cv2.imwrite(newpath + '/' + os.path.basename(imagePath), resized)
            sys.stdout.write("resizing image #{} of {} \r".format(index, filecount))
            sys.stdout.flush()
            # NOTE(review): this pause looks like it only exists to keep the
            # progress line readable - confirm before removing it.
            time.sleep(0.1)
        except Exception as e:
            print('error resizing/padding image: {} . message: {}'.format(imagePath, e))
def inventoryshape(classdir):
    """Tally the pixel dimensions of the .jpg files in classdir and report.

    Every readable image is counted under its (height, width) shape. After
    the scan, the number and percentage of images whose height is below 590
    pixels is printed as a warning that they may lose fidelity at 600x800.
    Unreadable images are reported and left out of the tally; they still
    count towards the total used for the percentage.

    classdir -- directory containing the .jpg files of one class
    """
    image_paths = glob(classdir + "/*.jpg")
    filecount = len(image_paths)
    print('determining optimal image resolution...')
    # (height, width) -> number of images with exactly that shape
    shapecounts = {}
    for index, imagePath in enumerate(image_paths, 1):
        try:
            img = cv2.imread(imagePath)
            if img is None:
                # cv2.imread reports an unreadable file by returning None
                raise IOError('image could not be read')
            shape = (img.shape[0], img.shape[1])
            shapecounts[shape] = shapecounts.get(shape, 0) + 1
            sys.stdout.write("reading shape for image #{} of {} \r".format(index, filecount ))
            sys.stdout.flush()
            # NOTE(review): this pause looks like it only exists to keep the
            # progress line readable - confirm before removing it.
            time.sleep(0.1)
        except Exception as e:
            print('error processing image {}: {}'.format(imagePath, e))
    # need to add some more intelligence to this bit of code to auto-select the best image size
    # Sum the per-shape image counts so the figure is a number of images,
    # not a number of distinct shapes.
    heightcounter = sum(count for (h, _w), count in shapecounts.items() if h < 590)
    # real percentage; an empty directory must not divide by zero
    percent = round(100.0 * heightcounter / filecount, 1) if filecount else 0.0
    print('{} images ({}%) may lose fidelity by converting to 600x800 pixels'.format(heightcounter,
                                                                                      percent))
def getstats(classdir):
    """Delete files in classdir smaller than 10% of the average file size.

    The average is taken over the on-disk size of every regular file directly
    inside classdir (surrounding whitespace in classdir is stripped first).
    Sub-directories are ignored. Each deletion is announced on stdout.
    Problems listing the directory or removing a file are printed rather
    than raised, and a failed removal does not stop the remaining files from
    being checked.

    classdir -- directory containing the files of one class
    """
    classdir = classdir.strip()
    try:
        candidates = [os.path.join(classdir, name) for name in os.listdir(classdir)]
        # Only regular files take part: os.remove() cannot delete a directory,
        # and a directory's reported size says nothing about image quality.
        sized = [(path, os.path.getsize(path)) for path in candidates if os.path.isfile(path)]
    except OSError as e:
        print('Error in the getstats() function: {}'.format(e))
        return
    print("class count: {} ".format(len(sized)))
    if not sized:
        # nothing to average, nothing to prune
        return
    # Convert to Numpy - because Numpy is better
    npfilesize = np.asarray([size for _path, size in sized])
    # divide the average once, in floating point, instead of truncating every
    # element with an integer division first
    cutoff = np.average(npfilesize) / 10.0
    for path, size in sized:
        if size < cutoff:
            print("deleting file {} it is only {} bytes ({} kb)".format(os.path.basename(path), size, size // 1024))
            try:
                os.remove(path)
            except OSError as e:
                print('Error in the getstats() function: {}'.format(e))
def convertrename(classdir):
    """Give the .png and .jpeg images in classdir a .jpg counterpart.

    Every *.png file is decoded with OpenCV and written again next to the
    original under the same base name with a .jpg extension; the .png file
    itself is left in place. Every *.jpeg file is renamed to .jpg. A failure
    on one file is reported and the remaining files are still processed.

    classdir -- directory containing the images of one class
    """
    pngs = glob(classdir + '/*.png')
    if pngs:
        for p in pngs:
            try:
                img = cv2.imread(p)
                if img is None:
                    # cv2.imread reports an unreadable file by returning None
                    raise IOError('image could not be read')
                cv2.imwrite(os.path.splitext(p)[0] + '.jpg', img)
            except Exception:
                print('Error converting {} to JPG format'.format(p))
    else:
        print('No .PNG files detected')
    jpegs = glob(classdir + '/*.jpeg')
    if jpegs:
        for j in jpegs:
            try:
                os.rename(j, os.path.splitext(j)[0] + '.jpg')
            except OSError:
                print('Error renaming {} to .JPG format'.format(j))
    else:
        print('No .JPEG files detected')
def detectduplicates(classdir):
    """Delete .jpg files in classdir whose dHash fingerprint was already seen.

    Each image is hashed with imagehash.dhash. The first file met with a
    given hash is kept; every later file with the same hash string is removed
    from disk. The number of removed files is printed at the end. A failure
    on one image is reported and the remaining images are still processed.

    classdir -- directory containing the .jpg files of one class
    """
    image_paths = glob(classdir + "/*.jpg")
    filecount = len(image_paths)
    duplicatecount = 0
    # hash strings of the images kept so far
    seen = set()
    print("creating image fingerprints for de-duplication ...")
    for index, imagePath in enumerate(image_paths, 1):
        try:
            if not os.path.exists(imagePath):
                continue
            # Close the file before it may be deleted below; an open handle
            # leaks and blocks os.remove() on Windows.
            with Image.open(imagePath) as image:
                h = str(imagehash.dhash(image))
            sys.stdout.write("fingerprint created for image # {} of {} \r".format(index, filecount))
            sys.stdout.flush()
            # NOTE(review): this pause looks like it only exists to keep the
            # progress line readable - confirm before removing it.
            time.sleep(0.1)
            if h in seen:
                # This image is a duplicate - delete it
                duplicatecount = duplicatecount + 1
                os.remove(imagePath)
            else:
                seen.add(h)
        except Exception as e:
            print('Error in detectduplicates() function: {}'.format(e))
    print('Hashing complete. {} duplicate images were removed from this class repository'.format(duplicatecount))
def processclass(classdir):
    """Run the whole clean-up pipeline on one class directory.

    Steps, in order: prune undersized files (getstats), normalise file
    extensions to .jpg (convertrename), remove duplicates
    (detectduplicates), report on image shapes (inventoryshape) and write
    resized copies at 600x800 (resize). When classdir is not a directory the
    class is skipped with a message. Any exception escaping a step is printed
    and ends the processing of this class.

    classdir -- directory containing the images of one class
    """
    try:
        if not os.path.isdir(classdir):
            print('Unable to find directory: {} Skipping that class.'.format(classdir))
            return
        # announce each step before it runs, not after it has finished
        print("removing images that are less than 10% of the average filesize ...")
        print("this will remove any 0kb files and hopefully any corrupt files prior to analyzing with OpenCV...")
        print(" ")
        getstats(classdir)
        print("also converting all .png and renaming .jpeg images to .jpg for consistency")
        convertrename(classdir)
        # detect and delete duplicates
        detectduplicates(classdir)
        # take an inventory of the image sizes
        inventoryshape(classdir)
        # re-format image size with opencv
        resize(classdir, 600, 800)
    except Exception as e:
        print(str(e))
# Script driver: run the clean-up pipeline once for every queued class name.
for subdir in subdirlist:
    classname = subdir.strip()
    if not classname:
        # os.path.join(basepath, "") is basepath itself; the dataset root
        # must never be processed (and pruned) as if it were a class.
        continue
    classdir = os.path.join(basepath, classname)
    print("\n\n\n")
    print("*********************")
    print("class:: {} ".format(classname))
    processclass(classdir)