// ImageIndex.cc
// Copyright 2012 Viewfinder. All rights reserved.
// Author: Peter Mattis.
//
// The idea behind the perceptual fingerprint is based on the papers "Fast
// Multiresolution Image Querying" (Jacobs et al.) and "Image Similarity
// Search with Compact Data Structures" (Lv et al.). The high-level approach:
// downsize the image to 32x32 grayscale (i.e. only consider luminance). The
// downsizing removes noise and some minor compression artifacts. Apply a 5x5
// box blur to remove more noise/compression artifacts. Normalize the image
// orientation. Apply the discrete Haar wavelet transform, which computes
// horizontal and vertical image gradients at various scales. Create a
// fingerprint from the resulting gradients by setting a bit in the
// fingerprint if the corresponding gradient has a non-negative value. The
// resulting fingerprint can be compared with another fingerprint using a
// simple hamming distance calculation. Similar fingerprints will correspond
// to similar images. Why does this work? The fingerprint captures image
// gradients at multiple scale levels, and similar images will have similar
// gradient profiles.
//
// Note that we can't use an exact match on the fingerprint for
// searching. Minor compression artifacts will still cause a few bits in the
// fingerprint to change. When searching for a matching fingerprint, brute
// force would be possible. The hamming distance calculation is extremely fast
// (xor + number-of-bits-set). But we can do a bit better since we're only
// searching for near-duplicate images with a small number of differences from
// a target fingerprint.
//
// The perceptual hash contains 160 bits. When performing a search, we want to
// find the other fingerprints which have a small hamming distance from our
// search fingerprint. For our fingerprint, a hamming distance <= 5% of the
// fingerprint length usually gives solid confirmation of near-duplicate
// images. 160 bits * 5% == 8 bits.
//
// When adding a fingerprint to the index, we index 12-bit N-grams (tags) for
// each 160-bit fingerprint; the final tag is 16 bits so that the 13 tags
// cover all 160 bits (see kTagLengths below). Because the 13 tags are
// non-overlapping, the pigeonhole principle guarantees that 2 fingerprints
// with a hamming distance <= 12 will have at least 1 tag in common (sketched
// below, after kMatchThreshold).
//
// A 12-bit tag size provides 4096 buckets. With 100,000 unique images
// indexed, we would expect ~25 fingerprints per bucket. A search needs to
// examine all 13 buckets for a fingerprint, giving an expected ~325
// fingerprint comparisons per search.
//
// TODO(peter): Can/should we incorporate chrominance into the fingerprint?
#import "DBFormat.h"
#import "ImageFingerprintPrivate.h"
#import "ImageIndex.h"
#import "Logging.h"
#import "Timer.h"
namespace {
// The tag lengths are expressed in hexadecimal characters (4 bits each) and
// must sum to 40 characters (12 * 3 + 4 = 40 hex characters = 160 bits).
const int kTagLengths[13] = {
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4,
};
const int kMatchThreshold = 12;
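// Illustrative sketch (not part of the original implementation and unused by
// the index; the name ExampleSplitIntoTags is ours): split the 40-character
// hex form of a 160-bit fingerprint term into its 13 tags using kTagLengths.
// At most kMatchThreshold (12) bits differ between two near-duplicate
// fingerprints and each differing bit can touch only one tag, so with 13
// non-overlapping tags at least one tag is identical in both fingerprints and
// the two fingerprints index into a common bucket.
vector<string> ExampleSplitIntoTags(const string& hex_term) {
  // hex_term is the BinaryToHex() form of a single term, i.e. 40 hexadecimal
  // characters (160 bits).
  vector<string> tags;
  for (int i = 0, k = 0; i < hex_term.size(); i += kTagLengths[k], ++k) {
    tags.push_back(hex_term.substr(i, kTagLengths[k]));
  }
  return tags;
}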
const string kImageIndexKeyPrefix = DBFormat::image_index_key("");
const DBRegisterKeyIntrospect kImageIndexKeyIntrospect(
kImageIndexKeyPrefix, NULL, [](Slice value) {
return DBIntrospect::FormatProto<ImageFingerprint>(value);
});
// #define SQUARE_ZIG_ZAG
#ifdef SQUARE_ZIG_ZAG
// Generate a vector of offsets for the zig-zag traversal of the upper-left
// NxN square region of an MxM matrix.
vector<int> MakeSquareZigZagOffsets(int n, int m) {
assert(m >= n);
vector<int> offsets(n * n);
int i = 0, j = 0;
int d = -1;
int start = 0;
int end = n * n - 1;
do {
offsets[start++] = i * m + j;
offsets[end--] = (n - i - 1) * m + n - j - 1;
i += d;
j -= d;
if (i < 0) {
++i;
d = -d;
} else if (j < 0) {
++j;
d = -d;
}
} while (start < end);
if (start == end) {
offsets[start] = i * m + j;
}
return offsets;
}
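// Illustrative example (not from the original file): for the full 3x3 square
// of a 3x3 matrix the returned offsets follow the familiar zig-zag order:
//
//   MakeSquareZigZagOffsets(3, 3) == {0, 1, 3, 6, 4, 2, 5, 7, 8}
//
// i.e. cells (0,0), (0,1), (1,0), (2,0), (1,1), (0,2), (1,2), (2,1), (2,2).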
#else // !SQUARE_ZIG_ZAG
// Generate a vector of offsets for the zig-zag traversal of the first K
// cells of an NxN matrix.
vector<int> MakeTriangularZigZagOffsets(int k, int n) {
assert(k < (n * n));
vector<int> offsets(k);
int i = 0, j = 0;
int d = -1;
int start = 0;
int end = n * n - 1;
do {
offsets[start++] = i * n + j;
if (end < k) {
offsets[end] = (n - i) * n - j - 1;
}
end--;
i += d;
j -= d;
if (i < 0) {
++i;
d = -d;
} else if (j < 0) {
++j;
d = -d;
}
} while (start < k && start < end);
if (start == end && start < k) {
offsets[start] = i * n + j;
}
return offsets;
}
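// Illustrative example (not from the original file): taking the first 6 cells
// of a 3x3 matrix in zig-zag order gives
//
//   MakeTriangularZigZagOffsets(6, 3) == {0, 1, 3, 6, 4, 2}
//
// i.e. cells (0,0), (0,1), (1,0), (2,0), (1,1), (0,2). Below, this selects
// the first kHaarHashBits + kHaarHashSkip coefficients of the kHaarSmallN x
// kHaarSmallN Haar-transformed image (see kZigZagOffsets).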
#endif // !SQUARE_ZIG_ZAG
string EncodeKey(const Slice& hex_term, int i, int n, const string& id) {
return Format("%s%02d:%s#%s", kImageIndexKeyPrefix,
i, hex_term.substr(i, n), id);
}
bool DecodeKey(Slice key, Slice* tag, Slice* id) {
if (!key.starts_with(kImageIndexKeyPrefix)) {
return false;
}
key.remove_prefix(kImageIndexKeyPrefix.size());
const int pos = key.find('#');
if (pos == key.npos) {
return false;
}
if (tag) {
*tag = key.substr(0, pos);
}
if (id) {
*id = key.substr(pos + 1);
}
return true;
}
StringSet GenerateKeys(const ImageFingerprint& f, const string& id) {
StringSet keys;
for (int i = 0; i < f.terms_size(); ++i) {
const string hex_term = BinaryToHex(f.terms(i));
const int n = hex_term.size();
for (int j = 0, k = 0, len; j < n; j += len, ++k) {
len = kTagLengths[k];
keys.insert(EncodeKey(hex_term, j, len, id));
}
}
return keys;
}
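// Illustrative example (assumed values, not from the original file): for a
// single-term fingerprint whose hex form is
// "0123456789abcdef0123456789abcdef01234567" and id "p1", GenerateKeys()
// produces 13 keys of the form
//
//   <kImageIndexKeyPrefix>00:012#p1
//   <kImageIndexKeyPrefix>03:345#p1
//   ...
//   <kImageIndexKeyPrefix>36:4567#p1
//
// i.e. one key per tag, prefixed by the tag's character offset within the hex
// term so that equal tag values at different positions land in different
// buckets.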
string Intersect(const Slice& a, const Slice& b) {
DCHECK_EQ(a.size(), b.size());
string r(a.ToString());
for (int i = 0; i < r.size(); ++i) {
r[i] ^= b[i];
}
return r;
}
int HammingDistance(const Slice& a, const Slice& b) {
DCHECK_EQ(a.size(), b.size());
DCHECK_EQ(0, a.size() % 4);
int count = 0;
const uint32_t* a_ptr = (const uint32_t*)a.data();
const uint32_t* b_ptr = (const uint32_t*)b.data();
for (int i = 0, n = a.size() / 4; i < n; ++i) {
count += __builtin_popcount(a_ptr[i] ^ b_ptr[i]);
}
return count;
}
} // namespace
#ifdef SQUARE_ZIG_ZAG
const vector<int> kZigZagOffsets = MakeSquareZigZagOffsets(
kHaarHashN, kHaarSmallN);
#else // !SQUARE_ZIG_ZAG
const vector<int> kZigZagOffsets = MakeTriangularZigZagOffsets(
kHaarHashBits + kHaarHashSkip, kHaarSmallN);
#endif // !SQUARE_ZIG_ZAG
ostream& operator<<(ostream& os, const ImageFingerprint& f) {
for (int i = 0; i < f.terms_size(); ++i) {
if (i > 0) {
os << ":";
}
os << BinaryToHex(f.terms(i));
}
return os;
}
ImageIndex::ImageIndex(bool histogram)
: histogram_(histogram ? new vector<int>(kHaarHashBits) : NULL),
histogram_count_(0) {
}
ImageIndex::~ImageIndex() {
}
void ImageIndex::Add(const ImageFingerprint& fingerprint,
const string& id, const DBHandle& updates) {
const StringSet keys = GenerateKeys(fingerprint, id);
for (StringSet::const_iterator iter(keys.begin());
iter != keys.end();
++iter) {
updates->PutProto(*iter, fingerprint);
}
if (histogram_.get()) {
// Keep a histogram of the bits set in the fingerprints so that we can
// verify in tests that every bit is being used.
for (int i = 0; i < fingerprint.terms_size(); ++i) {
const string& s = fingerprint.terms(i);
++histogram_count_;
const uint8_t* p = (const uint8_t*)s.data();
for (int j = 0; j < s.size() * 8; ++j) {
if (p[j / 8] & (1 << (j % 8))) {
(*histogram_)[j] += 1;
}
}
}
}
}
void ImageIndex::Remove(const ImageFingerprint& fingerprint,
const string& id, const DBHandle& updates) {
const StringSet keys = GenerateKeys(fingerprint, id);
for (StringSet::const_iterator iter(keys.begin());
iter != keys.end();
++iter) {
updates->Delete(*iter);
}
}
int ImageIndex::Search(const DBHandle& db, const ImageFingerprint& fingerprint,
StringSet* matched_ids) const {
const StringSet keys = GenerateKeys(fingerprint, "");
std::unordered_set<string> checked_ids;
int candidates = 0;
for (StringSet::const_iterator key_iter(keys.begin());
key_iter != keys.end();
++key_iter) {
for (DB::PrefixIterator iter(db, *key_iter);
iter.Valid();
iter.Next()) {
Slice id;
if (!DecodeKey(iter.key(), NULL, &id)) {
DCHECK(false);
continue;
}
const string id_str = id.ToString();
if (ContainsKey(checked_ids, id_str)) {
continue;
}
++candidates;
ImageFingerprint candidate_fingerprint;
if (!candidate_fingerprint.ParseFromArray(
iter.value().data(), iter.value().size())) {
DCHECK(false);
continue;
}
checked_ids.insert(id_str);
const int n = HammingDistance(fingerprint, candidate_fingerprint);
if (n > kMatchThreshold) {
continue;
}
matched_ids->insert(id_str);
}
}
return candidates;
}
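// Illustrative usage sketch (not part of the original file; "db" and
// "updates" stand for valid DBHandle instances and the two fingerprints are
// assumed to already exist):
//
//   ImageIndex index(false /* histogram */);
//   index.Add(fingerprint_a, "photo-a", updates);
//
//   StringSet matches;
//   const int candidates = index.Search(db, fingerprint_b, &matches);
//   // "photo-a" ends up in "matches" iff
//   // ImageIndex::HammingDistance(fingerprint_a, fingerprint_b) <= 12;
//   // the return value is the number of distinct candidates examined.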
string ImageIndex::PrettyHistogram() const {
if (!histogram_.get()) {
return "";
}
vector<string> v(kHaarSmallN * kHaarSmallN);
if (histogram_count_ > 0) {
for (int i = 0; i < kHaarHashSkip; ++i) {
v[kZigZagOffsets[i]] = " ";
}
for (int i = 0; i < histogram_->size(); ++i) {
const int val = (100 * (*histogram_)[i]) / histogram_count_;
v[kZigZagOffsets[i + kHaarHashSkip]] = Format("%3d", val);
}
}
string s;
for (int i = 0; i < kHaarSmallN; ++i) {
if (v[i + kHaarSmallN].empty()) {
break;
}
for (int j = 0; j < kHaarSmallN; ++j) {
const string& t = v[i * kHaarSmallN + j];
if (t.empty()) {
break;
}
s += " " + t;
}
s += "\n";
}
return s;
}
int ImageIndex::TotalTags(const DBHandle& db) const {
int count = 0;
for (DB::PrefixIterator iter(db, kImageIndexKeyPrefix);
iter.Valid();
iter.Next()) {
++count;
}
return count;
}
int ImageIndex::UniqueTags(const DBHandle& db) const {
string last_tag;
int count = 0;
for (DB::PrefixIterator iter(db, kImageIndexKeyPrefix);
iter.Valid();
iter.Next()) {
Slice tag;
if (!DecodeKey(iter.key(), &tag, NULL)) {
DCHECK(false);
continue;
}
if (last_tag != tag) {
++count;
last_tag = tag.ToString();
}
}
return count;
}
ImageFingerprint ImageIndex::Intersect(
const ImageFingerprint& a, const ImageFingerprint& b) {
ImageFingerprint f;
int best_dist = std::numeric_limits<int>::max();
int best_index_a = -1;
int best_index_b = -1;
for (int i = 0; i < a.terms_size(); ++i) {
for (int j = 0; j < b.terms_size(); ++j) {
const int dist = ::HammingDistance(a.terms(i), b.terms(j));
if (best_dist > dist) {
best_dist = dist;
best_index_a = i;
best_index_b = j;
}
}
}
f.add_terms(::Intersect(a.terms(best_index_a), b.terms(best_index_b)));
return f;
}
int ImageIndex::HammingDistance(
const ImageFingerprint& a, const ImageFingerprint& b) {
int best = std::numeric_limits<int>::max();
for (int i = 0; i < a.terms_size(); ++i) {
for (int j = 0; j < b.terms_size(); ++j) {
best = std::min(best, ::HammingDistance(a.terms(i), b.terms(j)));
}
}
return best;
}
// local variables:
// mode: c++
// end: