Skip to content
This repository has been archived by the owner on Sep 20, 2021. It is now read-only.

Commit

Permalink
Displaying sources and duplicates from ImageLinks data (#1)
Browse files Browse the repository at this point in the history
* Show duplicates badge

* Initial imagelink extraction script

* Add merging to extraction script

* Clean up extraction script and update static files

* Update gitignore

* Clean up TODOs

* PR feedback

* Add full_info.txt
  • Loading branch information
tikul authored Mar 8, 2020
1 parent 54457a9 commit 3e67f20
Show file tree
Hide file tree
Showing 26 changed files with 186,928 additions and 86 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
.DS_Store
data/
img/
log/
*.pyc
monitor.sh
monitor.sh
24 changes: 24 additions & 0 deletions backend/generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
DIM = 2048               # feature-vector dimensionality of the index
TOTAL_NUM_ELEMENTS = 0   # NOTE(review): presumably set when the index loads — confirm
ELEMENTS = []            # index position -> image path (used as ELEMENTS[index])
DUPLICATE_COUNTS = {}    # md5 -> duplicate count, filled by loadMetadata()
IMAGE_SOURCES = {}       # md5 -> list of source URLs, filled by loadMetadata()
HNSW = None              # assumed to hold the HNSW index after loadHNSW() — TODO confirm


Expand Down Expand Up @@ -43,8 +45,10 @@ def gen_random(path): # Show top 10 closest images for an entry
path = ELEMENTS[index]
res.append({
'distance': str(dist),
'duplicates': getDuplicateCountByPath(path),
'imgPath': genExternalImageURLByPath(path),
'refURL': genReferenceURL(path),
'sources': getSourcesByPath(path),
})
# print("Label:", class_labels[idx])

Expand All @@ -71,6 +75,16 @@ def loadHNSW(loadFromIndex=131490):
print('<< [Loading HNSW] done')


def loadMetadata(filepath='full_info.txt'):
    """Populate DUPLICATE_COUNTS and IMAGE_SOURCES from a metadata file.

    Each line of *filepath* has the form:
        <filename> <duplicate_count> <source_url> <source_url> ...
    The md5 is taken from the filename (text before the first dot).
    """
    # `with` ensures the file is closed (the original leaked the handle).
    with open(filepath, 'r') as inputfile:
        for line in inputfile:
            parsed_line = line.strip().split()
            if len(parsed_line) < 2:
                continue  # skip blank or malformed lines instead of crashing
            filename = parsed_line[0]
            md5 = filename.split(".")[0]
            DUPLICATE_COUNTS[md5] = int(parsed_line[1])
            IMAGE_SOURCES[md5] = parsed_line[2:]


# Utils Functions
def genExternalImageURLByPath(full_path):
path = full_path[4:] # get rid of "img/" prefix
Expand All @@ -79,3 +93,13 @@ def genExternalImageURLByPath(full_path):
def genReferenceURL(full_path):
    """Build an absolute reference URL for the image at *full_path*."""
    relative = full_path[4:]  # drop the leading "img/" prefix
    return url_for('serveReact', path=relative, _external=True)

def getDuplicateCountByPath(full_path):
    """Duplicate count recorded for the image at *full_path*."""
    # Strip the "img/" prefix, then key by md5 (filename without extension).
    md5 = full_path[4:].split(".")[0]
    return DUPLICATE_COUNTS[md5]

def getSourcesByPath(full_path):
    """Source URLs recorded for the image at *full_path*."""
    # Strip the "img/" prefix, then key by md5 (filename without extension).
    md5 = full_path[4:].split(".")[0]
    return IMAGE_SOURCES[md5]
186,556 changes: 186,556 additions & 0 deletions full_info.txt

Large diffs are not rendered by default.

242 changes: 242 additions & 0 deletions script/extract-all-parquets-multi.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,242 @@
import collections
import threading
import pyarrow.parquet as pq
from glob import glob
import base64
import io
import os
import os.path
import time
from PIL import Image
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

# Directories holding the parquet shards (files matched as part-*.parquet).
IMAGE_DIR = "data/images"          # image tables (md5, url, bytes, ...)
IMAGELINK_DIR = "data/imagegraph"  # image-link tables (image_url, src, ...)

# Main Function
def main():
    """Run the full pre-processing pipeline: two extractions, then a merge."""
    # (label, base_dir, output_path, primary_key, collect_key, save_ext)
    extraction_jobs = (
        ('image extraction', IMAGE_DIR, "image_info.txt", 'md5', 'url', True),
        ('image sources extraction', IMAGELINK_DIR, "image_links.txt",
         'image_url', 'src', False),
    )
    for label, base_dir, out_path, primary, collect, keep_ext in extraction_jobs:
        print(f'>> [Pre-process] Starting {label}')
        extractParquet(base_dir,
                       out_path,
                       primary_key=primary,
                       collect_key=collect,
                       save_ext=keep_ext,
                       save_image=False)  # Change save_image to `True` if we want to re-download images

    print('>> [Pre-process] Starting merge of image data and image links')
    merge("image_info.txt", "image_links.txt", "full_info.txt")


def extractParquet(base_dir, output_path, primary_key, collect_key, save_ext, save_image):
    """Extract data from every parquet shard under *base_dir* into *output_path*.

    Each output line is '<primary_key> [<ext>] <collected...>', where the
    optional extension column is written only when *save_ext* is true.
    One worker thread is spawned per shard, at most 20 in flight at a time.

    Args:
        base_dir: directory containing part-*.parquet shards.
        output_path: text file to (re)create with the merged results.
        primary_key: row column used as the output key (e.g. 'md5').
        collect_key: row column whose values are collected per key (e.g. 'url').
        save_ext: also record the file extension per key.
        save_image: decode and save images to disk via ImageSaver.
    """
    print(f'>> [Pre-process] Starting extraction of parquets in {base_dir}')

    if save_image:
        emptyFolder()

    removeIfExist(output_path)

    threads = list()
    # ext_dicts[i] is a map from `primary_key` to image extension
    # collect_dicts[i] is a map from `primary_key` to a set of `collect_key`
    ext_dicts = list()
    collect_dicts = list()

    saver = ImageSaver()
    for index, table_path in enumerate(glob(os.path.join(base_dir, 'part-*.parquet'))):
        ext_dicts.append({})
        collect_dicts.append(collections.defaultdict(set))
        worker = threading.Thread(target=processTable, args=(
            index, saver, table_path, ext_dicts[index], collect_dicts[index],
            primary_key, collect_key, save_ext, save_image))
        threads.append(worker)
        worker.start()
        # Rate limiting: wait for the current batch once 20 shards are in
        # flight (len(threads) replaces the redundant manual counter).
        if len(threads) == 20:
            for thread in threads:
                thread.join()
            threads = list()

    for thread in threads:
        thread.join()

    # Merge results across all parquets

    extensions = {}
    if save_ext:
        for ext_dict in ext_dicts:
            # This overwrites existing values in extensions, but in this case we don't care
            extensions.update(ext_dict)

    collected_keys = collections.defaultdict(set)
    for collect_dict in collect_dicts:
        for key, values in collect_dict.items():
            collected_keys[key].update(values)

    with open(output_path, 'w') as output:
        for key, collected in collected_keys.items():
            collected_str = ' '.join(collected)
            if save_ext:
                line = f'{key} {extensions[key]} {collected_str}'
            else:
                line = f'{key} {collected_str}'

            output.write(line + '\n')

    print(f'<< [Pre-process] Finished processing {len(collected_keys)} entries')


def processTable(t_index, saver, table_path, ext_dict, collect_dict,
                 primary_key, collect_key, save_ext, save_image):
    """Worker: scan one parquet shard, filling ext_dict/collect_dict in place.

    Args:
        t_index: shard index (only used for progress logging).
        saver: shared ImageSaver used when save_image is true.
        table_path: path to the parquet file.
        ext_dict: out-param, primary_key -> file extension.
        collect_dict: out-param, primary_key -> set of collect_key values.
    """
    table = toPandasTable(table_path)
    num_elements = len(table)

    # About 10 progress lines per large table; tiny tables log only at start.
    report_interval = num_elements if num_elements < 10000 else (num_elements//10)
    for index in range(num_elements):
        if index % report_interval == 0:
            print(
                f'>> [Pre-process][Table {t_index + 1}][Image][{index}/{num_elements}]')

        try:
            row = table.loc[index]
            dict_key = row[primary_key]
            if not dict_key:  # Sometimes `image_url` or `md5` are empty
                continue

            collect_dict[dict_key].add(row[collect_key])

            if save_ext:
                # TODO: Can we use row['extension']?
                ext_dict[dict_key] = extensionForRow(row)

            if save_image:
                # Bug fix: the original appended to an undefined `ids` set
                # here, raising a NameError that the except below swallowed.
                # ImageSaver already tracks saved paths itself.
                saver.save(row)
        except Exception as e:
            print(f">> [Pre-process] Skipping row {index}. Reason: {e}")

    print(
        f'<< [Pre-process][Table {t_index + 1}][Image][{num_elements}/{num_elements}]')


def merge(file_path1, file_path2, output_path):
    """Join image info with image-link sources into one metadata file.

    Args:
        file_path1: lines of '<md5> <ext> <url1> <url2> ...'.
        file_path2: lines of '<url> <src1> <src2> ...'.
        output_path: written as '<md5>.<ext> <duplicate_count> <src...>',
            where duplicate_count is the number of URLs for that md5.
    """
    image_links = {}

    extension = {}
    duplicate_count = {}
    image_sources = collections.defaultdict(set)

    t1 = time.time()
    with open(file_path2, 'r') as file:
        for row, line in enumerate(file):
            # url src1 src2 src3
            try:
                parsed_line = line.strip().split()
                url = parsed_line[0]
                image_links[url] = set(parsed_line[1:])
            except Exception as e:
                print(f'>> [Merging] Cannot read row {row} (length: {len(parsed_line)}). Reason: {e}')

    print(f'>> [Merging] Loaded {file_path2} in {time.time() - t1} seconds')

    with open(file_path1, 'r') as file:
        for line in file:
            # md5 ext url1 url2 url3
            parsed_line = line.strip().split()
            if len(parsed_line) < 2:
                continue  # skip blank/malformed lines instead of crashing
            md5 = parsed_line[0]
            extension[md5] = parsed_line[1]
            duplicate_count[md5] = len(parsed_line[2:])

            for url in parsed_line[2:]:
                # A URL with no recorded sources contributes nothing; the
                # original indexed image_links[url] directly and a single
                # missing URL aborted the entire merge with KeyError.
                image_sources[md5].update(image_links.get(url, ()))

    with open(output_path, 'w') as output:
        for md5, ext in extension.items():
            filename = f'{md5}.{ext}'
            duplicates = duplicate_count[md5]
            sources = ' '.join(image_sources[md5])
            # Bug fix: the original wrote a stray literal instead of
            # `filename`, leaving `filename` computed but unused.
            output.write(f'{filename} {duplicates} {sources}\n')



# Utils Functions
def emptyFolder(folder_path='./img/*'):
    """Delete every file matching the glob pattern *folder_path*."""
    stale = glob(folder_path)
    while stale:
        os.remove(stale.pop())


def toPandasTable(path):
    """Read the parquet file at *path* and return it as a pandas DataFrame."""
    return pq.read_table(path).to_pandas()


def extensionForRow(row):
    """Return the extension part (after the last dot) of the row's filename."""
    return row['filename'].rsplit(".", 1)[-1]

def genImagePath(row):
    """Build the '<md5>.<ext>' filename for a table row."""
    extension = row['filename'].split(".")[-1]
    return f"{row['md5']}.{extension}"


def genImageFullPath(row):
    """Path under ./img/ where the row's image is stored."""
    return f"./img/{genImagePath(row)}"


def removeIfExist(path):
    """Delete *path* if it exists; silently do nothing otherwise."""
    if not os.path.exists(path):
        return
    os.remove(path)


def toPILImage(row, target_size=None):
    """Decode the base64 image bytes in *row* into an RGB PIL image.

    Returns False on failure (callers check with ``is False``); optionally
    resizes to *target_size* when given.
    """
    raw = base64.b64decode(row.bytes)
    try:
        img = Image.open(io.BytesIO(raw)).convert('RGB')
        return img if target_size is None else img.resize(target_size)
    except Exception as e:
        print('[util][toPILImage] Failed:', e)
        return False


class ImageSaver:
    """Thread-safe writer that saves each unique image path at most once."""

    def __init__(self):
        # Output paths already claimed by some thread.
        self.ids = set()
        self.mutex = threading.Lock()

    def save(self, row):
        """Decode and save the row's image; return True iff this call saved it.

        Returns False when decoding fails or when the path was already
        saved by an earlier call (possibly from another thread).
        """
        img = toPILImage(row)
        if img is False:
            return False

        output_path = genImageFullPath(row)
        # `with` guarantees the lock is released even if the set ops raise.
        with self.mutex:
            first_visit = output_path not in self.ids
            if first_visit:
                self.ids.add(output_path)

        # Bug fix: the original check was inverted — it wrote the image only
        # on a *repeat* visit, so first occurrences were never saved and the
        # return value signalled "duplicate" rather than "saved".
        if first_visit:
            img.save(output_path)

        return first_visit


# Entry point: run the multi-threaded extraction + merge pipeline.
if __name__ == '__main__':
    main()
35 changes: 0 additions & 35 deletions script/remove-duplicates.py

This file was deleted.

3 changes: 2 additions & 1 deletion server.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from flask_cors import CORS
import os
from flask import Flask, jsonify, send_from_directory, abort
from backend.generator import loadHNSW, gen_random
from backend.generator import loadMetadata, loadHNSW, gen_random
app = Flask(__name__, static_folder='ui/build')

# Local Web Dev Allow CORS
Expand Down Expand Up @@ -44,6 +44,7 @@ def serveImages(path):
if __name__ == '__main__':
# Preparation
loadHNSW()
loadMetadata()
# Production Mode
app.run(
host='0.0.0.0',
Expand Down
22 changes: 11 additions & 11 deletions ui/build/asset-manifest.json
Original file line number Diff line number Diff line change
@@ -1,22 +1,22 @@
{
"files": {
"main.css": "/static/css/main.0c0e79a1.chunk.css",
"main.js": "/static/js/main.dc87cead.chunk.js",
"main.js.map": "/static/js/main.dc87cead.chunk.js.map",
"main.css": "/static/css/main.ea16c8ab.chunk.css",
"main.js": "/static/js/main.d505ba22.chunk.js",
"main.js.map": "/static/js/main.d505ba22.chunk.js.map",
"runtime-main.js": "/static/js/runtime-main.fda89fee.js",
"runtime-main.js.map": "/static/js/runtime-main.fda89fee.js.map",
"static/js/2.ece64702.chunk.js": "/static/js/2.ece64702.chunk.js",
"static/js/2.ece64702.chunk.js.map": "/static/js/2.ece64702.chunk.js.map",
"static/js/2.50432d61.chunk.js": "/static/js/2.50432d61.chunk.js",
"static/js/2.50432d61.chunk.js.map": "/static/js/2.50432d61.chunk.js.map",
"index.html": "/index.html",
"precache-manifest.8d5b440d1fb6d1a861aa65f0957c11f5.js": "/precache-manifest.8d5b440d1fb6d1a861aa65f0957c11f5.js",
"precache-manifest.fa96f152633727586692ae4d3b668f33.js": "/precache-manifest.fa96f152633727586692ae4d3b668f33.js",
"service-worker.js": "/service-worker.js",
"static/css/main.0c0e79a1.chunk.css.map": "/static/css/main.0c0e79a1.chunk.css.map",
"static/js/2.ece64702.chunk.js.LICENSE": "/static/js/2.ece64702.chunk.js.LICENSE"
"static/css/main.ea16c8ab.chunk.css.map": "/static/css/main.ea16c8ab.chunk.css.map",
"static/js/2.50432d61.chunk.js.LICENSE": "/static/js/2.50432d61.chunk.js.LICENSE"
},
"entrypoints": [
"static/js/runtime-main.fda89fee.js",
"static/js/2.ece64702.chunk.js",
"static/css/main.0c0e79a1.chunk.css",
"static/js/main.dc87cead.chunk.js"
"static/js/2.50432d61.chunk.js",
"static/css/main.ea16c8ab.chunk.css",
"static/js/main.d505ba22.chunk.js"
]
}
2 changes: 1 addition & 1 deletion ui/build/index.html
Original file line number Diff line number Diff line change
@@ -1 +1 @@
<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="/favicon.ico"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="/logo192.png"/><link rel="manifest" href="/manifest.json"/><title>AUT Demo</title><link href="/static/css/main.0c0e79a1.chunk.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div><script>!function(f){function e(e){for(var r,t,n=e[0],o=e[1],u=e[2],i=0,l=[];i<n.length;i++)t=n[i],Object.prototype.hasOwnProperty.call(p,t)&&p[t]&&l.push(p[t][0]),p[t]=0;for(r in o)Object.prototype.hasOwnProperty.call(o,r)&&(f[r]=o[r]);for(s&&s(e);l.length;)l.shift()();return c.push.apply(c,u||[]),a()}function a(){for(var e,r=0;r<c.length;r++){for(var t=c[r],n=!0,o=1;o<t.length;o++){var u=t[o];0!==p[u]&&(n=!1)}n&&(c.splice(r--,1),e=i(i.s=t[0]))}return e}var t={},p={1:0},c=[];function i(e){if(t[e])return t[e].exports;var r=t[e]={i:e,l:!1,exports:{}};return f[e].call(r.exports,r,r.exports,i),r.l=!0,r.exports}i.m=f,i.c=t,i.d=function(e,r,t){i.o(e,r)||Object.defineProperty(e,r,{enumerable:!0,get:t})},i.r=function(e){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(e,"__esModule",{value:!0})},i.t=function(r,e){if(1&e&&(r=i(r)),8&e)return r;if(4&e&&"object"==typeof r&&r&&r.__esModule)return r;var t=Object.create(null);if(i.r(t),Object.defineProperty(t,"default",{enumerable:!0,value:r}),2&e&&"string"!=typeof r)for(var n in r)i.d(t,n,function(e){return r[e]}.bind(null,n));return t},i.n=function(e){var r=e&&e.__esModule?function(){return e.default}:function(){return e};return i.d(r,"a",r),r},i.o=function(e,r){return Object.prototype.hasOwnProperty.call(e,r)},i.p="/";var 
r=this.webpackJsonpui=this.webpackJsonpui||[],n=r.push.bind(r);r.push=e,r=r.slice();for(var o=0;o<r.length;o++)e(r[o]);var s=n;a()}([])</script><script src="/static/js/2.ece64702.chunk.js"></script><script src="/static/js/main.dc87cead.chunk.js"></script></body></html>
<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="/favicon.ico"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="/logo192.png"/><link rel="manifest" href="/manifest.json"/><title>AUT Demo</title><link href="/static/css/main.ea16c8ab.chunk.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div><script>!function(f){function e(e){for(var r,t,n=e[0],o=e[1],u=e[2],i=0,l=[];i<n.length;i++)t=n[i],Object.prototype.hasOwnProperty.call(p,t)&&p[t]&&l.push(p[t][0]),p[t]=0;for(r in o)Object.prototype.hasOwnProperty.call(o,r)&&(f[r]=o[r]);for(s&&s(e);l.length;)l.shift()();return c.push.apply(c,u||[]),a()}function a(){for(var e,r=0;r<c.length;r++){for(var t=c[r],n=!0,o=1;o<t.length;o++){var u=t[o];0!==p[u]&&(n=!1)}n&&(c.splice(r--,1),e=i(i.s=t[0]))}return e}var t={},p={1:0},c=[];function i(e){if(t[e])return t[e].exports;var r=t[e]={i:e,l:!1,exports:{}};return f[e].call(r.exports,r,r.exports,i),r.l=!0,r.exports}i.m=f,i.c=t,i.d=function(e,r,t){i.o(e,r)||Object.defineProperty(e,r,{enumerable:!0,get:t})},i.r=function(e){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(e,"__esModule",{value:!0})},i.t=function(r,e){if(1&e&&(r=i(r)),8&e)return r;if(4&e&&"object"==typeof r&&r&&r.__esModule)return r;var t=Object.create(null);if(i.r(t),Object.defineProperty(t,"default",{enumerable:!0,value:r}),2&e&&"string"!=typeof r)for(var n in r)i.d(t,n,function(e){return r[e]}.bind(null,n));return t},i.n=function(e){var r=e&&e.__esModule?function(){return e.default}:function(){return e};return i.d(r,"a",r),r},i.o=function(e,r){return Object.prototype.hasOwnProperty.call(e,r)},i.p="/";var 
r=this.webpackJsonpui=this.webpackJsonpui||[],n=r.push.bind(r);r.push=e,r=r.slice();for(var o=0;o<r.length;o++)e(r[o]);var s=n;a()}([])</script><script src="/static/js/2.50432d61.chunk.js"></script><script src="/static/js/main.d505ba22.chunk.js"></script></body></html>
Loading

0 comments on commit 3e67f20

Please sign in to comment.