This repository has been archived by the owner on Sep 20, 2021. It is now read-only.
Commit
Displaying sources and duplicates from ImageLinks data (#1)
* Show duplicates badge
* Initial imagelink extraction script
* Add merging to extraction script
* Clean up extraction script and update static files
* Update gitignore
* Clean up TODOs
* PR feedback
* Add full_info.txt
Showing 26 changed files with 186,928 additions and 86 deletions.
.gitignore
@@ -1,5 +1,6 @@
.DS_Store
data/
img/
log/
*.pyc
monitor.sh
New file: imagelink extraction script
@@ -0,0 +1,242 @@
import collections
import threading
import pyarrow.parquet as pq
from glob import glob
import base64
import io
import os
import os.path
import time
from PIL import Image
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

# ./data/part-
IMAGE_DIR = "data/images"
IMAGELINK_DIR = "data/imagegraph"


# Main Function
def main():
    print('>> [Pre-process] Starting image extraction')
    extractParquet(IMAGE_DIR,
                   "image_info.txt",
                   primary_key='md5',
                   collect_key='url',
                   save_ext=True,
                   save_image=False)  # Change save_image to `True` if we want to re-download images

    print('>> [Pre-process] Starting image sources extraction')
    extractParquet(IMAGELINK_DIR,
                   "image_links.txt",
                   primary_key='image_url',
                   collect_key='src',
                   save_ext=False,
                   save_image=False)

    print('>> [Pre-process] Starting merge of image data and image links')
    merge("image_info.txt", "image_links.txt", "full_info.txt")


def extractParquet(base_dir, output_path, primary_key, collect_key, save_ext, save_image):
    print(f'>> [Pre-process] Starting extraction of parquets in {base_dir}')

    if save_image:
        emptyFolder()

    removeIfExist(output_path)

    threads = list()
    # ext_dicts[i] is a map from `primary_key` to image extension
    # collect_dicts[i] is a map from `primary_key` to a set of `collect_key`
    ext_dicts = list()
    collect_dicts = list()

    saver = ImageSaver()
    total_threads = 0
    for index, table_path in enumerate(glob(os.path.join(base_dir, 'part-*.parquet'))):
        ext_dicts.append({})
        collect_dicts.append(collections.defaultdict(set))
        x = threading.Thread(target=processTable, args=(
            index, saver, table_path, ext_dicts[index], collect_dicts[index],
            primary_key, collect_key, save_ext, save_image))
        threads.append(x)
        x.start()
        # Rate limiting: wait for the current batch of 20 threads before starting more
        total_threads += 1
        if total_threads == 20:
            for thread in threads:
                thread.join()
            threads = list()
            total_threads = 0

    for thread in threads:
        thread.join()

    # Merge results across all parquets

    extensions = {}
    if save_ext:
        for ext_dict in ext_dicts:
            # This overwrites existing values in extensions, but in this case we don't care
            extensions.update(ext_dict)

    collected_keys = collections.defaultdict(set)
    for collect_dict in collect_dicts:
        for k, v in collect_dict.items():
            collected_keys[k].update(v)

    with open(output_path, 'w') as output:
        for key, collected in collected_keys.items():
            collected_str = ' '.join(collected)
            if save_ext:
                line = f'{key} {extensions[key]} {collected_str}'
            else:
                line = f'{key} {collected_str}'

            output.write(line + '\n')

    print(f'<< [Pre-process] Finished processing {len(collected_keys)} entries')


def processTable(t_index, saver, table_path, ext_dict, collect_dict,
                 primary_key, collect_key, save_ext, save_image):
    table = toPandasTable(table_path)
    num_elements = len(table)

    report_interval = num_elements if num_elements < 10000 else (num_elements // 10)
    for index in range(num_elements):
        if index % report_interval == 0:
            print(
                f'>> [Pre-process][Table {t_index + 1}][Image][{index}/{num_elements}]')

        try:
            row = table.loc[index]
            dict_key = row[primary_key]
            if not dict_key:  # Sometimes `image_url` or `md5` are empty
                continue

            collect_dict[dict_key].add(row[collect_key])

            if save_ext:
                # TODO: Can we use row['extension']?
                ext_dict[dict_key] = extensionForRow(row)

            if save_image:
                saver.save(row)
        except Exception as e:
            print(f">> [Pre-process] Skipping row {index}. Reason: {e}")

    print(
        f'<< [Pre-process][Table {t_index + 1}][Image][{num_elements}/{num_elements}]')


def merge(file_path1, file_path2, output_path):
    image_links = {}

    extension = {}
    duplicate_count = {}
    image_sources = collections.defaultdict(set)

    t1 = time.time()
    with open(file_path2, 'r') as file:
        row = 0
        for line in file:
            # url src1 src2 src3
            try:
                parsed_line = line.strip().split()
                url = parsed_line[0]
                image_links[url] = set(parsed_line[1:])
            except Exception as e:
                print(f'>> [Merging] Cannot read row {row} (length: {len(parsed_line)}). Reason: {e}')
            row += 1

    print(f'>> [Merging] Loaded {file_path2} in {time.time() - t1} seconds')

    with open(file_path1, 'r') as file:
        for line in file:
            # md5 ext url1 url2 url3
            parsed_line = line.strip().split()
            md5 = parsed_line[0]
            extension[md5] = parsed_line[1]
            duplicate_count[md5] = len(parsed_line[2:])

            for url in parsed_line[2:]:
                # Some image URLs may have no recorded sources
                image_sources[md5].update(image_links.get(url, set()))

    with open(output_path, 'w') as output:
        for md5, ext in extension.items():
            filename = f'{md5}.{ext}'
            duplicates = duplicate_count[md5]
            sources = ' '.join(image_sources[md5])
            output.write(f'{filename} {duplicates} {sources}\n')


# Utils Functions
def emptyFolder(folder_path='./img/*'):
    for path in glob(folder_path):
        os.remove(path)


def toPandasTable(path):
    pyarrow_table = pq.read_table(path)
    return pyarrow_table.to_pandas()


def extensionForRow(row):
    return row['filename'].split(".")[-1]


def genImagePath(row):
    uid = row['md5']
    ext = row['filename'].split(".")[-1]
    return uid + '.' + ext


def genImageFullPath(row):
    return './img/' + genImagePath(row)


def removeIfExist(path):
    if os.path.exists(path):
        os.remove(path)


def toPILImage(row, target_size=None):
    base64_decoded = base64.b64decode(row.bytes)
    try:
        res = Image.open(io.BytesIO(base64_decoded)).convert('RGB')
        if target_size is None:
            return res
        else:
            return res.resize(target_size)
    except Exception as e:
        print('[util][toPILImage] Failed:', e)
        return False


class ImageSaver:
    def __init__(self):
        self.ids = set()
        self.mutex = threading.Lock()

    def save(self, row):
        img = toPILImage(row)
        if img is False:
            return False

        output_path = genImageFullPath(row)
        self.mutex.acquire()
        visited = (output_path in self.ids)
        if not visited:
            self.ids.add(output_path)
        self.mutex.release()

        # Only write the file the first time this image path is seen
        if not visited:
            img.save(output_path)

        return not visited


# Trigger Multi Threading Version
if __name__ == '__main__':
    main()
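For reference, here is a minimal sketch (not part of the commit) of how the merged output could be read back. The `load_full_info` helper is hypothetical; the line layout `<md5>.<ext> <duplicate_count> <src1> <src2> ...` matches what `merge()` writes to full_info.txt above.

# Hypothetical reader for full_info.txt; not part of this commit.
# Each line produced by merge() looks like: <md5>.<ext> <duplicate_count> <src1> <src2> ...
def load_full_info(path="full_info.txt"):
    entries = {}
    with open(path, "r") as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) < 2:
                continue  # skip malformed or empty lines
            filename, duplicates, sources = parts[0], int(parts[1]), parts[2:]
            entries[filename] = {"duplicates": duplicates, "sources": sources}
    return entries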
asset-manifest.json
@@ -1,22 +1,22 @@
 {
   "files": {
-    "main.css": "/static/css/main.0c0e79a1.chunk.css",
-    "main.js": "/static/js/main.dc87cead.chunk.js",
-    "main.js.map": "/static/js/main.dc87cead.chunk.js.map",
+    "main.css": "/static/css/main.ea16c8ab.chunk.css",
+    "main.js": "/static/js/main.d505ba22.chunk.js",
+    "main.js.map": "/static/js/main.d505ba22.chunk.js.map",
     "runtime-main.js": "/static/js/runtime-main.fda89fee.js",
     "runtime-main.js.map": "/static/js/runtime-main.fda89fee.js.map",
-    "static/js/2.ece64702.chunk.js": "/static/js/2.ece64702.chunk.js",
-    "static/js/2.ece64702.chunk.js.map": "/static/js/2.ece64702.chunk.js.map",
+    "static/js/2.50432d61.chunk.js": "/static/js/2.50432d61.chunk.js",
+    "static/js/2.50432d61.chunk.js.map": "/static/js/2.50432d61.chunk.js.map",
     "index.html": "/index.html",
-    "precache-manifest.8d5b440d1fb6d1a861aa65f0957c11f5.js": "/precache-manifest.8d5b440d1fb6d1a861aa65f0957c11f5.js",
+    "precache-manifest.fa96f152633727586692ae4d3b668f33.js": "/precache-manifest.fa96f152633727586692ae4d3b668f33.js",
     "service-worker.js": "/service-worker.js",
-    "static/css/main.0c0e79a1.chunk.css.map": "/static/css/main.0c0e79a1.chunk.css.map",
-    "static/js/2.ece64702.chunk.js.LICENSE": "/static/js/2.ece64702.chunk.js.LICENSE"
+    "static/css/main.ea16c8ab.chunk.css.map": "/static/css/main.ea16c8ab.chunk.css.map",
+    "static/js/2.50432d61.chunk.js.LICENSE": "/static/js/2.50432d61.chunk.js.LICENSE"
   },
   "entrypoints": [
     "static/js/runtime-main.fda89fee.js",
-    "static/js/2.ece64702.chunk.js",
-    "static/css/main.0c0e79a1.chunk.css",
-    "static/js/main.dc87cead.chunk.js"
+    "static/js/2.50432d61.chunk.js",
+    "static/css/main.ea16c8ab.chunk.css",
+    "static/js/main.d505ba22.chunk.js"
   ]
 }
index.html
@@ -1 +1 @@
-<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="/favicon.ico"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="/logo192.png"/><link rel="manifest" href="/manifest.json"/><title>AUT Demo</title><link href="/static/css/main.0c0e79a1.chunk.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div><script>!function(f){function e(e){for(var r,t,n=e[0],o=e[1],u=e[2],i=0,l=[];i<n.length;i++)t=n[i],Object.prototype.hasOwnProperty.call(p,t)&&p[t]&&l.push(p[t][0]),p[t]=0;for(r in o)Object.prototype.hasOwnProperty.call(o,r)&&(f[r]=o[r]);for(s&&s(e);l.length;)l.shift()();return c.push.apply(c,u||[]),a()}function a(){for(var e,r=0;r<c.length;r++){for(var t=c[r],n=!0,o=1;o<t.length;o++){var u=t[o];0!==p[u]&&(n=!1)}n&&(c.splice(r--,1),e=i(i.s=t[0]))}return e}var t={},p={1:0},c=[];function i(e){if(t[e])return t[e].exports;var r=t[e]={i:e,l:!1,exports:{}};return f[e].call(r.exports,r,r.exports,i),r.l=!0,r.exports}i.m=f,i.c=t,i.d=function(e,r,t){i.o(e,r)||Object.defineProperty(e,r,{enumerable:!0,get:t})},i.r=function(e){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(e,"__esModule",{value:!0})},i.t=function(r,e){if(1&e&&(r=i(r)),8&e)return r;if(4&e&&"object"==typeof r&&r&&r.__esModule)return r;var t=Object.create(null);if(i.r(t),Object.defineProperty(t,"default",{enumerable:!0,value:r}),2&e&&"string"!=typeof r)for(var n in r)i.d(t,n,function(e){return r[e]}.bind(null,n));return t},i.n=function(e){var r=e&&e.__esModule?function(){return e.default}:function(){return e};return i.d(r,"a",r),r},i.o=function(e,r){return Object.prototype.hasOwnProperty.call(e,r)},i.p="/";var r=this.webpackJsonpui=this.webpackJsonpui||[],n=r.push.bind(r);r.push=e,r=r.slice();for(var o=0;o<r.length;o++)e(r[o]);var s=n;a()}([])</script><script src="/static/js/2.ece64702.chunk.js"></script><script src="/static/js/main.dc87cead.chunk.js"></script></body></html>
+<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="/favicon.ico"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="/logo192.png"/><link rel="manifest" href="/manifest.json"/><title>AUT Demo</title><link href="/static/css/main.ea16c8ab.chunk.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div><script>!function(f){function e(e){for(var r,t,n=e[0],o=e[1],u=e[2],i=0,l=[];i<n.length;i++)t=n[i],Object.prototype.hasOwnProperty.call(p,t)&&p[t]&&l.push(p[t][0]),p[t]=0;for(r in o)Object.prototype.hasOwnProperty.call(o,r)&&(f[r]=o[r]);for(s&&s(e);l.length;)l.shift()();return c.push.apply(c,u||[]),a()}function a(){for(var e,r=0;r<c.length;r++){for(var t=c[r],n=!0,o=1;o<t.length;o++){var u=t[o];0!==p[u]&&(n=!1)}n&&(c.splice(r--,1),e=i(i.s=t[0]))}return e}var t={},p={1:0},c=[];function i(e){if(t[e])return t[e].exports;var r=t[e]={i:e,l:!1,exports:{}};return f[e].call(r.exports,r,r.exports,i),r.l=!0,r.exports}i.m=f,i.c=t,i.d=function(e,r,t){i.o(e,r)||Object.defineProperty(e,r,{enumerable:!0,get:t})},i.r=function(e){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(e,"__esModule",{value:!0})},i.t=function(r,e){if(1&e&&(r=i(r)),8&e)return r;if(4&e&&"object"==typeof r&&r&&r.__esModule)return r;var t=Object.create(null);if(i.r(t),Object.defineProperty(t,"default",{enumerable:!0,value:r}),2&e&&"string"!=typeof r)for(var n in r)i.d(t,n,function(e){return r[e]}.bind(null,n));return t},i.n=function(e){var r=e&&e.__esModule?function(){return e.default}:function(){return e};return i.d(r,"a",r),r},i.o=function(e,r){return Object.prototype.hasOwnProperty.call(e,r)},i.p="/";var r=this.webpackJsonpui=this.webpackJsonpui||[],n=r.push.bind(r);r.push=e,r=r.slice();for(var o=0;o<r.length;o++)e(r[o]);var s=n;a()}([])</script><script src="/static/js/2.50432d61.chunk.js"></script><script src="/static/js/main.d505ba22.chunk.js"></script></body></html>