This repository has been archived by the owner on Sep 20, 2021. It is now read-only.
Commit
Displaying sources and duplicates from ImageLinks data (#1)
* Show duplicates badge
* Initial imagelink extraction script
* Add merging to extraction script
* Clean up extraction script and update static files
* Update gitignore
* Clean up TODOs
* PR feedback
* Add full_info.txt
Showing 26 changed files with 186,928 additions and 86 deletions.
.gitignore
@@ -1,5 +1,6 @@
.DS_Store
data/
img/
log/
*.pyc
monitor.sh
New file: imagelink extraction script
@@ -0,0 +1,242 @@
import collections
import threading
import pyarrow.parquet as pq
from glob import glob
import base64
import io
import os
import os.path
import time
from PIL import Image
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

# ./data/part-
IMAGE_DIR = "data/images"
IMAGELINK_DIR = "data/imagegraph"


# Main Function
def main():
    print('>> [Pre-process] Starting image extraction')
    extractParquet(IMAGE_DIR,
                   "image_info.txt",
                   primary_key='md5',
                   collect_key='url',
                   save_ext=True,
                   save_image=False)  # Change save_image to `True` if we want to re-download images

    print('>> [Pre-process] Starting image sources extraction')
    extractParquet(IMAGELINK_DIR,
                   "image_links.txt",
                   primary_key='image_url',
                   collect_key='src',
                   save_ext=False,
                   save_image=False)

    print('>> [Pre-process] Starting merge of image data and image links')
    merge("image_info.txt", "image_links.txt", "full_info.txt")


def extractParquet(base_dir, output_path, primary_key, collect_key, save_ext, save_image):
    print(f'>> [Pre-process] Starting extraction of parquets in {base_dir}')

    if save_image:
        emptyFolder()

    removeIfExist(output_path)

    threads = list()
    # ext_dicts[i] is a map from `primary_key` to image extension
    # collect_dicts[i] is a map from `primary_key` to a set of `collect_key`
    ext_dicts = list()
    collect_dicts = list()

    saver = ImageSaver()
    total_threads = 0
    for index, table_path in enumerate(glob(os.path.join(base_dir, 'part-*.parquet'))):
        ext_dicts.append({})
        collect_dicts.append(collections.defaultdict(set))
        x = threading.Thread(target=processTable, args=(
            index, saver, table_path, ext_dicts[index], collect_dicts[index],
            primary_key, collect_key, save_ext, save_image))
        threads.append(x)
        x.start()
        # Rate limiting: wait for the current batch of 20 threads before starting more
        total_threads += 1
        if total_threads == 20:
            for thread in threads:
                thread.join()
            threads = list()
            total_threads = 0

    for thread in threads:
        thread.join()

    # Merge results across all parquets

    extensions = {}
    if save_ext:
        for ext_dict in ext_dicts:
            # This overwrites existing values in extensions, but in this case we don't care
            extensions.update(ext_dict)

    collected_keys = collections.defaultdict(set)
    for collect_dict in collect_dicts:
        for k, v in collect_dict.items():
            collected_keys[k].update(v)

    with open(output_path, 'w') as output:
        for key, collected in collected_keys.items():
            collected_str = ' '.join(collected)
            if save_ext:
                line = f'{key} {extensions[key]} {collected_str}'
            else:
                line = f'{key} {collected_str}'

            output.write(line + '\n')

    print(f'<< [Pre-process] Finished processing {len(collected_keys)} entries')


def processTable(t_index, saver, table_path, ext_dict, collect_dict,
                 primary_key, collect_key, save_ext, save_image):
    table = toPandasTable(table_path)
    num_elements = len(table)

    report_interval = num_elements if num_elements < 10000 else (num_elements // 10)
    for index in range(num_elements):
        if index % report_interval == 0:
            print(
                f'>> [Pre-process][Table {t_index + 1}][Image][{index}/{num_elements}]')

        try:
            row = table.loc[index]
            dict_key = row[primary_key]
            if not dict_key:  # Sometimes `image_url` or `md5` are empty
                continue

            collect_dict[dict_key].add(row[collect_key])

            if save_ext:
                # TODO: Can we use row['extension']?
                ext_dict[dict_key] = extensionForRow(row)

            if save_image:
                saver.save(row)
        except Exception as e:
            print(f">> [Pre-process] Skipping row {index}. Reason: {e}")

    print(
        f'<< [Pre-process][Table {t_index + 1}][Image][{num_elements}/{num_elements}]')


def merge(file_path1, file_path2, output_path):
    image_links = {}

    extension = {}
    duplicate_count = {}
    image_sources = collections.defaultdict(set)

    t1 = time.time()
    with open(file_path2, 'r') as file:
        row = 0
        for line in file:
            # url src1 src2 src3
            try:
                parsed_line = line.strip().split()
                url = parsed_line[0]
                image_links[url] = set(parsed_line[1:])
            except Exception as e:
                print(f'>> [Merging] Cannot read row {row} (length: {len(parsed_line)}). Reason: {e}')
            row += 1

    print(f'>> [Merging] Loaded {file_path2} in {time.time() - t1} seconds')

    with open(file_path1, 'r') as file:
        for line in file:
            # md5 ext url1 url2 url3
            parsed_line = line.strip().split()
            md5 = parsed_line[0]
            extension[md5] = parsed_line[1]
            duplicate_count[md5] = len(parsed_line[2:])

            for url in parsed_line[2:]:
                # Some image URLs may have no recorded sources
                image_sources[md5].update(image_links.get(url, set()))

    with open(output_path, 'w') as output:
        for md5, ext in extension.items():
            filename = f'{md5}.{ext}'
            duplicates = duplicate_count[md5]
            sources = ' '.join(image_sources[md5])
            output.write(f'{filename} {duplicates} {sources}\n')


# Utils Functions
def emptyFolder(folder_path='./img/*'):
    for path in glob(folder_path):
        os.remove(path)


def toPandasTable(path):
    pyarrow_table = pq.read_table(path)
    return pyarrow_table.to_pandas()


def extensionForRow(row):
    return row['filename'].split(".")[-1]


def genImagePath(row):
    uid = row['md5']
    ext = row['filename'].split(".")[-1]
    return uid + '.' + ext


def genImageFullPath(row):
    return './img/' + genImagePath(row)


def removeIfExist(path):
    if os.path.exists(path):
        os.remove(path)


def toPILImage(row, target_size=None):
    base64_decoded = base64.b64decode(row.bytes)
    try:
        res = Image.open(io.BytesIO(base64_decoded)).convert('RGB')
        if target_size is None:
            return res
        else:
            return res.resize(target_size)
    except Exception as e:
        print('[util][toPILImage] Failed:', e)
        return False


class ImageSaver:
    def __init__(self):
        self.ids = set()
        self.mutex = threading.Lock()

    def save(self, row):
        img = toPILImage(row)
        if img is False:
            return False

        output_path = genImageFullPath(row)
        self.mutex.acquire()
        visited = (output_path in self.ids)
        if not visited:
            self.ids.add(output_path)
        self.mutex.release()

        # Only write the file the first time this image path is seen
        if not visited:
            img.save(output_path)

        return not visited


# Trigger Multi Threading Version
if __name__ == '__main__':
    main()
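For reference, here is a minimal sketch (not part of the commit) of how the merged output could be read back. The `load_full_info` helper is hypothetical; the line layout `<md5>.<ext> <duplicate_count> <src1> <src2> ...` matches what `merge()` writes to full_info.txt above.

# Hypothetical reader for full_info.txt; not part of this commit.
# Each line produced by merge() looks like: <md5>.<ext> <duplicate_count> <src1> <src2> ...
def load_full_info(path="full_info.txt"):
    entries = {}
    with open(path, "r") as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) < 2:
                continue  # skip malformed or empty lines
            filename, duplicates, sources = parts[0], int(parts[1]), parts[2:]
            entries[filename] = {"duplicates": duplicates, "sources": sources}
    return entries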
asset-manifest.json
@@ -1,22 +1,22 @@
 {
   "files": {
-    "main.css": "/static/css/main.0c0e79a1.chunk.css",
-    "main.js": "/static/js/main.dc87cead.chunk.js",
-    "main.js.map": "/static/js/main.dc87cead.chunk.js.map",
+    "main.css": "/static/css/main.ea16c8ab.chunk.css",
+    "main.js": "/static/js/main.d505ba22.chunk.js",
+    "main.js.map": "/static/js/main.d505ba22.chunk.js.map",
     "runtime-main.js": "/static/js/runtime-main.fda89fee.js",
     "runtime-main.js.map": "/static/js/runtime-main.fda89fee.js.map",
-    "static/js/2.ece64702.chunk.js": "/static/js/2.ece64702.chunk.js",
-    "static/js/2.ece64702.chunk.js.map": "/static/js/2.ece64702.chunk.js.map",
+    "static/js/2.50432d61.chunk.js": "/static/js/2.50432d61.chunk.js",
+    "static/js/2.50432d61.chunk.js.map": "/static/js/2.50432d61.chunk.js.map",
     "index.html": "/index.html",
-    "precache-manifest.8d5b440d1fb6d1a861aa65f0957c11f5.js": "/precache-manifest.8d5b440d1fb6d1a861aa65f0957c11f5.js",
+    "precache-manifest.fa96f152633727586692ae4d3b668f33.js": "/precache-manifest.fa96f152633727586692ae4d3b668f33.js",
     "service-worker.js": "/service-worker.js",
-    "static/css/main.0c0e79a1.chunk.css.map": "/static/css/main.0c0e79a1.chunk.css.map",
-    "static/js/2.ece64702.chunk.js.LICENSE": "/static/js/2.ece64702.chunk.js.LICENSE"
+    "static/css/main.ea16c8ab.chunk.css.map": "/static/css/main.ea16c8ab.chunk.css.map",
+    "static/js/2.50432d61.chunk.js.LICENSE": "/static/js/2.50432d61.chunk.js.LICENSE"
   },
   "entrypoints": [
     "static/js/runtime-main.fda89fee.js",
-    "static/js/2.ece64702.chunk.js",
-    "static/css/main.0c0e79a1.chunk.css",
-    "static/js/main.dc87cead.chunk.js"
+    "static/js/2.50432d61.chunk.js",
+    "static/css/main.ea16c8ab.chunk.css",
+    "static/js/main.d505ba22.chunk.js"
   ]
 }
index.html
@@ -1 +1 @@
-<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="/favicon.ico"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="/logo192.png"/><link rel="manifest" href="/manifest.json"/><title>AUT Demo</title><link href="/static/css/main.0c0e79a1.chunk.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div><script>!function(f){function e(e){for(var r,t,n=e[0],o=e[1],u=e[2],i=0,l=[];i<n.length;i++)t=n[i],Object.prototype.hasOwnProperty.call(p,t)&&p[t]&&l.push(p[t][0]),p[t]=0;for(r in o)Object.prototype.hasOwnProperty.call(o,r)&&(f[r]=o[r]);for(s&&s(e);l.length;)l.shift()();return c.push.apply(c,u||[]),a()}function a(){for(var e,r=0;r<c.length;r++){for(var t=c[r],n=!0,o=1;o<t.length;o++){var u=t[o];0!==p[u]&&(n=!1)}n&&(c.splice(r--,1),e=i(i.s=t[0]))}return e}var t={},p={1:0},c=[];function i(e){if(t[e])return t[e].exports;var r=t[e]={i:e,l:!1,exports:{}};return f[e].call(r.exports,r,r.exports,i),r.l=!0,r.exports}i.m=f,i.c=t,i.d=function(e,r,t){i.o(e,r)||Object.defineProperty(e,r,{enumerable:!0,get:t})},i.r=function(e){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(e,"__esModule",{value:!0})},i.t=function(r,e){if(1&e&&(r=i(r)),8&e)return r;if(4&e&&"object"==typeof r&&r&&r.__esModule)return r;var t=Object.create(null);if(i.r(t),Object.defineProperty(t,"default",{enumerable:!0,value:r}),2&e&&"string"!=typeof r)for(var n in r)i.d(t,n,function(e){return r[e]}.bind(null,n));return t},i.n=function(e){var r=e&&e.__esModule?function(){return e.default}:function(){return e};return i.d(r,"a",r),r},i.o=function(e,r){return Object.prototype.hasOwnProperty.call(e,r)},i.p="/";var r=this.webpackJsonpui=this.webpackJsonpui||[],n=r.push.bind(r);r.push=e,r=r.slice();for(var o=0;o<r.length;o++)e(r[o]);var s=n;a()}([])</script><script src="/static/js/2.ece64702.chunk.js"></script><script src="/static/js/main.dc87cead.chunk.js"></script></body></html>
+<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="/favicon.ico"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="/logo192.png"/><link rel="manifest" href="/manifest.json"/><title>AUT Demo</title><link href="/static/css/main.ea16c8ab.chunk.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div><script>!function(f){function e(e){for(var r,t,n=e[0],o=e[1],u=e[2],i=0,l=[];i<n.length;i++)t=n[i],Object.prototype.hasOwnProperty.call(p,t)&&p[t]&&l.push(p[t][0]),p[t]=0;for(r in o)Object.prototype.hasOwnProperty.call(o,r)&&(f[r]=o[r]);for(s&&s(e);l.length;)l.shift()();return c.push.apply(c,u||[]),a()}function a(){for(var e,r=0;r<c.length;r++){for(var t=c[r],n=!0,o=1;o<t.length;o++){var u=t[o];0!==p[u]&&(n=!1)}n&&(c.splice(r--,1),e=i(i.s=t[0]))}return e}var t={},p={1:0},c=[];function i(e){if(t[e])return t[e].exports;var r=t[e]={i:e,l:!1,exports:{}};return f[e].call(r.exports,r,r.exports,i),r.l=!0,r.exports}i.m=f,i.c=t,i.d=function(e,r,t){i.o(e,r)||Object.defineProperty(e,r,{enumerable:!0,get:t})},i.r=function(e){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(e,"__esModule",{value:!0})},i.t=function(r,e){if(1&e&&(r=i(r)),8&e)return r;if(4&e&&"object"==typeof r&&r&&r.__esModule)return r;var t=Object.create(null);if(i.r(t),Object.defineProperty(t,"default",{enumerable:!0,value:r}),2&e&&"string"!=typeof r)for(var n in r)i.d(t,n,function(e){return r[e]}.bind(null,n));return t},i.n=function(e){var r=e&&e.__esModule?function(){return e.default}:function(){return e};return i.d(r,"a",r),r},i.o=function(e,r){return Object.prototype.hasOwnProperty.call(e,r)},i.p="/";var r=this.webpackJsonpui=this.webpackJsonpui||[],n=r.push.bind(r);r.push=e,r=r.slice();for(var o=0;o<r.length;o++)e(r[o]);var s=n;a()}([])</script><script src="/static/js/2.50432d61.chunk.js"></script><script src="/static/js/main.d505ba22.chunk.js"></script></body></html>