-
Notifications
You must be signed in to change notification settings - Fork 3
/
PanoScraper.py
166 lines (134 loc) · 5.39 KB
/
PanoScraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
import multiprocessing as mp
import os
import subprocess
from datatypes.panorama import Panorama
from itertools import islice
from PIL import Image, ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True
from subprocess import DEVNULL, STDOUT
from time import perf_counter
# Number of "null" (no-label) crops to sample per pano; 0 disables sampling
# (see the commented-out get_null_rows below).
NULLS_PER_PANO = 0
# Per-channel RGB ceiling: pixels at or below this count as black fill space
# (used by the commented-out clean_n_panos).
BLACK_THRESHOLD = (10, 10, 10)
# Folder that holds the per-worker sftp batch command files; created by
# bulk_scrape_panos and emptied again once all workers finish.
BATCH_TXT_FOLDER = "batches"
# NOTE(review): placeholder — must be set to the real private-key path
# before acquire_n_panos can authenticate with the sftp server.
SFTP_KEY_PATH = "PATH TO SFTP KEY"
def bulk_scrape_panos(data_chunk, local_dir, remote_dir):
    """Download the panorama jpg for every pano id in data_chunk over sftp.

    Splits the unique pano ids across up to 8 worker processes, each of
    which runs acquire_n_panos to fetch its share of the images.

    Args:
        data_chunk: pandas DataFrame with a 'gsv_panorama_id' column.
        local_dir: local directory the panos are downloaded into.
        remote_dir: directory on the sftp server holding the panos.

    Returns:
        (pano_count, execution_time): number of unique pano ids requested
        and the wall-clock seconds the whole scrape took.
    """
    t_start = perf_counter()

    # folder for the per-worker sftp batch files
    if not os.path.isdir(BATCH_TXT_FOLDER):
        os.makedirs(BATCH_TXT_FOLDER)

    # accumulate the unique pano ids to gather from sftp
    pano_set = set()
    for row in data_chunk.to_dict('records'):
        pano_id = row['gsv_panorama_id']
        # 'tutorial' is a sentinel id, not a real pano
        if pano_id != 'tutorial':
            pano_set.add(pano_id)

    # cap the worker count at 8
    cpu_count = min(mp.cpu_count(), 8)

    # snapshot the set once so slicing is cheap and the order is fixed
    # (the original re-ran islice over the set for every chunk)
    pano_list = list(pano_set)
    pano_set_size = len(pano_list)
    i = 0
    processes = []
    while i < pano_set_size and cpu_count > 0:
        # even split of the remaining ids over the remaining workers;
        # max(1, ...) guarantees progress and prevents spawning workers
        # with empty id sets when fewer panos than workers remain
        chunk_size = max(1, (pano_set_size - i) // cpu_count)
        pano_ids = set(pano_list[i:i + chunk_size])
        # cpu_count doubles as a unique per-worker id for the batch file name
        process = mp.Process(target=acquire_n_panos, args=(remote_dir, local_dir, pano_ids, cpu_count))
        processes.append(process)
        cpu_count -= 1
        i += chunk_size

    # start processes
    for p in processes:
        p.start()
    # join processes once finished
    for p in processes:
        p.join()

    # remove the per-worker batch txts now that every transfer is done
    for file in os.scandir(BATCH_TXT_FOLDER):
        os.remove(file.path)

    execution_time = perf_counter() - t_start
    return pano_set_size, execution_time
# Get a collection of "null" rows from a pano.
# TODO: update with new label structure
# def get_null_rows(pano, min_dist = 70, bottom_space = 1600, side_space = 300):
# null_rows = []
# while len(null_rows) < NULLS_PER_PANO:
# x = random.uniform(side_space, GSV_IMAGE_WIDTH - side_space)
# y = random.uniform(- (GSV_IMAGE_HEIGHT/2 - bottom_space), 0)
# point = Point(x, y)
# valid_null = True
# for feat in pano.all_feats():
# if point.dist(feat.point()) <= min_dist:
# valid_null = False
# break
# if valid_null:
# # Using 0 for "null" label_type_id.
# row = [pano.pano_id, x, y, 0, pano.photog_heading, pano.photog_pitch, None, None, None]
# null_rows.append(row)
# return null_rows
def acquire_n_panos(remote_dir, local_dir, pano_ids, thread_id):
    """Fetch the jpg for each pano id in pano_ids via a batched sftp session.

    Writes an sftp batch file (one per worker, keyed by thread_id) and then
    runs a single non-interactive `sftp -b` session to download every image.

    Args:
        remote_dir: directory on the sftp server holding the panos.
        local_dir: local directory the jpgs are downloaded into.
        pano_ids: iterable of pano id strings to fetch.
        thread_id: unique worker number used to name this worker's batch file.
    """
    sftp_command_list = [f'cd {remote_dir}', f'lcd {local_dir}']
    # create collection of commands
    for pano_id in pano_ids:
        # panos are sharded server-side by the first two characters of the id
        two_chars = pano_id[:2]
        # the leading '-' tells sftp to continue past a failed get
        # (e.g. a pano missing on the server) instead of aborting the batch
        sftp_command_list.append('-get ./{prefix}/{full_id}.jpg'.format(prefix=two_chars, full_id=pano_id))
    thread_batch_txt = f'{BATCH_TXT_FOLDER}/batch{thread_id}.text'
    bash_command = f'sftp -b {thread_batch_txt} -P 9000 -i {SFTP_KEY_PATH} [email protected]'
    with open(thread_batch_txt, 'w', newline='') as sftp_file:
        sftp_file.writelines(f'{cmd}\n' for cmd in sftp_command_list)
        sftp_file.write('quit\n')
    sftp = subprocess.Popen(bash_command.split(), shell=False, stdout=DEVNULL, stderr=STDOUT)
    # stdout/stderr are discarded, so communicate() always yields (None, None);
    # we call it only to block until the transfer finishes (the original also
    # printed that useless (None, None) tuple on every call — removed).
    sftp.communicate()
    if sftp.returncode != 0:
        print("sftp failed on one or more commands: {0}".format(sftp_command_list))
# def clean_panos(path_to_panos):
# t_start = perf_counter()
# # get list of pano paths
# panos = glob.glob(path_to_panos + "/*.jpg")
# # get available cpu_count
# cpu_count = mp.cpu_count()
# # split pano set into chunks for multithreading
# pano_set_size = len(panos)
# i = 0
# processes = []
# while i < pano_set_size:
# chunk_size = (pano_set_size - i) // cpu_count
# pano_ids = set(islice(panos, i, i + chunk_size))
# process = mp.Process(target=clean_n_panos, args=(pano_ids,))
# processes.append(process)
# cpu_count -= 1
# i += chunk_size
# # start processes
# for p in processes:
# p.start()
# # join processes once finished
# for p in processes:
# p.join()
# t_stop = perf_counter()
# execution_time = t_stop - t_start
# return execution_time
# def clean_n_panos(panos):
# for pano_path in panos:
# with Image.open(pano_path) as p:
# original_size = p.size
# if original_size != (GSV_IMAGE_WIDTH, GSV_IMAGE_HEIGHT):
# # check if pano needs cleaning by looking for black space
# try:
# pix = p.load()
# if pix[GSV_IMAGE_WIDTH, GSV_IMAGE_HEIGHT] <= BLACK_THRESHOLD and pix[original_size[0] - 1, original_size[1] - 1] <= BLACK_THRESHOLD:
# print("resizing ", pano_path)
# im = p.crop((0, 0, GSV_IMAGE_WIDTH, GSV_IMAGE_HEIGHT))
# im = im.resize(original_size)
# im.save(pano_path)
# except Exception as e:
# print("error on ", p)
# print(p.size)
# print(e)