forked from cvisionai/tator-py
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmake_summary.py
728 lines (595 loc) · 28.7 KB
/
make_summary.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
#!/usr/bin/env python3
"""
Script that gathers the localization information of a particular section.
Thumbnails can of each localization can also be generated.
"""
import argparse
import datetime
import glob
import logging
import os
import pathlib
import shutil
import sys
import traceback
import pandas as pd
import progressbar
import tator
# Configure module-wide logging: every run overwrites ('w') make_summary.log
# in the current working directory with timestamped INFO-and-above records.
logging.basicConfig(
    filename='make_summary.log',
    filemode='w',
    format='%(asctime)s %(levelname)s:%(message)s',
    datefmt='%m/%d/%Y %I:%M:%S %p',
    level=logging.INFO)
# Module logger (the rest of the file mostly calls logging.* directly,
# which routes to the same root configuration).
logger = logging.getLogger(__name__)
def sanitize_string(string: str) -> str:
    """ Sanitize a string to be safe for filenames

    Path/field separators ("|", "/", "\\", ":") are mapped to "-" and
    wildcard/quoting characters ("*", '"', "<", ">") are removed.

    Args:
        string: String to sanitize for filenames

    Returns:
        Version of input string safe for filenames
    """
    # One C-level pass replaces the original chain of nine .replace() calls,
    # which also repeated the "|" -> "-" replacement twice (first and last).
    return string.translate(str.maketrans({
        '|': '-',
        '/': '-',
        '\\': '-',
        ':': '-',
        '*': None,
        '"': None,
        '<': None,
        '>': None,
    }))
def get_attribute_type_data(localization_types_df: pd.DataFrame) -> dict:
    """ Gets a dictionary of ordered attribute type information per localization type id

    Args:
        localization_types_df: DataFrame of localization types; each row must
            carry an 'id' and an 'attribute_types' list.

    Returns:
        Dictionary of pandas.DataFrame objects describing the attribute types,
        sorted by their 'order' field when present. Keys are localization
        type ids.
    """
    attr_info_by_type_id = {}
    for _, loc_type in localization_types_df.iterrows():
        attr_df = pd.DataFrame(loc_type.attribute_types)
        if 'order' in attr_df.keys():
            attr_df = attr_df.sort_values('order')
            # Exactly one "primary" attribute (order == 0) is expected; warn
            # when the project defines more than one.
            if (attr_df['order'] == 0).sum() > 1:
                logging.warning(
                    'More than one attribute type has an order value of 0. Only expected one.')
        attr_info_by_type_id[loc_type['id']] = attr_df
    return attr_info_by_type_id
def process_localization(
        host: str,
        tator_api: tator.api,
        project_id: int,
        section_name: str,
        media: tator.models.Media,
        localization: tator.models.Localization,
        localization_types_df: pd.DataFrame,
        attribute_types_info: dict,
        image_folder: str,
        disable_thumbnails: bool,
        thumbnail_filename_pattern: str,
        print_timing_info: bool) -> dict:
    """ Returns a dictionary of localization properties and generates a thumbnail image

    Args:
        host: Host URL
        tator_api: Interface object to tator connected to the host
        project_id: Unique identifier of project associated with the given localization
        section_name: Name of the section associated with the localization
        media: Media associated with the localization
        localization: Localization to process
        localization_types_df: DataFrame of localization types associated with the localization
        attribute_types_info: Output of get_attribute_type_data
        image_folder: Folder that will contain the thumbnail image
        disable_thumbnails: Don't create thumbnails if true
        thumbnail_filename_pattern: Refer to the help description
        print_timing_info: True if timing info should be printed to console / logger

    Returns:
        Dictionary of localization data. Keys are localization properties.

    Postconditions:
        Thumbnail image of the localization is created in the given folder

    Raises:
        ValueError if a localization dtype is invalid, or if the localization
        graphic could not be retrieved from the host.
    """
    # Get the media's pixel width/height to convert the relative (0..1)
    # position and size info into pixel units.
    width = media.width
    height = media.height

    # Grab the dtype information associated with this localization; exactly
    # one localization type row must match the localization's type id.
    df = localization_types_df
    dtype_matches = df[df['id'] == localization.type]
    if len(dtype_matches) != 1:
        log_msg = f'Localization {localization.id} ignored. Unexpected dtype match ({len(dtype_matches)} matches)'
        logging.warning(log_msg)
        raise ValueError("Invalid dtype match")
    dtype = dtype_matches.iloc[0]['dtype']

    # Grab the annotation URL associated with this localization
    url = f"{host.rstrip('/')}/{project_id}/annotation/{media.id}?frame={localization.frame}"

    # Get user name (one extra API call per localization)
    user = tator_api.get_user(id=localization.user)
    user_name = f'{user.first_name} {user.last_name}'

    # Create the datum that will capture the localization info to be stored in the summary
    datum={
        'section': section_name,
        'media_id': media.id,
        'media': media.name,
        'dtype': dtype,
        'x': localization.x * width if localization.x is not None else None,
        'y': localization.y * height if localization.y is not None else None,
        'frame': localization.frame,
        'id': localization.id,
        'user': user_name,
        'user_id': localization.user,
        'url': url}

    # Apply the width/height attributes based on the localization dtype
    if dtype == 'box':
        datum.update({
            'width': localization.width * width if localization.width is not None else None,
            'height': localization.height * height if localization.height is not None else None
        })
    elif dtype == 'line':
        # NOTE(review): for lines the u/v components are reported under the
        # 'width'/'height' columns — presumably the endpoint vector in pixels;
        # confirm against the tator line model.
        datum.update({
            'width': localization.u * width if localization.u is not None else None,
            'height': localization.v * height if localization.v is not None else None
        })
    elif dtype == 'dot':
        datum.update({
            'width': 0.0,
            'height': 0.0
        })
    else:
        log_msg = f'Localization dtype invalid {dtype}'
        logging.warning(log_msg)
        raise ValueError("Invalid dtype detected")

    # Save the attributes of the localization specific to the localization type
    datum.update(localization.attributes)

    if not disable_thumbnails:
        # Lastly, create the thumbnail of this localization and move it to the
        # user defined location. If the localization is a dot, take a large
        # enough image segment, otherwise use the default margins.
        timeout_sec = 10
        if dtype =='dot':
            minimum_margin = 10
            margin_x = 50 #TODO Parameterize these constants
            margin_y = 50
            loc_x_pixels = int(localization.x * width)
            loc_y_pixels = int(localization.y * height)
            # Handle if the default margins will go outside the image bounds:
            # clamp each margin to the distance from the dot to the image edge.
            if loc_x_pixels - margin_x < 0:
                margin_x = loc_x_pixels
            elif loc_x_pixels + margin_x > width:
                margin_x = width - loc_x_pixels
            if loc_y_pixels - margin_y < 0:
                margin_y = loc_y_pixels
            elif loc_y_pixels + margin_y > height:
                margin_y = height - loc_y_pixels
            if margin_x < minimum_margin or margin_y < minimum_margin:
                # Dot is too close to the image edge for a useful crop:
                # record an empty thumbnail path and return early.
                msg = f"Dot graphic of {localization.id} not retrieved. Margins too small (x,y margins: {margin_x} {margin_y})"
                logging.info(msg)
                print(msg)
                datum.update({'thumbnail': ''})
                return datum
            try:
                start = datetime.datetime.now()
                if print_timing_info:
                    msg = f"...Getting graphic of {localization.id}"
                    print(msg)
                    logging.info(msg)
                # Explicit margins for dots (default margins would yield a
                # degenerate crop around a point).
                image_path = tator_api.get_localization_graphic(
                    localization.id,
                    use_default_margins=False,
                    margin_x=margin_x,
                    margin_y=margin_y,
                    _request_timeout=timeout_sec)
                end = datetime.datetime.now()
                if print_timing_info:
                    msg = f"...done. ({(end - start).total_seconds()} seconds)"
                    print(msg)
                    logging.info(msg)
            except Exception:
                log_msg = f"ERROR: Problem with get_localization_graphic({localization.id})"
                logging.error(log_msg)
                error_msg = traceback.format_exc()
                logging.error(error_msg)
                raise ValueError(log_msg)
        else:
            try:
                start = datetime.datetime.now()
                if print_timing_info:
                    msg = f"...Getting graphic of {localization.id}"
                    print(msg)
                    logging.info(msg)
                image_path = tator_api.get_localization_graphic(localization.id, _request_timeout=timeout_sec)
                end = datetime.datetime.now()
                if print_timing_info:
                    msg = f"...done. ({(end - start).total_seconds()} seconds)"
                    print(msg)
                    logging.info(msg)
            except Exception:
                log_msg = f"ERROR: Problem with get_localization_graphic({localization.id})"
                logging.error(log_msg)
                error_msg = traceback.format_exc()
                logging.error(error_msg)
                raise ValueError(log_msg)
        # Dead code retained from an earlier server-side workaround (kept as a
        # string literal so it never executes).
        """
        try:
            image_path = tator_api.get_localization_graphic(localization.id)
        except:
            # This is a band-aid that should not happen anymore if the endpoint is updated
            # on the server. There was an issue with the localization graphic endpoint with
            # localizations that might result in a 1 pixel width and/or height. This would
            # result in an exception and this try/except block attempts to redo the graphic
            # with a small margin around the presumable box. If another exception occurs,
            # then that is a problem that should be raised.
            #
            # #TODO It's worth revisiting removing this down the road based on the comments above.
            image_path = tator_api.get_localization_graphic(
                localization.id,
                use_default_margins=False,
                margin_x=1,
                margin_y=1)
        """

        # We will strip off the extension from the media name
        media_name_str = os.path.splitext(sanitize_string(media.name))[0]

        # The thumbnail created is a temporary file. Move it using a specific filename/path.
        if thumbnail_filename_pattern is None:
            target_filename = f'{localization.id}'
        elif thumbnail_filename_pattern == 'all':
            # The filename of the thumbnail will have the following format:
            # - primary attribute (if available, order == 0)
            # - media name
            # - frame number
            # - localization id
            # - all other attributes
            # - .png
            target_filename = ''
            main_basename = f'{media_name_str}_Frame_{localization.frame}_Id_{localization.id}_'
            set_main_basename = False
            if len(attribute_types_info[localization.type]) > 0:
                # Since we have a sorted attribute type dataframe, if the first entry doesn't
                # have a 'primary attribute flag' (i.e. order == 0), we can just iterate over
                # the entries.
                for _, row in attribute_types_info[localization.type].iterrows():
                    # Not all of the attributes available for a localization will always be used
                    # If it doesn't exist, skip over it
                    attr_name = row['name']
                    if attr_name in localization.attributes:
                        attr_val = str(localization.attributes[attr_name])
                    else:
                        continue
                    if row['order'] == 0:
                        target_filename += f"{sanitize_string(attr_val)}_"
                    else:
                        target_filename += f"{sanitize_string(row['name'])}_{sanitize_string(attr_val)}_"
                    # The media/frame/id basename is inserted right after the
                    # first attribute actually present on the localization.
                    # NOTE(review): if attribute types exist but none are set
                    # on the localization, the basename is never appended and
                    # the filename is just ".png" plus dedup suffixes — verify
                    # whether that edge case can occur in practice.
                    if not set_main_basename:
                        target_filename += main_basename
                        set_main_basename = True
            else:
                # In the case where there are no attributes associated with the localization,
                # just use the base string defined earlier.
                target_filename = main_basename
        else:
            # Using a string of colon-separated tokens: reserved tokens
            # (media_name / frame_number / localization_id) or attribute names.
            target_filename = ''
            tokens = thumbnail_filename_pattern.split(':')
            for index, token in enumerate(tokens):
                if token == 'media_name':
                    target_filename += f'{media_name_str}'
                elif token == 'frame_number':
                    target_filename += f'Frame_{localization.frame}'
                elif token == 'localization_id':
                    target_filename += f'Id_{localization.id}'
                else:
                    if token in localization.attributes:
                        attr_val = str(localization.attributes[token])
                        target_filename += f'{sanitize_string(attr_val)}'
                    else:
                        raise ValueError(f"Invalid thumbnail_filename_pattern token provided: {token}")
                # Underscore-separate every token except the last
                if index < len(tokens) - 1:
                    target_filename += '_'
        target_filename += '.png'

        # The thumbnail created is a temporary file. Move it using a specific filename/path.
        target_path = os.path.join(image_folder, target_filename)

        # Force an extended path if running this script on windows.
        # Otherwise, we'll get burned by the max path.
        if os.name == 'nt':
            target_path = '\\\\?\\' + os.path.abspath(target_path)

        # If the target path already exists, attempt to add a suffix to it ("*_#.png")
        if os.path.exists(target_path):
            original_path = target_path
            target_filename = os.path.splitext(target_path)[0]
            extension = os.path.splitext(target_path)[1]
            index_count = 1
            target_path = target_filename + f'_{index_count}' + extension
            while os.path.exists(target_path):
                index_count += 1
                target_path = target_filename + f'_{index_count}' + extension
            log_msg = f'Duplicate filename detected. Changed path: {original_path} to {target_path}'
            logging.warning(log_msg)
        os.makedirs(image_folder, exist_ok=True)
        shutil.move(image_path, target_path)
    else:
        # No thumbnail created. Empty string for the outgoing datum
        target_path = ''
    datum.update({'thumbnail': target_path})
    return datum
def process_section(
        host: str,
        tator_api: tator.api,
        project_id: int,
        section_name: str,
        medias: list,
        localization_types_df: pd.DataFrame,
        attribute_types_info: dict,
        column_names: list,
        summary_filename: str=None,
        image_folder: str='images',
        disable_thumbnails: bool=False,
        thumbnail_filename_pattern: str=None,
        print_timing_info: bool=False) -> None:
    """ Creates the report for the given project section

    Args:
        host: Host URL
        tator_api: Interface object to tator connected to the host
        project_id: Unique identifier of project associated with the given section
        section_name: Name of the section to be processed
        medias: List of media associated with the given section
        localization_types_df: DataFrame of localization types associated with the localization
        attribute_types_info: Output of get_attribute_type_data
        column_names: Columns to output in the section report
        summary_filename: File name to use for the summary report
            If None, then section_name_summary.csv is used
        image_folder: Folder that will contain the thumbnail image
        disable_thumbnails: Don't create thumbnails if true
        thumbnail_filename_pattern: Refer to the help description
        print_timing_info: True if timing info should be printed to console / logger

    Postconditions:
        Images of the localizations created in the image folder
        .csv report of the section created
    """
    # Gather information about each localization in each media as a dict;
    # the list of dicts becomes the .csv report at the end.
    report_data = []

    print("-------------------------------------------------------------------------------")
    print(f"Processing section: {section_name}")
    print("-------------------------------------------------------------------------------")
    logging.info(f"Processing section: {section_name}")

    for media_index, media in enumerate(medias):
        msg = f"Processing localizations of media (ID={media.id}) {media_index+1}/{len(medias)}"
        print(msg)
        logging.info(msg)

        # Process each localization of this media; retry transient failures.
        try:
            errors_detected = 0
            localizations = tator_api.get_localization_list(project=project_id, media_id=[media.id])
            processed_localization_count = 0
            logging.info(f'Number of localizations: {len(localizations)}')

            # Console progress bar over this media's localizations
            # (fix: previously an extra unused ProgressBar was also created
            # once before the media loop).
            bar = progressbar.ProgressBar()
            for localization in bar(localizations):
                max_retries = 10
                # Fix: the original `while try_count < max_retries` loop
                # (try_count starting at 1) allowed only 9 attempts despite
                # max_retries being 10. range(1, max_retries + 1) gives the
                # intended number of attempts.
                for attempt in range(1, max_retries + 1):
                    try:
                        datum = process_localization(
                            host=host,
                            project_id=project_id,
                            tator_api=tator_api,
                            section_name=section_name,
                            media=media,
                            localization=localization,
                            localization_types_df=localization_types_df,
                            attribute_types_info=attribute_types_info,
                            image_folder=image_folder,
                            disable_thumbnails=disable_thumbnails,
                            thumbnail_filename_pattern=thumbnail_filename_pattern,
                            print_timing_info=print_timing_info)
                        report_data.append(datum)
                        processed_localization_count += 1
                        break
                    except Exception:
                        error_msg = f'Error encountered processing localization {localization.id}. (attempt {attempt} of {max_retries})'
                        logging.error(error_msg)
                        logging.error(traceback.format_exc())
                        errors_detected += 1

            if errors_detected > 0:
                print("WARNING: Retries occurred while processing data. Review the .log file for more info.")
            if len(localizations) != processed_localization_count:
                error_msg = f'ERROR - not all localizations processed in media: {media.id} ({processed_localization_count} out of {len(localizations)})'
                print(error_msg)
                logging.error(error_msg)
        except Exception:
            # Failure retrieving/iterating the localization list itself:
            # log and continue with the next media instead of aborting.
            logging.error(f'Error encountered processing localization list from media {media.id}')
            logging.error(traceback.format_exc())
            print(f"ERROR: Problem retrieving localizations from media {media.id}. Review the .log file for more info.")

    # Create the final .csv report
    output_name = summary_filename if summary_filename else f'{section_name}_summary.csv'
    logging.info(f'Creating summary report file: {output_name}')
    df = pd.DataFrame(data=report_data, columns=column_names)
    df.to_csv(output_name, index=False)
def process_project(
        host: str,
        token: str,
        project_id: int,
        section_name: str=None,
        summary_filename: str=None,
        disable_thumbnails: bool=False,
        thumbnail_filename_pattern: str=None,
        rerun_project: bool=False,
        print_timing_info: bool=False) -> None:
    """ Creates the summary report and thumbnail image for the given project/section

    Args:
        host: Host URL
        token: User token used for connecting to the host
        project_id: Unique identifier of project associated with the given section
        section_name: Name of the section to be processed
            If None, then all sections are processed
        summary_filename: File name to use for the summary report
            If None, then section_name_summary.csv is used
            Only valid if section_name is not None
        disable_thumbnails: Don't create thumbnails if true
        thumbnail_filename_pattern: Refer to the help description
        rerun_project: Reruns the project starting with the last section processed, using
            the list of .csv filenames. Cannot be combined with section_name.
        print_timing_info: True if timing info should be printed to console / logger

    Postconditions:
        Images of localizations created in images/
        .csv report of the section created. If no section specified, report and
        thumbnails are created for each section.

    Raises:
        ValueError for an unknown project id, unknown section name, or an
        invalid argument combination.
    """
    # Get the interface to Tator
    tator_api = tator.get_api(host=host, token=token)

    # Ensure the project exists. Otherwise, complain and log what projects
    # are available.
    projects = tator_api.get_project_list()
    if not any(project_info.id == project_id for project_info in projects):
        error_msg = f"Invalid project ID provided: {project_id}"
        logging.error(error_msg)
        logging.error("Project list: ")
        logging.error(projects)
        raise ValueError(error_msg)

    # Report column names will always have the following.
    # Additional column names will be added based on the project
    column_names=[
        'section','media_id','media','thumbnail','id', 'user',
        'user_id','frame','dtype','x','y','width','height','url']

    # Get the attributes associated with the localizations in this project
    # and save them as a report column
    localization_types = tator_api.get_localization_type_list(project=project_id)
    for loc_type in localization_types:
        for attr in loc_type.attribute_types:
            if attr.name not in column_names:
                column_names.append(attr.name)

    # Convert the localization types to a dataframe to make it easier to use,
    # and get the ordered attribute information per localization type.
    localization_types_df = tator.util.to_dataframe(localization_types)
    attribute_types_info = get_attribute_type_data(localization_types_df=localization_types_df)

    # If a section name was provided, then verify the corresponding section exists
    # If not, then complain back to the user.
    # If no section name was provided, process all of the sections.
    sections = tator_api.get_section_list(project=project_id)

    # Fix: this was initialized to True, which made summary_filename be
    # ignored unconditionally (even for a single named section). It is only
    # set True below when processing multiple sections.
    ignore_summary_filename = False

    if rerun_project:
        if section_name is not None:
            error_msg = "Invalid argument combination - cannot rerun project with a section name"
            logging.error(error_msg)
            raise ValueError(error_msg)

        # User is attempting to re-run the project starting with the last section processed.
        # Grab the _summary.csv filenames in the current directory
        summary_files = glob.glob('*_summary.csv')
        last_section_file_time = 0
        last_section = None
        processed_sections = []
        for filename in summary_files:
            section = filename.split('_summary.csv')[0]
            file_info = pathlib.Path(filename)
            # Track the most recently created summary: that section is
            # re-processed (it may have been interrupted mid-run).
            if last_section_file_time < file_info.stat().st_ctime:
                last_section = section
                last_section_file_time = file_info.stat().st_ctime
            processed_sections.append(section)

        filtered_sections = []
        for section in sections:
            if section.name not in processed_sections or section.name == last_section:
                filtered_sections.append(section)
            else:
                logging.info(f"Skipping section: {section.name}")
        sections = filtered_sections.copy()
        log_msg = f"Sections to process: {[section.name for section in sections]}"
        logging.info(log_msg)
        print(log_msg)

    if section_name is not None:
        # Section name specified
        if section_name not in [section.name for section in sections]:
            error_msg = f"Invalid section name provided: {section_name}"
            logging.error(error_msg)
            logging.error("Section names: ")
            # Fix: sections is a list of section objects; the original called
            # sections.keys(), which raised AttributeError in this error path.
            logging.error([section.name for section in sections])
            raise ValueError(error_msg)
        sections = [section for section in sections if section.name == section_name]
    else:
        # No section name specified, process all sections
        # For now use the default summary output filename for each section
        # #TODO Maybe a basename can be used in the future.
        if summary_filename:
            log_msg = f'Ignoring provided summary file name: ({summary_filename}) Processing all sections.'
            logging.warning(log_msg)
        ignore_summary_filename = True

    # Loop over the selected sections and create the report(s) and thumbnail(s)
    for section in sections:
        # First grab all the medias associated with this section
        medias = tator_api.get_media_list(project=project_id, section=section.id)

        # Ready to rock and roll
        logging.info(f"Processing section: {section.name}")
        section_summary_filename = None if ignore_summary_filename else summary_filename
        process_section(
            host=host,
            tator_api=tator_api,
            project_id=project_id,
            section_name=section.name,
            medias=medias,
            column_names=column_names,
            localization_types_df=localization_types_df,
            attribute_types_info=attribute_types_info,
            summary_filename=section_summary_filename,
            disable_thumbnails=disable_thumbnails,
            thumbnail_filename_pattern=thumbnail_filename_pattern,
            print_timing_info=print_timing_info)
def main():
    """ Main routine of this script

    Parses the command line (tator's common connection arguments plus this
    script's report/thumbnail options) and kicks off process_project.
    """
    # Assemble the command line interface
    arg_parser = argparse.ArgumentParser(
        description="Create localizations report and thumbnails for the section(s) in the given project.")
    arg_parser = tator.get_parser(parser=arg_parser)
    arg_parser.add_argument('--project', type=int, required=True,
                            help='Unique project id')
    arg_parser.add_argument('--section', type=str, required=False,
                            help='Optional section name to process')
    arg_parser.add_argument('--output', type=str, required=False,
                            help='Filename of section csv report. Only use if section was provided.')
    arg_parser.add_argument('--disable-thumbnails', action='store_true',
                            help='Ignore thumbnail creation, no thumbnails reported.')
    arg_parser.add_argument('--thumbnail-filename-pattern', type=str, required=False,
                            help=('Default thumbnail filename is just the localization id. Use this string to specify otherwise. \n'
                                  'Use <all> to use "media-name_frame-num_localization-id_all-attributes.png" \n'
                                  'Specify a specific set with: "media_name:frame_number:localization_id:Attribute Name 1:Attribute Name 2" \n'
                                  'Note: media_name, frame_number, localization_id are the only parameters that can be used (outside of attributes). \n'
                                  'Split up the set with the colon\n'
                                  'Extension of media_name is not included. \n'
                                  'Entire string needs to be encapsulated in double quotes. \n'
                                  'This argument is ignored if disable-thumbnails is true. '))
    arg_parser.add_argument('--rerun-project', action='store_true',
                            help='Script will start at the last section it processed (using the current directory of csv filenames)). Do not combine with section option.')
    arg_parser.add_argument('--print-timing-info', action='store_true',
                            help='Print timing')

    cli_args = arg_parser.parse_args()
    logger.info(cli_args)

    # Create the report(s) and thumbnail(s)
    process_project(
        host=cli_args.host,
        token=cli_args.token,
        project_id=cli_args.project,
        section_name=cli_args.section,
        summary_filename=cli_args.output,
        disable_thumbnails=cli_args.disable_thumbnails,
        thumbnail_filename_pattern=cli_args.thumbnail_filename_pattern,
        rerun_project=cli_args.rerun_project,
        print_timing_info=cli_args.print_timing_info)
# Script entry point when executed directly (not on import).
if __name__=="__main__":
    main()