From 423f2b6cc5be1b007bafb6bef59b5fd6e682374d Mon Sep 17 00:00:00 2001 From: Stepan Anokhin Date: Thu, 21 Apr 2022 19:06:17 +0700 Subject: [PATCH 1/2] Split exif task for db and csv (#484) --- winnow/pipeline/luigi/exif.py | 108 ++++++++++++++++++++++++---- winnow/storage/db_result_storage.py | 4 +- 2 files changed, 96 insertions(+), 16 deletions(-) diff --git a/winnow/pipeline/luigi/exif.py b/winnow/pipeline/luigi/exif.py index 0bad1950..8295a2ee 100644 --- a/winnow/pipeline/luigi/exif.py +++ b/winnow/pipeline/luigi/exif.py @@ -1,13 +1,17 @@ import logging import os from datetime import datetime -from typing import Iterable, Iterator, Any, Tuple +from typing import Iterable, Iterator, Any, Tuple, Optional import luigi import pandas as pd from cached_property import cached_property +from sqlalchemy import func -from winnow.pipeline.luigi.platform import PipelineTask +from db import Database +from db.schema import TaskLogRecord +from winnow.collection.file_collection import FileCollection +from winnow.pipeline.luigi.platform import PipelineTask, ConstTarget from winnow.pipeline.luigi.targets import FileWithTimestampTarget from winnow.pipeline.pipeline_context import PipelineContext from winnow.pipeline.progress_monitor import ProgressMonitor, BaseProgressMonitor @@ -16,7 +20,83 @@ from winnow.utils.metadata_extraction import extract_from_list_of_videos, convert_to_df, parse_and_filter_metadata_df -class ExifTask(PipelineTask): +class DBExifTarget(luigi.Target): + """Exif data in the database.""" + + # Database task log prefix attribute + LOG_TASK_NAME = "extract-exif" + LOG_PREFIX_ATTR = "prefix" + + def __init__(self, prefix: str, database: Database, coll: FileCollection): + self.prefix: str = prefix + self.database: Database = database + self.coll: FileCollection = coll + + def exists(self): + return not self.coll.any(prefix=self.prefix, min_mtime=self.last_time) + + @property + def last_time(self) -> Optional[datetime]: + """Get the last log record.""" + with self.database.session_scope() as session: + task_name = TaskLogRecord.task_name == self.LOG_TASK_NAME + has_prefix = TaskLogRecord.details[self.LOG_PREFIX_ATTR].as_string() == self.prefix + task_filters = (task_name, has_prefix) + last_time = session.query(func.max(TaskLogRecord.timestamp)).filter(*task_filters) + latest = TaskLogRecord.timestamp == last_time + record: TaskLogRecord = session.query(TaskLogRecord).filter(latest, *task_filters).one_or_none() + if record is None: + return None + return record.timestamp + + def write_log(self, time: datetime): + """Write a log record.""" + with self.database.session_scope() as session: + details = {self.LOG_PREFIX_ATTR: self.prefix} + record = TaskLogRecord(task_name=self.LOG_TASK_NAME, timestamp=time, details=details) + session.add(record) + + +class DBExifTask(PipelineTask): + """Extract EXIF data and save it to the database.""" + + prefix: str = luigi.Parameter(default=".") + + def output(self): + if not self.config.database.use: + return ConstTarget(exists=True) + return DBExifTarget( + prefix=self.prefix, + database=self.pipeline.database, + coll=self.pipeline.coll, + ) + + def run(self): + target = self.output() + latest_time = target.last_time + self.logger.info( + "Extracting EXIF metadata from files with prefix '%s' created after %s", + self.prefix, + latest_time, + ) + + target_time = self.pipeline.coll.max_mtime(prefix=self.prefix) + file_keys = list(self.pipeline.coll.iter_keys(prefix=self.prefix, min_mtime=latest_time)) + self.logger.info("Extracting exif for %s files", len(file_keys)) + exif_df = extract_exif( + file_keys=file_keys, + pipeline=self.pipeline, + progress=self.progress.subtask(0.9), + logger=self.logger, + ) + + self.logger.info("Saving EXIF to database") + save_exif_database(file_keys=file_keys, exif_df=exif_df, pipeline=self.pipeline) + target.write_log(target_time) + self.progress.complete() + + +class ExifReportTask(PipelineTask): """Extract EXIF data and save it to the CSV.""" prefix: str = luigi.Parameter(default=".") @@ -59,13 +139,8 @@ def run(self): self.logger.info("Removing previous EXIF file %s", latest_path) os.remove(latest_path) - if self.config.database.use: - self.logger.info("Saving EXIF to database") - save_exif_database(file_keys=file_keys, exif_df=exif_df, pipeline=self.pipeline) - self.progress.complete() - -class ExifFileListFileTask(PipelineTask): +class ExifReportFileListFileTask(PipelineTask): """Extract EXIF data and save it to the CSV.""" path_list_file: str = luigi.Parameter() @@ -94,11 +169,6 @@ def run(self): exif_df.to_csv(output) self.progress.increase(0.05) - if self.config.database.use: - self.logger.info("Saving EXIF to database") - save_exif_database(file_keys=file_keys, exif_df=exif_df, pipeline=self.pipeline) - self.progress.complete() - @cached_property def result_path(self) -> str: """Resolved result report path.""" @@ -108,6 +178,16 @@ def result_path(self) -> str: return os.path.join(self.output_directory, "exif", f"exif_list_{list_hash}.csv") +class ExifTask(PipelineTask): + """Extract exif task.""" + + prefix: str = luigi.Parameter(default=".") + + def requires(self): + yield ExifReportTask(config=self.config, prefix=self.prefix) + yield DBExifTask(config=self.config, prefix=self.prefix) + + def extract_exif( file_keys: Iterable[FileKey], pipeline: PipelineContext, diff --git a/winnow/storage/db_result_storage.py b/winnow/storage/db_result_storage.py index 363e6a73..e3cc70b6 100644 --- a/winnow/storage/db_result_storage.py +++ b/winnow/storage/db_result_storage.py @@ -2,7 +2,7 @@ import logging from functools import wraps from time import time -from typing import Dict +from typing import Dict, Iterator, Tuple from sqlalchemy import tuple_ from sqlalchemy.orm import joinedload, aliased @@ -282,7 +282,7 @@ def add_file_exif(self, path, sha256, exif): session.add(exif_entity) @benchmark - def add_exifs(self, entries): + def add_exifs(self, entries: Iterator[Tuple[str, str, Dict]]): """Add metadata to multiple files. Args: From b644901323e5a538083a879db7169c6c5ad0bc7e Mon Sep 17 00:00:00 2001 From: Stepan Anokhin Date: Thu, 21 Apr 2022 19:18:26 +0700 Subject: [PATCH 2/2] Show preview from the middle of video --- server/server/api/embeddings.py | 2 +- web/src/server-api/v1/transform/FilesTransformer.ts | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/server/server/api/embeddings.py b/server/server/api/embeddings.py index 532659fb..5f312ae0 100644 --- a/server/server/api/embeddings.py +++ b/server/server/api/embeddings.py @@ -203,7 +203,7 @@ def get_neighbors(algorithm): continue results.append( { - "file": Transform.file(file), + "file": Transform.file(file, exif=True), "distance": neighbor.distance, "x": neighbor.x, "y": neighbor.y, diff --git a/web/src/server-api/v1/transform/FilesTransformer.ts b/web/src/server-api/v1/transform/FilesTransformer.ts index baa01a2f..fe2c59df 100644 --- a/web/src/server-api/v1/transform/FilesTransformer.ts +++ b/web/src/server-api/v1/transform/FilesTransformer.ts @@ -45,6 +45,7 @@ import { QueryClusterResultsDTO, } from "../dto/matches"; import { Updates } from "../../../lib/entity/Entity"; +import thumbnailURL from "../../../application/api/files/helpers/thumbnailURL"; /** * Argument and result transformer for file API endpoint. @@ -226,7 +227,7 @@ export default class FilesTransformer { hash: data.sha256, fingerprint: data.signature, exif: data.exif, - preview: `/api/v1/files/${data.id}/thumbnail?time=0`, + preview: thumbnailURL(data.id, meta.length / 2), playbackURL: `/api/v1/files/${data.id}/watch`, scenes: this.scenes(data), relatedCount: data.related_count,