Skip to content

Commit

Permalink
Merge pull request #494 from stepan-anokhin/484-ensure-exif-is-saved-…
Browse files Browse the repository at this point in the history
…to-db

Ensure exif is saved to db (#484)
  • Loading branch information
johnhbenetech authored Apr 21, 2022
2 parents c7765fb + b644901 commit 3d06e7e
Show file tree
Hide file tree
Showing 4 changed files with 99 additions and 18 deletions.
2 changes: 1 addition & 1 deletion server/server/api/embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,7 @@ def get_neighbors(algorithm):
continue
results.append(
{
"file": Transform.file(file),
"file": Transform.file(file, exif=True),
"distance": neighbor.distance,
"x": neighbor.x,
"y": neighbor.y,
Expand Down
3 changes: 2 additions & 1 deletion web/src/server-api/v1/transform/FilesTransformer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ import {
QueryClusterResultsDTO,
} from "../dto/matches";
import { Updates } from "../../../lib/entity/Entity";
import thumbnailURL from "../../../application/api/files/helpers/thumbnailURL";

/**
* Argument and result transformer for file API endpoint.
Expand Down Expand Up @@ -226,7 +227,7 @@ export default class FilesTransformer {
hash: data.sha256,
fingerprint: data.signature,
exif: data.exif,
preview: `/api/v1/files/${data.id}/thumbnail?time=0`,
preview: thumbnailURL(data.id, meta.length / 2),
playbackURL: `/api/v1/files/${data.id}/watch`,
scenes: this.scenes(data),
relatedCount: data.related_count,
Expand Down
108 changes: 94 additions & 14 deletions winnow/pipeline/luigi/exif.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,17 @@
import logging
import os
from datetime import datetime
from typing import Iterable, Iterator, Any, Tuple
from typing import Iterable, Iterator, Any, Tuple, Optional

import luigi
import pandas as pd
from cached_property import cached_property
from sqlalchemy import func

from winnow.pipeline.luigi.platform import PipelineTask
from db import Database
from db.schema import TaskLogRecord
from winnow.collection.file_collection import FileCollection
from winnow.pipeline.luigi.platform import PipelineTask, ConstTarget
from winnow.pipeline.luigi.targets import FileWithTimestampTarget
from winnow.pipeline.pipeline_context import PipelineContext
from winnow.pipeline.progress_monitor import ProgressMonitor, BaseProgressMonitor
Expand All @@ -16,7 +20,83 @@
from winnow.utils.metadata_extraction import extract_from_list_of_videos, convert_to_df, parse_and_filter_metadata_df


class ExifTask(PipelineTask):
class DBExifTarget(luigi.Target):
    """Luigi target representing EXIF data saved to the database.

    Completed extraction runs are recorded as ``TaskLogRecord`` rows whose
    ``details`` carry the processed path prefix. The target is considered
    satisfied when the file collection reports no files for the prefix and
    the timestamp of the most recent such record.
    """

    # Task name under which EXIF extraction runs are written to the task log
    LOG_TASK_NAME = "extract-exif"
    # Key inside the log record `details` that stores the processed prefix
    LOG_PREFIX_ATTR = "prefix"

    def __init__(self, prefix: str, database: Database, coll: FileCollection):
        """Create a target.

        Args:
            prefix: path prefix selecting the files this target covers.
            database: database holding the task log records.
            coll: file collection queried for pending files.
        """
        self.prefix: str = prefix
        self.database: Database = database
        self.coll: FileCollection = coll

    def exists(self):
        """Check that the collection has no files left to process.

        Returns:
            True if ``coll.any`` finds nothing for this prefix with
            ``min_mtime`` set to the last logged time, False otherwise.
        """
        return not self.coll.any(prefix=self.prefix, min_mtime=self.last_time)

    @property
    def last_time(self) -> Optional[datetime]:
        """Get the timestamp of the latest log record for this prefix.

        Returns:
            The greatest timestamp among matching task log records, or
            None if no matching record exists.
        """
        with self.database.session_scope() as session:
            task_filters = (
                TaskLogRecord.task_name == self.LOG_TASK_NAME,
                TaskLogRecord.details[self.LOG_PREFIX_ATTR].as_string() == self.prefix,
            )
            # A single aggregate query always yields exactly one row (NULL when
            # nothing matches), so this cannot fail when several log records
            # happen to share the same latest timestamp.
            return session.query(func.max(TaskLogRecord.timestamp)).filter(*task_filters).scalar()

    def write_log(self, time: datetime):
        """Write a log record marking the prefix as processed.

        Args:
            time: timestamp to store in the new log record.
        """
        with self.database.session_scope() as session:
            details = {self.LOG_PREFIX_ATTR: self.prefix}
            record = TaskLogRecord(task_name=self.LOG_TASK_NAME, timestamp=time, details=details)
            session.add(record)


class DBExifTask(PipelineTask):
    """Pipeline task that extracts EXIF metadata and stores it in the database."""

    prefix: str = luigi.Parameter(default=".")

    def output(self):
        """Build the task target.

        Returns:
            A ``DBExifTarget`` for the configured prefix when database usage
            is enabled in the config, otherwise a constant target that always
            exists.
        """
        if self.config.database.use:
            return DBExifTarget(
                prefix=self.prefix,
                database=self.pipeline.database,
                coll=self.pipeline.coll,
            )
        return ConstTarget(exists=True)

    def run(self):
        """Extract EXIF for files selected by the last logged time and save it."""
        exif_target = self.output()
        since = exif_target.last_time
        self.logger.info(
            "Extracting EXIF metadata from files with prefix '%s' created after %s",
            self.prefix,
            since,
        )

        # The newest modification time is captured before the keys are listed;
        # it is the value written to the task log once saving is finished.
        newest_mtime = self.pipeline.coll.max_mtime(prefix=self.prefix)
        pending_keys = list(self.pipeline.coll.iter_keys(prefix=self.prefix, min_mtime=since))
        self.logger.info("Extracting exif for %s files", len(pending_keys))
        metadata = extract_exif(
            file_keys=pending_keys,
            pipeline=self.pipeline,
            progress=self.progress.subtask(0.9),
            logger=self.logger,
        )

        self.logger.info("Saving EXIF to database")
        save_exif_database(file_keys=pending_keys, exif_df=metadata, pipeline=self.pipeline)
        exif_target.write_log(newest_mtime)
        self.progress.complete()


class ExifReportTask(PipelineTask):
"""Extract EXIF data and save it to the CSV."""

prefix: str = luigi.Parameter(default=".")
Expand Down Expand Up @@ -59,13 +139,8 @@ def run(self):
self.logger.info("Removing previous EXIF file %s", latest_path)
os.remove(latest_path)

if self.config.database.use:
self.logger.info("Saving EXIF to database")
save_exif_database(file_keys=file_keys, exif_df=exif_df, pipeline=self.pipeline)
self.progress.complete()


class ExifFileListFileTask(PipelineTask):
class ExifReportFileListFileTask(PipelineTask):
"""Extract EXIF data and save it to the CSV."""

path_list_file: str = luigi.Parameter()
Expand Down Expand Up @@ -94,11 +169,6 @@ def run(self):
exif_df.to_csv(output)
self.progress.increase(0.05)

if self.config.database.use:
self.logger.info("Saving EXIF to database")
save_exif_database(file_keys=file_keys, exif_df=exif_df, pipeline=self.pipeline)
self.progress.complete()

@cached_property
def result_path(self) -> str:
"""Resolved result report path."""
Expand All @@ -108,6 +178,16 @@ def result_path(self) -> str:
return os.path.join(self.output_directory, "exif", f"exif_list_{list_hash}.csv")


class ExifTask(PipelineTask):
    """Umbrella task requiring both the EXIF report and the database EXIF tasks."""

    prefix: str = luigi.Parameter(default=".")

    def requires(self):
        """Yield the EXIF report task, then the database EXIF task, for this prefix."""
        for task_class in (ExifReportTask, DBExifTask):
            yield task_class(config=self.config, prefix=self.prefix)


def extract_exif(
file_keys: Iterable[FileKey],
pipeline: PipelineContext,
Expand Down
4 changes: 2 additions & 2 deletions winnow/storage/db_result_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import logging
from functools import wraps
from time import time
from typing import Dict
from typing import Dict, Iterator, Tuple

from sqlalchemy import tuple_
from sqlalchemy.orm import joinedload, aliased
Expand Down Expand Up @@ -282,7 +282,7 @@ def add_file_exif(self, path, sha256, exif):
session.add(exif_entity)

@benchmark
def add_exifs(self, entries):
def add_exifs(self, entries: Iterator[Tuple[str, str, Dict]]):
"""Add metadata to multiple files.
Args:
Expand Down

0 comments on commit 3d06e7e

Please sign in to comment.