Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Revert "Integrate text search" #469

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .github/workflows/ci-winnow-uts.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,14 @@ jobs:
- name: Check formatting with black
run: |
# Check formatting of the dedup-app Python files
black --line-length 120 --check *.py db winnow tests task_queue remote security template_support thumbnail cli rpc --exclude='rpc_pb2*'
black --line-length 120 --check *.py db winnow tests task_queue remote security template_support thumbnail cli
- name: Lint with flake8
run: |
# stop the build if there are Python syntax errors or undefined names
flake8 winnow db tests task_queue remote security template_support thumbnail cli rpc --exclude 'rpc_pb2*' --count --select=E9,F63,F7,F82 --show-source --statistics
flake8 winnow db tests task_queue remote security template_support thumbnail cli --count --select=E9,F63,F7,F82 --show-source --statistics
# --ignore=E203 as it is not PEP 8 compliant and conflicts with black style. See: https://github.com/psf/black/issues/315
# Also --ignore=W503 as it is not compatible with black style.
flake8 winnow db tests task_queue remote security template_support thumbnail cli rpc --exclude 'rpc_pb2*' --count --max-complexity=10 --ignore=E203,W503 --max-line-length=120 --statistics
flake8 winnow db tests task_queue remote security template_support thumbnail cli --count --max-complexity=10 --ignore=E203,W503 --max-line-length=120 --statistics
- name: Test with pytest
run: |
export PYTHONPATH="$(pwd)"
Expand Down
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,3 @@ winnow/feature_extraction/pretrained_models/**
notebooks/.ipynb_checkpoints/
tests/test_data/test_output/
.env
winnow/text_search/models/**
4 changes: 2 additions & 2 deletions cli/cli/handlers/db_getter.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def files(
req.audio = boolean("audio", audio)
req.min_length = valid_duration_millis("min_length", min_length)
req.max_length = valid_duration_millis("max_length", max_length)
req.include = [FileInclude.EXIF, FileInclude.SCENES, FileInclude.META]
req.include = [FileInclude.exif, FileInclude.scenes, FileInclude.meta]
req.extensions = extensions
req.date_from = valid_date("date_from", date_from)
req.date_to = valid_date("date_to", date_to)
Expand All @@ -69,7 +69,7 @@ def files(
database = Database.from_uri(self._config.database.uri)
with database.session_scope(expunge=True) as session:
results = FilesDAO.list_files(req, session)
files = [Transform.file(item.file) for item in results.items]
files = [Transform.file(file) for file in results.items]
formatter = resolve_formatter(format=output)
formatter.format(
files, fields, file=sys.stdout, highlights={"path": path, "hash": hash, "hash_short": hash}
Expand Down
5 changes: 3 additions & 2 deletions cli/cli/handlers/pipeline.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
import os

from winnow.pipeline.extract_exif import extract_exif
from winnow.pipeline.pipeline_context import PipelineContext


class PipelineCli:
"""Process video files."""
Expand All @@ -13,8 +16,6 @@ def all(self):
from winnow.pipeline.detect_scenes import detect_scenes
from winnow.pipeline.generate_local_matches import generate_local_matches
from winnow.utils.files import scan_videos
from winnow.pipeline.extract_exif import extract_exif
from winnow.pipeline.pipeline_context import PipelineContext

configure_logging_cli()

Expand Down
1 change: 0 additions & 1 deletion cli/security

This file was deleted.

47 changes: 10 additions & 37 deletions db/access/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import enum
import itertools
from datetime import datetime
from typing import List, Optional, Iterator, Set, Iterable, Collection
from typing import List, Optional, Iterator, Set

from dataclasses import dataclass
from sqlalchemy import or_, and_, func, literal_column, tuple_
Expand Down Expand Up @@ -51,7 +51,7 @@ class FileInclude(enum.Enum):
TEMPLATES = "templates"


# Some included data are mapped to the File fields.
# Some of the included data are mapped to the File fields.
_FILE_FIELDS = {
FileInclude.EXIF: Files.exif,
FileInclude.META: Files.meta,
Expand All @@ -68,8 +68,8 @@ def get_file_fields(included_fields: Set[FileInclude]):
class ListFilesRequest:
"""Parameters for list-files query."""

limit: Optional[int] = 20
offset: Optional[int] = 0
limit: int = 20
offset: int = 0
path_query: str = None
extensions: List[str] = ()
exif: bool = None
Expand All @@ -78,7 +78,7 @@ class ListFilesRequest:
max_length: int = None
date_from: datetime = None
date_to: datetime = None
include: Collection[FileInclude] = ()
include: List[FileInclude] = ()
sort: Optional[FileSort] = None
match_filter: FileMatchFilter = FileMatchFilter.ALL
related_distance: float = 0.4
Expand Down Expand Up @@ -242,27 +242,23 @@ class FilesDAO:
_countable_match = aliased(Matches)

@staticmethod
def list_files(
req: ListFilesRequest, session, entity=Files, selected_ids: Iterable[int] = None
) -> ListFilesResults:
def list_files(req: ListFilesRequest, session) -> ListFilesResults:
"""Query multiple files."""
# Count files
query = session.query(Files.id)
query = FilesDAO._filter_ids(query, selected_ids)
query = session.query(Files)
query = FilesDAO._filter_by_file_attributes(req, query)
counts = FilesDAO.counts(query, req.related_distance, req.duplicate_distance)

# Select files
included_values = FilesDAO._make_loaders(req)
query = session.query(entity, *(included.value for included in included_values))
query = FilesDAO._filter_ids(query, selected_ids)
query = session.query(Files, *(included.value for included in included_values))
query = FilesDAO._filter_by_file_attributes(req, query)
query = FilesDAO._filter_by_matches(req, query)
query = FilesDAO._advice_query(query, req, included_values)
query = FilesDAO._sort_items(req, query)
query = FilesDAO._apply_limit_offset(req, query)

# Retrieve slice
query = query.offset(req.offset).limit(req.limit)
items = FilesDAO._collect_items(query.all(), included_values)

return ListFilesResults(items=items, counts=counts)
Expand Down Expand Up @@ -299,15 +295,6 @@ def file_matches(file_id, session: Session, *, false_positive=False) -> Query:
query = query.filter(Matches.false_positive == false_positive)
return query

@staticmethod
def _apply_limit_offset(req: ListFilesRequest, query: Query) -> Query:
"""Apply limit and offset params."""
if req.limit is not None:
query = query.limit(req.limit)
if req.offset is not None:
query = query.offset(req.offset)
return query

@staticmethod
def _make_loaders(req: ListFilesRequest, loader_types=_VALUE_LOADERS) -> List[QueryValueLoader]:
"""Get loaders for required values."""
Expand All @@ -332,7 +319,7 @@ def _collect_items(items: List, included_values: List[QueryValueLoader]) -> List
# Each item is a File entity
return list(map(FileData, items))

# Otherwise, each item is a tuple of values
# Otherwise each item is a tuple of values
result = []
for item in items:
file_data = FileData(file=item[0])
Expand Down Expand Up @@ -368,13 +355,6 @@ def _sort_items(req: ListFilesRequest, query: Query) -> Query:
)
return query

@staticmethod
def _filter_ids(query: Query, selected_ids: Iterable[int] = None) -> Query:
"""Ensure ids are from the provided collection."""
if selected_ids is not None:
query = query.filter(Files.id.in_(tuple(selected_ids)))
return query

@staticmethod
def _filter_path(req: ListFilesRequest, query: Query) -> Query:
"""Filter by file name."""
Expand Down Expand Up @@ -516,13 +496,6 @@ def query_local_files(session: Session, path_hash_pairs) -> Query:
query = query.filter(tuple_(Files.file_path, Files.sha256).in_(tuple(path_hash_pairs)))
return query

@staticmethod
def query_local_file_ids(session: Session, path_hash_pairs) -> Query:
"""Query local files by (path, hash) pairs."""
query = session.query(Files.id).filter(Files.contributor == None) # noqa: E711
query = query.filter(tuple_(Files.file_path, Files.sha256).in_(tuple(path_hash_pairs)))
return query

@staticmethod
def query_remote_files(session: Session, repository_name: str = None, contributor_name: str = None) -> Query:
"""Query remote signatures from database."""
Expand Down
36 changes: 1 addition & 35 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ services:
redis:
image: "redis:alpine"
ports:
- "6379:6379"
- 6379:6379
networks:
- postgres-compose-network

Expand Down Expand Up @@ -108,40 +108,6 @@ services:
networks:
- postgres-compose-network

rpc:
image: "johnhbenetech/videodeduplication:${BENETECH_RUNTIME:-gpu}${BENETECH_MODE}"
build:
context: .
dockerfile: "docker/Dockerfile.dedup-${BENETECH_RUNTIME:-gpu}"
target: prod
args:
GIT_HASH: "${GIT_HASH:-UNKNOWN}"
runtime: "${BENETECH_DOCKER_RUNTIME:-nvidia}"
command: bash -ic "python -m rpc.server"
environment:
WINNOW_CONFIG: "/project/config/config.yaml"
RPC_SERVER_PORT: 50051
RPC_SERVER_HOST: rpc
ports:
- "50051:50051"
volumes:
# Set the BENETECH_DATA_LOCATION environment variable to the path
# on your host machine where you placed the source data
- "${BENETECH_DATA_LOCATION:?\n\nPlease set \"BENETECH_DATA_LOCATION\" environment variable to the root folder of your video files.}:/project/data"
# You may want to set the BENETECH_TASK_LOGS environment variable to
# keep pipeline logs in a specific directory in your host fs.
- "${BENETECH_TASK_LOGS:-pipeline-logs}:/project/pipeline-logs"
# You can specify the BENETECH_FILE_STORAGE_DIRECTORY environment variable to
# keep template examples in a specific directory in your host fs.
- "${BENETECH_FILE_STORAGE_DIRECTORY:-file-storage}:/project/file-storage"
# You can specify the BENETECH_CONFIG_DIRECTORY environment variable to
# keep application configs in a specific directory in your host fs.
- "${BENETECH_CONFIG_DIRECTORY:-config}:/project/config"
depends_on:
- postgres
networks:
- postgres-compose-network

server:
image: "johnhbenetech/videodeduplication:server${BENETECH_MODE}"
build:
Expand Down
7 changes: 2 additions & 5 deletions environment-gpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,19 +31,16 @@ dependencies:
- pyyaml
- cached-property
- dataclasses
- celery==5.0.5
- celery
- redis
- billiard
- librosa
- fire
- inquirer
- pytimeparse
- annoy
- yt-dlp
- youtube-dl
- dacite
- deprecation
- torch
- grpcio==1.43.0
- grpcio-tools==1.43.0

prefix: C:\ProgramData\Anaconda3\envs\winnow
7 changes: 2 additions & 5 deletions environment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,19 +32,16 @@ dependencies:
- pyyaml
- cached-property
- dataclasses
- celery==5.0.5
- celery
- redis
- billiard
- librosa
- fire
- inquirer
- pytimeparse
- annoy
- yt-dlp
- youtube-dl
- dacite
- deprecation
- torch
- grpcio==1.43.0
- grpcio-tools==1.43.0

prefix: C:\ProgramData\Anaconda3\envs\winnow
Loading