Skip to content

Commit

Permalink
Add find_media_errors.py to find critical errors in media files
Browse files Browse the repository at this point in the history
  • Loading branch information
double16 committed Aug 9, 2024
1 parent 109aa7d commit 283a947
Show file tree
Hide file tree
Showing 4 changed files with 236 additions and 3 deletions.
1 change: 1 addition & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ RUN chmod 0644 /etc/logrotate.d/dvr &&\
ln -s /usr/local/share/dvrprocess/edl_normalize.py /usr/local/bin/ &&\
ln -s /usr/local/share/dvrprocess/find_need_transcode.py /usr/local/bin/ &&\
ln -s /usr/local/share/dvrprocess/find_need_comcut.py /usr/local/bin/ &&\
ln -s /usr/local/share/dvrprocess/find_media_errors.py /usr/local/bin/ &&\
ln -s /usr/local/share/dvrprocess/transcode-apply.py /usr/local/bin/ &&\
ln -s /usr/local/share/dvrprocess/smart-comcut.py /usr/local/bin/ &&\
ln -s /usr/local/share/dvrprocess/tvshow-summary.py /usr/local/bin/ &&\
Expand Down
69 changes: 69 additions & 0 deletions dvrprocess/common/config.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import configparser
import hashlib
import logging
import os
import sys
import tempfile
import threading
from configparser import ConfigParser
from enum import Enum
from functools import lru_cache

Expand Down Expand Up @@ -263,3 +265,70 @@ def bytes_to_human_str(byte_count: int) -> str:
if byte_count > KILOBYTES_MULT:
return "{:.2f}M".format(float(byte_count) / KILOBYTES_MULT)
return str(byte_count)


def _mkv_fingerprint(path: str) -> str:
    """
    Compute a cheap fingerprint of a media file from its basename and size.

    Used to detect that a media file was replaced or modified, which
    invalidates any cached per-file configuration stored beside it.
    :param path: path to the media file.
    :return: hex digest string.
    """
    stat = os.stat(path)
    hash_object = hashlib.sha256()
    hash_object.update(os.path.basename(path).encode())
    hash_object.update(str(stat.st_size).encode())
    # Hashing the first 32k of content would be a stronger fingerprint, but it
    # costs extra I/O on every lookup; name + size is considered adequate here.
    # with open(path, 'rb') as file:
    #     hash_object.update(file.read(32768))
    return hash_object.hexdigest()


def get_file_config(path: str) -> ConfigParser:
    """
    Gets file level configuration using an .ini file.

    If the .ini file is missing, lacks a fingerprint, or its recorded
    fingerprint does not match the current media file, an empty config is
    returned instead of stale data.
    :param path: path to the media file.
    :return: config, never None
    """
    config_path = path + ".ini"
    config = ConfigParser()
    if os.path.exists(config_path):
        config.read(config_path)
        # fallback=None prevents NoSectionError/NoOptionError when the .ini
        # exists but was not written by set_file_config (previously this raised).
        if config.get('general', 'fingerprint', fallback=None) != _mkv_fingerprint(path):
            config = ConfigParser()
    return config


def set_file_config(path: str, config: ConfigParser):
    """
    Sets file level configuration using an .ini file.
    :param path: path to the media file.
    :param config: the config
    """
    # Record the current fingerprint so stale configs are detected on read.
    if not config.has_section('general'):
        config.add_section('general')
    config.set('general', 'fingerprint', _mkv_fingerprint(path))
    with open(path + ".ini", "wt") as ini_file:
        config.write(ini_file)


def get_file_config_option(path: str, section: str, option: str) -> "str | None":
    """
    Get an option from the file config.
    :param path: path to the media file (config is read from path + ".ini")
    :param section: config section name
    :param option: option name within the section
    :return: the option value, or None when the section or option is absent
    """
    return get_file_config(path).get(section, option, fallback=None)


def set_file_config_option(path: str, section: str, option: str, value: str):
    """
    Set an option for the file config.
    :param path: path to the media file
    :param section: config section name
    :param option: option name within the section
    :param value: the value to store
    :return:
    """
    # Read-modify-write: load the validated config, apply the single option,
    # then persist with a refreshed fingerprint.
    file_config = get_file_config(path)
    try:
        file_config.set(section, option, value)
    except configparser.NoSectionError:
        file_config.add_section(section)
        file_config.set(section, option, value)
    set_file_config(path, file_config)
164 changes: 164 additions & 0 deletions dvrprocess/find_media_errors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
#!/usr/bin/env python3
import getopt
import logging
import os
import subprocess
import sys
import time
from collections.abc import Iterable

import common
from common import tools, config

logger = logging.getLogger(__name__)

ERROR_THRESHOLD = 300


def usage():
    """Print command-line usage/help for this tool to stderr."""
    print(f"""{sys.argv[0]} [media_paths]
List media files with errors that exceed a threshold.
Output options:
1. Absolute paths terminated with null (this can be changed) with the intent to be piped into xargs or similar tool.
2. Nagios monitoring output, which is also human readable. This also provides some estimates on time to transcode.
-t, --terminator="\\n"
    Set the output terminator, defaults to null (0).
-d, --dir=
    Directory containing media. Defaults to {common.get_media_roots()}
--nagios
    Output for Nagios monitoring. Also human readable with statistics and estimates of transcode time.
--time-limit={config.get_global_config_option('background_limits', 'time_limit')}
    Limit runtime. Set to 0 for no limit.
--ignore-compute
    Ignore current compute availability.
""", file=sys.stderr)


def find_media_errors_cli(argv):
    """
    Command line interface for listing media files with excessive errors.
    :param argv: command line arguments, excluding the program name.
    :return: process exit code (also Nagios state code in --nagios mode).
    """
    roots = []
    terminator = '\0'
    nagios_output = False
    time_limit = config.get_global_config_time_seconds('background_limits', 'time_limit')
    check_compute = True

    try:
        # 'h' / '--help' are declared so the help branch below is reachable;
        # previously '-h' raised GetoptError instead of printing usage cleanly.
        opts, args = getopt.getopt(argv, "ht:d:",
                                   ["help", "terminator=", "dir=", "nagios", "time-limit=", "ignore-compute"])
    except getopt.GetoptError:
        usage()
        return 2
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage()
            return 2
        elif opt in ("-d", "--dir"):
            roots.append(arg)
        elif opt == '--nagios':
            nagios_output = True
        elif opt in ("-t", "--terminator"):
            # Accept shell-escaped forms of the common terminators.
            if arg == '\\n':
                terminator = '\n'
            elif arg == '\\0':
                terminator = '\0'
            else:
                terminator = arg
        elif opt == '--time-limit':
            time_limit = config.parse_seconds(arg)
        elif opt == '--ignore-compute':
            check_compute = False

    if not roots:
        roots = common.get_media_roots()

    if args:
        media_paths = common.get_media_paths(roots, args)
    else:
        media_paths = common.get_media_paths(roots)
    logger.debug("media_paths = %s", media_paths)

    if common.check_already_running():
        return 0

    generator = media_errors_generator(media_paths=media_paths, media_roots=roots,
                                       time_limit=time_limit, check_compute=check_compute)

    if nagios_output:
        # Worst offenders first for the Nagios detail lines.
        corrupt_files = list(generator)
        corrupt_files.sort(key=lambda e: e.error_count, reverse=True)

        # Nagios state thresholds: more than 25 corrupt files is critical,
        # any corrupt file is at least a warning.
        if len(corrupt_files) > 25:
            level = "CRITICAL"
            code = 2
        elif len(corrupt_files) > 0:
            level = "WARNING"
            code = 1
        else:
            level = "OK"
            code = 0

        print(f"MEDIA_ERRORS {level}: files: {len(corrupt_files)} | MEDIA_ERRORS;{len(corrupt_files)}")
        for e in corrupt_files:
            print(f"{e.file_name};{e.error_count}")
        return code
    else:
        # Stream paths with the chosen terminator for piping into xargs etc.
        for e in generator:
            sys.stdout.write(e.file_name)
            sys.stdout.write(terminator)
        return 0


class MediaErrorFileInfo(object):
    """
    Value object describing a media file whose ffmpeg-reported error count
    exceeded the threshold.
    """

    def __init__(self, file_name: str, host_file_path: str, size: float, error_count: int):
        """
        :param file_name: path relative to the media root, as presented to the user
        :param host_file_path: absolute path on the host filesystem
        :param size: file size in bytes
        :param error_count: number of error lines reported by ffmpeg
        """
        self.file_name = file_name
        self.host_file_path = host_file_path
        self.size = size
        self.error_count = error_count

    def __repr__(self):
        # Aids debugging and logging; output formatting does not rely on this.
        return (f"{type(self).__name__}(file_name={self.file_name!r}, "
                f"host_file_path={self.host_file_path!r}, size={self.size!r}, "
                f"error_count={self.error_count!r})")


def media_errors_generator(media_paths: list[str], media_roots: list[str],
                           time_limit=None,
                           check_compute=True) -> Iterable[MediaErrorFileInfo]:
    """
    Walk the media paths and yield info for each .mkv file whose ffmpeg error
    count exceeds ERROR_THRESHOLD. Counts are cached in a per-file .ini so
    each file is only decoded once.
    :param media_paths: directories to scan.
    :param media_roots: media root directories used to compute relative names.
    :param time_limit: maximum runtime in seconds, 0 for no limit. Defaults to
        the configured 'background_limits'/'time_limit' value, resolved at
        call time.
    :param check_compute: when True, stop running ffmpeg once compute is
        constrained and rely on cached counts only.
    :return: generator of MediaErrorFileInfo.
    """
    if time_limit is None:
        # Resolve the default lazily: a call in the signature would be frozen
        # at import time and ignore later configuration changes.
        time_limit = config.get_global_config_time_seconds('background_limits', 'time_limit')
    time_start = time.time()

    for media_path in media_paths:
        for root, dirs, files in os.walk(media_path, topdown=True):
            for file in common.filter_for_mkv(files):
                duration = time.time() - time_start
                if 0 < time_limit < duration:
                    logger.debug(
                        f"Exiting normally after processing {common.s_to_ts(int(duration))}, limit of {common.s_to_ts(time_limit)} reached")
                    return

                filepath = os.path.join(root, file)
                cached_error_count = config.get_file_config_option(filepath, 'error', 'count')
                if cached_error_count:
                    error_count = int(cached_error_count)
                else:
                    if check_compute and common.should_stop_processing():
                        # when compute limit is reached, use cached data
                        logger.debug("not enough compute available, only using cached data")
                        continue
                    try:
                        # Decode to null outputs so ffmpeg reports errors without
                        # producing a file; each stderr line counts as one error.
                        ffmpeg_output = tools.ffmpeg.check_output(
                            ['-y', '-v', 'error', '-i', filepath, '-c:v', 'vnull', '-c:a', 'anull',
                             '-f', 'null', '/dev/null'],
                            stderr=subprocess.STDOUT, text=True)
                    except subprocess.CalledProcessError as e:
                        # A non-zero ffmpeg exit (severely corrupt file) must not
                        # abort the whole scan; count whatever was reported.
                        ffmpeg_output = e.output or ''
                    error_count = len(ffmpeg_output.splitlines())
                    config.set_file_config_option(filepath, 'error', 'count', str(error_count))
                if error_count <= ERROR_THRESHOLD:
                    continue
                file_info = MediaErrorFileInfo(
                    file_name=common.get_media_file_relative_to_root(filepath, media_roots)[0],
                    host_file_path=filepath,
                    size=os.stat(filepath).st_size,
                    error_count=error_count)
                yield file_info


if __name__ == '__main__':
    # Lower the process priority: this is a background scan and should not
    # compete with interactive or transcoding workloads.
    os.nice(15)
    common.setup_cli(level=logging.ERROR, start_gauges=False)
    sys.exit(find_media_errors_cli(sys.argv[1:]))
5 changes: 2 additions & 3 deletions dvrprocess/find_need_comcut.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,6 @@
import common
from common import constants, edl_util

#
# Find media that has pending commercials to cut.
#

logger = logging.getLogger(__name__)

Expand All @@ -20,6 +17,8 @@ def usage():
List files needing commercials cut.
Output options:
1. Absolute paths terminated with null (this can be changed) with the intent to be piped into xargs or similar tool.
2. Nagios monitoring output, which is also human readable. This also provides some estimates on time to transcode.
Expand Down

0 comments on commit 283a947

Please sign in to comment.