Skip to content

Commit

Permalink
Merge branch 'main' into debug0
Browse files Browse the repository at this point in the history
  • Loading branch information
jordimas committed May 4, 2024
2 parents 422d20f + 09bce62 commit 599bee7
Show file tree
Hide file tree
Showing 15 changed files with 349 additions and 213 deletions.
25 changes: 25 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# This workflow will install Python dependencies, run tests and lint with a single version of Python
# NOTE(review): there is no setup-python step here, so the interpreter is
# whatever `python` resolves to on the runner image; `python --version` below
# records it in the job log.

name: CI

# Triggered on every push and on every pull request.
on: [push, pull_request]

jobs:
build:

runs-on: ubuntu-latest

steps:
# Fetch the repository contents so the Makefile and requirements files are available.
- uses: actions/checkout@v4
# Upgrade pip, install the transcribe-batch requirements, then the dev tools
# through the Makefile's install-dev-tools target.
- name: Install dependencies
run: |
python --version
python -m pip install --upgrade pip
pip install -r transcribe-batch/requirements.txt
make install-dev-tools
# Runs the Makefile `test` target (its recipe is defined in the Makefile).
- name: Test
run: |
make test
# Runs the Makefile `run-check-code` target (formatting and lint checks).
- name: check code
run: |
make run-check-code
9 changes: 8 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
.PHONY: docker-build-transcribe-models docker-build-transcribe-service docker-build-transcribe-batch docker-run test whisper-models benchmark-run
.PHONY: docker-build-transcribe-models docker-build-transcribe-service docker-build-transcribe-batch docker-run test whisper-models benchmark-run install-dev-tools run-check-code

build-all: docker-build-transcribe-models docker-build-transcribe-service docker-build-transcribe-batch

Expand Down Expand Up @@ -30,3 +30,10 @@ benchmark-run: whisper-models benchmark-samples
@python3 -c 'import faster_whisper; print(f"faster_whisper: {faster_whisper.__version__}")'
@python3 -c 'import ctranslate2; print(f"ctranslate2: {ctranslate2.__version__}")'
@whisper-ctranslate2 --version

# Install the development tooling listed in requirements-dev.txt.
install-dev-tools:
pip install -r requirements-dev.txt

# Verify formatting and lint for the transcribe-batch/ and transcribe-service/
# directories. black runs with --check, so it only reports files it would
# reformat; flake8 runs with E501 (line too long) and W503 (line break before
# binary operator) ignored.
run-check-code:
python -m black --check transcribe-batch/ transcribe-service/
python -m flake8 --ignore E501,W503 transcribe-batch/ transcribe-service/
5 changes: 5 additions & 0 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
jiwer
flake8==7.*
black==24.*
nose2

80 changes: 47 additions & 33 deletions transcribe-batch/batchfilesdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,18 +23,20 @@
import logging
from typing import Optional

class BatchFile():
def __init__(self,
filename_dbrecord: str,
filename: str,
email: str,
model_name: str,
original_filename: str,
estimated_time: int,
highlight_words: Optional[int] = None,
num_chars: Optional[int] = None,
num_sentences: Optional[int] = None):


class BatchFile:
def __init__(
self,
filename_dbrecord: str,
filename: str,
email: str,
model_name: str,
original_filename: str,
estimated_time: int,
highlight_words: Optional[int] = None,
num_chars: Optional[int] = None,
num_sentences: Optional[int] = None,
):
self.filename_dbrecord = filename_dbrecord
self.filename = filename
self.email = email
Expand All @@ -45,11 +47,13 @@ def __init__(self,
self.num_chars = num_chars
self.num_sentences = num_sentences


# This is a disk-based priority queue which works with filenames
# as the items to store
class Queue(): # works with filenames
class Queue: # works with filenames
g_check_directory = True
def __init__(self, entries = '/srv/data/entries'):

def __init__(self, entries="/srv/data/entries"):
self.ENTRIES = entries

def _find(self, directory, pattern):
Expand Down Expand Up @@ -84,9 +88,7 @@ def delete(self, filename):
os.remove(filename)



class BatchFilesDB(Queue):

SEPARATOR = "\t"

def get_record_file_from_uuid(self, _uuid):
Expand All @@ -101,20 +103,28 @@ def _optional_int(self, string):
def _optional_bool(self, string):
return None if string == "None" or len(string) == 0 else string == "True"

def create(self, filename, email, model_name, original_filename, highlight_words = None,
num_chars = None, num_sentences = None, record_uuid = None):

def create(
self,
filename,
email,
model_name,
original_filename,
highlight_words=None,
num_chars=None,
num_sentences=None,
record_uuid=None,
):
if not record_uuid:
record_uuid = self.get_new_uuid()

filename_dbrecord = self.get_record_file_from_uuid(record_uuid)
_estimated_time = 0 # Old field no longer used
line = f"v2{self.SEPARATOR}{filename}{self.SEPARATOR}{email}{self.SEPARATOR}{model_name}{self.SEPARATOR}{original_filename}{self.SEPARATOR}{_estimated_time}"
_estimated_time = 0 # Old field no longer used
line = f"v2{self.SEPARATOR}{filename}{self.SEPARATOR}{email}{self.SEPARATOR}{model_name}{self.SEPARATOR}{original_filename}{self.SEPARATOR}{_estimated_time}"
line += f"{self.SEPARATOR}{highlight_words}{self.SEPARATOR}{num_chars}{self.SEPARATOR}{num_sentences}"
self.put(filename_dbrecord, line)
return record_uuid

def select(self, email = None):
def select(self, email=None):
filenames = self.get_all()
records = []
for filename in filenames:
Expand All @@ -126,7 +136,7 @@ def select(self, email = None):
records.append(record)

return records

def _read_record_from_uuid(self, _uuid):
record_fullpath = os.path.join(self.ENTRIES, _uuid + ".dbrecord")
record = self._read_record(record_fullpath)
Expand All @@ -138,18 +148,22 @@ def _read_record(self, filename_dbrecord):
line = fh.readline()
components = line.split(self.SEPARATOR)
if components[0] == "v2":
return BatchFile(filename_dbrecord = filename_dbrecord,
filename = components[1],
email = components[2],
model_name = components[3],
original_filename = components[4],
estimated_time = int(components[5]),
highlight_words = self._optional_bool(components[6]),
num_chars = self._optional_int(components[7]),
num_sentences = self._optional_int(components[8]))
return BatchFile(
filename_dbrecord=filename_dbrecord,
filename=components[1],
email=components[2],
model_name=components[3],
original_filename=components[4],
estimated_time=int(components[5]),
highlight_words=self._optional_bool(components[6]),
num_chars=self._optional_int(components[7]),
num_sentences=self._optional_int(components[8]),
)
else:
raise RuntimeError("dbrecord version not supported")

except Exception as exception:
logging.error(f"_read_record. Unable to read {filename_dbrecord}. Error: {exception}")
logging.error(
f"_read_record. Unable to read {filename_dbrecord}. Error: {exception}"
)
return None
56 changes: 29 additions & 27 deletions transcribe-batch/execution.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@
from typing import Optional
from langdetect import detect_langs

class Command(object):

class Command(object):
TIMEOUT_ERROR = -1
NO_ERROR = 0

Expand Down Expand Up @@ -67,8 +67,6 @@ def target():


class Execution(object):


def __init__(self, threads):
self.threads = threads

Expand Down Expand Up @@ -131,20 +129,23 @@ def _sox_errors(self, sox_errfile):
logging.error(f"_sox_errors. Error: {exception}")
return return_code

def run_conversion(self,
original_filename: str,
source_file: str,
converted_audio: str,
timeout: int):

def run_conversion(
self,
original_filename: str,
source_file: str,
converted_audio: str,
timeout: int,
):
result = self._run_ffmpeg(source_file, converted_audio, timeout)
if result != Command.NO_ERROR:
converted_audio_fix = tempfile.NamedTemporaryFile().name + ".wav"

_format = self._get_extension(original_filename)
sox_errfile = "sox-error.log"

cmd = f"sox -t {_format} {source_file} {converted_audio_fix} 2> {sox_errfile}"
cmd = (
f"sox -t {_format} {source_file} {converted_audio_fix} 2> {sox_errfile}"
)
Command(cmd).run(timeout=timeout)
result = self._sox_errors(sox_errfile)
logging.debug(f"Run {cmd} with result {result}")
Expand All @@ -155,7 +156,6 @@ def run_conversion(self,

return result


def _whisper_errors(self, whisper_errfile):
try:
if os.path.getsize(whisper_errfile) == 0:
Expand All @@ -168,17 +168,17 @@ def _whisper_errors(self, whisper_errfile):
except Exception as exception:
logging.error(f"whisper_errfile. Error: {exception}")


def run_inference(self,
source_file: str,
original_filename: str,
model: str,
converted_audio: str,
timeout: int,
highlight_words: Optional[int] = None,
num_chars: Optional[int] = None,
num_sentences: Optional[int] = None):

def run_inference(
self,
source_file: str,
original_filename: str,
model: str,
converted_audio: str,
timeout: int,
highlight_words: Optional[int] = None,
num_chars: Optional[int] = None,
num_sentences: Optional[int] = None,
):
WHISPER_PATH = "whisper-ctranslate2"
OUTPUT_DIR = "output_dir/"
options = ""
Expand All @@ -187,7 +187,7 @@ def run_inference(self,
if highlight_words:
options += " --highlight_words True"
word_timestamps = True

if num_chars:
options += f" --max_line_width {num_chars}"
word_timestamps = True
Expand All @@ -201,8 +201,8 @@ def run_inference(self,

logging.debug(f"Options: {options}")
start_time = datetime.datetime.now()
compute_type = os.environ.get('COMPUTE_TYPE', "int8")
verbose = os.environ.get('WHISPER_VERBOSE', "false").lower()
compute_type = os.environ.get("COMPUTE_TYPE", "int8")
verbose = os.environ.get("WHISPER_VERBOSE", "false").lower()
device = os.environ.get("DEVICE", "cpu")
device_index = os.environ.get("DEVICE_INDEX", "0")
redirect = " > /dev/null" if verbose == "false" else ""
Expand All @@ -215,7 +215,7 @@ def run_inference(self,
end_time = datetime.datetime.now() - start_time

logging.debug(f"Run {cmd} in {end_time} with result {result}")

if os.path.exists(converted_audio):
os.remove(converted_audio)

Expand All @@ -237,7 +237,9 @@ def get_transcription_language(self, file_txt):
_lang = detect_langs(all_text)[0]
language = _lang.lang
prob = _lang.prob
logging.debug(f"get_transcription_language. language: {language}, prob: {prob}")
logging.debug(
f"get_transcription_language. language: {language}, prob: {prob}"
)

logging.debug(f"get_transcription_language. size: {size}, lang: {language}")
return language
Expand Down
11 changes: 7 additions & 4 deletions transcribe-batch/lockfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@
import os
import time

class LockFile():

class LockFile:
def __init__(self, filename):
self.filename = filename + ".lock"

Expand All @@ -33,7 +33,9 @@ def create(self):
fh.write("")
fh.close()
except Exception as e:
logging.error(f"LockFile.create. Failed to create lock for {self.filename} - {str(e)}")
logging.error(
f"LockFile.create. Failed to create lock for {self.filename} - {str(e)}"
)
return False

return True
Expand All @@ -43,7 +45,9 @@ def delete(self):
try:
os.remove(self.filename)
except Exception as e:
logging.error(f"LockFile.delete. Error deleting file {self.filename}: {e}")
logging.error(
f"LockFile.delete. Error deleting file {self.filename}: {e}"
)

def has_lock(self):
has_lock = False
Expand All @@ -61,4 +65,3 @@ def has_lock(self):
has_lock = True

return has_lock

Loading

0 comments on commit 599bee7

Please sign in to comment.