Merge branch 'main' into debug0

Softcatala · May 4, 2024 · 599bee7 · 599bee7
2 parents 422d20f + 09bce62
commit 599bee7
Show file tree

Hide file tree

Showing 15 changed files with 349 additions and 213 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -0,0 +1,25 @@
+# This workflow will install Python dependencies, run tests and lint with a single version of Python
+
+name: CI
+
+on: [push, pull_request]
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v4
+    - name: Install dependencies
+      run: |
+        python --version
+        python -m pip install --upgrade pip
+        pip install -r transcribe-batch/requirements.txt 
+        make install-dev-tools
+    - name: Test
+      run: |
+        make test
+    - name: check code
+      run: |
+        make run-check-code
diff --git a/Makefile b/Makefile
@@ -1,4 +1,4 @@
-.PHONY: docker-build-transcribe-models docker-build-transcribe-service docker-build-transcribe-batch docker-run test whisper-models benchmark-run
+.PHONY: docker-build-transcribe-models docker-build-transcribe-service docker-build-transcribe-batch docker-run test whisper-models benchmark-run install-dev-tools run-check-code
 
 build-all: docker-build-transcribe-models docker-build-transcribe-service docker-build-transcribe-batch
 
@@ -30,3 +30,10 @@ benchmark-run: whisper-models benchmark-samples
 	@python3 -c 'import faster_whisper; print(f"faster_whisper: {faster_whisper.__version__}")'
 	@python3 -c 'import ctranslate2; print(f"ctranslate2: {ctranslate2.__version__}")'
 	@whisper-ctranslate2 --version
+
+install-dev-tools:
+	pip install -r requirements-dev.txt
+
+run-check-code:
+	python -m black --check transcribe-batch/ transcribe-service/
+	python -m flake8 --ignore E501,W503 transcribe-batch/ transcribe-service/
diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -0,0 +1,5 @@
+jiwer
+flake8==7.*
+black==24.*
+nose2
+
diff --git a/transcribe-batch/batchfilesdb.py b/transcribe-batch/batchfilesdb.py
@@ -23,18 +23,20 @@
 import logging
 from typing import Optional
 
-class BatchFile():
-    def __init__(self,
-                 filename_dbrecord: str,
-                 filename: str,
-                 email: str,
-                 model_name: str,
-                 original_filename: str,
-                 estimated_time: int,
-                 highlight_words: Optional[int] = None,
-                 num_chars: Optional[int] = None,
-                 num_sentences: Optional[int] = None):
-
+
+class BatchFile:
+    def __init__(
+        self,
+        filename_dbrecord: str,
+        filename: str,
+        email: str,
+        model_name: str,
+        original_filename: str,
+        estimated_time: int,
+        highlight_words: Optional[int] = None,
+        num_chars: Optional[int] = None,
+        num_sentences: Optional[int] = None,
+    ):
         self.filename_dbrecord = filename_dbrecord
         self.filename = filename
         self.email = email
@@ -45,11 +47,13 @@ def __init__(self,
         self.num_chars = num_chars
         self.num_sentences = num_sentences
 
+
 # This is a disk-based priority queue that works with filenames
 # as the items to store
-class Queue(): # works with filenames
+class Queue:  # works with filenames
     g_check_directory = True
-    def __init__(self, entries = '/srv/data/entries'):
+
+    def __init__(self, entries="/srv/data/entries"):
         self.ENTRIES = entries
 
     def _find(self, directory, pattern):
@@ -84,9 +88,7 @@ def delete(self, filename):
         os.remove(filename)
 
 
-
 class BatchFilesDB(Queue):
-
     SEPARATOR = "\t"
 
     def get_record_file_from_uuid(self, _uuid):
@@ -101,20 +103,28 @@ def _optional_int(self, string):
     def _optional_bool(self, string):
         return None if string == "None" or len(string) == 0 else string == "True"
 
-    def create(self, filename, email, model_name, original_filename, highlight_words = None,
-                  num_chars = None, num_sentences = None, record_uuid = None):
-
+    def create(
+        self,
+        filename,
+        email,
+        model_name,
+        original_filename,
+        highlight_words=None,
+        num_chars=None,
+        num_sentences=None,
+        record_uuid=None,
+    ):
         if not record_uuid:
             record_uuid = self.get_new_uuid()
 
         filename_dbrecord = self.get_record_file_from_uuid(record_uuid)
-        _estimated_time = 0 # Old field no longer used
-        line =  f"v2{self.SEPARATOR}{filename}{self.SEPARATOR}{email}{self.SEPARATOR}{model_name}{self.SEPARATOR}{original_filename}{self.SEPARATOR}{_estimated_time}"
+        _estimated_time = 0  # Old field no longer used
+        line = f"v2{self.SEPARATOR}{filename}{self.SEPARATOR}{email}{self.SEPARATOR}{model_name}{self.SEPARATOR}{original_filename}{self.SEPARATOR}{_estimated_time}"
         line += f"{self.SEPARATOR}{highlight_words}{self.SEPARATOR}{num_chars}{self.SEPARATOR}{num_sentences}"
         self.put(filename_dbrecord, line)
         return record_uuid
 
-    def select(self, email = None):
+    def select(self, email=None):
         filenames = self.get_all()
         records = []
         for filename in filenames:
@@ -126,7 +136,7 @@ def select(self, email = None):
             records.append(record)
 
         return records
-        
+
     def _read_record_from_uuid(self, _uuid):
         record_fullpath = os.path.join(self.ENTRIES, _uuid + ".dbrecord")
         record = self._read_record(record_fullpath)
@@ -138,18 +148,22 @@ def _read_record(self, filename_dbrecord):
                 line = fh.readline()
                 components = line.split(self.SEPARATOR)
                 if components[0] == "v2":
-                     return BatchFile(filename_dbrecord = filename_dbrecord,
-                                     filename = components[1],
-                                     email = components[2],
-                                     model_name = components[3],
-                                     original_filename = components[4],
-                                     estimated_time = int(components[5]),
-                                     highlight_words = self._optional_bool(components[6]),
-                                     num_chars = self._optional_int(components[7]),
-                                     num_sentences = self._optional_int(components[8]))
+                    return BatchFile(
+                        filename_dbrecord=filename_dbrecord,
+                        filename=components[1],
+                        email=components[2],
+                        model_name=components[3],
+                        original_filename=components[4],
+                        estimated_time=int(components[5]),
+                        highlight_words=self._optional_bool(components[6]),
+                        num_chars=self._optional_int(components[7]),
+                        num_sentences=self._optional_int(components[8]),
+                    )
                 else:
                     raise RuntimeError("dbrecord version not supported")
 
         except Exception as exception:
-            logging.error(f"_read_record. Unable to read {filename_dbrecord}. Error: {exception}")
+            logging.error(
+                f"_read_record. Unable to read {filename_dbrecord}. Error: {exception}"
+            )
             return None
diff --git a/transcribe-batch/execution.py b/transcribe-batch/execution.py
@@ -29,8 +29,8 @@
 from typing import Optional
 from langdetect import detect_langs
 
-class Command(object):
 
+class Command(object):
     TIMEOUT_ERROR = -1
     NO_ERROR = 0
 
@@ -67,8 +67,6 @@ def target():
 
 
 class Execution(object):
-
-
     def __init__(self, threads):
         self.threads = threads
 
@@ -131,20 +129,23 @@ def _sox_errors(self, sox_errfile):
             logging.error(f"_sox_errors. Error: {exception}")
             return return_code
 
-    def run_conversion(self,
-                       original_filename: str,
-                       source_file: str,
-                       converted_audio: str,
-                       timeout: int):
-
+    def run_conversion(
+        self,
+        original_filename: str,
+        source_file: str,
+        converted_audio: str,
+        timeout: int,
+    ):
         result = self._run_ffmpeg(source_file, converted_audio, timeout)
         if result != Command.NO_ERROR:
             converted_audio_fix = tempfile.NamedTemporaryFile().name + ".wav"
 
             _format = self._get_extension(original_filename)
             sox_errfile = "sox-error.log"
 
-            cmd = f"sox -t {_format} {source_file} {converted_audio_fix} 2> {sox_errfile}"
+            cmd = (
+                f"sox -t {_format} {source_file} {converted_audio_fix} 2> {sox_errfile}"
+            )
             Command(cmd).run(timeout=timeout)
             result = self._sox_errors(sox_errfile)
             logging.debug(f"Run {cmd} with result {result}")
@@ -155,7 +156,6 @@ def run_conversion(self,
 
         return result
 
-
     def _whisper_errors(self, whisper_errfile):
         try:
             if os.path.getsize(whisper_errfile) == 0:
@@ -168,17 +168,17 @@ def _whisper_errors(self, whisper_errfile):
         except Exception as exception:
             logging.error(f"whisper_errfile. Error: {exception}")
 
-
-    def run_inference(self,
-                    source_file: str,
-                    original_filename: str,
-                    model: str,
-                    converted_audio: str,
-                    timeout: int,
-                    highlight_words: Optional[int] = None,
-                    num_chars: Optional[int] = None,
-                    num_sentences: Optional[int] = None):
-
+    def run_inference(
+        self,
+        source_file: str,
+        original_filename: str,
+        model: str,
+        converted_audio: str,
+        timeout: int,
+        highlight_words: Optional[int] = None,
+        num_chars: Optional[int] = None,
+        num_sentences: Optional[int] = None,
+    ):
         WHISPER_PATH = "whisper-ctranslate2"
         OUTPUT_DIR = "output_dir/"
         options = ""
@@ -187,7 +187,7 @@ def run_inference(self,
         if highlight_words:
             options += " --highlight_words True"
             word_timestamps = True
- 
+
         if num_chars:
             options += f" --max_line_width {num_chars}"
             word_timestamps = True
@@ -201,8 +201,8 @@ def run_inference(self,
 
         logging.debug(f"Options: {options}")
         start_time = datetime.datetime.now()
-        compute_type = os.environ.get('COMPUTE_TYPE', "int8")
-        verbose = os.environ.get('WHISPER_VERBOSE', "false").lower()
+        compute_type = os.environ.get("COMPUTE_TYPE", "int8")
+        verbose = os.environ.get("WHISPER_VERBOSE", "false").lower()
         device = os.environ.get("DEVICE", "cpu")
         device_index = os.environ.get("DEVICE_INDEX", "0")
         redirect = " > /dev/null" if verbose == "false" else ""
@@ -215,7 +215,7 @@ def run_inference(self,
         end_time = datetime.datetime.now() - start_time
 
         logging.debug(f"Run {cmd} in {end_time} with result {result}")
-            
+
         if os.path.exists(converted_audio):
             os.remove(converted_audio)
 
@@ -237,7 +237,9 @@ def get_transcription_language(self, file_txt):
                     _lang = detect_langs(all_text)[0]
                     language = _lang.lang
                     prob = _lang.prob
-                    logging.debug(f"get_transcription_language. language: {language}, prob: {prob}")
+                    logging.debug(
+                        f"get_transcription_language. language: {language}, prob: {prob}"
+                    )
 
             logging.debug(f"get_transcription_language. size: {size}, lang: {language}")
             return language

diff --git a/transcribe-batch/lockfile.py b/transcribe-batch/lockfile.py
@@ -22,8 +22,8 @@
 import os
 import time
 
-class LockFile():
 
+class LockFile:
     def __init__(self, filename):
         self.filename = filename + ".lock"
 
@@ -33,7 +33,9 @@ def create(self):
             fh.write("")
             fh.close()
         except Exception as e:
-            logging.error(f"LockFile.create. Failed to create lock for {self.filename} - {str(e)}")
+            logging.error(
+                f"LockFile.create. Failed to create lock for {self.filename} - {str(e)}"
+            )
             return False
 
         return True
@@ -43,7 +45,9 @@ def delete(self):
             try:
                 os.remove(self.filename)
             except Exception as e:
-                logging.error(f"LockFile.delete. Error deleting file {self.filename}: {e}")
+                logging.error(
+                    f"LockFile.delete. Error deleting file {self.filename}: {e}"
+                )
 
     def has_lock(self):
         has_lock = False
@@ -61,4 +65,3 @@ def has_lock(self):
             has_lock = True
 
         return has_lock
-