Commit

Merge pull request #91 from beeldengeluid/88-cleanup
Codebase is cleaned up!
jblom authored Sep 23, 2024
2 parents 989f3f0 + a837df6 commit aa4e026
Showing 27 changed files with 274 additions and 17,885 deletions.
3 changes: 1 addition & 2 deletions .dockerignore
@@ -14,5 +14,4 @@ __pycache__
s3-creds.env
.vscode
.env
.env.override
/data
.env.override
6 changes: 2 additions & 4 deletions .env
@@ -1,12 +1,10 @@
# passed to --input-uri (see main.py)
INPUT_URI=http://model-hosting.beng.nl/kaldi-nl-test.mp3
# uncomment if you have downloaded s3://x-omg-daan-av/dane-asr-worker-sample-data.tar.gz
# INPUT_URI=http://fake-hosting.beng.nl/2101608150135908031__NOS_JOURNAAL_-WON01207359.mp4
INPUT_URI=http://model-hosting.beng.nl/whisper-test.mp3

# passed to --output-uri (see main.py)
OUTPUT_URI=s3://x-omg-daan-av/assets/2101608150135908031__NOS_JOURNAAL_-WON01207359/

# make sure to mount this dir into the container (see docker-compose-dane-worker.yml)
# make sure to mount these dirs into the container (see docker-compose.yml)
DATA_BASE_DIR=./data
MODEL_BASE_DIR=./model

6 changes: 3 additions & 3 deletions .gitignore
@@ -2,11 +2,11 @@
!/data/README.md
!/data/input/
/data/input/*
!/data/input/testsource__testcarrier.wav
!/data/input/whisper-test.mp3
!/data/output
/data/output/*
!/data/output/testsource__testcarrier
!/data/output/testsource__testcarrier/*
!/data/output/whisper-test
!/data/output/whisper-test/*
/model/*
__pycache__
.pytest_cache
16 changes: 8 additions & 8 deletions Dockerfile
@@ -1,16 +1,16 @@
FROM nvidia/cuda:12.2.2-cudnn8-runtime-ubuntu22.04

# Create dirs for:
# - Injecting config.yml: /root/.DANE
# - Mount point for input & output files: /mnt/dane-fs
# - Storing the source code: /src
# - Storing the model: /model
RUN mkdir /root/.DANE /mnt/dane-fs /src /data /model

# Install ffmpeg
RUN apt-get update && \
apt-get install -y python3-pip python3.11-dev python-is-python3 ffmpeg && \
rm -rf /var/lib/apt/lists/*

# Create dirs for:
# - Storing the source code: /src
# - Storing the input & output files: /data
# - Storing the model: /model
RUN mkdir /src /data /model


WORKDIR /src

@@ -29,6 +29,6 @@ RUN poetry install --without dev --no-root && rm -rf $POETRY_CACHE_DIR
COPY ./ /src

# Write provenance info about software versions to file
RUN echo "dane-whisper-asr-worker;https://github.com/beeldengeluid/dane-whisper-asr-worker/commit/$(git rev-parse HEAD)" >> /software_provenance.txt
RUN echo "whisper-asr-worker;https://github.com/beeldengeluid/whisper-asr-worker/commit/$(git rev-parse HEAD)" >> /software_provenance.txt

ENTRYPOINT ["./docker-entrypoint.sh"]
78 changes: 67 additions & 11 deletions README.md
@@ -1,10 +1,72 @@
# dane-whisper-asr-worker
# whisper-asr-worker

ASR Worker that uses faster-whisper as the backend, to be used for transcribing AV material from B&G.

This is still a WIP, so it is subject to change.

There are 2 ways in which the whisper-asr-worker can be tested **(ON THE CPU)**:

## 1. Docker CPU run (recommended)

1. Check if Docker is installed
2. Make sure you have the `.env.override` file in your local repo folder
3. In `.env.override`, change `W_DEVICE` from `cuda` to `cpu`
4. Comment out the lines indicated in `docker-compose.yml`
5. Open your preferred terminal and navigate to the local repository folder
6. To build the image, execute the following command:
```
docker build . -t whisper-asr-worker
```
7. To run the worker, execute the following command:
```
docker compose up
```

## 2. Local CPU run

All commands should be run within WSL if on Windows or within your terminal if on Linux.

1. Follow the steps [here](https://github.com/beeldengeluid/dane-example-worker/wiki/Setting-up-a-new-worker) (under "Adding `pyproject.toml` and generating a `poetry.lock` based on it") to install Poetry and the dependencies required to run the worker
2. Make sure you have the `.env.override` file in your local repo folder
3. In `.env.override`, change `W_DEVICE` from `cuda` to `cpu`
4. Install `ffmpeg`. You can run this command, for example:
```
apt-get -y update && apt-get -y upgrade && apt-get install -y --no-install-recommends ffmpeg
```
5. Navigate to `scripts`, then execute the following command:
```
./run.sh
```

## Running the worker using a CUDA-compatible GPU

To run the worker with a CUDA-compatible GPU instead of the CPU, either:
- skip steps 3 & 4 from "Docker CPU run"
- skip step 3 from "Local run"

**(OUTDATED BUT STILL MIGHT BE RELEVANT)** To run it using a GPU via Docker, check [the instructions from the dane-example-worker](https://github.com/beeldengeluid/dane-example-worker/wiki/Containerization#running-the-container-locally-using-cuda-compatible-gpu).

Make sure to replace `dane-example-worker` in the `docker run` command with `dane-whisper-asr-worker`.

## Expected run

The expected run of this worker (whose pipeline is defined in `asr.py`) should

1. download the input file if it isn't downloaded already in `/data/input/` via `download.py`

2. download the model if not present via `model_download.py`

3. run `transcode.py` if the input file is a video to convert it to audio format (though there are plans to remove this and instead use the [audio-extraction-worker](https://github.com/beeldengeluid/audio-extraction-worker/) to extract the audio)

4. run `whisper.py` to transcribe the audio and save it in `/data/output/` if a transcription doesn't already exist
5. convert Whisper's output to DAAN index format using `daan_transcript.py`
6. (optional) transfer the output to an S3 bucket.

## Model options

If you prefer to use your own model that is stored locally, make sure to set `BASE_MOUNT_MODEL` to the path where the model files can be found. A model found locally will take precedence over downloading it from Huggingface or S3 (so, no matter how `WHISPER_ASR_SETTINGS.MODEL` is set, it will ignore it if a model is present locally).
If you prefer to use your own model that is stored locally, make sure to set `MODEL_BASE_DIR` to the path where the model files can be found. A model found locally will take precedence over downloading it from Huggingface or S3 (so, no matter how `W_MODEL` is set, it will ignore it if a model is present locally).

The pre-trained Whisper model version can be adjusted in the `config.yml` file by editing the `MODEL` parameter within `WHISPER_ASR_SETTINGS`. Possible options are:
The pre-trained Whisper model version can be adjusted in the `.env` file by editing the `W_MODEL` parameter. Possible options are:

|Size|Parameters|
|---|---|
@@ -16,12 +78,6 @@ The pre-trained Whisper model version can be adjusted in the `config.yml` file b
|`large-v2`|1550 M|
|`large-v3`|1550 M|

We recommend version `large-v2` as it performs better than `large-v3` in our benchmarks.

You can also specify an S3 URI if you have your own custom model available via S3.

## Running via Docker using a CUDA compatible GPU

To run it using a GPU via Docker, check [the instructions from the dane-example-worker](https://github.com/beeldengeluid/dane-example-worker/wiki/Containerization#running-the-container-locally-using-cuda-compatible-gpu).
We recommend version `large-v2` as it performs better than `large-v3` in our [benchmarks](https://opensource-spraakherkenning-nl.github.io/ASR_NL_results/).

Make sure to replace `dane-example-worker` in the `docker run` command with `dane-whisper-asr-worker`.
You can also specify an S3 URI if you have your own custom model available via S3 (by modifying the `W_MODEL` parameter).
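The updated README's "Expected run" section describes a multi-step pipeline (download input, fetch model, transcode video, transcribe, convert, transfer). A minimal, self-contained sketch of that flow follows; the helper bodies and the output file name are assumptions for illustration, while the real steps live in `download.py`, `model_download.py`, `transcode.py`, `whisper.py` and `daan_transcript.py`:

```python
import os

WHISPER_JSON_FILE = "whisper-transcript.json"  # assumed output file name


def try_transcode(path: str) -> str:
    # real worker: converts video input to audio via ffmpeg; here: pass-through
    return path


def transcribe(audio_path: str, output_dir: str) -> None:
    # stand-in for the faster-whisper transcription step
    os.makedirs(output_dir, exist_ok=True)
    with open(os.path.join(output_dir, WHISPER_JSON_FILE), "w") as f:
        f.write("{}")


def run(input_path: str, output_dir: str) -> bool:
    # skip work already done, so re-runs are cheap and idempotent
    if os.path.exists(os.path.join(output_dir, WHISPER_JSON_FILE)):
        return True
    audio_path = try_transcode(input_path)
    transcribe(audio_path, output_dir)
    return True
```

A second invocation with the same output directory returns immediately because the transcript file already exists.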
6 changes: 3 additions & 3 deletions simple_asr.py → asr.py
@@ -25,7 +25,7 @@ def run(input_uri: str, output_uri: str) -> bool:
asset_id, extension = get_asset_info(input_path)
output_path = asr_output_dir(input_path)

# 2. Check if the input file is suitable for processing any further
# 2. check if the input file is suitable for processing any further
transcoded_file_path = try_transcode(input_path, asset_id, extension)
if not transcoded_file_path:
logger.error("The transcode failed to yield a valid file to continue with")
@@ -57,7 +57,7 @@ def run(input_uri: str, output_uri: str) -> bool:
return True


# if (S3) output_uri is supplied transfers data to S3 location
# if (S3) output_uri is supplied transfers data to (S3) location
def transfer_asr_output(output_path: str, asset_id: str) -> bool:
logger.info(f"Transferring {output_path} to S3 (asset={asset_id})")
if any(
@@ -95,7 +95,7 @@ def asr_already_done(output_dir):
return os.path.exists(os.path.join(output_dir, WHISPER_JSON_FILE))


# check if there is a daan-transcript.json
# check if there is a daan-es-transcript.json
def daan_transcript_already_done(output_dir):
daan_transcript = os.path.join(output_dir, DAAN_JSON_FILE)
logger.info(f"Checking existence of {daan_transcript}")
2 changes: 1 addition & 1 deletion base_util.py
@@ -31,7 +31,7 @@ def extension_to_mime_type(extension: str) -> str:
".wav": "audio/wav",
}

return mime_dict.get(extension, "application/octet-stream")
return mime_dict.get(extension, "unknown")


# used by asr.py and transcode.py
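The `base_util.py` change swaps the lookup's fallback from `"application/octet-stream"` to `"unknown"`. A small sketch of that `dict.get` pattern; only `".wav"` is visible in the diff context, so the other entries are assumptions:

```python
def extension_to_mime_type(extension: str) -> str:
    mime_dict = {
        ".mp3": "audio/mpeg",   # assumed entry
        ".mp4": "video/mp4",    # assumed entry
        ".wav": "audio/wav",
    }
    # after this commit the fallback is "unknown" rather than
    # "application/octet-stream"
    return mime_dict.get(extension, "unknown")
```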
4 changes: 2 additions & 2 deletions config.py
@@ -17,14 +17,14 @@ def assert_int(param: str) -> int:


def assert_tuple(param: str) -> str:
value = os.environ.get(param, "(0.0, 0.2, 0.4, 0.6, 0.8, 1.0)")
value = os.environ.get(param, "(0.0,0.2,0.4,0.6,0.8,1.0)")
try:
tuple(eval(value))
return value
except ValueError:
assert (
False
), f"Please enter a valid tuple, e.g. (0.0, 0.2, 0.4, 0.6, 0.8, 1.0), for {param}, not |{value}|"
), f"Please enter a valid tuple, e.g. (0.0,0.2,0.4,0.6,0.8,1.0), for {param}, not |{value}|"


# main input & output params
4 changes: 2 additions & 2 deletions daan_transcript.py
@@ -18,7 +18,7 @@ class ParsedResult(TypedDict):
carrierId: str


# asr_output_dir e.g mount/asr-output/1272-128104-0000
# asr_output_dir e.g /data/output/whisper-test/
def generate_daan_transcript(asr_output_dir: str) -> bool:
logger.info(f"Generating transcript from: {asr_output_dir}")
whisper_transcript = load_whisper_transcript(asr_output_dir)
@@ -29,7 +29,7 @@
transcript = parse_whisper_transcript(whisper_transcript)

try:
# write transcript.json
# write daan-es-transcript.json
with open(
os.path.join(asr_output_dir, DAAN_JSON_FILE), "w+", encoding="utf-8"
) as f:
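The `daan_transcript.py` change renames the written file in the comments to `daan-es-transcript.json`. A hedged sketch of that write step, assuming a JSON-serializable parsed transcript (the function name and error handling here are illustrative):

```python
import json
import os

DAAN_JSON_FILE = "daan-es-transcript.json"  # name used in the updated comments


def write_daan_transcript(asr_output_dir: str, transcript: list) -> bool:
    try:
        path = os.path.join(asr_output_dir, DAAN_JSON_FILE)
        # w+ mirrors the open mode visible in the diff
        with open(path, "w+", encoding="utf-8") as f:
            json.dump(transcript, f, ensure_ascii=False)
        return True
    except OSError:
        return False
```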
Binary file removed data/input/testsource__testcarrier.wav
Binary file added data/input/whisper-test.mp3
