# env.yml
#
# Conda environment definition for the `speech2text` environment.
# Create it with: conda env create -f env.yml

name: speech2text
channels:
  - conda-forge
  - pytorch
  - nvidia
dependencies:
  # git is listed here; pip needs a git executable for the `git+https`
  # requirement in the pip section at the end of this list.
  - git
  - ffmpeg
  - libsndfile
  - numpy=1.26.4
  - python=3.10
  - pydub
  - pytorch-lightning=2.2.5
  # `<channel>::<package>` takes the package from that specific channel.
  - nvidia::cuda-libraries-dev
  # Pinned to an exact build string (Python 3.10, CUDA 12.1, cuDNN 8.9.2).
  # Keep it consistent with the `python` and `pytorch-cuda` pins in this list.
  - pytorch::pytorch=2.3.1=py3.10_cuda12.1_cudnn8.9.2_0
  - pytorch::torchvision
  - pytorch::torchaudio
  - pytorch::pytorch-cuda=12.1
  - pip
  # Everything under `pip:` is installed with pip rather than conda.
  - pip:
    # NOTE(review): unpinned, so the installed version is whatever pip
    # resolves against the other requirements here; consider pinning.
    - pyannote.audio
    # whisperx requires ctranslate2 4.4.0 and faster-whisper 1.0.3.
    # Issue: https://github.com/m-bain/whisperX/issues/901
    # Related upstream commits (one or the other):
    #   https://github.com/m-bain/whisperX/commit/caa7121064c1bb406be30c50891a9b8217252592
    #   https://github.com/m-bain/whisperX/commit/5080b7188c3666b5d8648346c0c12599a58bd695
    - ctranslate2==4.4.0
    - faster-whisper==1.0.3
    # whisperx is installed from the BetterWhisperX fork at a fixed commit.
    - whisperx @ git+https://github.com/federicotorrielli/BetterWhisperX@bd9b897cd3fdb8c23863cbf9f6517640b5c6bf50