diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 62ef879f..141e249c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -17,6 +17,8 @@ repos: hooks: - id: mypy args: [--ignore-missing-imports] + additional_dependencies: + - pydantic>=1.10.4 - repo: https://github.com/macisamuele/language-formatters-pre-commit-hooks rev: v2.12.0 hooks: @@ -54,6 +56,8 @@ repos: rev: v2.3.0 hooks: - id: codespell + additional_dependencies: + - tomli - repo: https://github.com/hija/clean-dotenv rev: v0.0.7 diff --git a/poetry.lock b/poetry.lock index 563e3712..e57b733a 100644 --- a/poetry.lock +++ b/poetry.lock @@ -365,13 +365,13 @@ numpy = "*" [[package]] name = "certifi" -version = "2024.2.2" +version = "2024.6.2" description = "Python package for providing Mozilla's CA Bundle." optional = false python-versions = ">=3.6" files = [ - {file = "certifi-2024.2.2-py3-none-any.whl", hash = "sha256:dc383c07b76109f368f6106eee2b593b04a011ea4d55f652c6ca24a754d1cdd1"}, - {file = "certifi-2024.2.2.tar.gz", hash = "sha256:0569859f95fc761b18b45ef421b1290a0f65f147e92a1e5eb3e635f9a5e4e66f"}, + {file = "certifi-2024.6.2-py3-none-any.whl", hash = "sha256:ddc6c8ce995e6987e7faf5e3f1b02b302836a0e5d98ece18392cb1a36c72ad56"}, + {file = "certifi-2024.6.2.tar.gz", hash = "sha256:3cd43f1c6fa7dedc5899d69d3ad0398fd018ad1a17fba83ddaf78aa46c747516"}, ] [[package]] @@ -703,13 +703,13 @@ toml = ["tomli"] [[package]] name = "datasets" -version = "2.19.1" +version = "2.19.2" description = "HuggingFace community-driven open-source library of datasets" optional = false python-versions = ">=3.8.0" files = [ - {file = "datasets-2.19.1-py3-none-any.whl", hash = "sha256:f7a78d15896f45004ccac1c298f3c7121f92f91f6f2bfbd4e4f210f827e6e411"}, - {file = "datasets-2.19.1.tar.gz", hash = "sha256:0df9ef6c5e9138cdb996a07385220109ff203c204245578b69cca905eb151d3a"}, + {file = "datasets-2.19.2-py3-none-any.whl", hash = "sha256:e07ff15d75b1af75c87dd96323ba2a361128d495136652f37fd62f918d17bb4e"}, + 
{file = "datasets-2.19.2.tar.gz", hash = "sha256:eccb82fb3bb5ee26ccc6d7a15b7f1f834e2cc4e59b7cff7733a003552bad51ef"}, ] [package.dependencies] @@ -725,7 +725,7 @@ pandas = "*" pyarrow = ">=12.0.0" pyarrow-hotfix = "*" pyyaml = ">=5.1" -requests = ">=2.19.0" +requests = ">=2.32.1" tqdm = ">=4.62.1" xxhash = "*" @@ -733,7 +733,7 @@ xxhash = "*" apache-beam = ["apache-beam (>=2.26.0)"] audio = ["librosa", "soundfile (>=0.12.1)"] benchmarks = ["tensorflow (==2.12.0)", "torch (==2.0.1)", "transformers (==4.30.1)"] -dev = ["Pillow (>=6.2.1)", "absl-py", "apache-beam (>=2.26.0)", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.6.4)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "ruff (>=0.3.0)", "s3fs", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "sqlalchemy", "tensorflow (>=2.6.0)", "tiktoken", "torch", "torch (>=2.0.0)", "transformers", "typing-extensions (>=4.6.1)", "zstandard"] +dev = ["Pillow (>=9.4.0)", "absl-py", "apache-beam (>=2.26.0)", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.6.4)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "ruff (>=0.3.0)", "s3fs", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "sqlalchemy", "tensorflow (>=2.6.0)", "tiktoken", "torch", "torch (>=2.0.0)", "transformers", "typing-extensions (>=4.6.1)", "zstandard"] docs = ["s3fs", "tensorflow (>=2.6.0)", "torch", "transformers"] jax = ["jax (>=0.3.14)", "jaxlib (>=0.3.14)"] metrics-tests = ["Werkzeug (>=1.0.1)", "accelerate", "bert-score (>=0.3.6)", "jiwer", "langdetect", "mauve-text", "nltk", "requests-file (>=1.5.1)", "rouge-score", "sacrebleu", "sacremoses", "scikit-learn", "scipy", "sentencepiece", "seqeval", "six 
(>=1.15.0,<1.16.0)", "spacy (>=3.0.0)", "texttable (>=1.6.3)", "tldextract", "tldextract (>=3.1.0)", "toml (>=0.10.1)", "typer (<0.5.0)"] @@ -741,9 +741,9 @@ quality = ["ruff (>=0.3.0)"] s3 = ["s3fs"] tensorflow = ["tensorflow (>=2.6.0)"] tensorflow-gpu = ["tensorflow (>=2.6.0)"] -tests = ["Pillow (>=6.2.1)", "absl-py", "apache-beam (>=2.26.0)", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.6.4)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "sqlalchemy", "tensorflow (>=2.6.0)", "tiktoken", "torch (>=2.0.0)", "transformers", "typing-extensions (>=4.6.1)", "zstandard"] +tests = ["Pillow (>=9.4.0)", "absl-py", "apache-beam (>=2.26.0)", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.6.4)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "sqlalchemy", "tensorflow (>=2.6.0)", "tiktoken", "torch (>=2.0.0)", "transformers", "typing-extensions (>=4.6.1)", "zstandard"] torch = ["torch"] -vision = ["Pillow (>=6.2.1)"] +vision = ["Pillow (>=9.4.0)"] [[package]] name = "debugpy" @@ -880,20 +880,20 @@ dev = ["Sphinx (==2.1.0)", "future (==0.17.1)", "numpy (==1.16.4)", "pytest (==4 [[package]] name = "fileformats" -version = "0.11.2" +version = "0.11.3" description = "Classes for representing different file formats in Python classes for use in type hinting in data workflows" optional = false python-versions = ">=3.8" files = [ - {file = "fileformats-0.11.2-py3-none-any.whl", hash = "sha256:12a9c04e251e741d4b5ce0323a8723546e9d0dc1176fe724d44f0f4e80118d82"}, - {file = "fileformats-0.11.2.tar.gz", hash = 
"sha256:97beaa64e658b139110994bc9119522208f72780d65233e640536fa56942da0c"}, + {file = "fileformats-0.11.3-py3-none-any.whl", hash = "sha256:460a56344addb30b82fa9fe134cc1552aac7eeb76d6dcaf0ecea77d76095b361"}, + {file = "fileformats-0.11.3.tar.gz", hash = "sha256:c1bbc82d4039e6dc5f0e5063fa18afe516e5bc2312a24f9aa199ef8ccfc355ae"}, ] [package.dependencies] typing-extensions = {version = ">=4.6.3", markers = "python_version < \"3.11\""} [package.extras] -dev = ["black", "codespell", "fileformats[test]", "flake8", "flake8-pyproject", "pre-commit", "pydata-sphinx-theme (>=0.13)"] +dev = ["black", "codespell", "flake8", "flake8-pyproject", "pre-commit", "pydata-sphinx-theme (>=0.13)", "pydra (>=0.23.0a0)", "pytest (>=6.2.5)", "pytest-cov (>=2.12.1)", "pytest-env (>=0.6.2)"] docs = ["docutils (>=0.10)", "furo (>=2022.2.14.1)", "mock (>1.0)", "numpydoc (>=0.6.0)", "packaging", "sphinx (>=2.1.2)", "sphinx-argparse (>=0.2.0)", "sphinx-click (>=3.1)"] test = ["pydra (>=0.23.0a0)", "pytest (>=6.2.5)", "pytest-cov (>=2.12.1)", "pytest-env (>=0.6.2)"] @@ -1050,13 +1050,13 @@ files = [ [[package]] name = "huggingface-hub" -version = "0.23.2" +version = "0.23.3" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.8.0" files = [ - {file = "huggingface_hub-0.23.2-py3-none-any.whl", hash = "sha256:48727a16e704d409c4bb5913613308499664f22a99743435dc3a13b23c485827"}, - {file = "huggingface_hub-0.23.2.tar.gz", hash = "sha256:f6829b62d5fdecb452a76fdbec620cba4c1573655a8d710c1df71735fd9edbd2"}, + {file = "huggingface_hub-0.23.3-py3-none-any.whl", hash = "sha256:22222c41223f1b7c209ae5511d2d82907325a0e3cdbce5f66949d43c598ff3bc"}, + {file = "huggingface_hub-0.23.3.tar.gz", hash = "sha256:1a1118a0b3dea3bab6c325d71be16f5ffe441d32f3ac7c348d6875911b694b5b"}, ] [package.dependencies] @@ -1082,6 +1082,21 @@ testing = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gr torch = 
["safetensors", "torch"] typing = ["types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)"] +[[package]] +name = "hyperpyyaml" +version = "1.2.2" +description = "Extensions to YAML syntax for better python interaction" +optional = false +python-versions = "*" +files = [ + {file = "HyperPyYAML-1.2.2-py3-none-any.whl", hash = "sha256:3c5864bdc8864b2f0fbd7bc495e7e8fdf2dfd5dd80116f72da27ca96a128bdeb"}, + {file = "HyperPyYAML-1.2.2.tar.gz", hash = "sha256:bdb734210d18770a262f500fe5755c7a44a5d3b91521b06e24f7a00a36ee0f87"}, +] + +[package.dependencies] +pyyaml = ">=5.1" +"ruamel.yaml" = ">=0.17.28" + [[package]] name = "identify" version = "2.5.36" @@ -1187,13 +1202,13 @@ test = ["flaky", "ipyparallel", "pre-commit", "pytest (>=7.0)", "pytest-asyncio [[package]] name = "ipython" -version = "8.24.0" +version = "8.25.0" description = "IPython: Productive Interactive Computing" optional = false python-versions = ">=3.10" files = [ - {file = "ipython-8.24.0-py3-none-any.whl", hash = "sha256:d7bf2f6c4314984e3e02393213bab8703cf163ede39672ce5918c51fe253a2a3"}, - {file = "ipython-8.24.0.tar.gz", hash = "sha256:010db3f8a728a578bb641fdd06c063b9fb8e96a9464c63aec6310fbcb5e80501"}, + {file = "ipython-8.25.0-py3-none-any.whl", hash = "sha256:53eee7ad44df903a06655871cbab66d156a051fd86f3ec6750470ac9604ac1ab"}, + {file = "ipython-8.25.0.tar.gz", hash = "sha256:c6ed726a140b6e725b911528f80439c534fac915246af3efc39440a6b0f9d716"}, ] [package.dependencies] @@ -1212,7 +1227,7 @@ typing-extensions = {version = ">=4.6", markers = "python_version < \"3.12\""} [package.extras] all = ["ipython[black,doc,kernel,matplotlib,nbconvert,nbformat,notebook,parallel,qtconsole]", "ipython[test,test-extra]"] black = ["black"] -doc = ["docrepr", "exceptiongroup", "ipykernel", "ipython[test]", "matplotlib", "setuptools (>=18.5)", "sphinx (>=1.3)", "sphinx-rtd-theme", "sphinxcontrib-jquery", "stack-data", "typing-extensions"] +doc = 
["docrepr", "exceptiongroup", "intersphinx-registry", "ipykernel", "ipython[test]", "matplotlib", "setuptools (>=18.5)", "sphinx (>=1.3)", "sphinx-rtd-theme", "sphinxcontrib-jquery", "tomli", "typing-extensions"] kernel = ["ipykernel"] matplotlib = ["matplotlib"] nbconvert = ["nbconvert"] @@ -1285,6 +1300,21 @@ MarkupSafe = ">=2.0" [package.extras] i18n = ["Babel (>=2.7)"] +[[package]] +name = "jiwer" +version = "3.0.4" +description = "Evaluate your speech-to-text system with similarity measures such as word error rate (WER)" +optional = false +python-versions = "<4.0,>=3.7" +files = [ + {file = "jiwer-3.0.4-py3-none-any.whl", hash = "sha256:d6761a1cb7c5a8e3f4bafa96cf4b4f125e2ccc82b625f74dd23557414e97f86f"}, + {file = "jiwer-3.0.4.tar.gz", hash = "sha256:2438acdc7ca22128fcab4be60db595809d2b5e73785b736de36dc3281a2a6ae8"}, +] + +[package.dependencies] +click = ">=8.1.3,<9.0.0" +rapidfuzz = ">=3,<4" + [[package]] name = "joblib" version = "1.4.2" @@ -1846,18 +1876,15 @@ test = ["pytest (>=7.2)", "pytest-cov (>=4.0)"] [[package]] name = "nodeenv" -version = "1.8.0" +version = "1.9.1" description = "Node.js virtual environment builder" optional = false -python-versions = ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*" +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" files = [ - {file = "nodeenv-1.8.0-py2.py3-none-any.whl", hash = "sha256:df865724bb3c3adc86b3876fa209771517b0cfe596beff01a92700e0e8be4cec"}, - {file = "nodeenv-1.8.0.tar.gz", hash = "sha256:d51e0c37e64fbf47d017feac3145cdbb58836d7eee8c6f6d3b6880c5456227d2"}, + {file = "nodeenv-1.9.1-py2.py3-none-any.whl", hash = "sha256:ba11c9782d29c27c70ffbdda2d7415098754709be8a7056d79a737cd901155c9"}, + {file = "nodeenv-1.9.1.tar.gz", hash = "sha256:6ec12890a2dab7946721edbfbcd91f3319c6ccc9aec47be7c7e6b7011ee6645f"}, ] -[package.dependencies] -setuptools = "*" - [[package]] name = "numba" version = "0.59.1" @@ -2506,13 +2533,13 @@ files = [ [[package]] name = "prompt-toolkit" 
-version = "3.0.45" +version = "3.0.46" description = "Library for building powerful interactive command lines in Python" optional = false python-versions = ">=3.7.0" files = [ - {file = "prompt_toolkit-3.0.45-py3-none-any.whl", hash = "sha256:a29b89160e494e3ea8622b09fa5897610b437884dcdcd054fdc1308883326c2a"}, - {file = "prompt_toolkit-3.0.45.tar.gz", hash = "sha256:07c60ee4ab7b7e90824b61afa840c8f5aad2d46b3e2e10acc33d8ecc94a49089"}, + {file = "prompt_toolkit-3.0.46-py3-none-any.whl", hash = "sha256:45abe60a8300f3c618b23c16c4bb98c6fc80af8ce8b17c7ae92db48db3ee63c1"}, + {file = "prompt_toolkit-3.0.46.tar.gz", hash = "sha256:869c50d682152336e23c4db7f74667639b5047494202ffe7670817053fd57795"}, ] [package.dependencies] @@ -2643,18 +2670,18 @@ files = [ [[package]] name = "pydantic" -version = "2.7.2" +version = "2.7.3" description = "Data validation using Python type hints" optional = false python-versions = ">=3.8" files = [ - {file = "pydantic-2.7.2-py3-none-any.whl", hash = "sha256:834ab954175f94e6e68258537dc49402c4a5e9d0409b9f1b86b7e934a8372de7"}, - {file = "pydantic-2.7.2.tar.gz", hash = "sha256:71b2945998f9c9b7919a45bde9a50397b289937d215ae141c1d0903ba7149fd7"}, + {file = "pydantic-2.7.3-py3-none-any.whl", hash = "sha256:ea91b002777bf643bb20dd717c028ec43216b24a6001a280f83877fd2655d0b4"}, + {file = "pydantic-2.7.3.tar.gz", hash = "sha256:c46c76a40bb1296728d7a8b99aa73dd70a48c3510111ff290034f860c99c419e"}, ] [package.dependencies] annotated-types = ">=0.4.0" -pydantic-core = "2.18.3" +pydantic-core = "2.18.4" typing-extensions = ">=4.6.1" [package.extras] @@ -2662,90 +2689,90 @@ email = ["email-validator (>=2.0.0)"] [[package]] name = "pydantic-core" -version = "2.18.3" +version = "2.18.4" description = "Core functionality for Pydantic validation and serialization" optional = false python-versions = ">=3.8" files = [ - {file = "pydantic_core-2.18.3-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:744697428fcdec6be5670460b578161d1ffe34743a5c15656be7ea82b008197c"}, - 
{file = "pydantic_core-2.18.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:37b40c05ced1ba4218b14986fe6f283d22e1ae2ff4c8e28881a70fb81fbfcda7"}, - {file = "pydantic_core-2.18.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:544a9a75622357076efb6b311983ff190fbfb3c12fc3a853122b34d3d358126c"}, - {file = "pydantic_core-2.18.3-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e2e253af04ceaebde8eb201eb3f3e3e7e390f2d275a88300d6a1959d710539e2"}, - {file = "pydantic_core-2.18.3-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:855ec66589c68aa367d989da5c4755bb74ee92ccad4fdb6af942c3612c067e34"}, - {file = "pydantic_core-2.18.3-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3d3e42bb54e7e9d72c13ce112e02eb1b3b55681ee948d748842171201a03a98a"}, - {file = "pydantic_core-2.18.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c6ac9ffccc9d2e69d9fba841441d4259cb668ac180e51b30d3632cd7abca2b9b"}, - {file = "pydantic_core-2.18.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c56eca1686539fa0c9bda992e7bd6a37583f20083c37590413381acfc5f192d6"}, - {file = "pydantic_core-2.18.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:17954d784bf8abfc0ec2a633108207ebc4fa2df1a0e4c0c3ccbaa9bb01d2c426"}, - {file = "pydantic_core-2.18.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:98ed737567d8f2ecd54f7c8d4f8572ca7c7921ede93a2e52939416170d357812"}, - {file = "pydantic_core-2.18.3-cp310-none-win32.whl", hash = "sha256:9f9e04afebd3ed8c15d67a564ed0a34b54e52136c6d40d14c5547b238390e779"}, - {file = "pydantic_core-2.18.3-cp310-none-win_amd64.whl", hash = "sha256:45e4ffbae34f7ae30d0047697e724e534a7ec0a82ef9994b7913a412c21462a0"}, - {file = "pydantic_core-2.18.3-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:b9ebe8231726c49518b16b237b9fe0d7d361dd221302af511a83d4ada01183ab"}, - {file = 
"pydantic_core-2.18.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b8e20e15d18bf7dbb453be78a2d858f946f5cdf06c5072453dace00ab652e2b2"}, - {file = "pydantic_core-2.18.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c0d9ff283cd3459fa0bf9b0256a2b6f01ac1ff9ffb034e24457b9035f75587cb"}, - {file = "pydantic_core-2.18.3-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2f7ef5f0ebb77ba24c9970da18b771711edc5feaf00c10b18461e0f5f5949231"}, - {file = "pydantic_core-2.18.3-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:73038d66614d2e5cde30435b5afdced2b473b4c77d4ca3a8624dd3e41a9c19be"}, - {file = "pydantic_core-2.18.3-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6afd5c867a74c4d314c557b5ea9520183fadfbd1df4c2d6e09fd0d990ce412cd"}, - {file = "pydantic_core-2.18.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bd7df92f28d351bb9f12470f4c533cf03d1b52ec5a6e5c58c65b183055a60106"}, - {file = "pydantic_core-2.18.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:80aea0ffeb1049336043d07799eace1c9602519fb3192916ff525b0287b2b1e4"}, - {file = "pydantic_core-2.18.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:aaee40f25bba38132e655ffa3d1998a6d576ba7cf81deff8bfa189fb43fd2bbe"}, - {file = "pydantic_core-2.18.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:9128089da8f4fe73f7a91973895ebf2502539d627891a14034e45fb9e707e26d"}, - {file = "pydantic_core-2.18.3-cp311-none-win32.whl", hash = "sha256:fec02527e1e03257aa25b1a4dcbe697b40a22f1229f5d026503e8b7ff6d2eda7"}, - {file = "pydantic_core-2.18.3-cp311-none-win_amd64.whl", hash = "sha256:58ff8631dbab6c7c982e6425da8347108449321f61fe427c52ddfadd66642af7"}, - {file = "pydantic_core-2.18.3-cp311-none-win_arm64.whl", hash = "sha256:3fc1c7f67f34c6c2ef9c213e0f2a351797cda98249d9ca56a70ce4ebcaba45f4"}, - {file = "pydantic_core-2.18.3-cp312-cp312-macosx_10_12_x86_64.whl", hash = 
"sha256:f0928cde2ae416a2d1ebe6dee324709c6f73e93494d8c7aea92df99aab1fc40f"}, - {file = "pydantic_core-2.18.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:0bee9bb305a562f8b9271855afb6ce00223f545de3d68560b3c1649c7c5295e9"}, - {file = "pydantic_core-2.18.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e862823be114387257dacbfa7d78547165a85d7add33b446ca4f4fae92c7ff5c"}, - {file = "pydantic_core-2.18.3-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:6a36f78674cbddc165abab0df961b5f96b14461d05feec5e1f78da58808b97e7"}, - {file = "pydantic_core-2.18.3-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ba905d184f62e7ddbb7a5a751d8a5c805463511c7b08d1aca4a3e8c11f2e5048"}, - {file = "pydantic_core-2.18.3-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7fdd362f6a586e681ff86550b2379e532fee63c52def1c666887956748eaa326"}, - {file = "pydantic_core-2.18.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:24b214b7ee3bd3b865e963dbed0f8bc5375f49449d70e8d407b567af3222aae4"}, - {file = "pydantic_core-2.18.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:691018785779766127f531674fa82bb368df5b36b461622b12e176c18e119022"}, - {file = "pydantic_core-2.18.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:60e4c625e6f7155d7d0dcac151edf5858102bc61bf959d04469ca6ee4e8381bd"}, - {file = "pydantic_core-2.18.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:a4e651e47d981c1b701dcc74ab8fec5a60a5b004650416b4abbef13db23bc7be"}, - {file = "pydantic_core-2.18.3-cp312-none-win32.whl", hash = "sha256:ffecbb5edb7f5ffae13599aec33b735e9e4c7676ca1633c60f2c606beb17efc5"}, - {file = "pydantic_core-2.18.3-cp312-none-win_amd64.whl", hash = "sha256:2c8333f6e934733483c7eddffdb094c143b9463d2af7e6bd85ebcb2d4a1b82c6"}, - {file = "pydantic_core-2.18.3-cp312-none-win_arm64.whl", hash = 
"sha256:7a20dded653e516a4655f4c98e97ccafb13753987434fe7cf044aa25f5b7d417"}, - {file = "pydantic_core-2.18.3-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:eecf63195be644b0396f972c82598cd15693550f0ff236dcf7ab92e2eb6d3522"}, - {file = "pydantic_core-2.18.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:2c44efdd3b6125419c28821590d7ec891c9cb0dff33a7a78d9d5c8b6f66b9702"}, - {file = "pydantic_core-2.18.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6e59fca51ffbdd1638b3856779342ed69bcecb8484c1d4b8bdb237d0eb5a45e2"}, - {file = "pydantic_core-2.18.3-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:70cf099197d6b98953468461d753563b28e73cf1eade2ffe069675d2657ed1d5"}, - {file = "pydantic_core-2.18.3-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:63081a49dddc6124754b32a3774331467bfc3d2bd5ff8f10df36a95602560361"}, - {file = "pydantic_core-2.18.3-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:370059b7883485c9edb9655355ff46d912f4b03b009d929220d9294c7fd9fd60"}, - {file = "pydantic_core-2.18.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5a64faeedfd8254f05f5cf6fc755023a7e1606af3959cfc1a9285744cc711044"}, - {file = "pydantic_core-2.18.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:19d2e725de0f90d8671f89e420d36c3dd97639b98145e42fcc0e1f6d492a46dc"}, - {file = "pydantic_core-2.18.3-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:67bc078025d70ec5aefe6200ef094576c9d86bd36982df1301c758a9fff7d7f4"}, - {file = "pydantic_core-2.18.3-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:adf952c3f4100e203cbaf8e0c907c835d3e28f9041474e52b651761dc248a3c0"}, - {file = "pydantic_core-2.18.3-cp38-none-win32.whl", hash = "sha256:9a46795b1f3beb167eaee91736d5d17ac3a994bf2215a996aed825a45f897558"}, - {file = "pydantic_core-2.18.3-cp38-none-win_amd64.whl", hash = "sha256:200ad4e3133cb99ed82342a101a5abf3d924722e71cd581cc113fe828f727fbc"}, - 
{file = "pydantic_core-2.18.3-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:304378b7bf92206036c8ddd83a2ba7b7d1a5b425acafff637172a3aa72ad7083"}, - {file = "pydantic_core-2.18.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c826870b277143e701c9ccf34ebc33ddb4d072612683a044e7cce2d52f6c3fef"}, - {file = "pydantic_core-2.18.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e201935d282707394f3668380e41ccf25b5794d1b131cdd96b07f615a33ca4b1"}, - {file = "pydantic_core-2.18.3-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5560dda746c44b48bf82b3d191d74fe8efc5686a9ef18e69bdabccbbb9ad9442"}, - {file = "pydantic_core-2.18.3-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6b32c2a1f8032570842257e4c19288eba9a2bba4712af542327de9a1204faff8"}, - {file = "pydantic_core-2.18.3-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:929c24e9dea3990bc8bcd27c5f2d3916c0c86f5511d2caa69e0d5290115344a9"}, - {file = "pydantic_core-2.18.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e1a8376fef60790152564b0eab376b3e23dd6e54f29d84aad46f7b264ecca943"}, - {file = "pydantic_core-2.18.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:dccf3ef1400390ddd1fb55bf0632209d39140552d068ee5ac45553b556780e06"}, - {file = "pydantic_core-2.18.3-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:41dbdcb0c7252b58fa931fec47937edb422c9cb22528f41cb8963665c372caf6"}, - {file = "pydantic_core-2.18.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:666e45cf071669fde468886654742fa10b0e74cd0fa0430a46ba6056b24fb0af"}, - {file = "pydantic_core-2.18.3-cp39-none-win32.whl", hash = "sha256:f9c08cabff68704a1b4667d33f534d544b8a07b8e5d039c37067fceb18789e78"}, - {file = "pydantic_core-2.18.3-cp39-none-win_amd64.whl", hash = "sha256:4afa5f5973e8572b5c0dcb4e2d4fda7890e7cd63329bd5cc3263a25c92ef0026"}, - {file = "pydantic_core-2.18.3-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", 
hash = "sha256:77319771a026f7c7d29c6ebc623de889e9563b7087911b46fd06c044a12aa5e9"}, - {file = "pydantic_core-2.18.3-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:df11fa992e9f576473038510d66dd305bcd51d7dd508c163a8c8fe148454e059"}, - {file = "pydantic_core-2.18.3-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d531076bdfb65af593326ffd567e6ab3da145020dafb9187a1d131064a55f97c"}, - {file = "pydantic_core-2.18.3-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d33ce258e4e6e6038f2b9e8b8a631d17d017567db43483314993b3ca345dcbbb"}, - {file = "pydantic_core-2.18.3-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1f9cd7f5635b719939019be9bda47ecb56e165e51dd26c9a217a433e3d0d59a9"}, - {file = "pydantic_core-2.18.3-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:cd4a032bb65cc132cae1fe3e52877daecc2097965cd3914e44fbd12b00dae7c5"}, - {file = "pydantic_core-2.18.3-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:82f2718430098bcdf60402136c845e4126a189959d103900ebabb6774a5d9fdb"}, - {file = "pydantic_core-2.18.3-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:c0037a92cf0c580ed14e10953cdd26528e8796307bb8bb312dc65f71547df04d"}, - {file = "pydantic_core-2.18.3-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:b95a0972fac2b1ff3c94629fc9081b16371dad870959f1408cc33b2f78ad347a"}, - {file = "pydantic_core-2.18.3-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:a62e437d687cc148381bdd5f51e3e81f5b20a735c55f690c5be94e05da2b0d5c"}, - {file = "pydantic_core-2.18.3-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b367a73a414bbb08507da102dc2cde0fa7afe57d09b3240ce82a16d608a7679c"}, - {file = "pydantic_core-2.18.3-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0ecce4b2360aa3f008da3327d652e74a0e743908eac306198b47e1c58b03dd2b"}, - {file = 
"pydantic_core-2.18.3-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:bd4435b8d83f0c9561a2a9585b1de78f1abb17cb0cef5f39bf6a4b47d19bafe3"}, - {file = "pydantic_core-2.18.3-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:616221a6d473c5b9aa83fa8982745441f6a4a62a66436be9445c65f241b86c94"}, - {file = "pydantic_core-2.18.3-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:7e6382ce89a92bc1d0c0c5edd51e931432202b9080dc921d8d003e616402efd1"}, - {file = "pydantic_core-2.18.3-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:ff58f379345603d940e461eae474b6bbb6dab66ed9a851ecd3cb3709bf4dcf6a"}, - {file = "pydantic_core-2.18.3.tar.gz", hash = "sha256:432e999088d85c8f36b9a3f769a8e2b57aabd817bbb729a90d1fe7f18f6f1f39"}, + {file = "pydantic_core-2.18.4-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:f76d0ad001edd426b92233d45c746fd08f467d56100fd8f30e9ace4b005266e4"}, + {file = "pydantic_core-2.18.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:59ff3e89f4eaf14050c8022011862df275b552caef8082e37b542b066ce1ff26"}, + {file = "pydantic_core-2.18.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a55b5b16c839df1070bc113c1f7f94a0af4433fcfa1b41799ce7606e5c79ce0a"}, + {file = "pydantic_core-2.18.4-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:4d0dcc59664fcb8974b356fe0a18a672d6d7cf9f54746c05f43275fc48636851"}, + {file = "pydantic_core-2.18.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8951eee36c57cd128f779e641e21eb40bc5073eb28b2d23f33eb0ef14ffb3f5d"}, + {file = "pydantic_core-2.18.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4701b19f7e3a06ea655513f7938de6f108123bf7c86bbebb1196eb9bd35cf724"}, + {file = "pydantic_core-2.18.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e00a3f196329e08e43d99b79b286d60ce46bed10f2280d25a1718399457e06be"}, + {file = 
"pydantic_core-2.18.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:97736815b9cc893b2b7f663628e63f436018b75f44854c8027040e05230eeddb"}, + {file = "pydantic_core-2.18.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:6891a2ae0e8692679c07728819b6e2b822fb30ca7445f67bbf6509b25a96332c"}, + {file = "pydantic_core-2.18.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:bc4ff9805858bd54d1a20efff925ccd89c9d2e7cf4986144b30802bf78091c3e"}, + {file = "pydantic_core-2.18.4-cp310-none-win32.whl", hash = "sha256:1b4de2e51bbcb61fdebd0ab86ef28062704f62c82bbf4addc4e37fa4b00b7cbc"}, + {file = "pydantic_core-2.18.4-cp310-none-win_amd64.whl", hash = "sha256:6a750aec7bf431517a9fd78cb93c97b9b0c496090fee84a47a0d23668976b4b0"}, + {file = "pydantic_core-2.18.4-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:942ba11e7dfb66dc70f9ae66b33452f51ac7bb90676da39a7345e99ffb55402d"}, + {file = "pydantic_core-2.18.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b2ebef0e0b4454320274f5e83a41844c63438fdc874ea40a8b5b4ecb7693f1c4"}, + {file = "pydantic_core-2.18.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a642295cd0c8df1b86fc3dced1d067874c353a188dc8e0f744626d49e9aa51c4"}, + {file = "pydantic_core-2.18.4-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5f09baa656c904807e832cf9cce799c6460c450c4ad80803517032da0cd062e2"}, + {file = "pydantic_core-2.18.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:98906207f29bc2c459ff64fa007afd10a8c8ac080f7e4d5beff4c97086a3dabd"}, + {file = "pydantic_core-2.18.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:19894b95aacfa98e7cb093cd7881a0c76f55731efad31073db4521e2b6ff5b7d"}, + {file = "pydantic_core-2.18.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0fbbdc827fe5e42e4d196c746b890b3d72876bdbf160b0eafe9f0334525119c8"}, + {file = 
"pydantic_core-2.18.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f85d05aa0918283cf29a30b547b4df2fbb56b45b135f9e35b6807cb28bc47951"}, + {file = "pydantic_core-2.18.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e85637bc8fe81ddb73fda9e56bab24560bdddfa98aa64f87aaa4e4b6730c23d2"}, + {file = "pydantic_core-2.18.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:2f5966897e5461f818e136b8451d0551a2e77259eb0f73a837027b47dc95dab9"}, + {file = "pydantic_core-2.18.4-cp311-none-win32.whl", hash = "sha256:44c7486a4228413c317952e9d89598bcdfb06399735e49e0f8df643e1ccd0558"}, + {file = "pydantic_core-2.18.4-cp311-none-win_amd64.whl", hash = "sha256:8a7164fe2005d03c64fd3b85649891cd4953a8de53107940bf272500ba8a788b"}, + {file = "pydantic_core-2.18.4-cp311-none-win_arm64.whl", hash = "sha256:4e99bc050fe65c450344421017f98298a97cefc18c53bb2f7b3531eb39bc7805"}, + {file = "pydantic_core-2.18.4-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:6f5c4d41b2771c730ea1c34e458e781b18cc668d194958e0112455fff4e402b2"}, + {file = "pydantic_core-2.18.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2fdf2156aa3d017fddf8aea5adfba9f777db1d6022d392b682d2a8329e087cef"}, + {file = "pydantic_core-2.18.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4748321b5078216070b151d5271ef3e7cc905ab170bbfd27d5c83ee3ec436695"}, + {file = "pydantic_core-2.18.4-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:847a35c4d58721c5dc3dba599878ebbdfd96784f3fb8bb2c356e123bdcd73f34"}, + {file = "pydantic_core-2.18.4-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3c40d4eaad41f78e3bbda31b89edc46a3f3dc6e171bf0ecf097ff7a0ffff7cb1"}, + {file = "pydantic_core-2.18.4-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:21a5e440dbe315ab9825fcd459b8814bb92b27c974cbc23c3e8baa2b76890077"}, + {file = "pydantic_core-2.18.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:01dd777215e2aa86dfd664daed5957704b769e726626393438f9c87690ce78c3"}, + {file = "pydantic_core-2.18.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:4b06beb3b3f1479d32befd1f3079cc47b34fa2da62457cdf6c963393340b56e9"}, + {file = "pydantic_core-2.18.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:564d7922e4b13a16b98772441879fcdcbe82ff50daa622d681dd682175ea918c"}, + {file = "pydantic_core-2.18.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:0eb2a4f660fcd8e2b1c90ad566db2b98d7f3f4717c64fe0a83e0adb39766d5b8"}, + {file = "pydantic_core-2.18.4-cp312-none-win32.whl", hash = "sha256:8b8bab4c97248095ae0c4455b5a1cd1cdd96e4e4769306ab19dda135ea4cdb07"}, + {file = "pydantic_core-2.18.4-cp312-none-win_amd64.whl", hash = "sha256:14601cdb733d741b8958224030e2bfe21a4a881fb3dd6fbb21f071cabd48fa0a"}, + {file = "pydantic_core-2.18.4-cp312-none-win_arm64.whl", hash = "sha256:c1322d7dd74713dcc157a2b7898a564ab091ca6c58302d5c7b4c07296e3fd00f"}, + {file = "pydantic_core-2.18.4-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:823be1deb01793da05ecb0484d6c9e20baebb39bd42b5d72636ae9cf8350dbd2"}, + {file = "pydantic_core-2.18.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:ebef0dd9bf9b812bf75bda96743f2a6c5734a02092ae7f721c048d156d5fabae"}, + {file = "pydantic_core-2.18.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ae1d6df168efb88d7d522664693607b80b4080be6750c913eefb77e34c12c71a"}, + {file = "pydantic_core-2.18.4-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f9899c94762343f2cc2fc64c13e7cae4c3cc65cdfc87dd810a31654c9b7358cc"}, + {file = "pydantic_core-2.18.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:99457f184ad90235cfe8461c4d70ab7dd2680e28821c29eca00252ba90308c78"}, + {file = "pydantic_core-2.18.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:18f469a3d2a2fdafe99296a87e8a4c37748b5080a26b806a707f25a902c040a8"}, + {file = 
"pydantic_core-2.18.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b7cdf28938ac6b8b49ae5e92f2735056a7ba99c9b110a474473fd71185c1af5d"}, + {file = "pydantic_core-2.18.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:938cb21650855054dc54dfd9120a851c974f95450f00683399006aa6e8abb057"}, + {file = "pydantic_core-2.18.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:44cd83ab6a51da80fb5adbd9560e26018e2ac7826f9626bc06ca3dc074cd198b"}, + {file = "pydantic_core-2.18.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:972658f4a72d02b8abfa2581d92d59f59897d2e9f7e708fdabe922f9087773af"}, + {file = "pydantic_core-2.18.4-cp38-none-win32.whl", hash = "sha256:1d886dc848e60cb7666f771e406acae54ab279b9f1e4143babc9c2258213daa2"}, + {file = "pydantic_core-2.18.4-cp38-none-win_amd64.whl", hash = "sha256:bb4462bd43c2460774914b8525f79b00f8f407c945d50881568f294c1d9b4443"}, + {file = "pydantic_core-2.18.4-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:44a688331d4a4e2129140a8118479443bd6f1905231138971372fcde37e43528"}, + {file = "pydantic_core-2.18.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a2fdd81edd64342c85ac7cf2753ccae0b79bf2dfa063785503cb85a7d3593223"}, + {file = "pydantic_core-2.18.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:86110d7e1907ab36691f80b33eb2da87d780f4739ae773e5fc83fb272f88825f"}, + {file = "pydantic_core-2.18.4-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:46387e38bd641b3ee5ce247563b60c5ca098da9c56c75c157a05eaa0933ed154"}, + {file = "pydantic_core-2.18.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:123c3cec203e3f5ac7b000bd82235f1a3eced8665b63d18be751f115588fea30"}, + {file = "pydantic_core-2.18.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dc1803ac5c32ec324c5261c7209e8f8ce88e83254c4e1aebdc8b0a39f9ddb443"}, + {file = 
"pydantic_core-2.18.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:53db086f9f6ab2b4061958d9c276d1dbe3690e8dd727d6abf2321d6cce37fa94"}, + {file = "pydantic_core-2.18.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:abc267fa9837245cc28ea6929f19fa335f3dc330a35d2e45509b6566dc18be23"}, + {file = "pydantic_core-2.18.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:a0d829524aaefdebccb869eed855e2d04c21d2d7479b6cada7ace5448416597b"}, + {file = "pydantic_core-2.18.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:509daade3b8649f80d4e5ff21aa5673e4ebe58590b25fe42fac5f0f52c6f034a"}, + {file = "pydantic_core-2.18.4-cp39-none-win32.whl", hash = "sha256:ca26a1e73c48cfc54c4a76ff78df3727b9d9f4ccc8dbee4ae3f73306a591676d"}, + {file = "pydantic_core-2.18.4-cp39-none-win_amd64.whl", hash = "sha256:c67598100338d5d985db1b3d21f3619ef392e185e71b8d52bceacc4a7771ea7e"}, + {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:574d92eac874f7f4db0ca653514d823a0d22e2354359d0759e3f6a406db5d55d"}, + {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:1f4d26ceb5eb9eed4af91bebeae4b06c3fb28966ca3a8fb765208cf6b51102ab"}, + {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:77450e6d20016ec41f43ca4a6c63e9fdde03f0ae3fe90e7c27bdbeaece8b1ed4"}, + {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d323a01da91851a4f17bf592faf46149c9169d68430b3146dcba2bb5e5719abc"}, + {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:43d447dd2ae072a0065389092a231283f62d960030ecd27565672bd40746c507"}, + {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:578e24f761f3b425834f297b9935e1ce2e30f51400964ce4801002435a1b41ef"}, + {file = 
"pydantic_core-2.18.4-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:81b5efb2f126454586d0f40c4d834010979cb80785173d1586df845a632e4e6d"}, + {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:ab86ce7c8f9bea87b9d12c7f0af71102acbf5ecbc66c17796cff45dae54ef9a5"}, + {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:90afc12421df2b1b4dcc975f814e21bc1754640d502a2fbcc6d41e77af5ec312"}, + {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:51991a89639a912c17bef4b45c87bd83593aee0437d8102556af4885811d59f5"}, + {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:293afe532740370aba8c060882f7d26cfd00c94cae32fd2e212a3a6e3b7bc15e"}, + {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b48ece5bde2e768197a2d0f6e925f9d7e3e826f0ad2271120f8144a9db18d5c8"}, + {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:eae237477a873ab46e8dd748e515c72c0c804fb380fbe6c85533c7de51f23a8f"}, + {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:834b5230b5dfc0c1ec37b2fda433b271cbbc0e507560b5d1588e2cc1148cf1ce"}, + {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:e858ac0a25074ba4bce653f9b5d0a85b7456eaddadc0ce82d3878c22489fa4ee"}, + {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:2fd41f6eff4c20778d717af1cc50eca52f5afe7805ee530a4fbd0bae284f16e9"}, + {file = "pydantic_core-2.18.4.tar.gz", hash = "sha256:ec3beeada09ff865c344ff3bc2f427f5e6c26401cc6113d77e372c3fdac73864"}, ] [package.dependencies] @@ -2797,13 +2824,13 @@ windows-terminal = ["colorama (>=0.4.6)"] [[package]] name = "pytest" -version = "8.2.1" +version = "8.2.2" description = "pytest: simple powerful testing with Python" optional = false python-versions = 
">=3.8" files = [ - {file = "pytest-8.2.1-py3-none-any.whl", hash = "sha256:faccc5d332b8c3719f40283d0d44aa5cf101cec36f88cde9ed8f2bc0538612b1"}, - {file = "pytest-8.2.1.tar.gz", hash = "sha256:5046e5b46d8e4cac199c373041f26be56fdb81eb4e67dc11d4e10811fc3408fd"}, + {file = "pytest-8.2.2-py3-none-any.whl", hash = "sha256:c434598117762e2bd304e526244f67bf66bbd7b5d6cf22138be51ff661980343"}, + {file = "pytest-8.2.2.tar.gz", hash = "sha256:de4bb8104e201939ccdc688b27a89a7be2079b22e2bd2b07f806b6ba71117977"}, ] [package.dependencies] @@ -3060,6 +3087,111 @@ files = [ [package.dependencies] cffi = {version = "*", markers = "implementation_name == \"pypy\""} +[[package]] +name = "rapidfuzz" +version = "3.9.3" +description = "rapid fuzzy string matching" +optional = false +python-versions = ">=3.8" +files = [ + {file = "rapidfuzz-3.9.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:bdb8c5b8e29238ec80727c2ba3b301efd45aa30c6a7001123a6647b8e6f77ea4"}, + {file = "rapidfuzz-3.9.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b3bd0d9632088c63a241f217742b1cf86e2e8ae573e01354775bd5016d12138c"}, + {file = "rapidfuzz-3.9.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:153f23c03d4917f6a1fc2fb56d279cc6537d1929237ff08ee7429d0e40464a18"}, + {file = "rapidfuzz-3.9.3-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a96c5225e840f1587f1bac8fa6f67562b38e095341576e82b728a82021f26d62"}, + {file = "rapidfuzz-3.9.3-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b777cd910ceecd738adc58593d6ed42e73f60ad04ecdb4a841ae410b51c92e0e"}, + {file = "rapidfuzz-3.9.3-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:53e06e4b81f552da04940aa41fc556ba39dee5513d1861144300c36c33265b76"}, + {file = "rapidfuzz-3.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5c7ca5b6050f18fdcacdada2dc5fb7619ff998cd9aba82aed2414eee74ebe6cd"}, + {file = 
"rapidfuzz-3.9.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:87bb8d84cb41446a808c4b5f746e29d8a53499381ed72f6c4e456fe0f81c80a8"}, + {file = "rapidfuzz-3.9.3-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:959a15186d18425d19811bea86a8ffbe19fd48644004d29008e636631420a9b7"}, + {file = "rapidfuzz-3.9.3-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:a24603dd05fb4e3c09d636b881ce347e5f55f925a6b1b4115527308a323b9f8e"}, + {file = "rapidfuzz-3.9.3-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:0d055da0e801c71dd74ba81d72d41b2fa32afa182b9fea6b4b199d2ce937450d"}, + {file = "rapidfuzz-3.9.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:875b581afb29a7213cf9d98cb0f98df862f1020bce9d9b2e6199b60e78a41d14"}, + {file = "rapidfuzz-3.9.3-cp310-cp310-win32.whl", hash = "sha256:6073a46f61479a89802e3f04655267caa6c14eb8ac9d81a635a13805f735ebc1"}, + {file = "rapidfuzz-3.9.3-cp310-cp310-win_amd64.whl", hash = "sha256:119c010e20e561249b99ca2627f769fdc8305b07193f63dbc07bca0a6c27e892"}, + {file = "rapidfuzz-3.9.3-cp310-cp310-win_arm64.whl", hash = "sha256:790b0b244f3213581d42baa2fed8875f9ee2b2f9b91f94f100ec80d15b140ba9"}, + {file = "rapidfuzz-3.9.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f57e8305c281e8c8bc720515540e0580355100c0a7a541105c6cafc5de71daae"}, + {file = "rapidfuzz-3.9.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a4fc7b784cf987dbddc300cef70e09a92ed1bce136f7bb723ea79d7e297fe76d"}, + {file = "rapidfuzz-3.9.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5b422c0a6fe139d5447a0766268e68e6a2a8c2611519f894b1f31f0a392b9167"}, + {file = "rapidfuzz-3.9.3-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f50fed4a9b0c9825ff37cf0bccafd51ff5792090618f7846a7650f21f85579c9"}, + {file = "rapidfuzz-3.9.3-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b80eb7cbe62348c61d3e67e17057cddfd6defab168863028146e07d5a8b24a89"}, + {file = 
"rapidfuzz-3.9.3-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:65f45be77ec82da32ce5709a362e236ccf801615cc7163b136d1778cf9e31b14"}, + {file = "rapidfuzz-3.9.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd84b7f652a5610733400307dc732f57c4a907080bef9520412e6d9b55bc9adc"}, + {file = "rapidfuzz-3.9.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:3e6d27dad8c990218b8cd4a5c99cbc8834f82bb46ab965a7265d5aa69fc7ced7"}, + {file = "rapidfuzz-3.9.3-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:05ee0696ebf0dfe8f7c17f364d70617616afc7dafe366532730ca34056065b8a"}, + {file = "rapidfuzz-3.9.3-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:2bc8391749e5022cd9e514ede5316f86e332ffd3cfceeabdc0b17b7e45198a8c"}, + {file = "rapidfuzz-3.9.3-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:93981895602cf5944d89d317ae3b1b4cc684d175a8ae2a80ce5b65615e72ddd0"}, + {file = "rapidfuzz-3.9.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:754b719a4990735f66653c9e9261dcf52fd4d925597e43d6b9069afcae700d21"}, + {file = "rapidfuzz-3.9.3-cp311-cp311-win32.whl", hash = "sha256:14c9f268ade4c88cf77ab007ad0fdf63699af071ee69378de89fff7aa3cae134"}, + {file = "rapidfuzz-3.9.3-cp311-cp311-win_amd64.whl", hash = "sha256:bc1991b4cde6c9d3c0bbcb83d5581dc7621bec8c666c095c65b4277233265a82"}, + {file = "rapidfuzz-3.9.3-cp311-cp311-win_arm64.whl", hash = "sha256:0c34139df09a61b1b557ab65782ada971b4a3bce7081d1b2bee45b0a52231adb"}, + {file = "rapidfuzz-3.9.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:5d6a210347d6e71234af5c76d55eeb0348b026c9bb98fe7c1cca89bac50fb734"}, + {file = "rapidfuzz-3.9.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b300708c917ce52f6075bdc6e05b07c51a085733650f14b732c087dc26e0aaad"}, + {file = "rapidfuzz-3.9.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:83ea7ca577d76778250421de61fb55a719e45b841deb769351fc2b1740763050"}, + {file = 
"rapidfuzz-3.9.3-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8319838fb5b7b5f088d12187d91d152b9386ce3979ed7660daa0ed1bff953791"}, + {file = "rapidfuzz-3.9.3-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:505d99131afd21529293a9a7b91dfc661b7e889680b95534756134dc1cc2cd86"}, + {file = "rapidfuzz-3.9.3-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c52970f7784518d7c82b07a62a26e345d2de8c2bd8ed4774e13342e4b3ff4200"}, + {file = "rapidfuzz-3.9.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:143caf7247449055ecc3c1e874b69e42f403dfc049fc2f3d5f70e1daf21c1318"}, + {file = "rapidfuzz-3.9.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:b8ab0fa653d9225195a8ff924f992f4249c1e6fa0aea563f685e71b81b9fcccf"}, + {file = "rapidfuzz-3.9.3-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:57e7c5bf7b61c7320cfa5dde1e60e678d954ede9bb7da8e763959b2138391401"}, + {file = "rapidfuzz-3.9.3-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:51fa1ba84653ab480a2e2044e2277bd7f0123d6693051729755addc0d015c44f"}, + {file = "rapidfuzz-3.9.3-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:17ff7f7eecdb169f9236e3b872c96dbbaf116f7787f4d490abd34b0116e3e9c8"}, + {file = "rapidfuzz-3.9.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:afe7c72d3f917b066257f7ff48562e5d462d865a25fbcabf40fca303a9fa8d35"}, + {file = "rapidfuzz-3.9.3-cp312-cp312-win32.whl", hash = "sha256:e53ed2e9b32674ce96eed80b3b572db9fd87aae6742941fb8e4705e541d861ce"}, + {file = "rapidfuzz-3.9.3-cp312-cp312-win_amd64.whl", hash = "sha256:35b7286f177e4d8ba1e48b03612f928a3c4bdac78e5651379cec59f95d8651e6"}, + {file = "rapidfuzz-3.9.3-cp312-cp312-win_arm64.whl", hash = "sha256:e6e4b9380ed4758d0cb578b0d1970c3f32dd9e87119378729a5340cb3169f879"}, + {file = "rapidfuzz-3.9.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:a39890013f6d5b056cc4bfdedc093e322462ece1027a57ef0c636537bdde7531"}, + {file = 
"rapidfuzz-3.9.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:b5bc0fdbf419493163c5c9cb147c5fbe95b8e25844a74a8807dcb1a125e630cf"}, + {file = "rapidfuzz-3.9.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:efe6e200a75a792d37b960457904c4fce7c928a96ae9e5d21d2bd382fe39066e"}, + {file = "rapidfuzz-3.9.3-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:de077c468c225d4c18f7188c47d955a16d65f21aab121cbdd98e3e2011002c37"}, + {file = "rapidfuzz-3.9.3-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8f917eaadf5388466a95f6a236f678a1588d231e52eda85374077101842e794e"}, + {file = "rapidfuzz-3.9.3-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:858ba57c05afd720db8088a8707079e8d024afe4644001fe0dbd26ef7ca74a65"}, + {file = "rapidfuzz-3.9.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d36447d21b05f90282a6f98c5a33771805f9222e5d0441d03eb8824e33e5bbb4"}, + {file = "rapidfuzz-3.9.3-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:acbe4b6f1ccd5b90c29d428e849aa4242e51bb6cab0448d5f3c022eb9a25f7b1"}, + {file = "rapidfuzz-3.9.3-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:53c7f27cdf899e94712972237bda48cfd427646aa6f5d939bf45d084780e4c16"}, + {file = "rapidfuzz-3.9.3-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:6175682a829c6dea4d35ed707f1dadc16513270ef64436568d03b81ccb6bdb74"}, + {file = "rapidfuzz-3.9.3-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:5276df395bd8497397197fca2b5c85f052d2e6a66ffc3eb0544dd9664d661f95"}, + {file = "rapidfuzz-3.9.3-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:77b5c4f3e72924d7845f0e189c304270066d0f49635cf8a3938e122c437e58de"}, + {file = "rapidfuzz-3.9.3-cp38-cp38-win32.whl", hash = "sha256:8add34061e5cd561c72ed4febb5c15969e7b25bda2bb5102d02afc3abc1f52d0"}, + {file = "rapidfuzz-3.9.3-cp38-cp38-win_amd64.whl", hash = "sha256:604e0502a39cf8e67fa9ad239394dddad4cdef6d7008fdb037553817d420e108"}, + {file = 
"rapidfuzz-3.9.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:21047f55d674614eb4b0ab34e35c3dc66f36403b9fbfae645199c4a19d4ed447"}, + {file = "rapidfuzz-3.9.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a56da3aff97cb56fe85d9ca957d1f55dbac7c27da927a86a2a86d8a7e17f80aa"}, + {file = "rapidfuzz-3.9.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:964c08481aec2fe574f0062e342924db2c6b321391aeb73d68853ed42420fd6d"}, + {file = "rapidfuzz-3.9.3-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5e2b827258beefbe5d3f958243caa5a44cf46187eff0c20e0b2ab62d1550327a"}, + {file = "rapidfuzz-3.9.3-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c6e65a301fcd19fbfbee3a514cc0014ff3f3b254b9fd65886e8a9d6957fb7bca"}, + {file = "rapidfuzz-3.9.3-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cbe93ba1725a8d47d2b9dca6c1f435174859427fbc054d83de52aea5adc65729"}, + {file = "rapidfuzz-3.9.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aca21c0a34adee582775da997a600283e012a608a107398d80a42f9a57ad323d"}, + {file = "rapidfuzz-3.9.3-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:256e07d3465173b2a91c35715a2277b1ee3ae0b9bbab4e519df6af78570741d0"}, + {file = "rapidfuzz-3.9.3-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:802ca2cc8aa6b8b34c6fdafb9e32540c1ba05fca7ad60b3bbd7ec89ed1797a87"}, + {file = "rapidfuzz-3.9.3-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:dd789100fc852cffac1449f82af0da139d36d84fd9faa4f79fc4140a88778343"}, + {file = "rapidfuzz-3.9.3-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:5d0abbacdb06e27ff803d7ae0bd0624020096802758068ebdcab9bd49cf53115"}, + {file = "rapidfuzz-3.9.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:378d1744828e27490a823fc6fe6ebfb98c15228d54826bf4e49e4b76eb5f5579"}, + {file = "rapidfuzz-3.9.3-cp39-cp39-win32.whl", hash = "sha256:5d0cb272d43e6d3c0dedefdcd9d00007471f77b52d2787a4695e9dd319bb39d2"}, + {file = 
"rapidfuzz-3.9.3-cp39-cp39-win_amd64.whl", hash = "sha256:15e4158ac4b3fb58108072ec35b8a69165f651ba1c8f43559a36d518dbf9fb3f"}, + {file = "rapidfuzz-3.9.3-cp39-cp39-win_arm64.whl", hash = "sha256:58c6a4936190c558d5626b79fc9e16497e5df7098589a7e80d8bff68148ff096"}, + {file = "rapidfuzz-3.9.3-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:5410dc848c947a603792f4f51b904a3331cf1dc60621586bfbe7a6de72da1091"}, + {file = "rapidfuzz-3.9.3-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:282d55700a1a3d3a7980746eb2fcd48c9bbc1572ebe0840d0340d548a54d01fe"}, + {file = "rapidfuzz-3.9.3-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dc1037507810833646481f5729901a154523f98cbebb1157ba3a821012e16402"}, + {file = "rapidfuzz-3.9.3-pp310-pypy310_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5e33f779391caedcba2ba3089fb6e8e557feab540e9149a5c3f7fea7a3a7df37"}, + {file = "rapidfuzz-3.9.3-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:41a81a9f311dc83d22661f9b1a1de983b201322df0c4554042ffffd0f2040c37"}, + {file = "rapidfuzz-3.9.3-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:a93250bd8fae996350c251e1752f2c03335bb8a0a5b0c7e910a593849121a435"}, + {file = "rapidfuzz-3.9.3-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:3617d1aa7716c57d120b6adc8f7c989f2d65bc2b0cbd5f9288f1fc7bf469da11"}, + {file = "rapidfuzz-3.9.3-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:ad04a3f5384b82933213bba2459f6424decc2823df40098920856bdee5fd6e88"}, + {file = "rapidfuzz-3.9.3-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8709918da8a88ad73c9d4dd0ecf24179a4f0ceba0bee21efc6ea21a8b5290349"}, + {file = "rapidfuzz-3.9.3-pp38-pypy38_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b770f85eab24034e6ef7df04b2bfd9a45048e24f8a808e903441aa5abde8ecdd"}, + {file = "rapidfuzz-3.9.3-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:930b4e6fdb4d914390141a2b99a6f77a52beacf1d06aa4e170cba3a98e24c1bc"}, + {file = "rapidfuzz-3.9.3-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:c8444e921bfc3757c475c4f4d7416a7aa69b2d992d5114fe55af21411187ab0d"}, + {file = "rapidfuzz-3.9.3-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:2c1d3ef3878f871abe6826e386c3d61b5292ef5f7946fe646f4206b85836b5da"}, + {file = "rapidfuzz-3.9.3-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:d861bf326ee7dabc35c532a40384541578cd1ec1e1b7db9f9ecbba56eb76ca22"}, + {file = "rapidfuzz-3.9.3-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cde6b9d9ba5007077ee321ec722fa714ebc0cbd9a32ccf0f4dd3cc3f20952d71"}, + {file = "rapidfuzz-3.9.3-pp39-pypy39_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3bb6546e7b6bed1aefbe24f68a5fb9b891cc5aef61bca6c1a7b1054b7f0359bb"}, + {file = "rapidfuzz-3.9.3-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3d8a57261ef7996d5ced7c8cba9189ada3fbeffd1815f70f635e4558d93766cb"}, + {file = "rapidfuzz-3.9.3-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:67201c02efc596923ad950519e0b75ceb78d524177ea557134d6567b9ac2c283"}, + {file = "rapidfuzz-3.9.3.tar.gz", hash = "sha256:b398ea66e8ed50451bce5997c430197d5e4b06ac4aa74602717f792d8d8d06e2"}, +] + +[package.extras] +full = ["numpy"] + [[package]] name = "referencing" version = "0.35.1" @@ -3165,13 +3297,13 @@ files = [ [[package]] name = "requests" -version = "2.32.2" +version = "2.32.3" description = "Python HTTP for Humans." 
optional = false python-versions = ">=3.8" files = [ - {file = "requests-2.32.2-py3-none-any.whl", hash = "sha256:fc06670dd0ed212426dfeb94fc1b983d917c4f9847c863f313c9dfaaffb7c23c"}, - {file = "requests-2.32.2.tar.gz", hash = "sha256:dd951ff5ecf3e3b3aa26b40703ba77495dab41da839ae72ef3c8e5d8e2433289"}, + {file = "requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6"}, + {file = "requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760"}, ] [package.dependencies] @@ -3292,6 +3424,83 @@ files = [ {file = "rpds_py-0.18.1.tar.gz", hash = "sha256:dc48b479d540770c811fbd1eb9ba2bb66951863e448efec2e2c102625328e92f"}, ] +[[package]] +name = "ruamel-yaml" +version = "0.18.6" +description = "ruamel.yaml is a YAML parser/emitter that supports roundtrip preservation of comments, seq/map flow style, and map key order" +optional = false +python-versions = ">=3.7" +files = [ + {file = "ruamel.yaml-0.18.6-py3-none-any.whl", hash = "sha256:57b53ba33def16c4f3d807c0ccbc00f8a6081827e81ba2491691b76882d0c636"}, + {file = "ruamel.yaml-0.18.6.tar.gz", hash = "sha256:8b27e6a217e786c6fbe5634d8f3f11bc63e0f80f6a5890f28863d9c45aac311b"}, +] + +[package.dependencies] +"ruamel.yaml.clib" = {version = ">=0.2.7", markers = "platform_python_implementation == \"CPython\" and python_version < \"3.13\""} + +[package.extras] +docs = ["mercurial (>5.7)", "ryd"] +jinja2 = ["ruamel.yaml.jinja2 (>=0.2)"] + +[[package]] +name = "ruamel-yaml-clib" +version = "0.2.8" +description = "C version of reader, parser and emitter for ruamel.yaml derived from libyaml" +optional = false +python-versions = ">=3.6" +files = [ + {file = "ruamel.yaml.clib-0.2.8-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:b42169467c42b692c19cf539c38d4602069d8c1505e97b86387fcf7afb766e1d"}, + {file = "ruamel.yaml.clib-0.2.8-cp310-cp310-macosx_13_0_arm64.whl", hash = 
"sha256:07238db9cbdf8fc1e9de2489a4f68474e70dffcb32232db7c08fa61ca0c7c462"}, + {file = "ruamel.yaml.clib-0.2.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:fff3573c2db359f091e1589c3d7c5fc2f86f5bdb6f24252c2d8e539d4e45f412"}, + {file = "ruamel.yaml.clib-0.2.8-cp310-cp310-manylinux_2_24_aarch64.whl", hash = "sha256:aa2267c6a303eb483de8d02db2871afb5c5fc15618d894300b88958f729ad74f"}, + {file = "ruamel.yaml.clib-0.2.8-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:840f0c7f194986a63d2c2465ca63af8ccbbc90ab1c6001b1978f05119b5e7334"}, + {file = "ruamel.yaml.clib-0.2.8-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:024cfe1fc7c7f4e1aff4a81e718109e13409767e4f871443cbff3dba3578203d"}, + {file = "ruamel.yaml.clib-0.2.8-cp310-cp310-win32.whl", hash = "sha256:c69212f63169ec1cfc9bb44723bf2917cbbd8f6191a00ef3410f5a7fe300722d"}, + {file = "ruamel.yaml.clib-0.2.8-cp310-cp310-win_amd64.whl", hash = "sha256:cabddb8d8ead485e255fe80429f833172b4cadf99274db39abc080e068cbcc31"}, + {file = "ruamel.yaml.clib-0.2.8-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:bef08cd86169d9eafb3ccb0a39edb11d8e25f3dae2b28f5c52fd997521133069"}, + {file = "ruamel.yaml.clib-0.2.8-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:b16420e621d26fdfa949a8b4b47ade8810c56002f5389970db4ddda51dbff248"}, + {file = "ruamel.yaml.clib-0.2.8-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:25c515e350e5b739842fc3228d662413ef28f295791af5e5110b543cf0b57d9b"}, + {file = "ruamel.yaml.clib-0.2.8-cp311-cp311-manylinux_2_24_aarch64.whl", hash = "sha256:1707814f0d9791df063f8c19bb51b0d1278b8e9a2353abbb676c2f685dee6afe"}, + {file = "ruamel.yaml.clib-0.2.8-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:46d378daaac94f454b3a0e3d8d78cafd78a026b1d71443f4966c696b48a6d899"}, + {file = "ruamel.yaml.clib-0.2.8-cp311-cp311-musllinux_1_1_x86_64.whl", hash = 
"sha256:09b055c05697b38ecacb7ac50bdab2240bfca1a0c4872b0fd309bb07dc9aa3a9"}, + {file = "ruamel.yaml.clib-0.2.8-cp311-cp311-win32.whl", hash = "sha256:53a300ed9cea38cf5a2a9b069058137c2ca1ce658a874b79baceb8f892f915a7"}, + {file = "ruamel.yaml.clib-0.2.8-cp311-cp311-win_amd64.whl", hash = "sha256:c2a72e9109ea74e511e29032f3b670835f8a59bbdc9ce692c5b4ed91ccf1eedb"}, + {file = "ruamel.yaml.clib-0.2.8-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:ebc06178e8821efc9692ea7544aa5644217358490145629914d8020042c24aa1"}, + {file = "ruamel.yaml.clib-0.2.8-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:edaef1c1200c4b4cb914583150dcaa3bc30e592e907c01117c08b13a07255ec2"}, + {file = "ruamel.yaml.clib-0.2.8-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d176b57452ab5b7028ac47e7b3cf644bcfdc8cacfecf7e71759f7f51a59e5c92"}, + {file = "ruamel.yaml.clib-0.2.8-cp312-cp312-manylinux_2_24_aarch64.whl", hash = "sha256:1dc67314e7e1086c9fdf2680b7b6c2be1c0d8e3a8279f2e993ca2a7545fecf62"}, + {file = "ruamel.yaml.clib-0.2.8-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:3213ece08ea033eb159ac52ae052a4899b56ecc124bb80020d9bbceeb50258e9"}, + {file = "ruamel.yaml.clib-0.2.8-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:aab7fd643f71d7946f2ee58cc88c9b7bfc97debd71dcc93e03e2d174628e7e2d"}, + {file = "ruamel.yaml.clib-0.2.8-cp312-cp312-win32.whl", hash = "sha256:5c365d91c88390c8d0a8545df0b5857172824b1c604e867161e6b3d59a827eaa"}, + {file = "ruamel.yaml.clib-0.2.8-cp312-cp312-win_amd64.whl", hash = "sha256:1758ce7d8e1a29d23de54a16ae867abd370f01b5a69e1a3ba75223eaa3ca1a1b"}, + {file = "ruamel.yaml.clib-0.2.8-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:a5aa27bad2bb83670b71683aae140a1f52b0857a2deff56ad3f6c13a017a26ed"}, + {file = "ruamel.yaml.clib-0.2.8-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:c58ecd827313af6864893e7af0a3bb85fd529f862b6adbefe14643947cfe2942"}, + {file = 
"ruamel.yaml.clib-0.2.8-cp37-cp37m-macosx_12_0_arm64.whl", hash = "sha256:f481f16baec5290e45aebdc2a5168ebc6d35189ae6fea7a58787613a25f6e875"}, + {file = "ruamel.yaml.clib-0.2.8-cp37-cp37m-manylinux_2_24_aarch64.whl", hash = "sha256:77159f5d5b5c14f7c34073862a6b7d34944075d9f93e681638f6d753606c6ce6"}, + {file = "ruamel.yaml.clib-0.2.8-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:7f67a1ee819dc4562d444bbafb135832b0b909f81cc90f7aa00260968c9ca1b3"}, + {file = "ruamel.yaml.clib-0.2.8-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:4ecbf9c3e19f9562c7fdd462e8d18dd902a47ca046a2e64dba80699f0b6c09b7"}, + {file = "ruamel.yaml.clib-0.2.8-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:87ea5ff66d8064301a154b3933ae406b0863402a799b16e4a1d24d9fbbcbe0d3"}, + {file = "ruamel.yaml.clib-0.2.8-cp37-cp37m-win32.whl", hash = "sha256:75e1ed13e1f9de23c5607fe6bd1aeaae21e523b32d83bb33918245361e9cc51b"}, + {file = "ruamel.yaml.clib-0.2.8-cp37-cp37m-win_amd64.whl", hash = "sha256:3f215c5daf6a9d7bbed4a0a4f760f3113b10e82ff4c5c44bec20a68c8014f675"}, + {file = "ruamel.yaml.clib-0.2.8-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1b617618914cb00bf5c34d4357c37aa15183fa229b24767259657746c9077615"}, + {file = "ruamel.yaml.clib-0.2.8-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:a6a9ffd280b71ad062eae53ac1659ad86a17f59a0fdc7699fd9be40525153337"}, + {file = "ruamel.yaml.clib-0.2.8-cp38-cp38-manylinux_2_24_aarch64.whl", hash = "sha256:305889baa4043a09e5b76f8e2a51d4ffba44259f6b4c72dec8ca56207d9c6fe1"}, + {file = "ruamel.yaml.clib-0.2.8-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:700e4ebb569e59e16a976857c8798aee258dceac7c7d6b50cab63e080058df91"}, + {file = "ruamel.yaml.clib-0.2.8-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:e2b4c44b60eadec492926a7270abb100ef9f72798e18743939bdbf037aab8c28"}, + {file = "ruamel.yaml.clib-0.2.8-cp38-cp38-musllinux_1_1_x86_64.whl", hash = 
"sha256:e79e5db08739731b0ce4850bed599235d601701d5694c36570a99a0c5ca41a9d"}, + {file = "ruamel.yaml.clib-0.2.8-cp38-cp38-win32.whl", hash = "sha256:955eae71ac26c1ab35924203fda6220f84dce57d6d7884f189743e2abe3a9fbe"}, + {file = "ruamel.yaml.clib-0.2.8-cp38-cp38-win_amd64.whl", hash = "sha256:56f4252222c067b4ce51ae12cbac231bce32aee1d33fbfc9d17e5b8d6966c312"}, + {file = "ruamel.yaml.clib-0.2.8-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:03d1162b6d1df1caa3a4bd27aa51ce17c9afc2046c31b0ad60a0a96ec22f8001"}, + {file = "ruamel.yaml.clib-0.2.8-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:bba64af9fa9cebe325a62fa398760f5c7206b215201b0ec825005f1b18b9bccf"}, + {file = "ruamel.yaml.clib-0.2.8-cp39-cp39-manylinux_2_24_aarch64.whl", hash = "sha256:a1a45e0bb052edf6a1d3a93baef85319733a888363938e1fc9924cb00c8df24c"}, + {file = "ruamel.yaml.clib-0.2.8-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:da09ad1c359a728e112d60116f626cc9f29730ff3e0e7db72b9a2dbc2e4beed5"}, + {file = "ruamel.yaml.clib-0.2.8-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:184565012b60405d93838167f425713180b949e9d8dd0bbc7b49f074407c5a8b"}, + {file = "ruamel.yaml.clib-0.2.8-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:a75879bacf2c987c003368cf14bed0ffe99e8e85acfa6c0bfffc21a090f16880"}, + {file = "ruamel.yaml.clib-0.2.8-cp39-cp39-win32.whl", hash = "sha256:84b554931e932c46f94ab306913ad7e11bba988104c5cff26d90d03f68258cd5"}, + {file = "ruamel.yaml.clib-0.2.8-cp39-cp39-win_amd64.whl", hash = "sha256:25ac8c08322002b06fa1d49d1646181f0b2c72f5cbc15a85e80b4c30a544bb15"}, + {file = "ruamel.yaml.clib-0.2.8.tar.gz", hash = "sha256:beb2e0404003de9a4cab9753a8805a8fe9320ee6673136ed7f04255fe60bb512"}, +] + [[package]] name = "ruff" version = "0.3.7" @@ -3552,20 +3761,67 @@ transformers = ">=4.34.0,<5.0.0" dev = ["pre-commit", "pytest", "ruff (>=0.3.0)"] [[package]] -name = "setuptools" -version = "70.0.0" -description = "Easily download, build, install, upgrade, and uninstall Python packages" 
+name = "sentencepiece" +version = "0.2.0" +description = "SentencePiece python wrapper" optional = false -python-versions = ">=3.8" +python-versions = "*" files = [ - {file = "setuptools-70.0.0-py3-none-any.whl", hash = "sha256:54faa7f2e8d2d11bcd2c07bed282eef1046b5c080d1c32add737d7b5817b1ad4"}, - {file = "setuptools-70.0.0.tar.gz", hash = "sha256:f211a66637b8fa059bb28183da127d4e86396c991a942b028c6650d4319c3fd0"}, + {file = "sentencepiece-0.2.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:188779e1298a1c8b8253c7d3ad729cb0a9891e5cef5e5d07ce4592c54869e227"}, + {file = "sentencepiece-0.2.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:bed9cf85b296fa2b76fc2547b9cbb691a523864cebaee86304c43a7b4cb1b452"}, + {file = "sentencepiece-0.2.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d7b67e724bead13f18db6e1d10b6bbdc454af574d70efbb36f27d90387be1ca3"}, + {file = "sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2fde4b08cfe237be4484c6c7c2e2c75fb862cfeab6bd5449ce4caeafd97b767a"}, + {file = "sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4c378492056202d1c48a4979650981635fd97875a00eabb1f00c6a236b013b5e"}, + {file = "sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1380ce6540a368de2ef6d7e6ba14ba8f3258df650d39ba7d833b79ee68a52040"}, + {file = "sentencepiece-0.2.0-cp310-cp310-win32.whl", hash = "sha256:a1151d6a6dd4b43e552394aed0edfe9292820272f0194bd56c7c1660a0c06c3d"}, + {file = "sentencepiece-0.2.0-cp310-cp310-win_amd64.whl", hash = "sha256:d490142b0521ef22bc1085f061d922a2a6666175bb6b42e588ff95c0db6819b2"}, + {file = "sentencepiece-0.2.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:17982700c4f6dbb55fa3594f3d7e5dd1c8659a274af3738e33c987d2a27c9d5c"}, + {file = "sentencepiece-0.2.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:7c867012c0e8bcd5bdad0f791609101cb5c66acb303ab3270218d6debc68a65e"}, + {file = 
"sentencepiece-0.2.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7fd6071249c74f779c5b27183295b9202f8dedb68034e716784364443879eaa6"}, + {file = "sentencepiece-0.2.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:27f90c55a65013cbb8f4d7aab0599bf925cde4adc67ae43a0d323677b5a1c6cb"}, + {file = "sentencepiece-0.2.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b293734059ef656dcd65be62ff771507bea8fed0a711b6733976e1ed3add4553"}, + {file = "sentencepiece-0.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e58b47f933aca74c6a60a79dcb21d5b9e47416256c795c2d58d55cec27f9551d"}, + {file = "sentencepiece-0.2.0-cp311-cp311-win32.whl", hash = "sha256:c581258cf346b327c62c4f1cebd32691826306f6a41d8c4bec43b010dee08e75"}, + {file = "sentencepiece-0.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:0993dbc665f4113017892f1b87c3904a44d0640eda510abcacdfb07f74286d36"}, + {file = "sentencepiece-0.2.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:ea5f536e32ea8ec96086ee00d7a4a131ce583a1b18d130711707c10e69601cb2"}, + {file = "sentencepiece-0.2.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:d0cb51f53b6aae3c36bafe41e86167c71af8370a039f542c43b0cce5ef24a68c"}, + {file = "sentencepiece-0.2.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3212121805afc58d8b00ab4e7dd1f8f76c203ddb9dc94aa4079618a31cf5da0f"}, + {file = "sentencepiece-0.2.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2a3149e3066c2a75e0d68a43eb632d7ae728c7925b517f4c05c40f6f7280ce08"}, + {file = "sentencepiece-0.2.0-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:632f3594d3e7ac8b367bca204cb3fd05a01d5b21455acd097ea4c0e30e2f63d7"}, + {file = "sentencepiece-0.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f295105c6bdbb05bd5e1b0cafbd78ff95036f5d3641e7949455a3f4e5e7c3109"}, + {file = "sentencepiece-0.2.0-cp312-cp312-win32.whl", hash = 
"sha256:fb89f811e5efd18bab141afc3fea3de141c3f69f3fe9e898f710ae7fe3aab251"}, + {file = "sentencepiece-0.2.0-cp312-cp312-win_amd64.whl", hash = "sha256:7a673a72aab81fef5ebe755c6e0cc60087d1f3a4700835d40537183c1703a45f"}, + {file = "sentencepiece-0.2.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:4547683f330289ec4f093027bfeb87f9ef023b2eb6f879fdc4a8187c7e0ffb90"}, + {file = "sentencepiece-0.2.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7cd6175f7eaec7142d2bf6f6597ce7db4c9ac89acf93fcdb17410c3a8b781eeb"}, + {file = "sentencepiece-0.2.0-cp36-cp36m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:859ba1acde782609a0910a26a60e16c191a82bf39b5621107552c0cd79fad00f"}, + {file = "sentencepiece-0.2.0-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bcbbef6cc277f8f18f36959e305f10b1c620442d75addc79c21d7073ae581b50"}, + {file = "sentencepiece-0.2.0-cp36-cp36m-win32.whl", hash = "sha256:536b934e244829e3fe6c4f198652cd82da48adb9aa145c9f00889542726dee3d"}, + {file = "sentencepiece-0.2.0-cp36-cp36m-win_amd64.whl", hash = "sha256:0a91aaa3c769b52440df56fafda683b3aa48e3f2169cf7ee5b8c8454a7f3ae9b"}, + {file = "sentencepiece-0.2.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:787e480ca4c1d08c9985a7eb1eae4345c107729c99e9b5a9a00f2575fc7d4b4b"}, + {file = "sentencepiece-0.2.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f4d158189eb2ecffea3a51edf6d25e110b3678ec47f1a40f2d541eafbd8f6250"}, + {file = "sentencepiece-0.2.0-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d1e5ca43013e8935f25457a4fca47e315780172c3e821b4b13a890668911c792"}, + {file = "sentencepiece-0.2.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7140d9e5a74a0908493bb4a13f1f16a401297bd755ada4c707e842fbf6f0f5bf"}, + {file = "sentencepiece-0.2.0-cp37-cp37m-win32.whl", hash = "sha256:6cf333625234f247ab357b0bd9836638405ea9082e1543d5b8408f014979dcbf"}, + {file = 
"sentencepiece-0.2.0-cp37-cp37m-win_amd64.whl", hash = "sha256:ff88712338b01031910e8e61e7239aff3ce8869ee31a47df63cb38aadd591bea"}, + {file = "sentencepiece-0.2.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:20813a68d4c221b1849c62c30e1281ea81687894d894b8d4a0f4677d9311e0f5"}, + {file = "sentencepiece-0.2.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:926ef920ae2e8182db31d3f5d081ada57804e3e1d3a8c4ef8b117f9d9fb5a945"}, + {file = "sentencepiece-0.2.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:89f65f69636b7e9c015b79dff9c9985a9bc7d19ded6f79ef9f1ec920fdd73ecf"}, + {file = "sentencepiece-0.2.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0f67eae0dbe6f2d7d6ba50a354623d787c99965f068b81e145d53240198021b0"}, + {file = "sentencepiece-0.2.0-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:98501e075f35dd1a1d5a20f65be26839fcb1938752ec61539af008a5aa6f510b"}, + {file = "sentencepiece-0.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e3d1d2cc4882e8d6a1adf9d5927d7716f80617fc693385661caff21888972269"}, + {file = "sentencepiece-0.2.0-cp38-cp38-win32.whl", hash = "sha256:b99a308a2e5e569031ab164b74e6fab0b6f37dfb493c32f7816225f4d411a6dd"}, + {file = "sentencepiece-0.2.0-cp38-cp38-win_amd64.whl", hash = "sha256:cdb701eec783d3ec86b7cd4c763adad8eaf6b46db37ee1c36e5e6c44b3fe1b5f"}, + {file = "sentencepiece-0.2.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:1e0f9c4d0a6b0af59b613175f019916e28ade076e21242fd5be24340d8a2f64a"}, + {file = "sentencepiece-0.2.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:298f21cc1366eb60311aedba3169d30f885c363ddbf44214b0a587d2908141ad"}, + {file = "sentencepiece-0.2.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3f1ec95aa1e5dab11f37ac7eff190493fd87770f7a8b81ebc9dd768d1a3c8704"}, + {file = "sentencepiece-0.2.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7b06b70af54daa4b4904cbb90b4eb6d35c9f3252fdc86c9c32d5afd4d30118d8"}, 
+ {file = "sentencepiece-0.2.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:22e37bac44dd6603388cb598c64ff7a76e41ca774646f21c23aadfbf5a2228ab"}, + {file = "sentencepiece-0.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0461324897735512a32d222e3d886e24ad6a499761952b6bda2a9ee6e4313ea5"}, + {file = "sentencepiece-0.2.0-cp39-cp39-win32.whl", hash = "sha256:38aed822fb76435fa1f12185f10465a94ab9e51d5e8a9159e9a540ce926f0ffd"}, + {file = "sentencepiece-0.2.0-cp39-cp39-win_amd64.whl", hash = "sha256:d8cf876516548b5a1d6ac4745d8b554f5c07891d55da557925e5c13ff0b4e6ad"}, + {file = "sentencepiece-0.2.0.tar.gz", hash = "sha256:a52c19171daaf2e697dc6cbe67684e0fa341b1248966f6aebb541de654d15843"}, ] -[package.extras] -docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"] -testing = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "mypy (==1.9)", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.1)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy", "pytest-perf", "pytest-ruff (>=0.2.1)", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] - [[package]] name = "six" version = "1.16.0" @@ -3646,6 +3902,29 @@ numpy = "*" docs = ["linkify-it-py", "myst-parser", "sphinx", "sphinx-book-theme"] test = ["pytest"] +[[package]] +name = "speechbrain" +version = "1.0.0" +description = "All-in-one speech toolkit in pure Python and Pytorch" +optional = false +python-versions = ">=3.7" +files = [ + {file = 
"speechbrain-1.0.0-py3-none-any.whl", hash = "sha256:3f163958fc5a6dc05851700e8cee9d828110fbb7a8defec911f25849ce33f45f"}, + {file = "speechbrain-1.0.0.tar.gz", hash = "sha256:f44797a23e0351f6ebf5ffd323bf44857431b2ffa55ec8e0c9f66e2ac6f6cea7"}, +] + +[package.dependencies] +huggingface-hub = "*" +hyperpyyaml = "*" +joblib = "*" +numpy = "*" +packaging = "*" +scipy = "*" +sentencepiece = "*" +torch = ">=1.9" +torchaudio = "*" +tqdm = "*" + [[package]] name = "stack-data" version = "0.6.3" @@ -3667,17 +3946,17 @@ tests = ["cython", "littleutils", "pygments", "pytest", "typeguard"] [[package]] name = "sympy" -version = "1.12" +version = "1.12.1" description = "Computer algebra system (CAS) in Python" optional = false python-versions = ">=3.8" files = [ - {file = "sympy-1.12-py3-none-any.whl", hash = "sha256:c3588cd4295d0c0f603d0f2ae780587e64e2efeedb3521e46b9bb1d08d184fa5"}, - {file = "sympy-1.12.tar.gz", hash = "sha256:ebf595c8dac3e0fdc4152c51878b498396ec7f30e7a914d6071e674d49420fb8"}, + {file = "sympy-1.12.1-py3-none-any.whl", hash = "sha256:9b2cbc7f1a640289430e13d2a56f02f867a1da0190f2f99d8968c2f74da0e515"}, + {file = "sympy-1.12.1.tar.gz", hash = "sha256:2877b03f998cd8c08f07cd0de5b767119cd3ef40d09f41c30d722f6686b0fb88"}, ] [package.dependencies] -mpmath = ">=0.19" +mpmath = ">=1.1.0,<1.4.0" [[package]] name = "threadpoolctl" @@ -4050,13 +4329,13 @@ test = ["argcomplete (>=3.0.3)", "mypy (>=1.7.0)", "pre-commit", "pytest (>=7.0, [[package]] name = "transformers" -version = "4.41.1" +version = "4.41.2" description = "State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow" optional = false python-versions = ">=3.8.0" files = [ - {file = "transformers-4.41.1-py3-none-any.whl", hash = "sha256:f0680e0b1a01067eccd11f62f0522409422c7d6f91d532fe0f50b136a406129d"}, - {file = "transformers-4.41.1.tar.gz", hash = "sha256:fa859e4c66f0896633a3bf534e0d9a29a9a88478a49f94c5d8270537dc61cc42"}, + {file = "transformers-4.41.2-py3-none-any.whl", hash = 
"sha256:05555d20e43f808de1ef211ab64803cdb513170cef70d29a888b589caebefc67"}, + {file = "transformers-4.41.2.tar.gz", hash = "sha256:80a4db216533d573e9cc7388646c31ed9480918feb7c55eb211249cb23567f87"}, ] [package.dependencies] @@ -4139,13 +4418,13 @@ tutorials = ["matplotlib", "pandas", "tabulate", "torch"] [[package]] name = "typing-extensions" -version = "4.12.0" +version = "4.12.1" description = "Backported and Experimental Type Hints for Python 3.8+" optional = false python-versions = ">=3.8" files = [ - {file = "typing_extensions-4.12.0-py3-none-any.whl", hash = "sha256:b349c66bea9016ac22978d800cfff206d5f9816951f12a7d0ec5578b0a819594"}, - {file = "typing_extensions-4.12.0.tar.gz", hash = "sha256:8cbcdc8606ebcb0d95453ad7dc5065e6237b6aa230a31e81d0f440c30fed5fd8"}, + {file = "typing_extensions-4.12.1-py3-none-any.whl", hash = "sha256:6024b58b69089e5a89c347397254e35f1bf02a907728ec7fee9bf0fe837d203a"}, + {file = "typing_extensions-4.12.1.tar.gz", hash = "sha256:915f5e35ff76f56588223f15fdd5938f9a1cf9195c0de25130c627e4d597f6d1"}, ] [[package]] @@ -4429,20 +4708,20 @@ multidict = ">=4.0" [[package]] name = "zipp" -version = "3.19.0" +version = "3.19.2" description = "Backport of pathlib-compatible object wrapper for zip files" optional = false python-versions = ">=3.8" files = [ - {file = "zipp-3.19.0-py3-none-any.whl", hash = "sha256:96dc6ad62f1441bcaccef23b274ec471518daf4fbbc580341204936a5a3dddec"}, - {file = "zipp-3.19.0.tar.gz", hash = "sha256:952df858fb3164426c976d9338d3961e8e8b3758e2e059e0f754b8c4262625ee"}, + {file = "zipp-3.19.2-py3-none-any.whl", hash = "sha256:f091755f667055f2d02b32c53771a7a6c8b47e1fdbc4b72a8b9072b3eef8015c"}, + {file = "zipp-3.19.2.tar.gz", hash = "sha256:bf1dcf6450f873a13e952a29504887c89e6de7506209e5b1bcc3460135d4de19"}, ] [package.extras] -docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] -testing = ["big-O", "jaraco.functools", "jaraco.itertools", "jaraco.test", 
"more-itertools", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", "pytest-mypy", "pytest-ruff (>=0.2.1)"] +doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] +test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", "pytest-mypy", "pytest-ruff (>=0.2.1)"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "37f8aaa4ac3c3d0fd9cb8e1d69b5419f93196e9647721f7e2aec53342e0981f6" +content-hash = "29a57eaee26c5cc55f49ab5ebb6c003a40941a2f277fd344248f82f10f858bae" diff --git a/pyproject.toml b/pyproject.toml index cd17d839..2ea59107 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "senselab" -version = "0.1.2.dev23+c952ded" +version = "0.0.1" description = "Senselab is a Python package that simplifies building pipelines for speech and voice analysis." 
authors = [ "Fabio Catania ", @@ -28,7 +28,7 @@ packages = [{include = "senselab", from = "src"}] python = "^3.10" click = "^8.1.7" jsonschema = "^4.21.1" -datasets = "^2.18.0" +datasets = "^2.19.2" torch = "^2.2.2" torchvision = "^0.17.2" torchaudio = "^2.2.2" @@ -38,21 +38,23 @@ soundfile = "^0.12.1" ffmpeg-python = "^0.2.0" ipykernel = "^6.29.4" pydra = "^0.23" -pydantic = "^2.7.1" +pydantic = "^2.7.3" accelerate = "^0.29.3" -huggingface-hub = "^0.23.0" +huggingface-hub = "^0.23.3" praat-parselmouth = "^0.4.3" iso-639 = {git = "https://github.com/noumar/iso639.git", tag = "0.4.5"} opensmile = "^2.5.0" audiomentations = "^0.35.0" torch-audiomentations = "^0.11.1" sentence-transformers = "^2.7.0" +jiwer = "^3.0.4" +speechbrain = "^1.0.0" [tool.poetry.group.dev] optional = true [tool.poetry.group.dev.dependencies] -pytest = "^8.1.1" +pytest = "^8.2.2" pytest-mock = "^3.14.0" mypy = "^1.9.0" pre-commit = "^3.7.0" @@ -79,6 +81,9 @@ testpaths = [ [tool.mypy] ignore_missing_imports = true +plugins = [ + "pydantic.mypy" +] [tool.ruff] exclude = [ @@ -104,7 +109,7 @@ exclude = [ "node_modules", "venv" ] -line-length = 80 +line-length = 120 indent-width = 4 src = ["src"] target-version = "py310" @@ -140,10 +145,10 @@ pattern = "default-unprefixed" [tool.codespell] skip = [ - "./poetry.lock", - "./docs_style/pdoc-theme/syntax-highlighting.css" + "poetry.lock", + "docs_style/pdoc-theme/syntax-highlighting.css" ] -ignore-words-list = ["senselab"] +ignore-words-list = ["senselab", "nd", "astroid", "wil"] [build-system] requires = ["poetry-core>=1.0.0", "poetry-dynamic-versioning>=1.0.0,<2.0.0"] diff --git a/scripts/experiment1.py b/scripts/experiment1.py deleted file mode 100644 index d1eb75b6..00000000 --- a/scripts/experiment1.py +++ /dev/null @@ -1,170 +0,0 @@ -"""This script is used to test the audio tasks.""" - -from typing import Any, Dict - -import pydra - -from senselab.audio.tasks.preprocessing import resample_hf_dataset -from senselab.audio.tasks.preprocessing_pydra 
import resample_hf_dataset_pt -from senselab.audio.tasks.speech_to_text import transcribe_dataset_with_hf -from senselab.audio.tasks.speech_to_text_pydra import ( - transcribe_dataset_with_hf_pt, -) -from senselab.utils.decorators import get_response_time -from senselab.utils.tasks.input_output import read_files_from_disk -from senselab.utils.tasks.input_output_pydra import read_files_from_disk_pt - - -@get_response_time -def workflow(data: Dict[str, Any]) -> None: - """This function reads files from disk and transcribes them with Whisper.""" - print("Starting to read files from disk...") - dataset = read_files_from_disk(data["files"]) - print(f"Dataset loaded with {len(dataset)} records.") - - print("Resampling dataset...") - dataset = resample_hf_dataset(dataset, 16000) - print("Resampled dataset.") - - """ - print("Pushing dataset to the hub...") - push_dataset_to_hub(dataset, - remote_repository="fabiocat/test", - split="train") - print("Dataset pushed to the hub successfully.") - """ - - print("Transcribing dataset...") - _ = transcribe_dataset_with_hf( - dataset=dataset, model_id="openai/whisper-tiny", language="en" - ) # facebook/wav2vec2-base-960h - print("Transcribed dataset.") - - """ - print("Pushing dataset to the hub...") - push_dataset_to_hub(transcript_dataset, - remote_repository="fabiocat/transcript") - print("Dataset pushed to the hub successfully.") - """ - - -@get_response_time -def pydra_workflow(data: Dict[str, Any]) -> None: - """This function reads files from disk and transcribes them with Whisper.""" - wf0 = pydra.Workflow(name="wf0", input_spec=["x"], x=data["files"]) - wf0.add( - read_files_from_disk_pt( - name="read_files_from_disk_name", files=wf0.lzin.x - ) - ) - wf0.add( - resample_hf_dataset_pt( - name="resample_hf_dataset_name", - dataset=wf0.read_files_from_disk_name.lzout.out, - resample_rate=16000, - ) - ) - - """ - wf0.add(push_dataset_to_hub_pt(name='push_audio_dataset_to_hub_name', - 
dataset=wf0.resample_hf_dataset_name.lzout.out, - remote_repository="fabiocat/test", - split="train") - ) - """ - - wf0.add( - transcribe_dataset_with_hf_pt( - name="transcribe_dataset_name", - dataset=wf0.resample_hf_dataset_name.lzout.out, - model_id="openai/whisper-tiny", - language="en", - ) - ) - - """ - wf0.add(push_dataset_to_hub_pt(name='push_transcript_dataset_to_hub_name', - dataset=wf0.transcribe_dataset_name.lzout.out, - remote_repository="fabiocat/transcript") - ) - """ - - wf0.set_output([("out", wf0.transcribe_dataset_name.lzout.out)]) - - # PYDRA RUN - with pydra.Submitter(plugin="serial") as sub: - sub(wf0) - - _ = wf0.result() - - -@get_response_time -def pydra_workflow2(data: Dict[str, Any]) -> None: - """This function reads files from disk and transcribes them with Whisper.""" - wf0 = pydra.Workflow(name="wf0", input_spec=["x"], x=data["files"]) - - wf0.add( - read_files_from_disk_pt( - name="read_files_from_disk_name", files=wf0.lzin.x - ).split("files", files=wf0.lzin.x) - ) - wf0.add( - resample_hf_dataset_pt( - name="resample_hf_dataset_name", - dataset=wf0.read_files_from_disk_name.lzout.out, - resample_rate=16000, - ) - ) - - """ - wf0.add(push_dataset_to_hub_pt(name='push_audio_dataset_to_hub_name', - dataset=wf0.resample_hf_dataset_name.lzout.out, - remote_repository="fabiocat/test", - split="train")) - """ - - wf0.add( - transcribe_dataset_with_hf_pt( - name="transcribe_dataset_name", - dataset=wf0.resample_hf_dataset_name.lzout.out, - model_id="openai/whisper-tiny", - language="en", - ) - ) - wf0.combine("x") - - """ - wf0.add(push_dataset_to_hub_pt(name='push_transcript_dataset_to_hub_name', - dataset=wf0.transcribe_dataset_name.lzout.out, - remote_repository="fabiocat/transcript")) - """ - - # TODO: create a dataset object from the combined transcripts - wf0.set_output([("out", wf0.transcribe_dataset_name.lzout.out)]) - - # PYDRA RUN - with pydra.Submitter(plugin="serial") as sub: - sub(wf0) - - _ = wf0.result() - - -data = { - 
"files": [ - "/Users/fabiocat/Documents/git/pp/senselab/src/tests/data_for_testing/audio_48khz_mono_16bits.wav", - "/Users/fabiocat/Documents/git/pp/senselab/src/tests/data_for_testing/audio_48khz_mono_16bits.wav", - "/Users/fabiocat/Documents/git/pp/senselab/src/tests/data_for_testing/audio_48khz_mono_16bits.wav", - "/Users/fabiocat/Documents/git/pp/senselab/src/tests/data_for_testing/audio_48khz_mono_16bits.wav", - "/Users/fabiocat/Documents/git/pp/senselab/src/tests/data_for_testing/audio_48khz_mono_16bits.wav", - "/Users/fabiocat/Documents/git/pp/senselab/src/tests/data_for_testing/audio_48khz_mono_16bits.wav", - "/Users/fabiocat/Documents/git/pp/senselab/src/tests/data_for_testing/audio_48khz_mono_16bits.wav", - "/Users/fabiocat/Documents/git/pp/senselab/src/tests/data_for_testing/audio_48khz_mono_16bits.wav", - "/Users/fabiocat/Documents/git/pp/senselab/src/tests/data_for_testing/audio_48khz_mono_16bits.wav", - ] -} - -workflow(data) -print("\n\n") -pydra_workflow(data) -print("\n\n") -pydra_workflow2(data) diff --git a/scripts/experiment2.py b/scripts/experiment2.py deleted file mode 100644 index 0ab1f7d4..00000000 --- a/scripts/experiment2.py +++ /dev/null @@ -1,46 +0,0 @@ -"""This script is used to test the audio tasks.""" - -from senselab.audio.tasks.features_extraction.opensmile import ( - extract_feats_from_dataset, -) -from senselab.audio.tasks.features_extraction.praat_parselmouth import ( - get_hf_dataset_durations, - get_hf_dataset_f0_descriptors, - get_hf_dataset_harmonicity_descriptors, - get_hf_dataset_jitter_descriptors, - get_hf_dataset_shimmer_descriptors, -) -from senselab.utils.tasks.input_output import read_files_from_disk - -dataset = read_files_from_disk( - [ - "/Users/fabiocat/Documents/git/pp/senselab/src/tests/data_for_testing/audio_48khz_mono_16bits.wav" - ] -) - -print(dataset) - -duration_dataset = get_hf_dataset_durations(dataset) -f0_dataset = get_hf_dataset_f0_descriptors(dataset, f0min=100, f0max=500) -harmonicity_dataset = 
get_hf_dataset_harmonicity_descriptors(dataset, f0min=100) -jitter_dataset = get_hf_dataset_jitter_descriptors( - dataset, f0min=100, f0max=500 -) -shimmer_dataset = get_hf_dataset_shimmer_descriptors( - dataset, f0min=100, f0max=500 -) - -print(duration_dataset) -print(f0_dataset) -print(harmonicity_dataset) -print(jitter_dataset) -print(shimmer_dataset) - -opensmile_feats = extract_feats_from_dataset( - dataset, - audio_column="audio", - feature_set="eGeMAPSv02", - feature_level="Functionals", -) - -print(opensmile_feats) diff --git a/scripts/experiment3.py b/scripts/experiment3.py deleted file mode 100644 index 85da5002..00000000 --- a/scripts/experiment3.py +++ /dev/null @@ -1,9 +0,0 @@ -"""This script is used to test the video tasks.""" - -from senselab.video.input_output import extract_audios_from_local_videos - -files = ["../src/tests/data_for_testing/video_48khz_stereo_16bits.mp4"] -dataset = extract_audios_from_local_videos(files) - -print("dataset") -print(dataset) diff --git a/scripts/experiment4.py b/scripts/experiment4.py deleted file mode 100644 index b14eb339..00000000 --- a/scripts/experiment4.py +++ /dev/null @@ -1,17 +0,0 @@ -"""This script is used to test the voice cloning task.""" - -from senselab.audio.tasks.preprocessing import resample_hf_dataset -from senselab.audio.tasks.voice_cloning import clone_voice_in_dataset_with_KNNVC -from senselab.utils.tasks.input_output import read_files_from_disk - -files = ["../src/tests/data_for_testing/audio_48khz_mono_16bits.wav"] -dataset = read_files_from_disk(files) - -print("Resampling dataset...") -dataset = resample_hf_dataset(dataset, 16000) -print("Resampled dataset.") - -cloned_dataset = clone_voice_in_dataset_with_KNNVC(dataset, dataset) - -print("cloned_dataset") -# print(cloned_dataset) diff --git a/scripts/experiment5.py b/scripts/experiment5.py deleted file mode 100644 index 7e65e8b4..00000000 --- a/scripts/experiment5.py +++ /dev/null @@ -1,43 +0,0 @@ -"""This script is used to test the audio 
tasks.""" - -from typing import Any, Dict - -from torch_audiomentations import Compose, Gain, PolarityInversion - -from senselab.audio.tasks.data_augmentation import augment_hf_dataset -from senselab.utils.decorators import get_response_time -from senselab.utils.tasks.input_output import read_files_from_disk - - -@get_response_time -def workflow(data: Dict[str, Any], augmentation: Compose) -> None: - """This function reads files from disk and transcribes them with Whisper.""" - print("Starting to read files from disk...") - dataset = read_files_from_disk(data["files"]) - print(f"Dataset loaded with {len(dataset)} records.") - - print("Augmenting dataset...") - dataset = augment_hf_dataset(dataset, augmentation) - print("Augmented dataset.") - - -# Initialize augmentation callable -apply_augmentation = Compose( - transforms=[ - Gain( - min_gain_in_db=-15.0, - max_gain_in_db=5.0, - p=0.5, - ), - PolarityInversion(p=0.5), - ] -) - -data = { - "files": [ - "../src/tests/data_for_testing/audio_48khz_mono_16bits.wav", - "../src/tests/data_for_testing/audio_48khz_mono_16bits.wav", - ] -} - -workflow(data, apply_augmentation) diff --git a/scripts/experiment6.py b/scripts/experiment6.py deleted file mode 100644 index fdd36557..00000000 --- a/scripts/experiment6.py +++ /dev/null @@ -1,48 +0,0 @@ -"""This script is used to test the audio tasks.""" - -from typing import Any, Dict - -from senselab.audio.tasks.preprocessing import resample_hf_dataset -from senselab.audio.tasks.speech_to_text import transcribe_dataset_with_hf -from senselab.text.tasks.sentence_transofmers_embeddings_extraction import ( - extract_embeddings_from_hf_dataset, -) -from senselab.utils.decorators import get_response_time -from senselab.utils.tasks.input_output import read_files_from_disk - - -@get_response_time -def workflow(data: Dict[str, Any]) -> None: - """This function reads files from disk and transcribes them with Whisper.""" - print("Starting to read files from disk...") - dataset = 
read_files_from_disk(data["files"]) - print(f"Dataset loaded with {len(dataset)} records.") - - print("Resampling dataset...") - dataset = resample_hf_dataset(dataset, 16000) - print("Resampled dataset.") - - print("Transcribing dataset...") - transcript_dataset = transcribe_dataset_with_hf( - dataset=dataset, model_id="openai/whisper-tiny", language="en" - ) # facebook/wav2vec2-base-960h - print("Transcribed dataset.") - - print("Extracting embeddings...") - _ = extract_embeddings_from_hf_dataset( - transcript_dataset, - model_id="sentence-transformers/paraphrase-MiniLM-L6-v2", - text_column="asr", - ) - print("Extracted embeddings.") - - -data = { - "files": [ - "../src/tests/data_for_testing/audio_48khz_mono_16bits.wav", - "../src/tests/data_for_testing/audio_48khz_mono_16bits.wav", - ] -} - -workflow(data) -print("\n\n") diff --git a/scripts/pyannote_31_experiment.py b/scripts/pyannote_31_experiment.py deleted file mode 100644 index 84905b27..00000000 --- a/scripts/pyannote_31_experiment.py +++ /dev/null @@ -1,35 +0,0 @@ -"""Demonstrates use of pyannote_31.py with the PolyAI/minds14 dataset. 
- -PolyAI/minds14 dataset: -https://huggingface.co/datasets/PolyAI/minds14 -""" - - -import json - -from datasets import load_dataset - -from senselab.audio.tasks.preprocessing import resample_hf_dataset -from senselab.audio.tasks.pyannote_speaker_diarization import ( - pyannote_diarize, -) -from senselab.utils.tasks.input_output import _from_hf_dataset_to_dict - -HF_TOKEN = "YOUR HF_TOKEN" -dataset = load_dataset("PolyAI/minds14", "en-US", split="train") -dataset = dataset.select(range(4)) - -dataset = _from_hf_dataset_to_dict(dataset) - -print("Resampling dataset...") -dataset = resample_hf_dataset(dataset, 16000) -print("Resampled dataset.") - -print("Diarizing dataset...") -dataset_diarized = pyannote_diarize(dataset, - batched=True, - batch_size=2, - model_revision="3.1") -print("Diarized dataset.") - -print(json.dumps(dataset_diarized, indent=4)) diff --git a/src/senselab/audio/tasks/data_augmentation.py b/src/senselab/audio/tasks/data_augmentation.py index fc49c645..58ba61b1 100644 --- a/src/senselab/audio/tasks/data_augmentation.py +++ b/src/senselab/audio/tasks/data_augmentation.py @@ -1,54 +1,65 @@ """This module implements some utilities for audio data augmentation.""" -from typing import Any, Dict +from typing import List, Union import torch -from datasets import Dataset from torch_audiomentations import Compose -from senselab.utils.tasks.input_output import ( - _from_dict_to_hf_dataset, - _from_hf_dataset_to_dict, +from senselab.utils.data_structures.audio import ( + Audio, + batch_audios, + unbatch_audios, ) +from senselab.utils.device import DeviceType, _select_device_and_dtype -def augment_hf_dataset( - dataset: Dict[str, Any], augmentation: Compose, audio_column: str = "audio" -) -> Dict[str, Any]: - """Resamples a Hugging Face `Dataset` object.""" - hf_dataset = _from_dict_to_hf_dataset(dataset) - - def _augment_hf_row( - row: Dataset, augmentation: Compose, audio_column: str - ) -> Dict[str, Any]: - waveform = row[audio_column]["array"] - 
sampling_rate = row[audio_column]["sampling_rate"] - - # Ensure waveform is a PyTorch tensor - if not isinstance(waveform, torch.Tensor): - waveform = torch.tensor(waveform) - if waveform.dim() == 1: - waveform = waveform.unsqueeze(0).unsqueeze( - 0 - ) # [num_samples] -> [1, 1, num_samples] - elif waveform.dim() == 2: - waveform = waveform.unsqueeze( - 1 - ) # [batch_size, num_samples] -> [batch_size, 1, num_samples] - - augmented_hf_row = augmentation( - waveform, sample_rate=sampling_rate - ).squeeze() - - return { - "augmented_audio": { - "array": augmented_hf_row, - "sampling_rate": sampling_rate, - } - } - - augmented_hf_dataset = hf_dataset.map( - lambda x: _augment_hf_row(x, augmentation, audio_column) +def augment_audios( + audios: List[Audio], augmentation: Compose, device_options: Union[DeviceType, List[DeviceType]] = [DeviceType.CPU] +) -> List[Audio]: + """Augments all provided audios with a given augmentation, either individually or all batched together. + + Augment all audios with a user defined augmentation that can be a composition of multiple augmentations. This + augmentation is either performed on each audio individually or all of the audios provided are batched together + and run at once. NOTE: if batching, all audios must have the same sampling rate. + + Args: + audios: List of Audios whose data will be augmented with the given augmentations + augmentation: A Composition of augmentations to run on each audio (uses torch-audiomentations), should have its + output_type set to "dict" + device_options: The device, or a List of possible devices, to use for augmenting. If the chosen device + is MPS or CUDA then the audios are all batched together, so for optimal performance, batching should + be done by passing a batch_size worth of audios ar a time + + Returns: + List of audios that has passed the all of input audios through the provided augmentation. This does + not necessarily mean that the augmentation has been run on every audio. 
For more information, + see the torch-audiomentations documentation. + """ + augmentation.output_type = "dict" + new_audios = [] + device_type, dtype = _select_device_and_dtype( + compatible_devices=device_options if isinstance(device_options, List) else [device_options] ) - augmented_hf_dataset = augmented_hf_dataset.remove_columns([audio_column]) - return _from_hf_dataset_to_dict(augmented_hf_dataset) + if device_type == DeviceType.CPU: + for audio in audios: + audio_to_augment = audio.waveform.unsqueeze(0) + augmented_audio = augmentation(audio_to_augment, sample_rate=audio.sampling_rate).samples + new_audios.append( + Audio( + waveform=torch.squeeze(augmented_audio), + sampling_rate=audio.sampling_rate, + metadata=audio.metadata.copy(), + orig_path_or_id=audio.orig_path_or_id, + ) + ) + else: + batched_audios, sampling_rates, metadatas = batch_audios(audios) + + batched_audios = batched_audios.to(device=torch.device(str(device_type)), dtype=dtype) + sampling_rate = sampling_rates[0] if isinstance(sampling_rates, List) else sampling_rates + augmented_audio = augmentation(batched_audios, sample_rate=sampling_rate).samples + + augmented_audio = augmented_audio.detach().cpu() + return unbatch_audios(augmented_audio, sampling_rates, metadatas) + + return new_audios diff --git a/src/senselab/audio/tasks/features_extraction/torchaudio.py b/src/senselab/audio/tasks/features_extraction/torchaudio.py new file mode 100644 index 00000000..53453ad5 --- /dev/null +++ b/src/senselab/audio/tasks/features_extraction/torchaudio.py @@ -0,0 +1 @@ +"""This module provides the implementation of torchaudio utilities for audio features extraction.""" diff --git a/src/senselab/audio/tasks/preprocessing.py b/src/senselab/audio/tasks/preprocessing.py index d30bbb37..06ff3db4 100644 --- a/src/senselab/audio/tasks/preprocessing.py +++ b/src/senselab/audio/tasks/preprocessing.py @@ -1,49 +1,135 @@ """This module implements some utilities for the preprocessing task.""" -from typing import 
Any, Dict +from typing import List, Tuple -import torch import torchaudio.functional as F -from datasets import Dataset - -from senselab.utils.tasks.input_output import ( - _from_dict_to_hf_dataset, - _from_hf_dataset_to_dict, -) - - -def resample_hf_dataset( - dataset: Dict[str, Any], resample_rate: int, rolloff: float = 0.99 -) -> Dict[str, Any]: - """Resamples a Hugging Face `Dataset` object.""" - hf_dataset = _from_dict_to_hf_dataset(dataset) - - def _resample_hf_row( - row: Dataset, resample_rate: int, rolloff: float = 0.99 - ) -> Dict[str, Any]: - """Resamples audio data in a hf dataset row. - - A lower rolloff will therefore reduce the amount of aliasing, - but it will also reduce some of the higher frequencies. - """ - waveform = row["audio"]["array"] - # Ensure waveform is a PyTorch tensor - if not isinstance(waveform, torch.Tensor): - waveform = torch.tensor(waveform) - sampling_rate = row["audio"]["sampling_rate"] - - resampled_waveform = F.resample( - waveform, sampling_rate, resample_rate, rolloff=rolloff + +from senselab.utils.data_structures.audio import Audio + + +def resample_audios(audios: List[Audio], resample_rate: int, rolloff: float = 0.99) -> List[Audio]: + """Resamples all Audios to a given sampling rate. + + Takes a list of audios and resamples each into the new sampling rate. Notably does not assume any + specific structure of the audios (can vary in stereo vs. mono as well as their original sampling rate) + + Args: + audios: List of Audios to resample + resample_rate: Rate at which to resample the Audio + rolloff: The roll-off frequency of the filter, as a fraction of the Nyquist. 
+ Lower values reduce anti-aliasing, but also reduce some of the highest frequencies + + Returns: + List of Audios that have all been resampled to the given resampling rate + """ + resampled_audios = [] + for audio in audios: + new_metadata = audio.metadata.copy() + new_metadata_pre_proc = new_metadata.setdefault("preprocessing", []) + new_metadata_pre_proc.append(f"resample_{audio.sampling_rate}_to_{resample_rate}") + + resampled = F.resample(audio.waveform, audio.sampling_rate, resample_rate, rolloff=rolloff) + resampled_audios.append( + Audio( + waveform=resampled, + sampling_rate=resample_rate, + metadata=new_metadata, + orig_path_or_id=audio.orig_path_or_id, + ) ) + return resampled_audios + + +def downmix_audios_to_mono(audios: List[Audio]) -> List[Audio]: + """Downmixes a list of Audio objects to mono by averaging all channels. + + Args: + audios (List[Audio]): A list of Audio objects with a tensor representing the audio waveform. + Shape: (num_channels, num_samples). + + Returns: + List[Audio]: The list of audio objects with a mono waveform averaged from all channels. Shape: (num_samples). + """ + down_mixed_audios = [] + for audio in audios: + new_metadata = audio.metadata.copy() + new_metadata_pre_proc = new_metadata.setdefault("preprocessing", []) + new_metadata_pre_proc.append("downmix_mono_averaging") + down_mixed_audios.append( + Audio( + waveform=audio.waveform.mean(dim=0, keepdim=True), + sampling_rate=audio.sampling_rate, + metadata=new_metadata, + orig_path_or_id=audio.orig_path_or_id, + ) + ) + + return down_mixed_audios + - return { - "audio": { - "array": resampled_waveform, - "sampling_rate": resample_rate, - } - } - - resampled_hf_dataset = hf_dataset.map( - lambda x: _resample_hf_row(x, resample_rate, rolloff) - ) - return _from_hf_dataset_to_dict(resampled_hf_dataset) +def select_channel_from_audios(audios: List[Audio], channel_index: int) -> List[Audio]: + """Selects a specific channel from a list of Audio objects. 
+ + Args: + audios (List[Audio]): A list of Audio objects with a tensor representing the audio waveform. + Shape: (num_channels, num_samples). + channel_index (int): The index of the channel to select. + + Returns: + List[Audio]: The list of audio objects with the selected channel. Shape: (1, num_samples). + """ + mono_channel_audios = [] + for audio in audios: + if audio.waveform.size(0) <= channel_index: # should consider how much sense negative values make + raise ValueError("channel_index should be valid") + + new_metadata = audio.metadata.copy() + new_metadata_pre_proc = new_metadata.setdefault("preprocessing", []) + new_metadata_pre_proc.append(f"downmix_mono_select_{channel_index}") + + mono_channel_audios.append( + Audio( + waveform=audio.waveform[channel_index, :], + sampling_rate=audio.sampling_rate, + metadata=new_metadata, + orig_path_or_id=audio.orig_path_or_id, + ) + ) + return mono_channel_audios + + +def chunk_audios(data: List[Tuple[Audio, Tuple[float, float]]]) -> List[Audio]: + """Chunks the input audios based on the start and end timestamp. + + Args: + data: List of tuples containing an Audio object and a tuple with start and end (in seconds) for chunking. 
+ + Returns: + List of Audios that have been chunked based on the provided timestamps + """ + chunked_audios = [] + + for audio, timestamps in data: + start, end = timestamps + if start < 0: + raise ValueError("Start time must be greater than or equal to 0.") + duration = audio.waveform.shape[1] / audio.sampling_rate + if end > duration: + raise ValueError(f"End time must be less than the duration of the audio file ({duration} seconds).") + start_sample = int(start * audio.sampling_rate) + end_sample = int(end * audio.sampling_rate) + chunked_waveform = audio.waveform[:, start_sample:end_sample] + + new_metadata = audio.metadata.copy() + new_metadata_pre_proc = new_metadata.setdefault("preprocessing", []) + new_metadata_pre_proc.append(f"chunk_{start}_{end}") + + chunked_audios.append( + Audio( + waveform=chunked_waveform, + sampling_rate=audio.sampling_rate, + metadata=new_metadata, + orig_path_or_id=audio.orig_path_or_id, + ) + ) + return chunked_audios diff --git a/src/senselab/audio/tasks/preprocessing_pydra.py b/src/senselab/audio/tasks/preprocessing_pydra.py index f3b122a7..c7c2b335 100644 --- a/src/senselab/audio/tasks/preprocessing_pydra.py +++ b/src/senselab/audio/tasks/preprocessing_pydra.py @@ -1,6 +1,15 @@ """This module defines a pydra API for the preprocessing task.""" + import pydra -from senselab.audio.tasks.preprocessing import resample_hf_dataset +from senselab.audio.tasks.preprocessing import ( + chunk_audios, + downmix_audios_to_mono, + resample_audios, + select_channel_from_audios, +) -resample_hf_dataset_pt = pydra.mark.task(resample_hf_dataset) +resample_audios_pt = pydra.mark.task(resample_audios) +downmix_audios_to_mono_pt = pydra.mark.task(downmix_audios_to_mono) +chunk_audios_pt = pydra.mark.task(chunk_audios) +select_channel_from_audios_pt = pydra.mark.task(select_channel_from_audios) diff --git a/src/senselab/audio/tasks/speech_to_text.py b/src/senselab/audio/tasks/speech_to_text.py index e49c9470..6387209a 100644 --- 
a/src/senselab/audio/tasks/speech_to_text.py +++ b/src/senselab/audio/tasks/speech_to_text.py @@ -6,7 +6,7 @@ from datasets import Dataset from transformers import pipeline -from senselab.utils.functions import DeviceType, _select_device_and_dtype +from senselab.utils.device import DeviceType, _select_device_and_dtype from senselab.utils.hf import HFModel from senselab.utils.tasks.input_output import ( _from_dict_to_hf_dataset, @@ -42,16 +42,9 @@ def _prepare_hf_asr_pipeline( ) -> pipeline: """Prepare a Hugging Face ASR pipeline.""" _ = HFModel(hf_model_id=model_id) # check HF model is valid - - if device is None: - device, torch_dtype = _select_device_and_dtype( - device_options=[DeviceType.CUDA, DeviceType.CPU] - ) - # MPS is not supported for now - else: - device, torch_dtype = _select_device_and_dtype( - device_options=[device] - ) + device, torch_dtype = _select_device_and_dtype( + user_preference=device, compatible_devices=[DeviceType.CUDA, DeviceType.CPU] + ) pipe = pipeline( "automatic-speech-recognition", diff --git a/src/senselab/audio/tasks/speech_to_text_evaluation.py b/src/senselab/audio/tasks/speech_to_text_evaluation.py new file mode 100644 index 00000000..b950dcb9 --- /dev/null +++ b/src/senselab/audio/tasks/speech_to_text_evaluation.py @@ -0,0 +1,88 @@ +"""This module implements some utilities for evaluating a transcription.""" + +import jiwer + + +def calculate_wer(reference: str, hypothesis: str) -> float: + """Calculate the Word Error Rate (WER) between the reference and hypothesis. + + Args: + reference (str): The ground truth text. + hypothesis (str): The predicted text. + + Returns: + float: The WER score. + + Examples: + >>> calculate_wer("hello world", "hello duck") + 0.5 + """ + return jiwer.wer(reference, hypothesis) + + +def calculate_mer(reference: str, hypothesis: str) -> float: + """Calculate the Match Error Rate (MER) between the reference and hypothesis. + + Args: + reference (str): The ground truth text. 
+ hypothesis (str): The predicted text. + + Returns: + float: The MER score. + + Examples: + >>> calculate_mer("hello world", "hello duck") + 0.5 + """ + return jiwer.mer(reference, hypothesis) + + +def calculate_wil(reference: str, hypothesis: str) -> float: + """Calculate the Word Information Lost (WIL) between the reference and hypothesis. + + Args: + reference (str): The ground truth text. + hypothesis (str): The predicted text. + + Returns: + float: The WIL score. + + Examples: + >>> calculate_wil("hello world", "hello duck") + 0.75 + """ + return jiwer.wil(reference, hypothesis) + + +def calculate_wip(reference: str, hypothesis: str) -> float: + """Calculate the Word Information Preserved (WIP) between the reference and hypothesis. + + Args: + reference (str): The ground truth text. + hypothesis (str): The predicted text. + + Returns: + float: The WIP score. + + Examples: + >>> calculate_wip("hello world", "hello duck") + 0.25 + """ + return jiwer.wip(reference, hypothesis) + + +def calculate_cer(reference: str, hypothesis: str) -> float: + """Calculate the Character Error Rate (CER) between the reference and hypothesis. + + Args: + reference (str): The ground truth text. + hypothesis (str): The predicted text. + + Returns: + float: The CER score. 
+ + Examples: + >>> calculate_cer("hello world", "hello duck") + 0.45454545454545453 + """ + return jiwer.cer(reference, hypothesis) diff --git a/src/senselab/audio/tasks/speech_to_text_evaluation_pydra.py b/src/senselab/audio/tasks/speech_to_text_evaluation_pydra.py new file mode 100644 index 00000000..5731500c --- /dev/null +++ b/src/senselab/audio/tasks/speech_to_text_evaluation_pydra.py @@ -0,0 +1,17 @@ +"""This module defines a pydra API for the speech to text evaluation task.""" + +import pydra + +from senselab.audio.tasks.speech_to_text_evaluation import ( + calculate_cer, + calculate_mer, + calculate_wer, + calculate_wil, + calculate_wip, +) + +calculate_wer_pt = pydra.mark.task(calculate_wer) +calculate_mer_pt = pydra.mark.task(calculate_mer) +calculate_wil_pt = pydra.mark.task(calculate_wil) +calculate_wip_pt = pydra.mark.task(calculate_wip) +calculate_cer_pt = pydra.mark.task(calculate_cer) diff --git a/src/senselab/audio/tasks/voice_cloning.py b/src/senselab/audio/tasks/voice_cloning.py index 75fa7948..5e033787 100644 --- a/src/senselab/audio/tasks/voice_cloning.py +++ b/src/senselab/audio/tasks/voice_cloning.py @@ -1,11 +1,11 @@ """This module implements some utilities for the voice cloning task.""" -from typing import Any, Dict, Optional +from typing import Any, Dict, Optional, Tuple import torch from datasets import Dataset -from senselab.utils.functions import DeviceType, _select_device_and_dtype +from senselab.utils.device import DeviceType, _select_device_and_dtype from senselab.utils.tasks.input_output import ( _from_dict_to_hf_dataset, _from_hf_dataset_to_dict, @@ -30,13 +30,11 @@ def _setup_knn_vc_model( model_revision: str, prematched_vocoder: bool, device: Optional[DeviceType] = None, - ) -> Any: # noqa: ANN401 + ) -> Tuple[object, DeviceType, torch.dtype]: """Prepare a KNNVC pipeline.""" repo_id = f"{model_id}:{model_revision}" device, torch_dtype = _select_device_and_dtype( - device_options=[device] - if device - else [DeviceType.CUDA, 
DeviceType.CPU] + user_preference=device, compatible_devices=[DeviceType.CUDA, DeviceType.CPU] ) knn_vc = torch.hub.load( repo_id, @@ -75,12 +73,8 @@ def _get_waveform(dataset: Dataset, column: str) -> torch.Tensor: out_wav = knn_vc_model.match(query_seq, matching_set, topk=topk) return {"cloned_waveform": out_wav} - hf_source_dataset = _from_dict_to_hf_dataset( - source_dataset, audio_columns=[source_audio_column] - ) - hf_target_dataset = _from_dict_to_hf_dataset( - target_dataset, audio_columns=[target_audio_column] - ) + hf_source_dataset = _from_dict_to_hf_dataset(source_dataset, audio_columns=[source_audio_column]) + hf_target_dataset = _from_dict_to_hf_dataset(target_dataset, audio_columns=[target_audio_column]) knn_vc, device, torch_dtype = _setup_knn_vc_model( model_id=model_id, diff --git a/src/senselab/utils/constants.py b/src/senselab/utils/constants.py new file mode 100644 index 00000000..cf3d616e --- /dev/null +++ b/src/senselab/utils/constants.py @@ -0,0 +1,5 @@ +"""Constants used by Senselab.""" + +import uuid + +SENSELAB_NAMESPACE = uuid.uuid3(uuid.NAMESPACE_URL, "https://github.com/sensein/senselab") diff --git a/src/senselab/utils/data_structures/audio.py b/src/senselab/utils/data_structures/audio.py new file mode 100644 index 00000000..611319e8 --- /dev/null +++ b/src/senselab/utils/data_structures/audio.py @@ -0,0 +1,158 @@ +"""Data structures relevant for audio tasks and pipelines. + +Contains data structures that are useful for audio tasks and pipelines that this package defines. +The most basic unit is an Audio object which represents the necessary information of a loaded audio +file and its corresponding metadata. Other functionality and abstract data types are provided for +ease of maintaining the codebase and offering consistent public APIs. 
+""" + +import uuid +from typing import Dict, List, Optional, Tuple, Union + +import numpy as np +import torch +import torchaudio +from pydantic import BaseModel, Field, ValidationInfo, field_validator + +from senselab.utils.constants import SENSELAB_NAMESPACE + + +class Audio(BaseModel): + """Pydantic model for audio and its corresponding metadata. + + Pydantic model for audio that holds the necessary attributes, the actual decoded audio data + and the sampling rate, to work with audio in python. Contains metadata information as needed + and has a unique identifier for every audio. + + Attributes: + waveform: The actual audio data read from an audio file, stored as a torch.Tensor + of shape (num_channels, num_samples) + sampling_rate: The sampling rate of the audio file + orig_path_or_id: Optional str for the original path or an ID to track file over time + metadata: Optional metadata dictionary of information associated with this Audio instance + (e.g. participant demographics, audio settings, location information) + """ + + waveform: torch.Tensor + sampling_rate: int + orig_path_or_id: Optional[str] = Field(default_factory=lambda: str(uuid.uuid4())) + metadata: Dict = Field(default={}) + model_config = {"arbitrary_types_allowed": True} + + @field_validator("waveform", mode="before") + def convert_to_tensor( + cls, v: Union[List[float], List[List[float]], np.ndarray, torch.Tensor], info: ValidationInfo + ) -> torch.Tensor: + """Converts the audio data to torch.Tensor of shape (num_channels, num_samples).""" + temporary_tensor = None + if isinstance(v, list): + temporary_tensor = torch.tensor(v) + elif isinstance(v, np.ndarray): + temporary_tensor = torch.tensor(v) + elif isinstance(v, torch.Tensor): + temporary_tensor = v + else: + raise ValueError("Unsupported data type") + + if len(temporary_tensor.shape) == 1: + # make the audio data [channels=1, samples] + temporary_tensor = temporary_tensor.unsqueeze(0) + return temporary_tensor + + @classmethod + def 
from_filepath(cls, filepath: str, metadata: Dict = {}) -> "Audio": + """Creates an Audio instance from an audio file. + + Args: + filepath: Filepath of the audio file to read from + metadata: Additional information associated with the audio file + """ + array, sampling_rate = torchaudio.load(filepath) + + return cls(waveform=array, sampling_rate=sampling_rate, orig_path_or_id=filepath, metadata=metadata) + + def id(self) -> str: + """Generate a unique identifier for the Audio. + + Generate a unique identifier for the Audio where equivalent waveforms and sampling + rates generate the same IDs. + + Returns: String UUID of the Audio generated by an MD5 hash of the waveform and the sampling_rate + """ + return str(uuid.uuid3(uuid.uuid3(SENSELAB_NAMESPACE, str(self.waveform)), str(self.sampling_rate))) + + def __eq__(self, other: object) -> bool: + """Overloads the default BaseModel equality to correctly check equivalence, ignoring metadata.""" + if isinstance(other, Audio): + return self.id() == other.id() + return False + + +def batch_audios(audios: List[Audio]) -> Tuple[torch.Tensor, Union[int, List[int]], List[Dict]]: + """Batches the Audios together into a single Tensor, keeping individual Audio information separate. + + Batch all of the Audios into a single Tensor of shape (len(audios), num_channels, num_samples). + Keeps the Audio information related to each sampling rate and metadata separate for each Audio to + allow for unbatching after running relevant functionality. + + Args: + audios: List of audios to batch together. NOTE: Should all have the same number of channels + and is generally advised to have the same sampling rates if running functionality + that relies on the sampling rate. + + Returns: + Returns a tuple of a Tensor that will have the shape (len(audios), num_channels, num_samples), + the sampling rate (an integer if all have the same sampling rate), and a list of each individual + audio's metadata information. 
+ + Raises: + RuntimeError: if all of the Audios do not have the same number of channels + """ + sampling_rates = [] + batched_audio = [] + metadatas = [] + for audio in audios: + sampling_rates.append(audio.sampling_rate) + batched_audio.append(audio.waveform) + metadatas.append(audio.metadata) + + return_sampling_rates: List[int] | int = int(sampling_rates[0]) if len(set(sampling_rates)) == 1 else sampling_rates + + return torch.stack(batched_audio), return_sampling_rates, metadatas + + +def unbatch_audios(batched_audio: torch.Tensor, sampling_rates: int | List[int], metadatas: List[Dict]) -> List[Audio]: + """Unbatches Audios into a List of Audio objects. + + Uses the batched Audios, their respective sampling rates, and their corresponding metadatas to create + a list of Audios. + + Args: + batched_audio: torch.Tensor of shape (batch_size, num_channels, num_samples) to unstack + sampling_rates: The sampling rate of each batched audio if they differ or a single sampling rate for all of them + metadatas: The respective metadata for each of the batched audios + + Returns: + List of Audio objects representing each of the Audios that were previously batched together + + Raises: + ValueError if the batched_audio is not in the correct shape or if the number of batched_audios does not + match the amount of metadata and sampling rates (if they were provided as a List) that were provided. 
+ """ + if len(batched_audio.shape) != 3: + raise ValueError("Expected batched audios to be of shape (batch_size, num_channels, samples)") + elif batched_audio.shape[0] != len(metadatas) or ( + isinstance(sampling_rates, List) and batched_audio.shape[0] != len(sampling_rates) + ): + raise ValueError( + "Expected sizes of batched_audio, sampling_rates (if provided as a litst) \ + and metadata to be equal" + ) + + audios = [] + for i in range(len(metadatas)): + sampling_rate = sampling_rates[i] if isinstance(sampling_rates, List) else sampling_rates + metadata = metadatas[i] + audio = batched_audio[i] + audios.append(Audio(waveform=audio, sampling_rate=sampling_rate, metadata=metadata)) + return audios diff --git a/src/senselab/utils/data_structures/dataset.py b/src/senselab/utils/data_structures/dataset.py new file mode 100644 index 00000000..51a4a595 --- /dev/null +++ b/src/senselab/utils/data_structures/dataset.py @@ -0,0 +1,217 @@ +"""Data structures relevant for managing datasets.""" + +import math +import uuid +from typing import Any, Dict, List, Union, no_type_check + +from pydantic import BaseModel, Field, ValidationInfo, field_validator + +from senselab.utils.data_structures.audio import Audio +from senselab.utils.data_structures.video import Video + + +class Participant(BaseModel): + """Data structure for a participant in a dataset.""" + + id: str = Field(default_factory=lambda: str(uuid.uuid4())) + metadata: Dict = Field(default={}) + + @field_validator("id", mode="before") + def set_id(cls, v: str) -> str: + """Set the unique id of the participant.""" + return v or str(uuid.uuid4()) + + def __eq__(self, other: object) -> bool: + """Overloads the default BaseModel equality to correctly check that ids are equivalent.""" + if isinstance(other, Participant): + return self.id == other.id + return False + + +class Session(BaseModel): + """Data structure for a session in a dataset.""" + + id: str = Field(default_factory=lambda: str(uuid.uuid4())) + metadata: 
Dict = Field(default={}) + + @field_validator("id", mode="before") + def set_id(cls, v: str) -> str: + """Set the unique id of the session.""" + return v or str(uuid.uuid4()) + + def __eq__(self, other: object) -> bool: + """Overloads the default BaseModel equality to correctly check that ids are equivalent.""" + if isinstance(other, Session): + return self.id == other.id + return False + + +class SenselabDataset(BaseModel): + """Class for maintaining SenseLab datasets and functionalities. + + Maintains collections of Audios, Videos, and metadata for use of the Senselab tools + and pipelines. Includes the ability to manage Sessions and Participants. + + Attributes: + audios: List of Audios that are generated based on list of audio filepaths + videos: List of Videos generated from a list of video filepaths + metadata: Metadata related to the dataset overall but not necessarily the metadata of + indivudal audios in the dataset + sessions: Session ID mapping to Session instance + participants: Mapping of participant ID to a Participant instance + """ + + participants: Dict[str, Participant] = Field(default_factory=dict) + sessions: Dict[str, Session] = Field(default_factory=dict) + audios: List[Audio] = [] + videos: List[Video] = [] + metadata: Dict = Field(default={}) + + @field_validator("audios", mode="before") + @classmethod + def generate_audios_from_filepaths(cls, v: Union[List[str], List[Audio]], _: ValidationInfo) -> List[Audio]: + """Generate the audios in the dataset from a list of audio filepaths. 
+ + Generates the audios in the dataset by taking in a list of audio filepaths + or a list of Audios + + Args: + v: Input for audios attribute that we're validating by generating the Audios if filepaths + are provided or just the list of Audios if pre-generated and passed in + + Returns: + List of Audios that instantiates the audios attribute in the dataset + """ + audio_list = [] + if len(v) == 0: + return [] + else: + for audio in v: + if isinstance(audio, Audio): + audio_list.append(audio) + else: + audio_list.append(Audio.from_filepath(audio)) + return audio_list + + @field_validator("videos", mode="before") + @classmethod + def generate_videos_from_filepaths(cls, v: Union[List[str], List[Video]], _: ValidationInfo) -> List[Video]: + """Generate the videos in the dataset from a list of video filepaths. + + Generates the videos in the dataset by taking in a list of video filepaths + or a list of Videos + + Args: + v: Input for videos attribute that we're validating by generating the Videos if filepaths + are provided or just the list of Videos if pre-generated and passed in + + Returns: + List of Videos that instantiates the videos attribute in the dataset + """ + video_list = [] + if len(v) == 0: + return [] + else: + for video in v: + if isinstance(video, Video): + video_list.append(video) + elif isinstance(video, str): + video_list.append(Video.from_filepath(video)) + + else: + raise ValueError("Unsupported video list") + return video_list + + @classmethod + @no_type_check + def create_bids_dataset(cls, bids_root_filepath: str) -> "SenselabDataset": + """Create a dataset from a BIDS organized directory. + + Creates a new dataset based off of a BIDS directory structure as defined at + https://sensein.group/biometrics-book/updated_bids.html + """ + pass + + def create_audio_split_for_pydra_task(self, batch_size: int = 1) -> List[List[Audio]]: + """Splits the audio data for Pydra tasks. 
+ + Creates a split of the audio data that can be used for creating individual Pydra tasks using + the .split functionality. Splits the data such that the inputs for a Pydra workflow are either + optimized for the GPU's batch size or a single Audio per CPU thread. + + Args: + batch_size: How to batch Audios for a Pydra task; defaults to 1 since CPU won't batch + + Returns: + List of Lists of Audio where each List of Audios will be an input to a Pydra task. + Each of the sublists are either of size 1 for CPUs or at most batch_size for GPU optimization. + + Raises: + ValueError if the batch size is invalid (less than 1) + """ + if batch_size > 1: + # Creates batches of at most size batch_size except the last which contains the remainder of audios + return [ + self.audios[batch_size * i : min(batch_size * (i + 1), len(self.audios))] + for i in range(math.ceil(len(self.audios) / batch_size)) + ] + elif batch_size < 1: + raise ValueError("Batch size must be greater than or equal to 1") + else: + return [[audio] for audio in self.audios] + + def audio_merge_from_pydra_task(self, audios_to_merge: List[List[Audio]]) -> None: + """Write later. 
+ + Logic Pydra: + audios: List of audios that want to give to task + split: List[List[Audios]] -> task List[Audio] + pydra task(List[Audio]) -> List[Audio] + merge(List[List[Audio]]) <- might be a wrapped instead of List of lists + TODO: Figure out what a merge behavior looks like from Pydra + """ + self.audios = [] + for audio_task_input in audios_to_merge: + for audio_output in audio_task_input: + self.audios.append(audio_output) + + @field_validator("participants", mode="before") + def check_unique_participant_id(cls, v: Dict[str, Participant], values: Any) -> Dict[str, Participant]: # noqa: ANN401 + """Check if participant IDs are unique.""" + print("type(values)") + print(type(values)) + input("Press Enter to continue...") + participants = values.get("participants", {}) + for participant_id, _ in v.items(): + if participant_id in participants: + raise ValueError(f"Participant with ID {participant_id} already exists.") + return v + + @field_validator("sessions", mode="before") + def check_unique_session_id(cls, v: Dict[str, Session], values: Any) -> Dict[str, Session]: # noqa: ANN401 + """Check if session IDs are unique.""" + sessions = values.get("sessions", {}) + for session_id, _ in v.items(): + if session_id in sessions: + raise ValueError(f"Session with ID {session_id} already exists.") + return v + + def add_participant(self, participant: Participant) -> None: + """Add a participant to the dataset.""" + if participant.id in self.participants: + raise ValueError(f"Participant with ID {participant.id} already exists.") + self.participants[participant.id] = participant + + def add_session(self, session: Session) -> None: + """Add a session to the dataset.""" + if session.id in self.sessions: + raise ValueError(f"Session with ID {session.id} already exists.") + self.sessions[session.id] = session + + def get_participants(self) -> List[Participant]: + """Get the list of participants in the dataset.""" + return list(self.participants.values()) + + def 
get_sessions(self) -> List[Session]: + """Get the list of sessions in the dataset.""" + return list(self.sessions.values()) diff --git a/src/senselab/utils/data_structures/video.py b/src/senselab/utils/data_structures/video.py new file mode 100644 index 00000000..f9a6d75e --- /dev/null +++ b/src/senselab/utils/data_structures/video.py @@ -0,0 +1,79 @@ +"""Data structures relevant for video tasks and pipelines.""" + +import uuid +from typing import Dict, Optional + +import torch +from pydantic import BaseModel, Field, ValidationInfo, field_validator +from torchvision.io import read_video + +from senselab.utils.constants import SENSELAB_NAMESPACE +from senselab.utils.data_structures.audio import Audio + + +class Video(BaseModel): + """Pydantic model for video and its corresponding metadata. + + Pydantic model for video that holds the necessary attributes, the actual decoded video data + and the frame rate, to work with videos in python. Contains metadata information as needed + and has a unique identifier for every video. + + Attributes: + frames: Represent the video as a Tensor of all of its frames, each of which is an image + that we represent through a Tensor of (C, H, W) + frame_rate: Also known as the frames per second (fps), defines the time component + of a video (often an integer but some use cases of float approximations) + audio: the audio associated with the Video (optional) + orig_path_or_id: Optional str for the original path or an ID to track file over time + metadata: Optional metadata dictionary of information associated with this Video instance + (e.g. 
participant demographics, video settings, location information) + """ + + frames: torch.Tensor + frame_rate: float + audio: Optional[Audio] + orig_path_or_id: Optional[str] = Field(default_factory=lambda: str(uuid.uuid4())) + metadata: Dict = Field(default={}) + model_config = {"arbitrary_types_allowed": True} + + @field_validator("frames", mode="before") + def check_frames(cls, v: torch.Tensor, _: ValidationInfo) -> torch.Tensor: + """Check that the frames are the correct Tensor shape of (T,C,H,W).""" + if len(v.shape) != 4: + raise ValueError( + "Expected frames to be of shape (T, C, H, W) where T is the number of frames, \ + C is the channels, and H and W are the height and width" + ) + return v + + @classmethod + def from_filepath(cls, filepath: str, metadata: Dict = {}) -> "Video": + """Creates a Video instance from a video file. + + Args: + filepath: Filepath of the video file to read from + metadata: Additional information associated with the video file + """ + v_frames, a_frames, v_metadata = read_video(filename=filepath, output_format="TCHW") + v_fps = v_metadata["video_fps"] + a_fps = v_metadata["audio_fps"] + v_audio = Audio(waveform=a_frames, sampling_rate=a_fps, orig_path_or_id=filepath) + + return cls(frames=v_frames, frame_rate=v_fps, audio=v_audio, orig_path_or_id=filepath, metadata=metadata) + + def id(self) -> str: + """Generate a unique identifier for the Video. + + Generate a unique identifier for the Video where equivalent video frames and frame rate + and audio generate the same IDs. 
+ + Returns: String UUID of the Video generated by an MD5 hash of the frames and the frame rate and audio + """ + temp_hash = uuid.uuid3(uuid.uuid3(SENSELAB_NAMESPACE, str(self.frames)), str(self.frame_rate)) + return str(temp_hash) if not self.audio else str(uuid.uuid3(temp_hash, self.audio.id())) + + def __eq__(self, other: object) -> bool: + """Overloads the default BaseModel equality to correctly check equivalence, ignoring metadata.""" + if isinstance(other, Audio): + return self.id() == other.id() + return False diff --git a/src/senselab/utils/device.py b/src/senselab/utils/device.py new file mode 100644 index 00000000..2143d933 --- /dev/null +++ b/src/senselab/utils/device.py @@ -0,0 +1,77 @@ +"""Utility functions for utilizing different devices in Senselab.""" + +from enum import Enum +from typing import Optional + +import torch + + +class DeviceType(Enum): + """Device types for PyTorch operations.""" + + CPU: str = "cpu" + CUDA: str = "cuda" + MPS: str = "mps" + + +DTYPE_MAP = {DeviceType.CPU: torch.float32, DeviceType.CUDA: torch.float16, DeviceType.MPS: torch.float32} + + +def _select_device_and_dtype( + user_preference: Optional[DeviceType] = None, + compatible_devices: list[DeviceType] = [ + DeviceType.CPU, + DeviceType.CUDA, + DeviceType.MPS, + ], +) -> tuple[DeviceType, torch.dtype]: + """Determines the device and data type for PyTorch operations. + + Allows users to give preferences for DeviceType, but determines based + on compatible and available devices. Chooses the fastest option if no + user preference is given. 
+ + Args: + user_preference: Optional DeviceType that the user wants to use + compatible_devices: DeviceTypes that work with the functionality of the method calling this + Returns: + Tuple of (DeviceType, torch.dtype) where the device is both available and compatible and the + dtype is the best performing dtype for that DeviceType + Raises: + ValueError: if the user specifies a preference that is not available or compatible and a safety + call if no devices are available or compatible (we believe this to be impossible to trigger). + """ + available_devices = [DeviceType.CPU] + if torch.cuda.is_available(): + available_devices.append(DeviceType.CUDA) + + if torch.backends.mps.is_available(): + available_devices.append(DeviceType.MPS) + + # Check compatible and available + useable_devices = [] + for device in available_devices: + if device in compatible_devices: + useable_devices.append(device) + + # User preference or fastest option + if user_preference: + if user_preference not in useable_devices: + raise ValueError( + "Requested user DeviceType is either not available or\ + compatible with this functionality." + ) + else: + return user_preference, DTYPE_MAP[user_preference] + else: + if DeviceType.CUDA in useable_devices: + return DeviceType.CUDA, DTYPE_MAP[DeviceType.CUDA] + elif DeviceType.MPS in useable_devices: + return DeviceType.MPS, DTYPE_MAP[DeviceType.MPS] + elif DeviceType.CPU in useable_devices: + return DeviceType.CPU, DTYPE_MAP[DeviceType.CPU] + else: + raise ValueError( + "Something went really wrong and no devices were available or \ + compatible." 
+ ) diff --git a/src/senselab/utils/functions.py b/src/senselab/utils/functions.py index 65798534..4267e107 100644 --- a/src/senselab/utils/functions.py +++ b/src/senselab/utils/functions.py @@ -1,41 +1,8 @@ """Utility functions for senselab.""" import os -from enum import Enum from typing import List -import torch - - -class DeviceType(Enum): - """Device types for PyTorch operations.""" - - CPU: str = "cpu" - CUDA: str = "cuda" - MPS: str = "mps" - - -def _select_device_and_dtype( - device_options: list[DeviceType] = [ - DeviceType.CPU, - DeviceType.CUDA, - DeviceType.MPS, - ], -) -> tuple[DeviceType, torch.dtype]: - """Determines the device and data type for PyTorch operations.""" - if torch.cuda.is_available() and DeviceType.CUDA in device_options: - device = DeviceType.CUDA - torch_dtype = torch.float16 # Using half precision for CUDA - elif torch.backends.mps.is_available() and DeviceType.MPS in device_options: - device = DeviceType.MPS - torch_dtype = torch.float32 - # Default to float32 on MPS for better precision - else: - device = DeviceType.CPU - torch_dtype = torch.float32 - # Default to float32 on CPU for better precision - return device, torch_dtype - def get_common_directory(files: List[str]) -> str: """A function to get the common directory from a list of file paths. diff --git a/src/senselab/utils/hf.py b/src/senselab/utils/hf.py index 468e0a02..636d7a88 100644 --- a/src/senselab/utils/hf.py +++ b/src/senselab/utils/hf.py @@ -16,27 +16,27 @@ class HFModel(BaseModel): @field_validator("hf_model_id") def validate_hf_model_id(cls, value: str) -> str: - """Validate the hf_model_id.""" + """Validate the hf_model_id. 
+ + # TODO: enabling using HF token + """ if not value: raise ValueError("hf_model_id cannot be empty") if not os.path.isfile(value) and not _check_hf_repo_exists( - value, "model", None + repo_id=value, revision="main", repo_type="model", token=None ): raise ValueError("hf_model_id is not a valid Hugging Face model") return value def _check_hf_repo_exists( - repo_id: str, repo_type: str, token: Optional[str] = None + repo_id: str, revision: str = "main", repo_type: str = "model", token: Optional[str] = None ) -> bool: """Private function to check if a Hugging Face repository exists.""" api = HfApi() try: - repo_refs = api.list_repo_refs( - repo_id=repo_id, repo_type=repo_type, token=token - ) - if repo_refs.branches: - return True - except Exception as e: - raise RuntimeError(f"An error occurred: {e}") - return False + api.list_repo_commits(repo_id=repo_id, revision=revision, repo_type=repo_type, token=token) + return True + except Exception: + # raise RuntimeError(f"An error occurred: {e}") + return False diff --git a/src/senselab/utils/tasks/cca_cka.py b/src/senselab/utils/tasks/cca_cka.py new file mode 100644 index 00000000..8e83b317 --- /dev/null +++ b/src/senselab/utils/tasks/cca_cka.py @@ -0,0 +1,133 @@ +"""This module is for computing CCA and CKA.""" + +from enum import Enum + +import torch + + +def compute_cca(features_x: torch.Tensor, features_y: torch.Tensor) -> float: + """Compute the mean squared CCA correlation (R^2_{CCA}). + + Args: + features_x (torch.Tensor): A num_examples x num_features matrix of features. + features_y (torch.Tensor): A num_examples x num_features matrix of features. + + Returns: + float: The mean squared CCA correlations between X and Y. 
+ """ + qx, _ = torch.linalg.qr(features_x) + qy, _ = torch.linalg.qr(features_y) + result = torch.norm(qx.t() @ qy) ** 2 / min(features_x.shape[1], features_y.shape[1]) + return result.item() if isinstance(result, torch.Tensor) else float(result) + + +class CKAKernelType(Enum): + """CKA kernel types.""" + + LINEAR = "linear" + RBF = "rbf" + + +def compute_cka( + features_x: torch.Tensor, + features_y: torch.Tensor, + kernel: CKAKernelType = CKAKernelType.LINEAR, + threshold: float = 1.0, +) -> float: + """Compute CKA between feature matrices. + + Args: + features_x (torch.Tensor): A num_examples x num_features matrix of features. + features_y (torch.Tensor): A num_examples x num_features matrix of features. + kernel (CKAKernelType): Type of kernel to use (CKAKernelType.LINEAR or CKAKernelType.RBF). + Default is CKAKernelType.LINEAR. + threshold (float): Fraction of median Euclidean distance to use as RBF kernel bandwidth + (used only if kernel is CKAKernelType.RBF). + + Returns: + float: The value of CKA between X and Y. + """ + + def _gram_linear(x: torch.Tensor) -> torch.Tensor: + """Compute Gram (kernel) matrix for a linear kernel. + + Args: + x (torch.Tensor): A num_examples x num_features matrix of features. + + Returns: + torch.Tensor: A num_examples x num_examples Gram matrix of examples. + """ + return x @ x.t() + + def _gram_rbf(x: torch.Tensor, threshold: float = 1.0) -> torch.Tensor: + """Compute Gram (kernel) matrix for an RBF kernel. + + Args: + x (torch.Tensor): A num_examples x num_features matrix of features. + threshold (float): Fraction of median Euclidean distance to use as RBF kernel bandwidth. + + Returns: + torch.Tensor: A num_examples x num_examples Gram matrix of examples. 
+ """ + dot_products = x @ x.t() + sq_norms = torch.diag(dot_products) + sq_distances = -2 * dot_products + sq_norms[:, None] + sq_norms[None, :] + sq_median_distance = torch.median(sq_distances) + return torch.exp(-sq_distances / (2 * threshold**2 * sq_median_distance)) + + def _center_gram(gram: torch.Tensor) -> torch.Tensor: + """Center a symmetric Gram matrix. + + This is equivalent to centering the (possibly infinite-dimensional) features + induced by the kernel before computing the Gram matrix. + + Args: + gram (torch.Tensor): A num_examples x num_examples symmetric matrix. + + Returns: + torch.Tensor: A symmetric matrix with centered columns and rows. + + Raises: + ValueError: If the input is not a symmetric matrix. + """ + if not torch.allclose(gram, gram.t()): + raise ValueError("Input must be a symmetric matrix.") + + n = gram.size(0) + unit = torch.ones(n, n, device=gram.device) + eye = torch.eye(n, device=gram.device) + unit = unit / n + haitch = eye - unit + centered_gram = haitch.mm(gram).mm(haitch) + return centered_gram + + def _cka(gram_x: torch.Tensor, gram_y: torch.Tensor) -> torch.Tensor: + """Compute CKA. + + Args: + gram_x (torch.Tensor): A num_examples x num_examples Gram matrix. + gram_y (torch.Tensor): A num_examples x num_examples Gram matrix. + + Returns: + float: The value of CKA between X and Y. + """ + gram_x = _center_gram(gram_x) + gram_y = _center_gram(gram_y) + + scaled_hsic = torch.sum(gram_x * gram_y) + + normalization_x = torch.norm(gram_x) + normalization_y = torch.norm(gram_y) + return scaled_hsic / (normalization_x * normalization_y) + + if kernel == CKAKernelType.LINEAR: + gram_x = _gram_linear(features_x) + gram_y = _gram_linear(features_y) + elif kernel == CKAKernelType.RBF: + gram_x = _gram_rbf(features_x, threshold) + gram_y = _gram_rbf(features_y, threshold) + else: + raise ValueError("Unsupported kernel type. 
Use CKAKernelType.LINEAR or CKAKernelType.RBF.")
+
+ result = _cka(gram_x, gram_y)
+ return result.item() if isinstance(result, torch.Tensor) else float(result)
diff --git a/src/senselab/utils/tasks/cca_cka_pydra.py b/src/senselab/utils/tasks/cca_cka_pydra.py
new file mode 100644
index 00000000..8ddbf997
--- /dev/null
+++ b/src/senselab/utils/tasks/cca_cka_pydra.py
@@ -0,0 +1,8 @@
+"""This module defines a pydra API for the CCA and CKA tasks."""
+
+import pydra
+
+from senselab.utils.tasks.cca_cka import compute_cca, compute_cka
+
+compute_cca_pt = pydra.mark.task(compute_cca)
+compute_cka_pt = pydra.mark.task(compute_cka)
diff --git a/src/senselab/utils/tasks/cosine_similarity.py b/src/senselab/utils/tasks/cosine_similarity.py
new file mode 100644
index 00000000..71a0508b
--- /dev/null
+++ b/src/senselab/utils/tasks/cosine_similarity.py
@@ -0,0 +1,43 @@
+"""This module provides the implementation of cosine similarity."""
+
+import torch
+
+
+def compute_cosine_similarity(tensor1: torch.Tensor, tensor2: torch.Tensor) -> float:
+ """Compute the cosine similarity between two torch tensors.
+
+ Args:
+ tensor1 (Tensor): The first input tensor.
+ tensor2 (Tensor): The second input tensor.
+
+ Returns:
+ float: The cosine similarity between the two input tensors.
+
+ Raises:
+ ValueError: If the input tensors are not of the same shape.
+
+ Examples:
+ >>> tensor1 = torch.tensor([1.0, 2.0, 3.0])
+ >>> tensor2 = torch.tensor([4.0, 5.0, 6.0])
+ >>> compute_cosine_similarity(tensor1, tensor2)
+ 0.9746318461970762
+
+ >>> tensor1 = torch.tensor([1.0, 0.0, -1.0])
+ >>> tensor2 = torch.tensor([-1.0, 0.0, 1.0])
+ >>> compute_cosine_similarity(tensor1, tensor2)
+ -1.0
+
+ Note:
+ This function assumes the input tensors are 1-dimensional and have the same shape.
+ """ + if tensor1.dim() != 1 or tensor2.dim() != 1: + raise ValueError("Input tensors must be 1-dimensional") + if tensor1.shape != tensor2.shape: + raise ValueError("Input tensors must have the same shape") + + dot_product = torch.dot(tensor1, tensor2) + norm_tensor1 = torch.norm(tensor1) + norm_tensor2 = torch.norm(tensor2) + + cosine_sim = dot_product / (norm_tensor1 * norm_tensor2) + return cosine_sim.item() diff --git a/src/senselab/utils/tasks/cosine_similarity_pydra.py b/src/senselab/utils/tasks/cosine_similarity_pydra.py new file mode 100644 index 00000000..4a3e7dd3 --- /dev/null +++ b/src/senselab/utils/tasks/cosine_similarity_pydra.py @@ -0,0 +1,7 @@ +"""This module defines a pydra API for computing cosine similarity.""" + +import pydra + +from senselab.utils.tasks.cosine_similarity import compute_cosine_similarity + +cosine_similarity_pt = pydra.mark.task(compute_cosine_similarity) diff --git a/src/senselab/utils/tasks/cross_correlation.py b/src/senselab/utils/tasks/cross_correlation.py new file mode 100644 index 00000000..d88bb32d --- /dev/null +++ b/src/senselab/utils/tasks/cross_correlation.py @@ -0,0 +1,51 @@ +"""This module contains functions for computing the normalized cross-correlation between two signals.""" + +import numpy as np +import torch +from scipy.signal import correlate + + +def compute_normalized_cross_correlation(signal1: torch.Tensor, signal2: torch.Tensor) -> torch.Tensor: + """Calculate the normalized cross-correlation between two signals. + + Args: + signal1 (torch.Tensor): The first input signal as a PyTorch tensor. + signal2 (torch.Tensor): The second input signal as a PyTorch tensor. + + Returns: + torch.Tensor: The normalized cross-correlation value between the two input signals. 
+
+ Examples:
+ >>> signal1 = torch.tensor([1.0, 2.0, 3.0, 4.0, 5.0])
+ >>> signal2 = torch.tensor([2.0, 3.0, 4.0])
+ >>> compute_normalized_cross_correlation(signal1, signal2)
+ Tensor([0.30151134, 0.51298918, 0.77459667, 0.9486833 , 0.90453403, 0.70710678, 0.43643578])
+
+ Note:
+ This function assumes the input signals are one-dimensional
+ and contain sufficient elements for meaningful cross-correlation.
+ """
+ # Ensure the inputs are 1D tensors
+ if signal1.ndim != 1 or signal2.ndim != 1:
+ raise ValueError("Input signals must be one-dimensional")
+
+ # Convert PyTorch tensors to NumPy arrays
+ signal1 = signal1.numpy()
+ signal2 = signal2.numpy()
+
+ # Calculate the energy of each signal
+ energy_signal1 = np.sum(signal1**2)
+ energy_signal2 = np.sum(signal2**2)
+
+ # Check for zero energy to avoid division by zero
+ if energy_signal1 == 0 or energy_signal2 == 0:
+ raise ZeroDivisionError("One of the input signals has zero energy, causing division by zero in normalization")
+
+ # Compute the cross-correlation
+ cross_correlation = correlate(signal1, signal2)
+
+ # Calculate the normalized cross-correlation
+ normalized_cross_correlation = cross_correlation / np.sqrt(energy_signal1 * energy_signal2)
+
+ print(normalized_cross_correlation)
+ return torch.Tensor(normalized_cross_correlation)
diff --git a/src/senselab/utils/tasks/cross_correlation_pydra.py b/src/senselab/utils/tasks/cross_correlation_pydra.py
new file mode 100644
index 00000000..b42ff20c
--- /dev/null
+++ b/src/senselab/utils/tasks/cross_correlation_pydra.py
@@ -0,0 +1,7 @@
+"""This module defines a pydra API for computing cross correlation between two signals."""
+
+import pydra
+
+from senselab.utils.tasks.cross_correlation import compute_normalized_cross_correlation
+
+compute_normalized_cross_correlation_pt = pydra.mark.task(compute_normalized_cross_correlation)
diff --git a/src/senselab/utils/tasks/eer.py b/src/senselab/utils/tasks/eer.py
new file mode 100644
index 00000000..59b12479
--- /dev/null
+++ b/src/senselab/utils/tasks/eer.py @@ -0,0 +1,19 @@ +"""This module implements some utilities for computing the Equal Error Rate (EER).""" + +from typing import Tuple + +import torch +from speechbrain.utils.metric_stats import EER + + +def compute_eer(predictions: torch.Tensor, targets: torch.Tensor) -> Tuple[float, float]: + """Compute the Equal Error Rate (EER). + + Args: + predictions (torch.Tensor): A 1D tensor of predictions. + targets (torch.Tensor): A 1D tensor of targets. + + Returns: + Tuple[float, float]: The EER and the threshold for the EER. + """ + return EER(predictions, targets) diff --git a/src/senselab/utils/tasks/eer_pydra.py b/src/senselab/utils/tasks/eer_pydra.py new file mode 100644 index 00000000..6c9ab83d --- /dev/null +++ b/src/senselab/utils/tasks/eer_pydra.py @@ -0,0 +1,7 @@ +"""This module defines a pydra API for computing EER.""" + +import pydra + +from senselab.utils.tasks.eer import compute_eer + +compute_eer_pt = pydra.mark.task(compute_eer) diff --git a/src/senselab/utils/tasks/input_output.py b/src/senselab/utils/tasks/input_output.py index 892ca655..850b9b2e 100644 --- a/src/senselab/utils/tasks/input_output.py +++ b/src/senselab/utils/tasks/input_output.py @@ -34,9 +34,7 @@ def _from_files_to_dataset(files: List[File]) -> Dataset: return _from_hf_dataset_to_dict(dataset) -def read_dataset_from_disk( - input_path: str, split: str, streaming: bool = False -) -> Dict[str, Any]: +def read_dataset_from_disk(input_path: str, split: str, streaming: bool = False) -> Dict[str, Any]: """Loads a Hugging Face `Dataset` object from disk. It determines the format based on the file extension or directory. 
@@ -58,9 +56,7 @@ def read_dataset_from_disk( return _from_hf_dataset_to_dict(dataset) except Exception as e: # Generic error handling, e.g., network issues, data loading issues - raise RuntimeError( - f"An error occurred while loading the dataset: {str(e)}" - ) + raise RuntimeError(f"An error occurred while loading the dataset: {str(e)}") def read_dataset_from_hub( @@ -73,10 +69,9 @@ def read_dataset_from_hub( It includes support for private repositories. """ - if not _check_hf_repo_exists(remote_repository, "dataset", hf_token): + if not _check_hf_repo_exists(remote_repository, "main", "dataset", hf_token): raise RuntimeError( - f"The repository {remote_repository} - {revision} - {split}" - " does not exist or could not be accessed." + f"The repository {remote_repository} - {revision} - {split}" " does not exist or could not be accessed." ) # Load the dataset @@ -89,9 +84,7 @@ def read_dataset_from_hub( ) except Exception as e: # Generic error handling, e.g., network issues, data loading issues - raise RuntimeError( - f"An error occurred while loading the dataset: {str(e)}" - ) + raise RuntimeError(f"An error occurred while loading the dataset: {str(e)}") return _from_hf_dataset_to_dict(dataset) @@ -117,9 +110,7 @@ def push_dataset_to_hub( token=hf_token, ) else: - hf_dataset.push_to_hub( - repo_id=remote_repository, revision=revision, split=split - ) + hf_dataset.push_to_hub(repo_id=remote_repository, revision=revision, split=split) except Exception as e: raise RuntimeError(f"Failed to push dataset to the hub: {str(e)}") return @@ -140,27 +131,21 @@ def save_dataset_to_disk( output_path = os.path.join(output_directory, output_name) # No extension for Arrow, it's a directory else: - output_path = os.path.join( - output_directory, f"{output_name}.{output_format}" - ) + output_path = os.path.join(output_directory, f"{output_name}.{output_format}") # Create the output directory, ignore error if it already exists os.makedirs(output_directory, exist_ok=True) if 
output_format == "parquet": - def _save_hf_dataset_as_parquet( - dataset: Dataset, output_path: str - ) -> None: + def _save_hf_dataset_as_parquet(dataset: Dataset, output_path: str) -> None: """Saves a Hugging Face `Dataset` object to parquet format.""" dataset.to_parquet(output_path) _save_hf_dataset_as_parquet(hf_dataset, output_path) elif output_format == "json": - def _save_hf_dataset_as_json( - dataset: Dataset, output_path: str - ) -> None: + def _save_hf_dataset_as_json(dataset: Dataset, output_path: str) -> None: """Saves a Hugging Face `Dataset` object to json format.""" dataset.to_json(output_path) @@ -181,9 +166,7 @@ def _save_hf_dataset_as_sql(dataset: Dataset, output_path: str) -> None: _save_hf_dataset_as_sql(hf_dataset, output_path) elif output_format == "arrow": - def _save_hf_dataset_as_arrow( - dataset: Dataset, output_path: str - ) -> None: + def _save_hf_dataset_as_arrow(dataset: Dataset, output_path: str) -> None: """Saves a Hugging Face `Dataset` object in Apache Arrow format.""" dataset.save_to_disk(output_path) diff --git a/src/tests/audio/tasks/data_augmentation_test.py b/src/tests/audio/tasks/data_augmentation_test.py new file mode 100644 index 00000000..07ef3df2 --- /dev/null +++ b/src/tests/audio/tasks/data_augmentation_test.py @@ -0,0 +1,39 @@ +"""Module for testing data augmentation on audios.""" + +import torch +from torch_audiomentations import Compose, PolarityInversion + +from senselab.audio.tasks.data_augmentation import augment_audios +from senselab.utils.data_structures.audio import Audio +from senselab.utils.data_structures.dataset import SenselabDataset + + +def test_audio_data_augmentation() -> None: + """Test data augmentations using the new Audio data types.""" + apply_augmentation = Compose(transforms=[PolarityInversion(p=1, output_type="dict")], output_type="dict") + + audio_paths = [ + "src/tests/data_for_testing/audio_48khz_mono_16bits.wav", + "src/tests/data_for_testing/audio_48khz_stereo_16bits.wav", + ] + 
audio_dataset_from_paths = SenselabDataset(audios=audio_paths) + mono_audio, stereo_audio = audio_dataset_from_paths.create_audio_split_for_pydra_task() + mono_inverted = augment_audios(mono_audio, apply_augmentation) + stereo_inverted = augment_audios(stereo_audio, apply_augmentation) + assert torch.equal( + mono_audio[0].waveform, -1 * mono_inverted[0].waveform + ), "Audio should have been inverted by the augmentation" + assert torch.equal( + stereo_audio[0].waveform, -1 * stereo_inverted[0].waveform + ), "Audio should have been inverted by the augmentation and not affected by stereo audio" + + batched_audio = SenselabDataset( + audios=[ + Audio(waveform=stereo_audio[0].waveform[0], sampling_rate=stereo_audio[0].sampling_rate), + Audio(waveform=stereo_audio[0].waveform[1], sampling_rate=stereo_audio[0].sampling_rate), + ] + ).create_audio_split_for_pydra_task(2) + batch_inverted = augment_audios(batched_audio[0], apply_augmentation) + assert torch.equal(batched_audio[0][0].waveform, -1 * batch_inverted[0].waveform) and torch.equal( + batched_audio[0][1].waveform, -1 * batch_inverted[1].waveform + ) diff --git a/src/tests/audio/tasks/preprocessing_test.py b/src/tests/audio/tasks/preprocessing_test.py new file mode 100644 index 00000000..e7c09c35 --- /dev/null +++ b/src/tests/audio/tasks/preprocessing_test.py @@ -0,0 +1,115 @@ +"""Module for testing the preprocessing functionality of Audios.""" + +import math + +import pytest +import torch + +from senselab.audio.tasks.preprocessing import ( + chunk_audios, + downmix_audios_to_mono, + resample_audios, + select_channel_from_audios, +) +from senselab.utils.data_structures.audio import Audio + + +def test_resample_audios() -> None: + """Tests functionality for resampling Audio objects.""" + resample_rate = 36000 + mono_audio = Audio.from_filepath(("src/tests/data_for_testing/audio_48khz_mono_16bits.wav")) + resampled_expected_size = mono_audio.waveform.shape[1] / 48000 * resample_rate + + resampled_audio = 
resample_audios([mono_audio], resample_rate) + assert math.ceil(resampled_expected_size) == resampled_audio[0].waveform.shape[1] + + stereo_audio = Audio.from_filepath(("src/tests/data_for_testing/audio_48khz_stereo_16bits.wav")) + resampled_expected_size = stereo_audio.waveform.shape[1] / 48000 * resample_rate + + resampled_audio = resample_audios([stereo_audio], resample_rate) + assert math.ceil(resampled_expected_size) == resampled_audio[0].waveform.shape[1] + + +def test_downmix_audios() -> None: + """Tests functionality for downmixing Audio objects.""" + mono_audio = Audio.from_filepath(("src/tests/data_for_testing/audio_48khz_mono_16bits.wav")) + down_mixed_audios = downmix_audios_to_mono([mono_audio]) + assert down_mixed_audios[0].waveform.dim() == 2, "Mono audio should maintain the (num_channels, num_samples) shape" + assert down_mixed_audios[0].waveform.shape[0] == 1, "Mono audio should remain mono after downmixing" + assert down_mixed_audios[0].waveform.size(1) == mono_audio.waveform.size( + 1 + ), "Downmixed mono audio should have correct number of samples" + + stereo_audio = Audio.from_filepath("src/tests/data_for_testing/audio_48khz_stereo_16bits.wav") + down_mixed_audios = downmix_audios_to_mono([stereo_audio]) + assert down_mixed_audios[0].waveform.dim() == 2, "Mono audio should maintain the (num_channels, num_samples) shape" + assert down_mixed_audios[0].waveform.shape[0] == 1, "Stereo audio should become mono after downmixing" + assert down_mixed_audios[0].waveform.size(1) == stereo_audio.waveform.size( + 1 + ), "Downmixed stereo audio should have correct number of samples" + assert torch.isclose( + down_mixed_audios[0].waveform, stereo_audio.waveform.mean(dim=0, keepdim=True) + ).all(), "Downmixed audio should be the mean of the stereo channels" + + +def test_select_channel_from_audios() -> None: + """Tests functionality for selecting a specific channel from Audio objects.""" + + def check_selected_channel(audio: Audio, channel_to_select: int) -> 
None: + """Checks if the original selected audio channel is the same as the returned selected audio channel.""" + selected_channel_audios = select_channel_from_audios([audio], channel_to_select) + assert selected_channel_audios[0].waveform.shape[0] == 1, "Selected channel audio should be mono" + assert ( + selected_channel_audios[0].waveform.shape[1] == audio.waveform.shape[1] + ), "Selected channel audio should have the correct number of samples" + assert torch.equal( + selected_channel_audios[0].waveform[0, :], audio.waveform[channel_to_select, :] + ), "Selected channel audio should be the same as the selected channel of the original audio" + + channel_to_select = 0 + mono_audio = Audio.from_filepath("src/tests/data_for_testing/audio_48khz_mono_16bits.wav") + check_selected_channel(mono_audio, channel_to_select) + + stereo_audio = Audio.from_filepath("src/tests/data_for_testing/audio_48khz_stereo_16bits.wav") + check_selected_channel(stereo_audio, channel_to_select) + + channel_to_select = 1 + check_selected_channel(stereo_audio, channel_to_select) + + +def test_chunk_audios() -> None: + """Tests functionality for chunking Audio objects.""" + # Test data setup + audio_path = "src/tests/data_for_testing/audio_48khz_mono_16bits.wav" + audio = Audio.from_filepath(audio_path) + audio_duration = audio.waveform.shape[1] / audio.sampling_rate + + # Test cases + test_data = [ + (audio, (0.0, 1.0)), # Normal case within bounds + (audio, (1.0, 2.0)), # Normal case within bounds + ] + + chunked_audios = chunk_audios(test_data) + + # Verify chunked audio lengths + for i, (original_audio, (start, end)) in enumerate(test_data): + start_sample = int(start * original_audio.sampling_rate) + end_sample = int(end * original_audio.sampling_rate) + expected_length = end_sample - start_sample + assert chunked_audios[i].waveform.shape[1] == expected_length + # Test case where start time is negative + with pytest.raises(ValueError, match="Start time must be greater than or equal to 
0."): + chunk_audios([(audio, (-1.0, 1.0))]) + + # Test case where end time exceeds duration + try: + chunk_audios([(audio, (0.0, audio_duration + 1.0))]) + except ValueError as e: + assert str(e) == f"End time must be less than the duration of the audio file ({audio_duration} seconds)." + else: + pytest.fail("ValueError not raised") + + # Test case where end time equals duration + chunked_audio = chunk_audios([(audio, (0.0, audio_duration))])[0] + assert chunked_audio.waveform.shape[1] == audio.waveform.shape[1] diff --git a/src/tests/audio/tasks/speech_to_text_evaluation_test.py b/src/tests/audio/tasks/speech_to_text_evaluation_test.py new file mode 100644 index 00000000..f58027ad --- /dev/null +++ b/src/tests/audio/tasks/speech_to_text_evaluation_test.py @@ -0,0 +1,64 @@ +"""Module for testing speech-to-text evaluation.""" + +from senselab.audio.tasks.speech_to_text_evaluation import ( + calculate_cer, + calculate_mer, + calculate_wer, + calculate_wil, + calculate_wip, +) + + +def test_calculate_wer() -> None: + """Tests the calculation of Word Error Rate (WER).""" + reference = "hello world" + hypothesis = "hello duck" + expected_wer = 0.5 + + wer = calculate_wer(reference, hypothesis) + + assert wer == expected_wer, f"Expected WER: {expected_wer}, but got: {wer}" + + +def test_calculate_mer() -> None: + """Tests the calculation of Match Error Rate (MER).""" + reference = "hello world" + hypothesis = "hello duck" + expected_mer = 0.5 + + mer = calculate_mer(reference, hypothesis) + + assert mer == expected_mer, f"Expected MER: {expected_mer}, but got: {mer}" + + +def test_calculate_wil() -> None: + """Tests the calculation of Word Information Lost (WIL).""" + reference = "hello world" + hypothesis = "hello duck" + expected_wil = 0.75 + + wil = calculate_wil(reference, hypothesis) + + assert wil == expected_wil, f"Expected WIL: {expected_wil}, but got: {wil}" + + +def test_calculate_wip() -> None: + """Tests the calculation of Word Information Preserved 
(WIP).""" + reference = "hello world" + hypothesis = "hello duck" + expected_wip = 0.25 + + wip = calculate_wip(reference, hypothesis) + + assert wip == expected_wip, f"Expected WIP: {expected_wip}, but got: {wip}" + + +def test_calculate_cer() -> None: + """Tests the calculation of Character Error Rate (CER).""" + reference = "hello world" + hypothesis = "hello duck" + expected_cer = 0.45454545454545453 + + cer = calculate_cer(reference, hypothesis) + + assert cer == expected_cer, f"Expected CER: {expected_cer}, but got: {cer}" diff --git a/src/tests/utils/data_structures/audio_test.py b/src/tests/utils/data_structures/audio_test.py new file mode 100644 index 00000000..b375202f --- /dev/null +++ b/src/tests/utils/data_structures/audio_test.py @@ -0,0 +1,45 @@ +"""Module for testing Audio data structures.""" + +import torch +import torchaudio + +from senselab.utils.data_structures.audio import Audio + + +def test_audio_creation() -> None: + """Tests the functionality for creating data instances.""" + mono_audio_data, mono_sr = torchaudio.load("src/tests/data_for_testing/audio_48khz_mono_16bits.wav") + stereo_audio_data, stereo_sr = torchaudio.load("src/tests/data_for_testing/audio_48khz_stereo_16bits.wav") + + mono_audio = Audio( + waveform=mono_audio_data, + sampling_rate=mono_sr, + orig_path_or_id="src/tests/data_for_testing/audio_48khz_mono_16bits.wav", + ) + mono_audio_from_file = Audio.from_filepath("src/tests/data_for_testing/audio_48khz_mono_16bits.wav") + assert mono_audio == mono_audio_from_file, "Mono audios are not exactly equivalent" + + stereo_audio = Audio( + waveform=stereo_audio_data, + sampling_rate=stereo_sr, + orig_path_or_id="src/tests/data_for_testing/audio_48khz_stereo_16bits.wav", + ) + stereo_audio_uuid = Audio(waveform=stereo_audio_data, sampling_rate=stereo_sr) + stereo_audio_from_file = Audio.from_filepath("src/tests/data_for_testing/audio_48khz_stereo_16bits.wav") + assert stereo_audio == stereo_audio_from_file, "Stereo audios are not 
exactly equivalent" + assert stereo_audio == stereo_audio_uuid, "Stereo audio with different IDs should still be equivalent" + + audio_single_tensor = Audio(waveform=mono_audio_data[0], sampling_rate=mono_sr) + assert torch.equal( + mono_audio.waveform, audio_single_tensor.waveform + ), "Mono audios of tensor shape (num_samples,) should be reshaped to (1, num_samples)" + + audio_from_list = Audio(waveform=list(mono_audio_data[0]), sampling_rate=mono_sr) + audio_from_list_of_lists = Audio(waveform=[list(mono_audio_data[0])], sampling_rate=mono_sr) + audio_from_numpy = Audio(waveform=mono_audio_data.numpy(), sampling_rate=mono_sr) + + assert torch.equal(mono_audio.waveform, audio_from_list.waveform), "List audio should've been converted to Tensor" + assert torch.equal( + mono_audio.waveform, audio_from_list_of_lists.waveform + ), "List of lists audio should've been converted to Tensor" + assert torch.equal(mono_audio.waveform, audio_from_numpy.waveform), "NumPy audio should've been converted to Tensor" diff --git a/src/tests/utils/data_structures/dataset_test.py b/src/tests/utils/data_structures/dataset_test.py new file mode 100644 index 00000000..b9c06d29 --- /dev/null +++ b/src/tests/utils/data_structures/dataset_test.py @@ -0,0 +1,153 @@ +"""Module for testing the Participant, Session, and SenselabDataset classes.""" + +import pytest +import torchaudio + +from senselab.utils.data_structures.audio import Audio +from senselab.utils.data_structures.dataset import Participant, SenselabDataset, Session + + +def test_create_participant() -> None: + """Test creating a participant.""" + participant = Participant(metadata={"name": "John Doe"}) + assert isinstance(participant, Participant) + assert participant.metadata["name"] == "John Doe" + + +def test_create_session() -> None: + """Test creating a session.""" + session = Session(metadata={"description": "Initial session"}) + assert isinstance(session, Session) + assert session.metadata["description"] == "Initial session" 
def test_add_participant() -> None:
    """Test adding a participant to the dataset."""
    dataset = SenselabDataset()
    participant = Participant()
    dataset.add_participant(participant)
    # add_participant must register the participant under its id.
    assert participant.id in dataset.participants


def test_add_duplicate_participant() -> None:
    """Test adding a duplicate participant to the dataset."""
    dataset = SenselabDataset()
    participant = Participant()
    dataset.add_participant(participant)
    # Re-adding the same participant must be rejected with ValueError.
    with pytest.raises(ValueError):
        dataset.add_participant(participant)


def test_add_session() -> None:
    """Test adding a session to the dataset."""
    dataset = SenselabDataset()
    session = Session()
    dataset.add_session(session)
    assert session.id in dataset.sessions


def test_add_duplicate_session() -> None:
    """Test adding a duplicate session to the dataset."""
    dataset = SenselabDataset()
    session = Session()
    dataset.add_session(session)
    # Re-adding the same session must be rejected with ValueError.
    with pytest.raises(ValueError):
        dataset.add_session(session)


def test_get_participants() -> None:
    """Test getting the list of participants."""
    dataset = SenselabDataset()
    participant1 = Participant()
    participant2 = Participant()
    dataset.add_participant(participant1)
    dataset.add_participant(participant2)
    participants = dataset.get_participants()
    assert len(participants) == 2
    assert participant1 in participants
    assert participant2 in participants


def test_get_sessions() -> None:
    """Test getting the list of sessions."""
    dataset = SenselabDataset()
    session1 = Session()
    session2 = Session()
    dataset.add_session(session1)
    dataset.add_session(session2)
    sessions = dataset.get_sessions()
    assert len(sessions) == 2
    assert session1 in sessions
    assert session2 in sessions


def test_audio_dataset_creation() -> None:
    """Tests the creation of AudioDatasets with various ways of generating them."""
    audio_paths = [
        "src/tests/data_for_testing/audio_48khz_mono_16bits.wav",
        "src/tests/data_for_testing/audio_48khz_stereo_16bits.wav",
    ]

    mono_audio_data, mono_sr = torchaudio.load("src/tests/data_for_testing/audio_48khz_mono_16bits.wav")
    stereo_audio_data, stereo_sr = torchaudio.load("src/tests/data_for_testing/audio_48khz_stereo_16bits.wav")
    mono_audio = Audio(
        waveform=mono_audio_data,
        sampling_rate=mono_sr,
        orig_path_or_id="src/tests/data_for_testing/audio_48khz_mono_16bits.wav",
    )
    stereo_audio = Audio(
        waveform=stereo_audio_data,
        sampling_rate=stereo_sr,
        orig_path_or_id="src/tests/data_for_testing/audio_48khz_stereo_16bits.wav",
    )

    # Dataset built from file paths must equal one built from pre-loaded Audios.
    audio_dataset_from_paths = SenselabDataset(audios=audio_paths)
    assert (
        audio_dataset_from_paths.audios[0] == mono_audio and audio_dataset_from_paths.audios[1] == stereo_audio
    ), "Audio data generated from paths does not equal creating them individually"  # FIX: message was garbled ("creating the individually")

    audio_dataset_from_data = SenselabDataset(
        audios=[
            Audio(waveform=mono_audio_data, sampling_rate=mono_sr),
            Audio(waveform=stereo_audio_data, sampling_rate=stereo_sr),
        ],
    )

    assert audio_dataset_from_paths == audio_dataset_from_data, "Audio datasets should be equivalent"


def test_audio_dataset_splits() -> None:
    """Tests the AudioDataset split functionality."""
    audio_paths = [
        "src/tests/data_for_testing/audio_48khz_mono_16bits.wav",
        "src/tests/data_for_testing/audio_48khz_stereo_16bits.wav",
    ]
    audio_dataset = SenselabDataset(audios=audio_paths)
    mono_audio_data, mono_sr = torchaudio.load("src/tests/data_for_testing/audio_48khz_mono_16bits.wav")
    stereo_audio_data, stereo_sr = torchaudio.load("src/tests/data_for_testing/audio_48khz_stereo_16bits.wav")
    mono_audio = Audio(
        waveform=mono_audio_data,
        sampling_rate=mono_sr,
        orig_path_or_id="src/tests/data_for_testing/audio_48khz_mono_16bits.wav",
    )
    stereo_audio = Audio(
        waveform=stereo_audio_data,
        sampling_rate=stereo_sr,
        orig_path_or_id="src/tests/data_for_testing/audio_48khz_stereo_16bits.wav",
    )

    # No argument: one single-audio sublist per audio (CPU-style split).
    no_param_cpu_split = audio_dataset.create_audio_split_for_pydra_task()
    assert no_param_cpu_split == [
        [mono_audio],
        [stereo_audio],
    ], "Default split should have been a list of each audio in its own list"

    # Batch size equal to dataset size: one sublist holding everything.
    gpu_split_exact = audio_dataset.create_audio_split_for_pydra_task(2)
    assert gpu_split_exact == [
        [mono_audio, stereo_audio]
    ], "Exact GPU split should generate a list with one list of all of the audios"

    # Batch size larger than dataset: same single sublist, no padding.
    gpu_excess_split = audio_dataset.create_audio_split_for_pydra_task(4)
    assert gpu_excess_split == [
        [mono_audio, stereo_audio]
    ], "Excess GPU split should generate a list with one list of all of the audios, unpadded"


# --- src/tests/utils/tasks/cca_cka_test.py ---

"""Module for testing the CCA and CKA functions."""

import torch

from senselab.utils.tasks.cca_cka import CKAKernelType, compute_cca, compute_cka


def test_compute_cca() -> None:
    """Test compute_cca function with random input tensors."""
    # Create input tensors
    features_x = torch.tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
    features_y = torch.tensor([[2.0, 4.0], [6.0, 8.0], [10.0, 12.0]])
    expected = 1.0  # Since features_y is a linear transformation of features_x, CCA should be perfect.

    # Call the compute_cca function
    cca_value = compute_cca(features_x, features_y)

    # Assert that the result is a float
    assert isinstance(cca_value, float), "Output should be a float."

    assert torch.isclose(torch.tensor(cca_value), torch.tensor(expected), atol=1e-6)


def test_compute_cka_linear() -> None:
    """Test compute_cka function with linear kernel and random input tensors."""
    # Create input tensors
    features_x = torch.tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
    features_y = torch.tensor([[2.0, 4.0], [6.0, 8.0], [10.0, 12.0]])
    expected = 1.0  # Since features_y is a linear transformation of features_x, linear CKA should be perfect.

    # Call the compute_cka function with linear kernel
    cka_value = compute_cka(features_x, features_y, kernel=CKAKernelType.LINEAR)

    # Assert that the result is a float
    assert isinstance(cka_value, float), "Output should be a float."

    assert torch.isclose(torch.tensor(cka_value), torch.tensor(expected), atol=1e-6)


def test_compute_cka_rbf() -> None:
    """Test compute_cka function with RBF kernel and random input tensors."""
    # Create input tensors
    features_x = torch.tensor([[1.0, 0.0], [0.0, 1.0], [1.0, 1.0]])
    features_y = torch.tensor([[1.0, 0.0], [0.0, 1.0], [1.0, 1.0]])
    expected = 1.0  # Since features_y is the same as features_x, RBF CKA should be perfect.

    # Call the compute_cka function with rbf kernel
    cka_value = compute_cka(features_x, features_y, kernel=CKAKernelType.RBF)

    # Assert that the result is a float
    assert isinstance(cka_value, float), "Output should be a float."

    assert torch.isclose(torch.tensor(cka_value), torch.tensor(expected), atol=1e-6)


# --- src/tests/utils/tasks/cosine_similarity_test.py ---

"""This module contains unit tests for the cosine similarity function."""

import pytest
import torch

from senselab.utils.tasks.cosine_similarity import compute_cosine_similarity


def test_cosine_similarity_identical_vectors() -> None:
    """Test cosine similarity for identical vectors."""
    tensor1 = torch.tensor([1.0, 2.0, 3.0])
    tensor2 = torch.tensor([1.0, 2.0, 3.0])
    similarity = compute_cosine_similarity(tensor1, tensor2)
    assert torch.isclose(torch.tensor(similarity), torch.tensor(1.0), atol=1e-6)


def test_cosine_similarity_opposite_vectors() -> None:
    """Test cosine similarity for opposite vectors."""
    tensor1 = torch.tensor([1.0, 0.0, -1.0])
    tensor2 = torch.tensor([-1.0, 0.0, 1.0])
    similarity = compute_cosine_similarity(tensor1, tensor2)
    assert torch.isclose(torch.tensor(similarity), torch.tensor(-1.0), atol=1e-6)


def test_cosine_similarity_orthogonal_vectors() -> None:
    """Test cosine similarity for orthogonal vectors."""
    tensor1 = torch.tensor([1.0, 0.0])
    tensor2 = torch.tensor([0.0, 1.0])
    similarity = compute_cosine_similarity(tensor1, tensor2)
    assert torch.isclose(torch.tensor(similarity), torch.tensor(0.0), atol=1e-6)


def test_cosine_similarity_non_identical_vectors() -> None:
    """Test cosine similarity for non-identical but non-orthogonal vectors."""
    tensor1 = torch.tensor([1.0, 2.0, 3.0])
    tensor2 = torch.tensor([4.0, 5.0, 6.0])
    expected_value = 0.9746318461970762  # precomputed cos-sim of these two vectors
    similarity = compute_cosine_similarity(tensor1, tensor2)
    assert torch.isclose(torch.tensor(similarity), torch.tensor(expected_value), atol=1e-6)


def test_cosine_similarity_different_shapes() -> None:
    """Test cosine similarity for tensors of different shapes, expecting a ValueError."""
    tensor1 = torch.tensor([1.0, 2.0])
    tensor2 = torch.tensor([1.0, 2.0, 3.0])
    with pytest.raises(ValueError):
        compute_cosine_similarity(tensor1, tensor2)


# --- src/tests/utils/tasks/cross_correlation_test.py ---

"""Module for testing the compute_normalized_cross_correlation function."""

import pytest
import torch

from senselab.utils.tasks.cross_correlation import compute_normalized_cross_correlation


def test_normalized_cross_correlation_basic() -> None:
    """Test normalized cross-correlation for basic identical signals."""
    signal1 = torch.tensor([1.0, 1.0])
    signal2 = torch.tensor([1.0, 1.0])
    expected_result = torch.tensor([0.5, 1.0, 0.5], dtype=torch.float32)
    result = compute_normalized_cross_correlation(signal1, signal2)
    assert torch.allclose(result, expected_result, atol=1e-4), f"Expected {expected_result}, but got {result}"


def test_normalized_cross_correlation_different_lengths() -> None:
    """Test normalized cross-correlation for signals of different lengths."""
    signal1 = torch.tensor([1.0, 2.0, 1.0])
    signal2 = torch.tensor([1.0, 2.0])
    expected_result = torch.tensor([0.3651, 0.9129, 0.7303, 0.1826], dtype=torch.float32)
    result = compute_normalized_cross_correlation(signal1, signal2)
    assert torch.allclose(result, expected_result, atol=1e-4), f"Expected {expected_result}, but got {result}"


def test_normalized_cross_correlation_zero_signal() -> None:
    """Test normalized cross-correlation with a zero signal."""
    # An all-zero signal has zero norm, so normalization divides by zero.
    signal1 = torch.tensor([0.0, 0.0, 0.0, 0.0])
    signal2 = torch.tensor([1.0, 2.0, 3.0])
    with pytest.raises(ZeroDivisionError):
        compute_normalized_cross_correlation(signal1, signal2)


def test_normalized_cross_correlation_empty_signal() -> None:
    """Test normalized cross-correlation with an empty signal."""
    signal1 = torch.tensor([])
    signal2 = torch.tensor([1.0, 2.0, 3.0])
    with pytest.raises(ZeroDivisionError):
        compute_normalized_cross_correlation(signal1, signal2)


def test_normalized_cross_correlation_non_1d_signal() -> None:
    """Test normalized cross-correlation with non-1D signals."""
    # 2-D input is rejected regardless of which argument it appears in.
    signal1 = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
    signal2 = torch.tensor([1.0, 2.0, 3.0])
    with pytest.raises(ValueError):
        compute_normalized_cross_correlation(signal1, signal2)

    signal1 = torch.tensor([1.0, 2.0, 3.0])
    signal2 = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
    with pytest.raises(ValueError):
        compute_normalized_cross_correlation(signal1, signal2)


# --- src/tests/utils/tasks/eer_test.py ---

"""This module contains unit tests for the EER function."""

import torch

from senselab.utils.tasks.eer import compute_eer


def test_compute_eer() -> None:
    """Test that the EER is computed correctly for perfectly separable data."""
    # NOTE(review): targets here are continuous values, not 0/1 labels —
    # confirm compute_eer's expected target format against its docstring.
    predictions = torch.tensor([0.6, 0.7, 0.8, 0.5])
    targets = torch.tensor([0.4, 0.3, 0.2, 0.1])
    eer, threshold = compute_eer(predictions, targets)
    # Since we expect perfect separation, the EER should be 0
    assert eer == 0.0, "EER should be 0 for perfectly separable data"
    assert 0 <= threshold <= 1, "Threshold should be between 0 and 1"


def test_compute_eer_random() -> None:
    """Test that the EER is computed correctly for random predictions and targets."""
    # Set random seed for reproducibility
    torch.manual_seed(42)
    predictions = torch.rand(100)
    targets = torch.randint(0, 2, (100,))
    eer, threshold = compute_eer(predictions, targets)
    assert isinstance(eer, float), "EER should be a float"
    assert isinstance(threshold, float), "Threshold should be a float"
    assert 0 <= eer <= 1, "EER should be between 0 and 1"
    assert 0 <= threshold <= 1, "Threshold should be between 0 and 1"